Repository: RapidAI/RapidLayout Branch: main Commit: 3f9b92213979 Files: 83 Total size: 188.8 KB Directory structure: gitextract_f1yjucyy/ ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── 01-feature_request.md │ │ ├── 02-bug.md │ │ └── 03-blank.md │ └── workflows/ │ ├── SyncToGitee.yml │ ├── docs_build_develop.yml │ ├── docs_build_release.yml │ ├── publish_whl.yml │ └── push_discord.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cliff.toml ├── demo.py ├── docs/ │ ├── blog/ │ │ ├── .authors.yml │ │ ├── .meta.yml │ │ ├── index.md │ │ └── posts/ │ │ └── support_pp_doc_layout.md │ ├── contributing.md │ ├── doc_whl_rapid_layout.md │ ├── hooks/ │ │ ├── change_copyright.py │ │ ├── expiry.py │ │ └── link.py │ ├── index.md │ ├── install_usage/ │ │ ├── how_to_use_other_engine.md │ │ ├── installation.md │ │ └── usage.md │ ├── models.md │ ├── quickstart.md │ └── stylesheets/ │ └── extra.css ├── mkdocs.yml ├── overrides/ │ ├── 404.html │ ├── main.html │ └── partials/ │ ├── comments.html │ ├── content.html │ └── expired_notice.html ├── rapid_layout/ │ ├── __init__.py │ ├── configs/ │ │ ├── __init__.py │ │ ├── default_models.yaml │ │ └── engine_cfg.yaml │ ├── inference_engine/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── onnxruntime/ │ │ │ ├── __init__.py │ │ │ ├── main.py │ │ │ └── provider_config.py │ │ └── openvino/ │ │ ├── __init__.py │ │ ├── device_config.py │ │ └── main.py │ ├── main.py │ ├── model_handler/ │ │ ├── __init__.py │ │ ├── base/ │ │ │ └── __init__.py │ │ ├── doc_layout/ │ │ │ ├── __init__.py │ │ │ ├── main.py │ │ │ ├── post_process.py │ │ │ └── pre_process.py │ │ ├── main.py │ │ ├── pp/ │ │ │ ├── __init__.py │ │ │ ├── main.py │ │ │ ├── post_process.py │ │ │ └── pre_process.py │ │ ├── pp_doc_layout/ │ │ │ ├── __init__.py │ │ │ ├── main.py │ │ │ ├── post_process.py │ │ │ └── pre_process.py │ │ ├── utils.py │ │ └── yolov8/ │ │ ├── __init__.py │ │ ├── main.py │ │ ├── post_process.py │ │ └── pre_process.py │ ├── models/ │ │ 
├── .gitkeep │ │ └── __init__.py │ └── utils/ │ ├── __init__.py │ ├── download_file.py │ ├── load_image.py │ ├── logger.py │ ├── typings.py │ ├── utils.py │ └── vis_res.py ├── requirements.txt ├── setup.py └── tests/ ├── test_engine.py └── test_main.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry custom: https://raw.githubusercontent.com/RapidAI/.github/6db6b6b9273f3151094a462a61fbc8e88564562c/assets/Sponsor.png ================================================ FILE: .github/ISSUE_TEMPLATE/01-feature_request.md ================================================ --- name: Feature Request about: requests for new RapidOCR features title: 'Feature Request' labels: 'Feature Request' assignees: '' --- 请您详细描述想要添加的新功能或者是新特性 (Please describe in detail the new function or new feature you want to add) ================================================ FILE: .github/ISSUE_TEMPLATE/02-bug.md ================================================ --- name: 🐞 Bug about: Bug title: 'Bug' labels: 'Bug' assignees: '' --- #### 问题描述 / Problem Description #### 运行环境 / Runtime Environment #### 复现代码 / 
Reproduction Code ```python ``` #### 可能解决方案 / Possible solutions ================================================ FILE: .github/ISSUE_TEMPLATE/03-blank.md ================================================ --- name: Blank Template about: Blank Template title: 'Blank Template' labels: 'Blank Template' assignees: '' --- ================================================ FILE: .github/workflows/SyncToGitee.yml ================================================ name: SyncToGitee on: push: branches: - main jobs: repo-sync: runs-on: ubuntu-latest steps: - name: Checkout source codes uses: actions/checkout@v3 - name: Mirror the Github organization repos to Gitee. uses: Yikun/hub-mirror-action@v1.4 with: src: 'github/RapidAI' dst: 'gitee/RapidAI' dst_key: ${{ secrets.GITEE_PRIVATE_KEY }} dst_token: ${{ secrets.GITEE_TOKEN }} force_update: true # only sync this repo static_list: "RapidLayout" debug: true ================================================ FILE: .github/workflows/docs_build_develop.yml ================================================ name: Build/Publish Develop Docs on: push: branches: - main paths: - "docs/**" - ".github/workflows/docs_build_develop.yml" - ".github/workflows/docs_build_release.yml" - "mkdocs.yml" permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Configure Git Credentials run: | git config user.name github-actions[bot] git config user.email 41898282+github-actions[bot]@users.noreply.github.com - uses: actions/setup-python@v5 with: python-version: 3.x - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache restore-keys: | mkdocs-material- - run: pip install mike mkdocs mkdocs-material jieba mkdocs-git-revision-date-localized-plugin mkdocs-git-committers-plugin-2 - run: | git fetch origin gh-pages --depth=1 || true mkdocs build ls -la site/ mike deploy --push --update-aliases main latest 
mike set-default --push latest ================================================ FILE: .github/workflows/docs_build_release.yml ================================================ name: Build/Publish Release Docs on: push: tags: - v* permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Configure Git Credentials run: | git config user.name github-actions[bot] git config user.email github-actions[bot]@users.noreply.github.com - uses: actions/setup-python@v5 with: python-version: 3.x - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache restore-keys: | mkdocs-material- - run: pip install mike mkdocs mkdocs-material jieba mkdocs-git-revision-date-localized-plugin mkdocs-git-committers-plugin-2 - run: | git fetch origin gh-pages --depth=1 mike deploy --push "${{ github.ref_name }}" ================================================ FILE: .github/workflows/publish_whl.yml ================================================ name: Push rapidocr_layout to pypi on: push: tags: - v* env: RESOURCES_URL: https://github.com/RapidAI/RapidLayout/releases/download/v0.0.0/rapid_layout_models.zip jobs: UnitTesting: runs-on: ubuntu-latest steps: - name: Pull latest code uses: actions/checkout@v3 - name: Set up Python 3.10 uses: actions/setup-python@v4 with: python-version: '3.10' architecture: 'x64' - name: Display Python version run: python -c "import sys; print(sys.version)" - name: Unit testings run: | pip install -r requirements.txt pip install pytest wheel onnxruntime pytest tests/test*.py GenerateWHL_PushPyPi: needs: UnitTesting runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Run setup run: | pip install -r requirements.txt pip install get_pypi_latest_version wget $RESOURCES_URL ZIP_NAME=${RESOURCES_URL##*/} DIR_NAME=${ZIP_NAME%.*} unzip $ZIP_NAME mv $DIR_NAME/*.onnx rapid_layout/models/ python setup.py 
bdist_wheel ${{ github.ref_name }} - name: Publish distribution 📦 to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} packages_dir: dist/ ================================================ FILE: .github/workflows/push_discord.yml ================================================ name: discord message on: release: types: [published] jobs: notify: runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v4 - name: Prepare Discord message id: prepare_message run: | full_msg="🚀 **New Release!** ${{ github.event.release.name }} **Tag:** ${{ github.event.release.tag_name }} **Author:** ${{ github.event.release.author.login }} **Release Notes:** ${{ github.event.release.body }}" if [ ${#full_msg} -gt 1990 ]; then truncated_msg="${full_msg:0:1987}..." else truncated_msg="$full_msg" fi echo "message<<EOF" >> "$GITHUB_OUTPUT" echo "$truncated_msg" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - name: Send to Discord env: DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL }} DISCORD_USERNAME: Github Actions DISCORD_AVATAR: https://cdn.discordapp.com/avatars/1460099944252702846/e57fd67dc7ca0cc840a0e87a82281bc5.webp?size=80 uses: Ilshidur/action-discord@0.4.0 with: args: ${{ steps.prepare_message.outputs.message }} ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class .pytest_cache # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
# *.manifest # *.spec *.res # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ #idea .vs .vscode .idea /images /models #models *.onnx *.ttf *.ttc long1.jpg *.bin *.mapping *.xml *.pdiparams *.pdiparams.info *.pdmodel .DS_Store ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://gitee.com/SWHL/autoflake rev: v2.1.1 hooks: - id: autoflake args: [ "--recursive", "--in-place", "--remove-all-unused-imports", "--remove-unused-variable", "--ignore-init-module-imports", ] files: \.py$ - repo: https://gitee.com/SWHL/black rev: 23.1.0 hooks: - id: black files: \.py$ ================================================ FILE: LICENSE 
================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2024 RapidAI Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

Rapid 📄 Layout

 
文档版面分析 - 定位标题、段落、表格与图片等版面元素
 
PyPI SemVer2.0
### 📝 简介 Rapid Layout 汇集全网开源的版面分析能力,对文档类图像(论文截图、研报等)进行分析,定位其中的**类别与位置**,如标题、段落、表格、图片等版面元素。 **支持场景概览:** 支持表格、中文、英文、论文、研报及通用版面等多种类型,内置 PP 系列、YOLOv8 系列以及推荐的 DocLayout-YOLO 等模型。不同场景版面差异较大,暂无单一模型覆盖所有场景;若业务效果不佳,建议自建训练集微调。完整模型列表与下载见[文档站](https://rapidai.github.io/RapidLayout/)。 如果您觉得本仓库对您有帮助,欢迎给个 ⭐ 支持一下。 ### 🎥 效果展示
### 🛠️ 安装 ```bash pip install rapid-layout onnxruntime ``` ### 📋 使用 ```python from rapid_layout import RapidLayout layout_engine = RapidLayout() img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` 终端运行:`rapid_layout test_images/layout.png` ### 📚 文档 完整文档(安装、使用方式、模型列表、GPU/NPU 配置、参考项目等)请移步:[**Rapid Layout 文档**](https://rapidai.github.io/RapidLayout/) ### 📋 更新日志 版本更新与发布说明请查看:[**Releases**](https://github.com/RapidAI/RapidLayout/releases)。 ### 🙏 致谢 - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - [PaddleOCR 版面分析](https://github.com/PaddlePaddle/PaddleOCR/blob/133d67f27dc8a241d6b2e30a9f047a0fb75bebbe/ppstructure/layout/README_ch.md) - [360LayoutAnalysis](https://github.com/360AILAB-NLP/360LayoutAnalysis) - [ONNX-YOLOv8-Object-Detection](https://github.com/ibaiGorordo/ONNX-YOLOv8-Object-Detection) - [ChineseDocumentPDF](https://github.com/SWHL/ChineseDocumentPDF) ### 🤝 贡献指南 欢迎通过 Issue 反馈问题与建议,或通过 Pull Request 参与代码与文档贡献。完整流程请参阅:[贡献指南](https://rapidai.github.io/RapidLayout/main/contributing/)。 ### 🎖 贡献者

### 📜 引用 若该项目对您的研究有帮助,可考虑引用: ```bibtex @misc{RapidLayout, title={{Rapid Layout}: Document Layout Analysis}, author={RapidAI Team}, howpublished = {\url{https://github.com/RapidAI/RapidLayout}}, year={2024} } ``` ### ⭐️ Star history [![Stargazers over time](https://starchart.cc/RapidAI/RapidLayout.svg?variant=adaptive)](https://starchart.cc/RapidAI/RapidLayout) ### ⚖️ 开源许可证 本项目采用 [Apache 2.0 license](LICENSE) 开源许可证。 ================================================ FILE: cliff.toml ================================================ [changelog] body = """ {% for group, commits in commits | group_by(attribute="group") %} ### {{ group | striptags | trim | upper_first }} {% for commit in commits | filter(attribute="scope") | sort(attribute="scope") %} - **({{commit.scope}})**{% if commit.breaking %} [**breaking**]{% endif %} \ {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }}) {%- endfor -%} {% raw %}\n{% endraw %}\ {%- for commit in commits %} {%- if commit.scope -%} {% else -%} - {% if commit.breaking %} [**breaking**]{% endif %}\ {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }}) {% endif -%} {% endfor -%} {% endfor %} {% if github.contributors | length > 0 %} ### 🎉 Contributors {% for contributor in github.contributors %} - [@{{ contributor.username }}](https://github.com/{{ contributor.username }}) {%- endfor -%} {% endif %} {% if version %} {% if previous.version %}\ **Full Changelog**: [{{ version | trim_start_matches(pat="v") }}]($REPO/compare/{{ previous.version }}..{{ version }}) {% else %}\ **Full Changelog**: [{{ version | trim_start_matches(pat="v") }}] {% endif %}\ {% else %}\ ## [unreleased] {% endif %} """ footer = """ """ # Remove leading and trailing whitespaces from the changelog's body. 
trim = true postprocessors = [ # Replace the placeholder `` with a URL. { pattern = '\$REPO', replace = "https://github.com/RapidAI/RapidLayout" }, # replace repository URL # 去掉每行末尾的无效空格 { pattern = "(?m)[ \t]+$", replace = "" }, # 将连续多个空行压缩为最多一个空行 { pattern = "\n{3,}", replace = "\n\n" }, ] [git] # Parse commits according to the conventional commits specification. # See https://www.conventionalcommits.org conventional_commits = true # Exclude commits that do not match the conventional commits specification. filter_unconventional = true # Split commits on newlines, treating each line as an individual commit. split_commits = false # An array of regex based parsers to modify commit messages prior to further processing. commit_preprocessors = [ # Replace issue numbers with link templates to be updated in `changelog.postprocessors`. #{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](https://github.com/orhun/git-cliff/issues/${2}))"}, ] # An array of regex based parsers for extracting data from the commit message. # Assigns commits to groups. # Optionally sets the commit's scope and can decide to exclude commits from further processing. commit_parsers = [ { message = "^feat", group = "🚀 Features" }, { message = "^fix", group = "🐛 Bug Fixes" }, { message = "^doc", group = "📚 Documentation" }, { message = "^perf", group = "⚡ Performance" }, { message = "^refactor", group = "🚜 Refactor" }, { message = "^style", group = "🎨 Styling" }, { message = "^test", group = "🧪 Testing" }, { message = "^chore\\(release\\): prepare for", skip = true }, { message = "^chore\\(deps.*\\)", skip = true }, { message = "^chore\\(pr\\)", skip = true }, { message = "^chore\\(pull\\)", skip = true }, { message = "^chore|^ci", group = "⚙️ Miscellaneous Tasks" }, { body = ".*security", group = "🛡️ Security" }, { message = "^revert", group = "◀️ Revert" }, { message = ".*", group = "💼 Other" }, ] # Exclude commits that are not matched by any commit parser. 
filter_commits = false # Order releases topologically instead of chronologically. topo_order = false # Order of commits in each group/release within the changelog. # Allowed values: newest, oldest sort_commits = "newest" ================================================ FILE: demo.py ================================================ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com from rapid_layout import EngineType, ModelType, RapidLayout layout_engine = RapidLayout( engine_type=EngineType.ONNXRUNTIME, model_type=ModelType.PP_DOC_LAYOUTV2, ) img_url = "https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/master/resources/test_files/pp_doc_layoutv2_layout.jpg" results = layout_engine(img_url) print(results) results.vis("layout_res.png") ================================================ FILE: docs/blog/.authors.yml ================================================ authors: SWHL: name: SWHL description: Creator avatar: https://avatars.githubusercontent.com/u/28639377?v=4 url: https://swhl.github.io/latest/ ================================================ FILE: docs/blog/.meta.yml ================================================ comments: true hide: - feedback ================================================ FILE: docs/blog/index.md ================================================ # Blog ================================================ FILE: docs/blog/posts/support_pp_doc_layout.md ================================================ --- title: 支持 PP-DocLayoutV2/V3 系列模型 date: created: 2026-02-10 updated: 2026-03-24 authors: [SWHL] slug: support-PP-DocLayoutv2-v3 categories: - General comments: true --- 本篇文章主要记录如何集成 PP-DocLayoutV2/V3 模型的 ### 引言 PP-DocLayout 系列模型在版面分析方面效果很好,目前已经作为 PaddleOCR-VL 系列模型的前置,起着至关重要的作用。 文档智能的关键地方就在于此。因此,想着将该模型纳入 RapidLayout 系列模型中,方便小伙伴们快速使用。 ### 运行环境 - 操作系统:Ubuntu - Python:3.10.14 - 其他依赖环境: ```text linenums="1" paddle2onnx==2.1.0 paddlepaddle==3.3.0 onnx==1.17.0 onnxruntime==1.23.2 ``` ### 转换命令 ```bash paddle2onnx 
--model_dir=models/PP-DocLayoutV2 --model_filename inference.json --params_filename inference.pdiparams --save_file=./models/PP-DocLayoutV2/inference.onnx --enable_onnx_checker=True ``` ### 比较结果 我在 `/xxxx/miniforge3/envs/wjh_debug/lib/python3.10/site-packages/paddlex/inference/models/layout_analysis/predictor.py` 中插入以下代码(在 **L103** 行左右),来保证输入相同,比较输出。 #### PP-DocLayoutV2 ⚠️注意:按照上面直接转换后,在相同输入下,ONNX 模型和 Paddle 模型推理结果误差为 **14.8%**。在我看来,这个误差其实挺大的。 但是从可视化示例图结果来看,两者并无明显区别。可能在某些图上会有较大区别。 ```python linenums="1" title="比较两种格式模型推理结果" # 省略前面代码... ... import onnxruntime import numpy as np model_path = "models/PP-DocLayoutV2/inference.onnx" ort_session = onnxruntime.InferenceSession(model_path) ort_inputs = { "im_shape": batch_inputs[0], "image": batch_inputs[1], "scale_factor": batch_inputs[2], } ort_outputs = ort_session.run(None, ort_inputs) # do infer batch_preds = self.infer(batch_inputs) # 千分位是否相同 np.testing.assert_allclose(batch_preds[0], ort_outputs[0], atol=1e-3, rtol=0) ``` 输出结果如下: ```bash linenums="1" hl_lines="21-23" Traceback (most recent call last): File "/xxxx/paddleocr/test_pp_doc_layoutv2.py", line 4, in output = model.predict( File "/xxxx/lib/python3.10/site-packages/paddleocr/_models/base.py", line 57, in predict result = list(self.predict_iter(*args, **kwargs)) File "/xxxx/lib/python3.10/site-packages/paddlex/inference/models/base/predictor/base_predictor.py", line 281, in __call__ yield from self.apply(input, **kwargs) File "/xxxx/lib/python3.10/site-packages/paddlex/inference/models/base/predictor/base_predictor.py", line 338, in apply prediction = self.process(batch_data, **kwargs) File "/xxxx/lib/python3.10/site-packages/paddlex/inference/models/layout_analysis/predictor.py", line 119, in process np.testing.assert_allclose(batch_preds[0], ort_outputs[0], atol=1e-3, rtol=0) File "/xxxx/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 1504, in assert_allclose assert_array_compare(compare, actual, desired, err_msg=str(err_msg), File 
"/xxxx/lib/python3.10/contextlib.py", line 79, in inner return func(*args, **kwds) File "/xxxx/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 797, in assert_array_compare raise AssertionError(msg) AssertionError: Not equal to tolerance rtol=0, atol=0.001 Mismatched elements: 354 / 2400 (14.8%) Max absolute difference: 196. Max relative difference: 194. x: array([[2.200000e+01, 9.889924e-01, 3.354079e+01, ..., 6.150450e+02, 2.900000e+02, 2.900000e+02], [2.200000e+01, 9.888635e-01, 3.372379e+01, ..., 8.526023e+02,... y: array([[2.200000e+01, 9.889925e-01, 3.354081e+01, ..., 6.150450e+02, 2.900000e+02, 2.900000e+02], [2.200000e+01, 9.888635e-01, 3.372382e+01, ..., 8.526024e+02,... ``` 暂时先用这个 ONNX 模型,该问题已经反馈到了 Paddle2ONNX issue [#1608](https://github.com/PaddlePaddle/Paddle2ONNX/issues/1608#issuecomment-3875561303) #### PP-DocLayoutV3 和 PP-DocLayoutV2 相同环境,相同转换代码,这个模型误差就小很多了,仅有 **1.57%** 了。 ```bash AssertionError: Not equal to tolerance rtol=0, atol=0.001 Mismatched elements: 33 / 2100 (1.57%) Max absolute difference among violations: 1. Max relative difference among violations: 0.01754386 ACTUAL: array([[2.200000e+01, 9.658169e-01, 3.387792e+01, ..., 3.626684e+02, 8.528884e+02, 1.540000e+02], [2.200000e+01, 9.657925e-01, 3.363610e+01, ..., 3.633332e+02,... DESIRED: array([[2.200000e+01, 9.658167e-01, 3.387791e+01, ..., 3.626685e+02, 8.528885e+02, 1.530000e+02], [2.200000e+01, 9.657924e-01, 3.363615e+01, ..., 3.633333e+02,... 
``` ### 剥离推理代码 因为 PaddleOCR 库中需要兼容的推理代码较多,大而全。这也导致了有些臃肿。这是难以避免的。但是如果只看 PP-DocLayout 推理代码的话,很多问题就很简单了。 完整的推理代码,我放到了 Gist 上 → [link](https://gist.github.com/SWHL/c9455e8947f4abdfbbd8439c0bb83410) ### 字典写入 ONNX ```python linenums="1" title="write_dict.py" from pathlib import Path from typing import List, Union import onnx import onnxruntime as ort from onnx import ModelProto class ONNXMetaOp: @classmethod def add_meta( cls, model_path: Union[str, Path], key: str, value: List[str], delimiter: str = "\n", ) -> ModelProto: model = onnx.load_model(model_path) meta = model.metadata_props.add() meta.key = key meta.value = delimiter.join(value) return model @classmethod def get_meta( cls, model_path: Union[str, Path], key: str, split_sym: str = "\n" ) -> List[str]: sess = ort.InferenceSession(model_path) meta_map = sess.get_modelmeta().custom_metadata_map key_content = meta_map.get(key) key_list = key_content.split(split_sym) return key_list @classmethod def del_meta(cls, model_path: Union[str, Path]) -> ModelProto: model = onnx.load_model(model_path) del model.metadata_props[:] return model @classmethod def save_model(cls, save_path: Union[str, Path], model: ModelProto): onnx.save_model(model, save_path) paper_label = [ "abstract", "algorithm", "aside_text", "chart", "content", "display_formula", "doc_title", "figure_title", "footer", "footer_image", "footnote", "formula_number", "header", "header_image", "image", "inline_formula", "number", "paragraph_title", "reference", "reference_content", "seal", "table", "text", "vertical_text", "vision_footnote", ] model_path = "models/inference.onnx" model = ONNXMetaOp.add_meta(model_path, key="character", value=paper_label) new_model_path = "models/pp_doc_layoutv2.onnx" ONNXMetaOp.save_model(new_model_path, model) t = ONNXMetaOp.get_meta(new_model_path, key="character") print(t) ``` 输出以下 `label`,则认为成功: ```bash linenums="1" $ python write_dict.py ['abstract', 'algorithm', 'aside_text', 'chart', 'content', 'display_formula', 
'doc_title', 'figure_title', 'footer', 'footer_image', 'footnote', 'formula_number', 'header', 'header_image', 'image', 'inline_formula', 'number', 'paragraph_title', 'reference', 'reference_content', 'seal', 'table', 'text', 'vertical_text', 'vision_footnote'] ``` PP-DocLayoutV2 和 PP-DocLayoutV3 字典是一样的。 ### 使用 目前 PP-DocLayoutV2 在 `rapid_layout>=1.1.0` 已经支持。PP-DocLayoutV3 在 `rapid_layout>=1.2.0` 中支持。使用示例: ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout layout_engine = RapidLayout( engine_type=EngineType.ONNXRUNTIME, model_type=ModelType.PP_DOC_LAYOUTV2, ) img_url = "https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/master/resources/test_files/pp_doc_layoutv2_layout.jpg" results = layout_engine(img_url) print(results) results.vis("layout_res.png") ``` ================================================ FILE: docs/contributing.md ================================================ --- comments: true title: 贡献指南 hide: - navigation # - toc --- 感谢你对 Rapid Layout 的关注与贡献!本文档说明如何参与项目的代码开发与文档贡献,包括环境准备、开发流程和提交流程。 ## 前置要求 - Python >= 3.6(推荐 3.8+) - Git - 已注册的 GitHub 账号 --- ## 一、克隆源码 从 Rapid Layout 主仓库克隆项目到本地: ```bash git clone https://github.com/RapidAI/RapidLayout.git cd RapidLayout ``` 若网络受限,可先 Fork 到个人账号后再克隆(见后文「准备提交」部分)。 --- ## 二、配置开发环境 建议使用虚拟环境,避免与系统 Python 冲突: ```bash # 使用 venv python -m venv .venv source .venv/bin/activate # Linux/macOS # .venv\Scripts\activate # Windows # 或使用 conda conda create -n rapidlayout python=3.10 conda activate rapidlayout ``` 安装依赖(开发时建议可编辑安装以便本地修改生效): ```bash pip install -r requirements.txt pip install pytest # 运行单元测试需要 pip install -e . 
``` 如需使用 ONNX Runtime 等推理后端,请按 [安装文档](https://rapidai.github.io/RapidLayout/install_usage/installation/) 安装对应依赖。 --- ## 三、安装代码格式化与 pre-commit 钩子 在已激活的虚拟环境中安装 pre-commit,并在 **仓库根目录** 启用 Git 提交前钩子,以便自动做代码格式检查与整理(如 black, autoflake 等): ```bash pip install pre-commit pre-commit install ``` 安装成功后,每次执行 `git commit` 时会自动运行配置好的格式化工具;若检查未通过,提交会被拒绝,请根据提示修改后再次提交。也可在提交前手动跑一遍: ```bash pre-commit run --all-files ``` --- ## 四、运行单元测试 在 **仓库根目录** 下执行: ```bash # 运行全部测试 pytest tests/ -v # 仅运行部分测试文件 pytest tests/test_main.py -v # 查看测试覆盖率(需先安装 pytest-cov) pytest tests/ -v --cov=rapid_layout ``` 确认当前主分支在你本机环境下测试通过,再进行修改。 --- ## 五、复现问题 / 增加新功能 ### 反馈问题与建议 - **Bug 反馈**:在 [Issues](https://github.com/RapidAI/RapidLayout/issues) 中提交 Bug 报告,请尽量包含复现步骤、环境信息与报错信息。 - **功能建议**:在 Issues 中使用 Feature Request 模板描述你的需求或使用场景。 - **文档与示例**:发现文档错误或希望补充示例时,可直接提 Issue 或 PR。 ### 复现 Bug 1. 在 [Issues](https://github.com/RapidAI/RapidLayout/issues) 中选定或创建对应 issue。 2. 根据 issue 描述与报错信息,在本地用仓库代码复现问题。 3. 在 `rapid_layout/` 或 `tests/` 下定位并修改代码,直到问题消失。 ### 增加新功能 1. 与 maintainer 或现有 issue 讨论需求与实现方式(可选但推荐)。 2. 在 `rapid_layout/` 下实现新逻辑,保持与现有代码风格一致(项目使用 [black](https://github.com/psf/black) 等规范)。 3. 新功能应有对应单元测试覆盖。 --- ## 六、编写对应单元测试 - 测试文件放在 **`tests/`** 下,命名建议 `test_*.py`。 - 使用 **pytest** 编写用例,可参考现有 `test_main.py`。 - 测试用图片等资源放在 `tests/test_files/`。 - 新增测试应: - 能稳定复现你要验证的行为(Bug 修复或新功能); - 不依赖未在仓库或文档中说明的外部服务(必要时用 mock 或跳过)。 示例: ```python # tests/test_xxx.py import pytest from pathlib import Path cur_dir = Path(__file__).resolve().parent root_dir = cur_dir.parent test_dir = cur_dir / "test_files" def get_engine(): from rapid_layout import RapidLayout return RapidLayout() def test_your_new_feature(): engine = get_engine() img_path = test_dir / "layout.jpg" result = engine(img_path) assert result is not None # 更多断言... ``` --- ## 七、运行所有单元测试 在 **仓库根目录** 下再次全量跑测,确保无回归: ```bash pytest tests/ -v ``` 若有测试被跳过(如缺少某推理引擎),请确认你修改或新增的测试在现有环境下已执行并通过。 --- ## 八、准备提交到仓库 ### 8.1 Fork Rapid Layout 主仓库到个人账号 1. 
打开 [Rapid Layout 主仓库](https://github.com/RapidAI/RapidLayout)。 2. 点击右上角 **Fork**,将仓库 fork 到你自己的 GitHub 账号下(例如 `https://github.com/你的用户名/RapidLayout`)。 ### 8.2 将代码提交到个人 Fork 若最初是克隆的主仓库,需要添加你的 fork 为远程,并推送到 fork: ```bash # 在项目根目录 RapidLayout 下执行 git remote add myfork https://github.com/你的用户名/RapidLayout.git # 若已有 origin 且就是主仓库,可保留;推送时用 myfork # 创建分支(推荐为每个 issue/功能单独分支) git checkout -b fix/xxx # 或 feat/xxx、docs/xxx # 添加并提交修改 git add . git status # 确认只提交预期文件 git commit -m "fix: 简短描述" # 推送到你的 fork git push myfork fix/xxx ``` **请按约定式提交规范(Conventional Commits)书写 commit 信息**,便于维护者阅读与自动生成 Changelog。格式为: ```text <类型>[可选范围]: <简短描述> [可选正文] [可选脚注] ``` 常用类型示例: | 类型 | 说明 | |------------|------------------------| | `feat` | 新功能 | | `fix` | Bug 修复 | | `docs` | 文档变更 | | `style` | 代码格式(不影响逻辑) | | `refactor` | 重构 | | `test` | 测试相关 | | `chore` | 构建 / 工具等 | 示例:`fix: 修复某条件下版面结果为空`、`feat: 支持 xxx 输入格式`、`docs: 更新安装说明`。 ### 8.3 向 Rapid Layout 主仓库提交 Pull Request(PR) 1. 打开你 fork 后的仓库页面(如 `https://github.com/你的用户名/RapidLayout`)。 2. 若刚推送分支,页面上通常会出现 **Compare & pull request**,点击即可;否则在 **Branches** 里选择你刚推送的分支,再点 **New pull request**。 3. 确认 **base 仓库** 为 `RapidAI/RapidLayout`、**base 分支** 为 `main`(或仓库默认主分支),**head 仓库** 为你的 fork、**head 分支** 为你的分支(如 `fix/xxx`)。 4. 填写 PR 标题和说明: - 标题:简要概括修改内容(如「Fix: 修复 xxx 问题」)。 - 说明中建议包含: - 对应 Issue 编号(若有):`Fixes #123` 或 `Related to #123`。 - 修改原因与主要改动。 - 如何验证:例如「在仓库根目录执行 `pytest tests/ -v` 通过」。 5. 
提交 PR,等待 maintainer 审查;根据反馈再在本地修改并推送同一分支,PR 会自动更新。 --- ## 流程小结 | 步骤 | 说明 | |------|------| | 1 | 克隆 Rapid Layout 源码 | | 2 | 配置虚拟环境并安装依赖与 pytest,可编辑安装 `pip install -e .` | | 3 | 安装 pre-commit(`pip install pre-commit`),在仓库根目录执行 `pre-commit install` | | 4 | 运行单元测试(`pytest tests/ -v`),确认基线通过 | | 5 | 复现问题或实现新功能 | | 6 | 编写 / 补充对应单元测试 | | 7 | 在仓库根目录运行全部测试并确认通过 | | 8 | Fork 主仓库到个人账号 | | 9 | 按约定式提交规范编写 commit,将修改提交并推送到个人 Fork 的对应分支 | | 10 | 在主仓库创建 PR,从个人 Fork 分支指向主仓库 main | --- ## 文档本地预览 修改 `docs/` 下内容后,可使用 MkDocs 本地预览: ```bash pip install mkdocs mkdocs-material mkdocs serve ``` 在浏览器中打开提示的地址(一般为 `http://127.0.0.1:8000`)即可查看效果。 --- ## 其他说明 - **代码风格**:项目采用 [black](https://github.com/psf/black)、autoflake 等规范,已通过 pre-commit 钩子在提交时自动检查;也可在仓库根目录执行 `pre-commit run --all-files` 手动跑一遍。 - **文档**:更多安装与使用说明见 [Rapid Layout 文档](https://rapidai.github.io/RapidLayout/)。 - **问题与讨论**:Bug 与功能建议可通过 [GitHub Issues](https://github.com/RapidAI/RapidLayout/issues) 反馈。 - 提交 Issue 或 PR 时,请使用清晰、简洁的标题与描述,便于维护者处理。若你希望参与长期维护或较大功能开发,欢迎在 Issue 中说明,我们会与你沟通协作方式。 再次感谢你的贡献! ================================================ FILE: docs/doc_whl_rapid_layout.md ================================================ See [link](https://github.com/RapidAI/RapidLayout) for details. ================================================ FILE: docs/hooks/change_copyright.py ================================================
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from datetime import datetime


def on_config(config, **kwargs):
    """MkDocs `on_config` hook: keep the footer copyright year current.

    Overrides the static `copyright` string from mkdocs.yml with one that
    always carries the current calendar year at build time.
    """
    config.copyright = f"Copyright © {datetime.now().year} Maintained by SWHL."
================================================ FILE: docs/hooks/expiry.py ================================================
import re
from datetime import datetime


def on_page_context(context, page, config, nav):
    """MkDocs `on_page_context` hook: flag pages not updated for too long.

    Reads `extra.expiry_days` from mkdocs.yml (default 365) and, using the
    revision date injected by mkdocs-git-revision-date-localized, exposes
    `is_expired`, `last_update` and `expiry_days` to the page template
    (consumed by overrides/partials/expired_notice.html).
    """
    expiry_days = config.get("extra", {}).get("expiry_days", 365)

    def compute_expiry(meta):
        # Prefer the localized revision date; fall back to the creation
        # date, then to a plain `revision_date` if present.
        revision = (
            meta.get("git_revision_date_localized")
            or meta.get("git_creation_date_localized")
            or meta.get("revision_date")
        )
        is_expired = False
        last_update = None
        if revision:
            # Pull the first YYYY-MM-DD date out of the (localized) string.
            m = re.search(r"(\d{4}-\d{2}-\d{2})", str(revision))
            if m:
                last_update = m.group(1)
                # NOTE(review): nesting of this try-block reconstructed from
                # collapsed source; behavior is identical either way because a
                # parse failure is swallowed below.
                try:
                    dt = datetime.strptime(last_update, "%Y-%m-%d")
                    if (datetime.now() - dt).days > expiry_days:
                        is_expired = True
                except Exception:
                    # If the date cannot be parsed, keep the expiry notice hidden.
                    pass
        return is_expired, last_update

    page.is_expired, page.last_update = compute_expiry(page.meta)
    context["is_expired"] = page.is_expired
    context["last_update"] = page.last_update
    context["expiry_days"] = expiry_days
    return context
================================================ FILE: docs/hooks/link.py ================================================
import fnmatch
import re


def on_page_markdown(markdown, page, config, files):
    """
    Replace 'issue #N', 'PR #N' and 'commit <sha>' with GitHub links
    (code blocks and inline code are ignored; only whitelisted pages
    are processed, wildcard patterns supported)
    """
    repo_url = config.get("repo_url", "").rstrip("/")
    if not repo_url:
        return markdown

    # Page whitelist (wildcard patterns supported).
    allowed_pages = config.get("link_pages", [])
    page_src = page.file.src_path  # path relative to docs/
    if allowed_pages:
        matched = any(fnmatch.fnmatch(page_src, pattern) for pattern in allowed_pages)
        if not matched:
            return markdown

    # Stash code blocks and inline code so they are not rewritten.
    placeholders = {}

    def store_placeholder(match):
        key = f"__PLACEHOLDER_{len(placeholders)}__"
        placeholders[key] = match.group(0)
        return key

    # Extract fenced code blocks (```...``` or ~~~...~~~).
    markdown = re.sub(r"```.*?```", store_placeholder, markdown, flags=re.DOTALL)
    markdown = re.sub(r"~~~.*?~~~", store_placeholder, markdown, flags=re.DOTALL)
    # Extract inline code (`...`).
    markdown = re.sub(r"`.*?`", store_placeholder, markdown)

    # --- issue links ---
    # Matches issue#123 / issue: #123 / issue #123
    def issue_replacer(match):
        num = match.group(1)
        return f"issue [#{num}]({repo_url}/issues/{num})"

    markdown = re.sub(r"(?i)issue\s*[:#]?\s*#?(\d+)", issue_replacer, markdown)

    # --- PR links ---
    def pr_replacer(match):
        num = match.group(1)
        return f"PR [#{num}]({repo_url}/pull/{num})"

    markdown = re.sub(r"(?i)PR\s*[:#]?\s*#?(\d+)", pr_replacer, markdown)

    # --- commit links ---
    def commit_replacer(match):
        sha = match.group(1)
        # Display the short form, link to the full sha.
        short_sha = sha[:7]
        return f"commit [{short_sha}]({repo_url}/commit/{sha})"

    markdown = re.sub(r"(?i)commit\s+([0-9a-f]{6,40})", commit_replacer, markdown)

    # Restore the stashed code blocks and inline code.
    for key, value in placeholders.items():
        markdown = markdown.replace(key, value)

    return markdown
================================================ FILE: docs/index.md ================================================
---
comments: true
hide:
  - navigation
  - toc
---

Rapid 📄 Layout

PyPI SemVer2.0
### 简介 该项目主要是汇集全网开源的版面分析的项目,具体来说,就是分析给定的文档类别图像(论文截图、研报等),定位其中类别和位置,如标题、段落、表格和图片等各个部分。 ### TODO - [ ] [PP-DocLayout](https://github.com/PaddlePaddle/PaddleX/blob/release/3.0-rc/docs/module_usage/tutorials/ocr_modules/layout_detection.md) 整理 ================================================ FILE: docs/install_usage/how_to_use_other_engine.md ================================================ --- comments: true hide: # - navigation - toc --- ## 引言 版面分析支持多种推理引擎与设备: - **ONNX Runtime**:默认引擎,支持 CPU / CUDA / DirectML / CANN,需按需安装对应包。 - **OpenVINO**:可选,`pip install openvino` 后通过 `engine_type=EngineType.OPENVINO` 使用。 默认依赖为 CPU 版 `onnxruntime`;使用 GPU 推理需手动安装 `onnxruntime-gpu`。详细使用和评测可参见 [AI Studio](https://aistudio.baidu.com/projectdetail/8094594)。 ## 使用 ONNX Runtime + GPU (CUDA) ```bash pip install rapid_layout # 请确保 onnxruntime-gpu 与当前 GPU/CUDA 版本对应 # 参见 https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements pip install onnxruntime-gpu ``` ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout, RapidLayoutInput cfg = RapidLayoutInput( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, engine_cfg={"use_cuda": True, "cuda_ep_cfg": {"device_id": 0}}, ) layout_engine = RapidLayout(cfg=cfg) img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` 多卡时可通过 `cuda_ep_cfg.device_id` 指定卡号(与 [engine_cfg.yaml](https://github.com/RapidAI/RapidLayout/blob/main/rapid_layout/configs/engine_cfg.yaml) 中 `cuda_ep_cfg.device_id` 一致)。 ## 使用 NPU (CANN) 详细配置参数参见:[engine_cfg.yaml](https://github.com/RapidAI/RapidLayout/blob/main/rapid_layout/configs/engine_cfg.yaml) ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout, RapidLayoutInput cfg = RapidLayoutInput( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, 
engine_cfg={"use_cann": True, "cann_ep_cfg": {"device_id": 0}}, ) layout_engine = RapidLayout(cfg=cfg) img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` ## 使用 OpenVINO ```bash pip install rapid-layout onnxruntime openvino ``` ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout layout_engine = RapidLayout( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.OPENVINO, ) img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` OpenVINO 设备与线程等配置见 [engine_cfg.yaml](https://github.com/RapidAI/RapidLayout/blob/main/rapid_layout/configs/engine_cfg.yaml) 中 `openvino` 段。 ================================================ FILE: docs/install_usage/installation.md ================================================ --- comments: true hide: # - navigation - toc --- 由于模型较小,预先将中文版面分析模型(`layout_cdla.onnx`)打包进了 whl 包内,若仅做中文版面分析,可直接安装使用: ```bash pip install rapid-layout onnxruntime ``` ================================================ FILE: docs/install_usage/usage.md ================================================ --- comments: true hide: # - navigation - toc --- ## Python 脚本运行 **默认用法**(默认模型 `pp_layout_cdla` + `onnxruntime` 引擎): ```python from rapid_layout import RapidLayout layout_engine = RapidLayout() img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` **指定模型与引擎**(关键字参数): ```python from rapid_layout import EngineType, ModelType, RapidLayout layout_engine = RapidLayout( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, conf_thresh=0.5, 
iou_thresh=0.5, ) results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` **使用配置对象**(与上方等价): ```python from rapid_layout import EngineType, ModelType, RapidLayout, RapidLayoutInput cfg = RapidLayoutInput( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, conf_thresh=0.5, iou_thresh=0.5, ) layout_engine = RapidLayout(cfg=cfg) results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` ## 终端运行 ```bash rapid_layout test_images/layout.png rapid_layout test_images/layout.png -m pp_layout_cdla --conf_thresh 0.5 --iou_thresh 0.5 ``` ## 构造函数参数(RapidLayout / RapidLayoutInput) | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| | `model_type` | ModelType / str | `pp_layout_cdla` | 模型类型 | | `model_dir_or_path` | str / Path / None | None | 模型路径,不传则按 model_type 解析 | | `engine_type` | EngineType / str | `onnxruntime` | 推理引擎:`onnxruntime`、`openvino` | | `engine_cfg` | dict | `{}` | 引擎额外配置 | | `conf_thresh` | float | 0.5 | 框置信度阈值 [0, 1] | | `iou_thresh` | float | 0.5 | IoU 阈值 [0, 1] | ## 可视化结果
================================================ FILE: docs/models.md ================================================ --- comments: true title: 模型列表 hide: - navigation - toc --- !!! tip 由于不同场景下的版面差异较大,现阶段不存在一个模型可以搞定所有场景。如果实际业务需要,以下模型效果不好的话,建议构建自己的训练集微调。 | `model_type` | 版面类型 | 支持类别 | | :------ | :----- | :----- | |`pp_doc_layoutv3 (rapid_layout>=1.2.0)`|文档|`['abstract', 'algorithm', 'aside_text', 'chart', 'content', 'display_formula', 'doc_title', 'figure_title', 'footer', 'footer_image', 'footnote', 'formula_number', 'header', 'header_image', 'image', 'inline_formula', 'number', 'paragraph_title', 'reference', 'reference_content', 'seal', 'table', 'text', 'vertical_text', 'vision_footnote']`| |`pp_doc_layoutv2 (rapid_layout>=1.1.0)`|文档|`['abstract', 'algorithm', 'aside_text', 'chart', 'content', 'display_formula', 'doc_title', 'figure_title', 'footer', 'footer_image', 'footnote', 'formula_number', 'header', 'header_image', 'image', 'inline_formula', 'number', 'paragraph_title', 'reference', 'reference_content', 'seal', 'table', 'text', 'vertical_text', 'vision_footnote']`| |||| | `pp_layout_table` | 表格 | `["table"]` | | `pp_layout_publaynet` | 英文 | `["text", "title", "list", "table", "figure"]` | | `pp_layout_cdla` | 中文 | `['text', 'title', 'figure', 'figure_caption', 'table', 'table_caption', 'header', 'footer', 'reference', 'equation']` | |||| | `yolov8n_layout_paper` | 论文 | `['Text', 'Title', 'Header', 'Footer', 'Figure', 'Table', 'Toc', 'Figure caption', 'Table caption']` | | `yolov8n_layout_report` | 研报 | `['Text', 'Title', 'Header', 'Footer', 'Figure', 'Table', 'Toc', 'Figure caption', 'Table caption']` | | `yolov8n_layout_publaynet` | 英文 | `["Text", "Title", "List", "Table", "Figure"]` | | `yolov8n_layout_general6` | 通用 | `["Text", "Title", "Figure", "Table", "Caption", "Equation"]` | |||| | `doclayout_docstructbench` | 通用 | `['title', 'plain text', 'abandon', 'figure', 'figure_caption', 'table', 'table_caption', 'table_footnote', 'isolate_formula', 
'formula_caption']` | | `doclayout_d4la` | 通用 | `['DocTitle', 'ParaTitle', 'ParaText', 'ListText', 'RegionTitle', 'Date', 'LetterHead', 'LetterDear', 'LetterSign', 'Question', 'OtherText', 'RegionKV', 'RegionList', 'Abstract', 'Author', 'TableName', 'Table', 'Figure', 'FigureName', 'Equation', 'Reference', 'Footer', 'PageHeader', 'PageFooter', 'Number', 'Catalog', 'PageNumber']` | | `doclayout_docsynth` | 通用 | `['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title']` |
## 模型来源
**🔥 PP-DocLayoutV3**: [PP-DocLayoutV3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3)
**🔥 PP-DocLayoutV2**: [PP-DocLayoutV2](https://huggingface.co/PaddlePaddle/PP-DocLayoutV2)
**PP 模型**:[PaddleOCR 版面分析](https://github.com/PaddlePaddle/PaddleOCR/blob/133d67f27dc8a241d6b2e30a9f047a0fb75bebbe/ppstructure/layout/README_ch.md)
**yolov8n 系列**:[360LayoutAnalysis](https://github.com/360AILAB-NLP/360LayoutAnalysis)
**doclayout_yolo(推荐)**:[DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO),目前较为优秀的开源版面分析模型,提供基于不同训练集的三个模型:
- `doclayout_docstructbench`:[Hugging Face](https://huggingface.co/juliozhao/DocLayout-YOLO-DocStructBench/tree/main)
- `doclayout_d4la`:[Hugging Face](https://huggingface.co/juliozhao/DocLayout-YOLO-D4LA-Docsynth300K_pretrained/blob/main/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.pt)
- `doclayout_docsynth`:[Hugging Face](https://huggingface.co/juliozhao/DocLayout-YOLO-DocLayNet-Docsynth300K_pretrained/tree/main)
## 模型下载
模型均已经托管在 [魔搭平台](https://www.modelscope.cn/models/RapidAI/RapidLayout/files)。 ================================================ FILE: docs/quickstart.md ================================================ --- comments: true title: 快速开始 hide: - navigation - toc --- ## 安装 ```bash pip install rapid-layout onnxruntime ``` 如需使用 OpenVINO 引擎,请额外安装:`pip install openvino`。 ## 运行 === "Python 脚本(默认)" 不传参数时使用默认模型 `pp_layout_cdla` 与 `onnxruntime` 引擎: ```python linenums="1" from
rapid_layout import RapidLayout layout_engine = RapidLayout() img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` === "Python 脚本(指定模型与引擎)" 通过关键字参数指定 `model_type`、`engine_type`、`conf_thresh` 等: ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout layout_engine = RapidLayout( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, conf_thresh=0.5, ) img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` === "Python 脚本(使用配置对象)" ```python linenums="1" from rapid_layout import EngineType, ModelType, RapidLayout, RapidLayoutInput cfg = RapidLayoutInput( model_type=ModelType.PP_LAYOUT_CDLA, engine_type=EngineType.ONNXRUNTIME, ) layout_engine = RapidLayout(cfg=cfg) img_path = "https://raw.githubusercontent.com/RapidAI/RapidLayout/718b60e927ab893c2fad67c98f753b2105a6f421/tests/test_files/layout.jpg" results = layout_engine(img_path) print(results) results.vis("layout_res.png") ``` === "终端运行" ```bash linenums="1" rapid_layout test_images/layout.png rapid_layout test_images/layout.png -m pp_layout_cdla --conf_thresh 0.5 ``` ## 构造函数参数说明 `RapidLayout(cfg=None, **kwargs)` 支持以下关键字参数(与 `RapidLayoutInput` 一致): | 参数 | 类型 | 默认值 | 说明 | |------|------|--------|------| | `model_type` | `ModelType` 或 str | `pp_layout_cdla` | 模型类型,见 [模型列表](models.md) | | `model_dir_or_path` | str / Path / None | None | 模型路径,不传则按 `model_type` 自动解析 | | `engine_type` | `EngineType` 或 str | `onnxruntime` | 推理引擎:`onnxruntime`、`openvino` | | `engine_cfg` | dict | `{}` | 引擎额外配置,见 [engine_cfg.yaml](https://github.com/RapidAI/RapidLayout/blob/main/rapid_layout/configs/engine_cfg.yaml) | | `conf_thresh` | float | 0.5 | 框置信度阈值 [0, 1] | | 
`iou_thresh` | float | 0.5 | IoU 阈值 [0, 1] | 传入 `cfg` 时,`kwargs` 会覆盖同名字段。 ## 可视化结果
================================================ FILE: docs/stylesheets/extra.css ================================================ :root { --admonition-border-left-width: 0.2rem; --base-border-radius: 1rem; --md-text-font: "LXGW WenKai Screen"; --md-code-font: "consolas, 'Courier New', monospace"; } /*亮色样式*/ [data-md-color-scheme="default"] { --md-primary-fg-color: rgba(255, 255, 255, 0.7); --md-header-bg-color: rgba(255, 255, 255, 0.7); --md-typeset-a-color: steelblue; --md-footer-bg-color: #f6f6f6; --md-footer-bg-color--dark: #f6f6f6; --md-footer-fg-color: #222; --md-footer-fg-color--light: #505050; --md-footer-fg-color--lighter: #777777; --md-code-hl-comment-color: #999999; } /*暗色样式*/ [data-md-color-scheme="slate"] { --md-primary-fg-color: rgba(17, 16, 17, 0.7); --md-header-bg-color: rgba(17, 16, 17, 0.7); --md-typeset-a-color: royalblue; --md-footer-bg-color: #101010; --md-footer-bg-color--dark: #101010; --md-code-hl-comment-color: #777777; } /* 卡片圆角与悬浮阴影 */ .md-typeset .grid.cards>ul>li, .md-typeset .md-button, .md-typeset table:not([class]) { border-radius: var(--base-border-radius); } .md-typeset .grid.cards>ul>li:hover { box-shadow: var(--card-hover-shadow); } /* 页脚社交图标高度 */ .md-social__link svg { max-height: 1rem; } /* 搜索框及下拉结果圆角 */ .md-search__form { border-radius: var(--base-border-radius); } [data-md-toggle="search"]:checked~.md-header .md-search__form { border-top-right-radius: var(--base-border-radius); border-top-left-radius: var(--base-border-radius); } [dir="ltr"] .md-search__output { border-bottom-right-radius: var(--base-border-radius); border-bottom-left-radius: var(--base-border-radius); } .banner{ font-family: var(--font-family); } /* 可选:如需恢复代码块、警告框等样式,取消注释即可 */ /* .highlight span.filename { border-bottom: none; border-radius: var(--base-border-radius); display: inline; font-family: var(--md-code-font-family); border-bottom-left-radius: 0; border-bottom-right-radius: 0; margin-bottom: 5px; text-align: center; } .highlight span.filename + pre > 
code, .md-typeset pre > code { border-radius: var(--base-border-radius); border-top-left-radius: 0; } .md-typeset .admonition { border-width: 0px; border-left-width: var(--admonition-border-left-width); } [dir="ltr"] .md-typeset blockquote { border-radius: 0.2rem; border-left-width: var(--admonition-border-left-width); } */ /* 可选:博客相关样式,按需启用 */ /* .md-post--excerpt { background-color: rgba(68,138,255,.1); box-shadow: 0 0 0 1rem rgba(68,138,255,.1); border-radius: var(--base-border-radius); } .md-post--excerpt .md-post__header { justify-content: left; } .md-post--excerpt .md-post__content > h2, .md-post__action { text-align: left; } */ /* 让所有admonition(包括!!! tip)圆角化且更自然 */ .md-typeset .admonition, .md-typeset details { border-radius: 1.5em; box-shadow: 0 2px 12px 0 rgba(60, 60, 60, 0.07); transition: border-radius 0.4s cubic-bezier(.4, 2, .6, 1), box-shadow 0.3s; overflow: hidden; } /*图像圆角*/ img.img1 { border-radius: 25px; } ================================================ FILE: mkdocs.yml ================================================ site_name: RapidLayout 文档 site_url: https://rapidai.github.io/RapidLayout/ site_author: RapidAI site_description: Analysis of Chinese and English document layouts. repo_name: RapidAI/RapidLayout repo_url: https://github.com/RapidAI/RapidLayout copyright: Copyright © 2026 Maintained by RapidAI. 
edit_uri: https://github.com/RapidAI/RapidLayout/blob/main/docs theme: name: material favicon: ./static/logo.svg language: zh custom_dir: overrides features: - announce.dismiss - content.tooltips - content.code.copy - content.tabs.link - content.action.edit # 显示编辑按钮 - content.action.view # 显示查看源码按钮 - content.footnote.tooltips - navigation.expand # 默认打开所有的子节 - navigation.tabs # 顶级索引被作为tab - navigation.tabs.sticky # tab始终可见 - navigation.top # 开启顶部导航栏 - navigation.tracking # 导航栏跟踪 - navigation.footer - navigation.indexes - search.highlight # 搜索高亮 - search.share # 搜索分享 - search.suggest # 搜索建议 - toc.follow # 目录跟踪-页面右侧的小目录 palette: - media: "(prefers-color-scheme)" # 系统主题 toggle: icon: material/theme-light-dark name: 系统主题 - media: "(prefers-color-scheme: light)" # 亮色主题 scheme: default primary: white accent: light blue toggle: icon: material/weather-sunny name: 明亮主题 - media: "(prefers-color-scheme: dark)" # 暗色主题 scheme: slate primary: black accent: yellow toggle: icon: material/weather-night name: 暗黑主题 icon: logo: material/file-document-multiple previous: fontawesome/solid/angle-left next: fontawesome/solid/angle-right repo: fontawesome/brands/github edit: material/pencil view: material/eye tag: default-tag: fontawesome/solid/tag hardware-tag: fontawesome/solid/microchip software-tag: fontawesome/solid/laptop-code plugins: - blog: archive: false post_date_format: short blog_toc: true categories_toc: true pagination: false - search: separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' - git-committers: repository: RapidAI/RapidLayout branch: main token: !!python/object/apply:os.getenv ["MKDOCS_GIT_COMMITTERS_APIKEY"] - git-revision-date-localized: enable_creation_date: true timezone: Asia/Shanghai type: iso_datetime hooks: - docs/hooks/expiry.py - docs/hooks/change_copyright.py - docs/hooks/link.py extra_css: - stylesheets/extra.css - https://cdn.jsdelivr.net/npm/lxgw-wenkai-screen-web/style.css extra: social: - icon: fontawesome/brands/github 
link: https://github.com/RapidAI - icon: fontawesome/brands/weixin link: https://raw.githubusercontent.com/RapidAI/.github/main/assets/RapidAI_poster_compose.png - icon: fontawesome/brands/python link: https://pypi.org/project/rapid-layout/ version: provider: mike expiry_days: 180 markdown_extensions: - abbr - attr_list - pymdownx.snippets - pymdownx.critic - pymdownx.caret - pymdownx.keys - pymdownx.mark - pymdownx.tilde - pymdownx.details - footnotes - def_list - md_in_html - tables - pymdownx.tasklist: custom_checkbox: true - toc: permalink: true - pymdownx.betterem: smart_enable: all - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - pymdownx.highlight: anchor_linenums: true line_spans: __span pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.tabbed: alternate_style: true - admonition nav: - 概览: index.md - 快速开始: quickstart.md - 模型列表: models.md - 安装及使用: - 安装: install_usage/installation.md - 使用: install_usage/usage.md - 使用其他推理引擎: install_usage/how_to_use_other_engine.md - 贡献指南: contributing.md - 博客: - blog/index.md ================================================ FILE: overrides/404.html ================================================ {% extends "main.html" %} {% block content %}

页面未找到

抱歉,您访问的页面 {{ page.url | url }} 不存在。

可能的原因:

  • 链接已失效或拼写错误
  • 该页面已在当前版本中被移除
  • 您正在访问旧版文档中的链接

接下来您可以:

注:该页面返回 HTTP 404 状态,搜索引擎将不会索引此地址。

{% endblock %} ================================================ FILE: overrides/main.html ================================================ {% extends "base.html" %} {% block announce %} {% endblock %} {% block outdated %} You're not viewing the latest version. Click here to go to latest. {% endblock %} ================================================ FILE: overrides/partials/comments.html ================================================ {% if page.meta.comments %}

{{ lang.t("meta.comments") }}

{% endif %} ================================================ FILE: overrides/partials/content.html ================================================ {% include "partials/tags.html" %} {% include "partials/actions.html" %} {% if "\u003ch1" not in page.content %}

{{ page.title | d(config.site_name, true)}}

{% endif %} {% include "partials/expired_notice.html" %} {{ page.content }} {% include "partials/source-file.html" %} {% include "partials/feedback.html" %} {% include "partials/comments.html" %} ================================================ FILE: overrides/partials/expired_notice.html ================================================ {% if is_expired %}

Warning

本文档最后更新于 {{ last_update }}, 已超过 {{ expiry_days }} 天未更新,内容可能已经过时,阅读注意甄别。

{% endif %} ================================================ FILE: rapid_layout/__init__.py ================================================
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
"""Public package API: re-exports the engine class and its config types."""
from .main import RapidLayout
from .utils.typings import EngineType, ModelType, RapidLayoutInput

# Names exported by `from rapid_layout import *`.
__all__ = ['RapidLayout', 'EngineType', 'ModelType', 'RapidLayoutInput']
================================================ FILE: rapid_layout/configs/__init__.py ================================================
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
================================================ FILE: rapid_layout/configs/default_models.yaml ================================================
pp_doc_layoutv3:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/pp_doc_layout/pp_doc_layoutv3.onnx
  SHA256: 250dbad1dfb9e4983fab75e1bf5085cd56ec3f41d5c7d0f8623ec74856e7aa67
pp_doc_layoutv2:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/pp_doc_layout/pp_doc_layoutv2.onnx
  SHA256: 0bd2ea0997fe0789f0300292291f8bbf897d890b44a9a3bd5be72afd6198aa90
pp_layout_cdla:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/pp_layout/layout_cdla.onnx
  SHA256: 25b1f27ec56aa932a48f30cbd6293c358a156280f4b20b0a973bab210c39f62c
pp_layout_publaynet:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/pp_layout/layout_publaynet.onnx
  SHA256: 958aa6dcef1cc1a542d0a513b5976a3d5edbcc37d76460ec1e9f126358e4d100
pp_layout_table:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/pp_layout/layout_table.onnx
  SHA256: 5b07ba6df1d1889bed2877c9d7501235c6fb6e2212aca8f2f56f4b1b8d0e37b5
yolov8n_layout_paper:
  model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/360/yolov8n_layout_paper.onnx
  SHA256: 
bc074c8d8fbe89e5d90c3e21b7e3b52f279c70fe210ae96d73b74141df64347c yolov8n_layout_report: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/360/yolov8n_layout_report.onnx SHA256: 9d5ada6a69b5825eb255da2b82d2c8d11636a0adae801074d88892527b535980 yolov8n_layout_publaynet: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/360/yolov8n_layout_publaynet.onnx SHA256: 5304bf18e538312a1bd211eb2ad3283524dff956e5cbffcefb3ad294c6e3cba6 yolov8n_layout_general6: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/360/yolov8n_layout_general6.onnx SHA256: 927b6edcb268e896e6a170f7d78980591b408e04b3908f54d58eb69efd018c95 doclayout_docstructbench: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/doclayout/doclayout_yolo_docstructbench_imgsz1024.onnx SHA256: 3b452baef10ecabd615491bc82cc4d49475fbc2cd7a8e535044f2c6bb28fb9fe doclayout_d4la: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/doclayout/doclayout_yolo_d4la_imgsz1600_docsynth_pretrain.onnx SHA256: 1c81715d45d5bee2e6b644f92563a9eaa5cb4cad3d4293f890f99c0862937e69 doclayout_docsynth: model_dir_or_path: https://www.modelscope.cn/models/RapidAI/RapidLayout/resolve/v1.2.0/onnx/doclayout/doclayout_yolo_doclaynet_imgsz1120_docsynth_pretrain.onnx SHA256: 527e60cefc2801dec727dddbfa3a2bd225876a8c5505461c9d3b1193f50a7c84 ================================================ FILE: rapid_layout/configs/engine_cfg.yaml ================================================ onnxruntime: intra_op_num_threads: -1 inter_op_num_threads: -1 enable_cpu_mem_arena: false cpu_ep_cfg: arena_extend_strategy: "kSameAsRequested" use_cuda: false cuda_ep_cfg: device_id: 0 arena_extend_strategy: "kNextPowerOfTwo" cudnn_conv_algo_search: "EXHAUSTIVE" do_copy_in_default_stream: true use_dml: false dm_ep_cfg: null use_cann: false cann_ep_cfg: device_id: 0 
class InferSession(ABC):
    """Abstract base class shared by all inference backends.

    Holds the package-level model registry (default_models.yaml), the
    per-engine configuration (engine_cfg.yaml), and the default directory
    where downloaded models are stored.
    """

    cur_dir = Path(__file__).resolve().parent.parent
    MODEL_URL_PATH = cur_dir / "configs" / "default_models.yaml"
    ENGINE_CFG_PATH = cur_dir / "configs" / "engine_cfg.yaml"
    model_info = OmegaConf.load(MODEL_URL_PATH)
    DEFAULT_MODEL_PATH = cur_dir / "models"
    engine_cfg = OmegaConf.load(ENGINE_CFG_PATH)

    @abstractmethod
    def __init__(self, config):
        pass

    @abstractmethod
    def __call__(self, input_content: np.ndarray) -> np.ndarray:
        pass

    @staticmethod
    def _verify_model(model_path: Union[str, Path, None]):
        """Raise unless ``model_path`` points at an existing regular file."""
        if model_path is None:
            raise ValueError("model_path is None!")

        path = Path(model_path)
        if not path.exists():
            raise FileNotFoundError(f"{path} does not exists.")
        if not path.is_file():
            raise FileExistsError(f"{path} is not a file.")

    @abstractmethod
    def have_key(self, key: str = "character") -> bool:
        pass

    @property
    def characters(self):
        # Convenience accessor over the backend-specific metadata lookup.
        return self.get_character_list()

    @abstractmethod
    def get_character_list(self, key: str = "character") -> List[str]:
        pass

    @staticmethod
    def update_params(cfg: DictConfig, params: Dict[str, Any]) -> DictConfig:
        """Apply ``params`` overrides onto ``cfg`` (dotted keys supported) and return it."""
        for key, value in params.items():
            OmegaConf.update(cfg, key, value)
        return cfg


def get_engine(engine_type: EngineType):
    """Return the InferSession subclass for ``engine_type``, importing it lazily.

    Raises ImportError when the backing package is missing and ValueError for
    an unknown engine.
    """
    logger.info("Using engine_name: %s", engine_type.value)

    if engine_type == EngineType.ONNXRUNTIME:
        if not import_package(engine_type.value):
            raise ImportError(f"{engine_type.value} is not installed.")
        from .onnxruntime import OrtInferSession

        return OrtInferSession

    if engine_type == EngineType.OPENVINO:
        if not import_package(engine_type.value):
            raise ImportError(f"{engine_type.value} is not installed.")
        from .openvino import OpenVINOInferSession

        return OpenVINOInferSession

    raise ValueError(f"Unsupported engine: {engine_type.value}")
class OrtInferSession(InferSession):
    """ONNXRuntime-backed inference session.

    Resolves the model path (downloading by model_type when no explicit path
    is given), builds SessionOptions and the execution-provider list from the
    engine config, then wraps ``InferenceSession.run``.
    """

    def __init__(self, cfg: RapidLayoutInput):
        self.logger = Logger(logger_name=__name__).get_log()

        if cfg.model_dir_or_path is None:
            model_path = ModelProcessor.get_model_path(cfg.model_type)
        else:
            model_path = Path(cfg.model_dir_or_path)
        self._verify_model(model_path)
        self.logger.info(f"Using {model_path}")

        engine_cfg = self.update_params(
            self.engine_cfg[cfg.engine_type.value], cfg.engine_cfg
        )
        sess_opt = self._init_sess_opts(engine_cfg)
        provider_cfg = ProviderConfig(engine_cfg=engine_cfg)
        self.session = InferenceSession(
            model_path,
            sess_options=sess_opt,
            providers=provider_cfg.get_ep_list(),
        )
        provider_cfg.verify_providers(self.session.get_providers())

    @staticmethod
    def _init_sess_opts(cfg: DictConfig) -> SessionOptions:
        """Build SessionOptions from config; thread counts of -1 keep ORT defaults."""
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = cfg.enable_cpu_mem_arena
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        # BUG FIX: os.cpu_count() may return None on some platforms, which made
        # the chained comparison below raise TypeError. Fall back to 1.
        cpu_nums = os.cpu_count() or 1

        intra_op_num_threads = cfg.get("intra_op_num_threads", -1)
        if intra_op_num_threads != -1 and 1 <= intra_op_num_threads <= cpu_nums:
            sess_opt.intra_op_num_threads = intra_op_num_threads

        inter_op_num_threads = cfg.get("inter_op_num_threads", -1)
        if inter_op_num_threads != -1 and 1 <= inter_op_num_threads <= cpu_nums:
            sess_opt.inter_op_num_threads = inter_op_num_threads

        return sess_opt

    def __call__(self, input_content: np.ndarray) -> Any:
        """Run the model; accepts a single array or a list of arrays."""
        if isinstance(input_content, list):
            input_dict = dict(zip(self.get_input_names(), input_content))
        else:
            input_dict = dict(zip(self.get_input_names(), [input_content]))

        try:
            return self.session.run(self.get_output_names(), input_dict)
        except Exception as e:
            error_info = traceback.format_exc()
            raise ONNXRuntimeError(error_info) from e

    def get_input_names(self) -> List[str]:
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self) -> List[str]:
        return [v.name for v in self.session.get_outputs()]

    @property
    def characters(self):
        return self.get_character_list()

    def get_character_list(self, key: str = "character") -> List[str]:
        """Read a newline-separated label list from the model's custom metadata."""
        meta_dict = self.session.get_modelmeta().custom_metadata_map
        return meta_dict[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        """Whether the model's custom metadata contains ``key``."""
        meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in meta_dict.keys():
            return True
        return False


class ONNXRuntimeError(Exception):
    pass
class EP(Enum):
    """Canonical ONNXRuntime execution-provider names."""

    CPU_EP = "CPUExecutionProvider"
    CUDA_EP = "CUDAExecutionProvider"
    DIRECTML_EP = "DmlExecutionProvider"
    CANN_EP = "CANNExecutionProvider"


class ProviderConfig:
    """Select and validate ONNXRuntime execution providers from engine config."""

    def __init__(self, engine_cfg: DictConfig):
        self.logger = Logger(logger_name=__name__).get_log()

        self.had_providers: List[str] = get_available_providers()
        self.default_provider = self.had_providers[0]

        self.cfg_use_cuda = engine_cfg.get("use_cuda", False)
        self.cfg_use_dml = engine_cfg.get("use_dml", False)
        self.cfg_use_cann = engine_cfg.get("use_cann", False)

        self.cfg = engine_cfg

    def get_ep_list(self) -> List[Tuple[str, Dict[str, Any]]]:
        """Build the provider list, most-preferred first; CPU is always the fallback."""
        results = [(EP.CPU_EP.value, self.cpu_ep_cfg())]

        if self.is_cuda_available():
            results.insert(0, (EP.CUDA_EP.value, self.cuda_ep_cfg()))

        if self.is_dml_available():
            self.logger.info(
                "Windows 10 or above detected, try to use DirectML as primary provider"
            )
            results.insert(0, (EP.DIRECTML_EP.value, self.dml_ep_cfg()))

        if self.is_cann_available():
            self.logger.info("Try to use CANNExecutionProvider to infer")
            results.insert(0, (EP.CANN_EP.value, self.cann_ep_cfg()))

        return results

    def cpu_ep_cfg(self) -> Dict[str, Any]:
        return dict(self.cfg.cpu_ep_cfg)

    def cuda_ep_cfg(self) -> Dict[str, Any]:
        return dict(self.cfg.cuda_ep_cfg)

    def dml_ep_cfg(self) -> Dict[str, Any]:
        # Fall back to the CUDA (then CPU) options when no explicit DML config.
        if self.cfg.dm_ep_cfg is not None:
            return self.cfg.dm_ep_cfg

        if self.is_cuda_available():
            return self.cuda_ep_cfg()
        return self.cpu_ep_cfg()

    def cann_ep_cfg(self) -> Dict[str, Any]:
        return dict(self.cfg.cann_ep_cfg)

    def verify_providers(self, session_providers: Sequence[str]):
        """Warn when a requested accelerator EP was not actually selected by ORT."""
        if not session_providers:
            raise ValueError("Session Providers is empty")

        first_provider = session_providers[0]
        providers_to_check = {
            EP.CUDA_EP: self.is_cuda_available,
            EP.DIRECTML_EP: self.is_dml_available,
            EP.CANN_EP: self.is_cann_available,
        }
        for ep, check_func in providers_to_check.items():
            if check_func() and first_provider != ep.value:
                self.logger.warning(
                    f"{ep.value} is available, but the inference part is automatically shifted to be executed under {first_provider}. "
                )
                self.logger.warning(f"The available lists are {session_providers}")

    def is_cuda_available(self) -> bool:
        if not self.cfg_use_cuda:
            return False

        CUDA_EP = EP.CUDA_EP.value
        if get_device() == "GPU" and CUDA_EP in self.had_providers:
            return True

        self.logger.warning(
            f"{CUDA_EP} is not in available providers ({self.had_providers}). Use {self.default_provider} inference by default."
        )
        # BUG FIX: a missing comma after the first f-string made Python
        # implicitly concatenate the first two instructions into one log line.
        install_instructions = [
            f"If you want to use {CUDA_EP} acceleration, you must do:",
            "(For reference only) If you want to use GPU acceleration, you must do:",
            "First, uninstall all onnxruntime packages in current environment.",
            "Second, install onnxruntime-gpu by `pip install onnxruntime-gpu`.",
            "Note the onnxruntime-gpu version must match your cuda and cudnn version.",
            "You can refer this link: https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
            f"Third, ensure {CUDA_EP} is in available providers list. e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider']",
        ]
        self.print_log(install_instructions)
        return False

    def is_dml_available(self) -> bool:
        if not self.cfg_use_dml:
            return False

        cur_os = platform.system()
        if cur_os != "Windows":
            self.logger.warning(
                f"DirectML is only supported in Windows OS. The current OS is {cur_os}. Use {self.default_provider} inference by default.",
            )
            return False

        # DirectML requires Windows 10 build 18362 (1903) or newer.
        window_build_number_str = platform.version().split(".")[-1]
        window_build_number = (
            int(window_build_number_str) if window_build_number_str.isdigit() else 0
        )
        if window_build_number < 18362:
            self.logger.warning(
                f"DirectML is only supported in Windows 10 Build 18362 and above OS. The current Windows Build is {window_build_number}. Use {self.default_provider} inference by default.",
            )
            return False

        DML_EP = EP.DIRECTML_EP.value
        if DML_EP in self.had_providers:
            return True

        self.logger.warning(
            f"{DML_EP} is not in available providers ({self.had_providers}). Use {self.default_provider} inference by default."
        )
        install_instructions = [
            "If you want to use DirectML acceleration, you must do:",
            "First, uninstall all onnxruntime packages in current environment.",
            "Second, install onnxruntime-directml by `pip install onnxruntime-directml`",
            f"Third, ensure {DML_EP} is in available providers list. e.g. ['DmlExecutionProvider', 'CPUExecutionProvider']",
        ]
        self.print_log(install_instructions)
        return False

    def is_cann_available(self) -> bool:
        if not self.cfg_use_cann:
            return False

        CANN_EP = EP.CANN_EP.value
        if CANN_EP in self.had_providers:
            return True

        self.logger.warning(
            f"{CANN_EP} is not in available providers ({self.had_providers}). Use {self.default_provider} inference by default."
        )
        install_instructions = [
            "If you want to use CANN acceleration, you must do:",
            "First, ensure you have installed Huawei Ascend software stack.",
            "Second, install onnxruntime with CANN support by following the instructions at:",
            "\thttps://onnxruntime.ai/docs/execution-providers/community-maintained/CANN-ExecutionProvider.html",
            f"Third, ensure {CANN_EP} is in available providers list. e.g. ['CANNExecutionProvider', 'CPUExecutionProvider']",
        ]
        self.print_log(install_instructions)
        return False

    def print_log(self, log_list: List[str]):
        for log_info in log_list:
            self.logger.info(log_info)
class OpenVINOConfig:
    """Translate engine config entries into OpenVINO compile properties."""

    def __init__(self, engine_cfg: DictConfig):
        self.cfg = engine_cfg

    def get_config(self) -> Dict[str, Any]:
        """Collect the non-default options as an OpenVINO property dict.

        Sentinels mean "keep the OpenVINO default": -1 for integer options,
        None for the rest. All values are passed as strings.
        """
        config: Dict[str, Any] = {}

        def set_if_given(prop: str, name: str) -> None:
            # None sentinel: option was not configured.
            val = self.cfg.get(name, None)
            if val is not None:
                config[prop] = str(val)

        def set_if_not_minus_one(prop: str, name: str) -> None:
            # -1 sentinel: option was not configured.
            val = self.cfg.get(name, -1)
            if val != -1:
                config[prop] = str(val)

        # Thread count is additionally bounded by the machine's CPU count.
        threads = self.cfg.get("inference_num_threads", -1)
        if threads != -1 and 1 <= threads <= os.cpu_count():
            config["INFERENCE_NUM_THREADS"] = str(threads)

        set_if_given("PERFORMANCE_HINT", "performance_hint")
        set_if_not_minus_one("PERFORMANCE_HINT_NUM_REQUESTS", "performance_num_requests")
        set_if_given("ENABLE_CPU_PINNING", "enable_cpu_pinning")
        set_if_not_minus_one("NUM_STREAMS", "num_streams")
        set_if_given("ENABLE_HYPER_THREADING", "enable_hyper_threading")
        set_if_given("SCHEDULING_CORE_TYPE", "scheduling_core_type")

        logger.info(f"Using OpenVINO config: {config}")
        return config
logger.info(f"Using OpenVINO config: {config}") return config ================================================ FILE: rapid_layout/inference_engine/openvino/main.py ================================================ # -*- encoding: utf-8 -*- import traceback from pathlib import Path from typing import Any, List import numpy as np try: from openvino import Core, Tensor except ImportError: from openvino.runtime import Core, Tensor from ...model_handler.utils import ModelProcessor from ...utils.logger import logger from ...utils.typings import RapidLayoutInput from ..base import InferSession from .device_config import OpenVINOConfig class OpenVINOInferSession(InferSession): def __init__(self, cfg: RapidLayoutInput): if cfg.model_dir_or_path is None: model_path = ModelProcessor.get_model_path(cfg.model_type) else: model_path = Path(cfg.model_dir_or_path) self._verify_model(model_path) logger.info(f"Using {model_path}") core = Core() self.model = core.read_model(model=str(model_path)) self.input_tensors = self.model.inputs self.output_tensors = self.model.outputs engine_cfg = self.update_params( self.engine_cfg[cfg.engine_type.value], cfg.engine_cfg ) device = engine_cfg.get("device", "CPU") ov_config = OpenVINOConfig(engine_cfg) core.set_property(device, ov_config.get_config()) self.compiled_model = core.compile_model(self.model, device_name=device) self.infer_request = self.compiled_model.create_infer_request() def __call__(self, input_content: np.ndarray) -> Any: if not isinstance(input_content, list): input_content = [input_content] if len(input_content) != len(self.input_tensors): raise OpenVINOError( f"The number of inputs ({len(input_content)}) does not match the number of model inputs ({len(self.input_tensors)})." 
) try: for input_tensor, input_content in zip(self.input_tensors, input_content): input_tensor_name = input_tensor.get_any_name() self.infer_request.set_tensor(input_tensor_name, Tensor(input_content)) self.infer_request.infer() outputs = [] for output_tensor in self.output_tensors: output_tensor_name = output_tensor.get_any_name() output = self.infer_request.get_tensor(output_tensor_name).data outputs.append(output) return outputs except Exception as e: error_info = traceback.format_exc() raise OpenVINOError(error_info) from e def get_input_names(self) -> List[str]: return [tensor.get_any_name() for tensor in self.model.inputs] def get_output_names(self) -> List[str]: return [tensor.get_any_name() for tensor in self.model.outputs] @property def characters(self): return self.get_character_list() def get_character_list(self, key: str = "character") -> List[str]: framework_info = self.get_rt_info_framework() if framework_info is None: return [] val = framework_info[key] if key in framework_info else None if val is None or not hasattr(val, "value"): return [] value = getattr(val, "value", None) if value is None: return [] return value.splitlines() def have_key(self, key: str = "character") -> bool: try: framework_info = self.get_rt_info_framework() return framework_info is not None and key in framework_info except (AttributeError, TypeError, KeyError): return False def get_rt_info_framework(self): rt_info = self.model.get_rt_info() if "framework" not in rt_info: return None return rt_info["framework"] class OpenVINOError(Exception): pass ================================================ FILE: rapid_layout/main.py ================================================ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com import argparse import dataclasses from typing import List, Optional from .inference_engine.base import get_engine from .model_handler import ModelHandler, ModelProcessor from .utils.load_image import InputType, LoadImage from 
class RapidLayout:
    def __init__(self, cfg: Optional[RapidLayoutInput] = None, **kwargs):
        """Initialize the layout-detection engine.

        Args:
            cfg: optional full configuration; when None, one is built from kwargs.

        Kwargs (same fields as RapidLayoutInput; they override cfg's same-named fields):
            model_type: ModelType or str (e.g. "pp_layout_cdla"); default PP_LAYOUT_CDLA.
            model_dir_or_path: model directory or single-file path, str | Path | None;
                default None (resolved automatically from model_type).
            engine_type: inference engine, EngineType or str
                ("onnxruntime" | "openvino"); default onnxruntime.
            engine_cfg: extra engine configuration dict; default {}.
            conf_thresh: box confidence threshold in [0, 1]; default 0.5.
            iou_thresh: IoU threshold in [0, 1]; default 0.5.
        """
        if cfg is None:
            cfg = RapidLayoutInput(**RapidLayoutInput.normalize_kwargs(kwargs))
        elif kwargs:
            cfg = dataclasses.replace(cfg, **RapidLayoutInput.normalize_kwargs(kwargs))

        if not cfg.model_dir_or_path:
            cfg.model_dir_or_path = ModelProcessor.get_model_path(cfg.model_type)

        self.session = get_engine(cfg.engine_type)(cfg)
        self.model_handler = ModelHandler(cfg, self.session)
        self.load_img = LoadImage()

    def __call__(self, img_content: InputType) -> RapidLayoutOutput:
        img = self.load_img(img_content)
        result = self.model_handler(img)
        return result


def parse_args(arg_list: Optional[List[str]] = None):
    """Parse CLI arguments for the layout demo entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument("img_path", type=str, help="Path to image for layout.")
    parser.add_argument(
        "-m",
        "--model_type",
        type=str,
        default=ModelType.PP_LAYOUT_CDLA.value,
        choices=[v.value for v in ModelType],
        help="Support model type",
    )
    parser.add_argument(
        "--conf_thresh",
        type=float,
        default=0.5,
        help="Box threshold, the range is [0, 1]",
    )
    parser.add_argument(
        "--iou_thresh",
        type=float,
        default=0.5,
        help="IoU threshold, the range is [0, 1]",
    )
    parser.add_argument(
        "-v",
        "--vis",
        action="store_true",
        help="Wheter to visualize the layout results.",
    )
    args = parser.parse_args(arg_list)
    return args


def main(arg_list: Optional[List[str]] = None):
    """CLI entry: run layout detection on one image and optionally save a visualization."""
    from pathlib import Path

    args = parse_args(arg_list)
    input_args = RapidLayoutInput(
        model_type=ModelType(args.model_type),
        iou_thresh=args.iou_thresh,
        conf_thresh=args.conf_thresh,
    )
    layout_engine = RapidLayout(input_args)

    results = layout_engine(args.img_path)
    print(results)

    if args.vis:
        save_path = "layout_vis.jpg"
        if not is_url(args.img_path):
            # BUG FIX: args.img_path is a str (argparse type=str) and has no
            # .resolve(); wrap it in Path before resolving next to the input.
            save_path = Path(args.img_path).resolve().parent / "layout_vis.jpg"
        results.vis(save_path)


if __name__ == "__main__":
    main()
class DocLayoutModelHandler(BaseModelHandler):
    """Handler for DocLayout-YOLO models: preprocess, infer, postprocess."""

    def __init__(self, labels, conf_thres, iou_thres, session: InferSession):
        self.img_size = (1024, 1024)
        # BUG FIX: these were stored as self.preprocess / self.postprocess,
        # shadowing the methods of the same name below — the method bodies were
        # dead code and would recurse infinitely if ever bound. Renamed to
        # match PPModelHandler's pp_preprocess / pp_postprocess convention.
        self.doc_preprocess = DocLayoutPreProcess(img_size=self.img_size)
        self.doc_postprocess = DocLayoutPostProcess(labels, conf_thres, iou_thres)
        self.session = session

    def __call__(self, ori_img: np.ndarray) -> RapidLayoutOutput:
        """Run the full pipeline on one image and time it."""
        s1 = time.perf_counter()
        ori_img_shape = ori_img.shape[:2]

        img = self.preprocess(ori_img)
        preds = self.session(img)
        boxes, scores, class_names = self.postprocess(
            preds, ori_img_shape, self.img_size
        )

        elapse = time.perf_counter() - s1
        return RapidLayoutOutput(
            img=ori_img,
            boxes=boxes,
            class_names=class_names,
            scores=scores,
            elapse=elapse,
        )

    def preprocess(self, image):
        return self.doc_preprocess(image)

    def postprocess(self, preds, ori_img_shape, img_shape):
        return self.doc_postprocess(preds, ori_img_shape, img_shape)
InputType = Union[str, np.ndarray, bytes, Path]


class DocLayoutPreProcess:
    """Letterbox and normalize an image into an NCHW float32 tensor."""

    def __init__(self, img_size: Tuple[int, int]):
        self.img_size = img_size
        self.letterbox = LetterBox(new_shape=img_size, auto=False, stride=32)

    def __call__(self, image: np.ndarray) -> np.ndarray:
        padded = self.letterbox(image=image)
        # Add a batch dim, reverse the channel order (presumably BGR -> RGB,
        # matching cv2-loaded inputs — confirm with LoadImage), move channels
        # first, and scale to [0, 1].
        batched = padded[None, ...]
        chw = batched[..., ::-1].transpose(0, 3, 1, 2)
        chw = np.ascontiguousarray(chw)
        return (chw / 255).astype(np.float32)
class ModelHandler:
    """Dispatch to the model-family handler matching cfg.model_type."""

    def __init__(self, cfg: RapidLayoutInput, session: InferSession):
        self.logger = Logger(logger_name=__name__).get_log()
        self.model_processors = self._init_handler(cfg, session)

    def _init_handler(self, cfg: RapidLayoutInput, session: InferSession) -> Any:
        """Pick the handler by model-type prefix; raises ValueError when unknown."""
        model_type = cfg.model_type.value
        self.logger.info(f"{model_type} contains {session.characters}")

        # Order matters: "pp_doc_layout" must be checked before the generic "pp".
        if model_type.startswith("pp_doc_layout"):
            return PPDocLayoutModelHandler(
                session.characters, cfg.conf_thresh, cfg.iou_thresh, session
            )

        if model_type.startswith("pp"):
            return PPModelHandler(
                session.characters, cfg.conf_thresh, cfg.iou_thresh, session
            )

        if model_type.startswith("yolov8"):
            return YOLOv8ModelHandler(
                session.characters, cfg.conf_thresh, cfg.iou_thresh, session
            )

        if model_type.startswith("doclayout"):
            return DocLayoutModelHandler(
                session.characters, cfg.conf_thresh, cfg.iou_thresh, session
            )

        # BUG FIX: model_type is already the enum's .value (a str); calling
        # .value on it raised AttributeError instead of the intended ValueError.
        raise ValueError(f"{model_type} is not supported!")

    def __call__(self, img: np.ndarray) -> RapidLayoutOutput:
        return self.model_processors(img)
class PPPostProcess:
    """Decode and NMS the raw outputs of the PP (PicoDet-style) layout model.

    The model emits, per FPN level, a classification score map and a box
    distribution map. This class decodes the distributions into pixel boxes,
    applies per-class hard NMS, and maps results back to original image scale.
    """

    def __init__(self, labels, conf_thres=0.4, iou_thres=0.5):
        # Class names indexed by class id.
        self.labels = labels
        # One (score, box) output pair per FPN stride, in this order.
        self.strides = [8, 16, 32, 64]
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        # Max candidates fed into NMS per level / max kept per class.
        self.nms_top_k = 1000
        self.keep_top_k = 100

    def __call__(
        self, ori_shape, img: np.ndarray, preds: List[np.ndarray]
    ) -> Tuple[List[List[float]], List[float], List[str]]:
        """Decode `preds` into parallel (boxes, scores, class_names) lists.

        Args:
            ori_shape: original image (h, w) before preprocessing.
            img: preprocessed NCHW input tensor (used for its shape only).
            preds: flat list of model outputs — the first half are score
                maps, the second half the matching box-distribution maps.
        """
        # Split the flat output list into per-level (score, box) pairs.
        scores, raw_boxes = [], []
        num_outs = int(len(preds) / 2)
        for out_idx in range(num_outs):
            scores.append(preds[out_idx])
            raw_boxes.append(preds[out_idx + num_outs])

        batch_size = raw_boxes[0].shape[0]
        # Each box side is predicted as a discrete distribution over
        # (reg_max + 1) bins (GFL-style regression).
        reg_max = int(raw_boxes[0].shape[-1] / 4 - 1)
        out_boxes_num, out_boxes_list = [], []
        ori_shape, input_shape, scale_factor = self.img_info(ori_shape, img)
        for batch_id in range(batch_size):
            # generate centers
            decode_boxes, select_scores = [], []
            for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
                box_distribute = box_distribute[batch_id]
                score = score[batch_id]
                # centers: one anchor point per feature-map cell, at the cell
                # center, expressed in network-input pixels.
                fm_h = input_shape[0] / stride
                fm_w = input_shape[1] / stride
                h_range = np.arange(fm_h)
                w_range = np.arange(fm_w)
                ww, hh = np.meshgrid(w_range, h_range)
                ct_row = (hh.flatten() + 0.5) * stride
                ct_col = (ww.flatten() + 0.5) * stride
                center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

                # box distribution to distance: expectation over the softmaxed
                # bin distribution, scaled back to pixels by the stride.
                reg_range = np.arange(reg_max + 1)
                box_distance = box_distribute.reshape((-1, reg_max + 1))
                box_distance = self.softmax(box_distance, axis=1)
                box_distance = box_distance * np.expand_dims(reg_range, axis=0)
                box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
                box_distance = box_distance * stride

                # top K candidate cells, ranked by best class score.
                topk_idx = np.argsort(score.max(axis=1))[::-1]
                topk_idx = topk_idx[: self.nms_top_k]
                center = center[topk_idx]
                score = score[topk_idx]
                box_distance = box_distance[topk_idx]

                # decode box: distances are (left, top, right, bottom)
                # offsets from the anchor center.
                decode_box = center + [-1, -1, 1, 1] * box_distance

                select_scores.append(score)
                decode_boxes.append(decode_box)

            # nms across all levels, run independently per class.
            bboxes = np.concatenate(decode_boxes, axis=0)
            confidences = np.concatenate(select_scores, axis=0)
            picked_box_probs, picked_labels = [], []
            for class_index in range(0, confidences.shape[1]):
                probs = confidences[:, class_index]
                mask = probs > self.conf_thres
                probs = probs[mask]
                if probs.shape[0] == 0:
                    continue
                subset_boxes = bboxes[mask, :]
                box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1)
                box_probs = self.hard_nms(
                    box_probs,
                    iou_thres=self.iou_thres,
                    top_k=self.keep_top_k,
                )
                picked_box_probs.append(box_probs)
                picked_labels.extend([class_index] * box_probs.shape[0])

            if len(picked_box_probs) == 0:
                out_boxes_list.append(np.empty((0, 4)))
                out_boxes_num.append(0)
            else:
                picked_box_probs = np.concatenate(picked_box_probs)

                # resize output boxes: clip to the network-input extent, then
                # divide by the per-axis scale to return to original pixels.
                picked_box_probs[:, :4] = self.warp_boxes(
                    picked_box_probs[:, :4], ori_shape[batch_id]
                )
                im_scale = np.concatenate(
                    [scale_factor[batch_id][::-1], scale_factor[batch_id][::-1]]
                )
                picked_box_probs[:, :4] /= im_scale
                # clas score box — rows are [class_id, score, x1, y1, x2, y2].
                out_boxes_list.append(
                    np.concatenate(
                        [
                            np.expand_dims(np.array(picked_labels), axis=-1),
                            np.expand_dims(picked_box_probs[:, 4], axis=-1),
                            picked_box_probs[:, :4],
                        ],
                        axis=1,
                    )
                )
                out_boxes_num.append(len(picked_labels))

        out_boxes_list = np.concatenate(out_boxes_list, axis=0)
        # NOTE(review): out_boxes_num is computed but never returned —
        # kept for parity with the upstream implementation.
        out_boxes_num = np.asarray(out_boxes_num).astype(np.int32)

        # Flatten to parallel python lists for the caller.
        boxes, scores, class_names = [], [], []
        for dt in out_boxes_list:
            clsid, bbox, score = int(dt[0]), dt[2:], dt[1]
            label = self.labels[clsid]
            boxes.append(bbox.tolist())
            scores.append(float(score))
            class_names.append(label)
        return boxes, scores, class_names

    def warp_boxes(self, boxes, ori_shape):
        """Apply transform to boxes"""
        # The affine warp (`xy @ M.T`) is commented out upstream, so with the
        # homogeneous coordinate fixed at 1 this reduces to re-assembling the
        # corner extremes and clipping boxes to `ori_shape` (h, w).
        width, height = ori_shape[1], ori_shape[0]
        n = len(boxes)
        if n:
            # warp points
            xy = np.ones((n * 4, 3))
            xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
                n * 4, 2
            )  # x1y1, x2y2, x1y2, x2y1
            # xy = xy @ M.T  # transform
            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
            # create new boxes from the min/max of the four corners
            x = xy[:, [0, 2, 4, 6]]
            y = xy[:, [1, 3, 5, 7]]
            xy = (
                np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
            )
            # clip boxes
            xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
            xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
            return xy.astype(np.float32)
        return boxes

    def img_info(self, origin_shape, img):
        """Derive shape/scale info from the original and preprocessed image.

        Returns:
            (ori_shape, input_shape, scale_factor) where scale_factor is the
            per-axis (y, x) ratio of network-input size to original size.
        """
        resize_shape = img.shape  # NCHW
        im_scale_y = resize_shape[2] / float(origin_shape[0])
        im_scale_x = resize_shape[3] / float(origin_shape[1])
        scale_factor = np.array([im_scale_y, im_scale_x], dtype=np.float32)
        img_shape = np.array(img.shape[2:], dtype=np.float32)
        input_shape = np.array(img).astype("float32").shape[2:]
        # NOTE(review): despite the name, `ori_shape` holds the RESIZED
        # (network-input) h/w; warp_boxes clips to it and the caller then
        # divides by scale_factor to reach original-image pixels.
        ori_shape = np.array((img_shape,)).astype("float32")
        scale_factor = np.array((scale_factor,)).astype("float32")
        return ori_shape, input_shape, scale_factor

    @staticmethod
    def softmax(x, axis=None):
        """Numerically stable softmax via an inlined log-sum-exp."""

        def logsumexp(a, axis=None, b=None, keepdims=False):
            # Subtract the max for numerical stability before exponentiating.
            a_max = np.amax(a, axis=axis, keepdims=True)
            if a_max.ndim > 0:
                a_max[~np.isfinite(a_max)] = 0
            elif not np.isfinite(a_max):
                a_max = 0
            tmp = np.exp(a - a_max)
            # suppress warnings about log of zero
            with np.errstate(divide="ignore"):
                s = np.sum(tmp, axis=axis, keepdims=keepdims)
                out = np.log(s)
            if not keepdims:
                a_max = np.squeeze(a_max, axis=axis)
            out += a_max
            return out

        return np.exp(x - logsumexp(x, axis=axis, keepdims=True))

    def hard_nms(self, box_scores, iou_thres, top_k=-1, candidate_size=200):
        """
        Args:
            box_scores (N, 5): boxes in corner-form and probabilities.
            iou_thres: intersection over union threshold.
            top_k: keep top_k results. If k <= 0, keep all the results.
            candidate_size: only consider the candidates with the highest scores.
        Returns:
            picked: a list of indexes of the kept boxes
        """
        scores = box_scores[:, -1]
        boxes = box_scores[:, :-1]
        picked = []
        # argsort is ascending, so candidates are taken from the tail
        # (highest scores) and consumed back-to-front.
        indexes = np.argsort(scores)
        indexes = indexes[-candidate_size:]
        while len(indexes) > 0:
            current = indexes[-1]
            picked.append(current)
            if 0 < top_k == len(picked) or len(indexes) == 1:
                break
            current_box = boxes[current, :]
            indexes = indexes[:-1]
            rest_boxes = boxes[indexes, :]
            iou = self.iou_of(
                rest_boxes,
                np.expand_dims(current_box, axis=0),
            )
            # Drop every remaining box that overlaps the picked one too much.
            indexes = indexes[iou <= iou_thres]
        return box_scores[picked, :]

    def iou_of(self, boxes0, boxes1, eps=1e-5):
        """Return intersection-over-union (Jaccard index) of boxes.
        Args:
            boxes0 (N, 4): ground truth boxes.
            boxes1 (N or 1, 4): predicted boxes.
            eps: a small number to avoid 0 as denominator.
        Returns:
            iou (N): IoU values.
        """
        overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2])
        overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:])

        overlap_area = self.area_of(overlap_left_top, overlap_right_bottom)
        area0 = self.area_of(boxes0[..., :2], boxes0[..., 2:])
        area1 = self.area_of(boxes1[..., :2], boxes1[..., 2:])
        return overlap_area / (area0 + area1 - overlap_area + eps)

    @staticmethod
    def area_of(left_top, right_bottom):
        """Compute the areas of rectangles given two corners.
        Args:
            left_top (N, 2): left top corner.
            right_bottom (N, 2): right bottom corner.
        Returns:
            area (N): return the area.
        """
        # Negative extents (empty overlap) are clamped to zero area.
        hw = np.clip(right_bottom - left_top, 0.0, None)
        return hw[..., 0] * hw[..., 1]


# --- rapid_layout/model_handler/pp/pre_process.py ---

# Accepted image input forms used across the pipeline.
InputType = Union[str, np.ndarray, bytes, Path]


class PPPreProcess:
    """Resize, normalize and CHW-pack an image for the PP layout model."""

    def __init__(self, img_size: Tuple[int, int]):
        # (h, w) target resolution.
        self.size = img_size
        # Standard ImageNet-style per-channel normalization constants.
        self.mean = np.array([0.485, 0.456, 0.406])
        self.std = np.array([0.229, 0.224, 0.225])
        self.scale = 1 / 255.0

    def __call__(self, img: Optional[np.ndarray] = None) -> np.ndarray:
        """Return a float32 NCHW tensor of shape (1, 3, h, w)."""
        if img is None:
            raise ValueError("img is None.")

        img = self.resize(img)
        img = self.normalize(img)
        img = self.permute(img)
        # Add the batch dimension.
        img = np.expand_dims(img, axis=0)
        return img.astype(np.float32)

    def resize(self, img: np.ndarray) -> np.ndarray:
        # cv2.resize takes (width, height).
        resize_h, resize_w = self.size
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img

    def normalize(self, img: np.ndarray) -> np.ndarray:
        # Scale to [0, 1] then standardize per channel.
        return (img.astype("float32") * self.scale - self.mean) / self.std

    def permute(self, img: np.ndarray) -> np.ndarray:
        # HWC -> CHW.
        return img.transpose((2, 0, 1))
class PPDocLayoutModelHandler(BaseModelHandler):
    """Handler for the PP-DocLayout detection model.

    Wires together preprocessing, engine inference and the PaddleX-style
    post-processing, returning a RapidLayoutOutput.
    """

    def __init__(
        self,
        labels: List[str],
        conf_thres: float,
        iou_thres: float,
        session: InferSession,
    ):
        # Fixed (h, w) network input size for PP-DocLayout.
        self.img_size = (800, 800)
        self.pp_preprocess = PPDocLayoutPreProcess(img_size=self.img_size)
        self.pp_postprocess = PPDocLayoutPostProcess(labels=labels)
        self.session = session
        self.conf_thres = conf_thres
        # NOTE(review): iou_thres is stored but never forwarded to the
        # post-processor below — confirm whether it should be.
        self.iou_thres = iou_thres

    def __call__(self, ori_img: np.ndarray) -> RapidLayoutOutput:
        """Run the full detection pipeline on one image array."""
        s1 = time.perf_counter()
        ori_data, ort_inputs = self.preprocess(ori_img)
        ort_outputs = self.session(ort_inputs)
        preds_list = self.format_output(ort_outputs)
        boxes, scores, class_names = self.postprocess(
            batch_outputs=preds_list,
            datas=[ori_data],
            threshold=self.conf_thres,
            layout_nms=True,
            layout_shape_mode="auto",
            filter_overlap_boxes=True,
            skip_order_labels=None,
        )
        elapse = time.perf_counter() - s1
        return RapidLayoutOutput(
            img=ori_img,
            boxes=boxes,
            class_names=class_names,
            scores=scores,
            elapse=elapse,
        )

    # Annotation corrected from np.ndarray: __call__ unpacks two values
    # (ori_data, ort_inputs) from this return.
    def preprocess(self, image: np.ndarray) -> tuple:
        return self.pp_preprocess(image)

    def postprocess(self, **kwargs: Any):
        """Delegate box decoding/filtering to PPDocLayoutPostProcess."""
        return self.pp_postprocess(**kwargs)

    @staticmethod
    def format_output(pred):
        """Slice the engine output into the post-processor's input format.

        pred[1][0] holds the number of valid detections and pred[0] the
        stacked detection rows; only the valid prefix is kept.
        """
        box_idx_start = 0
        np_boxes_num = pred[1][0]
        box_idx_end = box_idx_start + np_boxes_num
        np_boxes = pred[0][box_idx_start:box_idx_end]
        return [{"boxes": np.array(np_boxes)}]
# Result container: a list of per-box result dicts.
Boxes = List[dict]
Number = Union[int, float]


class PPDocLayoutPostProcess:
    """Post-processing for PP-DocLayout detection outputs (ported from PaddleX).

    Filters raw detections by score, applies layout NMS, containment-based
    merging, optional box unclipping, and converts rows of
    [cls_id, score, x1, y1, x2, y2(, order info)] into result dicts.
    """

    def __init__(
        self, labels: Optional[List[str]] = None, scale_size: Optional[List[int]] = None
    ) -> None:
        # Class-id -> name mapping used when restructuring boxes.
        self.labels = labels
        # Network input size; only consulted when mask outputs exist.
        self.scale_size = scale_size

    def apply(
        self,
        boxes: ndarray,
        img_size: Tuple[int, int],
        threshold: Union[float, dict],
        layout_nms: Optional[bool],
        layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]],
        layout_merge_bboxes_mode: Optional[Union[str, dict]],
        masks: Optional[ndarray] = None,
        layout_shape_mode: Optional[str] = "auto",
    ) -> Boxes:
        """Apply post-processing to the detection boxes.

        Args:
            boxes (ndarray): The input detection boxes with scores.
            img_size (tuple): The original image size.

        Returns:
            Boxes: The post-processed detection boxes.
        """
        if layout_shape_mode == "rect":
            masks = None
        boxes[:, 2:6] = np.round(boxes[:, 2:6]).astype(int)
        # Score filtering: a single global threshold, or one per category.
        if isinstance(threshold, float):
            expect_boxes = (boxes[:, 1] > threshold) & (boxes[:, 0] > -1)
            boxes = boxes[expect_boxes, :]
            if masks is not None:
                masks = masks[expect_boxes, ...]
        elif isinstance(threshold, dict):
            category_filtered_boxes = []
            if masks is not None:
                category_filtered_masks = []
            for cat_id in np.unique(boxes[:, 0]):
                category_boxes = boxes[boxes[:, 0] == cat_id]
                if masks is not None:
                    category_masks = masks[boxes[:, 0] == cat_id]
                # 0.5 is the fallback threshold for unlisted categories.
                category_threshold = threshold.get(int(cat_id), 0.5)
                selected_indices = (category_boxes[:, 1] > category_threshold) & (
                    category_boxes[:, 0] > -1
                )
                if masks is not None:
                    category_masks = category_masks[selected_indices]
                    category_filtered_masks.append(category_masks)
                category_filtered_boxes.append(category_boxes[selected_indices])
            boxes = (
                np.vstack(category_filtered_boxes)
                if category_filtered_boxes
                else np.array([])
            )
            if masks is not None:
                masks = (
                    np.concatenate(category_filtered_masks)
                    if category_filtered_masks
                    else np.array([])
                )

        if layout_nms:
            # Cross-class NMS with a looser threshold for different classes.
            selected_indices = nms(boxes[:, :6], iou_same=0.6, iou_diff=0.98)
            boxes = np.array(boxes[selected_indices])
            if masks is not None:
                masks = [masks[i] for i in selected_indices]

        filter_large_image = True
        # boxes.shape[1] == 6 is object detection, 7 is new ordered object detection, 8 is ordered object detection
        if filter_large_image and len(boxes) > 1 and boxes.shape[1] in [6, 7, 8]:
            # Drop "image" boxes covering nearly the whole page; the area
            # cutoff depends on page orientation.
            if img_size[0] > img_size[1]:
                area_thres = 0.82
            else:
                area_thres = 0.93
            image_index = self.labels.index("image") if "image" in self.labels else None
            img_area = img_size[0] * img_size[1]
            filtered_boxes = []
            filtered_masks = []
            for idx, box in enumerate(boxes):
                (
                    label_index,
                    score,
                    xmin,
                    ymin,
                    xmax,
                    ymax,
                ) = box[:6]
                if label_index == image_index:
                    xmin = max(0, xmin)
                    ymin = max(0, ymin)
                    xmax = min(img_size[0], xmax)
                    ymax = min(img_size[1], ymax)
                    box_area = (xmax - xmin) * (ymax - ymin)
                    if box_area <= area_thres * img_area:
                        filtered_boxes.append(box)
                        if masks is not None:
                            filtered_masks.append(masks[idx])
                else:
                    filtered_boxes.append(box)
                    if masks is not None:
                        filtered_masks.append(masks[idx])
            # If everything was filtered away, fall back to the unfiltered set.
            if len(filtered_boxes) == 0:
                filtered_boxes = boxes
                if masks is not None:
                    filtered_masks = masks
            boxes = np.array(filtered_boxes)
            if masks is not None:
                masks = filtered_masks

        if layout_merge_bboxes_mode:
            # A formula box is never merged into a non-formula box.
            formula_index = (
                self.labels.index("formula") if "formula" in self.labels else None
            )
            if isinstance(layout_merge_bboxes_mode, str):
                assert layout_merge_bboxes_mode in [
                    "union",
                    "large",
                    "small",
                ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_merge_bboxes_mode}"
                if layout_merge_bboxes_mode == "union":
                    pass
                else:
                    contains_other, contained_by_other = check_containment(
                        boxes[:, :6], formula_index
                    )
                    if layout_merge_bboxes_mode == "large":
                        # Keep only outer boxes.
                        boxes = boxes[contained_by_other == 0]
                        if masks is not None:
                            masks = [
                                mask
                                for i, mask in enumerate(masks)
                                if contained_by_other[i] == 0
                            ]
                    elif layout_merge_bboxes_mode == "small":
                        # Keep inner boxes / boxes not containing others.
                        boxes = boxes[(contains_other == 0) | (contained_by_other == 1)]
                        if masks is not None:
                            masks = [
                                mask
                                for i, mask in enumerate(masks)
                                if (contains_other[i] == 0)
                                | (contained_by_other[i] == 1)
                            ]
            elif isinstance(layout_merge_bboxes_mode, dict):
                # Per-category merge modes; categories combine via one mask.
                keep_mask = np.ones(len(boxes), dtype=bool)
                for category_index, layout_mode in layout_merge_bboxes_mode.items():
                    assert layout_mode in [
                        "union",
                        "large",
                        "small",
                    ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_mode}"
                    if layout_mode == "union":
                        pass
                    else:
                        if layout_mode == "large":
                            contains_other, contained_by_other = check_containment(
                                boxes[:, :6],
                                formula_index,
                                category_index,
                                mode=layout_mode,
                            )
                            # Remove boxes that are contained by other boxes
                            keep_mask &= contained_by_other == 0
                        elif layout_mode == "small":
                            contains_other, contained_by_other = check_containment(
                                boxes[:, :6],
                                formula_index,
                                category_index,
                                mode=layout_mode,
                            )
                            # Keep boxes that do not contain others or are contained by others
                            keep_mask &= (contains_other == 0) | (
                                contained_by_other == 1
                            )
                boxes = boxes[keep_mask]
                if masks is not None:
                    masks = [mask for i, mask in enumerate(masks) if keep_mask[i]]

        if boxes.size == 0:
            return np.array([])

        if boxes.shape[1] == 8:
            # Sort boxes by their order
            sorted_idx = np.lexsort((-boxes[:, 7], boxes[:, 6]))
            sorted_boxes = boxes[sorted_idx]
            boxes = sorted_boxes[:, :6]
            if masks is not None:
                sorted_masks = [masks[i] for i in sorted_idx]
                masks = sorted_masks
        if boxes.shape[1] == 7:
            # Sort boxes by their order
            sorted_idx = np.argsort(boxes[:, 6])
            sorted_boxes = boxes[sorted_idx]
            boxes = sorted_boxes[:, :6]
            if masks is not None:
                sorted_masks = [masks[i] for i in sorted_idx]
                masks = sorted_masks

        polygon_points = None
        if masks is not None:
            # Scale from network-input space back to the image for each axis.
            scale_ratio = [h / s for h, s in zip(self.scale_size, img_size)]
            polygon_points = extract_polygon_points_by_masks(
                boxes, np.array(masks), scale_ratio, layout_shape_mode
            )

        if layout_unclip_ratio:
            if isinstance(layout_unclip_ratio, float):
                layout_unclip_ratio = (layout_unclip_ratio, layout_unclip_ratio)
            elif isinstance(layout_unclip_ratio, (tuple, list)):
                assert (
                    len(layout_unclip_ratio) == 2
                ), f"The length of `layout_unclip_ratio` should be 2."
            elif isinstance(layout_unclip_ratio, dict):
                pass
            else:
                raise ValueError(
                    f"The type of `layout_unclip_ratio` must be float, Tuple[float, float] or Dict[int, Tuple[float, float]], but got {type(layout_unclip_ratio)}."
                )
            boxes = unclip_boxes(boxes, layout_unclip_ratio)

        if boxes.shape[1] == 6:
            """For Normal Object Detection"""
            boxes = restructured_boxes(boxes, self.labels, img_size, polygon_points)
        else:
            """Unexpected Input Box Shape"""
            raise ValueError(
                f"The shape of boxes should be 6 or 10, instead of {boxes.shape[1]}"
            )
        return boxes

    def __call__(
        self,
        batch_outputs: List[dict],
        datas: List[dict],
        threshold: Optional[Union[float, dict]] = None,
        layout_nms: Optional[bool] = None,
        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
        layout_merge_bboxes_mode: Optional[str] = None,
        layout_shape_mode: Optional[str] = None,
        filter_overlap_boxes: Optional[bool] = None,
        skip_order_labels: Optional[List[str]] = None,
    ) -> Tuple[List[float], List[float], List[str]]:
        """Process one batch and return (boxes, scores, class_names) lists.

        Exactly one (data, output) pair is expected; a different count raises
        ValueError after processing.
        """
        outputs = []
        for idx, (data, output) in enumerate(zip(datas, batch_outputs)):
            if "masks" in output:
                masks = output["masks"]
            else:
                layout_shape_mode = "rect"
                # NOTE(review): this warning is unreachable — layout_shape_mode
                # was just set to "rect" on the line above, so the condition is
                # always False. Kept as in the upstream code.
                if idx == 0 and layout_shape_mode not in ["rect", "auto"]:
                    print(
                        f"The model you are using does not support polygon output, but the layout_shape_mode is specified as {layout_shape_mode}, which will be set to 'rect'"
                    )
                masks = None
            boxes = self.apply(
                output["boxes"],
                data["ori_img_size"],
                threshold,
                layout_nms,
                layout_unclip_ratio,
                layout_merge_bboxes_mode,
                masks,
                layout_shape_mode,
            )
            if filter_overlap_boxes:
                boxes = filter_boxes(boxes, layout_shape_mode)
            # Fall back to the module-level skip list when none was given.
            skip_order_labels = (
                skip_order_labels
                if skip_order_labels is not None
                else SKIP_ORDER_LABELS
            )
            boxes = update_order_index(boxes, skip_order_labels)
            outputs.append(boxes)
        if len(outputs) != 1:
            raise ValueError(
                f"The length of outputs should be 1, but got {len(outputs)}"
            )
        output = outputs[0]
        # Flatten the result dicts into parallel lists.
        boxes, scores, class_names = [], [], []
        for data in output:
            boxes.append(data["coordinate"])
            scores.append(float(data["score"]))
            class_names.append(data["label"])
        return boxes, scores, class_names
def is_contained(box1, box2):
    """Check if box1 is contained within box2.

    Each box is [class_id, score, x1, y1, x2, y2]. "Contained" means the
    intersection covers at least 90% of box1's own area (an asymmetric
    coverage test, not a symmetric IoU).
    """
    _, _, x1, y1, x2, y2 = box1
    _, _, x1_p, y1_p, x2_p, y2_p = box2
    box1_area = (x2 - x1) * (y2 - y1)
    xi1 = max(x1, x1_p)
    yi1 = max(y1, y1_p)
    xi2 = min(x2, x2_p)
    yi2 = min(y2, y2_p)
    inter_width = max(0, xi2 - xi1)
    inter_height = max(0, yi2 - yi1)
    intersect_area = inter_width * inter_height
    # Ratio of box1 covered by the intersection; 0 for a degenerate box.
    iou = intersect_area / box1_area if box1_area > 0 else 0
    return iou >= 0.9


def check_containment(boxes, formula_index=None, category_index=None, mode=None):
    """Check containment relationships among boxes.

    Args:
        boxes: iterable of [class_id, score, x1, y1, x2, y2] rows.
        formula_index: class id of "formula"; a formula box is never treated
            as contained by a non-formula box.
        category_index: with `mode`, restricts the test to pairs where the
            relevant box belongs to this category.
        mode: "large" (restrict on the outer box j) or "small" (restrict on
            the inner box i); None tests every pair.

    Returns:
        (contains_other, contained_by_other): two int arrays of 0/1 flags,
        one entry per box.
    """
    n = len(boxes)
    contains_other = np.zeros(n, dtype=int)
    contained_by_other = np.zeros(n, dtype=int)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            # Skip pairs where box i is a formula and box j is not.
            if formula_index is not None:
                if boxes[i][0] == formula_index and boxes[j][0] != formula_index:
                    continue
            if category_index is not None and mode is not None:
                if mode == "large" and boxes[j][0] == category_index:
                    if is_contained(boxes[i], boxes[j]):
                        contained_by_other[i] = 1
                        contains_other[j] = 1
                if mode == "small" and boxes[i][0] == category_index:
                    if is_contained(boxes[i], boxes[j]):
                        contained_by_other[i] = 1
                        contains_other[j] = 1
            else:
                if is_contained(boxes[i], boxes[j]):
                    contained_by_other[i] = 1
                    contains_other[j] = 1
    return contains_other, contained_by_other


def nms(boxes, iou_same=0.6, iou_diff=0.95):
    """Perform Non-Maximum Suppression (NMS) with different IoU thresholds
    for same and different classes.

    Args:
        boxes: array of [class_id, score, x1, y1, x2, y2] rows.
        iou_same: suppression threshold between boxes of the same class.
        iou_diff: (looser) suppression threshold across different classes.

    Returns:
        List of kept row indices, ordered by descending score.
    """
    # Extract class scores
    scores = boxes[:, 1]
    # Sort indices by scores in descending order
    indices = np.argsort(scores)[::-1]

    selected_boxes = []
    while len(indices) > 0:
        current = indices[0]
        current_box = boxes[current]
        current_class = current_box[0]
        current_coords = current_box[2:]
        selected_boxes.append(current)
        indices = indices[1:]
        # Fixed: the original body contained a bare `current_box[1]`
        # expression here — a no-op statement; removed.

        filtered_indices = []
        for i in indices:
            box = boxes[i]
            box_class = box[0]
            box_coords = box[2:]
            iou_value = iou(current_coords, box_coords)
            threshold = iou_same if current_class == box_class else iou_diff
            # If the IoU is below the threshold, keep the box
            if iou_value < threshold:
                filtered_indices.append(i)
        indices = filtered_indices
    return selected_boxes


def iou(box1, box2):
    """Compute the Intersection over Union (IoU) of two bounding boxes.

    Boxes are [x1, y1, x2, y2]; widths/heights use the +1 pixel-inclusive
    convention.
    """
    x1, y1, x2, y2 = box1
    x1_p, y1_p, x2_p, y2_p = box2

    # Compute the intersection coordinates
    x1_i = max(x1, x1_p)
    y1_i = max(y1, y1_p)
    x2_i = min(x2, x2_p)
    y2_i = min(y2, y2_p)

    # Compute the area of intersection
    inter_area = max(0, x2_i - x1_i + 1) * max(0, y2_i - y1_i + 1)

    # Compute the area of both bounding boxes
    box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
    box2_area = (x2_p - x1_p + 1) * (y2_p - y1_p + 1)

    # Compute the IoU
    iou_value = inter_area / float(box1_area + box2_area - inter_area)
    return iou_value


# Labels whose boxes are excluded from reading-order numbering.
SKIP_ORDER_LABELS = [
    "figure_title",
    "vision_footnote",
    "image",
    "chart",
    "table",
    "header",
    "header_image",
    "footer",
    "footer_image",
    "footnote",
    "aside_text",
]


def is_convex(p_prev, p_curr, p_next):
    """
    Calculate if the polygon is convex.

    Returns True when the cross product of the two edge vectors at p_curr is
    negative (a clockwise turn under this convention).
    """
    v1 = p_curr - p_prev
    v2 = p_next - p_curr
    cross = v1[0] * v2[1] - v1[1] * v2[0]
    return cross < 0


def angle_between_vectors(v1, v2):
    """
    Calculate the angle between two vectors, in degrees.
    """
    unit_v1 = v1 / np.linalg.norm(v1)
    unit_v2 = v2 / np.linalg.norm(v2)
    # Clip to [-1, 1] to guard arccos against floating-point drift.
    dot_prod = np.clip(np.dot(unit_v1, unit_v2), -1.0, 1.0)
    angle_rad = np.arccos(dot_prod)
    return np.degrees(angle_rad)
def calc_new_point(p_curr, v1, v2, distance=20):
    """
    Calculate the new point based on the direction of two vectors.

    Moves p_curr along the normalized bisector of v1 and v2 by `distance`.
    """
    dir_vec = v1 / np.linalg.norm(v1) + v2 / np.linalg.norm(v2)
    dir_vec = dir_vec / np.linalg.norm(dir_vec)
    p_new = p_curr + dir_vec * distance
    return p_new


def extract_custom_vertices(
    polygon, max_allowed_dist, sharp_angle_thresh=45, max_dist_ratio=0.3
):
    """Simplify a polygon's vertex set.

    Keeps convex vertices, preserves runs of two or more adjacent concave
    vertices whose corner angle is >= 120 degrees, re-samples edges longer
    than max_allowed_dist * max_dist_ratio with intermediate original
    vertices, and pushes near-`sharp_angle_thresh` convex corners outward
    along the corner bisector.

    Args:
        polygon: iterable of (x, y) vertices.
        max_allowed_dist: base distance bound; scaled by max_dist_ratio.
        sharp_angle_thresh: corner angle (degrees) treated as "sharp".
        max_dist_ratio: fraction of max_allowed_dist used as the edge bound.

    Returns:
        List of (x, y) tuples for the simplified polygon.
    """
    poly = np.array(polygon)
    n = len(poly)
    max_allowed_dist *= max_dist_ratio
    # Per-vertex geometry: convexity flag, corner angle and edge vectors.
    point_info = []
    for i in range(n):
        p_prev, p_curr, p_next = poly[(i - 1) % n], poly[i], poly[(i + 1) % n]
        v1, v2 = p_prev - p_curr, p_next - p_curr
        is_convex_point = is_convex(p_prev, p_curr, p_next)
        angle = angle_between_vectors(v1, v2)
        point_info.append(
            {
                "index": i,
                "is_convex": is_convex_point,
                "angle": angle,
                "v1": v1,
                "v2": v2,
            }
        )
    concave_indices = [i for i, info in enumerate(point_info) if not info["is_convex"]]
    preserve_concave = set()
    if concave_indices:
        # Group consecutive concave indices (with wrap-around from n-1 to 0);
        # only groups of two or more are candidates for preservation.
        groups = []
        current_group = [concave_indices[0]]
        for i in range(1, len(concave_indices)):
            if concave_indices[i] - concave_indices[i - 1] == 1 or (
                concave_indices[i - 1] == n - 1 and concave_indices[i] == 0
            ):
                current_group.append(concave_indices[i])
            else:
                if len(current_group) >= 2:
                    groups.extend(current_group)
                current_group = [concave_indices[i]]
        if len(current_group) >= 2:
            groups.extend(current_group)
        # When the concave run spans the seam (touches both index 0 and n-1),
        # preserve it only if both endpoints made it into a group.
        if (
            len(concave_indices) >= 2
            and concave_indices[0] == 0
            and concave_indices[-1] == n - 1
        ):
            if 0 in groups and n - 1 in groups:
                preserve_concave.update(groups)
        else:
            preserve_concave.update(groups)
    # Keep convex vertices plus preserved wide-angle concave vertices.
    kept_points = [
        i
        for i, info in enumerate(point_info)
        if info["is_convex"] or (i in preserve_concave and info["angle"] >= 120)
    ]
    final_points = []
    for idx in range(len(kept_points)):
        current_idx = kept_points[idx]
        next_idx = kept_points[(idx + 1) % len(kept_points)]
        final_points.append(current_idx)
        dist = np.linalg.norm(poly[current_idx] - poly[next_idx])
        if dist > max_allowed_dist:
            # Re-insert evenly spaced original vertices along a long edge
            # (wrapping around the polygon seam when needed).
            intermediate = (
                list(range(current_idx + 1, next_idx))
                if next_idx > current_idx
                else list(range(current_idx + 1, n)) + list(range(0, next_idx))
            )
            if intermediate:
                num_needed = int(np.ceil(dist / max_allowed_dist)) - 1
                if len(intermediate) <= num_needed:
                    final_points.extend(intermediate)
                else:
                    step = len(intermediate) / num_needed
                    final_points.extend(
                        [intermediate[int(i * step)] for i in range(num_needed)]
                    )
    final_points = sorted(set(final_points))
    res = []
    for i in final_points:
        info = point_info[i]
        p_curr = poly[i]
        # Convex corners within 1 degree of the sharp threshold are pushed
        # outward along the corner bisector by the mean edge length.
        if info["is_convex"] and abs(info["angle"] - sharp_angle_thresh) < 1:
            v1_norm = info["v1"] / np.linalg.norm(info["v1"])
            v2_norm = info["v2"] / np.linalg.norm(info["v2"])
            dir_vec = v1_norm + v2_norm
            dir_vec /= np.linalg.norm(dir_vec)
            d = (np.linalg.norm(info["v1"]) + np.linalg.norm(info["v2"])) / 2
            res.append(tuple(p_curr + dir_vec * d))
        else:
            res.append(tuple(p_curr))
    return res


def mask2polygon(mask, max_allowed_dist, epsilon_ratio=0.004, extract_custom=True):
    """
    Convert a binary mask to a simplified polygon outline.

    Takes the largest external contour, approximates it with approxPolyDP
    (epsilon proportional to the contour perimeter), and optionally refines
    the vertices with extract_custom_vertices.

    Args:
        mask (ndarray): The input mask of shape [H, W].
        max_allowed_dist: edge-length bound passed to extract_custom_vertices.
        epsilon_ratio (float): The ratio of epsilon for approxPolyDP.
        extract_custom: whether to run the custom vertex refinement.

    Returns:
        Polygon vertices, or None when the mask has no contours.
    """
    cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not cnts:
        return None
    # Keep only the largest contour by area.
    cnt = max(cnts, key=cv2.contourArea)
    epsilon = epsilon_ratio * cv2.arcLength(cnt, True)
    approx_cnt = cv2.approxPolyDP(cnt, epsilon, True)
    polygon_points = approx_cnt.squeeze()
    # Guard against a single-point squeeze collapsing to 1-D.
    polygon_points = np.atleast_2d(polygon_points)
    if extract_custom:
        polygon_points = extract_custom_vertices(polygon_points, max_allowed_dist)
    return polygon_points
def extract_polygon_points_by_masks(boxes, masks, scale_ratio, layout_shape_mode):
    """
    Extract per-box outlines from the mask outputs.

    (Translated from the original Chinese docstring: "Modified extraction
    routine: in 'auto' mode, trust the geometric decision.")

    For every detection box, crops its region from the (downscaled) mask,
    converts the crop into a polygon and — depending on layout_shape_mode —
    returns the plain rectangle ("rect"), the raw polygon ("poly"), a
    minimum-area quadrilateral ("quad"), or whichever fits best ("auto").

    Args:
        boxes: detection rows [cls_id, score, x1, y1, x2, y2].
        masks: stacked mask maps, shape (N, H_m, W_m).
        scale_ratio: per-axis (w, h) scale from image space to mask space
            (divided by 4 below — presumably the mask head's extra stride;
            TODO confirm against the model).
        layout_shape_mode: one of 'rect', 'poly', 'quad', 'auto'.

    Returns:
        List of per-box outlines (each an ndarray of (x, y) points).
    """
    scale_w, scale_h = scale_ratio[0] / 4, scale_ratio[1] / 4
    h_m, w_m = masks.shape[1:]
    polygon_points = []
    # NOTE(review): this initial value is never used — the "auto" branch
    # reassigns iou_threshold = 0.8 before reading it.
    iou_threshold = 0.95
    max_box_w = max(boxes[:, 4] - boxes[:, 3])
    for i in range(len(boxes)):
        x_min, y_min, x_max, y_max = boxes[i, 2:6].astype(np.int32)
        box_w, box_h = x_max - x_min, y_max - y_min
        # default rect
        rect = np.array(
            [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]],
            dtype=np.float32,
        )
        if box_w <= 0 or box_h <= 0:
            polygon_points.append(rect)
            continue
        # crop mask (clamped to the mask extent)
        x_s = np.clip(
            [int(round(x_min * scale_w)), int(round(x_max * scale_w))], 0, w_m
        )
        y_s = np.clip(
            [int(round(y_min * scale_h)), int(round(y_max * scale_h))], 0, h_m
        )
        cropped = masks[i, y_s[0] : y_s[1], x_s[0] : x_s[1]]
        if cropped.size == 0 or np.sum(cropped) == 0:
            # Empty crop: fall back to the rectangle.
            polygon_points.append(rect)
            continue
        if layout_shape_mode == "rect":
            polygon_points.append(rect)
            continue
        # resize mask to match box size
        resized_mask = cv2.resize(
            cropped.astype(np.uint8), (box_w, box_h), interpolation=cv2.INTER_NEAREST
        )
        if box_w > max_box_w * 0.6:
            max_allowed_dist = box_w
        else:
            max_allowed_dist = max_box_w
        polygon = mask2polygon(resized_mask, max_allowed_dist)
        # Fewer than 4 vertices: degenerate, use the rectangle.
        if polygon is not None and len(polygon) < 4:
            polygon_points.append(rect)
            continue
        if polygon is not None and len(polygon) > 0:
            # Shift from crop-local to image coordinates.
            polygon = polygon + np.array([x_min, y_min])
            if layout_shape_mode == "poly":
                polygon_points.append(polygon)
            elif layout_shape_mode == "quad":
                # convert polygon to quadrilateral
                quad = convert_polygon_to_quad(polygon)
                polygon_points.append(quad if quad is not None else rect)
            elif layout_shape_mode == "auto":
                iou_threshold = 0.8
                rect_list = rect.tolist()
                quad = convert_polygon_to_quad(polygon)
                if quad is not None:
                    quad_list = quad.tolist()
                    iou_quad = calculate_polygon_overlap_ratio(
                        rect_list,
                        quad_list,
                        mode="union",
                    )
                    if iou_quad >= 0.95:
                        # if quad is very similar to rect, use rect instead
                        quad = rect
                    poly_list = (
                        polygon.tolist() if isinstance(polygon, np.ndarray) else polygon
                    )
                    iou_quad = calculate_polygon_overlap_ratio(
                        poly_list, quad_list, mode="union"
                    )
                    pre_poly = polygon_points[-1] if len(polygon_points) > 0 else None
                    iou_pre = 0
                    if pre_poly is not None:
                        iou_pre = calculate_polygon_overlap_ratio(
                            pre_poly.tolist(),
                            rect_list,
                            mode="small",
                        )
                    if iou_quad >= iou_threshold and iou_pre < 0.01:
                        # if quad is similar to polygon, use quad
                        polygon_points.append(quad)
                        continue
                # if all ious are less than threshold, use polygon
                polygon_points.append(polygon)
            else:
                raise ValueError(
                    "layout_shape_mode must be one of ['rect', 'poly', 'quad', 'auto']"
                )
    return polygon_points


def convert_polygon_to_quad(polygon):
    """
    Convert polygon to minimum bounding rectangle (quad).

    Args:
        polygon (ndarray): The polygon points of shape [N, 2].

    Returns:
        quad (ndarray): The 4-point quad, clockwise from top-left, or None if invalid.
    """
    if polygon is None or len(polygon) < 3:
        return None

    points = np.array(polygon, dtype=np.float32)
    if len(points.shape) == 1:
        points = points.reshape(-1, 2)

    # Minimum-area rotated rectangle around the polygon.
    min_rect = cv2.minAreaRect(points)
    quad = cv2.boxPoints(min_rect)

    # Order vertices by polar angle around the centroid, then rotate so the
    # vertex with the smallest x + y (top-left) comes first.
    center = quad.mean(axis=0)
    angles = np.arctan2(quad[:, 1] - center[1], quad[:, 0] - center[0])
    sorted_indices = np.argsort(angles)
    quad = quad[sorted_indices]

    sums = quad[:, 0] + quad[:, 1]
    top_left_idx = np.argmin(sums)
    quad = np.roll(quad, -top_left_idx, axis=0)

    return quad
# Local mirror of the module-level result-container alias.
Boxes = List[dict]


def restructured_boxes(
    boxes: ndarray,
    labels: List[str],
    img_size: Tuple[int, int],
    polygon_points: ndarray = None,
) -> Boxes:
    """Convert raw [cls_id, score, x1, y1, x2, y2] rows into result dicts.

    Coordinates are clamped to the image and truncated to int. Boxes that
    collapse to zero width or height are dropped, as are boxes whose matching
    polygon entry is None. 'order' is the 1-based index of the row.

    Args:
        boxes: detection rows of shape (N, 6).
        labels: class names indexed by cls_id.
        img_size: (width, height) of the image.
        polygon_points: optional per-box outlines, parallel to `boxes`.

    Returns:
        List of dicts with 'cls_id', 'label', 'score', 'coordinate', 'order'
        and, when polygons are supplied, 'polygon_points'.
    """
    w, h = img_size
    results = []
    for row_idx, row in enumerate(boxes):
        xmin, ymin, xmax, ymax = row[2:]
        # Clamp to the image and truncate to integer pixels.
        left = int(max(0, xmin))
        top = int(max(0, ymin))
        right = int(min(w, xmax))
        bottom = int(min(h, ymax))
        if right <= left or bottom <= top:
            # Degenerate after clamping — skip.
            continue
        entry = {
            "cls_id": int(row[0]),
            "label": labels[int(row[0])],
            "score": float(row[1]),
            "coordinate": [left, top, right, bottom],
            "order": row_idx + 1,
        }
        if polygon_points is not None:
            outline = polygon_points[row_idx]
            if outline is None:
                continue
            entry["polygon_points"] = outline
        results.append(entry)
    return results


def unclip_boxes(boxes, unclip_ratio=None):
    """
    Expand (x1, y1, x2, y2) boxes about their centers by a ratio.

    Parameters:
    - boxes: np.ndarray of shape (N, 6), rows (class_id, score, x1, y1, x2, y2).
    - unclip_ratio: None (no-op), a (width_ratio, height_ratio) pair applied
      to all boxes, or a dict mapping class_id -> (width_ratio, height_ratio)
      applied only to matching classes.

    Returns:
    - np.ndarray of the same (N, 6) layout with expanded coordinates.
    """
    if unclip_ratio is None:
        return boxes

    if isinstance(unclip_ratio, dict):
        grown = []
        for row in boxes:
            class_id, score, x1, y1, x2, y2 = row
            if class_id not in unclip_ratio:
                # No ratio configured for this class — keep as-is.
                grown.append(row)
                continue
            width_ratio, height_ratio = unclip_ratio[class_id]
            width = x2 - x1
            height = y2 - y1
            new_w = width * width_ratio
            new_h = height * height_ratio
            center_x = x1 + width / 2
            center_y = y1 + height / 2
            grown.append(
                [
                    class_id,
                    score,
                    center_x - new_w / 2,
                    center_y - new_h / 2,
                    center_x + new_w / 2,
                    center_y + new_h / 2,
                ]
            )
        return np.array(grown)

    # Vectorized path for a single (width_ratio, height_ratio) pair.
    widths = boxes[:, 4] - boxes[:, 2]
    heights = boxes[:, 5] - boxes[:, 3]
    half_new_w = widths * unclip_ratio[0] / 2
    half_new_h = heights * unclip_ratio[1] / 2
    center_x = boxes[:, 2] + widths / 2
    center_y = boxes[:, 3] + heights / 2
    return np.column_stack(
        (
            boxes[:, 0],
            boxes[:, 1],
            center_x - half_new_w,
            center_y - half_new_h,
            center_x + half_new_w,
            center_y + half_new_h,
        )
    )
return expanded_boxes def make_valid(poly): if not poly.is_valid: poly = poly.buffer(0) return poly def calculate_polygon_overlap_ratio( polygon1: List[Tuple[int, int]], polygon2: List[Tuple[int, int]], mode: str = "union", ) -> float: """ Calculate the overlap ratio between two polygons. Args: polygon1 (List[Tuple[int, int]]): First polygon represented as a list of points. polygon2 (List[Tuple[int, int]]): Second polygon represented as a list of points. mode (str, optional): Overlap calculation mode. Defaults to "union". Returns: float: Overlap ratio value between 0 and 1. """ try: from shapely.geometry import Polygon except ImportError: raise ImportError("Please install Shapely library.") poly1 = Polygon(polygon1) poly2 = Polygon(polygon2) poly1 = make_valid(poly1) poly2 = make_valid(poly2) intersection = poly1.intersection(poly2).area union = poly1.union(poly2).area if mode == "union": return intersection / union elif mode == "small": small_area = min(poly1.area, poly2.area) return intersection / small_area elif mode == "large": large_area = max(poly1.area, poly2.area) return intersection / large_area else: raise ValueError(f"Unknown mode: {mode}") def calculate_bbox_area(bbox): """Calculate bounding box area""" x1, y1, x2, y2 = map(float, bbox) area = abs((x2 - x1) * (y2 - y1)) return area def calculate_overlap_ratio( bbox1: Union[np.ndarray, list, tuple], bbox2: Union[np.ndarray, list, tuple], mode="union", ) -> float: """ Calculate the overlap ratio between two bounding boxes using NumPy. Args: bbox1 (np.ndarray, list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max] bbox2 (np.ndarray, list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max] mode (str): The mode of calculation, either 'union', 'small', or 'large'. 
def filter_boxes(src_boxes: List[Dict], layout_shape_mode: str) -> List[Dict]:
    """Remove 'reference', degenerate, and heavily overlapping layout boxes.

    Rules applied in order:
      * boxes labeled 'reference' are discarded outright;
      * boxes narrower or shorter than 6 px are dropped;
      * when either of two boxes overlapping > 0.5 (relative to the smaller
        box) is an 'inline_formula', the inline formula is dropped;
      * other pairs overlapping > 0.7 keep only the larger box, except that
        an 'image' box may coexist with a differently-labeled box; in
        non-'rect' shape modes a polygon-level overlap < 0.7 also exempts
        the pair.

    Args:
        src_boxes: Detection result dicts with at least 'label' and
            'coordinate' keys (and optionally 'polygon_points').
        layout_shape_mode: Shape mode; any value other than "rect" enables
            the polygon-level overlap check.

    Returns:
        The surviving boxes, in their original order.
    """
    boxes = [box for box in src_boxes if box["label"] != "reference"]

    dropped_indexes = set()
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]["coordinate"]
        w, h = x2 - x1, y2 - y1
        # Drop tiny boxes.
        if w < 6 or h < 6:
            dropped_indexes.add(i)
        for j in range(i + 1, len(boxes)):
            if i in dropped_indexes or j in dropped_indexes:
                continue
            overlap_ratio = calculate_overlap_ratio(
                boxes[i]["coordinate"], boxes[j]["coordinate"], "small"
            )

            if (
                boxes[i]["label"] == "inline_formula"
                or boxes[j]["label"] == "inline_formula"
            ):
                if overlap_ratio > 0.5:
                    if boxes[i]["label"] == "inline_formula":
                        dropped_indexes.add(i)
                    if boxes[j]["label"] == "inline_formula":
                        dropped_indexes.add(j)
                continue

            if overlap_ratio > 0.7:
                # FIX: also require polygon_points on boxes[j]; previously only
                # boxes[i] was checked and boxes[j]["polygon_points"] could
                # raise KeyError.
                if (
                    layout_shape_mode != "rect"
                    and "polygon_points" in boxes[i]
                    and "polygon_points" in boxes[j]
                ):
                    poly_overlap_ratio = calculate_polygon_overlap_ratio(
                        boxes[i]["polygon_points"], boxes[j]["polygon_points"], "small"
                    )
                    if poly_overlap_ratio < 0.7:
                        continue

                box_area_i = calculate_bbox_area(boxes[i]["coordinate"])
                box_area_j = calculate_bbox_area(boxes[j]["coordinate"])
                # Images may legitimately overlap non-image regions.
                if (
                    boxes[i]["label"] == "image" or boxes[j]["label"] == "image"
                ) and boxes[i]["label"] != boxes[j]["label"]:
                    continue
                if box_area_i >= box_area_j:
                    dropped_indexes.add(j)
                else:
                    dropped_indexes.add(i)

    out_boxes = [box for idx, box in enumerate(boxes) if idx not in dropped_indexes]
    return out_boxes


def update_order_index(boxes: List[Dict], skip_order_labels: List[str]):
    """Renumber the 'order' field of each box, skipping given labels.

    Boxes whose label is in ``skip_order_labels`` get ``order = None``;
    all others are numbered 1, 2, 3, ... in list order.

    Args:
        boxes: Box dicts with a 'label' key; mutated in place.
        skip_order_labels: Labels excluded from the reading order.

    Returns:
        The same (mutated) list of boxes.
    """
    order_index = 1
    for box in boxes:
        label = box["label"]
        if label not in skip_order_labels:
            box["order"] = order_index
            order_index += 1
        else:
            box["order"] = None
    return boxes


def find_label_position(box, polygon_points, text_w, text_h, max_shift=50):
    """Find a (x, y) anchor inside a polygon for a text_w x text_h caption.

    Starting from the polygon's top-left extreme, the candidate rectangle is
    shifted downward up to ``max_shift`` px until it intersects the polygon.

    Args:
        box: Rectangle constructor — presumably ``shapely.geometry.box``
            passed in by the caller (NOTE(review): confirm; it is called as
            box(x1, y1, x2, y2) and must return a shapely geometry).
        polygon_points: Iterable of (x, y) points of the target polygon.
        text_w: Caption width in pixels.
        text_h: Caption height in pixels.
        max_shift: Maximum downward search distance in pixels.

    Returns:
        Tuple[int, int]: The chosen (x, y); falls back to the polygon's
        minimum x/y when no intersecting position is found.
    """
    try:
        from shapely.geometry import Polygon
    except ImportError:
        raise ImportError("Please install Shapely library.")

    poly = Polygon(polygon_points)
    min_x = min([p[0] for p in polygon_points])
    min_y = min([p[1] for p in polygon_points])

    for dy in range(max_shift):
        x1, y1 = min_x, min_y + dy
        x2, y2 = x1 + text_w, y1 + text_h
        label_rect = box(x1, y1, x2, y2)
        if poly.intersects(label_rect):
            return int(x1), int(y1)
    return int(min_x), int(min_y)
# ===== FILE: rapid_layout/model_handler/pp_doc_layout/pre_process.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import copy
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union

import cv2
import numpy as np

InputType = Union[str, np.ndarray, bytes, Path]


class PPDocLayoutPreProcess:
    """Pre-processing for PP-DocLayout: resize -> normalize -> permute -> batch."""

    def __init__(self, img_size: Tuple[int, int]):
        # NOTE(review): img_size is accepted but ignored — the network input
        # is hard-coded to 800x800. Confirm whether it should drive self.size.
        self.size = [800, 800]
        self.mean = [0.0, 0.0, 0.0]
        self.std = [1.0, 1.0, 1.0]
        self.scale = 1 / 255.0
        # Per-channel affine transform applied in normalize():
        # out = img * alpha + beta.
        self.alpha = [self.scale / self.std[i] for i in range(len(self.std))]
        self.beta = [-self.mean[i] / self.std[i] for i in range(len(self.std))]

    # FIX: annotation corrected — this returns a (data dict, batch list)
    # tuple, not a single Dict[str, Any].
    def __call__(
        self, img: Optional[np.ndarray] = None
    ) -> Tuple[Dict[str, Any], list]:
        """Run the full pipeline on one image.

        Args:
            img: HWC image array.

        Returns:
            (ori_data, batch_inputs): the per-image data dict (deep-copied)
            and the batched model inputs [img_size, img, scale_factors].

        Raises:
            ValueError: If img is None.
        """
        if img is None:
            raise ValueError("img is None.")

        data = self.resize(img)
        data = self.normalize(data)
        data = self.permute(data)

        ori_data = copy.deepcopy(data)
        batch_inputs = self.to_batch(data)
        return ori_data, batch_inputs

    def resize(self, img: np.ndarray) -> Dict[str, Any]:
        """Resize to the fixed network size and record size/scale metadata."""
        resize_h, resize_w = self.size
        img_ori_h, img_ori_w = img.shape[:2]
        img = cv2.resize(
            img, (int(resize_w), int(resize_h)), interpolation=cv2.INTER_CUBIC
        )
        img_h, img_w = img.shape[:2]
        data = {
            "img": img,
            "img_size": [img_w, img_h],
            "scale_factors": [img_w / img_ori_w, img_h / img_ori_h],
            "ori_img_size": [img_ori_w, img_ori_h],
        }
        return data

    # FIX: annotation corrected — returns the (mutated) data dict.
    def normalize(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Apply the per-channel affine normalization in place."""
        img = data["img"]
        split_im = list(cv2.split(img))
        for c in range(img.shape[2]):
            split_im[c] = split_im[c].astype(np.float32)
            split_im[c] *= self.alpha[c]
            split_im[c] += self.beta[c]

        res = cv2.merge(split_im)
        data["img"] = res
        return data

    # FIX: annotation corrected — returns the (mutated) data dict.
    def permute(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transpose the image from HWC to CHW layout."""
        img = data["img"]
        data["img"] = img.transpose((2, 0, 1))
        return data

    def to_batch(self, data, dtype: np.dtype = np.float32) -> list[np.ndarray]:
        """Build the model's batched input list [img_size, img, scale_factors].

        img_size and scale_factors are reversed ([::-1]) to match the
        (h, w)-ordered inputs the exported model expects.
        """
        result = []
        for key in ["img_size", "img", "scale_factors"]:
            if key == "img_size":
                val = [data[key][::-1]]
            elif key == "scale_factors":
                val = [data.get(key, [1.0, 1.0])[::-1]]
            else:
                val = [data[key]]
            result.append(np.array(val, dtype=dtype))
        return result


# ===== FILE: rapid_layout/model_handler/utils.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from ..utils.download_file import DownloadFile, DownloadFileInput
from ..utils.logger import Logger
from ..utils.typings import ModelType
from ..utils.utils import mkdir, read_yaml


class ModelProcessor:
    """Resolves model files from the registry YAML, downloading on demand."""

    logger = Logger(logger_name=__name__).get_log()

    cur_dir = Path(__file__).resolve().parent
    root_dir = cur_dir.parent

    DEFAULT_MODEL_PATH = root_dir / "configs" / "default_models.yaml"
    DEFAULT_MODEL_DIR = root_dir / "models"
    # Class-body side effect: ensure the local model cache dir exists.
    mkdir(DEFAULT_MODEL_DIR)

    model_map = read_yaml(DEFAULT_MODEL_PATH)

    @classmethod
    def get_model_path(cls, model_type: ModelType) -> str:
        """Return the local path of the model file, downloading if needed."""
        return cls.get_single_model_path(model_type)

    @classmethod
    def get_single_model_path(cls, model_type: ModelType) -> str:
        """Download (if necessary) a single-file model; return its path."""
        model_info = cls.model_map[model_type.value]
        save_model_path = (
            cls.DEFAULT_MODEL_DIR / Path(model_info["model_dir_or_path"]).name
        )

        download_params = DownloadFileInput(
            file_url=model_info["model_dir_or_path"],
            sha256=model_info["SHA256"],
            save_path=save_model_path,
            logger=cls.logger,
        )
        DownloadFile.run(download_params)
        return str(save_model_path)

    @classmethod
    def get_multi_models_dict(cls, model_type: ModelType) -> Dict[str, str]:
        """Download every file of a multi-file model; map file stem -> path."""
        model_info = cls.model_map[model_type.value]

        results = {}
        model_root_dir = model_info["model_dir_or_path"]
        save_model_dir = cls.DEFAULT_MODEL_DIR / Path(model_root_dir).name
        for file_name, sha256 in model_info["SHA256"].items():
            save_path = save_model_dir / file_name
            download_params = DownloadFileInput(
                file_url=f"{model_root_dir}/{file_name}",
                sha256=sha256,
                save_path=save_path,
                logger=cls.logger,
            )
            DownloadFile.run(download_params)
            results[Path(file_name).stem] = str(save_path)
        return results


class LetterBox:
    """Resize image and padding for detection, instance segmentation, pose."""

    def __init__(
        self,
        new_shape=(640, 640),
        auto=False,
        scaleFill=False,
        scaleup=True,
        center=True,
        stride=32,
    ):
        """Initialize LetterBox object with specific parameters."""
        self.new_shape = new_shape
        self.auto = auto
        self.scaleFill = scaleFill
        self.scaleup = scaleup
        self.stride = stride
        self.center = center  # Put the image in the middle or top-left

    def __call__(self, labels=None, image=None):
        """Return updated labels and image with added border.

        When ``labels`` is empty, only the padded image is returned;
        otherwise the dict is updated in place and returned.
        """
        if labels is None:
            labels = {}
        img = labels.get("img") if image is None else image
        shape = img.shape[:2]  # current shape [height, width]
        new_shape = labels.pop("rect_shape", self.new_shape)
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        # Scale ratio (new / old)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not self.scaleup:
            # only scale down, do not scale up (for better val mAP)
            r = min(r, 1.0)

        # Compute padding
        ratio = r, r  # width, height ratios
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
        if self.auto:  # minimum rectangle: pad only up to the model stride
            dw, dh = np.mod(dw, self.stride), np.mod(dh, self.stride)
        elif self.scaleFill:  # stretch to fill, no padding
            dw, dh = 0.0, 0.0
            new_unpad = (new_shape[1], new_shape[0])
            ratio = (
                new_shape[1] / shape[1],
                new_shape[0] / shape[0],
            )  # width, height ratios

        if self.center:
            dw /= 2  # divide padding into 2 sides
            dh /= 2

        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
        # The +/-0.1 biases rounding so an odd pad is split deterministically.
        top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
        img = cv2.copyMakeBorder(
            img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )  # add border
        if labels.get("ratio_pad"):
            labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation

        if len(labels):
            labels = self._update_labels(labels, ratio, dw, dh)
            labels["img"] = img
            labels["resized_shape"] = new_shape
            return labels
        return img

    def _update_labels(self, labels, ratio, padw, padh):
        """Scale and pad the label instances to the letterboxed image."""
        labels["instances"].convert_bbox(format="xyxy")
        labels["instances"].denormalize(*labels["img"].shape[:2][::-1])
        labels["instances"].scale(*ratio)
        labels["instances"].add_padding(padw, padh)
        return labels
def rescale_boxes(boxes, input_width, input_height, img_width, img_height):
    """Map boxes from network-input coordinates to original-image coordinates."""
    input_shape = np.array([input_width, input_height, input_width, input_height])
    scaled = np.divide(boxes, input_shape, dtype=np.float32)
    scaled *= np.array([img_width, img_height, img_width, img_height])
    return scaled


def scale_boxes(
    img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False
):
    """Rescale boxes from the shape they were predicted in to a target shape.

    Args:
        img1_shape (tuple): (height, width) the boxes are currently in.
        boxes: Boxes in (x1, y1, x2, y2) format (modified in place).
        img0_shape (tuple): Target (height, width).
        ratio_pad (tuple): Optional precomputed (ratio, pad); derived from the
            two shapes when omitted.
        padding (bool): True when boxes come from a letterboxed (yolo-style)
            image; False for a plain resize.
        xywh (bool): True when the box format is xywh (pad is then not
            subtracted from the size columns).

    Returns:
        The boxes rescaled and clipped to img0_shape.
    """
    if ratio_pad is None:
        # gain = old / new
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
        pad = (
            round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1),
            round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1),
        )  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    if padding:
        boxes[..., 0] -= pad[0]  # x padding
        boxes[..., 1] -= pad[1]  # y padding
        if not xywh:
            boxes[..., 2] -= pad[0]
            boxes[..., 3] -= pad[1]
    boxes[..., :4] /= gain
    return clip_boxes(boxes, img0_shape)


def clip_boxes(boxes, shape):
    """Clamp boxes (in place) to a (height, width) image shape."""
    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
    return boxes


def nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression; returns kept indices, best score first."""
    order = np.argsort(scores)[::-1]

    keep = []
    while order.size > 0:
        # Take the highest-scoring remaining box.
        best = order[0]
        keep.append(best)

        # Suppress every remaining box that overlaps it too much.
        ious = compute_iou(boxes[best, :], boxes[order[1:], :])
        survivors = np.where(ious < iou_threshold)[0]
        order = order[survivors + 1]

    return keep


def multiclass_nms(boxes, scores, class_ids, iou_threshold):
    """Run NMS independently per class; returns kept indices into `boxes`."""
    keep_boxes = []
    for class_id in np.unique(class_ids):
        class_indices = np.where(class_ids == class_id)[0]
        kept = nms(boxes[class_indices, :], scores[class_indices], iou_threshold)
        keep_boxes.extend(class_indices[kept])
    return keep_boxes


def compute_iou(box, boxes):
    """IoU of one (x1, y1, x2, y2) box against each row of `boxes`."""
    # Intersection rectangle per candidate box.
    xmin = np.maximum(box[0], boxes[:, 0])
    ymin = np.maximum(box[1], boxes[:, 1])
    xmax = np.minimum(box[2], boxes[:, 2])
    ymax = np.minimum(box[3], boxes[:, 3])
    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - intersection_area

    return intersection_area / union_area


def xywh2xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2)."""
    y = np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    y[..., 0] = x[..., 0] - half_w
    y[..., 1] = x[..., 1] - half_h
    y[..., 2] = x[..., 0] + half_w
    y[..., 3] = x[..., 1] + half_h
    return y
# ===== FILE: rapid_layout/model_handler/yolov8/__init__.py =====
# (original content: `from .main import YOLOv8ModelHandler`)

# ===== FILE: rapid_layout/model_handler/yolov8/main.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import time
from typing import List, Tuple

import numpy as np

from ...inference_engine.base import InferSession
from ...utils.typings import RapidLayoutOutput
from ..base import BaseModelHandler
from .post_process import YOLOv8PostProcess
from .pre_process import YOLOv8PreProcess


class YOLOv8ModelHandler(BaseModelHandler):
    """YOLOv8 layout pipeline: preprocess -> inference -> postprocess."""

    def __init__(self, labels, conf_thres, iou_thres, session: InferSession):
        self.img_size = (640, 640)
        # NOTE(review): these instance attributes shadow the `preprocess` /
        # `postprocess` methods defined below, so those method bodies are
        # unreachable on instances — confirm against BaseModelHandler's
        # contract before renaming either side.
        self.preprocess = YOLOv8PreProcess(img_size=self.img_size)
        self.postprocess = YOLOv8PostProcess(labels, conf_thres, iou_thres)
        self.session = session

    def __call__(self, ori_img: np.ndarray) -> RapidLayoutOutput:
        """Detect layout regions in one image, timing the full pass."""
        started = time.perf_counter()

        ori_img_shape = ori_img.shape[:2]
        img = self.preprocess(ori_img)
        preds = self.session(img)
        boxes, scores, class_names = self.postprocess(
            preds, ori_img_shape, self.img_size
        )

        elapse = time.perf_counter() - started
        return RapidLayoutOutput(
            img=ori_img,
            boxes=boxes,
            class_names=class_names,
            scores=scores,
            elapse=elapse,
        )

    def preprocess(self, image: np.ndarray) -> np.ndarray:
        # Shadowed by the instance attribute of the same name (see __init__).
        return self.preprocess(image)

    def postprocess(self, model_output) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        # Shadowed by the instance attribute of the same name (see __init__).
        return self.postprocess(model_output)


# ===== FILE: rapid_layout/model_handler/yolov8/post_process.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from ..utils import multiclass_nms, rescale_boxes, xywh2xyxy


class YOLOv8PostProcess:
    """Filter, decode and NMS the raw YOLOv8 detection head output."""

    def __init__(self, labels: List[str], conf_thres=0.7, iou_thres=0.5):
        self.labels = labels
        self.conf_threshold = conf_thres
        self.iou_threshold = iou_thres
        # Filled in per call from the input/original image shapes.
        self.input_width, self.input_height = None, None
        self.img_width, self.img_height = None, None

    def __call__(
        self,
        output: List[np.ndarray],
        ori_img_shape: Tuple[int, int],
        img_shape: Tuple[int, int],
    ) -> Tuple[np.ndarray, np.ndarray, List[str]]:
        """Return (boxes, scores, labels) in original-image coordinates."""
        self.img_height, self.img_width = ori_img_shape
        self.input_height, self.input_width = img_shape

        preds = np.squeeze(output[0]).T

        # Keep only predictions whose best class score clears the threshold.
        scores = np.max(preds[:, 4:], axis=1)
        confident = scores > self.conf_threshold
        preds = preds[confident, :]
        scores = scores[confident]
        if len(scores) == 0:
            return [], [], []

        # Best class per surviving prediction.
        class_ids = np.argmax(preds[:, 4:], axis=1)
        boxes = self.extract_boxes(preds)

        # Per-class NMS suppresses weak, overlapping detections.
        indices = multiclass_nms(boxes, scores, class_ids, self.iou_threshold)
        labels = [self.labels[i] for i in class_ids[indices]]
        return boxes[indices], scores[indices], labels

    def extract_boxes(self, predictions: np.ndarray) -> np.ndarray:
        """Rescale xywh predictions to original-image xyxy boxes."""
        boxes = predictions[:, :4]
        boxes = rescale_boxes(
            boxes, self.input_width, self.input_height, self.img_width, self.img_height
        )
        return xywh2xyxy(boxes)


# ===== FILE: rapid_layout/model_handler/yolov8/pre_process.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import cv2


class YOLOv8PreProcess:
    """Resize to the network size and convert HWC uint8 to NCHW float32."""

    def __init__(self, img_size: Tuple[int, int]):
        self.img_size = img_size

    def __call__(self, image: np.ndarray) -> np.ndarray:
        resized = cv2.resize(image, self.img_size)
        normalized = resized / 255.0
        chw = normalized.transpose(2, 0, 1)
        return chw[np.newaxis, :, :, :].astype(np.float32)


# ===== FILE: rapid_layout/models/__init__.py ===== (header comment only)
# ===== FILE: rapid_layout/utils/__init__.py ===== (header comment only)
# ===== FILE: rapid_layout/utils/download_file.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import logging
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import requests
from tqdm import tqdm

from .utils import get_file_sha256


@dataclass
class DownloadFileInput:
    """Parameters describing a single file download."""

    file_url: str
    save_path: Union[str, Path]
    logger: logging.Logger
    sha256: Optional[str] = None


class DownloadFile:
    """Streamed file downloader with checksum-based skip and a progress bar."""

    BLOCK_SIZE = 1024  # 1 KiB
    REQUEST_TIMEOUT = 60

    @classmethod
    def run(cls, input_params: DownloadFileInput):
        """Download the file unless a valid copy already exists locally."""
        save_path = Path(input_params.save_path)
        logger = input_params.logger

        cls._ensure_parent_dir_exists(save_path)
        if cls._should_skip_download(save_path, input_params.sha256, logger):
            return

        response = cls._make_http_request(input_params.file_url, logger)
        cls._save_response_with_progress(response, save_path, logger)

    @staticmethod
    def _ensure_parent_dir_exists(path: Path):
        """Create the destination's parent directory tree if missing."""
        path.parent.mkdir(parents=True, exist_ok=True)

    @classmethod
    def _should_skip_download(
        cls, path: Path, expected_sha256: Optional[str], logger: logging.Logger
    ) -> bool:
        """True when an existing local file makes the download unnecessary."""
        if not path.exists():
            return False

        if expected_sha256 is None:
            logger.info("File exists (no checksum verification): %s", path)
            return True

        if cls.check_file_sha256(path, expected_sha256):
            logger.info("File exists and is valid: %s", path)
            return True

        logger.warning("File exists but is invalid, redownloading: %s", path)
        return False

    @classmethod
    def _make_http_request(cls, url: str, logger: logging.Logger) -> requests.Response:
        """Open a streaming GET request; wrap any failure in DownloadFileException."""
        logger.info("Initiating download: %s", url)
        try:
            response = requests.get(url, stream=True, timeout=cls.REQUEST_TIMEOUT)
            response.raise_for_status()  # Raises HTTPError for 4XX/5XX
            return response
        except requests.RequestException as e:
            logger.error("Download failed: %s", url)
            raise DownloadFileException(f"Failed to download {url}") from e

    @classmethod
    def _save_response_with_progress(
        cls, response: requests.Response, save_path: Path, logger: logging.Logger
    ) -> None:
        """Stream the response body to disk, updating a tqdm bar per chunk."""
        total_size = int(response.headers.get("content-length", 0))
        logger.info("Download size: %.2fMB", total_size / 1024 / 1024)

        with tqdm(
            total=total_size,
            unit="iB",
            unit_scale=True,
            disable=not cls.check_is_atty(),
        ) as progress_bar:
            with open(save_path, "wb") as output_file:
                for chunk in response.iter_content(chunk_size=cls.BLOCK_SIZE):
                    progress_bar.update(len(chunk))
                    output_file.write(chunk)

        logger.info("Successfully saved to: %s", save_path)

    @staticmethod
    def check_file_sha256(file_path: Union[str, Path], gt_sha256: str) -> bool:
        """Compare the file's SHA-256 digest to the expected one."""
        return get_file_sha256(file_path) == gt_sha256

    @staticmethod
    def check_is_atty() -> bool:
        """True when stderr is an interactive terminal (enables the bar)."""
        try:
            is_interactive = sys.stderr.isatty()
        except AttributeError:
            return False
        return is_interactive


class DownloadFileException(Exception):
    pass


# ===== FILE: rapid_layout/utils/load_image.py (module header) =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from io import BytesIO
from typing import Any

import cv2
import numpy as np
from PIL import Image, ImageOps, UnidentifiedImageError

from .utils import is_url

root_dir = Path(__file__).resolve().parent

InputType = Union[str, np.ndarray, bytes, Path, Image.Image]
class LoadImage:
    """Load str/Path/URL/bytes/ndarray/PIL inputs as a 3-channel BGR ndarray."""

    def __init__(self):
        pass

    def __call__(self, img: InputType) -> np.ndarray:
        """Load and normalize an image to BGR.

        Raises:
            LoadImageError: On unsupported input types or undecodable data.
        """
        if not isinstance(img, InputType.__args__):
            raise LoadImageError(
                f"The img type {type(img)} does not in {InputType.__args__}"
            )

        origin_img_type = type(img)
        img = self.load_img(img)
        img = self.convert_img(img, origin_img_type)
        return img

    def load_img(self, img: InputType) -> np.ndarray:
        """Decode any supported input into an RGB/gray ndarray."""
        if isinstance(img, (str, Path)):
            if is_url(str(img)):
                img = Image.open(requests.get(img, stream=True, timeout=60).raw)
            else:
                self.verify_exist(img)
                img = Image.open(img)

            # Honor EXIF orientation before converting to an array.
            img = self.exif_transpose(img)
            try:
                img = self.img_to_ndarray(img)
            except UnidentifiedImageError as e:
                raise LoadImageError(f"cannot identify image file {img}") from e
            return img

        if isinstance(img, bytes):
            img = self.img_to_ndarray(Image.open(BytesIO(img)))
            return img

        if isinstance(img, np.ndarray):
            return img

        if isinstance(img, Image.Image):
            return self.img_to_ndarray(img)

        raise LoadImageError(f"{type(img)} is not supported!")

    @staticmethod
    def verify_exist(file_path: Union[str, Path]):
        """Raise LoadImageError when the path does not exist."""
        if not Path(file_path).exists():
            raise LoadImageError(f"{file_path} does not exist.")

    @staticmethod
    def exif_transpose(img: Image.Image) -> Image.Image:
        """Apply EXIF orientation; fall back to the original on any failure."""
        try:
            img_corrected = ImageOps.exif_transpose(img)
            if img_corrected is None:
                return img
            return img_corrected
        except Exception:
            return img

    def img_to_ndarray(self, img: Image.Image) -> np.ndarray:
        """Convert a PIL image to ndarray; 1-bit images become 8-bit gray."""
        # FIX: collapsed a redundant duplicated `return np.array(img)` — the
        # mode "1" branch and the fall-through returned the same expression.
        if img.mode == "1":
            img = img.convert("L")
        return np.array(img)

    def convert_img(self, img: np.ndarray, origin_img_type: Any) -> np.ndarray:
        """Normalize channel layout to 3-channel BGR.

        Raises:
            LoadImageError: On unexpected ndim or channel counts.
        """
        if img.ndim == 2:
            return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

        if img.ndim == 3:
            channel = img.shape[2]
            if channel == 1:
                return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            if channel == 2:
                return self.cvt_two_to_three(img)

            if channel == 3:
                # PIL-decoded sources are RGB and need swapping; raw ndarray
                # input is assumed already BGR.
                if issubclass(origin_img_type, (str, Path, bytes, Image.Image)):
                    return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                return img

            if channel == 4:
                return self.cvt_four_to_three(img)

            raise LoadImageError(
                f"The channel({channel}) of the img is not in [1, 2, 3, 4]"
            )

        raise LoadImageError(f"The ndim({img.ndim}) of the img is not in [2, 3]")

    @staticmethod
    def cvt_two_to_three(img: np.ndarray) -> np.ndarray:
        """gray + alpha -> BGR (transparent pixels become white)."""
        img_gray = img[..., 0]
        img_bgr = cv2.cvtColor(img_gray, cv2.COLOR_GRAY2BGR)

        img_alpha = img[..., 1]
        not_a = cv2.bitwise_not(img_alpha)
        not_a = cv2.cvtColor(not_a, cv2.COLOR_GRAY2BGR)

        new_img = cv2.bitwise_and(img_bgr, img_bgr, mask=img_alpha)
        new_img = cv2.add(new_img, not_a)
        return new_img

    @staticmethod
    def cvt_four_to_three(img: np.ndarray) -> np.ndarray:
        """RGBA -> BGR, choosing a background color that maximizes text contrast."""
        rgb = img[:, :, :3]  # shape (H, W, 3)
        alpha = img[:, :, 3]  # shape (H, W)

        # Collect RGB pixels from the non-transparent region.
        mask = alpha > 0
        non_transparent_rgb = rgb[mask]  # shape (N, 3)

        if non_transparent_rgb.size == 0:
            # Fully transparent image: default to a white background.
            bg_color = (255, 255, 255)
        else:
            # Average brightness via the weighted grayscale formula:
            # luminance = 0.299*R + 0.587*G + 0.114*B
            r, g, b = (
                non_transparent_rgb[:, 0],
                non_transparent_rgb[:, 1],
                non_transparent_rgb[:, 2],
            )
            luminance = 0.299 * r + 0.587 * g + 0.114 * b
            avg_luminance = np.mean(luminance)

            # Pick a high-contrast background based on the mean brightness.
            bg_color = (255, 255, 255) if avg_luminance < 128 else (0, 0, 0)

        # Build the background image.
        background = np.full_like(rgb, bg_color, dtype=np.uint8)

        # Composite: foreground = rgb * (alpha/255), background = bg * (1 - alpha/255)
        alpha_norm = alpha.astype(np.float32) / 255.0
        foreground_blend = rgb.astype(np.float32) * alpha_norm[..., None]
        background_blend = background.astype(np.float32) * (1.0 - alpha_norm)[..., None]
        blended = (foreground_blend + background_blend).astype(np.uint8)

        return cv2.cvtColor(blended, cv2.COLOR_RGB2BGR)


class LoadImageError(Exception):
    pass


# ===== FILE: rapid_layout/utils/logger.py =====
# -*- encoding: utf-8 -*-
import logging

import colorlog


class Logger:
    """Colored console logger factory (one StreamHandler per named logger)."""

    def __init__(self, log_level=logging.INFO, logger_name=None):
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(log_level)
        self.logger.propagate = False

        formatter = colorlog.ColoredFormatter(
            f"%(log_color)s[%(levelname)s] %(asctime)s [{logger_name}] %(filename)s:%(lineno)d: %(message)s",
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )

        # Attach exactly one console handler per named logger.
        # FIX: removed a dead loop that removed handlers inside this branch —
        # `self.logger.handlers` is guaranteed empty here, so the loop never ran.
        if not self.logger.handlers:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
            console_handler.setLevel(log_level)
            self.logger.addHandler(console_handler)

    def get_log(self):
        """Return the configured logging.Logger instance."""
        return self.logger


logger = Logger(log_level=logging.INFO, logger_name="RapidLayout").get_log()
# ===== FILE: rapid_layout/utils/typings.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import dataclasses
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import List, Optional, Union

import numpy as np

from .logger import Logger
from .utils import save_img

logger = Logger(logger_name=__name__).get_log()


class ModelType(Enum):
    """Identifiers of every layout model shipped with the package."""

    PP_LAYOUT_CDLA = "pp_layout_cdla"
    PP_LAYOUT_PUBLAYNET = "pp_layout_publaynet"
    PP_LAYOUT_TABLE = "pp_layout_table"
    YOLOV8N_LAYOUT_PAPER = "yolov8n_layout_paper"
    YOLOV8N_LAYOUT_REPORT = "yolov8n_layout_report"
    YOLOV8N_LAYOUT_PUBLAYNET = "yolov8n_layout_publaynet"
    YOLOV8N_LAYOUT_GENERAL6 = "yolov8n_layout_general6"
    DOCLAYOUT_DOCSTRUCTBENCH = "doclayout_docstructbench"
    DOCLAYOUT_D4LA = "doclayout_d4la"
    DOCLAYOUT_DOCSYNTH = "doclayout_docsynth"
    PP_DOC_LAYOUTV2 = "pp_doc_layoutv2"
    PP_DOC_LAYOUTV3 = "pp_doc_layoutv3"


class EngineType(Enum):
    """Supported inference backends."""

    ONNXRUNTIME = "onnxruntime"
    OPENVINO = "openvino"


@dataclass
class RapidLayoutInput:
    """User-facing configuration for a layout-detection run."""

    model_type: ModelType = ModelType.PP_LAYOUT_CDLA
    model_dir_or_path: Union[str, Path, None] = None

    engine_type: EngineType = EngineType.ONNXRUNTIME
    engine_cfg: dict = field(default_factory=dict)

    conf_thresh: float = 0.5
    iou_thresh: float = 0.5

    @classmethod
    def normalize_kwargs(cls, kwargs: dict) -> dict:
        """Keep only this dataclass's fields and coerce model_type /
        engine_type from str to their enum types."""
        valid = {f.name for f in dataclasses.fields(cls)}
        filtered = {k: v for k, v in kwargs.items() if k in valid}

        if "model_type" in filtered and isinstance(filtered["model_type"], str):
            filtered["model_type"] = ModelType(filtered["model_type"])

        if "engine_type" in filtered and isinstance(filtered["engine_type"], str):
            filtered["engine_type"] = EngineType(filtered["engine_type"])

        return filtered


@dataclass
class RapidLayoutOutput:
    """Result bundle of one detection pass, with optional visualization."""

    img: Optional[np.ndarray] = None
    boxes: Optional[List[List[float]]] = None
    class_names: Optional[List[str]] = None
    scores: Optional[List[float]] = None
    elapse: Optional[float] = None

    def vis(self, save_path: Union[str, Path, None] = None) -> Optional[np.ndarray]:
        """Draw detections on the image; optionally save and return it."""
        if self.img is None or self.boxes is None:
            logger.warning("No image or boxes to visualize.")
            return None

        # Imported lazily to avoid a hard cv2 dependency at module import.
        from .vis_res import VisLayout

        vis_img = VisLayout.draw_detections(
            self.img,
            np.array(self.boxes),
            np.array(self.scores),
            np.array(self.class_names),
        )
        if save_path is not None and vis_img is not None:
            save_img(save_path, vis_img)
            logger.info(f"Visualization saved as {save_path}")
        return vis_img
# ===== FILE: rapid_layout/utils/utils.py =====
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
import hashlib
import importlib
from pathlib import Path
from typing import Tuple, Union
from urllib.parse import urlparse

import cv2
import numpy as np
from omegaconf import DictConfig, OmegaConf


def mkdir(dir_path):
    """Create dir_path (and any missing parents); no error if it exists."""
    Path(dir_path).mkdir(parents=True, exist_ok=True)


def read_yaml(file_path: Union[str, Path]) -> DictConfig:
    """Load a YAML file into an OmegaConf DictConfig."""
    return OmegaConf.load(file_path)


def quads_to_rect_bbox(bbox: np.ndarray) -> Tuple[float, float, float, float]:
    """Collapse (N, 4, 2) quads into one axis-aligned (xmin, ymin, xmax, ymax).

    Args:
        bbox: Array of N quadrilaterals, shape (N, 4, 2).

    Returns:
        The enclosing rectangle of all quads.

    Raises:
        ValueError: If bbox is not a 3-D array of shape (N, 4, 2).
    """
    if bbox.ndim != 3:
        raise ValueError("bbox shape must be 3")

    # FIX: was `and`, which only rejected arrays failing BOTH checks and let
    # invalid shapes such as (N, 4, 3) or (N, 5, 2) through validation.
    if bbox.shape[1] != 4 or bbox.shape[2] != 2:
        raise ValueError("bbox shape must be (N, 4, 2)")

    all_x, all_y = (bbox[:, :, 0].flatten(), bbox[:, :, 1].flatten())
    x_min, y_min = np.min(all_x), np.min(all_y)
    x_max, y_max = np.max(all_x), np.max(all_y)
    return float(x_min), float(y_min), float(x_max), float(y_max)


def has_chinese_char(text: str) -> bool:
    """True if text contains any CJK unified ideograph (U+4E00..U+9FFF)."""
    return any("\u4e00" <= ch <= "\u9fff" for ch in text)


def get_file_sha256(file_path: Union[str, Path], chunk_size: int = 65536) -> str:
    """Compute the SHA-256 hex digest of a file, reading in chunks."""
    with open(file_path, "rb") as file:
        sha_signature = hashlib.sha256()
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break
            sha_signature.update(chunk)
    return sha_signature.hexdigest()


def save_img(save_path: Union[str, Path], img: np.ndarray):
    """Write img to save_path, creating parent directories as needed."""
    # mkdir(parents=True, exist_ok=True) already tolerates existing dirs, so
    # the previous exists() pre-check was redundant.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    cv2.imwrite(str(save_path), img)


def is_url(url: str) -> bool:
    """True when url parses with both a scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except Exception:
        return False


def import_package(name, package=None):
    """Import a module by name; return None instead of raising when missing."""
    try:
        module = importlib.import_module(name, package=package)
        return module
    except ModuleNotFoundError:
        return None
sha_signature.hexdigest() def save_img(save_path: Union[str, Path], img: np.ndarray): if not Path(save_path).parent.exists(): Path(save_path).parent.mkdir(parents=True, exist_ok=True) cv2.imwrite(str(save_path), img) def is_url(url: str) -> bool: try: result = urlparse(url) return all([result.scheme, result.netloc]) except Exception: return False def import_package(name, package=None): try: module = importlib.import_module(name, package=package) return module except ModuleNotFoundError: return None ================================================ FILE: rapid_layout/utils/vis_res.py ================================================ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com from typing import Optional, Tuple import cv2 import numpy as np class VisLayout: @classmethod def draw_detections( cls, image: np.ndarray, boxes: Optional[np.ndarray], scores: Optional[np.ndarray], class_names: Optional[np.ndarray], mask_alpha=0.3, ) -> Optional[np.ndarray]: """_summary_ Args: image (np.ndarray): H x W x C boxes (np.ndarray): (N, 4) scores (np.ndarray): (N, ) class_ids (np.ndarray): (N, ) mask_alpha (float, optional): _description_. Defaults to 0.3. 
Returns: np.ndarray: _description_ """ if boxes is None or scores is None or class_names is None: return None det_img = image.copy() img_height, img_width = image.shape[:2] font_size = min([img_height, img_width]) * 0.0006 text_thickness = int(min([img_height, img_width]) * 0.001) det_img = cls.draw_masks(det_img, boxes, mask_alpha) for label, box, score in zip(class_names, boxes, scores): color = cls.get_color() cls.draw_box(det_img, box, color) caption = f"{label} {int(score * 100)}%" cls.draw_text(det_img, caption, box, color, font_size, text_thickness) return det_img @staticmethod def draw_box( image: np.ndarray, box: np.ndarray, color: Tuple[int, int, int] = (0, 0, 255), thickness: int = 2, ) -> np.ndarray: x1, y1, x2, y2 = box.astype(int) return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness) @staticmethod def draw_text( image: np.ndarray, text: str, box: np.ndarray, color: Tuple[int, int, int] = (0, 0, 255), font_size: float = 0.001, text_thickness: int = 2, ) -> np.ndarray: x1, y1, x2, y2 = box.astype(int) (tw, th), _ = cv2.getTextSize( text=text, fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=font_size, thickness=text_thickness, ) th = int(th * 1.2) cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1) return cv2.putText( image, text, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, font_size, (255, 255, 255), text_thickness, cv2.LINE_AA, ) @classmethod def draw_masks( cls, image: np.ndarray, boxes: np.ndarray, mask_alpha: float = 0.3, ) -> np.ndarray: mask_img = image.copy() for box in boxes: color = cls.get_color() x1, y1, x2, y2 = box.astype(int) cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1) return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0) @staticmethod def get_color(): colors = ( np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255), ) return colors ================================================ FILE: requirements.txt ================================================ opencv_python>=4.5.1.48 
numpy>=2.0.0 Pillow tqdm requests colorlog omegaconf ================================================ FILE: setup.py ================================================ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com import sys from pathlib import Path from typing import List, Union from get_pypi_latest_version import GetPyPiLatestVersion from setuptools import find_packages, setup def read_txt(txt_path: Union[Path, str]) -> List[str]: with open(txt_path, "r", encoding="utf-8") as f: data = [v.rstrip("\n") for v in f] return data def get_readme(): root_dir = Path(__file__).resolve().parent readme_path = str(root_dir / "docs" / "doc_whl_rapid_layout.md") with open(readme_path, "r", encoding="utf-8") as f: readme = f.read() return readme MODULE_NAME = "rapid_layout" obtainer = GetPyPiLatestVersion() latest_version = obtainer(MODULE_NAME) VERSION_NUM = obtainer.version_add_one(latest_version, add_patch=True) if len(sys.argv) > 2: match_str = " ".join(sys.argv[2:]) matched_versions = obtainer.extract_version(match_str) if matched_versions: VERSION_NUM = matched_versions sys.argv = sys.argv[:2] setup( name=MODULE_NAME, version=VERSION_NUM, platforms="Any", long_description=get_readme(), long_description_content_type="text/markdown", description="Tools for document layout analysis based ONNXRuntime.", author="SWHL", author_email="liekkaskono@163.com", url="https://github.com/RapidAI/RapidLayout", license="Apache-2.0", include_package_data=True, install_requires=read_txt("requirements.txt"), packages=find_packages(), package_data={"": ["*.onnx", "*.yaml"]}, keywords=["ppstructure,layout,rapidocr,rapid_layout"], classifiers=[ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 
3.13", ], python_requires=">=3.6", entry_points={"console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"]}, ) ================================================ FILE: tests/test_engine.py ================================================ # -*- encoding: utf-8 -*- # @Author: SWHL # @Contact: liekkaskono@163.com import sys from pathlib import Path import pytest cur_dir = Path(__file__).resolve().parent root_dir = cur_dir.parent sys.path.insert(0, str(root_dir)) from rapid_layout import EngineType, ModelType, RapidLayout, RapidLayoutInput test_dir = cur_dir / "test_files" # 与 test_main.py 保持一致:(图片名, 模型类型, 期望检测框数量) ENGINE_TEST_CASES = [ ("layout.jpg", "pp_layout_cdla", 14), ("PMC3576793_00004.jpg", "yolov8n_layout_publaynet", 12), ("PMC3576793_00004.jpg", "yolov8n_layout_general6", 13), ("PMC3576793_00004.jpg", "doclayout_docstructbench", 14), ] def get_engine(params: RapidLayoutInput): return RapidLayout(cfg=params) @pytest.mark.parametrize("img_name, model_type, gt", ENGINE_TEST_CASES) def test_engine_onnxruntime(img_name, model_type, gt): """使用 onnxruntime 引擎推理,结果与 test_main 预期一致。""" params = RapidLayoutInput( model_type=ModelType(model_type), engine_type=EngineType.ONNXRUNTIME, ) engine = get_engine(params) img_path = test_dir / img_name results = engine(img_path) assert results.boxes is not None assert len(results.boxes) == gt @pytest.mark.parametrize("img_name, model_type, gt", ENGINE_TEST_CASES) def test_engine_openvino(img_name, model_type, gt): """使用 openvino 引擎推理,结果与 test_main 预期一致。""" pytest.importorskip( "openvino", reason="openvino not installed, skip openvino tests" ) params = RapidLayoutInput( model_type=ModelType(model_type), engine_type=EngineType.OPENVINO, ) engine = get_engine(params) img_path = test_dir / img_name results = engine(img_path) assert results.boxes is not None assert len(results.boxes) == gt ================================================ FILE: tests/test_main.py ================================================ # -*- encoding: utf-8 
# @Author: SWHL
# @Contact: liekkaskono@163.com
import shlex
import sys
from pathlib import Path
from typing import Optional

import pytest

cur_dir = Path(__file__).resolve().parent
root_dir = cur_dir.parent

# Consistency fix: was sys.path.append, while tests/test_engine.py uses
# insert(0, ...); insert ensures the in-repo package shadows any
# pip-installed rapid_layout.
sys.path.insert(0, str(root_dir))

from rapid_layout import ModelType, RapidLayout, RapidLayoutInput
from rapid_layout.main import main

test_dir = cur_dir / "test_files"


def get_engine(params: Optional[RapidLayoutInput] = None):
    """Build a RapidLayout engine, with or without an explicit configuration."""
    # Explicit `is not None` instead of truthiness: the sentinel is None,
    # and a (hypothetical) falsy-but-valid config should still be honored.
    if params is not None:
        return RapidLayout(cfg=params)
    return RapidLayout()


@pytest.mark.parametrize(
    "img_name,model_type,gt",
    [
        ("layout.jpg", "pp_layout_cdla", 14),
        ("PMC3576793_00004.jpg", "yolov8n_layout_publaynet", 12),
        ("PMC3576793_00004.jpg", "yolov8n_layout_general6", 13),
        ("PMC3576793_00004.jpg", "doclayout_docstructbench", 14),
        ("pp_doc_layoutv2_layout.jpg", "pp_doc_layoutv2", 13),
        ("pp_doc_layoutv2_layout.jpg", "pp_doc_layoutv3", 13),
    ],
)
def test_normal(img_name, model_type, gt):
    """Each model detects the expected number of boxes on its sample image."""
    img_path = test_dir / img_name
    engine = get_engine(params=RapidLayoutInput(model_type=ModelType(model_type)))
    results = engine(img_path)
    assert results.boxes is not None
    assert len(results.boxes) == gt


@pytest.mark.parametrize(
    "command, expected_output",
    [
        (f"{test_dir / 'layout.jpg'} --model_type pp_layout_cdla", 0),
    ],
)
def test_main_cli(capsys, command, expected_output):
    """The CLI entry point prints a non-empty result for a valid invocation."""
    main(shlex.split(command))
    output = capsys.readouterr().out.rstrip()
    assert len(output) > expected_output


def test_init_with_kwargs():
    """Construct with keyword arguments only, without a cfg object."""
    engine = RapidLayout(model_type=ModelType.PP_LAYOUT_CDLA, conf_thresh=0.5)
    img_path = test_dir / "layout.jpg"
    results = engine(img_path)
    assert results.boxes is not None
    assert len(results.boxes) == 14


def test_init_with_kwargs_model_type_string():
    """A string model_type passed in kwargs is coerced to the enum."""
    engine = RapidLayout(model_type="pp_layout_cdla", conf_thresh=0.5)
    img_path = test_dir / "layout.jpg"
    results = engine(img_path)
    assert results.boxes is not None
    assert len(results.boxes) == 14


def test_init_with_cfg():
    """Construct from a configuration object only."""
    cfg = RapidLayoutInput(model_type=ModelType.PP_LAYOUT_CDLA, conf_thresh=0.5)
    engine = RapidLayout(cfg=cfg)
    img_path = test_dir / "layout.jpg"
    results = engine(img_path)
    assert results.boxes is not None
    assert len(results.boxes) == 14


def test_init_with_cfg_and_kwargs_override():
    """kwargs passed alongside cfg override the matching cfg fields."""
    cfg = RapidLayoutInput(model_type=ModelType.PP_LAYOUT_CDLA, conf_thresh=0.5)
    engine = RapidLayout(cfg=cfg, conf_thresh=0.4)
    img_path = test_dir / "layout.jpg"
    results = engine(img_path)
    assert results.boxes is not None
    # Lower confidence threshold admits one extra box vs conf_thresh=0.5.
    assert len(results.boxes) == 15