[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncoverage.xml\n*.mo\n*.pot\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n.idea/\n*.iml\n\n# VS Code\n.vscode/\n!.vscode/settings.json\n!.vscode/tasks.json\n!.vscode/launch.json\n!.vscode/extensions.json\n\n# macOS\n.DS_Store\n\n# Windows\nThumbs.db\nehthumbs.db\nDesktop.ini\n\nfusion_result.json\nkernel_meta/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  # 1. isort - 自动排序 Python imports\n  - repo: https://github.com/pycqa/isort\n    rev: 6.0.1  # 使用固定版本号\n    hooks:\n      - id: isort\n        name: isort (python)\n        args: [--profile=black]  # 与 Black 兼容的配置\n        language: python\n\n  # 2. Black - 自动格式化 Python 代码\n  - repo: https://github.com/psf/black\n    rev: 25.1.0  # 使用固定版本号\n    hooks:\n      - id: black\n        language: python\n\n  # 3. flake8 - Python 静态检查\n  - repo: https://github.com/pycqa/flake8\n    rev: 7.2.0\n    hooks:\n      - id: flake8\n        args: [--max-line-length=120, --ignore=E203]  # 设置行长度为 120\n        additional_dependencies: [flake8-bugbear==24.12.12]  # 可选：增强检查\n\n  # 4. pre-commit-hooks - 通用 Git 钩子\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v5.0.0\n    hooks:\n      - id: trailing-whitespace  # 删除行尾空格\n      - id: end-of-file-fixer    # 确保文件以换行符结束\n      - id: check-yaml           # 验证 YAML 文件语法\n      - id: check-added-large-files  # 阻止大文件提交\n        args: [\"--maxkb=512\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "Qwen RESEARCH LICENSE AGREEMENT\n\nQwen RESEARCH LICENSE AGREEMENT Release Date: September 19, 2024\n\nBy clicking to agree or by using or distributing any portion or element of the Qwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.\n\n1. Definitions\n    a. This Qwen RESEARCH LICENSE AGREEMENT (this \"Agreement\") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement.\n    b. \"We\" (or \"Us\") shall mean Alibaba Cloud.\n    c. \"You\" (or \"Your\") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use.\n    d. \"Third Parties\" shall mean individuals or legal entities that are not under common control with us or you.\n    e. \"Qwen\" shall mean the large language models, and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by us.\n    f. \"Materials\" shall mean, collectively, Alibaba Cloud's proprietary Qwen and Documentation (and any portion thereof) made available under this Agreement.\n    g. \"Source\" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files.\n    h. \"Object\" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.\n    i. \"Non-Commercial\" shall mean for research or evaluation purposes only.\n\n2. Grant of Rights\n    a. 
You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials FOR NON-COMMERCIAL PURPOSES ONLY. \n    b. If you are commercially using the Materials, you shall request a license from us.\n\n3. Redistribution\nYou may distribute copies or make the Materials, or derivative works thereof, available as part of a product or service that contains any of them, with or without modifications, and in Source or Object form, provided that you meet the following conditions:\n    a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement;\n    b. You shall cause any modified files to carry prominent notices stating that you changed the files;\n    c. You shall retain in all copies of the Materials that you distribute the following attribution notices within a \"Notice\" text file distributed as a part of such copies: \"Qwen is licensed under the Qwen RESEARCH LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved.\"; and\n    d. You may add your own copyright statement to your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such derivative works as a whole, provided your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement.\n\n4. Rules of use\n    a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials.\n    b. 
If you use the Materials or any outputs or results therefrom to create, train, fine-tune, or improve an AI model that is distributed or made available, you shall prominently display “Built with Qwen” or “Improved using Qwen” in the related product documentation.\n\n5. Intellectual Property\n    a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications.\n    b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials.\n    c. If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licenses granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought.\n\n6. Disclaimer of Warranty and Limitation of Liability\n    a. We are not obligated to support, update, provide training for, or develop any further version of the Qwen Materials or to grant any license thereto.\n    b. THE MATERIALS ARE PROVIDED \"AS IS\" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM.\n    c. 
IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED.\n    d. You will defend, indemnify and hold harmless us from and against any claim by any third party arising out of or related to your use or distribution of the Materials.\n\n7. Survival and Termination.\n    a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.\n    b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 6 and 8 shall survive the termination of this Agreement.\n\n8. Governing Law and Jurisdiction.\n    a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.\n    b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement.\n\n9. Other Terms and Conditions.\n    a. Any arrangements, understandings, or agreements regarding the Material not stated herein are separate from and independent of the terms and conditions of this Agreement. You shall request a separate license from us, if you use the Materials in ways not expressly agreed to in this Agreement. \n    b. We shall not be bound by any additional or different terms or conditions communicated by you unless expressly agreed.\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n  <img src=\"./assets/dolphin.png\" width=\"300\">\n</div>\n\n<div align=\"center\">\n  <a href=\"https://arxiv.org/abs/2505.14059\">\n    <img src=\"https://img.shields.io/badge/Paper-arXiv-red\">\n  </a>\n  <a href=\"https://huggingface.co/ByteDance/Dolphin-v2\">\n    <img src=\"https://img.shields.io/badge/HuggingFace-Dolphin-yellow\">\n  </a>\n  <a href=\"https://github.com/bytedance/Dolphin\">\n    <img src=\"https://img.shields.io/badge/Code-Github-green\">\n  </a>\n  <a href=\"https://opensource.org/licenses/MIT\">\n    <img src=\"https://img.shields.io/badge/License-MIT-lightgray\">\n  </a>\n  <br>\n</div>\n\n<br>\n\n<div align=\"center\">\n  <img src=\"./assets/demo.gif\" width=\"800\">\n</div>\n\n# Dolphin: Document Image Parsing via Heterogeneous Anchor Prompting\nDolphin-v2 is an enhanced universal document parsing model that substantially improves upon the original Dolphin. It seamlessly handles any document type—whether digital-born or photographed—through a document-type-aware two-stage architecture with scalable anchor prompting.\n\n\n## 📑 Overview\n\nDocument image parsing is challenging due to diverse document types and complexly intertwined elements such as text paragraphs, figures, formulas, tables, and code blocks. Dolphin-v2 addresses these challenges through a document-type-aware two-stage approach:\n\n1. **🔍 Stage 1**: Document type classification (digital vs. photographed) + layout analysis with reading order prediction\n2. 
**🧩 Stage 2**: Hybrid parsing strategy - holistic parsing for photographed documents, parallel element-wise parsing for digital documents\n\n<div align=\"center\">\n  <img src=\"./assets/framework.png\" width=\"680\">\n</div>\n\nDolphin achieves promising performance across diverse page-level and element-level parsing tasks while ensuring superior efficiency through its lightweight architecture and parallel parsing mechanism.\n\n<!-- ## 🚀 Demo\nTry our demo on [Demo-Dolphin](https://huggingface.co/spaces/ByteDance/Dolphin). -->\n\n## 📅 Changelog\n- 🔥 **2025.12.12** Released *Dolphin-v2* model. Upgraded to 3B parameters with 21-element detection, attribute field extraction, dedicated formula/code parsing, and robust photographed document parsing. (Dolphin-1.5 moved to [v1.5 branch](https://github.com/bytedance/Dolphin/tree/v1.5))\n- 🔥 **2025.10.16** Released *Dolphin-1.5* model. While maintaining the lightweight 0.3B architecture, this version achieves significant parsing improvements. (Dolphin 1.0 moved to [v1.0 branch](https://github.com/bytedance/Dolphin/tree/v1.0))\n- 🔥 **2025.07.10** Released the *Fox-Page Benchmark*, a manually refined subset of the original [Fox dataset](https://github.com/ucaslcl/Fox). Download via: [Baidu Yun](https://pan.baidu.com/share/init?surl=t746ULp6iU5bUraVrPlMSw&pwd=fox1) | [Google Drive](https://drive.google.com/file/d/1yZQZqI34QCqvhB4Tmdl3X_XEvYvQyP0q/view?usp=sharing).\n- 🔥 **2025.06.30** Added [TensorRT-LLM support](https://github.com/bytedance/Dolphin/blob/master/deployment/tensorrt_llm/ReadMe.md) for accelerated inference！\n- 🔥 **2025.06.27** Added [vLLM support](https://github.com/bytedance/Dolphin/blob/master/deployment/vllm/ReadMe.md) for accelerated inference！\n- 🔥 **2025.06.13** Added multi-page PDF document parsing capability.\n- 🔥 **2025.05.21** Our demo is released at [link](http://115.190.42.15:8888/dolphin/). 
Check it out!\n- 🔥 **2025.05.20** The pretrained model and inference code of Dolphin are released.\n- 🔥 **2025.05.16** Our paper has been accepted by ACL 2025. Paper link: [arXiv](https://arxiv.org/abs/2505.14059).\n\n## 📈 Performance\n\n<table style=\"width:90%; border-collapse: collapse; text-align: center;\">\n    <caption>Comprehensive evaluation of document parsing on OmniDocBench (v1.5)</caption>\n    <thead>\n        <tr>\n            <th style=\"text-align: center !important;\">Model</th>\n            <th style=\"text-align: center !important;\">Size</th>\n            <th style=\"text-align: center !important;\">Overall&#x2191;</th>\n            <th style=\"text-align: center !important;\">Text<sup>Edit</sup>&#x2193;</th>\n            <th style=\"text-align: center !important;\">Formula<sup>CDM</sup>&#x2191;</th>\n            <th style=\"text-align: center !important;\">Table<sup>TEDS</sup>&#x2191;</th>\n            <th style=\"text-align: center !important;\">Table<sup>TEDS-S</sup>&#x2191;</th>\n            <th style=\"text-align: center !important;\">Read Order<sup>Edit</sup>&#x2193;</th>\n        </tr>\n    </thead>\n    <tbody>\n        <tr>\n            <td>Dolphin</td>\n            <td>0.3B</td>\n            <td>74.67</td>\n            <td>0.125</td>\n            <td>67.85</td>\n            <td>68.70</td>\n            <td>77.77</td>\n            <td>0.124</td>\n        </tr>\n        <tr>\n            <td>Dolphin-1.5</td>\n            <td>0.3B</td>\n            <td>85.06</td>\n            <td>0.085</td>\n            <td>79.44</td>\n            <td>84.25</td>\n            <td>88.06</td>\n            <td>0.071</td>\n        </tr>\n        <tr>\n            <td>Dolphin-v2</td>\n            <td>3B</td>\n            <td><strong>89.78</strong></td>\n            <td><strong>0.054</strong></td>\n            <td><strong>87.63</strong></td>\n            <td><strong>87.02</strong></td>\n            <td><strong>90.48</strong></td>\n            
<td><strong>0.054</strong></td>\n        </tr>\n    </tbody>\n</table>\n\n## 🛠️ Installation\n\n1. Clone the repository:\n   ```bash\n   git clone https://github.com/ByteDance/Dolphin.git\n   cd Dolphin\n   ```\n\n2. Install the dependencies:\n   ```bash\n   pip install -r requirements.txt\n   ```\n\n3. Download the pre-trained models of *Dolphin-v2*:\n\n   Visit our Huggingface [model card](https://huggingface.co/ByteDance/Dolphin-v2), or download model by:\n   \n   ```bash\n   # Download the model from Hugging Face Hub\n   git lfs install\n   git clone https://huggingface.co/ByteDance/Dolphin-v2 ./hf_model\n   # Or use the Hugging Face CLI\n   pip install huggingface_hub\n   huggingface-cli download ByteDance/Dolphin-v2 --local-dir ./hf_model\n   ```\n\n## ⚡ Inference\n\nDolphin provides two inference frameworks with support for two parsing granularities:\n- **Page-level Parsing**: Parse the entire document page into a structured JSON and Markdown format\n- **Element-level Parsing**: Parse individual document elements (text, table, formula)\n\n\n### 📄 Page-level Parsing\n\n```bash\n# Process a single document image\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_1.png \n\n# Process a single document pdf\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_6.pdf \n\n# Process all documents in a directory\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \n\n# Process with custom batch size for parallel element decoding\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \\\n    --max_batch_size 8\n```\n\n### 🧩 Element-level Parsing\n\n````bash\n# Process element images (specify element_type: table, formula, text, or code)\npython demo_element.py --model_path ./hf_model --save_dir ./results \\\n    --input_path  \\\n    --element_type 
[table|formula|text|code]\n````\n\n### 🎨 Layout Parsing\n````bash\n# Process a single document image\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_1.png\n\n# Process a single PDF document\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_6.pdf\n\n# Process all documents in a directory\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \n````\n\n\n## 🌟 Key Features\n\n- 🔄 Two-stage analyze-then-parse approach based on a single VLM\n- 📊 Promising performance on document parsing tasks\n- 🔍 Natural reading order element sequence generation\n- 🧩 Heterogeneous anchor prompting for different document elements\n- ⏱️ Efficient parallel parsing mechanism\n- 🤗 Support for Hugging Face Transformers for easier integration\n\n\n## 📮 Notice\n**Call for Bad Cases:** If you have encountered any cases where the model performs poorly, we would greatly appreciate it if you could share them in an issue. 
We are continuously working to optimize and improve the model.\n\n## 💖 Acknowledgement\n\nWe would like to acknowledge the following open-source projects that provided inspiration and reference for this work:\n- [OmniDocBench](https://github.com/opendatalab/OmniDocBench)\n- [Donut](https://github.com/clovaai/donut/)\n- [Nougat](https://github.com/facebookresearch/nougat)\n- [GOT](https://github.com/Ucas-HaoranWei/GOT-OCR2.0)\n- [MinerU](https://github.com/opendatalab/MinerU/tree/master)\n- [Swin](https://github.com/microsoft/Swin-Transformer)\n- [Hugging Face Transformers](https://github.com/huggingface/transformers)\n\n## 📝 Citation\n\nIf you find this code useful for your research, please use the following BibTeX entry.\n\n```bibtex\n@article{feng2025dolphin,\n  title={Dolphin: Document Image Parsing via Heterogeneous Anchor Prompting},\n  author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and others},\n  journal={arXiv preprint arXiv:2505.14059},\n  year={2025}\n}\n```\n\n## Star History\n\n[![Star History Chart](https://api.star-history.com/svg?repos=bytedance/Dolphin&type=Date)](https://www.star-history.com/#bytedance/Dolphin&Date)\n"
  },
  {
    "path": "README_CN.md",
    "content": "<div align=\"center\">\n  <img src=\"./assets/dolphin.png\" width=\"300\">\n</div>\n\n<div align=\"center\">\n  <a href=\"https://arxiv.org/abs/2505.14059\">\n    <img src=\"https://img.shields.io/badge/论文-arXiv-red\">\n  </a>\n  <a href=\"https://huggingface.co/ByteDance/Dolphin-v2\">\n    <img src=\"https://img.shields.io/badge/HuggingFace-Dolphin-yellow\">\n  </a>\n  <a href=\"https://github.com/bytedance/Dolphin\">\n    <img src=\"https://img.shields.io/badge/代码-Github-green\">\n  </a>\n  <a href=\"https://opensource.org/licenses/MIT\">\n    <img src=\"https://img.shields.io/badge/许可证-MIT-lightgray\">\n  </a>\n  <br>\n</div>\n\n<br>\n\n<div align=\"center\">\n  <img src=\"./assets/demo.gif\" width=\"800\">\n</div>\n\n# Dolphin: 基于异构锚点提示的文档图像解析\n\nDolphin（**Do**cument Image **P**arsing via **H**eterogeneous Anchor Prompt**in**g）是一个创新的多模态文档图像解析模型（**0.3B**），采用\"分析-解析\"的两阶段范式。本仓库包含Dolphin的演示代码和预训练模型。\n\n## 📑 概述\n\n由于文档图像中文本段落、图表、公式和表格等元素的复杂交织，文档图像解析具有挑战性。Dolphin通过两阶段方法解决这些挑战：\n\n1. **🔍 第一阶段**：通过按自然阅读顺序生成元素序列进行全面的页面级布局分析\n2. 
**🧩 第二阶段**：使用异构锚点和任务特定提示高效并行解析文档元素\n\n<div align=\"center\">\n  <img src=\"./assets/framework.png\" width=\"680\">\n</div>\n\nDolphin在多样化的页面级和元素级解析任务中取得了优异的性能，同时通过其轻量级架构和并行解析机制确保了卓越的效率。\n\n## 📅 更新日志\n- 🔥 **2025.12.12** *Dolphin-v2* 开源！支持 21 类元素检测、属性字段提取、代码专用解析，以及拍照文档解析。（原1.5版本已迁移至[v1.5分支](https://github.com/bytedance/Dolphin/tree/v1.5)）\n- 🔥 **2025.10.16** *Dolphin-1.5* 开源！在保持轻量级0.3B架构的同时，该版本实现了显著的解析性能提升。（原1.0版本已迁移至[v1.0分支](https://github.com/bytedance/Dolphin/tree/v1.0)）\n- 🔥 **2025.07.10** *Fox-Page* 基准测试开源。这是原始 [Fox 数据集](https://github.com/ucaslcl/Fox) 人工矫正标注后的版本。下载地址：[百度网盘](https://pan.baidu.com/share/init?surl=t746ULp6iU5bUraVrPlMSw&pwd=fox1) | [Google Drive](https://drive.google.com/file/d/1yZQZqI34QCqvhB4Tmdl3X_XEvYvQyP0q/view?usp=sharing)。\n- 🔥 **2025.06.30** 新增[TensorRT-LLM](https://github.com/bytedance/Dolphin/blob/master/deployment/tensorrt_llm/ReadMe.md)支持，提升推理速度！\n- 🔥 **2025.06.27** 新增[vLLM](https://github.com/bytedance/Dolphin/blob/master/deployment/vllm/ReadMe.md)支持，提升推理速度！\n- 🔥 **2025.06.13** 新增多页PDF文档解析功能。\n- 🔥 **2025.05.21** 我们的演示已在 [链接](http://115.190.42.15:8888/dolphin/) 发布。快来体验吧！\n- 🔥 **2025.05.20** Dolphin的预训练模型和推理代码已发布。\n- 🔥 **2025.05.16** 我们的论文已被ACL 2025接收。论文链接：[arXiv](https://arxiv.org/abs/2505.14059)。\n\n## 📈 性能表现\n\n<table style=\"width:90%; border-collapse: collapse; text-align: center;\">\n    <caption>OmniDocBench (v1.5) 测试基准上评估结果</caption>\n    <thead>\n        <tr>\n            <th style=\"text-align: center !important;\">模型</th>\n            <th style=\"text-align: center !important;\">参数</th>\n            <th style=\"text-align: center !important;\">总体&#x2191;</th>\n            <th style=\"text-align: center !important;\">文本<sup>Edit</sup>&#x2193;</th>\n            <th style=\"text-align: center !important;\">公式<sup>CDM</sup>&#x2191;</th>\n            <th style=\"text-align: center !important;\">表格<sup>TEDS</sup>&#x2191;</th>\n            <th style=\"text-align: center !important;\">表格<sup>TEDS-S</sup>&#x2191;</th>\n            <th 
style=\"text-align: center !important;\">阅读顺序<sup>Edit</sup>&#x2193;</th>\n        </tr>\n    </thead>\n    <tbody>\n        <tr>\n            <td>Dolphin</td>\n            <td>0.3B</td>\n            <td>74.67</td>\n            <td>0.125</td>\n            <td>67.85</td>\n            <td>68.70</td>\n            <td>77.77</td>\n            <td>0.124</td>\n        </tr>\n        <tr>\n            <td>Dolphin-1.5</td>\n            <td>0.3B</td>\n            <td>85.06</td>\n            <td>0.085</td>\n            <td>79.44</td>\n            <td>84.25</td>\n            <td>88.06</td>\n            <td>0.071</td>\n        </tr>\n        <tr>\n            <td>Dolphin-v2</td>\n            <td>3B</td>\n            <td><strong>89.78</strong></td>\n            <td><strong>0.054</strong></td>\n            <td><strong>87.63</strong></td>\n            <td><strong>87.02</strong></td>\n            <td><strong>90.48</strong></td>\n            <td><strong>0.054</strong></td>\n        </tr>\n    </tbody>\n</table>\n\n## 🛠️ 安装\n\n1. 克隆仓库：\n   ```bash\n   git clone https://github.com/ByteDance/Dolphin.git\n   cd Dolphin\n   ```\n\n2. 安装依赖：\n   ```bash\n   pip install -r requirements.txt\n   ```\n\n3. 
使用以下选项之一下载 *Dolphin-v2* 的预训练模型：\n   访问我们的Huggingface [模型卡片](https://huggingface.co/ByteDance/Dolphin-v2)，或通过以下方式下载模型：\n   \n   ```bash\n   # 从Hugging Face Hub下载模型\n   git lfs install\n   git clone https://huggingface.co/ByteDance/Dolphin-v2 ./hf_model\n   # 或使用Hugging Face CLI\n   pip install huggingface_hub\n   huggingface-cli download ByteDance/Dolphin-v2 --local-dir ./hf_model\n   ```\n\n## ⚡ 推理\n\nDolphin提供两个推理框架，支持两种解析粒度：\n- **页面级解析**：将整个文档页面解析为结构化的JSON和Markdown格式\n- **元素级解析**：解析单个文档元素（文本、表格、公式）\n\n\n### 📄 页面级解析\n\n```bash\n# 处理单个文档图像\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_1.png \n\n# 处理单个文档PDF\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_6.pdf \n\n# 处理目录中的所有文档\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \n\n# 使用自定义批次大小进行并行元素解码\npython demo_page.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \\\n    --max_batch_size 8\n```\n\n### 🧩 元素级解析\n\n````bash\n# 解析块图像 (支持块图像类型: table, formula, text, or code)\npython demo_element.py --model_path ./hf_model --save_dir ./results \\\n    --input_path  \\\n    --element_type [table|formula|text|code]\n````\n\n### 🎨 元素定位及阅读顺序解析\n````bash\n# 处理单个文档图像\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_1.png \\\n    \n# 处理单个文档PDF\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs/page_6.pdf \\\n\n# 处理目录中的所有文档\npython demo_layout.py --model_path ./hf_model --save_dir ./results \\\n    --input_path ./demo/page_imgs \n````\n\n\n## 🌟 主要特性\n\n- 🔄 基于单一VLM的两阶段分析-解析方法\n- 📊 在文档解析任务上的优异性能\n- 🔍 自然阅读顺序元素序列生成\n- 🧩 针对不同文档元素的异构锚点提示\n- ⏱️ 高效的并行解析机制\n- 🤗 支持Hugging Face Transformers，便于集成\n\n\n## 📮 通知\n**征集不良案例：** 如果您遇到模型表现不佳的案例，我们非常欢迎您在issue中分享。我们正在持续优化和改进模型。\n\n\n## 💖 致谢\n\n我们要感谢以下开源项目为本工作提供的灵感和参考：\n- 
[OmniDocBench](https://github.com/opendatalab/OmniDocBench)\n- [Donut](https://github.com/clovaai/donut/)\n- [Nougat](https://github.com/facebookresearch/nougat)\n- [GOT](https://github.com/Ucas-HaoranWei/GOT-OCR2.0)\n- [MinerU](https://github.com/opendatalab/MinerU/tree/master)\n- [Swin](https://github.com/microsoft/Swin-Transformer)\n- [Hugging Face Transformers](https://github.com/huggingface/transformers)\n\n\n## 📝 引用\n\n如果您在研究中发现此代码有用，请使用以下BibTeX条目。\n\n```bibtex\n@article{feng2025dolphin,\n  title={Dolphin: Document Image Parsing via Heterogeneous Anchor Prompting},\n  author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and others},\n  journal={arXiv preprint arXiv:2505.14059},\n  year={2025}\n}\n```\n\n## 星标历史\n\n[![Star History Chart](https://api.star-history.com/svg?repos=bytedance/Dolphin&type=Date)](https://www.star-history.com/#bytedance/Dolphin&Date)\n"
  },
  {
    "path": "demo_element.py",
    "content": "\"\"\"\nCopyright (c) 2025 Bytedance Ltd. and/or its affiliates\nSPDX-License-Identifier: MIT\n\"\"\"\n\nimport argparse\nimport glob\nimport os\n\nimport cv2\nimport torch\nfrom PIL import Image\nfrom transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration\nfrom qwen_vl_utils import process_vision_info\n\nfrom utils.utils import *\n\n\nclass DOLPHIN:\n    def __init__(self, model_id_or_path):\n        \"\"\"Initialize the Hugging Face model\n        \n        Args:\n            model_id_or_path: Path to local model or Hugging Face model ID\n        \"\"\"\n        # Load model from local path or Hugging Face hub\n        self.processor = AutoProcessor.from_pretrained(model_id_or_path)\n        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id_or_path)\n        self.model.eval()\n        \n        # Set device and precision\n        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        self.model.to(self.device)\n\n        if self.device == \"cuda\":\n            self.model = self.model.bfloat16()\n        else:\n            self.model = self.model.float()\n        \n        # set tokenizer\n        self.tokenizer = self.processor.tokenizer\n        self.tokenizer.padding_side = \"left\"\n\n    def chat(self, prompt, image):\n        # Check if we're dealing with a batch\n        is_batch = isinstance(image, list)\n        \n        if not is_batch:\n            # Single image, wrap it in a list for consistent processing\n            images = [image]\n            prompts = [prompt]\n        else:\n            # Batch of images\n            images = image\n            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)\n        \n        assert len(images) == len(prompts)\n        \n        # preprocess all images\n        processed_images = [resize_img(img) for img in images]\n        # generate all messages\n        all_messages = []\n        for img, question in 
zip(processed_images, prompts):\n            messages = [\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"image\",\n                            \"image\": img,\n                        },\n                        {\"type\": \"text\", \"text\": question}\n                    ],\n                }\n            ]\n            all_messages.append(messages)\n        # prepare all texts\n        texts = [\n            self.processor.apply_chat_template(\n                msgs, tokenize=False, add_generation_prompt=True\n            )\n            for msgs in all_messages\n        ]\n        # collect all image inputs\n        all_image_inputs = []\n        all_video_inputs = None\n        for msgs in all_messages:\n            image_inputs, video_inputs = process_vision_info(msgs)\n            all_image_inputs.extend(image_inputs)\n        # prepare model inputs\n        inputs = self.processor(\n            text=texts,\n            images=all_image_inputs if all_image_inputs else None,\n            videos=all_video_inputs if all_video_inputs else None,\n            padding=True,\n            return_tensors=\"pt\",\n        )\n        inputs = inputs.to(self.model.device)\n        # inference\n        generated_ids = self.model.generate(\n            **inputs,\n            max_new_tokens=4096,\n            # repetition_penalty=1.05\n        )\n        generated_ids_trimmed = [\n            out_ids[len(in_ids):] \n            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n        ]\n        \n        results = self.processor.batch_decode(\n            generated_ids_trimmed, \n            skip_special_tokens=True, \n            clean_up_tokenization_spaces=False\n        )\n        # Return a single result for single image input\n        if not is_batch:\n            return results[0]\n        return results\n\n\ndef process_element(image_path, 
model, element_type, save_dir=None):\n    \"\"\"Process a single element image (text, table, formula)\n    \n    Args:\n        image_path: Path to the element image\n        model: HFModel model instance\n        element_type: Type of element ('text', 'table', 'formula')\n        save_dir: Directory to save results (default: same as input directory)\n        \n    Returns:\n        Parsed content of the element and recognition results\n    \"\"\"\n    # Load and prepare image\n    pil_image = Image.open(image_path).convert(\"RGB\")\n    # pil_image = crop_margin(pil_image)\n    \n    # Select appropriate prompt based on element type\n    if element_type == \"table\":\n        prompt = \"Parse the table in the image.\"\n        label = \"tab\"\n    elif element_type == \"formula\":\n        prompt = \"Read formula in the image.\"\n        label = \"equ\"\n    elif element_type == \"code\":\n        prompt = \"Read code in the image.\"\n        label = \"code\"\n    else:  # Default to text\n        prompt = \"Read text in the image.\"\n        label = \"para\"\n    \n    # Process the element\n    result = model.chat(prompt, pil_image)\n    \n    # Create recognition result in the same format as the document parser\n    recognition_results = [\n        {\n            \"label\": label,\n            \"text\": result.strip(),\n        }\n    ]\n    \n    # Save results if save_dir is provided\n    save_outputs(recognition_results, pil_image, os.path.basename(image_path).split(\".\")[0], save_dir)\n    print(f\"Results saved to {save_dir}\")\n    \n    return result, recognition_results\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Element-level processing using DOLPHIN model\")\n    parser.add_argument(\"--model_path\", default=\"./hf_model\", help=\"Path to Hugging Face model\")\n    parser.add_argument(\"--input_path\", type=str, required=True, help=\"Path to input image or directory of images\")\n    parser.add_argument(\n        
\"--element_type\",\n        type=str,\n        choices=[\"text\", \"table\", \"formula\", \"code\"],\n        default=\"text\",\n        help=\"Type of element to process (text, table, formula)\",\n    )\n    parser.add_argument(\n        \"--save_dir\",\n        type=str,\n        default=None,\n        help=\"Directory to save parsing results (default: same as input directory)\",\n    )\n    parser.add_argument(\"--print_results\", action=\"store_true\", help=\"Print recognition results to console\")\n    args = parser.parse_args()\n    \n    # Load Model\n    model = DOLPHIN(args.model_path)\n    \n    # Set save directory\n    save_dir = args.save_dir or (\n        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)\n    )\n    setup_output_dirs(save_dir)\n    \n    # Collect Images\n    if os.path.isdir(args.input_path):\n        image_files = []\n        for ext in [\".jpg\", \".jpeg\", \".png\", \".JPG\", \".JPEG\", \".PNG\"]:\n            image_files.extend(glob.glob(os.path.join(args.input_path, f\"*{ext}\")))\n        image_files = sorted(image_files)\n    else:\n        if not os.path.exists(args.input_path):\n            raise FileNotFoundError(f\"Input path {args.input_path} does not exist\")\n        image_files = [args.input_path]\n    \n    total_samples = len(image_files)\n    print(f\"\\nTotal samples to process: {total_samples}\")\n    \n    # Process images one by one\n    for image_path in image_files:\n        print(f\"\\nProcessing {image_path}\")\n        try:\n            result, recognition_result = process_element(\n                image_path=image_path,\n                model=model,\n                element_type=args.element_type,\n                save_dir=save_dir,\n            )\n\n            if args.print_results:\n                print(\"\\nRecognition result:\")\n                print(result)\n                print(\"-\" * 40)\n        except Exception as e:\n            print(f\"Error 
processing {image_path}: {str(e)}\")\n            continue\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "demo_layout.py",
    "content": "\"\"\" \nCopyright (c) 2025 Bytedance Ltd. and/or its affiliates\nSPDX-License-Identifier: MIT\n\"\"\"\n\nimport argparse\nimport glob\nimport os\n\nimport torch\nfrom PIL import Image\nfrom transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration\nfrom qwen_vl_utils import process_vision_info\n\nfrom utils.utils import *\n\n\nclass DOLPHIN:\n    def __init__(self, model_id_or_path):\n        \"\"\"Initialize the Hugging Face model\n        \n        Args:\n            model_id_or_path: Path to local model or Hugging Face model ID\n        \"\"\"\n        # Load model from local path or Hugging Face hub\n        self.processor = AutoProcessor.from_pretrained(model_id_or_path)\n        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id_or_path)\n        self.model.eval()\n        \n        # Set device and precision\n        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        self.model.to(self.device)\n\n        if self.device == \"cuda\":\n            self.model = self.model.bfloat16()\n        else:\n            self.model = self.model.float()\n        \n        # set tokenizer\n        self.tokenizer = self.processor.tokenizer\n        self.tokenizer.padding_side = \"left\"\n\n    def chat(self, prompt, image):\n        # Check if we're dealing with a batch\n        is_batch = isinstance(image, list)\n        \n        if not is_batch:\n            # Single image, wrap it in a list for consistent processing\n            images = [image]\n            prompts = [prompt]\n        else:\n            # Batch of images\n            images = image\n            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)\n        \n        assert len(images) == len(prompts)\n        \n        # preprocess all images\n        processed_images = [resize_img(img) for img in images]\n        # generate all messages\n        all_messages = []\n        for img, question in 
zip(processed_images, prompts):\n            messages = [\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"image\",\n                            \"image\": img,\n                        },\n                        {\"type\": \"text\", \"text\": question}\n                    ],\n                }\n            ]\n            all_messages.append(messages)\n\n        # prepare all texts\n        texts = [\n            self.processor.apply_chat_template(\n                msgs, tokenize=False, add_generation_prompt=True\n            )\n            for msgs in all_messages\n        ]\n\n        # collect all image inputs\n        all_image_inputs = []\n        all_video_inputs = None\n        for msgs in all_messages:\n            image_inputs, video_inputs = process_vision_info(msgs)\n            all_image_inputs.extend(image_inputs)\n\n        # prepare model inputs\n        inputs = self.processor(\n            text=texts,\n            images=all_image_inputs if all_image_inputs else None,\n            videos=all_video_inputs if all_video_inputs else None,\n            padding=True,\n            return_tensors=\"pt\",\n        )\n        inputs = inputs.to(self.model.device)\n\n        # inference\n        generated_ids = self.model.generate(\n            **inputs,\n            max_new_tokens=4096,\n            do_sample=False,\n            temperature=None,\n            # repetition_penalty=1.05\n        )\n        generated_ids_trimmed = [\n            out_ids[len(in_ids):] \n            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n        ]\n        \n        results = self.processor.batch_decode(\n            generated_ids_trimmed, \n            skip_special_tokens=True, \n            clean_up_tokenization_spaces=False\n        )\n\n        # Return a single result for single image input\n        if not is_batch:\n            return 
results[0]\n        return results\n\n\n\ndef process_layout(input_path, model, save_dir):\n    \"\"\"Process layout detection for image or PDF\n    \n    Args:\n        input_path: Path to input image or PDF\n        model: DOLPHIN model instance\n        save_dir: Directory to save results\n    \"\"\"\n    file_ext = os.path.splitext(input_path)[1].lower()\n    \n    if file_ext == '.pdf':\n        # Convert PDF to images\n        images = convert_pdf_to_images(input_path)\n        if not images:\n            raise Exception(f\"Failed to convert PDF {input_path} to images\")\n        \n        # Process each page\n        for page_idx, pil_image in enumerate(images):\n            print(f\"\\nProcessing page {page_idx + 1}/{len(images)}\")\n            \n            # Generate output name for this page\n            base_name = os.path.splitext(os.path.basename(input_path))[0]\n            page_name = f\"{base_name}_page_{page_idx + 1:03d}\"\n            \n            # Process layout for this page\n            process_single_layout(pil_image, model, save_dir, page_name)\n    \n    else:\n        # Process regular image file\n        pil_image = Image.open(input_path).convert(\"RGB\")\n        base_name = os.path.splitext(os.path.basename(input_path))[0]\n        process_single_layout(pil_image, model, save_dir, base_name)\n\n\ndef process_single_layout(pil_image, model, save_dir, image_name):\n    \"\"\"Process layout for a single image\n    \n    Args:\n        pil_image: PIL Image object\n        model: DOLPHIN model instance\n        save_dir: Directory to save results\n        image_name: Name for the output files\n    \"\"\"\n    # Parse layout\n    print(\"Parsing layout and reading order...\")\n    layout_results = model.chat(\"Parse the reading order of this document.\", pil_image)\n\n    # Parse the layout string\n    layout_results_list = parse_layout_string(layout_results)\n    if not layout_results_list or not (layout_results.startswith(\"[\") and 
layout_results.endswith(\"]\")):\n        layout_results_list = [([0, 0, *pil_image.size], 'distorted_page', [])]\n    \n    # map bbox to original image coordinates\n    recognition_results = []\n    reading_order = 0\n    for bbox, label, tags in layout_results_list:\n        x1, y1, x2, y2 = process_coordinates(bbox, pil_image)\n        recognition_results.append({\n                        \"label\": label,\n                        \"bbox\": [x1, y1, x2, y2],\n                        \"text\": \"\", # empty for now\n                        \"reading_order\": reading_order,\n                        \"tags\": tags,\n                    })\n        reading_order += 1\n    json_path = save_outputs(recognition_results, pil_image, image_name, save_dir)\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Layout detection and visualization using DOLPHIN model\")\n    parser.add_argument(\"--model_path\", default=\"./hf_model\", help=\"Path to Hugging Face model\")\n    parser.add_argument(\n        \"--input_path\", \n        type=str, \n        required=True, \n        help=\"Path to input image/PDF or directory of files\"\n    )\n    parser.add_argument(\n        \"--save_dir\",\n        type=str,\n        default=None,\n        help=\"Directory to save results (default: same as input directory)\",\n    )\n    args = parser.parse_args()\n    \n    # Load Model\n    print(\"Loading model...\")\n    model = DOLPHIN(args.model_path)\n    \n    # Set save directory\n    save_dir = args.save_dir or (\n        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)\n    )\n    \n    # Create save directory if it doesn't exist\n    os.makedirs(save_dir, exist_ok=True)\n    \n    # Collect files\n    if os.path.isdir(args.input_path):\n        # Support both image and PDF files\n        file_extensions = [\".jpg\", \".jpeg\", \".png\", \".JPG\", \".JPEG\", \".PNG\", \".pdf\", \".PDF\"]\n        \n        input_files = []\n      
  for ext in file_extensions:\n            input_files.extend(glob.glob(os.path.join(args.input_path, f\"*{ext}\")))\n        input_files = sorted(input_files)\n    else:\n        if not os.path.exists(args.input_path):\n            raise FileNotFoundError(f\"Input path {args.input_path} does not exist\")\n        \n        # Check if it's a supported file type\n        file_ext = os.path.splitext(args.input_path)[1].lower()\n        supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']\n        \n        if file_ext not in supported_exts:\n            raise ValueError(f\"Unsupported file type: {file_ext}. Supported types: {supported_exts}\")\n        \n        input_files = [args.input_path]\n    \n    total_files = len(input_files)\n    print(f\"\\nTotal files to process: {total_files}\")\n    \n    # Process files\n    for file_path in input_files:\n        print(f\"\\n{'='*60}\")\n        print(f\"Processing: {file_path}\")\n        print('='*60)\n        \n        try:\n            process_layout(\n                input_path=file_path,\n                model=model,\n                save_dir=save_dir,\n            )\n            print(f\"\\n✓ Processing completed for {file_path}\")\n            \n        except Exception as e:\n            print(f\"\\n✗ Error processing {file_path}: {str(e)}\")\n            continue\n    \n    print(f\"\\n{'='*60}\")\n    print(f\"All processing completed. Results saved to {save_dir}\")\n    print('='*60)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "demo_page.py",
    "content": "\"\"\" \nCopyright (c) 2025 Bytedance Ltd. and/or its affiliates\nSPDX-License-Identifier: MIT\n\"\"\"\n\nimport argparse\nimport glob\nimport os\n\nimport torch\nfrom PIL import Image\nfrom transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration\nfrom qwen_vl_utils import process_vision_info\n\nfrom utils.utils import *\n\n\nclass DOLPHIN:\n    def __init__(self, model_id_or_path):\n        \"\"\"Initialize the Hugging Face model\n        \n        Args:\n            model_id_or_path: Path to local model or Hugging Face model ID\n        \"\"\"\n        # Load model from local path or Hugging Face hub\n        self.processor = AutoProcessor.from_pretrained(model_id_or_path)\n        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id_or_path)\n        self.model.eval()\n        \n        # Set device and precision\n        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n        self.model.to(self.device)\n\n        if self.device == \"cuda\":\n            self.model = self.model.bfloat16()\n        else:\n            self.model = self.model.float()\n        \n        # set tokenizer\n        self.tokenizer = self.processor.tokenizer\n        self.tokenizer.padding_side = \"left\"\n\n    def chat(self, prompt, image):\n        # Check if we're dealing with a batch\n        is_batch = isinstance(image, list)\n        \n        if not is_batch:\n            # Single image, wrap it in a list for consistent processing\n            images = [image]\n            prompts = [prompt]\n        else:\n            # Batch of images\n            images = image\n            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)\n        \n        assert len(images) == len(prompts)\n        \n        # preprocess all images\n        processed_images = [resize_img(img) for img in images]\n        # generate all messages\n        all_messages = []\n        for img, question in 
zip(processed_images, prompts):\n            messages = [\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"image\",\n                            \"image\": img,\n                        },\n                        {\"type\": \"text\", \"text\": question}\n                    ],\n                }\n            ]\n            all_messages.append(messages)\n\n        # prepare all texts\n        texts = [\n            self.processor.apply_chat_template(\n                msgs, tokenize=False, add_generation_prompt=True\n            )\n            for msgs in all_messages\n        ]\n\n        # collect all image inputs\n        all_image_inputs = []\n        all_video_inputs = None\n        for msgs in all_messages:\n            image_inputs, video_inputs = process_vision_info(msgs)\n            all_image_inputs.extend(image_inputs)\n\n        # prepare model inputs\n        inputs = self.processor(\n            text=texts,\n            images=all_image_inputs if all_image_inputs else None,\n            videos=all_video_inputs if all_video_inputs else None,\n            padding=True,\n            return_tensors=\"pt\",\n        )\n        inputs = inputs.to(self.model.device)\n\n        # inference\n        generated_ids = self.model.generate(\n            **inputs,\n            max_new_tokens=4096,\n            do_sample=False,\n            temperature=None,\n            # repetition_penalty=1.05\n        )\n        generated_ids_trimmed = [\n            out_ids[len(in_ids):] \n            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n        ]\n        \n        results = self.processor.batch_decode(\n            generated_ids_trimmed, \n            skip_special_tokens=True, \n            clean_up_tokenization_spaces=False\n        )\n\n        # Return a single result for single image input\n        if not is_batch:\n            return 
results[0]\n        return results\n\n\ndef process_document(document_path, model, save_dir, max_batch_size=None):\n    \"\"\"Parse documents with two stages - Handles both images and PDFs\"\"\"\n    file_ext = os.path.splitext(document_path)[1].lower()\n    \n    if file_ext == '.pdf':\n        # Convert PDF to images\n        images = convert_pdf_to_images(document_path)\n        if not images:\n            raise Exception(f\"Failed to convert PDF {document_path} to images\")\n        \n        all_results = []\n        \n        # Process each page\n        for page_idx, pil_image in enumerate(images):\n            print(f\"Processing page {page_idx + 1}/{len(images)}\")\n            \n            # Generate output name for this page\n            base_name = os.path.splitext(os.path.basename(document_path))[0]\n            page_name = f\"{base_name}_page_{page_idx + 1:03d}\"\n            \n            # Process this page (don't save individual page results)\n            json_path, recognition_results = process_single_image(\n                pil_image, model, save_dir, page_name, max_batch_size, save_individual=False\n            )\n            \n            # Add page information to results\n            page_results = {\n                \"page_number\": page_idx + 1,\n                \"elements\": recognition_results\n            }\n            all_results.append(page_results)\n        \n        # Save combined results for multi-page PDF\n        combined_json_path = save_combined_pdf_results(all_results, document_path, save_dir)\n        \n        return combined_json_path, all_results\n    \n    else:\n        # Process regular image file\n        pil_image = Image.open(document_path).convert(\"RGB\")\n        base_name = os.path.splitext(os.path.basename(document_path))[0]\n        return process_single_image(pil_image, model, save_dir, base_name, max_batch_size)\n\n\ndef process_single_image(image, model, save_dir, image_name, max_batch_size=None, 
save_individual=True):\n    \"\"\"Process a single image (either from file or converted from PDF page)\n    \n    Args:\n        image: PIL Image object\n        model: DOLPHIN model instance\n        save_dir: Directory to save results\n        image_name: Name for the output file\n        max_batch_size: Maximum batch size for processing\n        save_individual: Whether to save individual results (False for PDF pages)\n        \n    Returns:\n        Tuple of (json_path, recognition_results)\n    \"\"\"\n    # Stage 1: Page-level layout and reading order parsing\n    layout_output = model.chat(\"Parse the reading order of this document.\", image)\n    # print(layout_output)\n\n    # Stage 2: Element-level content parsing\n    recognition_results = process_elements(layout_output, image, model, max_batch_size, save_dir, image_name)\n\n    # Save outputs only if requested (skip for PDF pages)\n    json_path = None\n    if save_individual:\n        # Create a dummy image path for save_outputs function\n        json_path = save_outputs(recognition_results, image, image_name, save_dir)\n\n    return json_path, recognition_results\n\n\ndef process_elements(layout_results, image, model, max_batch_size, save_dir=None, image_name=None):\n    \"\"\"Parse all document elements with parallel decoding\"\"\"\n    layout_results_list = parse_layout_string(layout_results)\n    if not layout_results_list or not (layout_results.startswith(\"[\") and layout_results.endswith(\"]\")):\n        layout_results_list = [([0, 0, *image.size], 'distorted_page', [])]\n    # Check for bbox overlap - if too many overlaps, treat as distorted page\n    elif len(layout_results_list) > 1 and check_bbox_overlap(layout_results_list, image):\n        print(\"Falling back to distorted_page mode due to high bbox overlap\")\n        layout_results_list = [([0, 0, *image.size], 'distorted_page', [])]\n        \n    tab_elements = []      \n    equ_elements = []     \n    code_elements = []    \n    
text_elements = []     \n    figure_results = []    \n    reading_order = 0\n\n    # Collect elements and group\n    for bbox, label, tags in layout_results_list:\n        try:\n            if label == \"distorted_page\":\n                x1, y1, x2, y2 = 0, 0, *image.size\n                pil_crop = image\n            else:\n                # get coordinates in the original image\n                x1, y1, x2, y2 = process_coordinates(bbox, image)\n                # crop the image\n                pil_crop = image.crop((x1, y1, x2, y2))\n\n            if pil_crop.size[0] > 3 and pil_crop.size[1] > 3:\n                if label == \"fig\":\n                    figure_filename = save_figure_to_local(pil_crop, save_dir, image_name, reading_order)\n                    figure_results.append({\n                        \"label\": label,\n                        \"text\": f\"![Figure](figures/{figure_filename})\",\n                        \"figure_path\": f\"figures/{figure_filename}\",\n                        \"bbox\": [x1, y1, x2, y2],\n                        \"reading_order\": reading_order,\n                        \"tags\": tags,\n                    })\n                else:\n                    # Prepare element information\n                    element_info = {\n                        \"crop\": pil_crop,\n                        \"label\": label,\n                        \"bbox\": [x1, y1, x2, y2],\n                        \"reading_order\": reading_order,\n                        \"tags\": tags,\n                    }\n                    \n                    if label == \"tab\":\n                        tab_elements.append(element_info)\n                    elif label == \"equ\":\n                        equ_elements.append(element_info)\n                    elif label == \"code\":\n                        code_elements.append(element_info)\n                    else:\n                        text_elements.append(element_info)\n\n            reading_order += 
1\n\n        except Exception as e:\n            print(f\"Error processing bbox with label {label}: {str(e)}\")\n            continue\n\n    recognition_results = figure_results.copy()\n    \n    if tab_elements:\n        results = process_element_batch(tab_elements, model, \"Parse the table in the image.\", max_batch_size)\n        recognition_results.extend(results)\n    \n    if equ_elements:\n        results = process_element_batch(equ_elements, model, \"Read formula in the image.\", max_batch_size)\n        recognition_results.extend(results)\n    \n    if code_elements:\n        results = process_element_batch(code_elements, model, \"Read code in the image.\", max_batch_size)\n        recognition_results.extend(results)\n    \n    if text_elements:\n        results = process_element_batch(text_elements, model, \"Read text in the image.\", max_batch_size)\n        recognition_results.extend(results)\n\n    recognition_results.sort(key=lambda x: x.get(\"reading_order\", 0))\n\n    return recognition_results\n\n\ndef process_element_batch(elements, model, prompt, max_batch_size=None):\n    \"\"\"Process elements of the same type in batches\"\"\"\n    results = []\n    \n    # Determine batch size\n    batch_size = len(elements)\n    if max_batch_size is not None and max_batch_size > 0:\n        batch_size = min(batch_size, max_batch_size)\n    \n    # Process in batches\n    for i in range(0, len(elements), batch_size):\n        batch_elements = elements[i:i+batch_size]\n        crops_list = [elem[\"crop\"] for elem in batch_elements]\n        \n        # Use the same prompt for all elements in the batch\n        prompts_list = [prompt] * len(crops_list)\n        \n        # Batch inference\n        batch_results = model.chat(prompts_list, crops_list)\n        \n        # Add results\n        for j, result in enumerate(batch_results):\n            elem = batch_elements[j]\n            results.append({\n                \"label\": elem[\"label\"],\n                
\"bbox\": elem[\"bbox\"],\n                \"text\": result.strip(),\n                \"reading_order\": elem[\"reading_order\"],\n                \"tags\": elem[\"tags\"],\n            })\n    \n    return results\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Document parsing based on DOLPHIN\")\n    parser.add_argument(\"--model_path\", default=\"./hf_model\", help=\"Path to Hugging Face model\")\n    parser.add_argument(\"--input_path\", type=str, default=\"./demo\", help=\"Path to input image/PDF or directory of files\")\n    parser.add_argument(\n        \"--save_dir\",\n        type=str,\n        default=None,\n        help=\"Directory to save parsing results (default: same as input directory)\",\n    )\n    parser.add_argument(\n        \"--max_batch_size\",\n        type=int,\n        default=4,\n        help=\"Maximum number of document elements to parse in a single batch (default: 4)\",\n    )\n    args = parser.parse_args()\n\n    # Load Model\n    model = DOLPHIN(args.model_path)\n\n    # Collect Document Files (images and PDFs)\n    if os.path.isdir(args.input_path):\n        # Support both image and PDF files\n        file_extensions = [\".jpg\", \".jpeg\", \".png\", \".JPG\", \".JPEG\", \".PNG\", \".pdf\", \".PDF\"]\n        \n        document_files = []\n        for ext in file_extensions:\n            document_files.extend(glob.glob(os.path.join(args.input_path, f\"*{ext}\")))\n        document_files = sorted(document_files)\n    else:\n        if not os.path.exists(args.input_path):\n            raise FileNotFoundError(f\"Input path {args.input_path} does not exist\")\n        \n        # Check if it's a supported file type\n        file_ext = os.path.splitext(args.input_path)[1].lower()\n        supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']\n        \n        if file_ext not in supported_exts:\n            raise ValueError(f\"Unsupported file type: {file_ext}. 
Supported types: {supported_exts}\")\n        \n        document_files = [args.input_path]\n\n    save_dir = args.save_dir or (\n        args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)\n    )\n    setup_output_dirs(save_dir)\n\n    total_samples = len(document_files)\n    print(f\"\\nTotal files to process: {total_samples}\")\n\n    # Process All Document Files\n    for file_path in document_files:\n        print(f\"\\nProcessing {file_path}\")\n        try:\n            json_path, recognition_results = process_document(\n                document_path=file_path,\n                model=model,\n                save_dir=save_dir,\n                max_batch_size=args.max_batch_size,\n            )\n\n            print(f\"Processing completed. Results saved to {save_dir}\")\n\n        except Exception as e:\n            print(f\"Error processing {file_path}: {str(e)}\")\n            continue\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[tool.black]\nline-length = 120\ninclude = '\\.pyi?$'\nexclude = '''\n/(\n    \\.git\n  | \\.hg\n  | \\.mypy_cache\n  | \\.tox\n  | \\.venv\n  | _build\n  | buck-out\n  | build\n  | dist\n)/\n'''\n"
  },
  {
    "path": "requirements.txt",
    "content": "datasets==3.6.0\ntorch==2.6.0\ntorchvision==0.21.0\ntransformers==4.51.0\ndeepspeed==0.16.4\ntriton==3.2.0\naccelerate==1.4.0\ntorchcodec==0.2\ndecord==0.6.0\nLevenshtein==0.27.1\nqwen_vl_utils\nmatplotlib\njieba\nopencv-python\nbs4\nalbumentations==1.4.0\npymupdf==1.26\n"
  },
  {
    "path": "utils/markdown_utils.py",
    "content": "\"\"\" \nCopyright (c) 2025 Bytedance Ltd. and/or its affiliates\nSPDX-License-Identifier: MIT\n\"\"\"\n\nimport re\nimport base64\nfrom typing import List, Dict, Any, Optional\n\n\ndef extract_table_from_html(html_string):\n    \"\"\"Extract and clean table tags from HTML string\"\"\"\n    try:\n        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)\n        tables = table_pattern.findall(html_string)\n        tables = [re.sub(r'<table[^>]*>', '<table>', table) for table in tables]\n        return '\\n'.join(tables)\n    except Exception as e:\n        print(f\"extract_table_from_html error: {str(e)}\")\n        return f\"<table><tr><td>Error extracting table: {str(e)}</td></tr></table>\"\n\n\nclass MarkdownConverter:\n    \"\"\"Convert structured recognition results to Markdown format\"\"\"\n    \n    def __init__(self):\n        # Define heading levels for different section types\n        self.heading_levels = {\n            'sec_0': '#',\n            'sec_1': '##',\n            'sec_2': '###',\n            'sec_3': '###',\n            'sec_4': '###',\n            'sec_5': '###',\n        }\n        \n        # Define which labels need special handling\n        self.special_labels = {\n            'sec_0', 'sec_1', 'sec_2', 'sec_3', 'sec_4', 'sec_5',\n            'list', 'equ', 'tab', 'fig'\n        }\n\n        # Define replacements for special formulas\n        self.replace_dict = {\n            '\\\\bm': '\\mathbf ',\n            '\\eqno': '\\quad ',\n            '\\quad': '\\quad ',\n            '\\leq': '\\leq ',\n            '\\pm': '\\pm ',\n            '\\\\varmathbb': '\\mathbb ',\n            '\\in fty': '\\infty',\n            '\\mu': '\\mu ',\n            '\\cdot': '\\cdot ',\n            '\\langle': '\\langle ',\n            '\\pm': '\\pm '\n        }\n    \n    def try_remove_newline(self, text: str) -> str:\n        try:\n            # Preprocess text to handle line breaks\n            text = text.strip()\n         
   text = text.replace('-\\n', '')\n            \n            # Handle Chinese text line breaks\n            def is_chinese(char):\n                return '\\u4e00' <= char <= '\\u9fff'\n\n            lines = text.split('\\n')\n            processed_lines = []\n            \n            # Process all lines except the last one\n            for i in range(len(lines)-1):\n                current_line = lines[i].strip()\n                next_line = lines[i+1].strip()\n                \n                # Always add the current line, but determine if we need a newline\n                if current_line:  # If current line is not empty\n                    if next_line:  # If next line is not empty\n                        # For Chinese text handling\n                        if is_chinese(current_line[-1]) and is_chinese(next_line[0]):\n                            processed_lines.append(current_line)\n                        else:\n                            processed_lines.append(current_line + ' ')\n                    else:\n                        # Next line is empty, add current line with newline\n                        processed_lines.append(current_line + '\\n')\n                else:\n                    # Current line is empty, add an empty line\n                    processed_lines.append('\\n')\n            \n            # Add the last line\n            if lines and lines[-1].strip():\n                processed_lines.append(lines[-1].strip())\n            \n            text = ''.join(processed_lines)\n            return text\n        \n        except Exception as e:\n            print(f\"try_remove_newline error: {str(e)}\")\n            return text  # Return original text on error\n\n    def _handle_text(self, text: str) -> str:\n        \"\"\"\n        Process regular text content, preserving paragraph structure\n        \"\"\"\n        try:\n            if not text:\n                return \"\"\n            \n            # Process formulas in text before 
handling other text processing\n            text = self._process_formulas_in_text(text)\n            text = self.try_remove_newline(text)\n            return text\n        except Exception as e:\n            print(f\"_handle_text error: {str(e)}\")\n            return text  # Return original text on error\n    \n    def _process_formulas_in_text(self, text: str) -> str:\n        \"\"\"\n        Process mathematical formulas in text by iteratively finding and replacing formulas.\n        - Identify inline and block formulas\n        - Replace newlines within formulas with \\\\\n        \"\"\"\n        try:\n            text = text.replace(r'\\upmu', r'\\mu')\n            for key, value in self.replace_dict.items():\n                text = text.replace(key, value)\n            return text\n        \n        except Exception as e:\n            print(f\"_process_formulas_in_text error: {str(e)}\")\n            return text  # Return original text on error\n    \n    def _remove_newline_in_heading(self, text: str) -> str:\n        \"\"\"\n        Remove newline in heading\n        \"\"\"\n        try:\n            # Handle Chinese text line breaks\n            def is_chinese(char):\n                return '\\u4e00' <= char <= '\\u9fff'\n            \n            # Check if the text contains Chinese characters\n            if any(is_chinese(char) for char in text):\n                return text.replace('\\n', '')\n            else:\n                return text.replace('\\n', ' ')\n\n        except Exception as e:\n            print(f\"_remove_newline_in_heading error: {str(e)}\")\n            return text\n    \n    def _handle_heading(self, text: str, label: str) -> str:\n        \"\"\"\n        Convert section headings to appropriate markdown format\n        \"\"\"\n        try:\n            level = self.heading_levels.get(label, '#')\n            text = text.strip()\n            text = self._remove_newline_in_heading(text)\n            text = self._handle_text(text)\n    
        return f\"{level} {text}\\n\\n\"\n        \n        except Exception as e:\n            print(f\"_handle_heading error: {str(e)}\")\n            return f\"# Error processing heading: {text}\\n\\n\"\n    \n    def _handle_list_item(self, text: str) -> str:\n        \"\"\"\n        Convert list items to markdown list format\n        \"\"\"\n        try:\n            return f\"- {text.strip()}\\n\"\n        except Exception as e:\n            print(f\"_handle_list_item error: {str(e)}\")\n            return f\"- Error processing list item: {text}\\n\"\n    \n    def _handle_figure(self, text: str, section_count: int) -> str:\n        \"\"\"\n        Handle figure content\n        \"\"\"\n        try:\n            # Check if it's a file path starting with \"figures/\"\n            if text.startswith(\"figures/\"):\n                # Convert to relative path from markdown directory to figures directory\n                relative_path = f\"../{text}\"\n                return f\"![Figure {section_count}]({relative_path})\\n\\n\"\n\n            # Check if it's already a markdown format image link\n            if text.startswith(\"![\"):\n                # Already in markdown format, return directly\n                return f\"{text}\\n\\n\"\n\n            # If it's still base64 format, maintain original logic\n            if text.startswith(\"data:image/\"):\n                return f\"![Figure {section_count}]({text})\\n\\n\"\n            elif \";\" in text and \",\" in text:\n                return f\"![Figure {section_count}]({text})\\n\\n\"\n            else:\n                # Assume it's raw base64, convert to data URI\n                img_format = \"png\"\n                data_uri = f\"data:image/{img_format};base64,{text}\"\n                return f\"![Figure {section_count}]({data_uri})\\n\\n\"\n                \n        except Exception as e:\n            print(f\"_handle_figure error: {str(e)}\")\n            return f\"*[Error processing figure: 
{str(e)}]*\\n\\n\"\n\n    def _handle_table(self, text: str) -> str:\n        \"\"\"\n        Convert table content to markdown format\n        \"\"\"\n        try:\n            markdown_content = []\n            markdown_table = extract_table_from_html(text)\n            markdown_content.append(markdown_table + \"\\n\")\n            return '\\n'.join(markdown_content) + '\\n\\n'\n        \n        except Exception as e:\n            print(f\"_handle_table error: {str(e)}\")\n            return f\"*[Error processing table: {str(e)}]*\\n\\n\"\n\n    def _handle_formula(self, text: str) -> str:\n        \"\"\"\n        Handle formula-specific content\n        \"\"\"\n        try:\n            text = text.strip('$').rstrip(\"\\ \").replace(r'\\upmu', r'\\mu')\n            for key, value in self.replace_dict.items():\n                text = text.replace(key, value)\n            processed_text = '$$' + text + '$$'\n            return f\"{processed_text}\\n\\n\"\n        \n        except Exception as e:\n            print(f\"_handle_formula error: {str(e)}\")\n            return f\"*[Error processing formula: {str(e)}]*\\n\\n\"\n\n    def convert(self, recognition_results: List[Dict[str, Any]]) -> str:\n        \"\"\"\n        Convert recognition results to markdown format\n        \"\"\"\n        try:\n            markdown_content = []\n            \n            for section_count, result in enumerate(recognition_results):\n                try:\n                    label = result.get('label', '')\n                    text = result.get('text', '').strip()\n                    \n                    # Skip empty text\n                    if not text:\n                        continue\n                        \n                    # Handle different content types\n                    if label in {'sec_0', 'sec_1', 'sec_2', 'sec_3', 'sec_4', 'sec_5'}:\n                        markdown_content.append(self._handle_heading(text, label))\n                    elif label == 
'fig':\n                        markdown_content.append(self._handle_figure(text, section_count))\n                    elif label == 'tab':\n                        markdown_content.append(self._handle_table(text))\n                    elif label == 'equ':\n                        markdown_content.append(self._handle_formula(text))\n                    elif label == 'list':\n                        markdown_content.append(self._handle_list_item(text))\n                    elif label == 'code':\n                        markdown_content.append(f\"```bash\\n{text}\\n```\\n\\n\")\n                    else:\n                        # Handle regular text (paragraphs, etc.)\n                        processed_text = self._handle_text(text)\n                        markdown_content.append(f\"{processed_text}\\n\\n\")\n                        # TODO: distoraged page\n\n                except Exception as e:\n                    print(f\"Error processing item {section_count}: {str(e)}\")\n                    # Add a placeholder for the failed item\n                    markdown_content.append(f\"*[Error processing content]*\\n\\n\")\n            \n            # Join all content and apply post-processing\n            result = ''.join(markdown_content)\n            return result\n        \n        except Exception as e:\n            print(f\"convert error: {str(e)}\")\n            return f\"Error generating markdown content: {str(e)}\"\n"
  },
  {
    "path": "utils/utils.py",
    "content": "\"\"\"\nCopyright (c) 2025 Bytedance Ltd. and/or its affiliates\nSPDX-License-Identifier: MIT\n\"\"\"\n\nimport io\nimport json\nimport os\nimport re\nfrom dataclasses import dataclass\nfrom typing import List, Tuple\n\nimport cv2\nimport numpy as np\nimport pymupdf\nfrom PIL import Image\nfrom qwen_vl_utils import smart_resize\nfrom utils.markdown_utils import MarkdownConverter\n\n\ndef save_figure_to_local(pil_crop, save_dir, image_name, reading_order):\n    \"\"\"Save cropped figure to local file system\n\n    Args:\n        pil_crop: PIL Image object of the cropped figure\n        save_dir: Base directory to save results\n        image_name: Name of the source image/document\n        reading_order: Reading order of the figure in the document\n\n    Returns:\n        str: Filename of the saved figure\n    \"\"\"\n    try:\n        # Create figures directory if it doesn't exist\n        figures_dir = os.path.join(save_dir, \"markdown\", \"figures\")\n        # os.makedirs(figures_dir, exist_ok=True)\n\n        # Generate figure filename\n        figure_filename = f\"{image_name}_figure_{reading_order:03d}.png\"\n        figure_path = os.path.join(figures_dir, figure_filename)\n\n        # Save the figure\n        pil_crop.save(figure_path, format=\"PNG\", quality=95)\n\n        # print(f\"Saved figure: {figure_filename}\")\n        return figure_filename\n\n    except Exception as e:\n        print(f\"Error saving figure: {str(e)}\")\n        # Return a fallback filename\n        return f\"{image_name}_figure_{reading_order:03d}_error.png\"\n\n\ndef convert_pdf_to_images(pdf_path, target_size=896):\n    \"\"\"Convert PDF pages to images\n\n    Args:\n        pdf_path: Path to PDF file\n        target_size: Target size for the longest dimension\n\n    Returns:\n        List of PIL Images\n    \"\"\"\n    images = []\n    try:\n        doc = pymupdf.open(pdf_path)\n\n        for page_num in range(len(doc)):\n            page = doc[page_num]\n\n      
      # Calculate scale to make longest dimension equal to target_size\n            rect = page.rect\n            scale = target_size / max(rect.width, rect.height)\n\n            # Render page as image\n            mat = pymupdf.Matrix(scale, scale)\n            pix = page.get_pixmap(matrix=mat)\n\n            # Convert to PIL Image\n            img_data = pix.tobytes(\"png\")\n            pil_image = Image.open(io.BytesIO(img_data))\n            images.append(pil_image)\n\n        doc.close()\n        print(f\"Successfully converted {len(images)} pages from PDF\")\n        return images\n\n    except Exception as e:\n        print(f\"Error converting PDF to images: {str(e)}\")\n        return []\n\n\ndef save_combined_pdf_results(all_page_results, pdf_path, save_dir):\n    \"\"\"Save combined results for multi-page PDF with both JSON and Markdown\n\n    Args:\n        all_page_results: List of results for all pages\n        pdf_path: Path to original PDF file\n        save_dir: Directory to save results\n\n    Returns:\n        Path to saved combined JSON file\n    \"\"\"\n    # Create output filename based on PDF name\n    base_name = os.path.splitext(os.path.basename(pdf_path))[0]\n\n    # Prepare combined results\n    combined_results = {\"source_file\": pdf_path, \"total_pages\": len(all_page_results), \"pages\": all_page_results}\n\n    # Save combined JSON results\n    json_filename = f\"{base_name}.json\"\n    json_path = os.path.join(save_dir, \"recognition_json\", json_filename)\n    os.makedirs(os.path.dirname(json_path), exist_ok=True)\n\n    with open(json_path, \"w\", encoding=\"utf-8\") as f:\n        json.dump(combined_results, f, indent=2, ensure_ascii=False)\n\n    # Generate and save combined markdown\n    try:\n        markdown_converter = MarkdownConverter()\n\n        # Combine all page results into a single list for markdown conversion\n        all_elements = []\n        for page_data in all_page_results:\n            page_elements = 
page_data.get(\"elements\", [])\n            if page_elements:\n                # Add page separator if not the first page\n                if all_elements:\n                    all_elements.append(\n                        {\"label\": \"page_separator\", \"text\": f\"\\n\\n---\\n\\n\", \"reading_order\": len(all_elements)}\n                    )\n                all_elements.extend(page_elements)\n\n        # Generate markdown content\n        markdown_content = markdown_converter.convert(all_elements)\n\n        # Save markdown file\n        markdown_filename = f\"{base_name}.md\"\n        markdown_path = os.path.join(save_dir, \"markdown\", markdown_filename)\n        os.makedirs(os.path.dirname(markdown_path), exist_ok=True)\n\n        with open(markdown_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(markdown_content)\n\n        # print(f\"Combined markdown saved to: {markdown_path}\")\n\n    except ImportError:\n        print(\"MarkdownConverter not available, skipping markdown generation\")\n    except Exception as e:\n        print(f\"Error generating markdown: {e}\")\n\n    # print(f\"Combined JSON results saved to: {json_path}\")\n    return json_path\n\n\ndef extract_labels_from_string(text):\n    \"\"\"\n    from [202,217,921,325][para][author] extract para and author\n    \"\"\"\n    all_matches = re.findall(r'\\[([^\\]]+)\\]', text)\n    \n    labels = []\n    for match in all_matches:\n        if not re.match(r'^\\d+,\\d+,\\d+,\\d+$', match):\n            labels.append(match)\n    \n    return labels\n\n\ndef parse_layout_string(bbox_str):\n    \"\"\"\n    Dolphin-V1.5 layout string parsing function\n    Parse layout string to extract bbox and category information\n    Supports multiple formats:\n    1. Original format: [x1,y1,x2,y2] label\n    2. 
New format: [x1,y1,x2,y2][label][PAIR_SEP] or [x1,y1,x2,y2][label][meta_info][PAIR_SEP]\n    \"\"\"\n    parsed_results = []\n    \n    segments = bbox_str.split('[PAIR_SEP]')\n    new_segments = []\n    for seg in segments:\n        new_segments.extend(seg.split('[RELATION_SEP]'))\n    segments = new_segments\n    for segment in segments:\n        segment = segment.strip()\n        if not segment:\n            continue\n        \n        coord_pattern = r'\\[(\\d*\\.?\\d+),(\\d*\\.?\\d+),(\\d*\\.?\\d+),(\\d*\\.?\\d+)\\]'\n        coord_match = re.search(coord_pattern, segment)\n        label_matches = extract_labels_from_string(segment)\n        \n        if coord_match and label_matches:\n            coords = [float(coord_match.group(i)) for i in range(1, 5)]\n            label = label_matches[0].strip()\n            parsed_results.append((coords, label, label_matches[1:])) # label_matches[1:] are tags\n    \n    return parsed_results\n\n\ndef process_coordinates(coords, pil_image):\n    original_w, original_h = pil_image.size[:2]\n    # use the same resize logic as the model\n    resized_pil = resize_img(pil_image)\n    resized_image = np.array(resized_pil)\n    resized_h, resized_w = resized_image.shape[:2]\n    resized_h, resized_w = smart_resize(resized_h, resized_w, factor=28, min_pixels=784, max_pixels=2560000)\n\n    w_ratio, h_ratio = original_w / resized_w, original_h / resized_h\n    x1 = int(coords[0] * w_ratio)\n    y1 = int(coords[1] * h_ratio)\n    x2 = int(coords[2] * w_ratio)\n    y2 = int(coords[3] * h_ratio)\n\n    x1 = max(0, min(x1, original_w - 1))\n    y1 = max(0, min(y1, original_h - 1))\n    x2 = max(x1 + 1, min(x2, original_w))\n    y2 = max(y1 + 1, min(y2, original_h))\n    return x1, y1, x2, y2\n\n\ndef setup_output_dirs(save_dir):\n    \"\"\"Create necessary output directories\"\"\"\n    os.makedirs(save_dir, exist_ok=True)\n    os.makedirs(os.path.join(save_dir, \"markdown\"), exist_ok=True)\n    os.makedirs(os.path.join(save_dir, 
\"output_json\"), exist_ok=True)\n    os.makedirs(os.path.join(save_dir, \"markdown\", \"figures\"), exist_ok=True)\n    os.makedirs(os.path.join(save_dir, \"layout_visualization\"), exist_ok=True)\n\n\ndef save_outputs(recognition_results, image, image_name, save_dir):\n    \"\"\"Save JSON and markdown outputs\"\"\"\n\n    # Save JSON file\n    json_path = os.path.join(save_dir, \"output_json\", f\"{image_name}.json\")\n    with open(json_path, \"w\", encoding=\"utf-8\") as f:\n        json.dump(recognition_results, f, ensure_ascii=False, indent=2)\n\n    # Generate and save markdown file\n    markdown_converter = MarkdownConverter()\n    markdown_content = markdown_converter.convert(recognition_results)\n    markdown_path = os.path.join(save_dir, \"markdown\", f\"{image_name}.md\")\n    with open(markdown_path, \"w\", encoding=\"utf-8\") as f:\n        f.write(markdown_content)\n\n    # visualize layout\n    # Save visualization (pass original PIL image for coordinate mapping)\n    vis_path = os.path.join(save_dir, \"layout_visualization\", f\"{image_name}_layout.png\")\n\n    visualize_layout(image, recognition_results, vis_path)\n    return json_path\n\n\ndef crop_margin(img: Image.Image) -> Image.Image:\n    \"\"\"Crop margins from image\"\"\"\n    try:\n        width, height = img.size\n        if width == 0 or height == 0:\n            print(\"Warning: Image has zero width or height\")\n            return img\n\n        data = np.array(img.convert(\"L\"))\n        data = data.astype(np.uint8)\n        max_val = data.max()\n        min_val = data.min()\n        if max_val == min_val:\n            return img\n        data = (data - min_val) / (max_val - min_val) * 255\n        gray = 255 * (data < 200).astype(np.uint8)\n\n        coords = cv2.findNonZero(gray)  # Find all non-zero points (text)\n        if coords is None:\n            return img\n        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box\n\n        # Ensure crop 
coordinates are within image bounds\n        a = max(0, a)\n        b = max(0, b)\n        w = min(w, width - a)\n        h = min(h, height - b)\n\n        # Only crop if we have a valid region\n        if w > 0 and h > 0:\n            return img.crop((a, b, a + w, b + h))\n        return img\n    except Exception as e:\n        print(f\"crop_margin error: {str(e)}\")\n        return img  # Return original image on error\n\ndef visualize_layout(image_path, layout_results, save_path, alpha=0.3):\n    \"\"\"Visualize layout detection results on the image\n    \n    Args:\n        image_path: Path to the input image\n        layout_results: List of (bbox, label, tags) dict\n        save_path: Path to save the visualization\n        alpha: Transparency of the overlay (0-1, lower = more transparent)\n    \"\"\"\n    # Read image\n    if isinstance(image_path, str):\n        image = cv2.imread(image_path)\n    else:\n        # If it's already a PIL Image\n        image = cv2.cvtColor(np.array(image_path), cv2.COLOR_RGB2BGR)\n    \n    if image is None:\n        raise ValueError(f\"Failed to load image from {image_path}\")\n    \n    # Assign colors to all elements at once\n    element_colors = assign_colors_to_elements(len(layout_results))\n    \n    # Create overlay\n    overlay = image.copy()\n    \n    # Draw each layout element\n    for idx, layout_res in enumerate(layout_results):\n        if \"bbox\" not in layout_res:\n            return\n        bbox, label, reading_order, tags = layout_res[\"bbox\"], layout_res[\"label\"], layout_res[\"reading_order\"], layout_res[\"tags\"]\n       \n        x1,y1,x2,y2 = bbox\n        \n        # Get color for this element (assigned by order, not by label)\n        color = element_colors[idx]\n        \n        # Draw filled rectangle with transparency\n        cv2.rectangle(overlay, (x1,y1), (x2,y2), color, -1)\n        \n        # Draw border\n        cv2.rectangle(image, (x1,y1), (x2,y2), color, 3)\n        \n        # Add 
label text with background at the top-left corner (outside the box)\n        label_text = f\"{reading_order}: {label} | {tags}\"\n        font = cv2.FONT_HERSHEY_SIMPLEX\n        font_scale = 0.5\n        thickness = 1\n        \n        # Get text size\n        (text_width, text_height), baseline = cv2.getTextSize(\n            label_text, font, font_scale, thickness\n        )\n        \n        # Position text above the box (outside)\n        text_x = x1\n        text_y = y1 - 5  # 5 pixels above the box\n        \n        # If text would go outside the image at the top, put it inside the box instead\n        if text_y - text_height < 0:\n            text_y = y1 + text_height + 5\n        \n        # Draw text background\n        cv2.rectangle(\n            image,\n            (text_x - 2, text_y - text_height - 2),\n            (text_x + text_width + 2, text_y + baseline + 2),\n            (255, 255, 255),\n            -1\n        )\n        \n        # Draw text\n        cv2.putText(\n            image,\n            label_text,\n            (text_x, text_y),\n            font,\n            font_scale,\n            (0, 0, 0),\n            thickness\n        )\n    \n    # Blend the overlay with the original image\n    result = cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0)\n    \n    # Save the result\n    cv2.imwrite(save_path, result)\n    # print(f\"Layout visualization saved to {save_path}\")\n\n\ndef get_color_palette():\n    \"\"\"Get a visually pleasing color palette for layout visualization\n    \n    Returns:\n        List of BGR color tuples (semi-transparent, good for overlay)\n    \"\"\"\n    # Carefully selected color palette with good visual distinction\n    # Colors are chosen to be light, pleasant, and distinguishable\n    color_palette = [\n        (200, 255, 255),  # Light cyan\n        (255, 200, 255),  # Light magenta\n        (255, 255, 200),  # Light yellow\n        (200, 255, 200),  # Light green\n        (255, 220, 200),  # Light 
orange\n        (220, 200, 255),  # Light purple\n        (200, 240, 255),  # Light sky blue\n        (255, 240, 220),  # Light peach\n        (220, 255, 240),  # Light mint\n        (255, 220, 240),  # Light pink\n        (240, 255, 200),  # Light lime\n        (240, 220, 255),  # Light lavender\n        (200, 255, 240),  # Light turquoise\n        (255, 240, 200),  # Light apricot\n        (220, 240, 255),  # Light periwinkle\n        (255, 200, 220),  # Light rose\n        (220, 255, 220),  # Light jade\n        (255, 230, 200),  # Light salmon\n        (210, 230, 255),  # Light cornflower\n        (255, 210, 230),  # Light carnation\n    ]\n    return color_palette\n\n\ndef assign_colors_to_elements(num_elements):\n    \"\"\"Assign colors to elements in order\n    \n    Args:\n        num_elements: Number of elements to assign colors to\n        \n    Returns:\n        List of color tuples, one for each element\n    \"\"\"\n    palette = get_color_palette()\n    colors = []\n    \n    for i in range(num_elements):\n        # Cycle through the palette if we have more elements than colors\n        color_idx = i % len(palette)\n        colors.append(palette[color_idx])\n    \n    return colors\n\ndef resize_img(image, max_size=1600, min_size=28):\n    width, height = image.size\n    if max(width, height) < max_size and min(width, height) >= 28:\n        return image\n    \n    if max(width, height) > max_size:\n        if width > height:\n            new_width = max_size\n            new_height = int(height * (max_size / width))\n        else:\n            new_height = max_size\n            new_width = int(width * (max_size / height))\n        image = image.resize((new_width, new_height))\n        width, height = image.size\n    \n    if min(width, height) < 28:\n        if width < height:\n            new_width = min_size\n            new_height = int(height * (min_size / width))\n        else:\n            new_height = min_size\n            new_width = int(width 
* (min_size / height))\n        image = image.resize((new_width, new_height))\n\n    return image\n\n\ndef calculate_iou_matrix(boxes):\n    \"\"\"Vectorized IoU matrix calculation [N, N]\n    \n    Args:\n        boxes: List of bounding boxes in [x1, y1, x2, y2] format\n        \n    Returns:\n        numpy.ndarray: IoU matrix of shape [N, N]\n    \"\"\"\n    boxes = np.array(boxes)  # [N, 4]\n    \n    # Calculate areas\n    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])  # [N]\n    \n    # Broadcast to calculate intersection\n    lt = np.maximum(boxes[:, None, :2], boxes[None, :, :2])  # [N, N, 2]\n    rb = np.minimum(boxes[:, None, 2:], boxes[None, :, 2:])  # [N, N, 2]\n    \n    wh = np.clip(rb - lt, 0, None)  # [N, N, 2]\n    inter = wh[:, :, 0] * wh[:, :, 1]  # [N, N]\n    \n    # Calculate IoU\n    union = areas[:, None] + areas[None, :] - inter\n    iou = inter / np.clip(union, 1e-6, None)\n    \n    return iou\n\n\ndef check_bbox_overlap(layout_results_list, image, iou_threshold=0.1, overlap_box_ratio=0.25):\n    \"\"\"Check if bounding boxes have significant overlaps, indicating a distorted/photographed document\n    \n    If more than overlap_box_ratio of the boxes have overlaps (IoU > iou_threshold with at least 1 other box),\n    treat as photographed document.\n    \n    Args:\n        layout_results_list: List of (bbox, label, tags) tuples\n        image: PIL Image object\n        iou_threshold: IoU threshold to consider two boxes as overlapping (default: 0.1)\n        overlap_box_ratio: Ratio threshold of boxes with overlaps (default: 0.25, i.e., 25%)\n    \n    Returns:\n        bool: True if significant overlap detected (should treat as distorted_page)\n    \"\"\"\n    if len(layout_results_list) <= 1:\n        return False\n    \n    # Convert to absolute coordinates\n    bboxes = []\n    for bbox, label, tags in layout_results_list:\n        x1, y1, x2, y2 = process_coordinates(bbox, image)\n        bboxes.append([x1, y1, x2, y2])\n    \n    # 
Vectorized IoU matrix calculation\n    iou_matrix = calculate_iou_matrix(bboxes)\n    \n    # Check if each box has overlap with any other box (excluding itself)\n    overlap_mask = iou_matrix > iou_threshold\n    np.fill_diagonal(overlap_mask, False)  # Exclude self\n    has_overlap = overlap_mask.any(axis=1)  # Whether each box has overlap\n    \n    # Count boxes with overlaps\n    overlap_count = has_overlap.sum()\n    total_boxes = len(bboxes)\n    overlap_ratio = overlap_count / total_boxes\n    \n    # print(f\"Overlap detection: {overlap_count}/{total_boxes} boxes have overlaps (ratio: {overlap_ratio:.2%})\")\n    \n    # If the share of boxes with overlaps exceeds overlap_box_ratio, treat as photographed document\n    if overlap_ratio > overlap_box_ratio:\n        print(f\"⚠️ High overlap detected ({overlap_ratio:.2%} > {overlap_box_ratio:.2%}), treating as distorted/photographed document\")\n        return True\n    \n    return False\n\nif __name__ == \"__main__\":\n    bbox_str = \"[210,136,910,172][sec_0][PAIR_SEP][202,217,921,325][para][author][PAIR_SEP][520,341,604,367][para][PAIR_SEP][290,404,384,432][sec_1][paper_abstract][PAIR_SEP][156,448,520,723][para][paper_abstract][PAIR_SEP][125,740,290,768][sec_1][PAIR_SEP][125,781,552,1143][para][PAIR_SEP][125,1144,552,1400][para][RELATION_SEP][573,406,1000,561][para][PAIR_SEP][573,581,1001,943][para][PAIR_SEP][573,962,1001,1222][para][PAIR_SEP][573,1241,1001,1475][para][PAIR_SEP][126,1410,551,1470][fnote][PAIR_SEP][21,499,63,1163][watermark][meta_num]\"\n    print(parse_layout_string(bbox_str))"
  }
]