[
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "content": "name: Bug Report\ndescription: File a bug report\ntitle: \"[Bug]:\"\nlabels: [\"bug\", \"triage\"]\n\nbody:\n  - type: checkboxes\n    id: existingcheck\n    attributes:\n      label: Do you need to file an issue?\n      description: Please help us manage our time by avoiding duplicates and common bugs with the steps below.\n      options:\n        - label: I have searched the existing issues and this bug is not already filed.\n        - label: I believe this is a legitimate bug, not just a question or feature request.\n  - type: textarea\n    id: description\n    attributes:\n      label: Describe the bug\n      description: A clear and concise description of what the bug is.\n      placeholder: What went wrong?\n  - type: textarea\n    id: reproduce\n    attributes:\n      label: Steps to reproduce\n      description: Steps to reproduce the behavior.\n      placeholder: How can we replicate the issue?\n  - type: textarea\n    id: expected_behavior\n    attributes:\n      label: Expected Behavior\n      description: A clear and concise description of what you expected to happen.\n      placeholder: What should have happened?\n  - type: textarea\n    id: configused\n    attributes:\n      label: LightRAG Config Used\n      description: The LightRAG configuration used for the run.\n      placeholder: The settings content or LightRAG configuration\n      value: |\n        # Paste your config here\n  - type: textarea\n    id: screenshotslogs\n    attributes:\n      label: Logs and screenshots\n      description: If applicable, add screenshots and logs to help explain your problem.\n      placeholder: Add logs and screenshots here\n  - type: textarea\n    id: additional_information\n    attributes:\n      label: Additional Information\n      description: |\n        - LightRAG Version: e.g., v0.1.1\n        - Operating System: e.g., Windows 10, Ubuntu 20.04\n        - Python Version: e.g., 3.8\n        - Related Issues: e.g., #1\n        - Any other 
relevant information.\n      value: |\n        - LightRAG Version:\n        - Operating System:\n        - Python Version:\n        - Related Issues:\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.yml",
    "content": "name: Feature Request\ndescription: File a feature request\nlabels: [\"enhancement\"]\ntitle: \"[Feature Request]:\"\n\nbody:\n  - type: checkboxes\n    id: existingcheck\n    attributes:\n      label: Do you need to file a feature request?\n      description: Please help us manage our time by avoiding duplicates and common feature request with the steps below.\n      options:\n        - label: I have searched the existing feature request and this feature request is not already filed.\n        - label: I believe this is a legitimate feature request, not just a question or bug.\n  - type: textarea\n    id: feature_request_description\n    attributes:\n      label: Feature Request Description\n      description: A clear and concise description of the feature request you would like.\n      placeholder: What this feature request add more or improve?\n  - type: textarea\n    id: additional_context\n    attributes:\n      label: Additional Context\n      description: Add any other context or screenshots about the feature request here.\n      placeholder: Any additional information\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/question.yml",
    "content": "name: Question\ndescription: Ask a general question\nlabels: [\"question\"]\ntitle: \"[Question]:\"\n\nbody:\n  - type: checkboxes\n    id: existingcheck\n    attributes:\n      label: Do you need to ask a question?\n      description: Please help us manage our time by avoiding duplicates and common questions with the steps below.\n      options:\n        - label: I have searched the existing question and discussions and this question is not already answered.\n        - label: I believe this is a legitimate question, not just a bug or feature request.\n  - type: textarea\n    id: question\n    attributes:\n      label: Your Question\n      description: A clear and concise description of your question.\n      placeholder: What is your question?\n  - type: textarea\n    id: context\n    attributes:\n      label: Additional Context\n      description: Provide any additional context or details that might help us understand your question better.\n      placeholder: Add any relevant information here\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "# To get started with Dependabot version updates, you'll need to specify which\n# package ecosystems to update and where the package manifests are located.\n# Please see the documentation for all configuration options:\n# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file\n\nversion: 2\nupdates:\n  - package-ecosystem: \"pip\" # See documentation for possible values\n    directory: \"/\" # Location of package manifests\n    schedule:\n      interval: \"weekly\"\n"
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "<!--\nThanks for contributing to RAGAnything!\n\nPlease ensure your pull request is ready for review before submitting.\n\nAbout this template\n\nThis template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed.\n-->\n\n## Description\n\n[Briefly describe the changes made in this pull request.]\n\n## Related Issues\n\n[Reference any related issues or tasks addressed by this pull request.]\n\n## Changes Made\n\n[List the specific changes made in this pull request.]\n\n## Checklist\n\n- [ ] Changes tested locally\n- [ ] Code reviewed\n- [ ] Documentation updated (if necessary)\n- [ ] Unit tests added (if applicable)\n\n## Additional Notes\n\n[Add any additional notes or context for the reviewer(s).]\n"
  },
  {
    "path": ".github/workflows/linting.yaml",
    "content": "name: Linting and Formatting\n\non:\n    push:\n        branches:\n            - main\n    pull_request:\n        branches:\n            - main\n\njobs:\n    lint-and-format:\n        runs-on: ubuntu-latest\n\n        steps:\n            - name: Checkout code\n              uses: actions/checkout@v2\n\n            - name: Set up Python\n              uses: actions/setup-python@v2\n              with:\n                python-version: '3.x'\n\n            - name: Install dependencies\n              run: |\n                python -m pip install --upgrade pip\n                pip install pre-commit\n\n            - name: Run pre-commit\n              run: pre-commit run --all-files --show-diff-on-failure\n\n            - name: Commit lint changes\n              uses: stefanzweifel/git-auto-commit-action@v5\n              with:\n                commit_message: \"chore: apply linting and formatting\"\n                branch: ${{ github.head_ref }}\n"
  },
  {
    "path": ".github/workflows/pypi-publish.yml",
    "content": "name: Upload RAGAnything Package\n\non:\n  release:\n    types: [published]\n\npermissions:\n  contents: read\n\njobs:\n  release-build:\n    runs-on: ubuntu-latest\n\n    steps:\n      - uses: actions/checkout@v4\n\n      - uses: actions/setup-python@v5\n        with:\n          python-version: \"3.x\"\n\n      - name: Build release distributions\n        run: |\n          python -m pip install build\n          python -m build\n\n      - name: Upload distributions\n        uses: actions/upload-artifact@v4\n        with:\n          name: release-dists\n          path: dist/\n\n  pypi-publish:\n    runs-on: ubuntu-latest\n    needs:\n      - release-build\n    permissions:\n      id-token: write\n\n    environment:\n      name: pypi\n\n    steps:\n      - name: Retrieve release distributions\n        uses: actions/download-artifact@v4\n        with:\n          name: release-dists\n          path: dist/\n\n      - name: Publish release distributions to PyPI\n        uses: pypa/gh-action-pypi-publish@release/v1\n        with:\n          packages-dir: dist/\n"
  },
  {
    "path": ".gitignore",
    "content": "# Python-related files\n__pycache__/\n*.py[cod]\n*.egg-info/\n.eggs/\n*.tgz\n*.tar.gz\n*.ini\n\n# Virtual Environment\n.venv/\nenv/\nvenv/\n\n*.env*\n.env_example\n\n# Build / Distribution\ndist/\nbuild/\nsite/\n\n# Logs / Reports\n*.log\n*.log.*\n*.logfire\n*.coverage/\nlog/\n\n# Caches\n.cache/\n.mypy_cache/\n.pytest_cache/\n.ruff_cache/\n.gradio/\n.history/\ntemp/\n\n# IDE / Editor Files\n.idea/\n.vscode/\n.vscode/settings.json\n\n# Framework-specific files\nlocal_neo4jWorkDir/\nneo4jWorkDir/\n\n# Data & Storage\ninputs/\nrag_storage*/\nexamples/input/\nexamples/output/\noutput*/\n\n# Miscellaneous\n.DS_Store\nTODO.md\nignore_this.txt\n*.ignore.*\n\n# Project-specific files\ndickens*/\nbook.txt\nLightRAG.pdf\nLightRAG_2-4.pdf\ndownload_models_hf.py\nlightrag-dev/\ngui/\ntiktoken_cache/\n\n# unit-test files\ntest_*\n\n# Cline files\nmemory-bank/\n\n# AI\n.claude/\n.cursor/\nCLAUDE.md\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v5.0.0\n    hooks:\n      - id: trailing-whitespace\n      - id: end-of-file-fixer\n      - id: requirements-txt-fixer\n\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: v0.6.4\n    hooks:\n      - id: ruff-format\n      - id: ruff\n        args: [--fix, --ignore=E402]\n\n\n  - repo: https://github.com/mgedmin/check-manifest\n    rev: \"0.49\"\n    hooks:\n      - id: check-manifest\n        stages: [manual]\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2025 ✨Data Intelligence Lab@HKU✨\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include requirements.txt\ninclude README.md\ninclude README_zh.md\ninclude LICENSE\nrecursive-include raganything *.py\nrecursive-include examples *.py\nglobal-exclude *.pyc\nglobal-exclude __pycache__\nglobal-exclude *.egg-info\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n\n<div style=\"margin: 20px 0;\">\n  <img src=\"./assets/logo.png\" width=\"120\" height=\"120\" alt=\"RAG-Anything Logo\" style=\"border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);\">\n</div>\n\n# 🚀 RAG-Anything: All-in-One RAG Framework\n\n<a href=\"https://trendshift.io/repositories/14959\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/14959\" alt=\"HKUDS%2FRAG-Anything | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/></a>\n\n<div align=\"center\">\n  <img src=\"https://readme-typing-svg.herokuapp.com?font=Orbitron&size=24&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=Welcome+to+RAG-Anything;Next-Gen+Multimodal+RAG+System;Powered+by+Advanced+AI+Technology\" alt=\"Typing Animation\" />\n</div>\n\n<div align=\"center\">\n  <div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;\">\n    <p>\n      <a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥Project-Page-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>\n      <a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>\n      <a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡Based%20on-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>\n    </p>\n    <p>\n      <a href=\"https://github.com/HKUDS/RAG-Anything/stargazers\"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>\n      <img src=\"https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e\">\n      <a 
href=\"https://pypi.org/project/raganything/\"><img src=\"https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b\"></a>\n      <a href=\"https://github.com/astral-sh/uv\"><img src=\"https://img.shields.io/badge/⚡uv-Ready-ff6b6b?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e\"></a>\n    </p>\n    <p>\n      <a href=\"https://discord.gg/yF2MmDJyGJ\"><img src=\"https://img.shields.io/badge/💬Discord-Community-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e\"></a>\n      <a href=\"https://github.com/HKUDS/RAG-Anything/issues/7\"><img src=\"https://img.shields.io/badge/💬WeChat-Group-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e\"></a>\n    </p>\n    <p>\n      <a href=\"README_zh.md\"><img src=\"https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge\"></a>\n      <a href=\"README.md\"><img src=\"https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge\"></a>\n    </p>\n  </div>\n</div>\n\n</div>\n\n<div align=\"center\">\n  <div style=\"width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);\"></div>\n</div>\n\n<div align=\"center\">\n  <a href=\"#-quick-start\" style=\"text-decoration: none;\">\n    <img src=\"https://img.shields.io/badge/Quick%20Start-Get%20Started%20Now-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e\">\n  </a>\n</div>\n\n---\n\n<div align=\"center\">\n  <table>\n    <tr>\n      <td style=\"vertical-align: middle;\">\n        <img src=\"./assets/LiteWrite.png\"\n             width=\"56\"\n             height=\"56\"\n             alt=\"LiteWrite\"\n             style=\"border-radius: 12px;\" />\n      </td>\n      <td style=\"vertical-align: middle; padding-left: 12px;\">\n        <a href=\"https://litewrite.ai\">\n          <img 
src=\"https://img.shields.io/badge/🚀%20LiteWrite-AI%20Native%20LaTeX%20Editor-ff6b6b?style=for-the-badge&logoColor=white&labelColor=1a1a2e\">\n        </a>\n      </td>\n    </tr>\n  </table>\n</div>\n\n---\n\n## 🎉 News\n- [X] [2025.10]🎯📢 🚀 We have released the technical report of [RAG-Anything](http://arxiv.org/abs/2510.12323). Access it now to explore our latest research findings.\n- [X] [2025.08]🎯📢 🔍 RAG-Anything now features **VLM-Enhanced Query** mode! When documents include images, the system seamlessly integrates them into VLM for advanced multimodal analysis, combining visual and textual context for deeper insights.\n- [X] [2025.07]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.\n- [X] [2025.07]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.\n- [X] [2025.07]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.\n\n---\n\n## 🌟 System Overview\n\n*Next-Generation Multimodal Intelligence*\n\n<div style=\"background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);\">\n\nModern documents increasingly contain diverse multimodal content—text, images, tables, equations, charts, and multimedia—that traditional text-focused RAG systems cannot effectively process. **RAG-Anything** addresses this challenge as a comprehensive **All-in-One Multimodal Document Processing RAG system** built on [LightRAG](https://github.com/HKUDS/LightRAG).\n\nAs a unified solution, RAG-Anything **eliminates the need for multiple specialized tools**. 
It provides **seamless processing and querying across all content modalities** within a single integrated framework. Unlike conventional RAG approaches that struggle with non-textual elements, our all-in-one system delivers **comprehensive multimodal retrieval capabilities**.\n\nUsers can query documents containing **interleaved text**, **visual diagrams**, **structured tables**, and **mathematical formulations** through **one cohesive interface**. This consolidated approach makes RAG-Anything particularly valuable for academic research, technical documentation, financial reports, and enterprise knowledge management where rich, mixed-content documents demand a **unified processing framework**.\n\n<img src=\"assets/rag_anything_framework.png\" alt=\"RAG-Anything\" />\n\n</div>\n\n### 🎯 Key Features\n\n<div style=\"background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;\">\n\n- **🔄 End-to-End Multimodal Pipeline** - Complete workflow from document ingestion and parsing to intelligent multimodal query answering\n- **📄 Universal Document Support** - Seamless processing of PDFs, Office documents, images, and diverse file formats\n- **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types\n- **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding\n- **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows\n- **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources\n- **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding\n\n</div>\n\n---\n\n## 🏗️ Algorithm & Architecture\n\n<div style=\"background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); 
border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;\">\n\n### Core Algorithm\n\n**RAG-Anything** implements an effective **multi-stage multimodal pipeline** that fundamentally extends traditional RAG architectures to seamlessly handle diverse content modalities through intelligent orchestration and cross-modal understanding.\n\n</div>\n\n<div align=\"center\">\n  <div style=\"width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);\">\n    <div style=\"display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;\">\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">📄</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">Document Parsing</div>\n      </div>\n      <div style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🧠</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">Content Analysis</div>\n      </div>\n      <div style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🔍</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">Knowledge Graph</div>\n      </div>\n      <div style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🎯</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">Intelligent Retrieval</div>\n      </div>\n    </div>\n  </div>\n</div>\n\n### 1. 
Document Parsing Stage\n\n<div style=\"background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;\">\n\nThe system provides high-fidelity document extraction through adaptive content decomposition. It intelligently segments heterogeneous elements while preserving contextual relationships. Universal format compatibility is achieved via specialized optimized parsers.\n\n**Key Components:**\n\n- **⚙️ MinerU Integration**: Leverages [MinerU](https://github.com/opendatalab/MinerU) for high-fidelity document structure extraction and semantic preservation across complex layouts.\n\n- **🧩 Adaptive Content Decomposition**: Automatically segments documents into coherent text blocks, visual elements, structured tables, mathematical equations, and specialized content types while preserving contextual relationships.\n\n- **📁 Universal Format Support**: Provides comprehensive handling of PDFs, Office documents (DOC/DOCX/PPT/PPTX/XLS/XLSX), images, and emerging formats through specialized parsers with format-specific optimization.\n\n</div>\n\n### 2. Multi-Modal Content Understanding & Processing\n\n<div style=\"background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;\">\n\nThe system automatically categorizes and routes content through optimized channels. It uses concurrent pipelines for parallel text and multimodal processing. Document hierarchy and relationships are preserved during transformation.\n\n**Key Components:**\n\n- **🎯 Autonomous Content Categorization and Routing**: Automatically identify, categorize, and route different content types through optimized execution channels.\n\n- **⚡ Concurrent Multi-Pipeline Architecture**: Implements concurrent execution of textual and multimodal content through dedicated processing pipelines. 
This approach maximizes throughput efficiency while preserving content integrity.\n\n- **🏗️ Document Hierarchy Extraction**: Extracts and preserves original document hierarchy and inter-element relationships during content transformation.\n\n</div>\n\n### 3. Multimodal Analysis Engine\n\n<div style=\"background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;\">\n\nThe system deploys modality-aware processing units for heterogeneous data modalities:\n\n**Specialized Analyzers:**\n\n- **🔍 Visual Content Analyzer**:\n  - Integrate vision model for image analysis.\n  - Generates context-aware descriptive captions based on visual semantics.\n  - Extracts spatial relationships and hierarchical structures between visual elements.\n\n- **📊 Structured Data Interpreter**:\n  - Performs systematic interpretation of tabular and structured data formats.\n  - Implements statistical pattern recognition algorithms for data trend analysis.\n  - Identifies semantic relationships and dependencies across multiple tabular datasets.\n\n- **📐 Mathematical Expression Parser**:\n  - Parses complex mathematical expressions and formulas with high accuracy.\n  - Provides native LaTeX format support for seamless integration with academic workflows.\n  - Establishes conceptual mappings between mathematical equations and domain-specific knowledge bases.\n\n- **🔧 Extensible Modality Handler**:\n  - Provides configurable processing framework for custom and emerging content types.\n  - Enables dynamic integration of new modality processors through plugin architecture.\n  - Supports runtime configuration of processing pipelines for specialized use cases.\n\n</div>\n\n### 4. 
Multimodal Knowledge Graph Index\n\n<div style=\"background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;\">\n\nThe multi-modal knowledge graph construction module transforms document content into structured semantic representations. It extracts multimodal entities, establishes cross-modal relationships, and preserves hierarchical organization. The system applies weighted relevance scoring for optimized knowledge retrieval.\n\n**Core Functions:**\n\n- **🔍 Multi-Modal Entity Extraction**: Transforms significant multimodal elements into structured knowledge graph entities. The process includes semantic annotations and metadata preservation.\n\n- **🔗 Cross-Modal Relationship Mapping**: Establishes semantic connections and dependencies between textual entities and multimodal components. This is achieved through automated relationship inference algorithms.\n\n- **🏗️ Hierarchical Structure Preservation**: Maintains original document organization through \"belongs_to\" relationship chains. These chains preserve logical content hierarchy and sectional dependencies.\n\n- **⚖️ Weighted Relationship Scoring**: Assigns quantitative relevance scores to relationship types. Scoring is based on semantic proximity and contextual significance within the document structure.\n\n</div>\n\n### 5. Modality-Aware Retrieval\n\n<div style=\"background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;\">\n\nThe hybrid retrieval system combines vector similarity search with graph traversal algorithms for comprehensive content retrieval. 
It implements modality-aware ranking mechanisms and maintains relational coherence between retrieved elements to ensure contextually integrated information delivery.\n\n**Retrieval Mechanisms:**\n\n- **🔀 Vector-Graph Fusion**: Integrates vector similarity search with graph traversal algorithms. This approach leverages both semantic embeddings and structural relationships for comprehensive content retrieval.\n\n- **📊 Modality-Aware Ranking**: Implements adaptive scoring mechanisms that weight retrieval results based on content type relevance. The system adjusts rankings according to query-specific modality preferences.\n\n- **🔗 Relational Coherence Maintenance**: Maintains semantic and structural relationships between retrieved elements. This ensures coherent information delivery and contextual integrity.\n\n</div>\n\n---\n\n## 🚀 Quick Start\n\n*Initialize Your AI Journey*\n\n<div align=\"center\">\n  <img src=\"https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif\" width=\"400\">\n</div>\n\n### Installation\n\n#### Option 1: Install from PyPI (Recommended)\n\n```bash\n# Basic installation\npip install raganything\n\n# With optional dependencies for extended format support:\npip install 'raganything[all]'              # All optional features\npip install 'raganything[image]'            # Image format conversion (BMP, TIFF, GIF, WebP)\npip install 'raganything[text]'             # Text file processing (TXT, MD)\npip install 'raganything[image,text]'       # Multiple features\n```\n\n#### Option 2: Install from Source\n```bash\n# Install uv (if not already installed)\ncurl -LsSf https://astral.sh/uv/install.sh | sh\n\n# Clone and setup the project with uv\ngit clone https://github.com/HKUDS/RAG-Anything.git\ncd RAG-Anything\n\n# Install the package and dependencies in a virtual environment\nuv sync\n\n# If you encounter network timeouts (especially for opencv packages):\n# UV_HTTP_TIMEOUT=120 uv sync\n\n# Run commands 
directly with uv (recommended approach)\nuv run python examples/raganything_example.py --help\n\n# Install with optional dependencies\nuv sync --extra image --extra text  # Specific extras\nuv sync --all-extras                 # All optional features\n```\n\n#### Optional Dependencies\n\n- **`[image]`** - Enables processing of BMP, TIFF, GIF, WebP image formats (requires Pillow)\n- **`[text]`** - Enables processing of TXT and MD files (requires ReportLab)\n- **`[all]`** - Includes all Python optional dependencies\n\n> **⚠️ Office Document Processing Requirements:**\n> - Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx) require **LibreOffice** installation\n> - Download from [LibreOffice official website](https://www.libreoffice.org/download/download/)\n> - **Windows**: Download installer from official website\n> - **macOS**: `brew install --cask libreoffice`\n> - **Ubuntu/Debian**: `sudo apt-get install libreoffice`\n> - **CentOS/RHEL**: `sudo yum install libreoffice`\n\n**Check MinerU installation:**\n\n```bash\n# Verify installation\nmineru --version\n\n# Check if properly configured\npython -c \"from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')\"\n```\n\nModels are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README.md#22-model-source-configuration).\n\n### Usage Examples\n\n#### 1. 
End-to-End Document Processing\n\n```python\nimport asyncio\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\n\nasync def main():\n    # Set up API configuration\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # Optional\n\n    # Create RAGAnything configuration\n    config = RAGAnythingConfig(\n        working_dir=\"./rag_storage\",\n        parser=\"mineru\",  # Parser selection: mineru, docling, or paddleocr\n        parse_method=\"auto\",  # Parse method: auto, ocr, or txt\n        enable_image_processing=True,\n        enable_table_processing=True,\n        enable_equation_processing=True,\n    )\n\n    # Define LLM model function\n    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n        return openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n\n    # Define vision model function for image processing\n    def vision_model_func(\n        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs\n    ):\n        # If messages format is provided (for multimodal VLM enhanced query), use it directly\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Traditional single image format\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                
history_messages=[],\n                messages=[\n                    {\"role\": \"system\", \"content\": system_prompt}\n                    if system_prompt\n                    else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                },\n                            },\n                        ],\n                    }\n                    if image_data\n                    else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Pure text format\n        else:\n            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    # Define embedding function\n    embedding_func = EmbeddingFunc(\n        embedding_dim=3072,\n        max_token_size=8192,\n        func=lambda texts: openai_embed.func(\n            texts,\n            model=\"text-embedding-3-large\",\n            api_key=api_key,\n            base_url=base_url,\n        ),\n    )\n\n    # Initialize RAGAnything\n    rag = RAGAnything(\n        config=config,\n        llm_model_func=llm_model_func,\n        vision_model_func=vision_model_func,\n        embedding_func=embedding_func,\n    )\n\n    # Process a document\n    await rag.process_document_complete(\n        file_path=\"path/to/your/document.pdf\",\n        output_dir=\"./output\",\n        parse_method=\"auto\"\n    )\n\n    # Query the processed content\n    # Pure text query - for basic knowledge base search\n    text_result = await rag.aquery(\n        \"What are the main findings shown in the figures 
and tables?\",\n        mode=\"hybrid\"\n    )\n    print(\"Text query result:\", text_result)\n\n    # Multimodal query with specific multimodal content\n    multimodal_result = await rag.aquery_with_multimodal(\n    \"Explain this formula and its relevance to the document content\",\n    multimodal_content=[{\n        \"type\": \"equation\",\n        \"latex\": \"P(d|q) = \\\\frac{P(q|d) \\\\cdot P(d)}{P(q)}\",\n        \"equation_caption\": \"Document relevance probability\"\n    }],\n    mode=\"hybrid\"\n)\n    print(\"Multimodal query result:\", multimodal_result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n#### 2. Direct Multimodal Content Processing\n\n```python\nimport asyncio\nfrom lightrag import LightRAG\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\nfrom raganything.modalprocessors import ImageModalProcessor, TableModalProcessor\n\nasync def process_multimodal_content():\n    # Set up API configuration\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # Optional\n\n    # Initialize LightRAG\n    rag = LightRAG(\n        working_dir=\"./rag_storage\",\n        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ),\n        embedding_func=EmbeddingFunc(\n            embedding_dim=3072,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=\"text-embedding-3-large\",\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n    )\n    await rag.initialize_storages()\n\n    # Process an image\n    image_processor = ImageModalProcessor(\n        
lightrag=rag,\n        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(\n            \"gpt-4o\",\n            \"\",\n            system_prompt=None,\n            history_messages=[],\n            messages=[\n                {\"role\": \"system\", \"content\": system_prompt} if system_prompt else None,\n                {\"role\": \"user\", \"content\": [\n                    {\"type\": \"text\", \"text\": prompt},\n                    {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"}}\n                ]} if image_data else {\"role\": \"user\", \"content\": prompt}\n            ],\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ) if image_data else openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n    )\n\n    image_content = {\n        \"img_path\": \"path/to/image.jpg\",\n        \"image_caption\": [\"Figure 1: Experimental results\"],\n        \"image_footnote\": [\"Data collected in 2024\"]\n    }\n\n    description, entity_info = await image_processor.process_multimodal_content(\n        modal_content=image_content,\n        content_type=\"image\",\n        file_path=\"research_paper.pdf\",\n        entity_name=\"Experimental Results Figure\"\n    )\n\n    # Process a table\n    table_processor = TableModalProcessor(\n        lightrag=rag,\n        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        
)\n    )\n\n    table_content = {\n        \"table_body\": \"\"\"\n        | Method | Accuracy | F1-Score |\n        |--------|----------|----------|\n        | RAGAnything | 95.2% | 0.94 |\n        | Baseline | 87.3% | 0.85 |\n        \"\"\",\n        \"table_caption\": [\"Performance Comparison\"],\n        \"table_footnote\": [\"Results on test dataset\"]\n    }\n\n    description, entity_info = await table_processor.process_multimodal_content(\n        modal_content=table_content,\n        content_type=\"table\",\n        file_path=\"research_paper.pdf\",\n        entity_name=\"Performance Results Table\"\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(process_multimodal_content())\n```\n\n#### 3. Batch Processing\n\n```python\n# Process multiple documents\nawait rag.process_folder_complete(\n    folder_path=\"./documents\",\n    output_dir=\"./output\",\n    file_extensions=[\".pdf\", \".docx\", \".pptx\"],\n    recursive=True,\n    max_workers=4\n)\n```\n\n#### 4. Custom Modal Processors\n\n```python\nfrom raganything.modalprocessors import GenericModalProcessor\n\nclass CustomModalProcessor(GenericModalProcessor):\n    async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):\n        # Your custom processing logic\n        enhanced_description = await self.analyze_custom_content(modal_content)\n        entity_info = self.create_custom_entity(enhanced_description, entity_name)\n        return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)\n```\n\n#### 5. 
Query Options\n\nRAG-Anything provides three types of query methods:\n\n**Pure Text Queries** - Direct knowledge base search using LightRAG:\n```python\n# Different query modes for text queries\ntext_result_hybrid = await rag.aquery(\"Your question\", mode=\"hybrid\")\ntext_result_local = await rag.aquery(\"Your question\", mode=\"local\")\ntext_result_global = await rag.aquery(\"Your question\", mode=\"global\")\ntext_result_naive = await rag.aquery(\"Your question\", mode=\"naive\")\n\n# Synchronous version\nsync_text_result = rag.query(\"Your question\", mode=\"hybrid\")\n```\n\n**VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:\n```python\n# VLM enhanced query (automatically enabled when vision_model_func is provided)\nvlm_result = await rag.aquery(\n    \"Analyze the charts and figures in the document\",\n    mode=\"hybrid\"\n    # vlm_enhanced=True is automatically set when vision_model_func is available\n)\n\n# Manually control VLM enhancement\nvlm_enabled = await rag.aquery(\n    \"What do the images show in this document?\",\n    mode=\"hybrid\",\n    vlm_enhanced=True  # Force enable VLM enhancement\n)\n\nvlm_disabled = await rag.aquery(\n    \"What do the images show in this document?\",\n    mode=\"hybrid\",\n    vlm_enhanced=False  # Force disable VLM enhancement\n)\n\n# When documents contain images, VLM can see and analyze them directly\n# The system will automatically:\n# 1. Retrieve relevant context containing image paths\n# 2. Load and encode images as base64\n# 3. 
Send both text context and images to VLM for comprehensive analysis\n```\n\n**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:\n```python\n# Query with table data\ntable_result = await rag.aquery_with_multimodal(\n    \"Compare these performance metrics with the document content\",\n    multimodal_content=[{\n        \"type\": \"table\",\n        \"table_data\": \"\"\"Method,Accuracy,Speed\n                        RAGAnything,95.2%,120ms\n                        Traditional,87.3%,180ms\"\"\",\n        \"table_caption\": \"Performance comparison\"\n    }],\n    mode=\"hybrid\"\n)\n\n# Query with equation content\nequation_result = await rag.aquery_with_multimodal(\n    \"Explain this formula and its relevance to the document content\",\n    multimodal_content=[{\n        \"type\": \"equation\",\n        \"latex\": \"P(d|q) = \\\\frac{P(q|d) \\\\cdot P(d)}{P(q)}\",\n        \"equation_caption\": \"Document relevance probability\"\n    }],\n    mode=\"hybrid\"\n)\n```\n\n#### 6. 
Loading Existing LightRAG Instance\n\n```python\nimport asyncio\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag import LightRAG\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.kg.shared_storage import initialize_pipeline_status\nfrom lightrag.utils import EmbeddingFunc\nimport os\n\nasync def load_existing_lightrag():\n    # Set up API configuration\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # Optional\n\n    # First, create or load existing LightRAG instance\n    lightrag_working_dir = \"./existing_lightrag_storage\"\n\n    # Check if previous LightRAG instance exists\n    if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):\n        print(\"✅ Found existing LightRAG instance, loading...\")\n    else:\n        print(\"❌ No existing LightRAG instance found, will create new one\")\n\n    # Create/load LightRAG instance with your configuration\n    lightrag_instance = LightRAG(\n        working_dir=lightrag_working_dir,\n        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ),\n        embedding_func=EmbeddingFunc(\n            embedding_dim=3072,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=\"text-embedding-3-large\",\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n    )\n\n    # Initialize storage (this will load existing data if available)\n    await lightrag_instance.initialize_storages()\n    await initialize_pipeline_status()\n\n    # Define vision model function for image processing\n    def vision_model_func(\n        prompt, 
system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs\n    ):\n        # If messages format is provided (for multimodal VLM enhanced query), use it directly\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Traditional single image format\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=[\n                    {\"role\": \"system\", \"content\": system_prompt}\n                    if system_prompt\n                    else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                },\n                            },\n                        ],\n                    }\n                    if image_data\n                    else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Pure text format\n        else:\n            return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    # Now use existing LightRAG instance to initialize RAGAnything\n    rag = RAGAnything(\n        lightrag=lightrag_instance,  # Pass existing 
LightRAG instance\n        vision_model_func=vision_model_func,\n        # Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance\n    )\n\n    # Query existing knowledge base\n    result = await rag.aquery(\n        \"What data has been processed in this LightRAG instance?\",\n        mode=\"hybrid\"\n    )\n    print(\"Query result:\", result)\n\n    # Add new multimodal document to existing LightRAG instance\n    await rag.process_document_complete(\n        file_path=\"path/to/new/multimodal_document.pdf\",\n        output_dir=\"./output\"\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(load_existing_lightrag())\n```\n\n#### 7. Direct Content List Insertion\n\nFor scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing:\n\n```python\nimport asyncio\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\n\nasync def insert_content_list_example():\n    # Set up API configuration\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # Optional\n\n    # Create RAGAnything configuration\n    config = RAGAnythingConfig(\n        working_dir=\"./rag_storage\",\n        enable_image_processing=True,\n        enable_table_processing=True,\n        enable_equation_processing=True,\n    )\n\n    # Define model functions\n    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n        return openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n\n    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, 
messages=None, **kwargs):\n        # If messages format is provided (for multimodal VLM enhanced query), use it directly\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Traditional single image format\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=[\n                    {\"role\": \"system\", \"content\": system_prompt} if system_prompt else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"}}\n                        ],\n                    } if image_data else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # Pure text format\n        else:\n            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    embedding_func = EmbeddingFunc(\n        embedding_dim=3072,\n        max_token_size=8192,\n        func=lambda texts: openai_embed.func(\n            texts,\n            model=\"text-embedding-3-large\",\n            api_key=api_key,\n            base_url=base_url,\n        ),\n    )\n\n    # Initialize RAGAnything\n    rag = RAGAnything(\n        config=config,\n        llm_model_func=llm_model_func,\n        vision_model_func=vision_model_func,\n        
embedding_func=embedding_func,\n    )\n\n    # Example: Pre-parsed content list from external source\n    content_list = [\n        {\n            \"type\": \"text\",\n            \"text\": \"This is the introduction section of our research paper.\",\n            \"page_idx\": 0  # Page number where this content appears\n        },\n        {\n            \"type\": \"image\",\n            \"img_path\": \"/absolute/path/to/figure1.jpg\",  # IMPORTANT: Use absolute path\n            \"image_caption\": [\"Figure 1: System Architecture\"],\n            \"image_footnote\": [\"Source: Authors' original design\"],\n            \"page_idx\": 1  # Page number where this image appears\n        },\n        {\n            \"type\": \"table\",\n            \"table_body\": \"| Method | Accuracy | F1-Score |\\n|--------|----------|----------|\\n| Ours | 95.2% | 0.94 |\\n| Baseline | 87.3% | 0.85 |\",\n            \"table_caption\": [\"Table 1: Performance Comparison\"],\n            \"table_footnote\": [\"Results on test dataset\"],\n            \"page_idx\": 2  # Page number where this table appears\n        },\n        {\n            \"type\": \"equation\",\n            \"latex\": \"P(d|q) = \\\\frac{P(q|d) \\\\cdot P(d)}{P(q)}\",\n            \"text\": \"Document relevance probability formula\",\n            \"page_idx\": 3  # Page number where this equation appears\n        },\n        {\n            \"type\": \"text\",\n            \"text\": \"In conclusion, our method demonstrates superior performance across all metrics.\",\n            \"page_idx\": 4  # Page number where this content appears\n        }\n    ]\n\n    # Insert the content list directly\n    await rag.insert_content_list(\n        content_list=content_list,\n        file_path=\"research_paper.pdf\",  # Reference file name for citation\n        split_by_character=None,         # Optional text splitting\n        split_by_character_only=False,   # Optional text splitting mode\n        doc_id=None,               
      # Optional custom document ID (will be auto-generated if not provided)\n        display_stats=True               # Show content statistics\n    )\n\n    # Query the inserted content\n    result = await rag.aquery(\n        \"What are the key findings and performance metrics mentioned in the research?\",\n        mode=\"hybrid\"\n    )\n    print(\"Query result:\", result)\n\n    # You can also insert multiple content lists with different document IDs\n    another_content_list = [\n        {\n            \"type\": \"text\",\n            \"text\": \"This is content from another document.\",\n            \"page_idx\": 0  # Page number where this content appears\n        },\n        {\n            \"type\": \"table\",\n            \"table_body\": \"| Feature | Value |\\n|---------|-------|\\n| Speed | Fast |\\n| Accuracy | High |\",\n            \"table_caption\": [\"Feature Comparison\"],\n            \"page_idx\": 1  # Page number where this table appears\n        }\n    ]\n\n    await rag.insert_content_list(\n        content_list=another_content_list,\n        file_path=\"another_document.pdf\",\n        doc_id=\"custom-doc-id-123\"  # Custom document ID\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(insert_content_list_example())\n```\n\n**Content List Format:**\n\nThe `content_list` should follow the standard format with each item being a dictionary containing:\n\n- **Text content**: `{\"type\": \"text\", \"text\": \"content text\", \"page_idx\": 0}`\n- **Image content**: `{\"type\": \"image\", \"img_path\": \"/absolute/path/to/image.jpg\", \"image_caption\": [\"caption\"], \"image_footnote\": [\"note\"], \"page_idx\": 1}`\n- **Table content**: `{\"type\": \"table\", \"table_body\": \"markdown table\", \"table_caption\": [\"caption\"], \"table_footnote\": [\"note\"], \"page_idx\": 2}`\n- **Equation content**: `{\"type\": \"equation\", \"latex\": \"LaTeX formula\", \"text\": \"description\", \"page_idx\": 3}`\n- **Generic content**: `{\"type\": 
\"custom_type\", \"content\": \"any content\", \"page_idx\": 4}`\n\n**Important Notes:**\n- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\\Users\\user\\images\\chart.jpg`)\n- **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing)\n- **Content ordering**: Items are processed in the order they appear in the list\n\nThis method is particularly useful when:\n- You have content from external parsers (non-MinerU/Docling)\n- You want to process programmatically generated content\n- You need to insert content from multiple sources into a single knowledge base\n- You have cached parsing results that you want to reuse\n\n---\n\n## 🛠️ Examples\n\n*Practical Implementation Demos*\n\n<div align=\"center\">\n  <img src=\"https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif\" width=\"300\">\n</div>\n\nThe `examples/` directory contains comprehensive usage examples:\n\n- **`raganything_example.py`**: End-to-end document processing with MinerU\n- **`modalprocessors_example.py`**: Direct multimodal content processing\n- **`office_document_test.py`**: Office document parsing test with MinerU (no API key required)\n- **`image_format_test.py`**: Image format parsing test with MinerU (no API key required)\n- **`text_format_test.py`**: Text format parsing test with MinerU (no API key required)\n\n**Run examples:**\n\n```bash\n# End-to-end processing with parser selection\npython examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru\n\n# Direct modal processing\npython examples/modalprocessors_example.py --api-key YOUR_API_KEY\n\n# Office document parsing test (MinerU only)\npython examples/office_document_test.py --file path/to/document.docx\n\n# Image format parsing test (MinerU only)\npython examples/image_format_test.py --file path/to/image.bmp\n\n# Text format parsing test (MinerU 
only)\npython examples/text_format_test.py --file path/to/document.md\n\n# Check LibreOffice installation\npython examples/office_document_test.py --check-libreoffice --file dummy\n\n# Check PIL/Pillow installation\npython examples/image_format_test.py --check-pillow --file dummy\n\n# Check ReportLab installation\npython examples/text_format_test.py --check-reportlab --file dummy\n```\n\n---\n\n## 🔧 Configuration\n\n*System Optimization Parameters*\n\n### Environment Variables\n\nCreate a `.env` file (refer to `.env.example`):\n\n```bash\nOPENAI_API_KEY=your_openai_api_key\nOPENAI_BASE_URL=your_base_url  # Optional\nOUTPUT_DIR=./output             # Default output directory for parsed documents\nPARSER=mineru                   # Parser selection: mineru, docling, or paddleocr\nPARSE_METHOD=auto              # Parse method: auto, ocr, or txt\n```\n\n**Note:** For backward compatibility, legacy environment variable names are still supported:\n- `MINERU_PARSE_METHOD` is deprecated; please use `PARSE_METHOD` instead\n\n> **Note**: API keys are only required for full RAG processing with LLM integration. 
The parsing test files (`office_document_test.py`, `image_format_test.py`, and `text_format_test.py`) only test parser functionality and do not require API keys.\n\n### Parser Configuration\n\nRAGAnything now supports multiple parsers, each with specific advantages:\n\n#### MinerU Parser\n- Supports PDF, images, Office documents, and more formats\n- Powerful OCR and table extraction capabilities\n- GPU acceleration support\n\n#### Docling Parser\n- Optimized for Office documents and HTML files\n- Better document structure preservation\n- Native support for multiple Office formats\n\n#### PaddleOCR Parser\n- OCR-focused parser for images and PDFs\n- Produces text blocks compatible with existing `content_list` processing\n- Supports optional Office/TXT/MD parsing by converting to PDF first\n\nInstall PaddleOCR parser extras:\n\n```bash\npip install -e \".[paddleocr]\"\n# or\nuv sync --extra paddleocr\n```\n\n> **Note**: PaddleOCR also requires `paddlepaddle` (CPU/GPU package varies by platform). Install it with the official guide: https://www.paddlepaddle.org.cn/install/quick\n\n### MinerU Configuration\n\n```bash\n# MinerU 2.0 uses command-line parameters instead of config files\n# Check available options:\nmineru --help\n\n# Common configurations:\nmineru -p input.pdf -o output_dir -m auto    # Automatic parsing mode\nmineru -p input.pdf -o output_dir -m ocr     # OCR-focused parsing\nmineru -p input.pdf -o output_dir -b pipeline --device cuda  # GPU acceleration\n```\n\nYou can also configure parsing through RAGAnything parameters:\n\n```python\n# Basic parsing configuration with parser selection\nawait rag.process_document_complete(\n    file_path=\"document.pdf\",\n    output_dir=\"./output/\",\n    parse_method=\"auto\",          # or \"ocr\", \"txt\"\n    parser=\"mineru\"               # Optional: \"mineru\", \"docling\", or \"paddleocr\"\n)\n\n# Advanced parsing configuration with special parameters\nawait rag.process_document_complete(\n    file_path=\"document.pdf\",\n    
output_dir=\"./output/\",\n    parse_method=\"auto\",          # Parsing method: \"auto\", \"ocr\", \"txt\"\n    parser=\"mineru\",              # Parser selection: \"mineru\", \"docling\", or \"paddleocr\"\n\n    # MinerU special parameters - all supported kwargs:\n    lang=\"ch\",                   # Document language for OCR optimization (e.g., \"ch\", \"en\", \"ja\")\n    device=\"cuda:0\",             # Inference device: \"cpu\", \"cuda\", \"cuda:0\", \"npu\", \"mps\"\n    start_page=0,                # Starting page number (0-based, for PDF)\n    end_page=10,                 # Ending page number (0-based, for PDF)\n    formula=True,                # Enable formula parsing\n    table=True,                  # Enable table parsing\n    backend=\"pipeline\",          # Parsing backend: pipeline|hybrid-auto-engine|hybrid-http-client|vlm-auto-engine|vlm-http-client.\n    source=\"huggingface\",        # Model source: \"huggingface\", \"modelscope\", \"local\"\n    # vlm_url=\"http://127.0.0.1:3000\" # Service address when using backend=vlm-http-client\n\n    # Standard RAGAnything parameters\n    display_stats=True,          # Display content statistics\n    split_by_character=None,     # Optional character to split text by\n    doc_id=None                  # Optional document ID\n)\n```\n\n> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. 
RAG-Anything supports multiple document parsers, including MinerU, Docling, and PaddleOCR.\n\n### Processing Requirements\n\nDifferent content types require specific optional dependencies:\n\n- **Office Documents** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): Install [LibreOffice](https://www.libreoffice.org/download/download/)\n- **Extended Image Formats** (.bmp, .tiff, .gif, .webp): Install with `pip install raganything[image]`\n- **Text Files** (.txt, .md): Install with `pip install raganything[text]`\n- **PaddleOCR Parser** (`parser=\"paddleocr\"`): Install with `pip install raganything[paddleocr]`, then install `paddlepaddle` for your platform\n\n> **📋 Quick Install**: Use `pip install raganything[all]` to enable all format support (Python dependencies only - LibreOffice still needs separate installation)\n\n---\n\n## 🧪 Supported Content Types\n\n### Document Formats\n\n- **PDFs** - Research papers, reports, presentations\n- **Office Documents** - DOC, DOCX, PPT, PPTX, XLS, XLSX\n- **Images** - JPG, PNG, BMP, TIFF, GIF, WebP\n- **Text Files** - TXT, MD\n\n### Multimodal Elements\n\n- **Images** - Photographs, diagrams, charts, screenshots\n- **Tables** - Data tables, comparison charts, statistical summaries\n- **Equations** - Mathematical formulas in LaTeX format\n- **Generic Content** - Custom content types via extensible processors\n\n*For installation of format-specific dependencies, see the [Configuration](#-configuration) section.*\n\n---\n\n## 📖 Citation\n\n*Academic Reference*\n\n<div align=\"center\">\n  <div style=\"width: 60px; height: 60px; margin: 20px auto; position: relative;\">\n    <div style=\"width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;\">\n      <div style=\"position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;\">📖</div>\n    </div>\n    <div style=\"position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 
20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);\"></div>\n  </div>\n</div>\n\nIf you find RAG-Anything useful in your research, please cite our paper:\n\n```bibtex\n@misc{guo2025raganythingallinoneragframework,\n      title={RAG-Anything: All-in-One RAG Framework},\n      author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},\n      year={2025},\n      eprint={2510.12323},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https://arxiv.org/abs/2510.12323},\n}\n```\n\n---\n\n## 🔗 Related Projects\n\n*Ecosystem & Extensions*\n\n<div align=\"center\">\n  <table>\n    <tr>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/LightRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">⚡</span>\n          </div>\n          <b>LightRAG</b><br>\n          <sub>Simple and Fast RAG</sub>\n        </a>\n      </td>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/VideoRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">🎥</span>\n          </div>\n          <b>VideoRAG</b><br>\n          <sub>Extreme Long-Context Video RAG</sub>\n        </a>\n      </td>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/MiniRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 
255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">✨</span>\n          </div>\n          <b>MiniRAG</b><br>\n          <sub>Extremely Simple RAG</sub>\n        </a>\n      </td>\n    </tr>\n  </table>\n</div>\n\n---\n\n## ⭐ Star History\n\n*Community Growth Trajectory*\n\n<div align=\"center\">\n  <a href=\"https://star-history.com/#HKUDS/RAG-Anything&Date\">\n    <picture>\n      <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark\" />\n      <source media=\"(prefers-color-scheme: light)\" srcset=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date\" />\n      <img alt=\"Star History Chart\" src=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date\" style=\"border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);\" />\n    </picture>\n  </a>\n</div>\n\n---\n\n## 🤝 Contribution\n\n*Join the Innovation*\n\n<div align=\"center\">\n  We thank all our contributors for their valuable contributions.\n</div>\n\n<div align=\"center\">\n  <a href=\"https://github.com/HKUDS/RAG-Anything/graphs/contributors\">\n    <img src=\"https://contrib.rocks/image?repo=HKUDS/RAG-Anything\" style=\"border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);\" />\n  </a>\n</div>\n\n---\n\n<div align=\"center\" style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;\">\n  <div>\n    <img src=\"https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif\" width=\"500\">\n  </div>\n  <div style=\"margin-top: 20px;\">\n    <a href=\"https://github.com/HKUDS/RAG-Anything\" style=\"text-decoration: none;\">\n      <img 
src=\"https://img.shields.io/badge/⭐%20Star%20us%20on%20GitHub-1a1a2e?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n    <a href=\"https://github.com/HKUDS/RAG-Anything/issues\" style=\"text-decoration: none;\">\n      <img src=\"https://img.shields.io/badge/🐛%20Report%20Issues-ff6b6b?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n    <a href=\"https://github.com/HKUDS/RAG-Anything/discussions\" style=\"text-decoration: none;\">\n      <img src=\"https://img.shields.io/badge/💬%20Discussions-4ecdc4?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n  </div>\n</div>\n\n<div align=\"center\">\n  <div style=\"width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);\">\n    <div style=\"display: flex; justify-content: center; align-items: center; gap: 15px;\">\n      <span style=\"font-size: 24px;\">⭐</span>\n      <span style=\"color: #00d9ff; font-size: 18px;\">Thank you for visiting RAG-Anything!</span>\n      <span style=\"font-size: 24px;\">⭐</span>\n    </div>\n    <div style=\"margin-top: 10px; color: #00d9ff; font-size: 16px;\">Building the Future of Multimodal AI</div>\n  </div>\n</div>\n"
  },
  {
    "path": "README_zh.md",
    "content": "<div align=\"center\">\n\n<div style=\"margin: 20px 0;\">\n  <img src=\"./assets/logo.png\" width=\"120\" height=\"120\" alt=\"RAG-Anything Logo\" style=\"border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);\">\n</div>\n\n# 🚀 RAG-Anything: All-in-One RAG System\n\n<div align=\"center\">\n  <div style=\"width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);\"></div>\n</div>\n\n<div align=\"center\">\n  <div style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;\">\n    <p>\n      <a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥项目-主页-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>\n      <a href='https://arxiv.org/abs/2510.12323'><img src='https://img.shields.io/badge/📄arXiv-2510.12323-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>\n      <a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡基于-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>\n    </p>\n    <p>\n      <a href=\"https://github.com/HKUDS/RAG-Anything/stargazers\"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>\n      <img src=\"https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e\">\n      <a href=\"https://pypi.org/project/raganything/\"><img src=\"https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b\"></a>\n    </p>\n    <p>\n      <a href=\"https://discord.gg/yF2MmDJyGJ\"><img src=\"https://img.shields.io/badge/💬Discord-社区-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e\"></a>\n      <a 
href=\"https://github.com/HKUDS/RAG-Anything/issues/7\"><img src=\"https://img.shields.io/badge/💬微信群-交流-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e\"></a>\n    </p>\n    <p>\n      <a href=\"README_zh.md\"><img src=\"https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge\"></a>\n      <a href=\"README.md\"><img src=\"https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge\"></a>\n    </p>\n  </div>\n</div>\n\n</div>\n\n<div align=\"center\" style=\"margin: 30px 0;\">\n  <img src=\"https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif\" width=\"800\">\n</div>\n\n<div align=\"center\">\n  <a href=\"#-快速开始\" style=\"text-decoration: none;\">\n    <img src=\"https://img.shields.io/badge/快速开始-立即开始使用-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e\">\n  </a>\n</div>\n\n---\n\n<div align=\"center\">\n  <table>\n    <tr>\n      <td style=\"vertical-align: middle;\">\n        <img src=\"./assets/LiteWrite.png\"\n             width=\"56\"\n             height=\"56\"\n             alt=\"LiteWrite\"\n             style=\"border-radius: 12px;\" />\n      </td>\n      <td style=\"vertical-align: middle; padding-left: 12px;\">\n        <a href=\"https://litewrite.ai\">\n          <img src=\"https://img.shields.io/badge/🚀%20LiteWrite-AI%20原生%20LaTeX%20编辑器-ff6b6b?style=for-the-badge&logoColor=white&labelColor=1a1a2e\">\n        </a>\n      </td>\n    </tr>\n  </table>\n</div>\n\n---\n\n## 🎉 新闻\n- [X] [2025.08.12]🎯📢 🔍 RAGAnything 现在支持 **VLM增强查询** 模式！当文档包含图片时，系统可以自动将图片与文本上下文一起直接传递给VLM进行综合多模态分析。\n- [X] [2025.07.05]🎯📢 RAGAnything 新增[上下文配置模块](docs/context_aware_processing.md)，支持为多模态内容处理添加相关上下文信息。\n- [X] [2025.07.04]🎯📢 RAGAnything 现在支持多模态内容查询，实现了集成文本、图像、表格和公式处理的增强检索生成功能。\n- [X] [2025.07.03]🎯📢 RAGAnything 在GitHub上达到了1K星标🌟！感谢您的支持和贡献。\n\n---\n\n## 🌟 系统概述\n\n*下一代多模态智能*\n\n<div style=\"background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 
15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);\">\n\n**RAG-Anything**是一个综合性多模态文档处理RAG系统。该系统能够无缝处理和查询包含文本、图像、表格、公式等多模态内容的复杂文档，提供完整的检索增强(RAG)生成解决方案。\n\n<img src=\"assets/rag_anything_framework.png\" alt=\"RAG-Anything\" />\n\n</div>\n\n### 🎯 核心特性\n\n<div style=\"background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;\">\n\n- **🔄 端到端多模态处理流水线** - 提供从文档解析到多模态查询响应的完整处理链路，确保系统的一体化运行\n- **📄 多格式文档支持** - 支持PDF、Office文档（DOC/DOCX/PPT/PPTX/XLS/XLSX）、图像等主流文档格式的统一处理和解析\n- **🧠 多模态内容分析引擎** - 针对图像、表格、公式和通用文本内容部署专门的处理器，确保各类内容的精准解析\n- **🔗 基于知识图谱索引** - 实现自动化实体提取和关系构建，建立跨模态的语义连接网络\n- **⚡ 灵活的处理架构** - 支持基于MinerU的智能解析模式和直接多模态内容插入模式，满足不同应用场景需求\n- **📋 直接内容列表插入** - 跳过文档解析，直接插入来自外部源的预解析内容列表，支持多种数据来源整合\n- **🎯 跨模态检索机制** - 实现跨文本和多模态内容的智能检索，提供精准的信息定位和匹配能力\n\n</div>\n\n---\n\n## 🏗️ 算法原理与架构\n\n<div style=\"background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;\">\n\n### 核心算法\n\n**RAG-Anything** 采用灵活的分层架构设计，实现多阶段多模态处理流水线，将传统RAG系统扩展为支持异构内容类型的综合处理平台。\n\n</div>\n\n<div align=\"center\">\n  <div style=\"width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);\">\n    <div style=\"display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;\">\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">📄</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">文档解析</div>\n      </div>\n      <div style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🧠</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">内容分析</div>\n      </div>\n      <div 
style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🔍</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">知识图谱</div>\n      </div>\n      <div style=\"font-size: 20px; color: #00d9ff;\">→</div>\n      <div style=\"text-align: center;\">\n        <div style=\"font-size: 24px; margin-bottom: 10px;\">🎯</div>\n        <div style=\"font-size: 14px; color: #00d9ff;\">智能检索</div>\n      </div>\n    </div>\n  </div>\n</div>\n\n### 1. 文档解析阶段\n\n<div style=\"background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;\">\n\n该系统构建了高精度文档解析平台，通过结构化提取引擎实现多模态元素的完整识别与提取。系统采用自适应内容分解机制，智能分离文档中的文本、图像、表格、公式等异构内容，并保持其语义关联性。同时支持PDF、Office文档、图像等主流格式的统一处理，提供标准化的多模态内容输出。\n\n**核心组件：**\n\n- **⚙️ 结构化提取引擎**：集成 [MinerU](https://github.com/opendatalab/MinerU) 文档解析框架，实现精确的文档结构识别与内容提取，确保多模态元素的完整性和准确性。\n\n- **🧩 自适应内容分解机制**：建立智能内容分离系统，自动识别并提取文档中的文本块、图像、表格、公式等异构元素，保持元素间的语义关联关系。\n\n- **📁 多格式兼容处理**：部署专业化解析器矩阵，支持PDF、Office文档系列（DOC/DOCX/PPT/PPTX/XLS/XLSX）、图像等主流格式的统一处理与标准化输出。\n\n</div>\n\n### 2. 多模态内容理解与处理\n\n<div style=\"background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;\">\n\n该多模态内容处理系统通过自主分类路由机制实现异构内容的智能识别与优化分发。系统采用并发多流水线架构，确保文本和多模态内容的高效并行处理，在最大化吞吐量的同时保持内容完整性，并能完整提取和保持原始文档的层次结构与元素关联关系。\n\n**核心组件：**\n\n- **🎯 自主内容分类与路由**：自动识别、分类并将不同内容类型路由至优化的执行通道。\n\n- **⚡ 并发多流水线架构**：通过专用处理流水线实现文本和多模态内容的并发执行。这种方法在保持内容完整性的同时最大化吞吐效率。\n\n- **🏗️ 文档层次结构提取**：在内容转换过程中提取并保持原始文档的层次结构和元素间关系。\n\n</div>\n\n### 3. 
多模态分析引擎\n\n<div style=\"background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;\">\n\n系统部署了面向异构数据模态的模态感知处理单元：\n\n**专用分析器：**\n\n- **🔍 视觉内容分析器**：\n  - 集成视觉模型进行图像分析和内容识别\n  - 基于视觉语义生成上下文感知的描述性标题\n  - 提取视觉元素间的空间关系和层次结构\n\n- **📊 结构化数据解释器**：\n  - 对表格和结构化数据格式进行系统性解释\n  - 实现数据趋势分析的统计模式识别算法\n  - 识别多个表格数据集间的语义关系和依赖性\n\n- **📐 数学表达式解析器**：\n  - 高精度解析复杂数学表达式和公式\n  - 提供原生LaTeX格式支持以实现与学术工作流的无缝集成\n  - 建立数学方程与领域特定知识库间的概念映射\n\n- **🔧 可扩展模态处理器**：\n  - 为自定义和新兴内容类型提供可配置的处理框架\n  - 通过插件架构实现新模态处理器的动态集成\n  - 支持专用场景下处理流水线的运行时配置\n\n</div>\n\n### 4. 多模态知识图谱索引\n\n<div style=\"background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;\">\n\n多模态知识图谱构建模块将文档内容转换为结构化语义表示。系统提取多模态实体，建立跨模态关系，并保持层次化组织结构。通过加权相关性评分实现优化的知识检索。\n\n**核心功能：**\n\n- **🔍 多模态实体提取**：将重要的多模态元素转换为结构化知识图谱实体。该过程包括语义标注和元数据保存。\n\n- **🔗 跨模态关系映射**：在文本实体和多模态组件之间建立语义连接和依赖关系。通过自动化关系推理算法实现这一功能。\n\n- **🏗️ 层次结构保持**：通过\"归属于\"关系链维护原始文档组织结构。这些关系链保持逻辑内容层次和章节依赖关系。\n\n- **⚖️ 加权关系评分**：为关系类型分配定量相关性分数。评分基于语义邻近性和文档结构内的上下文重要性。\n\n</div>\n\n### 5. 
模态感知检索\n\n<div style=\"background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;\">\n\n混合检索系统结合向量相似性搜索与图遍历算法，实现全面的内容检索。系统实现模态感知排序机制，并维护检索元素间的关系一致性，确保上下文集成的信息传递。\n\n**检索机制：**\n\n- **🔀 向量-图谱融合**：集成向量相似性搜索与图遍历算法。该方法同时利用语义嵌入和结构关系实现全面的内容检索。\n\n- **📊 模态感知排序**：实现基于内容类型相关性的自适应评分机制。系统根据查询特定的模态偏好调整排序结果。\n\n- **🔗 关系一致性维护**：维护检索元素间的语义和结构关系。确保信息传递的连贯性和上下文完整性。\n\n</div>\n\n---\n\n## 🚀 快速开始\n\n*启动您的AI之旅*\n\n<div align=\"center\">\n  <img src=\"https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif\" width=\"400\">\n</div>\n\n### 安装\n\n#### 选项1：从PyPI安装（推荐）\n\n```bash\n# 基础安装\npip install raganything\n\n# 安装包含扩展格式支持的可选依赖：\npip install 'raganything[all]'              # 所有可选功能\npip install 'raganything[image]'            # 图像格式转换 (BMP, TIFF, GIF, WebP)\npip install 'raganything[text]'             # 文本文件处理 (TXT, MD)\npip install 'raganything[image,text]'       # 多个功能组合\n```\n\n#### 选项2：从源码安装\n\n```bash\ngit clone https://github.com/HKUDS/RAG-Anything.git\ncd RAG-Anything\npip install -e .\n\n# 安装可选依赖\npip install -e '.[all]'\n```\n\n#### 可选依赖\n\n- **`[image]`** - 启用BMP、TIFF、GIF、WebP图像格式处理（需要Pillow）\n- **`[text]`** - 启用TXT和MD文件处理（需要ReportLab）\n- **`[all]`** - 包含所有Python可选依赖\n\n> **⚠️ Office文档处理配置要求：**\n> - Office文档 (.doc, .docx, .ppt, .pptx, .xls, .xlsx) 需要安装 **LibreOffice**\n> - 从[LibreOffice官网](https://www.libreoffice.org/download/download/)下载安装\n> - **Windows**：从官网下载安装包\n> - **macOS**：`brew install --cask libreoffice`\n> - **Ubuntu/Debian**：`sudo apt-get install libreoffice`\n> - **CentOS/RHEL**：`sudo yum install libreoffice`\n\n**检查MinerU安装：**\n\n```bash\n# 验证安装\nmineru --version\n\n# 检查是否正确配置\npython -c \"from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU安装正常' if rag.check_parser_installation() else '❌ 
MinerU安装有问题')\"\n```\n\n模型在首次使用时自动下载。手动下载参考[MinerU模型源配置](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#22-%E6%A8%A1%E5%9E%8B%E6%BA%90%E9%85%8D%E7%BD%AE)：\n\n### 使用示例\n\n#### 1. 端到端文档处理\n\n```python\nimport asyncio\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\n\nasync def main():\n    # 设置 API 配置\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # 可选\n\n    # 创建 RAGAnything 配置\n    config = RAGAnythingConfig(\n        working_dir=\"./rag_storage\",\n        parser=\"mineru\",  # 选择解析器：mineru 或 docling\n        parse_method=\"auto\",  # 解析方法：auto, ocr 或 txt\n        enable_image_processing=True,\n        enable_table_processing=True,\n        enable_equation_processing=True,\n    )\n\n    # 定义 LLM 模型函数\n    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n        return openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n\n    # 定义视觉模型函数用于图像处理\n    def vision_model_func(\n        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs\n    ):\n        # 如果提供了messages格式（用于多模态VLM增强查询），直接使用\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 传统单图片格式\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                
history_messages=[],\n                messages=[\n                    {\"role\": \"system\", \"content\": system_prompt}\n                    if system_prompt\n                    else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                },\n                            },\n                        ],\n                    }\n                    if image_data\n                    else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 纯文本格式\n        else:\n            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    # 定义嵌入函数\n    embedding_func = EmbeddingFunc(\n        embedding_dim=3072,\n        max_token_size=8192,\n        func=lambda texts: openai_embed.func(\n            texts,\n            model=\"text-embedding-3-large\",\n            api_key=api_key,\n            base_url=base_url,\n        ),\n    )\n\n    # 初始化 RAGAnything\n    rag = RAGAnything(\n        config=config,\n        llm_model_func=llm_model_func,\n        vision_model_func=vision_model_func,\n        embedding_func=embedding_func,\n    )\n\n    # 处理文档\n    await rag.process_document_complete(\n        file_path=\"path/to/your/document.pdf\",\n        output_dir=\"./output\",\n        parse_method=\"auto\"\n    )\n\n    # 查询处理后的内容\n    # 纯文本查询 - 基本知识库搜索\n    text_result = await rag.aquery(\n        \"文档的主要内容是什么？\",\n        mode=\"hybrid\"\n    )\n    print(\"文本查询结果:\", text_result)\n\n    # 多模态查询 - 包含具体多模态内容的查询\n    multimodal_result = await 
rag.aquery_with_multimodal(\n        \"分析这个性能数据并解释与现有文档内容的关系\",\n        multimodal_content=[{\n            \"type\": \"table\",\n            \"table_data\": \"\"\"系统,准确率,F1分数\n                            RAGAnything,95.2%,0.94\n                            基准方法,87.3%,0.85\"\"\",\n            \"table_caption\": \"性能对比结果\"\n        }],\n        mode=\"hybrid\"\n    )\n    print(\"多模态查询结果:\", multimodal_result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n#### 2. 直接多模态内容处理\n\n```python\nimport asyncio\nfrom lightrag import LightRAG\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\nfrom raganything.modalprocessors import ImageModalProcessor, TableModalProcessor\n\nasync def process_multimodal_content():\n    # 设置 API 配置\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # 可选\n\n    # 初始化 LightRAG\n    rag = LightRAG(\n        working_dir=\"./rag_storage\",\n        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ),\n        embedding_func=EmbeddingFunc(\n            embedding_dim=3072,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=\"text-embedding-3-large\",\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n    )\n    await rag.initialize_storages()\n\n    # 处理图像\n    image_processor = ImageModalProcessor(\n        lightrag=rag,\n        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(\n            \"gpt-4o\",\n            \"\",\n            system_prompt=None,\n           
 history_messages=[],\n            messages=[\n                {\"role\": \"system\", \"content\": system_prompt} if system_prompt else None,\n                {\"role\": \"user\", \"content\": [\n                    {\"type\": \"text\", \"text\": prompt},\n                    {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"}}\n                ]} if image_data else {\"role\": \"user\", \"content\": prompt}\n            ],\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ) if image_data else openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n    )\n\n    image_content = {\n        \"img_path\": \"path/to/image.jpg\",\n        \"image_caption\": [\"图1：实验结果\"],\n        \"image_footnote\": [\"数据收集于2024年\"]\n    }\n\n    description, entity_info = await image_processor.process_multimodal_content(\n        modal_content=image_content,\n        content_type=\"image\",\n        file_path=\"research_paper.pdf\",\n        entity_name=\"实验结果图表\"\n    )\n\n    # 处理表格\n    table_processor = TableModalProcessor(\n        lightrag=rag,\n        modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n    )\n\n    table_content = {\n        \"table_body\": \"\"\"\n        | 方法 | 准确率 | F1分数 |\n        |------|--------|--------|\n        | RAGAnything | 95.2% | 0.94 |\n        | 基准方法 | 87.3% | 0.85 |\n        \"\"\",\n        \"table_caption\": [\"性能对比\"],\n        \"table_footnote\": [\"测试数据集结果\"]\n  
  }\n\n    description, entity_info = await table_processor.process_multimodal_content(\n        modal_content=table_content,\n        content_type=\"table\",\n        file_path=\"research_paper.pdf\",\n        entity_name=\"性能结果表格\"\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(process_multimodal_content())\n```\n\n#### 3. 批量处理\n\n```python\n# 处理多个文档\nawait rag.process_folder_complete(\n    folder_path=\"./documents\",\n    output_dir=\"./output\",\n    file_extensions=[\".pdf\", \".docx\", \".pptx\"],\n    recursive=True,\n    max_workers=4\n)\n```\n\n#### 4. 自定义模态处理器\n\n```python\nfrom raganything.modalprocessors import GenericModalProcessor\n\nclass CustomModalProcessor(GenericModalProcessor):\n    async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):\n        # 你的自定义处理逻辑\n        enhanced_description = await self.analyze_custom_content(modal_content)\n        entity_info = self.create_custom_entity(enhanced_description, entity_name)\n        return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)\n```\n\n#### 5. 
查询选项\n\nRAG-Anything 提供三种类型的查询方法：\n\n**纯文本查询** - 使用LightRAG直接进行知识库搜索：\n```python\n# 文本查询的不同模式\ntext_result_hybrid = await rag.aquery(\"你的问题\", mode=\"hybrid\")\ntext_result_local = await rag.aquery(\"你的问题\", mode=\"local\")\ntext_result_global = await rag.aquery(\"你的问题\", mode=\"global\")\ntext_result_naive = await rag.aquery(\"你的问题\", mode=\"naive\")\n\n# 同步版本\nsync_text_result = rag.query(\"你的问题\", mode=\"hybrid\")\n```\n\n**VLM增强查询** - 使用VLM自动分析检索上下文中的图像：\n```python\n# VLM增强查询（当提供vision_model_func时自动启用）\nvlm_result = await rag.aquery(\n    \"分析文档中的图表和数据\",\n    mode=\"hybrid\"\n    # vlm_enhanced=True 当vision_model_func可用时自动设置\n)\n\n# 手动控制VLM增强\nvlm_enabled = await rag.aquery(\n    \"这个文档中的图片显示了什么内容？\",\n    mode=\"hybrid\",\n    vlm_enhanced=True  # 强制启用VLM增强\n)\n\nvlm_disabled = await rag.aquery(\n    \"这个文档中的图片显示了什么内容？\",\n    mode=\"hybrid\",\n    vlm_enhanced=False  # 强制禁用VLM增强\n)\n\n# 当文档包含图片时，VLM可以直接查看和分析图片\n# 系统将自动：\n# 1. 检索包含图片路径的相关上下文\n# 2. 加载图片并编码为base64格式\n# 3. 将文本上下文和图片一起发送给VLM进行综合分析\n```\n\n**多模态查询** - 包含特定多模态内容分析的增强查询：\n```python\n# 包含表格数据的查询\ntable_result = await rag.aquery_with_multimodal(\n    \"比较这些性能指标与文档内容\",\n    multimodal_content=[{\n        \"type\": \"table\",\n        \"table_data\": \"\"\"方法,准确率,速度\n                        LightRAG,95.2%,120ms\n                        传统方法,87.3%,180ms\"\"\",\n        \"table_caption\": \"性能对比\"\n    }],\n    mode=\"hybrid\"\n)\n\n# 包含公式内容的查询\nequation_result = await rag.aquery_with_multimodal(\n    \"解释这个公式及其与文档内容的相关性\",\n    multimodal_content=[{\n        \"type\": \"equation\",\n        \"latex\": \"P(d|q) = \\\\frac{P(q|d) \\\\cdot P(d)}{P(q)}\",\n        \"equation_caption\": \"文档相关性概率\"\n    }],\n    mode=\"hybrid\"\n)\n```\n\n#### 6. 
加载已存在的LightRAG实例\n\n```python\nimport asyncio\nfrom raganything import RAGAnything\nfrom lightrag import LightRAG\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.kg.shared_storage import initialize_pipeline_status\nfrom lightrag.utils import EmbeddingFunc\nimport os\n\nasync def load_existing_lightrag():\n    # 设置 API 配置\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # 可选\n\n    # 首先，创建或加载已存在的 LightRAG 实例\n    lightrag_working_dir = \"./existing_lightrag_storage\"\n\n    # 检查是否存在之前的 LightRAG 实例\n    if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):\n        print(\"✅ 发现已存在的 LightRAG 实例，正在加载...\")\n    else:\n        print(\"❌ 未找到已存在的 LightRAG 实例，将创建新实例\")\n\n    # 使用您的配置创建/加载 LightRAG 实例\n    lightrag_instance = LightRAG(\n        working_dir=lightrag_working_dir,\n        llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ),\n        embedding_func=EmbeddingFunc(\n            embedding_dim=3072,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=\"text-embedding-3-large\",\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n    )\n\n    # 初始化存储（如果有现有数据，这将加载它们）\n    await lightrag_instance.initialize_storages()\n    await initialize_pipeline_status()\n\n    # 定义视觉模型函数用于图像处理\n    def vision_model_func(\n        prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs\n    ):\n        # 如果提供了messages格式（用于多模态VLM增强查询），直接使用\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                
history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 传统单图片格式\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=[\n                    {\"role\": \"system\", \"content\": system_prompt}\n                    if system_prompt\n                    else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                },\n                            },\n                        ],\n                    }\n                    if image_data\n                    else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 纯文本格式\n        else:\n            return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    # 现在使用已存在的 LightRAG 实例初始化 RAGAnything\n    rag = RAGAnything(\n        lightrag=lightrag_instance,  # 传入已存在的 LightRAG 实例\n        vision_model_func=vision_model_func,\n        # 注意：working_dir、llm_model_func、embedding_func 等都从 lightrag_instance 继承\n    )\n\n    # 查询已存在的知识库\n    result = await rag.aquery(\n        \"这个 LightRAG 实例中处理了哪些数据？\",\n        mode=\"hybrid\"\n    )\n    print(\"查询结果:\", result)\n\n    # 向已存在的 LightRAG 实例添加新的多模态文档\n    await rag.process_document_complete(\n        
file_path=\"path/to/new/multimodal_document.pdf\",\n        output_dir=\"./output\"\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(load_existing_lightrag())\n```\n\n#### 7. 直接插入内容列表\n\n当您已经有预解析的内容列表（例如，来自外部解析器或之前的处理结果）时，可以直接插入到 RAGAnything 中而无需文档解析：\n\n```python\nimport asyncio\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\n\nasync def insert_content_list_example():\n    # 设置 API 配置\n    api_key = \"your-api-key\"\n    base_url = \"your-base-url\"  # 可选\n\n    # 创建 RAGAnything 配置\n    config = RAGAnythingConfig(\n        working_dir=\"./rag_storage\",\n        enable_image_processing=True,\n        enable_table_processing=True,\n        enable_equation_processing=True,\n    )\n\n    # 定义模型函数\n    def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n        return openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n\n    def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):\n        # 如果提供了messages格式（用于多模态VLM增强查询），直接使用\n        if messages:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 传统单图片格式\n        elif image_data:\n            return openai_complete_if_cache(\n                \"gpt-4o\",\n                \"\",\n                system_prompt=None,\n                history_messages=[],\n                messages=[\n                    {\"role\": \"system\", 
\"content\": system_prompt} if system_prompt else None,\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": prompt},\n                            {\"type\": \"image_url\", \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_data}\"}}\n                        ],\n                    } if image_data else {\"role\": \"user\", \"content\": prompt},\n                ],\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n        # 纯文本格式\n        else:\n            return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n    embedding_func = EmbeddingFunc(\n        embedding_dim=3072,\n        max_token_size=8192,\n        func=lambda texts: openai_embed.func(\n            texts,\n            model=\"text-embedding-3-large\",\n            api_key=api_key,\n            base_url=base_url,\n        ),\n    )\n\n    # 初始化 RAGAnything\n    rag = RAGAnything(\n        config=config,\n        llm_model_func=llm_model_func,\n        vision_model_func=vision_model_func,\n        embedding_func=embedding_func,\n    )\n\n    # 示例：来自外部源的预解析内容列表\n    content_list = [\n        {\n            \"type\": \"text\",\n            \"text\": \"这是我们研究论文的引言部分。\",\n            \"page_idx\": 0  # 此内容出现的页码\n        },\n        {\n            \"type\": \"image\",\n            \"img_path\": \"/absolute/path/to/figure1.jpg\",  # 重要：使用绝对路径\n            \"image_caption\": [\"图1：系统架构\"],\n            \"image_footnote\": [\"来源：作者原创设计\"],\n            \"page_idx\": 1  # 此图像出现的页码\n        },\n        {\n            \"type\": \"table\",\n            \"table_body\": \"| 方法 | 准确率 | F1分数 |\\n|------|--------|--------|\\n| 我们的方法 | 95.2% | 0.94 |\\n| 基准方法 | 87.3% | 0.85 |\",\n            \"table_caption\": [\"表1：性能对比\"],\n            \"table_footnote\": [\"测试数据集结果\"],\n            \"page_idx\": 2  # 此表格出现的页码\n      
  },\n        {\n            \"type\": \"equation\",\n            \"latex\": \"P(d|q) = \\\\frac{P(q|d) \\\\cdot P(d)}{P(q)}\",\n            \"text\": \"文档相关性概率公式\",\n            \"page_idx\": 3  # 此公式出现的页码\n        },\n        {\n            \"type\": \"text\",\n            \"text\": \"总之，我们的方法在所有指标上都表现出优越的性能。\",\n            \"page_idx\": 4  # 此内容出现的页码\n        }\n    ]\n\n    # 直接插入内容列表\n    await rag.insert_content_list(\n        content_list=content_list,\n        file_path=\"research_paper.pdf\",  # 用于引用的参考文件名\n        split_by_character=None,         # 可选的文本分割\n        split_by_character_only=False,   # 可选的文本分割模式\n        doc_id=None,                     # 可选的自定义文档ID（如果未提供将自动生成）\n        display_stats=True               # 显示内容统计信息\n    )\n\n    # 查询插入的内容\n    result = await rag.aquery(\n        \"研究中提到的主要发现和性能指标是什么？\",\n        mode=\"hybrid\"\n    )\n    print(\"查询结果:\", result)\n\n    # 您也可以使用不同的文档ID插入多个内容列表\n    another_content_list = [\n        {\n            \"type\": \"text\",\n            \"text\": \"这是来自另一个文档的内容。\",\n            \"page_idx\": 0  # 此内容出现的页码\n        },\n        {\n            \"type\": \"table\",\n            \"table_body\": \"| 特性 | 值 |\\n|------|----|\\n| 速度 | 快速 |\\n| 准确性 | 高 |\",\n            \"table_caption\": [\"特性对比\"],\n            \"page_idx\": 1  # 此表格出现的页码\n        }\n    ]\n\n    await rag.insert_content_list(\n        content_list=another_content_list,\n        file_path=\"another_document.pdf\",\n        doc_id=\"custom-doc-id-123\"  # 自定义文档ID\n    )\n\nif __name__ == \"__main__\":\n    asyncio.run(insert_content_list_example())\n```\n\n**内容列表格式：**\n\n`content_list` 应遵循标准格式，每个项目都是包含以下内容的字典：\n\n- **文本内容**: `{\"type\": \"text\", \"text\": \"内容文本\", \"page_idx\": 0}`\n- **图像内容**: `{\"type\": \"image\", \"img_path\": \"/absolute/path/to/image.jpg\", \"image_caption\": [\"标题\"], \"image_footnote\": [\"注释\"], \"page_idx\": 1}`\n- **表格内容**: `{\"type\": \"table\", \"table_body\": \"markdown表格\", \"table_caption\": [\"标题\"], 
\"table_footnote\": [\"注释\"], \"page_idx\": 2}`\n- **公式内容**: `{\"type\": \"equation\", \"latex\": \"LaTeX公式\", \"text\": \"描述\", \"page_idx\": 3}`\n- **通用内容**: `{\"type\": \"custom_type\", \"content\": \"任何内容\", \"page_idx\": 4}`\n\n**重要说明：**\n- **`img_path`**: 必须是图像文件的绝对路径（例如：`/home/user/images/chart.jpg` 或 `C:\\Users\\user\\images\\chart.jpg`）\n- **`page_idx`**: 表示内容在原始文档中出现的页码（从0开始的索引）\n- **内容顺序**: 项目按照在列表中出现的顺序进行处理\n\n此方法在以下情况下特别有用：\n- 您有来自外部解析器的内容（非MinerU/Docling）\n- 您想要处理程序化生成的内容\n- 您需要将来自多个源的内容插入到单个知识库中\n- 您有想要重用的缓存解析结果\n\n---\n\n## 🛠️ 示例\n\n*实际应用演示*\n\n<div align=\"center\">\n  <img src=\"https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif\" width=\"300\">\n</div>\n\n`examples/` 目录包含完整的使用示例：\n\n- **`raganything_example.py`**：基于MinerU的端到端文档处理\n- **`modalprocessors_example.py`**：直接多模态内容处理\n- **`office_document_test.py`**：Office文档解析测试（无需API密钥）\n- **`image_format_test.py`**：图像格式解析测试（无需API密钥）\n- **`text_format_test.py`**：文本格式解析测试（无需API密钥）\n\n**运行示例：**\n\n```bash\n# 端到端处理（包含解析器选择）\npython examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru\n\n# 直接模态处理\npython examples/modalprocessors_example.py --api-key YOUR_API_KEY\n\n# Office文档解析测试（仅MinerU功能）\npython examples/office_document_test.py --file path/to/document.docx\n\n# 图像格式解析测试（仅MinerU功能）\npython examples/image_format_test.py --file path/to/image.bmp\n\n# 文本格式解析测试（仅MinerU功能）\npython examples/text_format_test.py --file path/to/document.md\n\n# 检查LibreOffice安装\npython examples/office_document_test.py --check-libreoffice --file dummy\n\n# 检查PIL/Pillow安装\npython examples/image_format_test.py --check-pillow --file dummy\n\n# 检查ReportLab安装\npython examples/text_format_test.py --check-reportlab --file dummy\n```\n\n> **注意**：API密钥仅在完整RAG处理和LLM集成时需要。解析测试文件（`office_document_test.py`、`image_format_test.py` 和 `text_format_test.py`）仅测试MinerU功能，无需API密钥。\n\n---\n\n## 🔧 配置\n\n*系统优化参数*\n\n### 环境变量\n\n创建 `.env` 文件（参考 
`.env.example`）：\n\n```bash\nOPENAI_API_KEY=your_openai_api_key\nOPENAI_BASE_URL=your_base_url  # 可选\nOUTPUT_DIR=./output             # 解析文档的默认输出目录\nPARSER=mineru                   # 解析器选择：mineru 或 docling\nPARSE_METHOD=auto              # 解析方法：auto, ocr 或 txt\n```\n\n**注意：** 为了向后兼容，旧的环境变量名称仍然有效：\n- `MINERU_PARSE_METHOD` 已弃用，请使用 `PARSE_METHOD`\n\n### 解析器配置\n\nRAGAnything 现在支持多种解析器，每种解析器都有其特定的优势：\n\n#### MinerU 解析器\n- 支持PDF、图像、Office文档等多种格式\n- 强大的OCR和表格提取能力\n- 支持GPU加速\n\n#### Docling 解析器\n- 专门优化Office文档和HTML文件的解析\n- 更好的文档结构保持\n- 原生支持多种Office格式\n\n### MinerU配置\n\n```bash\n# MinerU 2.0使用命令行参数而不是配置文件\n# 查看可用选项：\nmineru --help\n\n# 常用配置：\nmineru -p input.pdf -o output_dir -m auto    # 自动解析模式\nmineru -p input.pdf -o output_dir -m ocr     # OCR重点解析\nmineru -p input.pdf -o output_dir -b pipeline --device cuda  # GPU加速\n```\n\n你也可以通过RAGAnything参数配置解析：\n\n```python\n# 基础解析配置和解析器选择\nawait rag.process_document_complete(\n    file_path=\"document.pdf\",\n    output_dir=\"./output/\",\n    parse_method=\"auto\",          # 或 \"ocr\", \"txt\"\n    parser=\"mineru\"               # 可选：\"mineru\" 或 \"docling\"\n)\n\n# 高级解析配置（包含特殊参数）\nawait rag.process_document_complete(\n    file_path=\"document.pdf\",\n    output_dir=\"./output/\",\n    parse_method=\"auto\",          # 解析方法：\"auto\", \"ocr\", \"txt\"\n    parser=\"mineru\",              # 解析器选择：\"mineru\" 或 \"docling\"\n\n    # MinerU特殊参数 - 支持的所有kwargs：\n    lang=\"ch\",                   # 文档语言优化（如：\"ch\", \"en\", \"ja\"）\n    device=\"cuda:0\",             # 推理设备：\"cpu\", \"cuda\", \"cuda:0\", \"npu\", \"mps\"\n    start_page=0,                # 起始页码（0为基准，适用于PDF）\n    end_page=10,                 # 结束页码（0为基准，适用于PDF）\n    formula=True,                # 启用公式解析\n    table=True,                  # 启用表格解析\n    backend=\"pipeline\",          # 解析后端：pipeline|hybrid-auto-engine|hybrid-http-client|vlm-auto-engine|vlm-http-client\n    source=\"huggingface\",        # 模型源：\"huggingface\", \"modelscope\", \"local\"\n    # 
vlm_url=\"http://127.0.0.1:3000\" # 当backend=vlm-http-client时，需指定服务地址\n\n    # RAGAnything标准参数\n    display_stats=True,          # 显示内容统计信息\n    split_by_character=None,     # 可选的文本分割字符\n    doc_id=None                  # 可选的文档ID\n)\n```\n\n> **注意**：MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。RAG-Anything现在支持多种文档解析器 - 你可以根据需要在MinerU和Docling之间选择。\n\n### 处理要求\n\n不同内容类型需要特定的可选依赖：\n\n- **Office文档** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): 安装并配置 [LibreOffice](https://www.libreoffice.org/download/download/)\n- **扩展图像格式** (.bmp, .tiff, .gif, .webp): 使用 `pip install raganything[image]` 安装\n- **文本文件** (.txt, .md): 使用 `pip install raganything[text]` 安装\n\n> **📋 快速安装**: 使用 `pip install raganything[all]` 启用所有格式支持（仅Python依赖 - LibreOffice仍需单独安装）\n\n---\n\n## 🧪 支持的内容类型\n\n### 文档格式\n\n- **PDF** - 研究论文、报告、演示文稿\n- **Office文档** - DOC、DOCX、PPT、PPTX、XLS、XLSX\n- **图像** - JPG、PNG、BMP、TIFF、GIF、WebP\n- **文本文件** - TXT、MD\n\n### 多模态元素\n\n- **图像** - 照片、图表、示意图、截图\n- **表格** - 数据表、对比图、统计摘要\n- **公式** - LaTeX格式的数学公式\n- **通用内容** - 通过可扩展处理器支持的自定义内容类型\n\n*格式特定依赖的安装说明请参见[配置](#-配置)部分。*\n\n---\n\n## 📖 引用\n\n*学术参考*\n\n<div align=\"center\">\n  <div style=\"width: 60px; height: 60px; margin: 20px auto; position: relative;\">\n    <div style=\"width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;\">\n      <div style=\"position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;\">📖</div>\n    </div>\n    <div style=\"position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);\"></div>\n  </div>\n</div>\n\n```bibtex\n@misc{guo2025raganythingallinoneragframework,\n      title={RAG-Anything: All-in-One RAG Framework},\n      author={Zirui Guo and Xubin Ren and Lingrui Xu and Jiahao Zhang and Chao Huang},\n      year={2025},\n      eprint={2510.12323},\n      
archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https://arxiv.org/abs/2510.12323},\n}\n```\n\n---\n\n## 🔗 相关项目\n\n*生态系统与扩展*\n\n<div align=\"center\">\n  <table>\n    <tr>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/LightRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">⚡</span>\n          </div>\n          <b>LightRAG</b><br>\n          <sub>简单快速的RAG系统</sub>\n        </a>\n      </td>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/VideoRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">🎥</span>\n          </div>\n          <b>VideoRAG</b><br>\n          <sub>超长上下文视频RAG系统</sub>\n        </a>\n      </td>\n      <td align=\"center\">\n        <a href=\"https://github.com/HKUDS/MiniRAG\">\n          <div style=\"width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;\">\n            <span style=\"font-size: 32px;\">✨</span>\n          </div>\n          <b>MiniRAG</b><br>\n          <sub>极简RAG系统</sub>\n        </a>\n      </td>\n    </tr>\n  </table>\n</div>\n\n---\n\n## ⭐ Star History\n\n*社区增长轨迹*\n\n<div align=\"center\">\n  <a href=\"https://star-history.com/#HKUDS/RAG-Anything&Date\">\n    <picture>\n     
 <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark\" />\n      <source media=\"(prefers-color-scheme: light)\" srcset=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date\" />\n      <img alt=\"Star History Chart\" src=\"https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date\" style=\"border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);\" />\n    </picture>\n  </a>\n</div>\n\n---\n\n## 🤝 贡献者\n\n*加入创新*\n\n<div align=\"center\">\n  感谢所有贡献者！\n</div>\n\n<div align=\"center\">\n  <a href=\"https://github.com/HKUDS/RAG-Anything/graphs/contributors\">\n    <img src=\"https://contrib.rocks/image?repo=HKUDS/RAG-Anything\" style=\"border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);\" />\n  </a>\n</div>\n\n---\n\n<div align=\"center\" style=\"background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;\">\n  <div>\n    <img src=\"https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif\" width=\"500\">\n  </div>\n  <div style=\"margin-top: 20px;\">\n    <a href=\"https://github.com/HKUDS/RAG-Anything\" style=\"text-decoration: none;\">\n      <img src=\"https://img.shields.io/badge/⭐%20在GitHub上为我们点星-1a1a2e?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n    <a href=\"https://github.com/HKUDS/RAG-Anything/issues\" style=\"text-decoration: none;\">\n      <img src=\"https://img.shields.io/badge/🐛%20报告问题-ff6b6b?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n    <a href=\"https://github.com/HKUDS/RAG-Anything/discussions\" style=\"text-decoration: none;\">\n      <img src=\"https://img.shields.io/badge/💬%20讨论交流-4ecdc4?style=for-the-badge&logo=github&logoColor=white\">\n    </a>\n  </div>\n</div>\n\n<div align=\"center\">\n  <div style=\"width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: 
linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);\">\n    <div style=\"display: flex; justify-content: center; align-items: center; gap: 15px;\">\n      <span style=\"font-size: 24px;\">⭐</span>\n      <span style=\"color: #00d9ff; font-size: 18px;\">感谢您访问RAG-Anything!</span>\n      <span style=\"font-size: 24px;\">⭐</span>\n    </div>\n    <div style=\"margin-top: 10px; color: #00d9ff; font-size: 16px;\">构建多模态AI的未来</div>\n  </div>\n</div>\n\n<div align=\"center\">\n  <img src=\"https://readme-typing-svg.herokuapp.com?font=Orbitron&size=20&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=感谢您访问RAG-Anything!;构建多模态AI的未来;如果觉得有用请点星⭐!\" alt=\"Closing Animation\" />\n</div>\n\n<style>\n@keyframes pulse {\n  0% { transform: scale(1); }\n  50% { transform: scale(1.05); }\n  100% { transform: scale(1); }\n}\n\n@keyframes glow {\n  0% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }\n  50% { box-shadow: 0 0 20px rgba(0, 217, 255, 0.8); }\n  100% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }\n}\n</style>\n"
  },
  {
    "path": "docs/batch_processing.md",
    "content": "# Batch Processing\n\nThis document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput.\n\n## Overview\n\nThe batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options.\n\n## Key Features\n\n- **Parallel Processing**: Process multiple files concurrently using thread pools\n- **Progress Tracking**: Real-time progress bars with `tqdm`\n- **Error Handling**: Comprehensive error reporting and recovery\n- **Flexible Input**: Support for files, directories, and recursive search\n- **Dry Run**: Preview which files would be processed without running parsers\n- **Configurable Workers**: Adjustable number of parallel workers\n- **Installation Check Bypass**: Optional skip for environments with package conflicts\n\n## Installation\n\n```bash\n# Basic installation\npip install raganything[all]\n\n# Required for batch processing\npip install tqdm\n\n# Optional for parser='paddleocr'\npip install raganything[paddleocr]\n```\n\n## Usage\n\n### Basic Batch Processing\n\n```python\nfrom raganything.batch_parser import BatchParser\n\n# Create batch parser\nbatch_parser = BatchParser(\n    parser_type=\"mineru\",  # or \"docling\" or \"paddleocr\"\n    max_workers=4,\n    show_progress=True,\n    timeout_per_file=300,\n    skip_installation_check=False  # Set to True if having parser installation issues\n)\n\n# Process multiple files\nresult = batch_parser.process_batch(\n    file_paths=[\"doc1.pdf\", \"doc2.docx\", \"folder/\"],\n    output_dir=\"./batch_output\",\n    parse_method=\"auto\",\n    recursive=True\n)\n\n# Check results\nprint(result.summary())\nprint(f\"Success rate: {result.success_rate:.1f}%\")\nprint(f\"Processing time: {result.processing_time:.2f} seconds\")\n```\n\n### 
Asynchronous Batch Processing\n\n```python\nimport asyncio\nfrom raganything.batch_parser import BatchParser\n\nasync def async_batch_processing():\n    batch_parser = BatchParser(\n        parser_type=\"mineru\",\n        max_workers=4,\n        show_progress=True\n    )\n\n    # Process files asynchronously\n    result = await batch_parser.process_batch_async(\n        file_paths=[\"doc1.pdf\", \"doc2.docx\"],\n        output_dir=\"./output\",\n        parse_method=\"auto\"\n    )\n\n    return result\n\n# Run async processing\nresult = asyncio.run(async_batch_processing())\n```\n\n### Integration with RAG-Anything\n\n```python\nfrom raganything import RAGAnything\n\nrag = RAGAnything()\n\n# Process documents with batch functionality\nresult = rag.process_documents_batch(\n    file_paths=[\"doc1.pdf\", \"doc2.docx\"],\n    output_dir=\"./output\",\n    max_workers=4,\n    show_progress=True\n)\n\nprint(f\"Processed {len(result.successful_files)} files successfully\")\n```\n\n### Process Documents with RAG Integration\n\n```python\n# Process documents in batch and then add them to RAG\nresult = await rag.process_documents_with_rag_batch(\n    file_paths=[\"doc1.pdf\", \"doc2.docx\"],\n    output_dir=\"./output\",\n    max_workers=4,\n    show_progress=True\n)\n\nprint(f\"Processed {result['successful_rag_files']} files with RAG\")\nprint(f\"Total processing time: {result['total_processing_time']:.2f} seconds\")\n```\n\n### Command Line Interface\n\n```bash\n# Basic batch processing\npython -m raganything.batch_parser examples/sample_docs/ --output ./output --workers 4\n\n# With specific parser\npython -m raganything.batch_parser examples/sample_docs/ --parser mineru --method auto\npython -m raganything.batch_parser examples/sample_docs/ --parser paddleocr --method ocr\n\n# Without progress bar\npython -m raganything.batch_parser examples/sample_docs/ --output ./output --no-progress\n\n# Dry run (list supported files without processing)\npython -m 
raganything.batch_parser examples/sample_docs/ --output ./output --dry-run\n\n# Help\npython -m raganything.batch_parser --help\n```\n\n## Configuration\n\n### Environment Variables\n\n```env\n# Batch processing configuration\nMAX_CONCURRENT_FILES=4\nSUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md\nRECURSIVE_FOLDER_PROCESSING=true\nPARSER_OUTPUT_DIR=./parsed_output\n```\n\n### BatchParser Parameters\n\n- **parser_type**: `\"mineru\"`, `\"docling\"`, or `\"paddleocr\"` (default: `\"mineru\"`)\n- **max_workers**: Number of parallel workers (default: `4`)\n- **show_progress**: Show progress bar (default: `True`)\n- **timeout_per_file**: Timeout per file in seconds (default: `300`)\n- **skip_installation_check**: Skip parser installation check (default: `False`)\n\n## Supported File Types\n\n- **PDF files**: `.pdf`\n- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx`\n- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp`\n- **Text files**: `.txt`, `.md`\n\n## API Reference\n\n### BatchProcessingResult\n\n```python\n@dataclass\nclass BatchProcessingResult:\n    successful_files: List[str]      # Successfully processed files\n    failed_files: List[str]          # Failed files\n    total_files: int                 # Total number of files\n    processing_time: float           # Total processing time in seconds\n    errors: Dict[str, str]           # Error messages for failed files\n    output_dir: str                  # Output directory used\n    dry_run: bool                    # True if run was a dry-run\n\n    def summary(self) -> str:        # Human-readable summary\n\n    @property\n    def success_rate(self) -> float: # Success rate as percentage\n```\n\n### BatchParser Methods\n\n```python\nclass BatchParser:\n    def __init__(self, parser_type: str = \"mineru\", max_workers: int = 4, ...):\n        \"\"\"Initialize batch parser\"\"\"\n\n    def get_supported_extensions(self) -> List[str]:\n        \"\"\"Get 
list of supported file extensions\"\"\"\n\n    def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:\n        \"\"\"Filter files to only supported types\"\"\"\n\n    def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:\n        \"\"\"Process files in batch\"\"\"\n\n    async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:\n        \"\"\"Process files in batch asynchronously\"\"\"\n```\n\n## Performance Considerations\n\n### Memory Usage\n- Each worker uses additional memory\n- Recommended: 2-4 workers for most systems\n- Monitor memory usage with large files\n\n### CPU Usage\n- Parallel processing utilizes multiple cores\n- Optimal worker count depends on CPU cores and file sizes\n- I/O may become bottleneck with many small files\n\n### Recommended Settings\n- **Small files** (< 1MB): Higher worker count (6-8)\n- **Large files** (> 100MB): Lower worker count (2-3)\n- **Mixed sizes**: Start with 4 workers and adjust\n\n## Troubleshooting\n\n### Common Issues\n\n#### Memory Errors\n```python\n# Solution: Reduce max_workers\nbatch_parser = BatchParser(max_workers=2)\n```\n\n#### Timeout Errors\n```python\n# Solution: Increase timeout_per_file\nbatch_parser = BatchParser(timeout_per_file=600)  # 10 minutes\n```\n\n#### Parser Installation Issues\n```python\n# Solution: Skip installation check\nbatch_parser = BatchParser(skip_installation_check=True)\n```\n\n#### File Not Found Errors\n- Check file paths and permissions\n- Ensure input files exist\n- Verify directory access rights\n\n### Debug Mode\n\nEnable debug logging for detailed information:\n\n```python\nimport logging\nlogging.basicConfig(level=logging.DEBUG)\n\n# Create batch parser with debug logging\nbatch_parser = BatchParser(parser_type=\"mineru\", max_workers=2)\n```\n\n### Error Handling\n\nThe batch processor provides comprehensive error handling:\n\n```python\nresult = 
batch_parser.process_batch(\n    file_paths=[\"doc1.pdf\", \"doc2.docx\"],\n    output_dir=\"./output\"\n)\n\n# Check for errors\nif result.failed_files:\n    print(\"Failed files:\")\n    for file_path in result.failed_files:\n        error_message = result.errors.get(file_path, \"Unknown error\")\n        print(f\"  - {file_path}: {error_message}\")\n\n# Process only successful files\nfor file_path in result.successful_files:\n    print(f\"Successfully processed: {file_path}\")\n```\n\n## Examples\n\n### Process Entire Directory\n\n```python\nfrom pathlib import Path\n\n# Process all supported files in a directory\nbatch_parser = BatchParser(max_workers=4)\ndirectory_path = Path(\"./documents\")\n\nresult = batch_parser.process_batch(\n    file_paths=[str(directory_path)],\n    output_dir=\"./processed\",\n    recursive=True  # Include subdirectories\n)\n\nprint(f\"Processed {len(result.successful_files)} out of {result.total_files} files\")\n```\n\n### Filter Files Before Processing\n\n```python\n# Get all files in directory\nall_files = [\"doc1.pdf\", \"image.png\", \"spreadsheet.xlsx\", \"unsupported.xyz\"]\n\n# Filter to supported files only\nsupported_files = batch_parser.filter_supported_files(all_files)\nprint(f\"Will process {len(supported_files)} out of {len(all_files)} files\")\n\n# Process only supported files\nresult = batch_parser.process_batch(\n    file_paths=supported_files,\n    output_dir=\"./output\"\n)\n```\n\n### Custom Error Handling\n\n```python\ndef process_with_retry(file_paths, max_retries=3):\n    \"\"\"Process files with retry logic\"\"\"\n\n    for attempt in range(max_retries):\n        result = batch_parser.process_batch(file_paths, \"./output\")\n\n        if not result.failed_files:\n            break  # All files processed successfully\n\n        print(f\"Attempt {attempt + 1}: {len(result.failed_files)} files failed\")\n        file_paths = result.failed_files  # Retry failed files\n\n    return result\n```\n\n## Best Practices\n\n1. 
**Start with default settings** and adjust based on performance\n2. **Monitor system resources** during batch processing\n3. **Use appropriate worker counts** for your hardware\n4. **Handle errors gracefully** with retry logic\n5. **Test with small batches** before processing large collections\n6. **Use skip_installation_check** if facing parser installation issues\n7. **Enable progress tracking** for long-running operations\n8. **Set appropriate timeouts** based on expected file processing times\n\n## Conclusion\n\nThe batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.\n"
  },
  {
    "path": "docs/context_aware_processing.md",
    "content": "# Context-Aware Multimodal Processing in RAGAnything\n\nThis document describes the context-aware multimodal processing feature in RAGAnything, which provides surrounding content information to LLMs when analyzing images, tables, equations, and other multimodal content for enhanced accuracy and relevance.\n\n## Overview\n\nThe context-aware feature enables RAGAnything to automatically extract and provide surrounding text content as context when processing multimodal content. This leads to more accurate and contextually relevant analysis by giving AI models additional information about where the content appears in the document structure.\n\n### Key Benefits\n\n- **Enhanced Accuracy**: Context helps AI understand the purpose and meaning of multimodal content\n- **Semantic Coherence**: Generated descriptions align with document context and terminology\n- **Automated Integration**: Context extraction is automatically enabled during document processing\n- **Flexible Configuration**: Multiple extraction modes and filtering options\n\n## Key Features\n\n### 1. Configuration Support\n- **Integrated Configuration**: Complete context options in `RAGAnythingConfig`\n- **Environment Variables**: Configure all context parameters via environment variables\n- **Dynamic Updates**: Runtime configuration updates supported\n- **Content Format Control**: Configurable content source format detection\n\n### 2. Automated Integration\n- **Auto-Initialization**: Modal processors automatically receive tokenizer and context configuration\n- **Content Source Setup**: Document processing automatically sets content sources for context extraction\n- **Position Information**: Automatic position info (page_idx, index) passed to processors\n- **Batch Processing**: Context-aware batch processing for efficient document handling\n\n### 3. 
Advanced Token Management\n- **Accurate Token Counting**: Uses LightRAG's tokenizer for precise token calculation\n- **Smart Boundary Preservation**: Truncates at sentence/paragraph boundaries\n- **Backward Compatibility**: Fallback to character truncation when tokenizer unavailable\n\n### 4. Universal Context Extraction\n- **Multiple Formats**: Support for MinerU, plain text, custom formats\n- **Flexible Modes**: Page-based and chunk-based context extraction\n- **Content Filtering**: Configurable content type filtering\n- **Header Support**: Optional inclusion of document headers and structure\n\n## Configuration\n\n### RAGAnythingConfig Parameters\n\n```python\n# Context Extraction Configuration\ncontext_window: int = 1                    # Context window size (pages/chunks)\ncontext_mode: str = \"page\"                 # Context mode (\"page\" or \"chunk\")\nmax_context_tokens: int = 2000             # Maximum context tokens\ninclude_headers: bool = True               # Include document headers\ninclude_captions: bool = True              # Include image/table captions\ncontext_filter_content_types: List[str] = [\"text\"]  # Content types to include\ncontent_format: str = \"minerU\"             # Default content format for context extraction\n```\n\n### Environment Variables\n\n```bash\n# Context extraction settings\nCONTEXT_WINDOW=2\nCONTEXT_MODE=page\nMAX_CONTEXT_TOKENS=3000\nINCLUDE_HEADERS=true\nINCLUDE_CAPTIONS=true\nCONTEXT_FILTER_CONTENT_TYPES=text,image\nCONTENT_FORMAT=minerU\n```\n\n## Usage Guide\n\n### 1. 
Basic Configuration\n\n```python\nfrom raganything import RAGAnything, RAGAnythingConfig\n\n# Create configuration with context settings\nconfig = RAGAnythingConfig(\n    context_window=2,\n    context_mode=\"page\",\n    max_context_tokens=3000,\n    include_headers=True,\n    include_captions=True,\n    context_filter_content_types=[\"text\", \"image\"],\n    content_format=\"minerU\"\n)\n\n# Create RAGAnything instance\nrag_anything = RAGAnything(\n    config=config,\n    llm_model_func=your_llm_function,\n    embedding_func=your_embedding_function\n)\n```\n\n### 2. Automatic Document Processing\n\n```python\n# Context is automatically enabled during document processing\nawait rag_anything.process_document_complete(\"document.pdf\")\n```\n\n### 3. Manual Content Source Configuration\n\n```python\n# Set content source for specific content lists\nrag_anything.set_content_source_for_context(content_list, \"minerU\")\n\n# Update context configuration at runtime\nrag_anything.update_context_config(\n    context_window=1,\n    max_context_tokens=1500,\n    include_captions=False\n)\n```\n\n### 4. 
Direct Modal Processor Usage\n\n```python\nfrom raganything.modalprocessors import (\n    ContextExtractor,\n    ContextConfig,\n    ImageModalProcessor\n)\n\n# Configure context extraction\nconfig = ContextConfig(\n    context_window=1,\n    context_mode=\"page\",\n    max_context_tokens=2000,\n    include_headers=True,\n    include_captions=True,\n    filter_content_types=[\"text\"]\n)\n\n# Initialize context extractor\ncontext_extractor = ContextExtractor(config)\n\n# Initialize modal processor with context support\nprocessor = ImageModalProcessor(lightrag, caption_func, context_extractor)\n\n# Set content source\nprocessor.set_content_source(content_list, \"minerU\")\n\n# Process with context\nitem_info = {\n    \"page_idx\": 2,\n    \"index\": 5,\n    \"type\": \"image\"\n}\n\nresult = await processor.process_multimodal_content(\n    modal_content=image_data,\n    content_type=\"image\",\n    file_path=\"document.pdf\",\n    entity_name=\"Architecture Diagram\",\n    item_info=item_info\n)\n```\n\n## Context Modes\n\n### Page-Based Context (`context_mode=\"page\"`)\n- Extracts context based on page boundaries\n- Uses `page_idx` field from content items\n- Suitable for document-structured content\n- Example: Include text from 2 pages before and after current image\n\n### Chunk-Based Context (`context_mode=\"chunk\"`)\n- Extracts context based on content item positions\n- Uses sequential position in content list\n- Suitable for fine-grained control\n- Example: Include 5 content items before and after current table\n\n## Processing Workflow\n\n### 1. Document Parsing\n```\nDocument Input → MinerU Parsing → content_list Generation\n```\n\n### 2. Context Setup\n```\ncontent_list → Set as Context Source → All Modal Processors Gain Context Capability\n```\n\n### 3. 
Multimodal Processing\n```\nMultimodal Content → Extract Surrounding Context → Enhanced LLM Analysis → More Accurate Results\n```\n\n## Content Source Formats\n\n### MinerU Format\n```json\n[\n    {\n        \"type\": \"text\",\n        \"text\": \"Document content here...\",\n        \"text_level\": 1,\n        \"page_idx\": 0\n    },\n    {\n        \"type\": \"image\",\n        \"img_path\": \"images/figure1.jpg\",\n        \"image_caption\": [\"Figure 1: Architecture\"],\n        \"image_footnote\": [],\n        \"page_idx\": 1\n    }\n]\n```\n\n### Custom Text Chunks\n```python\ntext_chunks = [\n    \"First chunk of text content...\",\n    \"Second chunk of text content...\",\n    \"Third chunk of text content...\"\n]\n```\n\n### Plain Text\n```python\nfull_document = \"Complete document text with all content...\"\n```\n\n## Configuration Examples\n\n### High-Precision Context\nFor focused analysis with minimal context:\n```python\nconfig = RAGAnythingConfig(\n    context_window=1,\n    context_mode=\"page\",\n    max_context_tokens=1000,\n    include_headers=True,\n    include_captions=False,\n    context_filter_content_types=[\"text\"]\n)\n```\n\n### Comprehensive Context\nFor broad analysis with rich context:\n```python\nconfig = RAGAnythingConfig(\n    context_window=2,\n    context_mode=\"page\",\n    max_context_tokens=3000,\n    include_headers=True,\n    include_captions=True,\n    context_filter_content_types=[\"text\", \"image\", \"table\"]\n)\n```\n\n### Chunk-Based Analysis\nFor fine-grained sequential context:\n```python\nconfig = RAGAnythingConfig(\n    context_window=5,\n    context_mode=\"chunk\",\n    max_context_tokens=2000,\n    include_headers=False,\n    include_captions=False,\n    context_filter_content_types=[\"text\"]\n)\n```\n\n## Performance Optimization\n\n### 1. Accurate Token Control\n- Uses real tokenizer for precise token counting\n- Avoids exceeding LLM token limits\n- Provides consistent performance\n\n### 2. 
Smart Truncation\n- Truncates at sentence boundaries\n- Maintains semantic integrity\n- Adds truncation indicators\n\n### 3. Caching Optimization\n- Context extraction results can be reused\n- Reduces redundant computation overhead\n\n## Advanced Features\n\n### Context Truncation\nThe system automatically truncates context to fit within token limits:\n- Uses actual tokenizer for accurate token counting\n- Attempts to end at sentence boundaries (periods)\n- Falls back to line boundaries if needed\n- Adds \"...\" indicator for truncated content\n\n### Header Formatting\nWhen `include_headers=True`, headers are formatted with markdown-style prefixes:\n```\n# Level 1 Header\n## Level 2 Header\n### Level 3 Header\n```\n\n### Caption Integration\nWhen `include_captions=True`, image and table captions are included as:\n```\n[Image: Figure 1 caption text]\n[Table: Table 1 caption text]\n```\n\n## Integration with RAGAnything\n\nThe context-aware feature is seamlessly integrated into RAGAnything's workflow:\n\n1. **Automatic Setup**: Context extractors are automatically created and configured\n2. **Content Source Management**: Document processing automatically sets content sources\n3. **Processor Integration**: All modal processors receive context capabilities\n4. **Configuration Consistency**: Single configuration system for all context settings\n\n## Error Handling\n\nThe system includes robust error handling:\n- Gracefully handles missing or invalid content sources\n- Returns empty context for unsupported formats\n- Logs warnings for configuration issues\n- Continues processing even if context extraction fails\n\n## Compatibility\n\n- **Backward Compatible**: Existing code works without modification\n- **Optional Feature**: Context can be selectively enabled/disabled\n- **Flexible Configuration**: Supports multiple configuration combinations\n\n## Best Practices\n\n1. **Token Limits**: Ensure `max_context_tokens` doesn't exceed LLM context limits\n2. 
**Performance Impact**: Larger context windows increase processing time\n3. **Content Quality**: Context quality directly affects analysis accuracy\n4. **Window Size**: Match window size to content structure (documents vs articles)\n5. **Content Filtering**: Use `context_filter_content_types` to reduce noise\n\n## Troubleshooting\n\n### Common Issues\n\n**Context Not Extracted**\n- Check if `set_content_source_for_context()` was called\n- Verify `item_info` contains required fields (`page_idx`, `index`)\n- Confirm content source format is correct\n\n**Context Too Long/Short**\n- Adjust `max_context_tokens` setting\n- Modify `context_window` size\n- Check `context_filter_content_types` configuration\n\n**Irrelevant Context**\n- Refine `context_filter_content_types` to exclude noise\n- Reduce `context_window` size\n- Set `include_captions=False` if captions are not helpful\n\n**Configuration Issues**\n- Verify environment variables are set correctly\n- Check RAGAnythingConfig parameter names\n- Ensure content_format matches your data source\n\n## Examples\n\nCheck out these example files for complete usage demonstrations:\n\n- **Configuration Examples**: See how to set up different context configurations\n- **Integration Examples**: Learn how to integrate context-aware processing into your workflow\n- **Custom Processors**: Examples of creating custom modal processors with context support\n\n## API Reference\n\nFor detailed API documentation, see the docstrings in:\n- `raganything/modalprocessors.py` - Context extraction and modal processors\n- `raganything/config.py` - Configuration options\n- `raganything/raganything.py` - Main RAGAnything class integration\n"
  },
  {
    "path": "docs/enhanced_markdown.md",
    "content": "# Enhanced Markdown Conversion\n\nThis document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling.\n\n## Overview\n\nThe enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline.\n\n## Key Features\n\n- **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection\n- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts\n- **Image Support**: Embedded images with proper scaling and positioning\n- **Table Support**: Formatted tables with borders and professional styling\n- **Code Highlighting**: Syntax highlighting for code blocks using Pygments\n- **Custom Templates**: Support for custom CSS and document templates\n- **Table of Contents**: Automatic TOC generation with navigation links\n- **Professional Typography**: High-quality fonts and spacing\n\n## Installation\n\n### Required Dependencies\n\n```bash\n# Basic installation\npip install raganything[all]\n\n# Required for enhanced markdown conversion\npip install markdown weasyprint pygments\n```\n\n### Optional Dependencies\n\n```bash\n# For Pandoc backend (system installation required)\n# Ubuntu/Debian:\nsudo apt-get install pandoc wkhtmltopdf\n\n# macOS:\nbrew install pandoc wkhtmltopdf\n\n# Or using conda:\nconda install -c conda-forge pandoc wkhtmltopdf\n```\n\n### Backend-Specific Installation\n\n#### WeasyPrint (Recommended)\n```bash\n# Install WeasyPrint with system dependencies\npip install weasyprint\n\n# Ubuntu/Debian system dependencies:\nsudo apt-get install -y build-essential python3-dev python3-pip \\\n    python3-setuptools python3-wheel python3-cffi libcairo2 \\\n    libpango-1.0-0 libpangocairo-1.0-0 
libgdk-pixbuf2.0-0 \\\n    libffi-dev shared-mime-info\n```\n\n#### Pandoc\n- Download from: https://pandoc.org/installing.html\n- Requires system-wide installation\n- Used for complex document structures and LaTeX-quality output\n\n## Usage\n\n### Basic Conversion\n\n```python\nfrom raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig\n\n# Create converter with default settings\nconverter = EnhancedMarkdownConverter()\n\n# Convert markdown file to PDF\nsuccess = converter.convert_file_to_pdf(\n    input_path=\"document.md\",\n    output_path=\"document.pdf\",\n    method=\"auto\"  # Automatically select best available backend\n)\n\nif success:\n    print(\"✅ Conversion successful!\")\nelse:\n    print(\"❌ Conversion failed\")\n```\n\n### Advanced Configuration\n\n```python\n# Create custom configuration\nconfig = MarkdownConfig(\n    page_size=\"A4\",           # A4, Letter, Legal, etc.\n    margin=\"1in\",             # CSS-style margins\n    font_size=\"12pt\",         # Base font size\n    line_height=\"1.5\",        # Line spacing\n    include_toc=True,         # Generate table of contents\n    syntax_highlighting=True, # Enable code syntax highlighting\n\n    # Custom CSS styling\n    custom_css=\"\"\"\n    body {\n        font-family: 'Georgia', serif;\n        color: #333;\n    }\n    h1 {\n        color: #2c3e50;\n        border-bottom: 2px solid #3498db;\n        padding-bottom: 0.3em;\n    }\n    code {\n        background-color: #f8f9fa;\n        padding: 2px 4px;\n        border-radius: 3px;\n    }\n    pre {\n        background-color: #f8f9fa;\n        border-left: 4px solid #3498db;\n        padding: 15px;\n        border-radius: 5px;\n    }\n    table {\n        border-collapse: collapse;\n        width: 100%;\n        margin: 1em 0;\n    }\n    th, td {\n        border: 1px solid #ddd;\n        padding: 8px 12px;\n        text-align: left;\n    }\n    th {\n        background-color: #f2f2f2;\n        font-weight: bold;\n   
 }\n    \"\"\"\n)\n\nconverter = EnhancedMarkdownConverter(config)\n```\n\n### Backend Selection\n\n```python\n# Check available backends\nconverter = EnhancedMarkdownConverter()\nbackend_info = converter.get_backend_info()\n\nprint(\"Available backends:\")\nfor backend, available in backend_info[\"available_backends\"].items():\n    status = \"✅\" if available else \"❌\"\n    print(f\"  {status} {backend}\")\n\nprint(f\"Recommended backend: {backend_info['recommended_backend']}\")\n\n# Use specific backend\nconverter.convert_file_to_pdf(\n    input_path=\"document.md\",\n    output_path=\"document.pdf\",\n    method=\"weasyprint\"  # or \"pandoc\", \"pandoc_system\", \"auto\"\n)\n```\n\n### Content Conversion\n\n```python\n# Convert markdown content directly (not from file)\nmarkdown_content = \"\"\"\n# Sample Document\n\n## Introduction\nThis is a **bold** statement with *italic* text.\n\n## Code Example\n```python\ndef hello_world():\n    print(\"Hello, World!\")\n    return \"Success\"\n```\n\n## Table\n| Feature | Status | Notes |\n|---------|--------|-------|\n| PDF Generation | ✅ | Working |\n| Syntax Highlighting | ✅ | Pygments |\n| Custom CSS | ✅ | Full support |\n\"\"\"\n\nsuccess = converter.convert_markdown_to_pdf(\n    markdown_content=markdown_content,\n    output_path=\"sample.pdf\",\n    method=\"auto\"\n)\n```\n\n### Command Line Interface\n\n```bash\n# Basic conversion\npython -m raganything.enhanced_markdown document.md --output document.pdf\n\n# With specific backend\npython -m raganything.enhanced_markdown document.md --method weasyprint\n\n# With custom CSS file\npython -m raganything.enhanced_markdown document.md --css custom_style.css\n\n# Show backend information\npython -m raganything.enhanced_markdown --info\n\n# Help\npython -m raganything.enhanced_markdown --help\n```\n\n## Backend Comparison\n\n| Backend | Pros | Cons | Best For | Quality |\n|---------|------|------|----------|---------|\n| **WeasyPrint** | • Excellent CSS support<br>• 
Fast rendering<br>• Great web-style layouts<br>• Python-based | • Limited LaTeX features<br>• Requires system deps | • Web-style documents<br>• Custom styling<br>• Fast conversion | ⭐⭐⭐⭐ |\n| **Pandoc** | • Extensive features<br>• LaTeX-quality output<br>• Academic formatting<br>• Many input/output formats | • Slower conversion<br>• System installation<br>• Complex setup | • Academic papers<br>• Complex documents<br>• Publication quality | ⭐⭐⭐⭐⭐ |\n| **Auto** | • Automatic selection<br>• Fallback support<br>• User-friendly | • May not use optimal backend | • General use<br>• Quick setup<br>• Development | ⭐⭐⭐⭐ |\n\n## Configuration Options\n\n### MarkdownConfig Parameters\n\n```python\n@dataclass\nclass MarkdownConfig:\n    # Page layout\n    page_size: str = \"A4\"              # A4, Letter, Legal, A3, etc.\n    margin: str = \"1in\"                # CSS margin format\n    font_size: str = \"12pt\"            # Base font size\n    line_height: str = \"1.5\"           # Line spacing multiplier\n\n    # Content options\n    include_toc: bool = True           # Generate table of contents\n    syntax_highlighting: bool = True   # Enable code highlighting\n    image_max_width: str = \"100%\"      # Maximum image width\n    table_style: str = \"...\"           # Default table CSS\n\n    # Styling\n    css_file: Optional[str] = None     # External CSS file path\n    custom_css: Optional[str] = None   # Inline CSS content\n    template_file: Optional[str] = None # Custom HTML template\n\n    # Output options\n    output_format: str = \"pdf\"         # Currently only PDF supported\n    output_dir: Optional[str] = None   # Output directory\n\n    # Metadata\n    metadata: Optional[Dict[str, str]] = None  # Document metadata\n```\n\n### Supported Markdown Features\n\n#### Basic Formatting\n- **Headers**: `# ## ### #### ##### ######`\n- **Emphasis**: `*italic*`, `**bold**`, `***bold italic***`\n- **Links**: `[text](url)`, `[text][ref]`\n- **Images**: `![alt](url)`, 
`![alt][ref]`\n- **Lists**: Ordered and unordered, nested\n- **Blockquotes**: `> quote`\n- **Line breaks**: Double space or `\\n\\n`\n\n#### Advanced Features\n- **Tables**: GitHub-style tables with alignment\n- **Code blocks**: Fenced code blocks with language specification\n- **Inline code**: `backtick code`\n- **Horizontal rules**: `---` or `***`\n- **Footnotes**: `[^1]` references\n- **Definition lists**: Term and definition pairs\n- **Attributes**: `{#id .class key=value}`\n\n#### Code Highlighting\n\n```markdown\n```python\ndef example_function():\n    \"\"\"This will be syntax highlighted\"\"\"\n    return \"Hello, World!\"\n```\n\n```javascript\nfunction exampleFunction() {\n    // This will also be highlighted\n    return \"Hello, World!\";\n}\n```\n```\n\n## Integration with RAG-Anything\n\nThe enhanced markdown conversion integrates seamlessly with RAG-Anything:\n\n```python\nfrom raganything import RAGAnything\n\n# Initialize RAG-Anything\nrag = RAGAnything()\n\n# Process markdown files - enhanced conversion is used automatically\nawait rag.process_document_complete(\"document.md\")\n\n# Batch processing with enhanced markdown conversion\nresult = rag.process_documents_batch(\n    file_paths=[\"doc1.md\", \"doc2.md\", \"doc3.md\"],\n    output_dir=\"./output\"\n)\n\n# The .md files will be converted to PDF using enhanced conversion\n# before being processed by the RAG system\n```\n\n## Performance Considerations\n\n### Conversion Speed\n- **WeasyPrint**: ~1-3 seconds for typical documents\n- **Pandoc**: ~3-10 seconds for typical documents\n- **Large documents**: Time scales roughly linearly with content\n\n### Memory Usage\n- **WeasyPrint**: ~50-100MB per conversion\n- **Pandoc**: ~100-200MB per conversion\n- **Images**: Large images increase memory usage significantly\n\n### Optimization Tips\n1. **Resize large images** before embedding\n2. **Use compressed images** (JPEG for photos, PNG for graphics)\n3. 
**Limit concurrent conversions** to avoid memory issues\n4. **Cache converted content** when processing multiple times\n\n## Examples\n\n### Sample Markdown Document\n\n```markdown\n# Technical Documentation\n\n## Table of Contents\n[TOC]\n\n## Overview\nThis document provides comprehensive technical specifications.\n\n## Architecture\n\n### System Components\n1. **Parser Engine**: Handles document processing\n2. **Storage Layer**: Manages data persistence\n3. **Query Interface**: Provides search capabilities\n\n### Code Implementation\n```python\nfrom raganything import RAGAnything\n\n# Initialize system\nrag = RAGAnything(config={\n    \"working_dir\": \"./storage\",\n    \"enable_image_processing\": True\n})\n\n# Process document\nawait rag.process_document_complete(\"document.pdf\")\n```\n\n### Performance Metrics\n\n| Component | Throughput | Latency | Memory |\n|-----------|------------|---------|--------|\n| Parser | 100 docs/hour | 36s avg | 2.5 GB |\n| Storage | 1000 ops/sec | 1ms avg | 512 MB |\n| Query | 50 queries/sec | 20ms avg | 1 GB |\n\n## Integration Notes\n\n> **Important**: Always validate input before processing.\n\n## Conclusion\nThe enhanced system provides excellent performance for document processing workflows.\n```\n\n### Generated PDF Features\n\nThe enhanced markdown converter produces PDFs with:\n\n- **Professional typography** with proper font selection and spacing\n- **Syntax-highlighted code blocks** using Pygments\n- **Formatted tables** with borders and alternating row colors\n- **Clickable table of contents** with navigation links\n- **Responsive images** that scale appropriately\n- **Custom styling** through CSS\n- **Proper page breaks** and margins\n- **Document metadata** and properties\n\n## Troubleshooting\n\n### Common Issues\n\n#### WeasyPrint Installation Problems\n```bash\n# Ubuntu/Debian: Install system dependencies\nsudo apt-get update\nsudo apt-get install -y build-essential python3-dev libcairo2 \\\n    libpango-1.0-0 
libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \\\n    libffi-dev shared-mime-info\n\n# Then reinstall WeasyPrint\npip install --force-reinstall weasyprint\n```\n\n#### Pandoc Not Found\n```bash\n# Check if Pandoc is installed\npandoc --version\n\n# Install Pandoc (Ubuntu/Debian)\nsudo apt-get install pandoc wkhtmltopdf\n\n# Or download from: https://pandoc.org/installing.html\n```\n\n#### CSS Issues\n- Check CSS syntax in custom_css\n- Verify CSS file paths exist\n- Test CSS with simple HTML first\n- Use browser developer tools to debug styling\n\n#### Image Problems\n- Ensure images are accessible (correct paths)\n- Check image file formats (PNG, JPEG, GIF supported)\n- Verify image file permissions\n- Consider image size and format optimization\n\n#### Font Issues\n```python\n# Use web-safe fonts\nconfig = MarkdownConfig(\n    custom_css=\"\"\"\n    body {\n        font-family: 'Arial', 'Helvetica', sans-serif;\n    }\n    \"\"\"\n)\n```\n\n### Debug Mode\n\nEnable detailed logging for troubleshooting:\n\n```python\nimport logging\n\n# Enable debug logging\nlogging.basicConfig(\n    level=logging.DEBUG,\n    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n)\n\n# Create converter with debug logging\nconverter = EnhancedMarkdownConverter()\nresult = converter.convert_file_to_pdf(\"test.md\", \"test.pdf\")\n```\n\n### Error Handling\n\n```python\ndef robust_conversion(input_path, output_path):\n    \"\"\"Convert with fallback backends\"\"\"\n    converter = EnhancedMarkdownConverter()\n\n    # Try backends in order of preference\n    backends = [\"weasyprint\", \"pandoc\", \"auto\"]\n\n    for backend in backends:\n        try:\n            success = converter.convert_file_to_pdf(\n                input_path=input_path,\n                output_path=output_path,\n                method=backend\n            )\n            if success:\n                print(f\"✅ Conversion successful with {backend}\")\n                return True\n        except Exception as 
e:\n            print(f\"❌ {backend} failed: {str(e)}\")\n            continue\n\n    print(\"❌ All backends failed\")\n    return False\n```\n\n## API Reference\n\n### EnhancedMarkdownConverter\n\n```python\nclass EnhancedMarkdownConverter:\n    def __init__(self, config: Optional[MarkdownConfig] = None):\n        \"\"\"Initialize converter with optional configuration\"\"\"\n\n    def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = \"auto\") -> bool:\n        \"\"\"Convert markdown file to PDF\"\"\"\n\n    def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = \"auto\") -> bool:\n        \"\"\"Convert markdown content to PDF\"\"\"\n\n    def get_backend_info(self) -> Dict[str, Any]:\n        \"\"\"Get information about available backends\"\"\"\n\n    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:\n        \"\"\"Convert using WeasyPrint backend\"\"\"\n\n    def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:\n        \"\"\"Convert using Pandoc backend\"\"\"\n```\n\n## Best Practices\n\n1. **Choose the right backend** for your use case:\n   - **WeasyPrint** for web-style documents and custom CSS\n   - **Pandoc** for academic papers and complex formatting\n   - **Auto** for general use and development\n\n2. **Optimize images** before embedding:\n   - Use appropriate formats (JPEG for photos, PNG for graphics)\n   - Compress images to reduce file size\n   - Set reasonable maximum widths\n\n3. **Design responsive layouts**:\n   - Use relative units (%, em) instead of absolute (px)\n   - Test with different page sizes\n   - Consider print-specific CSS\n\n4. **Test your styling**:\n   - Start with default styling and incrementally customize\n   - Test with sample content before production use\n   - Validate CSS syntax\n\n5. 
**Handle errors gracefully**:\n   - Implement fallback backends\n   - Provide meaningful error messages\n   - Log conversion attempts for debugging\n\n6. **Performance optimization**:\n   - Cache converted content when possible\n   - Process large batches with appropriate worker counts\n   - Monitor memory usage with large documents\n\n## Conclusion\n\nThe enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.\n"
  },
  {
    "path": "docs/offline_setup.md",
    "content": "# Running RAG-Anything in an Offline Environment\n\nThis document explains a critical consideration for running the RAG-Anything project in an environment with no internet access.\n\n## The Network Dependency: `LightRAG` and `tiktoken`\n\nThe `RAGAnything` core engine relies on the `LightRAG` library for its primary functionality. `LightRAG`, in turn, uses OpenAI's `tiktoken` library for text tokenization.\n\nBy default, the `tiktoken` library has a network dependency. On its first use, it attempts to download tokenizer models from OpenAI's public servers (`openaipublic.blob.core.windows.net`). If the application is running in an offline or network-restricted environment, this download will fail, causing the `LightRAG` instance to fail to initialize.\n\nThis results in an error similar to the following:\n\n```\nFailed to initialize LightRAG instance: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /encodings/o200k_ba\n```\n\nThis dependency is indirect. The `RAG-Anything` codebase itself does not directly import or call `tiktoken`. The call is made from within the `lightrag` library.\n\n## The Solution: Using a Local `tiktoken` Cache\n\nTo resolve this issue and enable fully offline operation, you must provide a local cache for the `tiktoken` models. This is achieved by setting the `TIKTOKEN_CACHE_DIR` environment variable **before** the application starts.\n\nWhen this environment variable is set, `tiktoken` will look for its model files in the specified local directory instead of attempting to download them from the internet.\n\n### Steps to Implement the Solution:\n\n1.  
**Create a Model Cache:** In an environment *with* internet access, run the provided script to download and cache the necessary `tiktoken` models.\n\n    ```bash\n    # Run the cache creation script\n    uv run scripts/create_tiktoken_cache.py\n    ```\n\n    This will create a `tiktoken_cache` directory in your project root containing the required model files.\n\n2.  **Configure the Environment Variable:** Add the following line to your `.env` file:\n\n    ```bash\n    TIKTOKEN_CACHE_DIR=./tiktoken_cache\n    ```\n\n    **Important:** You should ensure that the `.env` file is loaded **before** `LightRAG` imports `tiktoken`, making this configuration effective.\n\n    ```python\n    import os\n    from typing import Dict, Any, Optional, Callable\n    import sys\n    import asyncio\n    import atexit\n    from dataclasses import dataclass, field\n    from pathlib import Path\n    from dotenv import load_dotenv\n\n    # Add project root directory to Python path\n    sys.path.insert(0, str(Path(__file__).parent.parent))\n\n    # Load environment variables FIRST - before any imports that use tiktoken\n    load_dotenv(dotenv_path=\".env\", override=False)\n\n    # Now import LightRAG (which will import tiktoken with the correct env var set)\n    from lightrag import LightRAG\n    from lightrag.utils import logger\n\n    # Rest of the code...\n    ```\n\n### Testing the Offline Setup\n\n1.  **Create a `tiktoken_cache` directory:** If you don't have one already, create a directory named `tiktoken_cache` in the project root.\n2.  **Populate the cache:** Run the `scripts/create_tiktoken_cache.py` script to download the necessary tiktoken models into the `tiktoken_cache` directory.\n3.  **Set the `TIKTOKEN_CACHE_DIR` environment variable:** Add the line `TIKTOKEN_CACHE_DIR=./tiktoken_cache` to your `.env` file.\n4.  **Disconnect from the internet:** Disable your internet connection or put your machine in airplane mode.\n5.  
**Run the application:** Start the `RAG-Anything` application. For example:\n    ```\n    uv run examples/raganything_example.py requirements.txt\n    ```\n\nBy following these steps, you can eliminate the network dependency and run the `RAG-Anything` project successfully in a fully offline environment.\n"
  },
  {
    "path": "docs/vllm_integration.md",
    "content": "# vLLM Integration Guide for RAG-Anything\n\n[vLLM](https://github.com/vllm-project/vllm) is a high-throughput, memory-efficient inference engine for LLMs. It exposes an OpenAI-compatible API, making it a drop-in backend for RAG-Anything in production environments.\n\n## Why vLLM?\n\n| Feature | vLLM | Ollama | LM Studio |\n|---------|------|--------|-----------|\n| **Continuous batching** | ✅ | ❌ | ❌ |\n| **PagedAttention** | ✅ | ❌ | ❌ |\n| **Tensor parallelism** | ✅ | ❌ | ❌ |\n| **Production throughput** | ✅ High | Moderate | Low |\n| **Quantization (AWQ/GPTQ/FP8)** | ✅ | ✅ (GGUF) | ✅ (GGUF) |\n| **Multi-GPU support** | ✅ Native | Limited | ❌ |\n| **Ease of setup** | Moderate | Easy | Easy |\n| **GUI** | ❌ | ❌ | ✅ |\n\n**Choose vLLM when:** You need production-grade throughput, serve multiple concurrent users, or run large models across multiple GPUs.\n\n## Prerequisites\n\n1. **NVIDIA GPU(s)** with CUDA support (compute capability ≥ 7.0)\n2. **Python 3.9+**\n3. **vLLM installed:**\n   ```bash\n   pip install vllm\n   ```\n4. **RAG-Anything installed:**\n   ```bash\n   pip install raganything\n   ```\n\n## Quick Start\n\n### 1. Start vLLM Server\n\n**Chat/Completion model:**\n```bash\nvllm serve Qwen/Qwen2.5-72B-Instruct \\\n    --tensor-parallel-size 4 \\\n    --max-model-len 32768 \\\n    --port 8000\n```\n\n**Embedding model** (separate process, different port):\n```bash\nvllm serve BAAI/bge-m3 \\\n    --task embedding \\\n    --port 8001\n```\n\n### 2. Configure Environment\n\nCreate a `.env` file:\n\n```bash\n### vLLM Configuration\nLLM_BINDING=vllm\nLLM_MODEL=Qwen/Qwen2.5-72B-Instruct\nLLM_BINDING_HOST=http://localhost:8000/v1\nLLM_BINDING_API_KEY=token-abc123\n\n### Embedding via vLLM\nEMBEDDING_BINDING=vllm\nEMBEDDING_MODEL=BAAI/bge-m3\nEMBEDDING_DIM=1024\nEMBEDDING_BINDING_HOST=http://localhost:8001/v1\nEMBEDDING_BINDING_API_KEY=token-abc123\n```\n\n### 3. 
Run the Example\n\n```bash\ncd examples\npython vllm_integration_example.py\n```\n\n## Environment Variables\n\n| Variable | Default | Description |\n|----------|---------|-------------|\n| `LLM_BINDING` | — | Set to `vllm` |\n| `LLM_MODEL` | `Qwen/Qwen2.5-72B-Instruct` | Model name (must match what vLLM is serving) |\n| `LLM_BINDING_HOST` | `http://localhost:8000/v1` | vLLM API base URL |\n| `LLM_BINDING_API_KEY` | `token-abc123` | API key (vLLM default: any non-empty string) |\n| `EMBEDDING_BINDING` | — | Set to `vllm` |\n| `EMBEDDING_MODEL` | `BAAI/bge-m3` | Embedding model name |\n| `EMBEDDING_DIM` | `1024` | Embedding dimensions |\n| `EMBEDDING_BINDING_HOST` | `http://localhost:8001/v1` | Embedding endpoint URL |\n| `EMBEDDING_BINDING_API_KEY` | `token-abc123` | Embedding API key |\n\n## Model Configurations\n\n### Qwen 2.5 (Recommended for RAG)\n```bash\nvllm serve Qwen/Qwen2.5-72B-Instruct \\\n    --tensor-parallel-size 4 \\\n    --max-model-len 32768\n```\n\n### Mistral / Mixtral\n```bash\nvllm serve mistralai/Mixtral-8x7B-Instruct-v0.1 \\\n    --tensor-parallel-size 2 \\\n    --max-model-len 32768\n```\n\n### Llama 3.1 70B\n```bash\nvllm serve meta-llama/Llama-3.1-70B-Instruct \\\n    --tensor-parallel-size 4 \\\n    --max-model-len 8192\n```\n\n### With AWQ Quantization (reduced memory)\n```bash\nvllm serve Qwen/Qwen2.5-72B-Instruct-AWQ \\\n    --tensor-parallel-size 2 \\\n    --quantization awq \\\n    --max-model-len 32768\n```\n\n### With GPTQ Quantization\n```bash\nvllm serve TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ \\\n    --tensor-parallel-size 2 \\\n    --quantization gptq\n```\n\n## Performance Tips\n\n### Tensor Parallelism\nDistribute large models across GPUs. Set `--tensor-parallel-size` to the number of GPUs:\n```bash\n# 4x A100 80GB → can serve 72B models in full precision\nvllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size 4\n```\n\n### GPU Memory Utilization\nIncrease if you have headroom (default 0.9):\n```bash\nvllm serve ... 
--gpu-memory-utilization 0.95\n```\n\n### Max Model Length\nReduce if you don't need full context (saves memory):\n```bash\n# RAG chunks are typically <4K tokens; 8192 is often sufficient\nvllm serve ... --max-model-len 8192\n```\n\n### Concurrency\nvLLM handles batching automatically. On the RAG-Anything side, increase `MAX_ASYNC` in your `.env`:\n```bash\nMAX_ASYNC=16  # vLLM handles concurrent requests efficiently\n```\n\n### Speculative Decoding (vLLM ≥ 0.4)\nUse a small draft model to speed up generation:\n```bash\nvllm serve Qwen/Qwen2.5-72B-Instruct \\\n    --speculative-model Qwen/Qwen2.5-0.5B-Instruct \\\n    --num-speculative-tokens 5 \\\n    --tensor-parallel-size 4\n```\n\n## Embedding Options\n\n### Option A: vLLM Embedding Server (Recommended)\nRun a dedicated vLLM instance for embeddings:\n```bash\nvllm serve BAAI/bge-m3 --task embedding --port 8001\n```\n\n### Option B: Use Ollama for Embeddings\nIf you already run Ollama, you can mix backends:\n```bash\nEMBEDDING_BINDING=ollama\nEMBEDDING_MODEL=bge-m3:latest\nEMBEDDING_BINDING_HOST=http://localhost:11434\n```\n\n### Option C: OpenAI Embeddings\nUse OpenAI's embedding API alongside vLLM for chat:\n```bash\nEMBEDDING_BINDING=openai\nEMBEDDING_MODEL=text-embedding-3-large\nEMBEDDING_DIM=3072\nEMBEDDING_BINDING_HOST=https://api.openai.com/v1\nEMBEDDING_BINDING_API_KEY=sk-...\n```\n\n## Architecture\n\n```\n┌──────────────────────┐\n│   RAG-Anything       │\n│  (Document Processing│\n│   + Query Engine)    │\n└──────┬───────────────┘\n       │ OpenAI-compatible API\n       ▼\n┌──────────────────────┐     ┌──────────────────────┐\n│  vLLM Chat Server    │     │  vLLM Embedding Server│\n│  :8000/v1            │     │  :8001/v1             │\n│  (Qwen-72B, etc.)   │     │  (bge-m3, etc.)       
│\n└──────────────────────┘     └──────────────────────┘\n       │                            │\n       ▼                            ▼\n┌──────────────────────────────────────────────┐\n│              GPU Cluster                      │\n│   PagedAttention · Continuous Batching        │\n│   Tensor Parallelism · Quantization           │\n└──────────────────────────────────────────────┘\n```\n\n## Troubleshooting\n\n### Connection Refused\n```\n❌ Connection failed: Connection refused\n```\n- Ensure vLLM is running: `curl http://localhost:8000/v1/models`\n- Check the port matches your `LLM_BINDING_HOST`\n- Wait for model loading to complete (large models can take minutes)\n\n### Out of Memory\n```\ntorch.cuda.OutOfMemoryError\n```\n- Use quantized models (`--quantization awq` or `gptq`)\n- Reduce `--max-model-len`\n- Increase `--tensor-parallel-size` (more GPUs)\n- Lower `--gpu-memory-utilization`\n\n### Model Not Found\n```\nModel 'xxx' not found\n```\n- `LLM_MODEL` must match the model name vLLM is serving exactly\n- Check available models: `curl http://localhost:8000/v1/models`\n\n### Slow First Request\nThis is normal — vLLM compiles CUDA kernels on first use. Subsequent requests are fast.\n"
  },
  {
    "path": "env.example",
    "content": "### This is a sample .env file\n\n\n### Server Configuration\nHOST=0.0.0.0\nPORT=9621\nWEBUI_TITLE='My Graph KB'\nWEBUI_DESCRIPTION=\"Simple and Fast Graph Based RAG System\"\nOLLAMA_EMULATING_MODEL_TAG=latest\n# WORKERS=2\n# CORS_ORIGINS=http://localhost:3000,http://localhost:8080\n\n### Tiktoken Cache Configuration (for offline deployment)\n### Set this to a local directory containing cached tiktoken models\n### This prevents tiktoken from downloading models from the internet on initialization\n### See docs/offline_setup.md for setup instructions\n# TIKTOKEN_CACHE_DIR=./tiktoken_cache\n\n### Login Configuration\n# AUTH_ACCOUNTS='admin:admin123,user1:pass456'\n# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server\n# TOKEN_EXPIRE_HOURS=48\n# GUEST_TOKEN_EXPIRE_HOURS=24\n# JWT_ALGORITHM=HS256\n\n### API-Key to access LightRAG Server API\n# LIGHTRAG_API_KEY=your-secure-api-key-here\n# WHITELIST_PATHS=/health,/api/*\n\n### Optional SSL Configuration\n# SSL=true\n# SSL_CERTFILE=/path/to/cert.pem\n# SSL_KEYFILE=/path/to/key.pem\n\n### Directory Configuration (defaults to current working directory)\n### Should not be set when deploying with Docker (set by the Dockerfile instead of .env)\n### Default values are ./inputs and ./rag_storage\n# INPUT_DIR=<absolute_path_for_doc_input_dir>\n\n### RAGAnything Configuration (Multimodal Document Processing)\n### ---\n### Parser Configuration\n# PARSE_METHOD=auto\n# OUTPUT_DIR=./output\n# PARSER=mineru  # Options: mineru, docling, paddleocr\n# DISPLAY_CONTENT_STATS=true\n\n### Multimodal Processing Configuration\n# ENABLE_IMAGE_PROCESSING=true\n# ENABLE_TABLE_PROCESSING=true\n# ENABLE_EQUATION_PROCESSING=true\n\n### Batch Processing Configuration\n# MAX_CONCURRENT_FILES=1\n# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md\n# RECURSIVE_FOLDER_PROCESSING=true\n\n### Context Extraction Configuration\n# CONTEXT_WINDOW=1\n# CONTEXT_MODE=page\n# MAX_CONTEXT_TOKENS=2000\n# 
INCLUDE_HEADERS=true\n# INCLUDE_CAPTIONS=true\n# CONTEXT_FILTER_CONTENT_TYPES=text\n# CONTENT_FORMAT=minerU\n\n### Max nodes returned from graph retrieval\n# MAX_GRAPH_NODES=1000\n\n### Logging level\n# LOG_LEVEL=INFO\n# VERBOSE=False\n# LOG_MAX_BYTES=10485760\n# LOG_BACKUP_COUNT=5\n### Logfile location (defaults to current working directory)\n# LOG_DIR=/path/to/log/directory\n\n### Settings for RAG query\n# HISTORY_TURNS=3\n# COSINE_THRESHOLD=0.2\n# TOP_K=60\n# MAX_TOKEN_TEXT_CHUNK=4000\n# MAX_TOKEN_RELATION_DESC=4000\n# MAX_TOKEN_ENTITY_DESC=4000\n\n### Entity and relation summarization configuration\n### Language: English, Chinese, French, German ...\nSUMMARY_LANGUAGE=English\n### Number of duplicated entities/edges to trigger LLM re-summary on merge (at least 3 is recommended)\n# FORCE_LLM_SUMMARY_ON_MERGE=6\n### Max tokens for entity/relations description after merge\n# MAX_TOKEN_SUMMARY=500\n\n### Number of documents processed in parallel (less than MAX_ASYNC/2 is recommended)\n# MAX_PARALLEL_INSERT=2\n### Chunk size for document splitting, 500~1500 is recommended\n# CHUNK_SIZE=1200\n# CHUNK_OVERLAP_SIZE=100\n\n### LLM Configuration\nENABLE_LLM_CACHE=true\nENABLE_LLM_CACHE_FOR_EXTRACT=true\n### Time out in seconds for LLM, None for infinite timeout\nTIMEOUT=240\n### Some models like o1-mini require temperature to be set to 1\nTEMPERATURE=0\n### Max concurrent requests to the LLM\nMAX_ASYNC=4\n### MAX_TOKENS: max tokens sent to LLM for entity relation summaries (less than context size of the model)\n### MAX_TOKENS: set as num_ctx option for Ollama by API Server\nMAX_TOKENS=32768\n### LLM Binding type: openai, ollama, lollms, azure_openai, lmstudio, vllm\nLLM_BINDING=openai\nLLM_MODEL=gpt-4o\nLLM_BINDING_HOST=https://api.openai.com/v1\nLLM_BINDING_API_KEY=your_api_key\n### Optional for Azure\n# AZURE_OPENAI_API_VERSION=2024-08-01-preview\n# AZURE_OPENAI_DEPLOYMENT=gpt-4o\n\n### vLLM Configuration (high-throughput production inference)\n### See docs/vllm_integration.md 
for setup guide\n# LLM_BINDING=vllm\n# LLM_MODEL=Qwen/Qwen2.5-72B-Instruct\n# LLM_BINDING_HOST=http://localhost:8000/v1\n# LLM_BINDING_API_KEY=token-abc123\n\n### Embedding Configuration\n### Embedding Binding type: openai, ollama, lollms, azure_openai, lmstudio, vllm\nEMBEDDING_BINDING=ollama\nEMBEDDING_MODEL=bge-m3:latest\nEMBEDDING_DIM=1024\nEMBEDDING_BINDING_API_KEY=your_api_key\n# If the embedding service is deployed within the same Docker stack, use host.docker.internal instead of localhost\nEMBEDDING_BINDING_HOST=http://localhost:11434\n### Number of chunks sent to Embedding in a single request\n# EMBEDDING_BATCH_NUM=32\n### Max concurrency requests for Embedding\n# EMBEDDING_FUNC_MAX_ASYNC=16\n### Maximum tokens sent to Embedding for each chunk (no longer in use?)\n# MAX_EMBED_TOKENS=8192\n### Optional for Azure\n# AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large\n# AZURE_EMBEDDING_API_VERSION=2023-05-15\n\n### Data storage selection\n# LIGHTRAG_KV_STORAGE=PGKVStorage\n# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage\n# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage\n# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage\n\n### TiDB Configuration (Deprecated)\n# TIDB_HOST=localhost\n# TIDB_PORT=4000\n# TIDB_USER=your_username\n# TIDB_PASSWORD='your_password'\n# TIDB_DATABASE=your_database\n### separating all data from different LightRAG instances (deprecating)\n# TIDB_WORKSPACE=default\n\n### PostgreSQL Configuration\nPOSTGRES_HOST=localhost\nPOSTGRES_PORT=5432\nPOSTGRES_USER=your_username\nPOSTGRES_PASSWORD='your_password'\nPOSTGRES_DATABASE=your_database\nPOSTGRES_MAX_CONNECTIONS=12\n### separating all data from different LightRAG instances (deprecating)\n# POSTGRES_WORKSPACE=default\n\n### Neo4j Configuration\nNEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io\nNEO4J_USERNAME=neo4j\nNEO4J_PASSWORD='your_password'\n\n### Independent AGE Configuration (not for AGE embedded in PostgreSQL)\n# AGE_POSTGRES_DB=\n# AGE_POSTGRES_USER=\n# AGE_POSTGRES_PASSWORD=\n# AGE_POSTGRES_HOST=\n# 
AGE_POSTGRES_PORT=8529\n\n# AGE Graph Name(apply to PostgreSQL and independent AGE)\n### AGE_GRAPH_NAME is deprecated\n# AGE_GRAPH_NAME=lightrag\n\n### MongoDB Configuration\nMONGO_URI=mongodb://root:root@localhost:27017/\nMONGO_DATABASE=LightRAG\n### separating all data from different LightRAG instances (deprecating)\n# MONGODB_GRAPH=false\n\n### Milvus Configuration\nMILVUS_URI=http://localhost:19530\nMILVUS_DB_NAME=lightrag\n# MILVUS_USER=root\n# MILVUS_PASSWORD=your_password\n# MILVUS_TOKEN=your_token\n\n### Qdrant\nQDRANT_URL=http://localhost:16333\n# QDRANT_API_KEY=your-api-key\n\n### Redis\nREDIS_URI=redis://localhost:6379\n"
  },
  {
    "path": "examples/batch_dry_run_example.py",
    "content": "\"\"\"\nDry-run batch parsing example.\n\nLists supported files without running any parser.\n\nUsage:\n  - pip install:\n      python examples/batch_dry_run_example.py examples/sample_docs --parser mineru\n      python examples/batch_dry_run_example.py examples/sample_docs/projects examples/sample_docs/web --parser docling\n      python examples/batch_dry_run_example.py examples/sample_docs --parser paddleocr\n  - uv install:\n      uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --recursive\n      uv run python examples/batch_dry_run_example.py examples/sample_docs --parser mineru --no-recursive\n\"\"\"\n\nimport argparse\n\nfrom raganything.batch_parser import BatchParser\n\n\ndef main() -> int:\n    parser = argparse.ArgumentParser(description=\"Dry-run batch parsing example\")\n    parser.add_argument(\"paths\", nargs=\"+\", help=\"File paths or directories to scan\")\n    parser.add_argument(\n        \"--parser\",\n        choices=[\"mineru\", \"docling\", \"paddleocr\"],\n        default=\"mineru\",\n        help=\"Parser to use for file-type support\",\n    )\n    parser.add_argument(\n        \"--output\",\n        default=\"./batch_output\",\n        help=\"Output directory (unused in dry-run, but required by API)\",\n    )\n    parser.add_argument(\n        \"--recursive\",\n        action=argparse.BooleanOptionalAction,\n        default=True,\n        help=\"Search directories recursively\",\n    )\n    args = parser.parse_args()\n\n    batch_parser = BatchParser(parser_type=args.parser, show_progress=False)\n    result = batch_parser.process_batch(\n        file_paths=args.paths,\n        output_dir=args.output,\n        recursive=args.recursive,\n        dry_run=True,\n    )\n\n    print(result.summary())\n    if result.successful_files:\n        print(\"\\nDry run: files that would be processed:\")\n        for file_path in result.successful_files:\n            print(f\"  - {file_path}\")\n    
else:\n        print(\"\\nDry run: no supported files found.\")\n\n    return 0\n\n\nif __name__ == \"__main__\":\n    raise SystemExit(main())\n"
  },
  {
    "path": "examples/batch_processing_example.py",
    "content": "#!/usr/bin/env python\n\"\"\"\nBatch Processing Example for RAG-Anything\n\nThis example demonstrates how to use the batch processing capabilities\nto process multiple documents in parallel for improved throughput.\n\nFeatures demonstrated:\n- Basic batch processing with BatchParser\n- Asynchronous batch processing\n- Integration with RAG-Anything\n- Error handling and progress tracking\n- File filtering and directory processing\n\"\"\"\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nimport tempfile\nimport time\n\n# Add project root directory to Python path\nimport sys\n\nsys.path.append(str(Path(__file__).parent.parent))\n\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom raganything.batch_parser import BatchParser\n\n\ndef create_sample_documents():\n    \"\"\"Create sample documents for batch processing testing\"\"\"\n    temp_dir = Path(tempfile.mkdtemp())\n    sample_files = []\n\n    # Create various document types\n    documents = {\n        \"document1.txt\": \"This is a simple text document for testing batch processing.\",\n        \"document2.txt\": \"Another text document with different content.\",\n        \"document3.md\": \"\"\"# Markdown Document\n\n## Introduction\nThis is a markdown document for testing.\n\n### Features\n- Markdown formatting\n- Code blocks\n- Lists\n\n```python\ndef example():\n    return \"Hello from markdown\"\n```\n\"\"\",\n        \"report.txt\": \"\"\"Business Report\n\nExecutive Summary:\nThis report demonstrates batch processing capabilities.\n\nKey Findings:\n1. Parallel processing improves throughput\n2. Progress tracking enhances user experience\n3. Error handling ensures reliability\n\nConclusion:\nBatch processing is essential for large-scale document processing.\n\"\"\",\n        \"notes.md\": \"\"\"# Meeting Notes\n\n## Date: 2024-01-15\n\n### Attendees\n- Alice Johnson\n- Bob Smith\n- Carol Williams\n\n### Discussion Topics\n1. 
**Batch Processing Implementation**\n   - Parallel document processing\n   - Progress tracking\n   - Error handling strategies\n\n2. **Performance Metrics**\n   - Target: 100 documents/hour\n   - Memory usage: < 4GB\n   - Success rate: > 95%\n\n### Action Items\n- [ ] Implement batch processing\n- [ ] Add progress bars\n- [ ] Test with large document sets\n- [ ] Optimize memory usage\n\n### Next Steps\nContinue development and testing of batch processing features.\n\"\"\",\n    }\n\n    # Create files\n    for filename, content in documents.items():\n        file_path = temp_dir / filename\n        with open(file_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(content)\n        sample_files.append(str(file_path))\n\n    return sample_files, temp_dir\n\n\ndef demonstrate_basic_batch_processing():\n    \"\"\"Demonstrate basic batch processing functionality\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"BASIC BATCH PROCESSING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    # Create sample documents\n    sample_files, temp_dir = create_sample_documents()\n\n    try:\n        print(f\"Created {len(sample_files)} sample documents in: {temp_dir}\")\n        for file_path in sample_files:\n            print(f\"  - {Path(file_path).name}\")\n\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=\"mineru\",\n            max_workers=3,\n            show_progress=True,\n            timeout_per_file=60,\n            skip_installation_check=True,  # Skip installation check for demo\n        )\n\n        print(\"\\nBatch parser configured:\")\n        print(\"  - Parser type: mineru\")\n        print(\"  - Max workers: 3\")\n        print(\"  - Progress tracking: enabled\")\n        print(\"  - Timeout per file: 60 seconds\")\n\n        # Check supported extensions\n        supported_extensions = batch_parser.get_supported_extensions()\n        print(f\"  - Supported extensions: {supported_extensions}\")\n\n        # Filter 
files to supported types\n        supported_files = batch_parser.filter_supported_files(sample_files)\n        print(\"\\nFile filtering results:\")\n        print(f\"  - Total files: {len(sample_files)}\")\n        print(f\"  - Supported files: {len(supported_files)}\")\n\n        # Process batch\n        output_dir = temp_dir / \"batch_output\"\n        print(\"\\nStarting batch processing...\")\n        print(f\"Output directory: {output_dir}\")\n\n        start_time = time.time()\n        result = batch_parser.process_batch(\n            file_paths=supported_files,\n            output_dir=str(output_dir),\n            parse_method=\"auto\",\n            recursive=False,\n        )\n        processing_time = time.time() - start_time\n\n        # Display results\n        print(\"\\n\" + \"-\" * 40)\n        print(\"BATCH PROCESSING RESULTS\")\n        print(\"-\" * 40)\n        print(result.summary())\n        print(f\"Total processing time: {processing_time:.2f} seconds\")\n        print(f\"Success rate: {result.success_rate:.1f}%\")\n\n        if result.successful_files:\n            print(\"\\nSuccessfully processed files:\")\n            for file_path in result.successful_files:\n                print(f\"  ✅ {Path(file_path).name}\")\n\n        if result.failed_files:\n            print(\"\\nFailed files:\")\n            for file_path in result.failed_files:\n                error = result.errors.get(file_path, \"Unknown error\")\n                print(f\"  ❌ {Path(file_path).name}: {error}\")\n\n        return result\n\n    except Exception as e:\n        print(f\"❌ Batch processing demonstration failed: {str(e)}\")\n        return None\n\n\nasync def demonstrate_async_batch_processing():\n    \"\"\"Demonstrate asynchronous batch processing\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    # Create sample documents\n    sample_files, temp_dir = create_sample_documents()\n\n    
try:\n        print(f\"Processing {len(sample_files)} documents asynchronously...\")\n\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=\"mineru\",\n            max_workers=2,\n            show_progress=True,\n            skip_installation_check=True,\n        )\n\n        # Process batch asynchronously\n        output_dir = temp_dir / \"async_output\"\n\n        start_time = time.time()\n        result = await batch_parser.process_batch_async(\n            file_paths=sample_files,\n            output_dir=str(output_dir),\n            parse_method=\"auto\",\n            recursive=False,\n        )\n        processing_time = time.time() - start_time\n\n        # Display results\n        print(\"\\n\" + \"-\" * 40)\n        print(\"ASYNC BATCH PROCESSING RESULTS\")\n        print(\"-\" * 40)\n        print(result.summary())\n        print(f\"Async processing time: {processing_time:.2f} seconds\")\n        print(f\"Success rate: {result.success_rate:.1f}%\")\n\n        return result\n\n    except Exception as e:\n        print(f\"❌ Async batch processing demonstration failed: {str(e)}\")\n        return None\n\n\nasync def demonstrate_rag_integration():\n    \"\"\"Demonstrate batch processing integration with RAG-Anything\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    # Create sample documents\n    sample_files, temp_dir = create_sample_documents()\n\n    try:\n        # Initialize RAG-Anything with temporary storage\n        config = RAGAnythingConfig(\n            working_dir=str(temp_dir / \"rag_storage\"),\n            enable_image_processing=True,\n            enable_table_processing=True,\n            enable_equation_processing=True,\n            max_concurrent_files=2,\n        )\n\n        rag = RAGAnything(config=config)\n\n        print(\"RAG-Anything initialized with batch processing capabilities\")\n\n        # Show available batch 
methods\n        batch_methods = [method for method in dir(rag) if \"batch\" in method.lower()]\n        print(f\"Available batch methods: {batch_methods}\")\n\n        # Demonstrate batch processing with RAG integration\n        print(f\"\\nProcessing {len(sample_files)} documents with RAG integration...\")\n\n        # Use the RAG-integrated batch processing\n        try:\n            # Process documents in batch\n            result = rag.process_documents_batch(\n                file_paths=sample_files,\n                output_dir=str(temp_dir / \"rag_batch_output\"),\n                max_workers=2,\n                show_progress=True,\n            )\n\n            print(\"\\n\" + \"-\" * 40)\n            print(\"RAG BATCH PROCESSING RESULTS\")\n            print(\"-\" * 40)\n            print(result.summary())\n            print(f\"Success rate: {result.success_rate:.1f}%\")\n\n            # Demonstrate batch processing with full RAG integration\n            print(\"\\nProcessing documents with full RAG integration...\")\n\n            rag_result = await rag.process_documents_with_rag_batch(\n                file_paths=sample_files[:2],  # Process subset for demo\n                output_dir=str(temp_dir / \"rag_full_output\"),\n                max_workers=1,\n                show_progress=True,\n            )\n\n            print(\"\\n\" + \"-\" * 40)\n            print(\"FULL RAG INTEGRATION RESULTS\")\n            print(\"-\" * 40)\n            print(f\"Parse result: {rag_result['parse_result'].summary()}\")\n            print(\n                f\"RAG processing time: {rag_result['total_processing_time']:.2f} seconds\"\n            )\n            print(\n                f\"Successfully processed with RAG: {rag_result['successful_rag_files']}\"\n            )\n            print(f\"Failed RAG processing: {rag_result['failed_rag_files']}\")\n\n            return rag_result\n\n        except Exception as e:\n            print(f\"⚠️ RAG integration demo completed 
with limitations: {str(e)}\")\n            print(\n                \"Note: This is expected in environments without full API configuration\"\n            )\n            return None\n\n    except Exception as e:\n        print(f\"❌ RAG integration demonstration failed: {str(e)}\")\n        return None\n\n\ndef demonstrate_directory_processing():\n    \"\"\"Demonstrate processing entire directories\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"DIRECTORY PROCESSING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    # Create a directory structure with nested files\n    temp_dir = Path(tempfile.mkdtemp())\n\n    # Create main directory files\n    main_files = {\n        \"overview.txt\": \"Main directory overview document\",\n        \"readme.md\": \"# Project README\\n\\nThis is the main project documentation.\",\n    }\n\n    # Create subdirectory\n    sub_dir = temp_dir / \"subdirectory\"\n    sub_dir.mkdir()\n\n    sub_files = {\n        \"details.txt\": \"Detailed information in subdirectory\",\n        \"notes.md\": \"# Notes\\n\\nAdditional notes and information.\",\n    }\n\n    # Write all files\n    all_files = []\n    for filename, content in main_files.items():\n        file_path = temp_dir / filename\n        with open(file_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(content)\n        all_files.append(str(file_path))\n\n    for filename, content in sub_files.items():\n        file_path = sub_dir / filename\n        with open(file_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(content)\n        all_files.append(str(file_path))\n\n    try:\n        print(\"Created directory structure:\")\n        print(f\"  Main directory: {temp_dir}\")\n        print(f\"  Files in main: {list(main_files.keys())}\")\n        print(f\"  Subdirectory: {sub_dir}\")\n        print(f\"  Files in sub: {list(sub_files.keys())}\")\n\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=\"mineru\",\n            
max_workers=2,\n            show_progress=True,\n            skip_installation_check=True,\n        )\n\n        # Process entire directory recursively\n        print(\"\\nProcessing entire directory recursively...\")\n\n        result = batch_parser.process_batch(\n            file_paths=[str(temp_dir)],  # Pass directory path\n            output_dir=str(temp_dir / \"directory_output\"),\n            parse_method=\"auto\",\n            recursive=True,  # Include subdirectories\n        )\n\n        print(\"\\n\" + \"-\" * 40)\n        print(\"DIRECTORY PROCESSING RESULTS\")\n        print(\"-\" * 40)\n        print(result.summary())\n        print(f\"Total files found and processed: {result.total_files}\")\n        print(f\"Success rate: {result.success_rate:.1f}%\")\n\n        if result.successful_files:\n            print(\"\\nSuccessfully processed:\")\n            for file_path in result.successful_files:\n                relative_path = Path(file_path).relative_to(temp_dir)\n                print(f\"  ✅ {relative_path}\")\n\n        return result\n\n    except Exception as e:\n        print(f\"❌ Directory processing demonstration failed: {str(e)}\")\n        return None\n\n\ndef demonstrate_error_handling():\n    \"\"\"Demonstrate error handling and recovery\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"ERROR HANDLING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    temp_dir = Path(tempfile.mkdtemp())\n\n    # Create files with various issues\n    files_with_issues = {\n        \"valid_file.txt\": \"This is a valid file that should process successfully.\",\n        \"empty_file.txt\": \"\",  # Empty file\n        \"large_file.txt\": \"x\" * 1000000,  # Large file (1MB of 'x')\n    }\n\n    created_files = []\n    for filename, content in files_with_issues.items():\n        file_path = temp_dir / filename\n        with open(file_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(content)\n        created_files.append(str(file_path))\n\n    # Add a 
non-existent file to the list\n    created_files.append(str(temp_dir / \"non_existent_file.txt\"))\n\n    try:\n        print(f\"Testing error handling with {len(created_files)} files:\")\n        for file_path in created_files:\n            name = Path(file_path).name\n            exists = Path(file_path).exists()\n            size = Path(file_path).stat().st_size if exists else 0\n            print(f\"  - {name}: {'exists' if exists else 'missing'}, {size} bytes\")\n\n        # Create batch parser with short timeout for demonstration\n        batch_parser = BatchParser(\n            parser_type=\"mineru\",\n            max_workers=2,\n            show_progress=True,\n            timeout_per_file=30,  # Short timeout for demo\n            skip_installation_check=True,\n        )\n\n        # Process files and handle errors\n        result = batch_parser.process_batch(\n            file_paths=created_files,\n            output_dir=str(temp_dir / \"error_test_output\"),\n            parse_method=\"auto\",\n        )\n\n        print(\"\\n\" + \"-\" * 40)\n        print(\"ERROR HANDLING RESULTS\")\n        print(\"-\" * 40)\n        print(result.summary())\n\n        if result.successful_files:\n            print(\"\\nSuccessful files:\")\n            for file_path in result.successful_files:\n                print(f\"  ✅ {Path(file_path).name}\")\n\n        if result.failed_files:\n            print(\"\\nFailed files with error details:\")\n            for file_path in result.failed_files:\n                error = result.errors.get(file_path, \"Unknown error\")\n                print(f\"  ❌ {Path(file_path).name}: {error}\")\n\n        # Demonstrate retry logic\n        if result.failed_files:\n            print(\n                f\"\\nDemonstrating retry logic for {len(result.failed_files)} failed files...\"\n            )\n\n            # Retry only the failed files\n            retry_result = batch_parser.process_batch(\n                
file_paths=result.failed_files,\n                output_dir=str(temp_dir / \"retry_output\"),\n                parse_method=\"auto\",\n            )\n\n            print(f\"Retry results: {retry_result.summary()}\")\n\n        return result\n\n    except Exception as e:\n        print(f\"❌ Error handling demonstration failed: {str(e)}\")\n        return None\n\n\nasync def main():\n    \"\"\"Main demonstration function\"\"\"\n    # Configure logging\n    logging.basicConfig(\n        level=logging.INFO,\n        format=\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\",\n    )\n\n    print(\"RAG-Anything Batch Processing Demonstration\")\n    print(\"=\" * 70)\n    print(\"This example demonstrates various batch processing capabilities:\")\n    print(\"  - Basic batch processing with progress tracking\")\n    print(\"  - Asynchronous processing for improved performance\")\n    print(\"  - Integration with RAG-Anything pipeline\")\n    print(\"  - Directory processing with recursive file discovery\")\n    print(\"  - Comprehensive error handling and recovery\")\n\n    results = {}\n\n    # Run demonstrations\n    print(\"\\n🚀 Starting demonstrations...\")\n\n    # Basic batch processing\n    results[\"basic\"] = demonstrate_basic_batch_processing()\n\n    # Asynchronous processing\n    results[\"async\"] = await demonstrate_async_batch_processing()\n\n    # RAG integration\n    results[\"rag\"] = await demonstrate_rag_integration()\n\n    # Directory processing\n    results[\"directory\"] = demonstrate_directory_processing()\n\n    # Error handling\n    results[\"error_handling\"] = demonstrate_error_handling()\n\n    # Summary\n    print(\"\\n\" + \"=\" * 70)\n    print(\"DEMONSTRATION SUMMARY\")\n    print(\"=\" * 70)\n\n    for demo_name, result in results.items():\n        if result:\n            if hasattr(result, \"success_rate\"):\n                print(\n                    f\"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate\"\n           
     )\n            else:\n                print(f\"✅ {demo_name.upper()}: Completed successfully\")\n        else:\n            print(f\"❌ {demo_name.upper()}: Failed or had limitations\")\n\n    print(\"\\n📊 Key Features Demonstrated:\")\n    print(\"  - Parallel document processing with configurable worker counts\")\n    print(\"  - Real-time progress tracking with tqdm progress bars\")\n    print(\"  - Comprehensive error handling and reporting\")\n    print(\"  - File filtering based on supported document types\")\n    print(\"  - Directory processing with recursive file discovery\")\n    print(\"  - Asynchronous processing for improved performance\")\n    print(\"  - Integration with RAG-Anything document pipeline\")\n    print(\"  - Retry logic for failed documents\")\n    print(\"  - Detailed processing statistics and timing\")\n\n    print(\"\\n💡 Best Practices Highlighted:\")\n    print(\"  - Use appropriate worker counts for your system\")\n    print(\"  - Enable progress tracking for long-running operations\")\n    print(\"  - Handle errors gracefully with retry mechanisms\")\n    print(\"  - Filter files to supported types before processing\")\n    print(\"  - Set reasonable timeouts for document processing\")\n    print(\"  - Use skip_installation_check for environments with conflicts\")\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },
  {
    "path": "examples/enhanced_markdown_example.py",
    "content": "#!/usr/bin/env python\n\"\"\"\nEnhanced Markdown Conversion Example for RAG-Anything\n\nThis example demonstrates the enhanced markdown to PDF conversion capabilities\nwith multiple backends, advanced styling, and professional formatting.\n\nFeatures demonstrated:\n- Basic markdown to PDF conversion\n- Multiple conversion backends (WeasyPrint, Pandoc)\n- Custom CSS styling and configuration\n- Backend detection and selection\n- Error handling and fallback mechanisms\n- Command-line interface usage\n\"\"\"\n\nimport logging\nfrom pathlib import Path\nimport tempfile\n\n# Add project root directory to Python path\nimport sys\n\nsys.path.append(str(Path(__file__).parent.parent))\n\nfrom raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig\n\n\ndef create_sample_markdown_content():\n    \"\"\"Create comprehensive sample markdown content for testing\"\"\"\n\n    # Basic sample\n    basic_content = \"\"\"# Basic Markdown Sample\n\n## Introduction\nThis is a simple markdown document demonstrating basic formatting.\n\n### Text Formatting\n- **Bold text** and *italic text*\n- `Inline code` examples\n- [Links to external sites](https://github.com)\n\n### Lists\n1. First ordered item\n2. Second ordered item\n3. 
Third ordered item\n\n- Unordered item\n- Another unordered item\n  - Nested item\n  - Another nested item\n\n### Blockquotes\n> This is a blockquote with important information.\n> It can span multiple lines.\n\n### Code Block\n```python\ndef hello_world():\n    print(\"Hello, World!\")\n    return \"Success\"\n```\n\"\"\"\n\n    # Technical documentation sample\n    technical_content = \"\"\"# Technical Documentation\n\n## Table of Contents\n- [Overview](#overview)\n- [Architecture](#architecture)\n- [Implementation](#implementation)\n- [Performance](#performance)\n\n## Overview\nThis document provides comprehensive technical specifications for the enhanced markdown conversion system.\n\n## Architecture\n\n### Core Components\n1. **Markdown Parser**: Processes markdown syntax\n2. **CSS Engine**: Applies styling and layout\n3. **PDF Generator**: Creates final PDF output\n4. **Backend Manager**: Handles multiple conversion engines\n\n### Data Flow\n```mermaid\ngraph LR\n    A[Markdown Input] --> B[Parser]\n    B --> C[CSS Processor]\n    C --> D[PDF Generator]\n    D --> E[PDF Output]\n```\n\n## Implementation\n\n### Python Code Example\n```python\nfrom raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig\n\n# Configure converter\nconfig = MarkdownConfig(\n    page_size=\"A4\",\n    margin=\"1in\",\n    include_toc=True,\n    syntax_highlighting=True\n)\n\n# Create converter\nconverter = EnhancedMarkdownConverter(config)\n\n# Convert to PDF\nsuccess = converter.convert_file_to_pdf(\n    input_path=\"document.md\",\n    output_path=\"output.pdf\",\n    method=\"weasyprint\"\n)\n```\n\n### Configuration Options\n```yaml\nconverter:\n  page_size: A4\n  margin: 1in\n  font_size: 12pt\n  include_toc: true\n  syntax_highlighting: true\n  backend: weasyprint\n```\n\n## Performance\n\n### Benchmark Results\n| Backend | Speed | Quality | Features |\n|---------|-------|---------|----------|\n| WeasyPrint | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ |\n| Pandoc | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ 
| ⭐⭐⭐⭐⭐ |\n\n### Processing Times\n- **Small documents** (< 10 pages): 1-3 seconds\n- **Medium documents** (10-50 pages): 3-10 seconds\n- **Large documents** (> 50 pages): 10-30 seconds\n\n## Advanced Features\n\n### Custom CSS Styling\nThe system supports advanced CSS customization:\n\n```css\nbody {\n    font-family: 'Georgia', serif;\n    line-height: 1.6;\n    color: #333;\n}\n\nh1 {\n    color: #2c3e50;\n    border-bottom: 2px solid #3498db;\n    padding-bottom: 0.3em;\n}\n\ncode {\n    background-color: #f8f9fa;\n    padding: 2px 4px;\n    border-radius: 3px;\n    font-family: 'Courier New', monospace;\n}\n\npre {\n    background-color: #f8f9fa;\n    border-left: 4px solid #3498db;\n    padding: 15px;\n    border-radius: 5px;\n    overflow-x: auto;\n}\n\ntable {\n    border-collapse: collapse;\n    width: 100%;\n    margin: 1em 0;\n}\n\nth, td {\n    border: 1px solid #ddd;\n    padding: 8px 12px;\n    text-align: left;\n}\n\nth {\n    background-color: #f2f2f2;\n    font-weight: bold;\n}\n```\n\n### Image Support\n![Sample Image](https://via.placeholder.com/400x200/3498db/ffffff?text=Sample+Image)\n\nImages are automatically scaled and positioned appropriately in the PDF output.\n\n## Conclusion\nThe enhanced markdown conversion system provides professional-quality PDF generation with extensive customization options and multiple backend support.\n\n---\n\n*Generated on: 2024-01-15*\n*Version: 1.0.0*\n\"\"\"\n\n    # Academic paper sample\n    academic_content = \"\"\"# Research Paper: Advanced Document Processing\n\n**Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹\n**Affiliations:**\n¹ University of Technology\n² Research Institute\n\n## Abstract\n\nThis paper presents a comprehensive analysis of advanced document processing techniques using enhanced markdown conversion. 
Our research demonstrates significant improvements in processing speed and output quality through optimized backend selection and custom styling approaches.\n\n**Keywords:** document processing, markdown conversion, PDF generation, performance optimization\n\n## 1. Introduction\n\nDocument processing has become increasingly important in modern information systems. The ability to convert markdown documents to high-quality PDF outputs with professional formatting is crucial for academic, technical, and business applications.\n\n### 1.1 Research Objectives\n\n1. Evaluate different markdown conversion backends\n2. Analyze performance characteristics of each approach\n3. Develop optimization strategies for large-scale processing\n4. Design flexible configuration systems for diverse use cases\n\n### 1.2 Contributions\n\nThis work makes the following contributions:\n- Comprehensive comparison of markdown conversion backends\n- Performance optimization techniques for large documents\n- Flexible configuration framework for customization\n- Integration patterns for document processing pipelines\n\n## 2. Methodology\n\n### 2.1 Experimental Setup\n\nWe conducted experiments using the following configuration:\n\n```python\n# Experimental configuration\nconfig = MarkdownConfig(\n    page_size=\"A4\",\n    margin=\"1in\",\n    font_size=\"11pt\",\n    line_height=\"1.4\",\n    include_toc=True,\n    syntax_highlighting=True\n)\n```\n\n### 2.2 Test Documents\n\n| Category | Count | Avg Size | Complexity |\n|----------|-------|----------|------------|\n| Simple | 100 | 2 pages | Low |\n| Medium | 50 | 10 pages | Medium |\n| Complex | 25 | 25 pages | High |\n\n### 2.3 Metrics\n\nWe evaluated performance using the following metrics:\n- **Conversion Speed**: Time to generate PDF (seconds)\n- **Memory Usage**: Peak memory consumption (MB)\n- **Output Quality**: Visual assessment score (1-10)\n- **Feature Support**: Number of supported markdown features\n\n## 3. 
Results\n\n### 3.1 Performance Comparison\n\nThe following table summarizes our performance results:\n\n| Backend | Speed (s) | Memory (MB) | Quality | Features |\n|---------|-----------|-------------|---------|----------|\n| WeasyPrint | 2.3 ± 0.5 | 85 ± 15 | 8.5 | 85% |\n| Pandoc | 4.7 ± 1.2 | 120 ± 25 | 9.2 | 95% |\n\n### 3.2 Quality Analysis\n\n#### 3.2.1 Typography\nWeasyPrint excels in web-style typography with excellent CSS support, while Pandoc provides superior academic formatting with LaTeX-quality output.\n\n#### 3.2.2 Code Highlighting\nBoth backends support syntax highlighting through Pygments:\n\n```python\ndef analyze_performance(backend, documents):\n    '''Analyze conversion performance for given backend'''\n    results = []\n\n    for doc in documents:\n        start_time = time.time()\n        success = backend.convert(doc)\n        end_time = time.time()\n\n        results.append({\n            'document': doc,\n            'time': end_time - start_time,\n            'success': success\n        })\n\n    return results\n```\n\n### 3.3 Scalability\n\nOur scalability analysis shows:\n- Linear scaling with document size for both backends\n- Memory usage proportional to content complexity\n- Optimal batch sizes of 10-20 documents for parallel processing\n\n## 4. Discussion\n\n### 4.1 Backend Selection Guidelines\n\nChoose **WeasyPrint** for:\n- Web-style documents with custom CSS\n- Fast conversion requirements\n- Simple to medium complexity documents\n\nChoose **Pandoc** for:\n- Academic papers and publications\n- Complex document structures\n- Maximum feature support requirements\n\n### 4.2 Optimization Strategies\n\n1. **Image Optimization**: Compress images before embedding\n2. **CSS Minimization**: Use efficient CSS selectors\n3. **Content Chunking**: Process large documents in sections\n4. **Caching**: Cache converted content for repeated use\n\n## 5. 
Conclusion\n\nThis research demonstrates that enhanced markdown conversion provides significant benefits for document processing workflows. The choice between WeasyPrint and Pandoc depends on specific requirements for speed, quality, and features.\n\n### 5.1 Future Work\n\n- Integration with cloud processing services\n- Real-time collaborative editing support\n- Advanced template systems\n- Performance optimization for very large documents\n\n## References\n\n1. Johnson, A. et al. (2024). \"Advanced Document Processing Techniques.\" *Journal of Information Systems*, 15(3), 45-62.\n2. Smith, B. (2023). \"PDF Generation Optimization.\" *Technical Computing Review*, 8(2), 12-28.\n3. Williams, C. (2024). \"Markdown Processing Frameworks.\" *Software Engineering Quarterly*, 22(1), 78-95.\n\n---\n\n**Manuscript received:** January 10, 2024\n**Accepted for publication:** January 15, 2024\n**Published online:** January 20, 2024\n\"\"\"\n\n    return {\n        \"basic\": basic_content,\n        \"technical\": technical_content,\n        \"academic\": academic_content,\n    }\n\n\ndef demonstrate_basic_conversion():\n    \"\"\"Demonstrate basic markdown to PDF conversion\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"BASIC MARKDOWN CONVERSION DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    try:\n        # Create converter with default settings\n        converter = EnhancedMarkdownConverter()\n\n        # Show backend information\n        backend_info = converter.get_backend_info()\n        print(\"Available conversion backends:\")\n        for backend, available in backend_info[\"available_backends\"].items():\n            status = \"✅\" if available else \"❌\"\n            print(f\"  {status} {backend}\")\n        print(f\"Recommended backend: {backend_info['recommended_backend']}\")\n\n        # Get sample content\n        samples = create_sample_markdown_content()\n        temp_dir = Path(tempfile.mkdtemp())\n\n        # Convert basic sample\n        basic_md_path = 
temp_dir / \"basic_sample.md\"\n        with open(basic_md_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(samples[\"basic\"])\n\n        print(f\"\\nConverting basic sample: {basic_md_path}\")\n\n        success = converter.convert_file_to_pdf(\n            input_path=str(basic_md_path),\n            output_path=str(temp_dir / \"basic_sample.pdf\"),\n            method=\"auto\",  # Let the system choose the best backend\n        )\n\n        if success:\n            print(\"✅ Basic conversion successful!\")\n            print(f\"   Output: {temp_dir / 'basic_sample.pdf'}\")\n        else:\n            print(\"❌ Basic conversion failed\")\n\n        return success, temp_dir\n\n    except Exception as e:\n        print(f\"❌ Basic conversion demonstration failed: {str(e)}\")\n        return False, None\n\n\ndef demonstrate_backend_comparison():\n    \"\"\"Demonstrate different conversion backends\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"BACKEND COMPARISON DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    try:\n        samples = create_sample_markdown_content()\n        temp_dir = Path(tempfile.mkdtemp())\n\n        # Create technical document\n        tech_md_path = temp_dir / \"technical.md\"\n        with open(tech_md_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(samples[\"technical\"])\n\n        print(\"Testing different backends with technical document...\")\n\n        # Test different backends\n        backends = [\"auto\", \"weasyprint\", \"pandoc\"]\n        results = {}\n\n        for backend in backends:\n            try:\n                print(f\"\\nTesting {backend} backend...\")\n\n                converter = EnhancedMarkdownConverter()\n                output_path = temp_dir / f\"technical_{backend}.pdf\"\n\n                import time\n\n                start_time = time.time()\n\n                success = converter.convert_file_to_pdf(\n                    input_path=str(tech_md_path),\n                    
output_path=str(output_path),\n                    method=backend,\n                )\n\n                end_time = time.time()\n                conversion_time = end_time - start_time\n\n                if success:\n                    file_size = (\n                        output_path.stat().st_size if output_path.exists() else 0\n                    )\n                    print(\n                        f\"  ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes\"\n                    )\n                    results[backend] = {\n                        \"success\": True,\n                        \"time\": conversion_time,\n                        \"size\": file_size,\n                        \"output\": str(output_path),\n                    }\n                else:\n                    print(f\"  ❌ {backend}: Failed\")\n                    results[backend] = {\"success\": False, \"time\": conversion_time}\n\n            except Exception as e:\n                print(f\"  ❌ {backend}: Error - {str(e)}\")\n                results[backend] = {\"success\": False, \"error\": str(e)}\n\n        # Summary\n        print(\"\\n\" + \"-\" * 40)\n        print(\"BACKEND COMPARISON SUMMARY\")\n        print(\"-\" * 40)\n        successful_backends = [b for b, r in results.items() if r.get(\"success\", False)]\n        print(f\"Successful backends: {successful_backends}\")\n\n        if successful_backends:\n            fastest = min(successful_backends, key=lambda b: results[b][\"time\"])\n            print(f\"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)\")\n\n        return results, temp_dir\n\n    except Exception as e:\n        print(f\"❌ Backend comparison demonstration failed: {str(e)}\")\n        return None, None\n\n\ndef demonstrate_custom_styling():\n    \"\"\"Demonstrate custom CSS styling and configuration\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"CUSTOM STYLING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    try:\n        
samples = create_sample_markdown_content()\n        temp_dir = Path(tempfile.mkdtemp())\n\n        # Create custom CSS\n        custom_css = \"\"\"\n        body {\n            font-family: 'Times New Roman', serif;\n            font-size: 11pt;\n            line-height: 1.4;\n            color: #2c3e50;\n            max-width: 800px;\n            margin: 0 auto;\n            padding: 20px;\n        }\n\n        h1 {\n            color: #c0392b;\n            font-size: 2.2em;\n            border-bottom: 3px solid #e74c3c;\n            padding-bottom: 0.5em;\n            margin-top: 2em;\n        }\n\n        h2 {\n            color: #8e44ad;\n            font-size: 1.6em;\n            border-bottom: 2px solid #9b59b6;\n            padding-bottom: 0.3em;\n            margin-top: 1.5em;\n        }\n\n        h3 {\n            color: #2980b9;\n            font-size: 1.3em;\n            margin-top: 1.2em;\n        }\n\n        code {\n            background-color: #ecf0f1;\n            color: #e74c3c;\n            padding: 3px 6px;\n            border-radius: 4px;\n            font-family: 'Courier New', monospace;\n            font-size: 0.9em;\n        }\n\n        pre {\n            background-color: #2c3e50;\n            color: #ecf0f1;\n            padding: 20px;\n            border-radius: 8px;\n            border-left: 5px solid #3498db;\n            overflow-x: auto;\n            font-size: 0.9em;\n        }\n\n        pre code {\n            background-color: transparent;\n            color: inherit;\n            padding: 0;\n        }\n\n        blockquote {\n            background-color: #f8f9fa;\n            border-left: 5px solid #3498db;\n            margin: 1em 0;\n            padding: 15px 20px;\n            font-style: italic;\n            color: #555;\n        }\n\n        table {\n            border-collapse: collapse;\n            width: 100%;\n            margin: 1.5em 0;\n            background-color: white;\n            border-radius: 8px;\n      
      overflow: hidden;\n            box-shadow: 0 2px 4px rgba(0,0,0,0.1);\n        }\n\n        th {\n            background-color: #3498db;\n            color: white;\n            padding: 12px 15px;\n            text-align: left;\n            font-weight: bold;\n        }\n\n        td {\n            padding: 10px 15px;\n            border-bottom: 1px solid #ecf0f1;\n        }\n\n        tr:nth-child(even) {\n            background-color: #f8f9fa;\n        }\n\n        tr:hover {\n            background-color: #e8f4fd;\n        }\n\n        ul, ol {\n            margin-bottom: 1em;\n            padding-left: 2em;\n        }\n\n        li {\n            margin-bottom: 0.5em;\n            line-height: 1.6;\n        }\n\n        a {\n            color: #3498db;\n            text-decoration: none;\n            border-bottom: 1px dotted #3498db;\n        }\n\n        a:hover {\n            color: #2980b9;\n            border-bottom: 1px solid #2980b9;\n        }\n\n        .toc {\n            background-color: #f8f9fa;\n            border: 2px solid #e9ecef;\n            border-radius: 8px;\n            padding: 20px;\n            margin: 2em 0;\n        }\n\n        .toc h2 {\n            color: #2c3e50;\n            margin-top: 0;\n            border-bottom: none;\n        }\n\n        .toc ul {\n            list-style-type: none;\n            padding-left: 0;\n        }\n\n        .toc li {\n            margin-bottom: 0.8em;\n        }\n\n        .toc a {\n            color: #2c3e50;\n            font-weight: 500;\n            border-bottom: none;\n        }\n        \"\"\"\n\n        # Create custom configuration\n        config = MarkdownConfig(\n            page_size=\"A4\",\n            margin=\"0.8in\",\n            font_size=\"11pt\",\n            line_height=\"1.4\",\n            include_toc=True,\n            syntax_highlighting=True,\n            custom_css=custom_css,\n        )\n\n        converter = EnhancedMarkdownConverter(config)\n\n        # 
Convert academic sample with custom styling\n        academic_md_path = temp_dir / \"academic_styled.md\"\n        with open(academic_md_path, \"w\", encoding=\"utf-8\") as f:\n            f.write(samples[\"academic\"])\n\n        print(\"Converting academic paper with custom styling...\")\n        print(\"Custom styling features:\")\n        print(\"  - Custom color scheme (reds, purples, blues)\")\n        print(\"  - Times New Roman serif font\")\n        print(\"  - Enhanced table styling with hover effects\")\n        print(\"  - Styled code blocks with dark theme\")\n        print(\"  - Custom blockquote styling\")\n        print(\"  - Professional header styling\")\n\n        success = converter.convert_file_to_pdf(\n            input_path=str(academic_md_path),\n            output_path=str(temp_dir / \"academic_styled.pdf\"),\n            method=\"weasyprint\",  # WeasyPrint is best for custom CSS\n        )\n\n        if success:\n            print(\"✅ Custom styling conversion successful!\")\n            print(f\"   Output: {temp_dir / 'academic_styled.pdf'}\")\n\n            # Also create a default version for comparison\n            default_converter = EnhancedMarkdownConverter()\n            default_success = default_converter.convert_file_to_pdf(\n                input_path=str(academic_md_path),\n                output_path=str(temp_dir / \"academic_default.pdf\"),\n                method=\"weasyprint\",\n            )\n\n            if default_success:\n                print(f\"   Comparison (default): {temp_dir / 'academic_default.pdf'}\")\n        else:\n            print(\"❌ Custom styling conversion failed\")\n\n        return success, temp_dir\n\n    except Exception as e:\n        print(f\"❌ Custom styling demonstration failed: {str(e)}\")\n        return False, None\n\n\ndef demonstrate_content_conversion():\n    \"\"\"Demonstrate converting markdown content directly (not from file)\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"CONTENT 
CONVERSION DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    try:\n        # Create markdown content programmatically\n        dynamic_content = f\"\"\"# Dynamic Content Example\n\n## Generated Information\nThis document was generated programmatically on {Path(__file__).name}.\n\n## System Information\n- **Python Path**: {sys.executable}\n- **Script Location**: {Path(__file__).absolute()}\n- **Working Directory**: {Path.cwd()}\n\n## Dynamic Table\n| Property | Value |\n|----------|-------|\n| Script Name | {Path(__file__).name} |\n| Python Version | {sys.version.split()[0]} |\n| Platform | {sys.platform} |\n\n## Code Example\n```python\n# This content was generated dynamically\nimport sys\nfrom pathlib import Path\n\ndef generate_report():\n    return f\"Report generated from {{Path(__file__).name}}\"\n\nprint(generate_report())\n```\n\n## Features Demonstrated\nThis example shows how to:\n1. Generate markdown content programmatically\n2. Convert content directly without saving to file first\n3. Include dynamic information in documents\n4. 
Use different conversion methods\n\n> **Note**: This content was created in memory and converted directly to PDF\n> without intermediate file storage.\n\n## Conclusion\nDirect content conversion is useful for:\n- Dynamic report generation\n- Programmatic document creation\n- API-based document services\n- Real-time content processing\n\"\"\"\n\n        temp_dir = Path(tempfile.mkdtemp())\n        converter = EnhancedMarkdownConverter()\n\n        print(\"Converting dynamically generated markdown content...\")\n        print(\"Content includes:\")\n        print(\"  - System information\")\n        print(\"  - Dynamic tables with current values\")\n        print(\"  - Generated timestamps\")\n        print(\"  - Programmatic examples\")\n\n        # Convert content directly to PDF\n        output_path = temp_dir / \"dynamic_content.pdf\"\n\n        success = converter.convert_markdown_to_pdf(\n            markdown_content=dynamic_content,\n            output_path=str(output_path),\n            method=\"auto\",\n        )\n\n        if success:\n            print(\"✅ Content conversion successful!\")\n            print(f\"   Output: {output_path}\")\n\n            # Show file size\n            file_size = output_path.stat().st_size\n            print(f\"   Generated PDF size: {file_size} bytes\")\n        else:\n            print(\"❌ Content conversion failed\")\n\n        return success, temp_dir\n\n    except Exception as e:\n        print(f\"❌ Content conversion demonstration failed: {str(e)}\")\n        return False, None\n\n\ndef demonstrate_error_handling():\n    \"\"\"Demonstrate error handling and fallback mechanisms\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"ERROR HANDLING DEMONSTRATION\")\n    print(\"=\" * 60)\n\n    try:\n        temp_dir = Path(tempfile.mkdtemp())\n\n        # Test cases with various issues\n        test_cases = {\n            \"invalid_markdown\": \"\"\"# Invalid Markdown\n\nThis markdown has some {{invalid}} syntax and [broken 
links](http://nonexistent.invalid).\n\n```unknown_language\nThis code block uses an unknown language\n```\n\n![Missing Image](nonexistent_image.png)\n\"\"\",\n            \"complex_content\": \"\"\"# Complex Content Test\n\n## Mathematical Expressions\nThis tests content that might be challenging for some backends:\n\n$$ E = mc^2 $$\n\n$$\\\\sum_{i=1}^{n} x_i = \\\\frac{n(n+1)}{2}$$\n\n## Complex Tables\n| A | B | C | D | E | F | G |\n|---|---|---|---|---|---|---|\n| Very long content that might wrap | Short | Medium length content | X | Y | Z | End |\n| Another row with different lengths | A | B | C | D | E | F |\n\n## Special Characters\nUnicode: α, β, γ, δ, ε, ζ, η, θ, ι, κ, λ, μ, ν, ξ, ο, π, ρ, σ, τ, υ, φ, χ, ψ, ω\nSymbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏\nArrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙\n\"\"\",\n            \"empty_content\": \"\",\n            \"minimal_content\": \"# Just a title\",\n        }\n\n        print(\"Testing error handling with various content types...\")\n\n        results = {}\n\n        for test_name, content in test_cases.items():\n            print(f\"\\nTesting: {test_name}\")\n\n            try:\n                # Try multiple backends for each test case\n                for backend in [\"auto\", \"weasyprint\", \"pandoc\"]:\n                    try:\n                        converter = EnhancedMarkdownConverter()\n                        output_path = temp_dir / f\"{test_name}_{backend}.pdf\"\n\n                        success = converter.convert_markdown_to_pdf(\n                            markdown_content=content,\n                            output_path=str(output_path),\n                            method=backend,\n                        )\n\n                        if success:\n                            file_size = (\n                                output_path.stat().st_size\n                                if output_path.exists()\n                                else 0\n                            )\n                           
 print(f\"  ✅ {backend}: Success ({file_size} bytes)\")\n                            results[f\"{test_name}_{backend}\"] = {\n                                \"success\": True,\n                                \"size\": file_size,\n                            }\n                        else:\n                            print(f\"  ❌ {backend}: Failed\")\n                            results[f\"{test_name}_{backend}\"] = {\"success\": False}\n\n                    except Exception as e:\n                        print(f\"  ❌ {backend}: Error - {str(e)[:60]}...\")\n                        results[f\"{test_name}_{backend}\"] = {\n                            \"success\": False,\n                            \"error\": str(e),\n                        }\n\n            except Exception as e:\n                print(f\"  ❌ Test case failed: {str(e)}\")\n\n        # Demonstrate robust conversion with fallbacks\n        print(\"\\nDemonstrating robust conversion with fallback logic...\")\n\n        def robust_convert(content, output_path):\n            \"\"\"Convert with multiple backend fallbacks\"\"\"\n            backends = [\"weasyprint\", \"pandoc\", \"auto\"]\n\n            for backend in backends:\n                try:\n                    converter = EnhancedMarkdownConverter()\n                    success = converter.convert_markdown_to_pdf(\n                        markdown_content=content,\n                        output_path=output_path,\n                        method=backend,\n                    )\n                    if success:\n                        return backend, True\n                except Exception:\n                    continue\n\n            return None, False\n\n        # Test robust conversion\n        test_content = test_cases[\"complex_content\"]\n        robust_output = temp_dir / \"robust_conversion.pdf\"\n\n        successful_backend, success = robust_convert(test_content, str(robust_output))\n\n        if success:\n            print(f\"✅ 
Robust conversion successful using {successful_backend}\")\n            print(f\"   Output: {robust_output}\")\n        else:\n            print(\"❌ All backends failed for robust conversion\")\n\n        # Summary\n        print(\"\\n\" + \"-\" * 40)\n        print(\"ERROR HANDLING SUMMARY\")\n        print(\"-\" * 40)\n        successful_conversions = sum(\n            1 for r in results.values() if r.get(\"success\", False)\n        )\n        total_attempts = len(results)\n        success_rate = (\n            (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0\n        )\n\n        print(f\"Total conversion attempts: {total_attempts}\")\n        print(f\"Successful conversions: {successful_conversions}\")\n        print(f\"Success rate: {success_rate:.1f}%\")\n\n        return results, temp_dir\n\n    except Exception as e:\n        print(f\"❌ Error handling demonstration failed: {str(e)}\")\n        return None, None\n\n\ndef main():\n    \"\"\"Main demonstration function\"\"\"\n    # Configure logging\n    logging.basicConfig(\n        level=logging.INFO,\n        format=\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\",\n    )\n\n    print(\"RAG-Anything Enhanced Markdown Conversion Demonstration\")\n    print(\"=\" * 70)\n    print(\n        \"This example demonstrates various enhanced markdown conversion capabilities:\"\n    )\n    print(\"  - Basic markdown to PDF conversion\")\n    print(\"  - Multiple backend comparison (WeasyPrint vs Pandoc)\")\n    print(\"  - Custom CSS styling and professional formatting\")\n    print(\"  - Direct content conversion without file I/O\")\n    print(\"  - Comprehensive error handling and fallback mechanisms\")\n\n    results = {}\n\n    # Run demonstrations\n    print(\"\\n🚀 Starting demonstrations...\")\n\n    # Basic conversion\n    success, temp_dir = demonstrate_basic_conversion()\n    results[\"basic\"] = success\n\n    # Backend comparison\n    backend_results, _ = 
demonstrate_backend_comparison()\n    results[\"backends\"] = backend_results\n\n    # Custom styling\n    styling_success, _ = demonstrate_custom_styling()\n    results[\"styling\"] = styling_success\n\n    # Content conversion\n    content_success, _ = demonstrate_content_conversion()\n    results[\"content\"] = content_success\n\n    # Error handling\n    error_results, _ = demonstrate_error_handling()\n    results[\"error_handling\"] = error_results\n\n    # Summary\n    print(\"\\n\" + \"=\" * 70)\n    print(\"DEMONSTRATION SUMMARY\")\n    print(\"=\" * 70)\n\n    print(\"✅ Features Successfully Demonstrated:\")\n    if results[\"basic\"]:\n        print(\"  - Basic markdown to PDF conversion\")\n    if results[\"backends\"]:\n        successful_backends = [\n            b for b, r in results[\"backends\"].items() if r.get(\"success\", False)\n        ]\n        print(f\"  - Multiple backends: {successful_backends}\")\n    if results[\"styling\"]:\n        print(\"  - Custom CSS styling and professional formatting\")\n    if results[\"content\"]:\n        print(\"  - Direct content conversion without file I/O\")\n    if results[\"error_handling\"]:\n        success_rate = (\n            sum(\n                1 for r in results[\"error_handling\"].values() if r.get(\"success\", False)\n            )\n            / len(results[\"error_handling\"])\n            * 100\n        )\n        print(f\"  - Error handling with {success_rate:.1f}% overall success rate\")\n\n    print(\"\\n📊 Key Capabilities Highlighted:\")\n    print(\"  - Professional PDF generation with high-quality typography\")\n    print(\"  - Multiple conversion backends with automatic selection\")\n    print(\"  - Extensive CSS customization for branded documents\")\n    print(\"  - Syntax highlighting for code blocks using Pygments\")\n    print(\"  - Table formatting with professional styling\")\n    print(\"  - Image embedding with proper scaling\")\n    print(\"  - Table of contents generation 
with navigation\")\n    print(\"  - Comprehensive error handling and fallback mechanisms\")\n\n    print(\"\\n💡 Best Practices Demonstrated:\")\n    print(\"  - Choose WeasyPrint for web-style documents and custom CSS\")\n    print(\"  - Choose Pandoc for academic papers and complex formatting\")\n    print(\"  - Use 'auto' method for general-purpose conversion\")\n    print(\"  - Implement fallback logic for robust conversion\")\n    print(\"  - Optimize images before embedding in documents\")\n    print(\"  - Test custom CSS with simple content first\")\n    print(\"  - Handle errors gracefully with multiple backend attempts\")\n    print(\"  - Use appropriate page sizes and margins for target use case\")\n\n    print(\"\\n🎯 Integration Patterns:\")\n    print(\"  - Standalone conversion for document generation\")\n    print(\"  - Integration with RAG-Anything document pipeline\")\n    print(\"  - API-based document services\")\n    print(\"  - Batch processing for multiple documents\")\n    print(\"  - Dynamic content generation from templates\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/image_format_test.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nImage Format Parsing Test Script for RAG-Anything\n\nThis script demonstrates how to parse various image formats\nusing MinerU, including JPG, PNG, BMP, TIFF, GIF, and WebP files.\n\nRequirements:\n- PIL/Pillow library for format conversion\n- RAG-Anything package\n\nUsage:\n    python image_format_test.py --file path/to/image.bmp\n\"\"\"\n\nimport argparse\nimport asyncio\nimport sys\nfrom pathlib import Path\nfrom raganything import RAGAnything\n\n\ndef check_pillow_installation():\n    \"\"\"Check if PIL/Pillow is installed and available\"\"\"\n    try:\n        from PIL import Image\n\n        print(\n            f\"✅ PIL/Pillow found: PIL version {Image.__version__ if hasattr(Image, '__version__') else 'Unknown'}\"\n        )\n        return True\n    except ImportError:\n        print(\"❌ PIL/Pillow not found. Please install Pillow:\")\n        print(\"  pip install Pillow\")\n        return False\n\n\ndef get_image_info(image_path: Path):\n    \"\"\"Get detailed image information\"\"\"\n    try:\n        from PIL import Image\n\n        with Image.open(image_path) as img:\n            return {\n                \"format\": img.format,\n                \"mode\": img.mode,\n                \"size\": img.size,\n                \"has_transparency\": img.mode in (\"RGBA\", \"LA\")\n                or \"transparency\" in img.info,\n            }\n    except Exception as e:\n        return {\"error\": str(e)}\n\n\nasync def test_image_format_parsing(file_path: str):\n    \"\"\"Test image format parsing with MinerU\"\"\"\n\n    print(f\"🧪 Testing image format parsing: {file_path}\")\n\n    # Check if file exists and is a supported image format\n    file_path = Path(file_path)\n    if not file_path.exists():\n        print(f\"❌ File does not exist: {file_path}\")\n        return False\n\n    supported_extensions = {\n        \".jpg\",\n        \".jpeg\",\n        \".png\",\n        \".bmp\",\n        \".tiff\",\n        
\".tif\",\n        \".gif\",\n        \".webp\",\n    }\n    if file_path.suffix.lower() not in supported_extensions:\n        print(f\"❌ Unsupported file format: {file_path.suffix}\")\n        print(f\"   Supported formats: {', '.join(supported_extensions)}\")\n        return False\n\n    print(f\"📸 File format: {file_path.suffix.upper()}\")\n    print(f\"📏 File size: {file_path.stat().st_size / 1024:.1f} KB\")\n\n    # Get detailed image information\n    img_info = get_image_info(file_path)\n    if \"error\" not in img_info:\n        print(\"🖼️  Image info:\")\n        print(f\"   • Format: {img_info['format']}\")\n        print(f\"   • Mode: {img_info['mode']}\")\n        print(f\"   • Size: {img_info['size'][0]}x{img_info['size'][1]}\")\n        print(f\"   • Has transparency: {img_info['has_transparency']}\")\n\n    # Check format compatibility with MinerU\n    mineru_native_formats = {\".jpg\", \".jpeg\", \".png\"}\n    needs_conversion = file_path.suffix.lower() not in mineru_native_formats\n\n    if needs_conversion:\n        print(\n            f\"ℹ️  Format {file_path.suffix.upper()} will be converted to PNG for MinerU compatibility\"\n        )\n    else:\n        print(f\"✅ Format {file_path.suffix.upper()} is natively supported by MinerU\")\n\n    # Initialize RAGAnything (only for parsing functionality)\n    rag = RAGAnything()\n\n    try:\n        # Test image parsing with MinerU\n        print(\"\\n🔄 Testing image parsing with MinerU...\")\n        content_list, md_content = await rag.parse_document(\n            file_path=str(file_path),\n            output_dir=\"./test_output\",\n            parse_method=\"ocr\",  # Images use OCR method\n            display_stats=True,\n        )\n\n        print(\"✅ Parsing successful!\")\n        print(f\"   📊 Content blocks: {len(content_list)}\")\n        print(f\"   📝 Markdown length: {len(md_content)} characters\")\n\n        # Analyze content types\n        content_types = {}\n        for item in 
content_list:\n            if isinstance(item, dict):\n                content_type = item.get(\"type\", \"unknown\")\n                content_types[content_type] = content_types.get(content_type, 0) + 1\n\n        if content_types:\n            print(\"   📋 Content distribution:\")\n            for content_type, count in sorted(content_types.items()):\n                print(f\"      • {content_type}: {count}\")\n\n        # Display extracted text (if any)\n        if md_content.strip():\n            print(\"\\n📄 Extracted text preview (first 500 characters):\")\n            preview = md_content.strip()[:500]\n            print(f\"   {preview}{'...' if len(md_content) > 500 else ''}\")\n        else:\n            print(\"\\n📄 No text extracted from the image\")\n\n        # Display image processing results\n        image_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"image\"\n        ]\n        if image_items:\n            print(f\"\\n🖼️  Found {len(image_items)} processed image(s):\")\n            for i, item in enumerate(image_items, 1):\n                print(f\"   {i}. Image path: {item.get('img_path', 'N/A')}\")\n                caption = item.get(\"image_caption\", item.get(\"img_caption\", []))\n                if caption:\n                    print(f\"      Caption: {caption[0] if caption else 'N/A'}\")\n\n        # Display text blocks (OCR results)\n        text_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"text\"\n        ]\n        if text_items:\n            print(\"\\n📝 OCR text blocks found:\")\n            for i, item in enumerate(text_items, 1):\n                text_content = item.get(\"text\", \"\")\n                if text_content.strip():\n                    preview = text_content.strip()[:200]\n                    print(\n                        f\"   {i}. {preview}{'...' 
if len(text_content) > 200 else ''}\"\n                    )\n\n        # Check for any tables detected in the image\n        table_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"table\"\n        ]\n        if table_items:\n            print(f\"\\n📊 Found {len(table_items)} table(s) in image:\")\n            for i, item in enumerate(table_items, 1):\n                print(f\"   {i}. Table detected with content\")\n\n        print(\"\\n🎉 Image format parsing test completed successfully!\")\n        print(\"📁 Output files saved to: ./test_output\")\n        return True\n\n    except Exception as e:\n        print(f\"\\n❌ Image format parsing failed: {str(e)}\")\n        import traceback\n\n        print(f\"   Full error: {traceback.format_exc()}\")\n        return False\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Test image format parsing with MinerU\"\n    )\n    parser.add_argument(\"--file\", help=\"Path to the image file to test\")\n    parser.add_argument(\n        \"--check-pillow\", action=\"store_true\", help=\"Only check PIL/Pillow installation\"\n    )\n\n    args = parser.parse_args()\n\n    # Check PIL/Pillow installation\n    print(\"🔧 Checking PIL/Pillow installation...\")\n    if not check_pillow_installation():\n        return 1\n\n    if args.check_pillow:\n        print(\"✅ PIL/Pillow installation check passed!\")\n        return 0\n\n    # If not just checking dependencies, file argument is required\n    if not args.file:\n        print(\"❌ Error: --file argument is required when not using --check-pillow\")\n        parser.print_help()\n        return 1\n\n    # Run the parsing test\n    try:\n        success = asyncio.run(test_image_format_parsing(args.file))\n        return 0 if success else 1\n    except KeyboardInterrupt:\n        print(\"\\n⏹️ Test interrupted by user\")\n        return 1\n    
except Exception as e:\n        print(f\"\\n❌ Unexpected error: {str(e)}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "examples/insert_content_list_example.py",
    "content": "#!/usr/bin/env python\n\"\"\"\nExample script demonstrating direct content list insertion with RAGAnything\n\nThis example shows how to:\n1. Create a simple content list with different content types\n2. Insert content list directly without document parsing using insert_content_list() method\n3. Perform pure text queries using aquery() method\n4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method\n5. Handle different types of multimodal content in the inserted knowledge base\n\"\"\"\n\nimport os\nimport argparse\nimport asyncio\nimport logging\nimport logging.config\nfrom pathlib import Path\n\n# Add project root directory to Python path\nimport sys\n\nsys.path.append(str(Path(__file__).parent.parent))\n\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc, logger, set_verbose_debug\nfrom raganything import RAGAnything, RAGAnythingConfig\n\nfrom dotenv import load_dotenv\n\nload_dotenv(dotenv_path=\".env\", override=False)\n\n\ndef configure_logging():\n    \"\"\"Configure logging for the application\"\"\"\n    # Get log directory path from environment variable or use current directory\n    log_dir = os.getenv(\"LOG_DIR\", os.getcwd())\n    log_file_path = os.path.abspath(\n        os.path.join(log_dir, \"insert_content_list_example.log\")\n    )\n\n    print(f\"\\nInsert Content List example log file: {log_file_path}\\n\")\n    os.makedirs(os.path.dirname(log_dir), exist_ok=True)\n\n    # Get log file max size and backup count from environment variables\n    log_max_bytes = int(os.getenv(\"LOG_MAX_BYTES\", 10485760))  # Default 10MB\n    log_backup_count = int(os.getenv(\"LOG_BACKUP_COUNT\", 5))  # Default 5 backups\n\n    logging.config.dictConfig(\n        {\n            \"version\": 1,\n            \"disable_existing_loggers\": False,\n            \"formatters\": {\n                \"default\": {\n                    \"format\": 
\"%(levelname)s: %(message)s\",\n                },\n                \"detailed\": {\n                    \"format\": \"%(asctime)s - %(name)s - %(levelname)s - %(message)s\",\n                },\n            },\n            \"handlers\": {\n                \"console\": {\n                    \"formatter\": \"default\",\n                    \"class\": \"logging.StreamHandler\",\n                    \"stream\": \"ext://sys.stderr\",\n                },\n                \"file\": {\n                    \"formatter\": \"detailed\",\n                    \"class\": \"logging.handlers.RotatingFileHandler\",\n                    \"filename\": log_file_path,\n                    \"maxBytes\": log_max_bytes,\n                    \"backupCount\": log_backup_count,\n                    \"encoding\": \"utf-8\",\n                },\n            },\n            \"loggers\": {\n                \"lightrag\": {\n                    \"handlers\": [\"console\", \"file\"],\n                    \"level\": \"INFO\",\n                    \"propagate\": False,\n                },\n            },\n        }\n    )\n\n    # Set the logger level to INFO\n    logger.setLevel(logging.INFO)\n    # Enable verbose debug if needed\n    set_verbose_debug(os.getenv(\"VERBOSE\", \"false\").lower() == \"true\")\n\n\ndef create_sample_content_list():\n    \"\"\"\n    Create a simple content list for testing insert_content_list functionality\n\n    Returns:\n        List[Dict]: Sample content list with various content types\n\n    Note:\n        - img_path should be absolute path to the image file\n        - page_idx represents the page number where the content appears (0-based)\n    \"\"\"\n    content_list = [\n        # Introduction text\n        {\n            \"type\": \"text\",\n            \"text\": \"Welcome to the RAGAnything System Documentation. 
This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.\",\n            \"page_idx\": 0,  # Page number where this content appears\n        },\n        # System architecture image\n        {\n            \"type\": \"image\",\n            \"img_path\": \"/absolute/path/to/system_architecture.jpg\",  # IMPORTANT: Use absolute path to image file\n            \"image_caption\": [\"Figure 1: RAGAnything System Architecture\"],\n            \"image_footnote\": [\n                \"The architecture shows the complete pipeline from document parsing to multimodal query processing\"\n            ],\n            \"page_idx\": 1,  # Page number where this image appears\n        },\n        # Performance comparison table\n        {\n            \"type\": \"table\",\n            \"table_body\": \"\"\"| System | Accuracy | Processing Speed | Memory Usage |\n                            |--------|----------|------------------|--------------|\n                            | RAGAnything | 95.2% | 120ms | 2.1GB |\n                            | Traditional RAG | 87.3% | 180ms | 3.2GB |\n                            | Baseline System | 82.1% | 220ms | 4.1GB |\n                            | Simple Retrieval | 76.5% | 95ms | 1.8GB |\"\"\",\n            \"table_caption\": [\n                \"Table 1: Performance Comparison of Different RAG Systems\"\n            ],\n            \"table_footnote\": [\n                \"All tests conducted on the same hardware with identical test datasets\"\n            ],\n            \"page_idx\": 2,  # Page number where this table appears\n        },\n        # Mathematical formula\n        {\n            \"type\": \"equation\",\n            \"latex\": \"Relevance(d, q) = \\\\sum_{i=1}^{n} w_i \\\\cdot sim(t_i^d, t_i^q) \\\\cdot \\\\alpha_i\",\n            \"text\": \"Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance 
factors\",\n            \"page_idx\": 3,  # Page number where this equation appears\n        },\n        # Feature description\n        {\n            \"type\": \"text\",\n            \"text\": \"The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.\",\n            \"page_idx\": 4,  # Page number where this content appears\n        },\n        # Technical specifications table\n        {\n            \"type\": \"table\",\n            \"table_body\": \"\"\"| Feature | Specification |\n                            |---------|---------------|\n                            | Supported Formats | PDF, DOCX, PPTX, XLSX, Images |\n                            | Max Document Size | 100MB |\n                            | Concurrent Processing | Up to 8 documents |\n                            | Query Response Time | <200ms average |\n                            | Knowledge Graph Nodes | Up to 1M entities |\"\"\",\n            \"table_caption\": [\"Table 2: Technical Specifications\"],\n            \"table_footnote\": [\n                \"Specifications may vary based on hardware configuration\"\n            ],\n            \"page_idx\": 5,  # Page number where this table appears\n        },\n        # Conclusion\n        {\n            \"type\": \"text\",\n            \"text\": \"RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.\",\n            \"page_idx\": 6,  # Page number where this content appears\n        },\n    ]\n\n    return content_list\n\n\nasync def demo_insert_content_list(\n    api_key: str,\n    base_url: str = None,\n    working_dir: str = None,\n):\n    \"\"\"\n    Demonstrate content list insertion and querying with RAGAnything\n\n    Args:\n        api_key: OpenAI API key\n        base_url: Optional 
base URL for API\n        working_dir: Working directory for RAG storage\n    \"\"\"\n    try:\n        # Create RAGAnything configuration\n        config = RAGAnythingConfig(\n            working_dir=working_dir or \"./rag_storage\",\n            enable_image_processing=True,\n            enable_table_processing=True,\n            enable_equation_processing=True,\n            display_content_stats=True,  # Show content statistics\n        )\n\n        # Define LLM model function\n        def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n            return openai_complete_if_cache(\n                \"gpt-4o-mini\",\n                prompt,\n                system_prompt=system_prompt,\n                history_messages=history_messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n\n        # Define vision model function for image processing\n        def vision_model_func(\n            prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs\n        ):\n            if image_data:\n                return openai_complete_if_cache(\n                    \"gpt-4o\",\n                    \"\",\n                    system_prompt=None,\n                    history_messages=[],\n                    messages=[\n                        {\"role\": \"system\", \"content\": system_prompt}\n                        if system_prompt\n                        else None,\n                        {\n                            \"role\": \"user\",\n                            \"content\": [\n                                {\"type\": \"text\", \"text\": prompt},\n                                {\n                                    \"type\": \"image_url\",\n                                    \"image_url\": {\n                                        \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                    },\n                                
},\n                            ],\n                        }\n                        if image_data\n                        else {\"role\": \"user\", \"content\": prompt},\n                    ],\n                    api_key=api_key,\n                    base_url=base_url,\n                    **kwargs,\n                )\n            else:\n                return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n        # Define embedding function - using environment variables for configuration\n        embedding_dim = int(os.getenv(\"EMBEDDING_DIM\", \"3072\"))\n        embedding_model = os.getenv(\"EMBEDDING_MODEL\", \"text-embedding-3-large\")\n\n        embedding_func = EmbeddingFunc(\n            embedding_dim=embedding_dim,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=embedding_model,\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n\n        # Initialize RAGAnything\n        rag = RAGAnything(\n            config=config,\n            llm_model_func=llm_model_func,\n            vision_model_func=vision_model_func,\n            embedding_func=embedding_func,\n        )\n\n        # Create sample content list\n        logger.info(\"Creating sample content list...\")\n        content_list = create_sample_content_list()\n        logger.info(f\"Created content list with {len(content_list)} items\")\n\n        # Insert content list directly\n        logger.info(\"\\nInserting content list into RAGAnything...\")\n        await rag.insert_content_list(\n            content_list=content_list,\n            file_path=\"raganything_documentation.pdf\",  # Reference file name for citation\n            split_by_character=None,  # Optional text splitting\n            split_by_character_only=False,  # Optional text splitting mode\n            doc_id=\"demo-doc-001\",  # Custom document ID\n            
display_stats=True,  # Show content statistics\n        )\n        logger.info(\"Content list insertion completed!\")\n\n        # Example queries - demonstrating different query approaches\n        logger.info(\"\\nQuerying inserted content:\")\n\n        # 1. Pure text queries using aquery()\n        text_queries = [\n            \"What is RAGAnything and what are its main features?\",\n            \"How does RAGAnything compare to traditional RAG systems?\",\n            \"What are the technical specifications of the system?\",\n        ]\n\n        for query in text_queries:\n            logger.info(f\"\\n[Text Query]: {query}\")\n            result = await rag.aquery(query, mode=\"hybrid\")\n            logger.info(f\"Answer: {result}\")\n\n        # 2. Multimodal query with specific multimodal content using aquery_with_multimodal()\n        logger.info(\n            \"\\n[Multimodal Query]: Analyzing new performance data against existing benchmarks\"\n        )\n        multimodal_result = await rag.aquery_with_multimodal(\n            \"Compare this new performance data with the existing benchmark results in the documentation\",\n            multimodal_content=[\n                {\n                    \"type\": \"table\",\n                    \"table_data\": \"\"\"Method,Accuracy,Speed,Memory\n                                New_Approach,97.1%,110ms,1.9GB\n                                Enhanced_RAG,91.4%,140ms,2.5GB\"\"\",\n                    \"table_caption\": \"Latest experimental results\",\n                }\n            ],\n            mode=\"hybrid\",\n        )\n        logger.info(f\"Answer: {multimodal_result}\")\n\n        # 3. 
Another multimodal query with equation content\n        logger.info(\"\\n[Multimodal Query]: Mathematical formula analysis\")\n        equation_result = await rag.aquery_with_multimodal(\n            \"How does this similarity formula relate to the relevance scoring mentioned in the documentation?\",\n            multimodal_content=[\n                {\n                    \"type\": \"equation\",\n                    \"latex\": \"sim(a, b) = \\\\frac{a \\\\cdot b}{||a|| \\\\times ||b||} + \\\\beta \\\\cdot context\\\\_weight\",\n                    \"equation_caption\": \"Enhanced cosine similarity with context weighting\",\n                }\n            ],\n            mode=\"hybrid\",\n        )\n        logger.info(f\"Answer: {equation_result}\")\n\n        # 4. Insert another content list with different document ID\n        logger.info(\"\\nInserting additional content list...\")\n        additional_content = [\n            {\n                \"type\": \"text\",\n                \"text\": \"This is additional documentation about advanced features and configuration options.\",\n                \"page_idx\": 0,  # Page number where this content appears\n            },\n            {\n                \"type\": \"table\",\n                \"table_body\": \"\"\"| Configuration | Default Value | Range |\n                                    |---------------|---------------|-------|\n                                    | Chunk Size | 512 tokens | 128-2048 |\n                                    | Context Window | 4096 tokens | 1024-8192 |\n                                    | Batch Size | 32 | 1-128 |\"\"\",\n                \"table_caption\": [\"Advanced Configuration Parameters\"],\n                \"page_idx\": 1,  # Page number where this table appears\n            },\n        ]\n\n        await rag.insert_content_list(\n            content_list=additional_content,\n            file_path=\"advanced_configuration.pdf\",\n            doc_id=\"demo-doc-002\",  # 
Different document ID\n        )\n\n        # Query combined knowledge base\n        logger.info(\"\\n[Combined Query]: What configuration options are available?\")\n        combined_result = await rag.aquery(\n            \"What configuration options are available and what are their default values?\",\n            mode=\"hybrid\",\n        )\n        logger.info(f\"Answer: {combined_result}\")\n\n    except Exception as e:\n        logger.error(f\"Error in content list insertion demo: {str(e)}\")\n        import traceback\n\n        logger.error(traceback.format_exc())\n\n\ndef main():\n    \"\"\"Main function to run the example\"\"\"\n    parser = argparse.ArgumentParser(description=\"Insert Content List Example\")\n    parser.add_argument(\n        \"--working_dir\", \"-w\", default=\"./rag_storage\", help=\"Working directory path\"\n    )\n    parser.add_argument(\n        \"--api-key\",\n        default=os.getenv(\"LLM_BINDING_API_KEY\"),\n        help=\"OpenAI API key (defaults to LLM_BINDING_API_KEY env var)\",\n    )\n    parser.add_argument(\n        \"--base-url\",\n        default=os.getenv(\"LLM_BINDING_HOST\"),\n        help=\"Optional base URL for API\",\n    )\n\n    args = parser.parse_args()\n\n    # Check if API key is provided\n    if not args.api_key:\n        logger.error(\"Error: OpenAI API key is required\")\n        logger.error(\"Set api key environment variable or use --api-key option\")\n        return\n\n    # Run the demo\n    asyncio.run(\n        demo_insert_content_list(\n            args.api_key,\n            args.base_url,\n            args.working_dir,\n        )\n    )\n\n\nif __name__ == \"__main__\":\n    # Configure logging first\n    configure_logging()\n\n    print(\"RAGAnything Insert Content List Example\")\n    print(\"=\" * 45)\n    print(\"Demonstrating direct content list insertion without document parsing\")\n    print(\"=\" * 45)\n\n    main()\n"
  },
  {
    "path": "examples/lmstudio_integration_example.py",
    "content": "\"\"\"\nLM Studio Integration Example with RAG-Anything\n\nThis example demonstrates how to integrate LM Studio with RAG-Anything for local\ntext document processing and querying.\n\nRequirements:\n- LM Studio running locally with server enabled\n- OpenAI Python package: pip install openai\n- RAG-Anything installed: pip install raganything\n\nEnvironment Setup:\nCreate a .env file with:\nLLM_BINDING=lmstudio\nLLM_MODEL=openai/gpt-oss-20b\nLLM_BINDING_HOST=http://localhost:1234/v1\nLLM_BINDING_API_KEY=lm-studio\nEMBEDDING_BINDING=lmstudio\nEMBEDDING_MODEL=text-embedding-nomic-embed-text-v1.5\nEMBEDDING_BINDING_HOST=http://localhost:1234/v1\nEMBEDDING_BINDING_API_KEY=lm-studio\n\"\"\"\n\nimport os\nimport uuid\nimport asyncio\nfrom typing import List, Dict, Optional\nfrom dotenv import load_dotenv\nfrom openai import AsyncOpenAI\n\n# Load environment variables\nload_dotenv()\n\n# RAG-Anything imports\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.utils import EmbeddingFunc\nfrom lightrag.llm.openai import openai_complete_if_cache\n\nLM_BASE_URL = os.getenv(\"LLM_BINDING_HOST\", \"http://localhost:1234/v1\")\nLM_API_KEY = os.getenv(\"LLM_BINDING_API_KEY\", \"lm-studio\")\nLM_MODEL_NAME = os.getenv(\"LLM_MODEL\", \"openai/gpt-oss-20b\")\nLM_EMBED_MODEL = os.getenv(\"EMBEDDING_MODEL\", \"text-embedding-nomic-embed-text-v1.5\")\n\n\nasync def lmstudio_llm_model_func(\n    prompt: str,\n    system_prompt: Optional[str] = None,\n    history_messages: List[Dict] = None,\n    **kwargs,\n) -> str:\n    \"\"\"Top-level LLM function for LightRAG (pickle-safe).\"\"\"\n    return await openai_complete_if_cache(\n        model=LM_MODEL_NAME,\n        prompt=prompt,\n        system_prompt=system_prompt,\n        history_messages=history_messages or [],\n        base_url=LM_BASE_URL,\n        api_key=LM_API_KEY,\n        **kwargs,\n    )\n\n\nasync def lmstudio_embedding_async(texts: List[str]) -> List[List[float]]:\n    \"\"\"Top-level 
embedding function for LightRAG (pickle-safe).\"\"\"\n    from lightrag.llm.openai import openai_embed\n\n    embeddings = await openai_embed(\n        texts=texts,\n        model=LM_EMBED_MODEL,\n        base_url=LM_BASE_URL,\n        api_key=LM_API_KEY,\n    )\n    return embeddings.tolist()\n\n\nclass LMStudioRAGIntegration:\n    \"\"\"Integration class for LM Studio with RAG-Anything.\"\"\"\n\n    def __init__(self):\n        # LM Studio configuration using standard LLM_BINDING variables\n        self.base_url = os.getenv(\"LLM_BINDING_HOST\", \"http://localhost:1234/v1\")\n        self.api_key = os.getenv(\"LLM_BINDING_API_KEY\", \"lm-studio\")\n        self.model_name = os.getenv(\"LLM_MODEL\", \"openai/gpt-oss-20b\")\n        self.embedding_model = os.getenv(\n            \"EMBEDDING_MODEL\", \"text-embedding-nomic-embed-text-v1.5\"\n        )\n\n        # RAG-Anything configuration\n        # Use a fresh working directory each run to avoid legacy doc_status schema conflicts\n        self.config = RAGAnythingConfig(\n            working_dir=f\"./rag_storage_lmstudio/{uuid.uuid4()}\",\n            parser=\"mineru\",\n            parse_method=\"auto\",\n            enable_image_processing=False,\n            enable_table_processing=True,\n            enable_equation_processing=True,\n        )\n        print(f\"📁 Using working_dir: {self.config.working_dir}\")\n\n        self.rag = None\n\n    async def test_connection(self) -> bool:\n        \"\"\"Test LM Studio connection.\"\"\"\n        try:\n            print(f\"🔌 Testing LM Studio connection at: {self.base_url}\")\n            client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)\n            models = await client.models.list()\n            print(f\"✅ Connected successfully! 
Found {len(models.data)} models\")\n\n            # Show available models\n            print(\"📊 Available models:\")\n            for i, model in enumerate(models.data[:5]):\n                marker = \"🎯\" if model.id == self.model_name else \"  \"\n                print(f\"{marker} {i+1}. {model.id}\")\n\n            if len(models.data) > 5:\n                print(f\"  ... and {len(models.data) - 5} more models\")\n\n            return True\n        except Exception as e:\n            print(f\"❌ Connection failed: {str(e)}\")\n            print(\"\\n💡 Troubleshooting tips:\")\n            print(\"1. Ensure LM Studio is running\")\n            print(\"2. Start the local server in LM Studio\")\n            print(\"3. Load a model or enable just-in-time loading\")\n            print(f\"4. Verify server address: {self.base_url}\")\n            return False\n        finally:\n            try:\n                await client.close()\n            except Exception:\n                pass\n\n    async def test_chat_completion(self) -> bool:\n        \"\"\"Test basic chat functionality.\"\"\"\n        try:\n            print(f\"💬 Testing chat with model: {self.model_name}\")\n            client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)\n            response = await client.chat.completions.create(\n                model=self.model_name,\n                messages=[\n                    {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n                    {\n                        \"role\": \"user\",\n                        \"content\": \"Hello! 
Please confirm you're working and tell me your capabilities.\",\n                    },\n                ],\n                max_tokens=100,\n                temperature=0.7,\n            )\n\n            result = response.choices[0].message.content.strip()\n            print(\"✅ Chat test successful!\")\n            print(f\"Response: {result}\")\n            return True\n        except Exception as e:\n            print(f\"❌ Chat test failed: {str(e)}\")\n            return False\n        finally:\n            try:\n                await client.close()\n            except Exception:\n                pass\n\n    # Deprecated factory helpers removed to reduce redundancy\n\n    def embedding_func_factory(self):\n        \"\"\"Create a completely serializable embedding function.\"\"\"\n        return EmbeddingFunc(\n            embedding_dim=768,  # nomic-embed-text-v1.5 default dimension\n            max_token_size=8192,  # nomic-embed-text-v1.5 context length\n            func=lmstudio_embedding_async,\n        )\n\n    async def initialize_rag(self):\n        \"\"\"Initialize RAG-Anything with LM Studio functions.\"\"\"\n        print(\"Initializing RAG-Anything with LM Studio...\")\n\n        try:\n            self.rag = RAGAnything(\n                config=self.config,\n                llm_model_func=lmstudio_llm_model_func,\n                embedding_func=self.embedding_func_factory(),\n            )\n\n            # Compatibility: avoid writing unknown field 'multimodal_processed' to LightRAG doc_status\n            # Older LightRAG versions may not accept this extra field in DocProcessingStatus\n            async def _noop_mark_multimodal(doc_id: str):\n                return None\n\n            self.rag._mark_multimodal_processing_complete = _noop_mark_multimodal\n\n            print(\"✅ RAG-Anything initialized successfully!\")\n            return True\n        except Exception as e:\n            print(f\"❌ RAG initialization failed: {str(e)}\")\n           
 return False\n\n    async def process_document_example(self, file_path: str):\n        \"\"\"Example: Process a document with LM Studio backend.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized. Call initialize_rag() first.\")\n            return\n\n        try:\n            print(f\"📄 Processing document: {file_path}\")\n            await self.rag.process_document_complete(\n                file_path=file_path,\n                output_dir=\"./output_lmstudio\",\n                parse_method=\"auto\",\n                display_stats=True,\n            )\n            print(\"✅ Document processing completed!\")\n        except Exception as e:\n            print(f\"❌ Document processing failed: {str(e)}\")\n\n    async def query_examples(self):\n        \"\"\"Example queries with different modes.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized. Call initialize_rag() first.\")\n            return\n\n        # Example queries\n        queries = [\n            (\"What are the main topics in the processed documents?\", \"hybrid\"),\n            (\"Summarize any tables or data found in the documents\", \"local\"),\n            (\"What images or figures are mentioned?\", \"global\"),\n        ]\n\n        print(\"\\n🔍 Running example queries...\")\n        for query, mode in queries:\n            try:\n                print(f\"\\nQuery ({mode}): {query}\")\n                result = await self.rag.aquery(query, mode=mode)\n                print(f\"Answer: {result[:200]}...\")\n            except Exception as e:\n                print(f\"❌ Query failed: {str(e)}\")\n\n    async def simple_query_example(self):\n        \"\"\"Example basic text query with sample content.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized\")\n            return\n\n        try:\n            print(\"\\nAdding sample content for testing...\")\n\n            # Create content list in the format expected by RAGAnything\n      
      content_list = [\n                {\n                    \"type\": \"text\",\n                    \"text\": \"\"\"LM Studio Integration with RAG-Anything\n\nThis integration demonstrates how to connect LM Studio's local AI models with RAG-Anything's document processing capabilities. The system uses:\n\n- LM Studio for local LLM inference\n- nomic-embed-text-v1.5 for embeddings (768 dimensions)\n- RAG-Anything for document processing and retrieval\n\nKey benefits include:\n- Privacy: All processing happens locally\n- Performance: Direct API access to local models\n- Flexibility: Support for various document formats\n- Cost-effective: No external API usage\"\"\",\n                    \"page_idx\": 0,\n                }\n            ]\n\n            # Insert the content list using the correct method\n            await self.rag.insert_content_list(\n                content_list=content_list,\n                file_path=\"lmstudio_integration_demo.txt\",\n                # Use a unique doc_id to avoid collisions and doc_status reuse across runs\n                doc_id=f\"demo-content-{uuid.uuid4()}\",\n                display_stats=True,\n            )\n            print(\"✅ Sample content added to knowledge base\")\n\n            print(\"\\nTesting basic text query...\")\n\n            # Simple text query example\n            result = await self.rag.aquery(\n                \"What are the key benefits of this LM Studio integration?\",\n                mode=\"hybrid\",\n            )\n            print(f\"✅ Query result: {result[:300]}...\")\n\n        except Exception as e:\n            print(f\"❌ Query failed: {str(e)}\")\n\n\nasync def main():\n    \"\"\"Main example function.\"\"\"\n    print(\"=\" * 70)\n    print(\"LM Studio + RAG-Anything Integration Example\")\n    print(\"=\" * 70)\n\n    # Initialize integration\n    integration = LMStudioRAGIntegration()\n\n    # Test connection\n    if not await integration.test_connection():\n        return False\n\n   
 print()\n    if not await integration.test_chat_completion():\n        return False\n\n    # Initialize RAG\n    print(\"\\n\" + \"─\" * 50)\n    if not await integration.initialize_rag():\n        return False\n\n    # Example document processing (uncomment and provide a real file path)\n    # await integration.process_document_example(\"path/to/your/document.pdf\")\n\n    # Example queries (uncomment after processing documents)\n    # await integration.query_examples()\n\n    # Example basic query\n    await integration.simple_query_example()\n\n    print(\"\\n\" + \"=\" * 70)\n    print(\"Integration example completed successfully!\")\n    print(\"=\" * 70)\n\n    return True\n\n\nif __name__ == \"__main__\":\n    print(\"🚀 Starting LM Studio integration example...\")\n    success = asyncio.run(main())\n\n    exit(0 if success else 1)\n"
  },
  {
    "path": "examples/modalprocessors_example.py",
    "content": "\"\"\"\nExample of directly using modal processors\n\nThis example demonstrates how to use RAG-Anything's modal processors directly without going through MinerU.\n\"\"\"\n\nimport asyncio\nimport argparse\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc\nfrom lightrag.kg.shared_storage import initialize_pipeline_status\nfrom lightrag import LightRAG\nfrom raganything.modalprocessors import (\n    ImageModalProcessor,\n    TableModalProcessor,\n    EquationModalProcessor,\n)\n\nWORKING_DIR = \"./rag_storage\"\n\n\ndef get_llm_model_func(api_key: str, base_url: str = None):\n    return (\n        lambda prompt,\n        system_prompt=None,\n        history_messages=[],\n        **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n    )\n\n\ndef get_vision_model_func(api_key: str, base_url: str = None):\n    return (\n        lambda prompt,\n        system_prompt=None,\n        history_messages=[],\n        image_data=None,\n        **kwargs: openai_complete_if_cache(\n            \"gpt-4o\",\n            \"\",\n            system_prompt=None,\n            history_messages=[],\n            messages=[\n                {\"role\": \"system\", \"content\": system_prompt} if system_prompt else None,\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\"type\": \"text\", \"text\": prompt},\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                            },\n                        },\n                    ],\n                }\n            
    if image_data\n                else {\"role\": \"user\", \"content\": prompt},\n            ],\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n        if image_data\n        else openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        )\n    )\n\n\nasync def process_image_example(lightrag: LightRAG, vision_model_func):\n    \"\"\"Example of processing an image\"\"\"\n    # Create image processor\n    image_processor = ImageModalProcessor(\n        lightrag=lightrag, modal_caption_func=vision_model_func\n    )\n\n    # Prepare image content\n    image_content = {\n        \"img_path\": \"image.jpg\",\n        \"image_caption\": [\"Example image caption\"],\n        \"image_footnote\": [\"Example image footnote\"],\n    }\n\n    # Process image\n    (description, entity_info, _) = await image_processor.process_multimodal_content(\n        modal_content=image_content,\n        content_type=\"image\",\n        file_path=\"image_example.jpg\",\n        entity_name=\"Example Image\",\n    )\n\n    print(\"Image Processing Results:\")\n    print(f\"Description: {description}\")\n    print(f\"Entity Info: {entity_info}\")\n\n\nasync def process_table_example(lightrag: LightRAG, llm_model_func):\n    \"\"\"Example of processing a table\"\"\"\n    # Create table processor\n    table_processor = TableModalProcessor(\n        lightrag=lightrag, modal_caption_func=llm_model_func\n    )\n\n    # Prepare table content\n    table_content = {\n        \"table_body\": \"\"\"\n        | Name | Age | Occupation |\n        |------|-----|------------|\n        | John | 25  | Engineer   |\n        | Mary | 30  | Designer   |\n        \"\"\",\n        \"table_caption\": [\"Employee Information Table\"],\n        \"table_footnote\": 
[\"Data updated as of 2024\"],\n    }\n\n    # Process table\n    (description, entity_info, _) = await table_processor.process_multimodal_content(\n        modal_content=table_content,\n        content_type=\"table\",\n        file_path=\"table_example.md\",\n        entity_name=\"Employee Table\",\n    )\n\n    print(\"\\nTable Processing Results:\")\n    print(f\"Description: {description}\")\n    print(f\"Entity Info: {entity_info}\")\n\n\nasync def process_equation_example(lightrag: LightRAG, llm_model_func):\n    \"\"\"Example of processing a mathematical equation\"\"\"\n    # Create equation processor\n    equation_processor = EquationModalProcessor(\n        lightrag=lightrag, modal_caption_func=llm_model_func\n    )\n\n    # Prepare equation content\n    equation_content = {\"text\": \"E = mc^2\", \"text_format\": \"LaTeX\"}\n\n    # Process equation\n    (description, entity_info, _) = await equation_processor.process_multimodal_content(\n        modal_content=equation_content,\n        content_type=\"equation\",\n        file_path=\"equation_example.txt\",\n        entity_name=\"Mass-Energy Equivalence\",\n    )\n\n    print(\"\\nEquation Processing Results:\")\n    print(f\"Description: {description}\")\n    print(f\"Entity Info: {entity_info}\")\n\n\nasync def initialize_rag(api_key: str, base_url: str = None):\n    # Use environment variables for embedding configuration\n    import os\n\n    embedding_dim = int(os.getenv(\"EMBEDDING_DIM\", \"3072\"))\n    embedding_model = os.getenv(\"EMBEDDING_MODEL\", \"text-embedding-3-large\")\n\n    rag = LightRAG(\n        working_dir=WORKING_DIR,\n        embedding_func=EmbeddingFunc(\n            embedding_dim=embedding_dim,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=embedding_model,\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        ),\n        llm_model_func=lambda prompt,\n    
    system_prompt=None,\n        history_messages=[],\n        **kwargs: openai_complete_if_cache(\n            \"gpt-4o-mini\",\n            prompt,\n            system_prompt=system_prompt,\n            history_messages=history_messages,\n            api_key=api_key,\n            base_url=base_url,\n            **kwargs,\n        ),\n    )\n\n    await rag.initialize_storages()\n    await initialize_pipeline_status()\n\n    return rag\n\n\ndef main():\n    \"\"\"Main function to run the example\"\"\"\n    parser = argparse.ArgumentParser(description=\"Modal Processors Example\")\n    parser.add_argument(\"--api-key\", required=True, help=\"OpenAI API key\")\n    parser.add_argument(\"--base-url\", help=\"Optional base URL for API\")\n    parser.add_argument(\n        \"--working-dir\", \"-w\", default=WORKING_DIR, help=\"Working directory path\"\n    )\n\n    args = parser.parse_args()\n\n    # Run examples\n    asyncio.run(main_async(args.api_key, args.base_url))\n\n\nasync def main_async(api_key: str, base_url: str = None):\n    # Initialize LightRAG\n    lightrag = await initialize_rag(api_key, base_url)\n\n    # Get model functions\n    llm_model_func = get_llm_model_func(api_key, base_url)\n    vision_model_func = get_vision_model_func(api_key, base_url)\n\n    # Run examples\n    await process_image_example(lightrag, vision_model_func)\n    await process_table_example(lightrag, llm_model_func)\n    await process_equation_example(lightrag, llm_model_func)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "examples/office_document_test.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nOffice Document Parsing Test Script for RAG-Anything\n\nThis script demonstrates how to parse various Office document formats\nusing MinerU, including DOC, DOCX, PPT, PPTX, XLS, and XLSX files.\n\nRequirements:\n- LibreOffice installed on the system\n- RAG-Anything package\n\nUsage:\n    python office_document_test.py --file path/to/office/document.docx\n\"\"\"\n\nimport argparse\nimport asyncio\nimport sys\nfrom pathlib import Path\nfrom raganything import RAGAnything\n\n\ndef check_libreoffice_installation():\n    \"\"\"Check if LibreOffice is installed and available\"\"\"\n    import subprocess\n\n    for cmd in [\"libreoffice\", \"soffice\"]:\n        try:\n            result = subprocess.run(\n                [cmd, \"--version\"], capture_output=True, check=True, timeout=10\n            )\n            print(f\"✅ LibreOffice found: {result.stdout.decode().strip()}\")\n            return True\n        except (\n            subprocess.CalledProcessError,\n            FileNotFoundError,\n            subprocess.TimeoutExpired,\n        ):\n            continue\n\n    print(\"❌ LibreOffice not found. 
Please install LibreOffice:\")\n    print(\"  - Windows: Download from https://www.libreoffice.org/download/download/\")\n    print(\"  - macOS: brew install --cask libreoffice\")\n    print(\"  - Ubuntu/Debian: sudo apt-get install libreoffice\")\n    print(\"  - CentOS/RHEL: sudo yum install libreoffice\")\n    return False\n\n\nasync def test_office_document_parsing(file_path: str):\n    \"\"\"Test Office document parsing with MinerU\"\"\"\n\n    print(f\"🧪 Testing Office document parsing: {file_path}\")\n\n    # Check if file exists and is a supported Office format\n    file_path = Path(file_path)\n    if not file_path.exists():\n        print(f\"❌ File does not exist: {file_path}\")\n        return False\n\n    supported_extensions = {\".doc\", \".docx\", \".ppt\", \".pptx\", \".xls\", \".xlsx\"}\n    if file_path.suffix.lower() not in supported_extensions:\n        print(f\"❌ Unsupported file format: {file_path.suffix}\")\n        print(f\"   Supported formats: {', '.join(supported_extensions)}\")\n        return False\n\n    print(f\"📄 File format: {file_path.suffix.upper()}\")\n    print(f\"📏 File size: {file_path.stat().st_size / 1024:.1f} KB\")\n\n    # Initialize RAGAnything (only for parsing functionality)\n    rag = RAGAnything()\n\n    try:\n        # Test document parsing with MinerU\n        print(\"\\n🔄 Testing document parsing with MinerU...\")\n        content_list, md_content = await rag.parse_document(\n            file_path=str(file_path),\n            output_dir=\"./test_output\",\n            parse_method=\"auto\",\n            display_stats=True,\n        )\n\n        print(\"✅ Parsing successful!\")\n        print(f\"   📊 Content blocks: {len(content_list)}\")\n        print(f\"   📝 Markdown length: {len(md_content)} characters\")\n\n        # Analyze content types\n        content_types = {}\n        for item in content_list:\n            if isinstance(item, dict):\n                content_type = item.get(\"type\", \"unknown\")\n          
      content_types[content_type] = content_types.get(content_type, 0) + 1\n\n        if content_types:\n            print(\"   📋 Content distribution:\")\n            for content_type, count in sorted(content_types.items()):\n                print(f\"      • {content_type}: {count}\")\n\n        # Display some parsed content preview\n        if md_content.strip():\n            print(\"\\n📄 Parsed content preview (first 500 characters):\")\n            preview = md_content.strip()[:500]\n            print(f\"   {preview}{'...' if len(md_content) > 500 else ''}\")\n\n        # Display some structured content examples\n        text_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"text\"\n        ]\n        if text_items:\n            print(\"\\n📝 Sample text blocks:\")\n            for i, item in enumerate(text_items[:3], 1):\n                text_content = item.get(\"text\", \"\")\n                if text_content.strip():\n                    preview = text_content.strip()[:200]\n                    print(\n                        f\"   {i}. {preview}{'...' if len(text_content) > 200 else ''}\"\n                    )\n\n        # Check for images\n        image_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"image\"\n        ]\n        if image_items:\n            print(f\"\\n🖼️  Found {len(image_items)} image(s):\")\n            for i, item in enumerate(image_items, 1):\n                print(f\"   {i}. 
Image path: {item.get('img_path', 'N/A')}\")\n\n        # Check for tables\n        table_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"table\"\n        ]\n        if table_items:\n            print(f\"\\n📊 Found {len(table_items)} table(s):\")\n            for i, item in enumerate(table_items, 1):\n                table_body = item.get(\"table_body\", \"\")\n                row_count = len(table_body.split(\"\\n\"))\n                print(f\"   {i}. Table with {row_count} rows\")\n\n        print(\"\\n🎉 Office document parsing test completed successfully!\")\n        print(\"📁 Output files saved to: ./test_output\")\n        return True\n\n    except Exception as e:\n        print(f\"\\n❌ Office document parsing failed: {str(e)}\")\n        import traceback\n\n        print(f\"   Full error: {traceback.format_exc()}\")\n        return False\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Test Office document parsing with MinerU\"\n    )\n    parser.add_argument(\"--file\", help=\"Path to the Office document to test\")\n    parser.add_argument(\n        \"--check-libreoffice\",\n        action=\"store_true\",\n        help=\"Only check LibreOffice installation\",\n    )\n\n    args = parser.parse_args()\n\n    # Check LibreOffice installation\n    print(\"🔧 Checking LibreOffice installation...\")\n    if not check_libreoffice_installation():\n        return 1\n\n    if args.check_libreoffice:\n        print(\"✅ LibreOffice installation check passed!\")\n        return 0\n\n    # If not just checking dependencies, file argument is required\n    if not args.file:\n        print(\n            \"❌ Error: --file argument is required when not using --check-libreoffice\"\n        )\n        parser.print_help()\n        return 1\n\n    # Run the parsing test\n    try:\n        success = 
asyncio.run(test_office_document_parsing(args.file))\n        return 0 if success else 1\n    except KeyboardInterrupt:\n        print(\"\\n⏹️ Test interrupted by user\")\n        return 1\n    except Exception as e:\n        print(f\"\\n❌ Unexpected error: {str(e)}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "examples/raganything_example.py",
    "content": "#!/usr/bin/env python\n\"\"\"\nExample script demonstrating parser integration with RAGAnything\n\nThis example shows how to:\n1. Process documents with RAGAnything using configurable parsers\n2. Perform pure text queries using aquery() method\n3. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method\n4. Handle different types of multimodal content (tables, equations) in queries\n\"\"\"\n\nimport os\nimport argparse\nimport asyncio\nimport logging\nimport logging.config\nfrom pathlib import Path\n\n# Add project root directory to Python path\nimport sys\n\nsys.path.append(str(Path(__file__).parent.parent))\n\nfrom lightrag.llm.openai import openai_complete_if_cache, openai_embed\nfrom lightrag.utils import EmbeddingFunc, logger, set_verbose_debug\nfrom raganything import RAGAnything, RAGAnythingConfig\n\nfrom dotenv import load_dotenv\n\nload_dotenv(dotenv_path=\".env\", override=False)\n\n\ndef configure_logging():\n    \"\"\"Configure logging for the application\"\"\"\n    # Get log directory path from environment variable or use current directory\n    log_dir = os.getenv(\"LOG_DIR\", os.getcwd())\n    log_file_path = os.path.abspath(os.path.join(log_dir, \"raganything_example.log\"))\n\n    print(f\"\\nRAGAnything example log file: {log_file_path}\\n\")\n    os.makedirs(os.path.dirname(log_dir), exist_ok=True)\n\n    # Get log file max size and backup count from environment variables\n    log_max_bytes = int(os.getenv(\"LOG_MAX_BYTES\", 10485760))  # Default 10MB\n    log_backup_count = int(os.getenv(\"LOG_BACKUP_COUNT\", 5))  # Default 5 backups\n\n    logging.config.dictConfig(\n        {\n            \"version\": 1,\n            \"disable_existing_loggers\": False,\n            \"formatters\": {\n                \"default\": {\n                    \"format\": \"%(levelname)s: %(message)s\",\n                },\n                \"detailed\": {\n                    \"format\": \"%(asctime)s - %(name)s 
- %(levelname)s - %(message)s\",\n                },\n            },\n            \"handlers\": {\n                \"console\": {\n                    \"formatter\": \"default\",\n                    \"class\": \"logging.StreamHandler\",\n                    \"stream\": \"ext://sys.stderr\",\n                },\n                \"file\": {\n                    \"formatter\": \"detailed\",\n                    \"class\": \"logging.handlers.RotatingFileHandler\",\n                    \"filename\": log_file_path,\n                    \"maxBytes\": log_max_bytes,\n                    \"backupCount\": log_backup_count,\n                    \"encoding\": \"utf-8\",\n                },\n            },\n            \"loggers\": {\n                \"lightrag\": {\n                    \"handlers\": [\"console\", \"file\"],\n                    \"level\": \"INFO\",\n                    \"propagate\": False,\n                },\n            },\n        }\n    )\n\n    # Set the logger level to INFO\n    logger.setLevel(logging.INFO)\n    # Enable verbose debug if needed\n    set_verbose_debug(os.getenv(\"VERBOSE\", \"false\").lower() == \"true\")\n\n\nasync def process_with_rag(\n    file_path: str,\n    output_dir: str,\n    api_key: str,\n    base_url: str = None,\n    working_dir: str = None,\n    parser: str = None,\n):\n    \"\"\"\n    Process document with RAGAnything\n\n    Args:\n        file_path: Path to the document\n        output_dir: Output directory for RAG results\n        api_key: OpenAI API key\n        base_url: Optional base URL for API\n        working_dir: Working directory for RAG storage\n    \"\"\"\n    try:\n        # Create RAGAnything configuration\n        config = RAGAnythingConfig(\n            working_dir=working_dir or \"./rag_storage\",\n            parser=parser,  # Parser selection: mineru, docling, or paddleocr\n            parse_method=\"auto\",  # Parse method: auto, ocr, or txt\n            enable_image_processing=True,\n            
enable_table_processing=True,\n            enable_equation_processing=True,\n        )\n\n        # Define LLM model function\n        def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):\n            return openai_complete_if_cache(\n                \"gpt-4o-mini\",\n                prompt,\n                system_prompt=system_prompt,\n                history_messages=history_messages,\n                api_key=api_key,\n                base_url=base_url,\n                **kwargs,\n            )\n\n        # Define vision model function for image processing\n        def vision_model_func(\n            prompt,\n            system_prompt=None,\n            history_messages=[],\n            image_data=None,\n            messages=None,\n            **kwargs,\n        ):\n            # If messages format is provided (for multimodal VLM enhanced query), use it directly\n            if messages:\n                return openai_complete_if_cache(\n                    \"gpt-4o\",\n                    \"\",\n                    system_prompt=None,\n                    history_messages=[],\n                    messages=messages,\n                    api_key=api_key,\n                    base_url=base_url,\n                    **kwargs,\n                )\n            # Traditional single image format\n            elif image_data:\n                return openai_complete_if_cache(\n                    \"gpt-4o\",\n                    \"\",\n                    system_prompt=None,\n                    history_messages=[],\n                    messages=[\n                        {\"role\": \"system\", \"content\": system_prompt}\n                        if system_prompt\n                        else None,\n                        {\n                            \"role\": \"user\",\n                            \"content\": [\n                                {\"type\": \"text\", \"text\": prompt},\n                                {\n                        
            \"type\": \"image_url\",\n                                    \"image_url\": {\n                                        \"url\": f\"data:image/jpeg;base64,{image_data}\"\n                                    },\n                                },\n                            ],\n                        }\n                        if image_data\n                        else {\"role\": \"user\", \"content\": prompt},\n                    ],\n                    api_key=api_key,\n                    base_url=base_url,\n                    **kwargs,\n                )\n            # Pure text format\n            else:\n                return llm_model_func(prompt, system_prompt, history_messages, **kwargs)\n\n        # Define embedding function - using environment variables for configuration\n        embedding_dim = int(os.getenv(\"EMBEDDING_DIM\", \"3072\"))\n        embedding_model = os.getenv(\"EMBEDDING_MODEL\", \"text-embedding-3-large\")\n\n        embedding_func = EmbeddingFunc(\n            embedding_dim=embedding_dim,\n            max_token_size=8192,\n            func=lambda texts: openai_embed.func(\n                texts,\n                model=embedding_model,\n                api_key=api_key,\n                base_url=base_url,\n            ),\n        )\n\n        # Initialize RAGAnything with new dataclass structure\n        rag = RAGAnything(\n            config=config,\n            llm_model_func=llm_model_func,\n            vision_model_func=vision_model_func,\n            embedding_func=embedding_func,\n        )\n\n        # Process document\n        await rag.process_document_complete(\n            file_path=file_path, output_dir=output_dir, parse_method=\"auto\"\n        )\n\n        # Example queries - demonstrating different query approaches\n        logger.info(\"\\nQuerying processed document:\")\n\n        # 1. 
Pure text queries using aquery()\n        text_queries = [\n            \"What is the main content of the document?\",\n            \"What are the key topics discussed?\",\n        ]\n\n        for query in text_queries:\n            logger.info(f\"\\n[Text Query]: {query}\")\n            result = await rag.aquery(query, mode=\"hybrid\")\n            logger.info(f\"Answer: {result}\")\n\n        # 2. Multimodal query with specific multimodal content using aquery_with_multimodal()\n        logger.info(\n            \"\\n[Multimodal Query]: Analyzing performance data in context of document\"\n        )\n        multimodal_result = await rag.aquery_with_multimodal(\n            \"Compare this performance data with any similar results mentioned in the document\",\n            multimodal_content=[\n                {\n                    \"type\": \"table\",\n                    \"table_data\": \"\"\"Method,Accuracy,Processing_Time\n                                RAGAnything,95.2%,120ms\n                                Traditional_RAG,87.3%,180ms\n                                Baseline,82.1%,200ms\"\"\",\n                    \"table_caption\": \"Performance comparison results\",\n                }\n            ],\n            mode=\"hybrid\",\n        )\n        logger.info(f\"Answer: {multimodal_result}\")\n\n        # 3. 
Another multimodal query with equation content\n        logger.info(\"\\n[Multimodal Query]: Mathematical formula analysis\")\n        equation_result = await rag.aquery_with_multimodal(\n            \"Explain this formula and relate it to any mathematical concepts in the document\",\n            multimodal_content=[\n                {\n                    \"type\": \"equation\",\n                    \"latex\": \"F1 = 2 \\\\cdot \\\\frac{precision \\\\cdot recall}{precision + recall}\",\n                    \"equation_caption\": \"F1-score calculation formula\",\n                }\n            ],\n            mode=\"hybrid\",\n        )\n        logger.info(f\"Answer: {equation_result}\")\n\n    except Exception as e:\n        logger.error(f\"Error processing with RAG: {str(e)}\")\n        import traceback\n\n        logger.error(traceback.format_exc())\n\n\ndef main():\n    \"\"\"Main function to run the example\"\"\"\n    parser = argparse.ArgumentParser(description=\"MinerU RAG Example\")\n    parser.add_argument(\"file_path\", help=\"Path to the document to process\")\n    parser.add_argument(\n        \"--working_dir\", \"-w\", default=\"./rag_storage\", help=\"Working directory path\"\n    )\n    parser.add_argument(\n        \"--output\", \"-o\", default=\"./output\", help=\"Output directory path\"\n    )\n    parser.add_argument(\n        \"--api-key\",\n        default=os.getenv(\"LLM_BINDING_API_KEY\"),\n        help=\"OpenAI API key (defaults to LLM_BINDING_API_KEY env var)\",\n    )\n    parser.add_argument(\n        \"--base-url\",\n        default=os.getenv(\"LLM_BINDING_HOST\"),\n        help=\"Optional base URL for API\",\n    )\n    parser.add_argument(\n        \"--parser\",\n        default=os.getenv(\"PARSER\", \"mineru\"),\n        help=(\n            \"Parser selection. Built-ins: mineru, docling, paddleocr. 
\"\n            \"Custom parsers that you register via register_parser() in the \"\n            \"same Python process are also accepted when using RAGAnything as \"\n            \"a library. This example script does not perform any automatic \"\n            \"plugin discovery.\"\n        ),\n    )\n\n    args = parser.parse_args()\n\n    # Check if API key is provided\n    if not args.api_key:\n        logger.error(\"Error: OpenAI API key is required\")\n        logger.error(\"Set api key environment variable or use --api-key option\")\n        return\n\n    # Create output directory if specified\n    if args.output:\n        os.makedirs(args.output, exist_ok=True)\n\n    # Process with RAG\n    asyncio.run(\n        process_with_rag(\n            args.file_path,\n            args.output,\n            args.api_key,\n            args.base_url,\n            args.working_dir,\n            args.parser,\n        )\n    )\n\n\nif __name__ == \"__main__\":\n    # Configure logging first\n    configure_logging()\n\n    print(\"RAGAnything Example\")\n    print(\"=\" * 30)\n    print(\"Processing document with multimodal RAG pipeline\")\n    print(\"=\" * 30)\n\n    main()\n"
  },
  {
    "path": "examples/text_format_test.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nText Format Parsing Test Script for RAG-Anything\n\nThis script demonstrates how to parse various text formats\nusing MinerU, including TXT and MD files.\n\nRequirements:\n- ReportLab library for PDF conversion\n- RAG-Anything package\n\nUsage:\n    python text_format_test.py --file path/to/text/document.md\n\"\"\"\n\nimport argparse\nimport asyncio\nimport sys\nfrom pathlib import Path\nfrom raganything import RAGAnything\n\n\ndef check_reportlab_installation():\n    \"\"\"Check if ReportLab is installed and available\"\"\"\n    try:\n        import reportlab\n\n        print(\n            f\"✅ ReportLab found: version {reportlab.Version if hasattr(reportlab, 'Version') else 'Unknown'}\"\n        )\n        return True\n    except ImportError:\n        print(\"❌ ReportLab not found. Please install ReportLab:\")\n        print(\"  pip install reportlab\")\n        return False\n\n\nasync def test_text_format_parsing(file_path: str):\n    \"\"\"Test text format parsing with MinerU\"\"\"\n\n    print(f\"🧪 Testing text format parsing: {file_path}\")\n\n    # Check if file exists and is a supported text format\n    file_path = Path(file_path)\n    if not file_path.exists():\n        print(f\"❌ File does not exist: {file_path}\")\n        return False\n\n    supported_extensions = {\".txt\", \".md\"}\n    if file_path.suffix.lower() not in supported_extensions:\n        print(f\"❌ Unsupported file format: {file_path.suffix}\")\n        print(f\"   Supported formats: {', '.join(supported_extensions)}\")\n        return False\n\n    print(f\"📄 File format: {file_path.suffix.upper()}\")\n    print(f\"📏 File size: {file_path.stat().st_size / 1024:.1f} KB\")\n\n    # Display text file info\n    try:\n        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n            content = f.read()\n        print(f\"📝 Text length: {len(content)} characters\")\n        print(f\"📋 Line count: {len(content.splitlines())}\")\n    except 
UnicodeDecodeError:\n        print(\n            \"⚠️  Text encoding: Non-UTF-8 (will try multiple encodings during processing)\"\n        )\n\n    # Initialize RAGAnything (only for parsing functionality)\n    rag = RAGAnything()\n\n    try:\n        # Test text parsing with MinerU\n        print(\"\\n🔄 Testing text parsing with MinerU...\")\n        content_list, md_content = await rag.parse_document(\n            file_path=str(file_path),\n            output_dir=\"./test_output\",\n            parse_method=\"auto\",\n            display_stats=True,\n        )\n\n        print(\"✅ Parsing successful!\")\n        print(f\"   📊 Content blocks: {len(content_list)}\")\n        print(f\"   📝 Markdown length: {len(md_content)} characters\")\n\n        # Analyze content types\n        content_types = {}\n        for item in content_list:\n            if isinstance(item, dict):\n                content_type = item.get(\"type\", \"unknown\")\n                content_types[content_type] = content_types.get(content_type, 0) + 1\n\n        if content_types:\n            print(\"   📋 Content distribution:\")\n            for content_type, count in sorted(content_types.items()):\n                print(f\"      • {content_type}: {count}\")\n\n        # Display extracted text (if any)\n        if md_content.strip():\n            print(\"\\n📄 Extracted text preview (first 500 characters):\")\n            preview = md_content.strip()[:500]\n            print(f\"   {preview}{'...' 
if len(md_content) > 500 else ''}\")\n        else:\n            print(\"\\n📄 No text extracted from the document\")\n\n        # Display text blocks\n        text_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"text\"\n        ]\n        if text_items:\n            print(\"\\n📝 Text blocks found:\")\n            for i, item in enumerate(text_items[:3], 1):\n                text_content = item.get(\"text\", \"\")\n                if text_content.strip():\n                    preview = text_content.strip()[:200]\n                    print(\n                        f\"   {i}. {preview}{'...' if len(text_content) > 200 else ''}\"\n                    )\n\n        # Check for any tables detected in the text\n        table_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"table\"\n        ]\n        if table_items:\n            print(f\"\\n📊 Found {len(table_items)} table(s) in document:\")\n            for i, item in enumerate(table_items, 1):\n                table_body = item.get(\"table_body\", \"\")\n                row_count = len(table_body.split(\"\\n\"))\n                print(f\"   {i}. Table with {row_count} rows\")\n\n        # Check for images (unlikely in text files but possible in MD)\n        image_items = [\n            item\n            for item in content_list\n            if isinstance(item, dict) and item.get(\"type\") == \"image\"\n        ]\n        if image_items:\n            print(f\"\\n🖼️  Found {len(image_items)} image(s):\")\n            for i, item in enumerate(image_items, 1):\n                print(f\"   {i}. 
Image path: {item.get('img_path', 'N/A')}\")\n\n        print(\"\\n🎉 Text format parsing test completed successfully!\")\n        print(\"📁 Output files saved to: ./test_output\")\n        return True\n\n    except Exception as e:\n        print(f\"\\n❌ Text format parsing failed: {str(e)}\")\n        import traceback\n\n        print(f\"   Full error: {traceback.format_exc()}\")\n        return False\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    parser = argparse.ArgumentParser(description=\"Test text format parsing with MinerU\")\n    parser.add_argument(\"--file\", help=\"Path to the text file to test\")\n    parser.add_argument(\n        \"--check-reportlab\",\n        action=\"store_true\",\n        help=\"Only check ReportLab installation\",\n    )\n\n    args = parser.parse_args()\n\n    # Check ReportLab installation\n    print(\"🔧 Checking ReportLab installation...\")\n    if not check_reportlab_installation():\n        return 1\n\n    if args.check_reportlab:\n        print(\"✅ ReportLab installation check passed!\")\n        return 0\n\n    # If not just checking dependencies, file argument is required\n    if not args.file:\n        print(\"❌ Error: --file argument is required when not using --check-reportlab\")\n        parser.print_help()\n        return 1\n\n    # Run the parsing test\n    try:\n        success = asyncio.run(test_text_format_parsing(args.file))\n        return 0 if success else 1\n    except KeyboardInterrupt:\n        print(\"\\n⏹️ Test interrupted by user\")\n        return 1\n    except Exception as e:\n        print(f\"\\n❌ Unexpected error: {str(e)}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    sys.exit(main())\n"
  },
  {
    "path": "examples/vllm_integration_example.py",
    "content": "\"\"\"\nvLLM Integration Example with RAG-Anything\n\nThis example demonstrates how to integrate vLLM with RAG-Anything for\nhigh-throughput document processing and querying using locally or remotely\nserved models.\n\nvLLM provides an OpenAI-compatible API server with continuous batching,\nPagedAttention, and optimized inference — ideal for production RAG workloads.\n\nRequirements:\n- vLLM serving a model (see: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)\n- OpenAI Python package: pip install openai\n- RAG-Anything installed: pip install raganything\n\nStart vLLM (example):\n    # Chat / completion model\n    vllm serve Qwen/Qwen2.5-72B-Instruct --tensor-parallel-size 4\n\n    # Embedding model (separate process, different port)\n    vllm serve BAAI/bge-m3 --task embedding --port 8001\n\nEnvironment Setup:\nCreate a .env file with:\nLLM_BINDING=vllm\nLLM_MODEL=Qwen/Qwen2.5-72B-Instruct\nLLM_BINDING_HOST=http://localhost:8000/v1\nLLM_BINDING_API_KEY=token-abc123\nEMBEDDING_BINDING=vllm\nEMBEDDING_MODEL=BAAI/bge-m3\nEMBEDDING_BINDING_HOST=http://localhost:8001/v1\nEMBEDDING_BINDING_API_KEY=token-abc123\n\"\"\"\n\nimport os\nimport uuid\nimport asyncio\nfrom typing import List, Dict, Optional\nfrom dotenv import load_dotenv\nfrom openai import AsyncOpenAI\n\n# Load environment variables\nload_dotenv()\n\n# RAG-Anything imports\nfrom raganything import RAGAnything, RAGAnythingConfig\nfrom lightrag.utils import EmbeddingFunc\nfrom lightrag.llm.openai import openai_complete_if_cache\n\n# vLLM configuration from environment variables\nVLLM_BASE_URL = os.getenv(\"LLM_BINDING_HOST\", \"http://localhost:8000/v1\")\nVLLM_API_KEY = os.getenv(\"LLM_BINDING_API_KEY\", \"token-abc123\")\nVLLM_MODEL_NAME = os.getenv(\"LLM_MODEL\", \"Qwen/Qwen2.5-72B-Instruct\")\nVLLM_EMBED_MODEL = os.getenv(\"EMBEDDING_MODEL\", \"BAAI/bge-m3\")\nVLLM_EMBED_BASE_URL = os.getenv(\"EMBEDDING_BINDING_HOST\", \"http://localhost:8001/v1\")\nVLLM_EMBED_API_KEY = 
os.getenv(\"EMBEDDING_BINDING_API_KEY\", \"token-abc123\")\n\n\nasync def vllm_llm_model_func(\n    prompt: str,\n    system_prompt: Optional[str] = None,\n    history_messages: List[Dict] = None,\n    **kwargs,\n) -> str:\n    \"\"\"Top-level LLM function for LightRAG (pickle-safe).\n\n    Uses openai_complete_if_cache since vLLM exposes an OpenAI-compatible API.\n    \"\"\"\n    return await openai_complete_if_cache(\n        model=VLLM_MODEL_NAME,\n        prompt=prompt,\n        system_prompt=system_prompt,\n        history_messages=history_messages or [],\n        base_url=VLLM_BASE_URL,\n        api_key=VLLM_API_KEY,\n        **kwargs,\n    )\n\n\nasync def vllm_embedding_async(texts: List[str]) -> List[List[float]]:\n    \"\"\"Top-level embedding function for LightRAG (pickle-safe).\n\n    Connects to vLLM's embedding endpoint (may run on a separate port).\n    \"\"\"\n    from lightrag.llm.openai import openai_embed\n\n    embeddings = await openai_embed(\n        texts=texts,\n        model=VLLM_EMBED_MODEL,\n        base_url=VLLM_EMBED_BASE_URL,\n        api_key=VLLM_EMBED_API_KEY,\n    )\n    return embeddings.tolist()\n\n\nclass VLLMRAGIntegration:\n    \"\"\"Integration class for vLLM with RAG-Anything.\"\"\"\n\n    def __init__(self):\n        # vLLM configuration using standard LLM_BINDING variables\n        self.base_url = os.getenv(\"LLM_BINDING_HOST\", \"http://localhost:8000/v1\")\n        self.api_key = os.getenv(\"LLM_BINDING_API_KEY\", \"token-abc123\")\n        self.model_name = os.getenv(\"LLM_MODEL\", \"Qwen/Qwen2.5-72B-Instruct\")\n        self.embedding_model = os.getenv(\"EMBEDDING_MODEL\", \"BAAI/bge-m3\")\n        self.embedding_base_url = os.getenv(\n            \"EMBEDDING_BINDING_HOST\", \"http://localhost:8001/v1\"\n        )\n        self.embedding_api_key = os.getenv(\"EMBEDDING_BINDING_API_KEY\", \"token-abc123\")\n\n        # RAG-Anything configuration\n        # Use a fresh working directory each run to avoid legacy doc_status 
schema conflicts\n        self.config = RAGAnythingConfig(\n            working_dir=f\"./rag_storage_vllm/{uuid.uuid4()}\",\n            parser=\"mineru\",\n            parse_method=\"auto\",\n            enable_image_processing=False,\n            enable_table_processing=True,\n            enable_equation_processing=True,\n        )\n        print(f\"📁 Using working_dir: {self.config.working_dir}\")\n\n        self.rag = None\n\n    async def test_connection(self) -> bool:\n        \"\"\"Test vLLM connection and list available models.\"\"\"\n        try:\n            print(f\"🔌 Testing vLLM connection at: {self.base_url}\")\n            client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)\n            models = await client.models.list()\n            print(f\"✅ Connected successfully! Found {len(models.data)} models\")\n\n            # Show available models\n            print(\"📊 Available models:\")\n            for i, model in enumerate(models.data[:5]):\n                marker = \"🎯\" if model.id == self.model_name else \"  \"\n                print(f\"{marker} {i+1}. {model.id}\")\n\n            if len(models.data) > 5:\n                print(f\"  ... and {len(models.data) - 5} more models\")\n\n            return True\n        except Exception as e:\n            print(f\"❌ Connection failed: {str(e)}\")\n            print(\"\\n💡 Troubleshooting tips:\")\n            print(\"1. Ensure vLLM server is running:\")\n            print(\"   vllm serve Qwen/Qwen2.5-72B-Instruct\")\n            print(f\"2. Verify server address: {self.base_url}\")\n            print(\"3. Check that the model has finished loading\")\n            print(\"4. 
If using authentication, verify your API key\")\n            return False\n        finally:\n            try:\n                await client.close()\n            except Exception:\n                pass\n\n    async def test_chat_completion(self) -> bool:\n        \"\"\"Test basic chat functionality.\"\"\"\n        try:\n            print(f\"💬 Testing chat with model: {self.model_name}\")\n            client = AsyncOpenAI(base_url=self.base_url, api_key=self.api_key)\n            response = await client.chat.completions.create(\n                model=self.model_name,\n                messages=[\n                    {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n                    {\n                        \"role\": \"user\",\n                        \"content\": \"Hello! Please confirm you're working and tell me your capabilities.\",\n                    },\n                ],\n                max_tokens=100,\n                temperature=0.7,\n            )\n\n            result = response.choices[0].message.content.strip()\n            print(\"✅ Chat test successful!\")\n            print(f\"Response: {result}\")\n            return True\n        except Exception as e:\n            print(f\"❌ Chat test failed: {str(e)}\")\n            return False\n        finally:\n            try:\n                await client.close()\n            except Exception:\n                pass\n\n    def embedding_func_factory(self):\n        \"\"\"Create a completely serializable embedding function.\"\"\"\n        return EmbeddingFunc(\n            embedding_dim=1024,  # bge-m3 default dimension\n            max_token_size=8192,  # bge-m3 context length\n            func=vllm_embedding_async,\n        )\n\n    async def initialize_rag(self):\n        \"\"\"Initialize RAG-Anything with vLLM functions.\"\"\"\n        print(\"Initializing RAG-Anything with vLLM...\")\n\n        try:\n            self.rag = RAGAnything(\n                config=self.config,\n    
            llm_model_func=vllm_llm_model_func,\n                embedding_func=self.embedding_func_factory(),\n            )\n\n            # Compatibility: avoid writing unknown field 'multimodal_processed' to LightRAG doc_status\n            async def _noop_mark_multimodal(doc_id: str):\n                return None\n\n            self.rag._mark_multimodal_processing_complete = _noop_mark_multimodal\n\n            print(\"✅ RAG-Anything initialized successfully!\")\n            return True\n        except Exception as e:\n            print(f\"❌ RAG initialization failed: {str(e)}\")\n            return False\n\n    async def process_document_example(self, file_path: str):\n        \"\"\"Example: Process a document with vLLM backend.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized. Call initialize_rag() first.\")\n            return\n\n        try:\n            print(f\"📄 Processing document: {file_path}\")\n            await self.rag.process_document_complete(\n                file_path=file_path,\n                output_dir=\"./output_vllm\",\n                parse_method=\"auto\",\n                display_stats=True,\n            )\n            print(\"✅ Document processing completed!\")\n        except Exception as e:\n            print(f\"❌ Document processing failed: {str(e)}\")\n\n    async def query_examples(self):\n        \"\"\"Example queries with different modes.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized. 
Call initialize_rag() first.\")\n            return\n\n        # Example queries\n        queries = [\n            (\"What are the main topics in the processed documents?\", \"hybrid\"),\n            (\"Summarize any tables or data found in the documents\", \"local\"),\n            (\"What images or figures are mentioned?\", \"global\"),\n        ]\n\n        print(\"\\n🔍 Running example queries...\")\n        for query, mode in queries:\n            try:\n                print(f\"\\nQuery ({mode}): {query}\")\n                result = await self.rag.aquery(query, mode=mode)\n                print(f\"Answer: {result[:200]}...\")\n            except Exception as e:\n                print(f\"❌ Query failed: {str(e)}\")\n\n    async def simple_query_example(self):\n        \"\"\"Example basic text query with sample content.\"\"\"\n        if not self.rag:\n            print(\"❌ RAG not initialized\")\n            return\n\n        try:\n            print(\"\\nAdding sample content for testing...\")\n\n            # Create content list in the format expected by RAGAnything\n            content_list = [\n                {\n                    \"type\": \"text\",\n                    \"text\": \"\"\"vLLM Integration with RAG-Anything\n\nThis integration demonstrates how to connect vLLM's high-performance inference engine\nwith RAG-Anything's multimodal document processing capabilities. 
The system uses:\n\n- vLLM for high-throughput LLM inference with continuous batching\n- PagedAttention for efficient memory management\n- Tensor parallelism for serving large models across multiple GPUs\n- RAG-Anything for document processing and retrieval\n\nKey benefits include:\n- Production throughput: Continuous batching serves many concurrent requests\n- Memory efficiency: PagedAttention reduces GPU memory waste by up to 90%\n- Scalability: Tensor parallelism distributes large models across GPUs\n- OpenAI compatibility: Drop-in replacement for OpenAI API clients\n- Quantization support: AWQ, GPTQ, and FP8 for reduced memory footprint\"\"\",\n                    \"page_idx\": 0,\n                }\n            ]\n\n            # Insert the content list using the correct method\n            await self.rag.insert_content_list(\n                content_list=content_list,\n                file_path=\"vllm_integration_demo.txt\",\n                doc_id=f\"demo-content-{uuid.uuid4()}\",\n                display_stats=True,\n            )\n            print(\"✅ Sample content added to knowledge base\")\n\n            print(\"\\nTesting basic text query...\")\n\n            # Simple text query example\n            result = await self.rag.aquery(\n                \"What are the key benefits of using vLLM for RAG workloads?\",\n                mode=\"hybrid\",\n            )\n            print(f\"✅ Query result: {result[:300]}...\")\n\n        except Exception as e:\n            print(f\"❌ Query failed: {str(e)}\")\n\n\nasync def main():\n    \"\"\"Main example function.\"\"\"\n    print(\"=\" * 70)\n    print(\"vLLM + RAG-Anything Integration Example\")\n    print(\"=\" * 70)\n\n    # Initialize integration\n    integration = VLLMRAGIntegration()\n\n    # Test connection\n    if not await integration.test_connection():\n        return False\n\n    print()\n    if not await integration.test_chat_completion():\n        return False\n\n    # Initialize RAG\n    
print(\"\\n\" + \"─\" * 50)\n    if not await integration.initialize_rag():\n        return False\n\n    # Example document processing (uncomment and provide a real file path)\n    # await integration.process_document_example(\"path/to/your/document.pdf\")\n\n    # Example queries (uncomment after processing documents)\n    # await integration.query_examples()\n\n    # Example basic query\n    await integration.simple_query_example()\n\n    print(\"\\n\" + \"=\" * 70)\n    print(\"Integration example completed successfully!\")\n    print(\"=\" * 70)\n\n    return True\n\n\nif __name__ == \"__main__\":\n    print(\"🚀 Starting vLLM integration example...\")\n    success = asyncio.run(main())\n\n    exit(0 if success else 1)\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools>=64\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"raganything\"\ndynamic = [\"version\"]\nauthors = [\n    {name = \"Zirui Guo\"}\n]\ndescription = \"RAGAnything: All-in-One RAG System\"\nreadme = \"README.md\"\nlicense = { text = \"MIT\" }\nrequires-python = \">=3.10\"\nclassifiers = [\n    \"Development Status :: 4 - Beta\",\n    \"Programming Language :: Python :: 3\",\n    \"License :: OSI Approved :: MIT License\",\n    \"Operating System :: OS Independent\",\n    \"Intended Audience :: Developers\",\n    \"Topic :: Software Development :: Libraries :: Python Modules\",\n]\ndependencies = [\n    \"huggingface_hub\",\n    \"lightrag-hku\",\n    \"mineru[core]\",\n    \"tqdm\",\n]\n\n[project.optional-dependencies]\nimage = [\"Pillow>=10.0.0\"]\ntext = [\"reportlab>=4.0.0\"]\noffice = []  # Requires LibreOffice (external program)\npaddleocr = [\n    \"paddleocr>=2.7.0\",\n    \"pypdfium2>=4.25.0\",\n]\nmarkdown = [\n    \"markdown>=3.4.0\",\n    \"weasyprint>=60.0\",\n    \"pygments>=2.10.0\",\n]\nall = [\n    \"Pillow>=10.0.0\",\n    \"reportlab>=4.0.0\",\n    \"paddleocr>=2.7.0\",\n    \"pypdfium2>=4.25.0\",\n    \"markdown>=3.4.0\",\n    \"weasyprint>=60.0\",\n    \"pygments>=2.10.0\",\n]\n\n[project.urls]\nHomepage = \"https://github.com/HKUDS/RAG-Anything\"\nDocumentation = \"https://github.com/HKUDS/RAG-Anything\"\nRepository = \"https://github.com/HKUDS/RAG-Anything\"\nIssues = \"https://github.com/HKUDS/RAG-Anything/issues\"\n\n[tool.uv]\ndev-dependencies = [\n    \"pytest>=6.0\",\n    \"pytest-asyncio\",\n    \"black\",\n    \"isort\",\n    \"flake8\",\n    \"mypy\",\n    \"openai\",\n    \"python-dotenv\",\n]\n\n[tool.setuptools.packages.find]\ninclude = [\"raganything*\"]\n\n[tool.setuptools]\ninclude-package-data = true\n\n[tool.setuptools.dynamic]\nversion = {attr = \"raganything.__version__\"}\n\n[tool.ruff]\ntarget-version = 
\"py310\"\n\n[tool.pytest.ini_options]\ntestpaths = [\"tests\"]\npython_files = [\"test*.py\"]\nnorecursedirs = [\"examples\"]\n"
  },
  {
    "path": "raganything/__init__.py",
    "content": "from .raganything import RAGAnything as RAGAnything\nfrom .config import RAGAnythingConfig as RAGAnythingConfig\n\n# Core parser class is always available.\nfrom .parser import Parser as Parser\n\n# Optional: parser plugin APIs (only present in newer versions / when feature PR is merged).\ntry:\n    from .parser import (\n        register_parser as register_parser,\n        unregister_parser as unregister_parser,\n        list_parsers as list_parsers,\n        get_supported_parsers as get_supported_parsers,\n    )\nexcept ImportError:\n    # Older versions without the custom parser registry: keep base import working.\n    pass\n\n# Optional: resilience utilities (may not exist in all installations).\ntry:\n    from .resilience import (\n        retry as retry,\n        async_retry as async_retry,\n        CircuitBreaker as CircuitBreaker,\n    )\nexcept ModuleNotFoundError:\n    # Resilience module not present in this build.\n    pass\nexcept ImportError:\n    # Symbols not available; ignore to avoid breaking import raganything.\n    pass\n\n# Optional: processing callbacks.\ntry:\n    from .callbacks import (\n        ProcessingCallback as ProcessingCallback,\n        MetricsCallback as MetricsCallback,\n        CallbackManager as CallbackManager,\n        ProcessingEvent as ProcessingEvent,\n    )\nexcept ModuleNotFoundError:\n    pass\nexcept ImportError:\n    pass\n\n# Optional: multilingual prompt manager.\ntry:\n    from .prompt_manager import (\n        set_prompt_language as set_prompt_language,\n        get_prompt_language as get_prompt_language,\n        reset_prompts as reset_prompts,\n        register_prompt_language as register_prompt_language,\n        get_available_languages as get_available_languages,\n    )\nexcept ModuleNotFoundError:\n    pass\nexcept ImportError:\n    pass\n\n__version__ = \"1.2.9\"\n__author__ = \"Zirui Guo\"\n__url__ = \"https://github.com/HKUDS/RAG-Anything\"\n\n__all__ = [\"RAGAnything\", 
\"RAGAnythingConfig\"]\n\n__all__ = [\n    \"RAGAnything\",\n    \"RAGAnythingConfig\",\n    \"Parser\",\n]\n\n# Feature-gated exports: only add names that are actually available in this build.\nif \"register_parser\" in globals():\n    __all__.extend(\n        [\n            \"register_parser\",\n            \"unregister_parser\",\n            \"list_parsers\",\n            \"get_supported_parsers\",\n        ]\n    )\n\nif \"retry\" in globals():\n    __all__.extend(\n        [\n            \"retry\",\n            \"async_retry\",\n            \"CircuitBreaker\",\n        ]\n    )\n\nif \"ProcessingCallback\" in globals():\n    __all__.extend(\n        [\n            \"ProcessingCallback\",\n            \"MetricsCallback\",\n            \"CallbackManager\",\n            \"ProcessingEvent\",\n        ]\n    )\n\nif \"set_prompt_language\" in globals():\n    __all__.extend(\n        [\n            \"set_prompt_language\",\n            \"get_prompt_language\",\n            \"reset_prompts\",\n            \"register_prompt_language\",\n            \"get_available_languages\",\n        ]\n    )\n\n\ndef get_version() -> str:\n    \"\"\"Return the RAG-Anything version string.\"\"\"\n    return __version__\n"
  },
  {
    "path": "raganything/base.py",
    "content": "from enum import Enum\n\n\nclass DocStatus(str, Enum):\n    \"\"\"Document processing status\"\"\"\n\n    READY = \"ready\"\n    HANDLING = \"handling\"\n    PENDING = \"pending\"\n    PROCESSING = \"processing\"\n    PROCESSED = \"processed\"\n    FAILED = \"failed\"\n"
  },
  {
    "path": "raganything/batch.py",
    "content": "\"\"\"\nBatch processing functionality for RAGAnything\n\nContains methods for processing multiple documents in batch mode\n\"\"\"\n\nimport asyncio\nimport logging\nfrom pathlib import Path\nfrom typing import List, Dict, Any, Optional, TYPE_CHECKING\nimport time\n\nfrom .batch_parser import BatchParser, BatchProcessingResult\n\nif TYPE_CHECKING:\n    from .config import RAGAnythingConfig\n\n\nclass BatchMixin:\n    \"\"\"BatchMixin class containing batch processing functionality for RAGAnything\"\"\"\n\n    # Type hints for mixin attributes (will be available when mixed into RAGAnything)\n    config: \"RAGAnythingConfig\"\n    logger: logging.Logger\n\n    # Type hints for methods that will be available from other mixins\n    async def _ensure_lightrag_initialized(self) -> None: ...\n    async def process_document_complete(self, file_path: str, **kwargs) -> None: ...\n\n    # ==========================================\n    # ORIGINAL BATCH PROCESSING METHOD (RESTORED)\n    # ==========================================\n\n    async def process_folder_complete(\n        self,\n        folder_path: str,\n        output_dir: str = None,\n        parse_method: str = None,\n        display_stats: bool = None,\n        split_by_character: str | None = None,\n        split_by_character_only: bool = False,\n        file_extensions: Optional[List[str]] = None,\n        recursive: bool = None,\n        max_workers: int = None,\n    ):\n        \"\"\"\n        Process all supported files in a folder\n\n        Args:\n            folder_path: Path to the folder containing files to process\n            output_dir: Directory for parsed outputs (optional)\n            parse_method: Parsing method to use (optional)\n            display_stats: Whether to display statistics (optional)\n            split_by_character: Character to split by (optional)\n            split_by_character_only: Whether to split only by character (optional)\n            file_extensions: List 
of file extensions to process (optional)\n            recursive: Whether to process folders recursively (optional)\n            max_workers: Maximum number of workers for concurrent processing (optional)\n        \"\"\"\n        if output_dir is None:\n            output_dir = self.config.parser_output_dir\n        if parse_method is None:\n            parse_method = self.config.parse_method\n        if display_stats is None:\n            display_stats = True\n        if file_extensions is None:\n            file_extensions = self.config.supported_file_extensions\n        if recursive is None:\n            recursive = self.config.recursive_folder_processing\n        if max_workers is None:\n            max_workers = self.config.max_concurrent_files\n\n        await self._ensure_lightrag_initialized()\n\n        # Get all files in the folder\n        folder_path_obj = Path(folder_path)\n        if not folder_path_obj.exists():\n            raise FileNotFoundError(f\"Folder not found: {folder_path}\")\n\n        # Collect files based on supported extensions\n        files_to_process = []\n        for file_ext in file_extensions:\n            if recursive:\n                pattern = f\"**/*{file_ext}\"\n            else:\n                pattern = f\"*{file_ext}\"\n            files_to_process.extend(folder_path_obj.glob(pattern))\n\n        if not files_to_process:\n            self.logger.warning(f\"No supported files found in {folder_path}\")\n            return\n\n        self.logger.info(\n            f\"Found {len(files_to_process)} files to process in {folder_path}\"\n        )\n\n        # Create output directory if it doesn't exist\n        output_path = Path(output_dir)\n        output_path.mkdir(parents=True, exist_ok=True)\n\n        # Process files with controlled concurrency\n        semaphore = asyncio.Semaphore(max_workers)\n        tasks = []\n\n        async def process_single_file(file_path: Path):\n            async with semaphore:\n                
is_in_subdir = (\n                    lambda file_path, dir_path: len(\n                        file_path.relative_to(dir_path).parents\n                    )\n                    > 1\n                )(file_path, folder_path_obj)\n\n                try:\n                    await self.process_document_complete(\n                        str(file_path),\n                        output_dir=(\n                            output_dir\n                            if not is_in_subdir\n                            else str(\n                                output_path\n                                / file_path.parent.relative_to(folder_path_obj)\n                            )\n                        ),\n                        parse_method=parse_method,\n                        split_by_character=split_by_character,\n                        split_by_character_only=split_by_character_only,\n                        file_name=(\n                            None\n                            if not is_in_subdir\n                            else str(file_path.relative_to(folder_path_obj))\n                        ),\n                    )\n                    return True, str(file_path), None\n                except Exception as e:\n                    self.logger.error(f\"Failed to process {file_path}: {str(e)}\")\n                    return False, str(file_path), str(e)\n\n        # Create tasks for all files\n        for file_path in files_to_process:\n            task = asyncio.create_task(process_single_file(file_path))\n            tasks.append(task)\n\n        # Wait for all tasks to complete\n        results = await asyncio.gather(*tasks, return_exceptions=True)\n\n        # Process results\n        successful_files = []\n        failed_files = []\n        for result in results:\n            if isinstance(result, Exception):\n                failed_files.append((\"unknown\", str(result)))\n            else:\n                success, file_path, error = result\n          
      if success:\n                    successful_files.append(file_path)\n                else:\n                    failed_files.append((file_path, error))\n\n        # Display statistics if requested\n        if display_stats:\n            self.logger.info(\"Processing complete!\")\n            self.logger.info(f\"  Successful: {len(successful_files)} files\")\n            self.logger.info(f\"  Failed: {len(failed_files)} files\")\n            if failed_files:\n                self.logger.warning(\"Failed files:\")\n                for file_path, error in failed_files:\n                    self.logger.warning(f\"  - {file_path}: {error}\")\n\n    # ==========================================\n    # NEW ENHANCED BATCH PROCESSING METHODS\n    # ==========================================\n\n    def process_documents_batch(\n        self,\n        file_paths: List[str],\n        output_dir: Optional[str] = None,\n        parse_method: Optional[str] = None,\n        max_workers: Optional[int] = None,\n        recursive: Optional[bool] = None,\n        show_progress: bool = True,\n        **kwargs,\n    ) -> BatchProcessingResult:\n        \"\"\"\n        Process multiple documents in batch using the new BatchParser\n\n        Args:\n            file_paths: List of file paths or directories to process\n            output_dir: Output directory for parsed files\n            parse_method: Parsing method to use\n            max_workers: Maximum number of workers for parallel processing\n            recursive: Whether to process directories recursively\n            show_progress: Whether to show progress bar\n            **kwargs: Additional arguments passed to the parser\n\n        Returns:\n            BatchProcessingResult: Results of the batch processing\n        \"\"\"\n        # Use config defaults if not specified\n        if output_dir is None:\n            output_dir = self.config.parser_output_dir\n        if parse_method is None:\n            parse_method = 
self.config.parse_method\n        if max_workers is None:\n            max_workers = self.config.max_concurrent_files\n        if recursive is None:\n            recursive = self.config.recursive_folder_processing\n\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=self.config.parser,\n            max_workers=max_workers,\n            show_progress=show_progress,\n            skip_installation_check=True,  # Skip installation check for better UX\n        )\n\n        # Process batch\n        return batch_parser.process_batch(\n            file_paths=file_paths,\n            output_dir=output_dir,\n            parse_method=parse_method,\n            recursive=recursive,\n            **kwargs,\n        )\n\n    async def process_documents_batch_async(\n        self,\n        file_paths: List[str],\n        output_dir: Optional[str] = None,\n        parse_method: Optional[str] = None,\n        max_workers: Optional[int] = None,\n        recursive: Optional[bool] = None,\n        show_progress: bool = True,\n        **kwargs,\n    ) -> BatchProcessingResult:\n        \"\"\"\n        Asynchronously process multiple documents in batch\n\n        Args:\n            file_paths: List of file paths or directories to process\n            output_dir: Output directory for parsed files\n            parse_method: Parsing method to use\n            max_workers: Maximum number of workers for parallel processing\n            recursive: Whether to process directories recursively\n            show_progress: Whether to show progress bar\n            **kwargs: Additional arguments passed to the parser\n\n        Returns:\n            BatchProcessingResult: Results of the batch processing\n        \"\"\"\n        # Use config defaults if not specified\n        if output_dir is None:\n            output_dir = self.config.parser_output_dir\n        if parse_method is None:\n            parse_method = self.config.parse_method\n        if 
max_workers is None:\n            max_workers = self.config.max_concurrent_files\n        if recursive is None:\n            recursive = self.config.recursive_folder_processing\n\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=self.config.parser,\n            max_workers=max_workers,\n            show_progress=show_progress,\n            skip_installation_check=True,  # Skip installation check for better UX\n        )\n\n        # Process batch asynchronously\n        return await batch_parser.process_batch_async(\n            file_paths=file_paths,\n            output_dir=output_dir,\n            parse_method=parse_method,\n            recursive=recursive,\n            **kwargs,\n        )\n\n    def get_supported_file_extensions(self) -> List[str]:\n        \"\"\"Get list of supported file extensions for batch processing\"\"\"\n        batch_parser = BatchParser(parser_type=self.config.parser)\n        return batch_parser.get_supported_extensions()\n\n    def filter_supported_files(\n        self, file_paths: List[str], recursive: Optional[bool] = None\n    ) -> List[str]:\n        \"\"\"\n        Filter file paths to only include supported file types\n\n        Args:\n            file_paths: List of file paths to filter\n            recursive: Whether to process directories recursively\n\n        Returns:\n            List of supported file paths\n        \"\"\"\n        if recursive is None:\n            recursive = self.config.recursive_folder_processing\n\n        batch_parser = BatchParser(parser_type=self.config.parser)\n        return batch_parser.filter_supported_files(file_paths, recursive)\n\n    async def process_documents_with_rag_batch(\n        self,\n        file_paths: List[str],\n        output_dir: Optional[str] = None,\n        parse_method: Optional[str] = None,\n        max_workers: Optional[int] = None,\n        recursive: Optional[bool] = None,\n        show_progress: bool = True,\n        
**kwargs,\n    ) -> Dict[str, Any]:\n        \"\"\"\n        Process documents in batch and then add them to RAG\n\n        This method combines document parsing and RAG insertion:\n        1. First, parse all documents using batch processing\n        2. Then, process each successfully parsed document with RAG\n\n        Args:\n            file_paths: List of file paths or directories to process\n            output_dir: Output directory for parsed files\n            parse_method: Parsing method to use\n            max_workers: Maximum number of workers for parallel processing\n            recursive: Whether to process directories recursively\n            show_progress: Whether to show progress bar\n            **kwargs: Additional arguments passed to the parser\n\n        Returns:\n            Dict containing both parse results and RAG processing results\n        \"\"\"\n        start_time = time.time()\n        callback_manager = getattr(self, \"callback_manager\", None)\n        total_files = len(file_paths)\n\n        if callback_manager is not None:\n            callback_manager.dispatch(\n                \"on_batch_start\",\n                file_count=total_files,\n            )\n\n        # Use config defaults if not specified\n        if output_dir is None:\n            output_dir = self.config.parser_output_dir\n        if parse_method is None:\n            parse_method = self.config.parse_method\n        if max_workers is None:\n            max_workers = self.config.max_concurrent_files\n        if recursive is None:\n            recursive = self.config.recursive_folder_processing\n\n        self.logger.info(\"Starting batch processing with RAG integration\")\n\n        # Step 1: Parse documents in batch\n        parse_result = self.process_documents_batch(\n            file_paths=file_paths,\n            output_dir=output_dir,\n            parse_method=parse_method,\n            max_workers=max_workers,\n            recursive=recursive,\n            
show_progress=show_progress,\n            **kwargs,\n        )\n\n        # Step 2: Process with RAG\n        # Initialize RAG system\n        await self._ensure_lightrag_initialized()\n\n        # Then, process each successful file with RAG\n        rag_results = {}\n\n        if parse_result.successful_files:\n            self.logger.info(\n                f\"Processing {len(parse_result.successful_files)} files with RAG\"\n            )\n\n            # Process files with RAG (this could be parallelized in the future)\n            for file_path in parse_result.successful_files:\n                try:\n                    # Process the successfully parsed file with RAG\n                    await self.process_document_complete(\n                        file_path,\n                        output_dir=output_dir,\n                        parse_method=parse_method,\n                        **kwargs,\n                    )\n\n                    # Get some statistics about the processed content\n                    # This would require additional tracking in the RAG system\n                    rag_results[file_path] = {\"status\": \"success\", \"processed\": True}\n\n                except Exception as e:\n                    self.logger.error(\n                        f\"Failed to process {file_path} with RAG: {str(e)}\"\n                    )\n                    rag_results[file_path] = {\n                        \"status\": \"failed\",\n                        \"error\": str(e),\n                        \"processed\": False,\n                    }\n\n        processing_time = time.time() - start_time\n\n        successful_rag_files = len([r for r in rag_results.values() if r[\"processed\"]])\n        failed_rag_files = len([r for r in rag_results.values() if not r[\"processed\"]])\n\n        if callback_manager is not None:\n            callback_manager.dispatch(\n                \"on_batch_complete\",\n                total_files=total_files,\n                
successful=successful_rag_files,\n                failed=failed_rag_files,\n                duration_seconds=processing_time,\n            )\n\n        return {\n            \"parse_result\": parse_result,\n            \"rag_results\": rag_results,\n            \"total_processing_time\": processing_time,\n            \"successful_rag_files\": successful_rag_files,\n            \"failed_rag_files\": failed_rag_files,\n        }\n"
  },
  {
    "path": "raganything/batch_parser.py",
    "content": "\"\"\"\nBatch and Parallel Document Parsing\n\nThis module provides functionality for processing multiple documents in parallel,\nwith progress reporting and error handling.\n\"\"\"\n\nimport asyncio\nimport logging\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom pathlib import Path\nfrom typing import Dict, List, Optional, Tuple\nfrom dataclasses import dataclass\nimport time\n\nfrom tqdm import tqdm\n\nfrom .parser import get_parser\n\n\n@dataclass\nclass BatchProcessingResult:\n    \"\"\"Result of batch processing operation\"\"\"\n\n    successful_files: List[str]\n    failed_files: List[str]\n    total_files: int\n    processing_time: float\n    errors: Dict[str, str]\n    output_dir: str\n    dry_run: bool = False\n\n    @property\n    def success_rate(self) -> float:\n        \"\"\"Calculate success rate as percentage\"\"\"\n        if self.total_files == 0:\n            return 0.0\n        return (len(self.successful_files) / self.total_files) * 100\n\n    def summary(self) -> str:\n        \"\"\"Generate a summary of the batch processing results\"\"\"\n        return (\n            f\"Batch Processing Summary:\\n\"\n            f\"  Total files: {self.total_files}\\n\"\n            f\"  Successful: {len(self.successful_files)} ({self.success_rate:.1f}%)\\n\"\n            f\"  Failed: {len(self.failed_files)}\\n\"\n            f\"  Processing time: {self.processing_time:.2f} seconds\\n\"\n            f\"  Output directory: {self.output_dir}\\n\"\n            f\"  Dry run: {self.dry_run}\"\n        )\n\n\nclass BatchParser:\n    \"\"\"\n    Batch document parser with parallel processing capabilities\n\n    Supports processing multiple documents concurrently with progress tracking\n    and comprehensive error handling.\n    \"\"\"\n\n    def __init__(\n        self,\n        parser_type: str = \"mineru\",\n        max_workers: int = 4,\n        show_progress: bool = True,\n        timeout_per_file: int = 300,\n        
skip_installation_check: bool = False,\n    ):\n        \"\"\"\n        Initialize batch parser\n\n        Args:\n            parser_type: Type of parser to use (\"mineru\", \"docling\", or \"paddleocr\")\n            max_workers: Maximum number of parallel workers\n            show_progress: Whether to show progress bars\n            timeout_per_file: Timeout in seconds for each file\n            skip_installation_check: Skip parser installation check (useful for testing)\n        \"\"\"\n        self.parser_type = parser_type\n        self.max_workers = max_workers\n        self.show_progress = show_progress\n        self.timeout_per_file = timeout_per_file\n        self.logger = logging.getLogger(__name__)\n\n        # Initialize parser\n        try:\n            self.parser = get_parser(parser_type)\n        except ValueError as exc:\n            raise ValueError(f\"Unsupported parser type: {parser_type}\") from exc\n\n        # Check parser installation (optional)\n        if not skip_installation_check:\n            if not self.parser.check_installation():\n                self.logger.warning(\n                    f\"{parser_type.title()} parser installation check failed. \"\n                    f\"This may be due to package conflicts. 
\"\n                    f\"Use skip_installation_check=True to bypass this check.\"\n                )\n                # Don't raise an error, just warn - the parser might still work\n\n    def get_supported_extensions(self) -> List[str]:\n        \"\"\"Get list of supported file extensions\"\"\"\n        return list(\n            self.parser.OFFICE_FORMATS\n            | self.parser.IMAGE_FORMATS\n            | self.parser.TEXT_FORMATS\n            | {\".pdf\"}\n        )\n\n    def filter_supported_files(\n        self, file_paths: List[str], recursive: bool = True\n    ) -> List[str]:\n        \"\"\"\n        Filter file paths to only include supported file types\n\n        Args:\n            file_paths: List of file paths or directories\n            recursive: Whether to search directories recursively\n\n        Returns:\n            List of supported file paths\n        \"\"\"\n        supported_extensions = set(self.get_supported_extensions())\n        supported_files = []\n\n        for path_str in file_paths:\n            path = Path(path_str)\n\n            if path.is_file():\n                if path.suffix.lower() in supported_extensions:\n                    supported_files.append(str(path))\n                else:\n                    self.logger.warning(f\"Unsupported file type: {path}\")\n\n            elif path.is_dir():\n                if recursive:\n                    # Recursively find all files\n                    for file_path in path.rglob(\"*\"):\n                        if (\n                            file_path.is_file()\n                            and file_path.suffix.lower() in supported_extensions\n                        ):\n                            supported_files.append(str(file_path))\n                else:\n                    # Only files in the directory (not subdirectories)\n                    for file_path in path.glob(\"*\"):\n                        if (\n                            file_path.is_file()\n                
            and file_path.suffix.lower() in supported_extensions\n                        ):\n                            supported_files.append(str(file_path))\n\n            else:\n                self.logger.warning(f\"Path does not exist: {path}\")\n\n        return supported_files\n\n    def process_single_file(\n        self, file_path: str, output_dir: str, parse_method: str = \"auto\", **kwargs\n    ) -> Tuple[bool, str, Optional[str]]:\n        \"\"\"\n        Process a single file\n\n        Args:\n            file_path: Path to the file to process\n            output_dir: Output directory\n            parse_method: Parsing method\n            **kwargs: Additional parser arguments\n\n        Returns:\n            Tuple of (success, file_path, error_message)\n        \"\"\"\n        try:\n            start_time = time.time()\n\n            # Create file-specific output directory\n            file_name = Path(file_path).stem\n            file_output_dir = Path(output_dir) / file_name\n            file_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Parse the document\n            content_list = self.parser.parse_document(\n                file_path=file_path,\n                output_dir=str(file_output_dir),\n                method=parse_method,\n                **kwargs,\n            )\n\n            processing_time = time.time() - start_time\n\n            self.logger.info(\n                f\"Successfully processed {file_path} \"\n                f\"({len(content_list)} content blocks, {processing_time:.2f}s)\"\n            )\n\n            return True, file_path, None\n\n        except Exception as e:\n            error_msg = f\"Failed to process {file_path}: {str(e)}\"\n            self.logger.error(error_msg)\n            return False, file_path, error_msg\n\n    def process_batch(\n        self,\n        file_paths: List[str],\n        output_dir: str,\n        parse_method: str = \"auto\",\n        recursive: bool = True,\n        
dry_run: bool = False,\n        **kwargs,\n    ) -> BatchProcessingResult:\n        \"\"\"\n        Process multiple files in parallel\n\n        Args:\n            file_paths: List of file paths or directories to process\n            output_dir: Base output directory\n            parse_method: Parsing method for all files\n            recursive: Whether to search directories recursively\n            dry_run: When True, only list files without processing them\n            **kwargs: Additional parser arguments\n\n        Returns:\n            BatchProcessingResult with processing statistics\n        \"\"\"\n        start_time = time.time()\n\n        # Filter to supported files\n        supported_files = self.filter_supported_files(file_paths, recursive)\n\n        if not supported_files:\n            self.logger.warning(\"No supported files found to process\")\n            return BatchProcessingResult(\n                successful_files=[],\n                failed_files=[],\n                total_files=0,\n                processing_time=0.0,\n                errors={},\n                output_dir=output_dir,\n                dry_run=dry_run,\n            )\n\n        self.logger.info(f\"Found {len(supported_files)} files to process\")\n\n        if dry_run:\n            self.logger.info(\n                f\"Dry run enabled. 
{len(supported_files)} files would be processed.\"\n            )\n            return BatchProcessingResult(\n                successful_files=supported_files,\n                failed_files=[],\n                total_files=len(supported_files),\n                processing_time=0.0,\n                errors={},\n                output_dir=output_dir,\n                dry_run=True,\n            )\n\n        # Create output directory\n        output_path = Path(output_dir)\n        output_path.mkdir(parents=True, exist_ok=True)\n\n        # Process files in parallel\n        successful_files = []\n        failed_files = []\n        errors = {}\n\n        # Create progress bar if requested\n        pbar = None\n        if self.show_progress:\n            pbar = tqdm(\n                total=len(supported_files),\n                desc=f\"Processing files ({self.parser_type})\",\n                unit=\"file\",\n            )\n\n        try:\n            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:\n                # Submit all tasks\n                future_to_file = {\n                    executor.submit(\n                        self.process_single_file,\n                        file_path,\n                        output_dir,\n                        parse_method,\n                        **kwargs,\n                    ): file_path\n                    for file_path in supported_files\n                }\n\n                # Process completed tasks\n                for future in as_completed(\n                    future_to_file, timeout=self.timeout_per_file\n                ):\n                    success, file_path, error_msg = future.result()\n\n                    if success:\n                        successful_files.append(file_path)\n                    else:\n                        failed_files.append(file_path)\n                        errors[file_path] = error_msg\n\n                    if pbar:\n                        pbar.update(1)\n\n    
    except Exception as e:\n            self.logger.error(f\"Batch processing failed: {str(e)}\")\n            # Mark remaining files as failed\n            for future in future_to_file:\n                if not future.done():\n                    file_path = future_to_file[future]\n                    failed_files.append(file_path)\n                    errors[file_path] = f\"Processing interrupted: {str(e)}\"\n                    if pbar:\n                        pbar.update(1)\n\n        finally:\n            if pbar:\n                pbar.close()\n\n        processing_time = time.time() - start_time\n\n        # Create result\n        result = BatchProcessingResult(\n            successful_files=successful_files,\n            failed_files=failed_files,\n            total_files=len(supported_files),\n            processing_time=processing_time,\n            errors=errors,\n            output_dir=output_dir,\n            dry_run=False,\n        )\n\n        # Log summary\n        self.logger.info(result.summary())\n\n        return result\n\n    async def process_batch_async(\n        self,\n        file_paths: List[str],\n        output_dir: str,\n        parse_method: str = \"auto\",\n        recursive: bool = True,\n        dry_run: bool = False,\n        **kwargs,\n    ) -> BatchProcessingResult:\n        \"\"\"\n        Async version of batch processing\n\n        Args:\n            file_paths: List of file paths or directories to process\n            output_dir: Base output directory\n            parse_method: Parsing method for all files\n            recursive: Whether to search directories recursively\n            dry_run: When True, only list files without processing them\n            **kwargs: Additional parser arguments\n\n        Returns:\n            BatchProcessingResult with processing statistics\n        \"\"\"\n        # Run the sync version in a thread pool\n        loop = asyncio.get_event_loop()\n        return await loop.run_in_executor(\n      
      None,\n            self.process_batch,\n            file_paths,\n            output_dir,\n            parse_method,\n            recursive,\n            dry_run,\n            **kwargs,\n        )\n\n\ndef main():\n    \"\"\"Command-line interface for batch parsing\"\"\"\n    import argparse\n\n    parser = argparse.ArgumentParser(description=\"Batch document parsing\")\n    parser.add_argument(\"paths\", nargs=\"+\", help=\"File paths or directories to process\")\n    parser.add_argument(\"--output\", \"-o\", required=True, help=\"Output directory\")\n    parser.add_argument(\n        \"--parser\",\n        default=\"mineru\",\n        help=(\n            \"Parser to use. Built-ins: mineru, docling, paddleocr. \"\n            \"When using RAGAnything as a library, any custom parsers that you \"\n            \"have registered via register_parser() in the current process \"\n            \"are also accepted. The standalone CLI itself does not perform \"\n            \"plugin discovery.\"\n        ),\n    )\n    parser.add_argument(\n        \"--method\",\n        choices=[\"auto\", \"txt\", \"ocr\"],\n        default=\"auto\",\n        help=\"Parsing method\",\n    )\n    parser.add_argument(\n        \"--workers\", type=int, default=4, help=\"Number of parallel workers\"\n    )\n    parser.add_argument(\n        \"--no-progress\", action=\"store_true\", help=\"Disable progress bar\"\n    )\n    parser.add_argument(\n        \"--recursive\",\n        action=\"store_true\",\n        default=True,\n        help=\"Search directories recursively\",\n    )\n    parser.add_argument(\n        \"--timeout\", type=int, default=300, help=\"Timeout per file (seconds)\"\n    )\n    parser.add_argument(\n        \"--dry-run\",\n        action=\"store_true\",\n        help=\"List files that would be processed without running parsers\",\n    )\n\n    args = parser.parse_args()\n\n    # Configure logging\n    logging.basicConfig(\n        level=logging.INFO,\n        
format=\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\",\n    )\n\n    try:\n        # Create batch parser\n        batch_parser = BatchParser(\n            parser_type=args.parser,\n            max_workers=args.workers,\n            show_progress=not args.no_progress,\n            timeout_per_file=args.timeout,\n        )\n\n        # Process files\n        result = batch_parser.process_batch(\n            file_paths=args.paths,\n            output_dir=args.output,\n            parse_method=args.method,\n            recursive=args.recursive,\n            dry_run=args.dry_run,\n        )\n\n        # Print summary\n        print(\"\\n\" + result.summary())\n\n        if args.dry_run:\n            if result.successful_files:\n                print(\"\\nDry run: files that would be processed:\")\n                for file_path in result.successful_files:\n                    print(f\"  - {file_path}\")\n            else:\n                print(\"\\nDry run: no supported files found.\")\n\n        # Exit with error code if any files failed\n        if result.failed_files:\n            return 1\n\n        return 0\n\n    except Exception as e:\n        print(f\"Error: {str(e)}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    exit(main())\n"
  },
  {
    "path": "raganything/callbacks.py",
    "content": "\"\"\"\nProcessing callbacks and event system for RAGAnything.\n\nProvides a lightweight publish-subscribe mechanism that lets users hook\ninto every stage of the document processing pipeline — parsing, text\ninsertion, multimodal processing, and querying.\n\nUsage::\n\n    from raganything.callbacks import ProcessingCallback, CallbackManager\n\n    class MyCallback(ProcessingCallback):\n        def on_parse_start(self, file_path: str, **kw):\n            print(f\"Parsing started: {file_path}\")\n\n        def on_parse_complete(self, file_path: str, content_blocks: int, **kw):\n            print(f\"Parsed {content_blocks} blocks from {file_path}\")\n\n    rag = RAGAnything(config=config)\n    rag.callback_manager.register(MyCallback())\n\"\"\"\n\nfrom __future__ import annotations\n\nimport logging\nimport time\nfrom dataclasses import dataclass, field\nfrom typing import Any, Dict, List, Optional\nimport threading\n\nlogger = logging.getLogger(__name__)\n\n\n@dataclass\nclass ProcessingEvent:\n    \"\"\"Immutable record of a processing pipeline event.\"\"\"\n\n    event_type: str\n    timestamp: float = field(default_factory=time.time)\n    file_path: Optional[str] = None\n    doc_id: Optional[str] = None\n    stage: Optional[str] = None\n    details: Dict[str, Any] = field(default_factory=dict)\n    duration_seconds: Optional[float] = None\n    error: Optional[str] = None\n\n    def to_dict(self) -> Dict[str, Any]:\n        \"\"\"Serialise to a plain dictionary.\"\"\"\n        return {\n            \"event_type\": self.event_type,\n            \"timestamp\": self.timestamp,\n            \"file_path\": self.file_path,\n            \"doc_id\": self.doc_id,\n            \"stage\": self.stage,\n            \"details\": self.details,\n            \"duration_seconds\": self.duration_seconds,\n            \"error\": self.error,\n        }\n\n\nclass ProcessingCallback:\n    \"\"\"Base class for processing pipeline callbacks.\n\n    Override any of the 
``on_*`` methods to hook into the corresponding\n    stage.  Methods that are not overridden are silently ignored.\n\n    All methods receive ``**kwargs`` so that future versions can add\n    parameters without breaking existing subclasses.\n    \"\"\"\n\n    # ── Parsing stage ─────────────────────────────────────────────\n    def on_parse_start(self, file_path: str, parser: str = \"\", **kwargs: Any) -> None:\n        \"\"\"Called before document parsing begins.\"\"\"\n\n    def on_parse_complete(\n        self,\n        file_path: str,\n        content_blocks: int = 0,\n        doc_id: str = \"\",\n        duration_seconds: float = 0.0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called after document parsing succeeds.\"\"\"\n\n    def on_parse_error(\n        self, file_path: str, error: BaseException | str = \"\", **kwargs: Any\n    ) -> None:\n        \"\"\"Called when document parsing fails.\"\"\"\n\n    # ── Text insertion stage ──────────────────────────────────────\n    def on_text_insert_start(\n        self, file_path: str, text_length: int = 0, **kwargs: Any\n    ) -> None:\n        \"\"\"Called before text content is inserted into LightRAG.\"\"\"\n\n    def on_text_insert_complete(\n        self, file_path: str, duration_seconds: float = 0.0, **kwargs: Any\n    ) -> None:\n        \"\"\"Called after text content insertion succeeds.\"\"\"\n\n    # ── Multimodal processing stage ───────────────────────────────\n    def on_multimodal_start(\n        self, file_path: str, item_count: int = 0, **kwargs: Any\n    ) -> None:\n        \"\"\"Called before multimodal content processing begins.\"\"\"\n\n    def on_multimodal_item_complete(\n        self,\n        file_path: str,\n        item_index: int = 0,\n        item_type: str = \"\",\n        total_items: int = 0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called after each individual multimodal item is processed.\"\"\"\n\n    def on_multimodal_complete(\n        self,\n        
file_path: str,\n        processed_count: int = 0,\n        duration_seconds: float = 0.0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called after all multimodal content processing completes.\"\"\"\n\n    # ── Query stage ───────────────────────────────────────────────\n    def on_query_start(self, query: str, mode: str = \"\", **kwargs: Any) -> None:\n        \"\"\"Called before a query is executed.\"\"\"\n\n    def on_query_complete(\n        self,\n        query: str,\n        mode: str = \"\",\n        duration_seconds: float = 0.0,\n        result_length: int = 0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called after a query completes.\"\"\"\n\n    def on_query_error(\n        self,\n        query: str,\n        mode: str = \"\",\n        error: BaseException | str = \"\",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called when a query fails.\"\"\"\n\n    # ── Document complete ─────────────────────────────────────────\n    def on_document_complete(\n        self,\n        file_path: str,\n        doc_id: str = \"\",\n        duration_seconds: float = 0.0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called when the entire document processing pipeline finishes.\"\"\"\n\n    def on_document_error(\n        self,\n        file_path: str,\n        error: BaseException | str = \"\",\n        stage: str = \"\",\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called when document processing fails at any stage.\"\"\"\n\n    # ── Batch processing ──────────────────────────────────────────\n    def on_batch_start(self, file_count: int = 0, **kwargs: Any) -> None:\n        \"\"\"Called when batch processing begins.\"\"\"\n\n    def on_batch_complete(\n        self,\n        total_files: int = 0,\n        successful: int = 0,\n        failed: int = 0,\n        duration_seconds: float = 0.0,\n        **kwargs: Any,\n    ) -> None:\n        \"\"\"Called when batch processing completes.\"\"\"\n\n\nclass 
MetricsCallback(ProcessingCallback):\n    \"\"\"Built-in callback that collects processing metrics.\n\n    Access aggregated metrics via the :attr:`metrics` attribute.\n\n    Example::\n\n        metrics_cb = MetricsCallback()\n        rag.callback_manager.register(metrics_cb)\n        # ... process documents ...\n        print(metrics_cb.summary())\n    \"\"\"\n\n    def __init__(self) -> None:\n        self.metrics: Dict[str, Any] = {\n            \"documents_processed\": 0,\n            \"documents_failed\": 0,\n            \"total_content_blocks\": 0,\n            \"total_multimodal_items\": 0,\n            \"total_parse_time\": 0.0,\n            \"total_insert_time\": 0.0,\n            \"total_multimodal_time\": 0.0,\n            \"queries_executed\": 0,\n            \"total_query_time\": 0.0,\n            \"errors\": [],\n        }\n\n    def on_parse_complete(\n        self,\n        file_path: str,\n        content_blocks: int = 0,\n        duration_seconds: float = 0.0,\n        **kw: Any,\n    ) -> None:\n        self.metrics[\"total_content_blocks\"] += content_blocks\n        self.metrics[\"total_parse_time\"] += duration_seconds\n\n    def on_text_insert_complete(\n        self, file_path: str, duration_seconds: float = 0.0, **kw: Any\n    ) -> None:\n        self.metrics[\"total_insert_time\"] += duration_seconds\n\n    def on_multimodal_complete(\n        self,\n        file_path: str,\n        processed_count: int = 0,\n        duration_seconds: float = 0.0,\n        **kw: Any,\n    ) -> None:\n        self.metrics[\"total_multimodal_items\"] += processed_count\n        self.metrics[\"total_multimodal_time\"] += duration_seconds\n\n    def on_document_complete(self, file_path: str, **kw: Any) -> None:\n        self.metrics[\"documents_processed\"] += 1\n\n    def on_document_error(\n        self,\n        file_path: str,\n        error: BaseException | str = \"\",\n        stage: str = \"\",\n        **kw: Any,\n    ) -> None:\n        
self.metrics[\"documents_failed\"] += 1\n        self.metrics[\"errors\"].append(\n            {\"file\": file_path, \"error\": str(error), \"stage\": stage}\n        )\n\n    def on_query_complete(\n        self, query: str, duration_seconds: float = 0.0, **kw: Any\n    ) -> None:\n        self.metrics[\"queries_executed\"] += 1\n        self.metrics[\"total_query_time\"] += duration_seconds\n\n    def on_query_error(\n        self, query: str, error: BaseException | str = \"\", **kw: Any\n    ) -> None:\n        self.metrics[\"errors\"].append(\n            {\"file\": None, \"error\": str(error), \"stage\": \"query\"}\n        )\n\n    def summary(self) -> str:\n        \"\"\"Return a human-readable summary of collected metrics.\"\"\"\n        m = self.metrics\n        lines = [\n            \"RAGAnything Processing Metrics\",\n            \"=\" * 40,\n            f\"Documents processed : {m['documents_processed']}\",\n            f\"Documents failed    : {m['documents_failed']}\",\n            f\"Content blocks      : {m['total_content_blocks']}\",\n            f\"Multimodal items    : {m['total_multimodal_items']}\",\n            f\"Parse time          : {m['total_parse_time']:.2f}s\",\n            f\"Insert time         : {m['total_insert_time']:.2f}s\",\n            f\"Multimodal time     : {m['total_multimodal_time']:.2f}s\",\n            f\"Queries executed    : {m['queries_executed']}\",\n            f\"Query time          : {m['total_query_time']:.2f}s\",\n        ]\n        if m[\"errors\"]:\n            lines.append(f\"Errors              : {len(m['errors'])}\")\n            for err in m[\"errors\"][:5]:\n                lines.append(f\"  - [{err['stage']}] {err['file']}: {err['error']}\")\n        return \"\\n\".join(lines)\n\n    def reset(self) -> None:\n        \"\"\"Reset all collected metrics.\"\"\"\n        self.__init__()\n\n\nclass CallbackManager:\n    \"\"\"Manages and dispatches events to registered callbacks.\n\n    Thread-safe for 
registration/unregistration and event logging.\n    Event dispatch iterates over a snapshot of currently registered\n    callbacks so that callbacks can safely register/unregister others.\n    \"\"\"\n\n    def __init__(self) -> None:\n        self._callbacks: List[ProcessingCallback] = []\n        self._event_log: List[ProcessingEvent] = []\n        self._log_events: bool = False\n        self._lock = threading.RLock()\n\n    def register(self, callback: ProcessingCallback) -> None:\n        \"\"\"Register a callback to receive processing events.\n\n        Args:\n            callback: An instance of :class:`ProcessingCallback` (or subclass).\n\n        Raises:\n            TypeError: If *callback* is not a :class:`ProcessingCallback`.\n        \"\"\"\n        if not isinstance(callback, ProcessingCallback):\n            raise TypeError(\n                f\"Expected ProcessingCallback instance, got {type(callback).__name__}\"\n            )\n        with self._lock:\n            self._callbacks.append(callback)\n\n    def unregister(self, callback: ProcessingCallback) -> None:\n        \"\"\"Remove a previously registered callback.\"\"\"\n        with self._lock:\n            self._callbacks.remove(callback)\n\n    def enable_event_log(self, enabled: bool = True) -> None:\n        \"\"\"Enable or disable internal event logging.\n\n        When enabled, every dispatched event is recorded in\n        :attr:`event_log` for later inspection.\n        \"\"\"\n        with self._lock:\n            self._log_events = enabled\n\n    @property\n    def event_log(self) -> List[ProcessingEvent]:\n        \"\"\"Read-only access to the internal event log.\"\"\"\n        with self._lock:\n            return list(self._event_log)\n\n    def clear_event_log(self) -> None:\n        \"\"\"Clear the internal event log.\"\"\"\n        with self._lock:\n            self._event_log.clear()\n\n    def dispatch(self, event_name: str, **kwargs: Any) -> None:\n        \"\"\"Dispatch an 
event to all registered callbacks.\n\n        Args:\n            event_name: Name of the callback method (e.g., ``\"on_parse_start\"``).\n            **kwargs: Arguments forwarded to the callback method.\n        \"\"\"\n        with self._lock:\n            callbacks_snapshot = list(self._callbacks)\n            log_events = self._log_events\n            if log_events:\n                event = ProcessingEvent(\n                    event_type=event_name,\n                    file_path=kwargs.get(\"file_path\"),\n                    doc_id=kwargs.get(\"doc_id\"),\n                    stage=kwargs.get(\"stage\"),\n                    details=kwargs,\n                    duration_seconds=kwargs.get(\"duration_seconds\"),\n                    error=str(kwargs[\"error\"]) if \"error\" in kwargs else None,\n                )\n                self._event_log.append(event)\n\n        for cb in callbacks_snapshot:\n            handler = getattr(cb, event_name, None)\n            if handler is not None:\n                try:\n                    handler(**kwargs)\n                except Exception:\n                    logger.exception(\n                        \"Error in callback %s.%s\",\n                        type(cb).__name__,\n                        event_name,\n                    )\n"
  },
  {
    "path": "raganything/config.py",
    "content": "\"\"\"\nConfiguration classes for RAGAnything\n\nContains configuration dataclasses with environment variable support\n\"\"\"\n\nfrom dataclasses import dataclass, field\nfrom typing import List\nfrom lightrag.utils import get_env_value\n\n\n@dataclass\nclass RAGAnythingConfig:\n    \"\"\"Configuration class for RAGAnything with environment variable support\"\"\"\n\n    # Directory Configuration\n    # ---\n    working_dir: str = field(default=get_env_value(\"WORKING_DIR\", \"./rag_storage\", str))\n    \"\"\"Directory where RAG storage and cache files are stored.\"\"\"\n\n    # Parser Configuration\n    # ---\n    parse_method: str = field(default=get_env_value(\"PARSE_METHOD\", \"auto\", str))\n    \"\"\"Default parsing method for document parsing: 'auto', 'ocr', or 'txt'.\"\"\"\n\n    parser_output_dir: str = field(default=get_env_value(\"OUTPUT_DIR\", \"./output\", str))\n    \"\"\"Default output directory for parsed content.\"\"\"\n\n    parser: str = field(default=get_env_value(\"PARSER\", \"mineru\", str))\n    \"\"\"Parser selection: 'mineru', 'docling', or 'paddleocr'.\"\"\"\n\n    display_content_stats: bool = field(\n        default=get_env_value(\"DISPLAY_CONTENT_STATS\", True, bool)\n    )\n    \"\"\"Whether to display content statistics during parsing.\"\"\"\n\n    # Multimodal Processing Configuration\n    # ---\n    enable_image_processing: bool = field(\n        default=get_env_value(\"ENABLE_IMAGE_PROCESSING\", True, bool)\n    )\n    \"\"\"Enable image content processing.\"\"\"\n\n    enable_table_processing: bool = field(\n        default=get_env_value(\"ENABLE_TABLE_PROCESSING\", True, bool)\n    )\n    \"\"\"Enable table content processing.\"\"\"\n\n    enable_equation_processing: bool = field(\n        default=get_env_value(\"ENABLE_EQUATION_PROCESSING\", True, bool)\n    )\n    \"\"\"Enable equation content processing.\"\"\"\n\n    # Batch Processing Configuration\n    # ---\n    max_concurrent_files: int = field(\n        
default=get_env_value(\"MAX_CONCURRENT_FILES\", 1, int)\n    )\n    \"\"\"Maximum number of files to process concurrently.\"\"\"\n\n    supported_file_extensions: List[str] = field(\n        default_factory=lambda: get_env_value(\n            \"SUPPORTED_FILE_EXTENSIONS\",\n            \".pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md\",\n            str,\n        ).split(\",\")\n    )\n    \"\"\"List of supported file extensions for batch processing.\"\"\"\n\n    recursive_folder_processing: bool = field(\n        default=get_env_value(\"RECURSIVE_FOLDER_PROCESSING\", True, bool)\n    )\n    \"\"\"Whether to recursively process subfolders in batch mode.\"\"\"\n\n    # Context Extraction Configuration\n    # ---\n    context_window: int = field(default=get_env_value(\"CONTEXT_WINDOW\", 1, int))\n    \"\"\"Number of pages/chunks to include before and after current item for context.\"\"\"\n\n    context_mode: str = field(default=get_env_value(\"CONTEXT_MODE\", \"page\", str))\n    \"\"\"Context extraction mode: 'page' for page-based, 'chunk' for chunk-based.\"\"\"\n\n    max_context_tokens: int = field(\n        default=get_env_value(\"MAX_CONTEXT_TOKENS\", 2000, int)\n    )\n    \"\"\"Maximum number of tokens in extracted context.\"\"\"\n\n    include_headers: bool = field(default=get_env_value(\"INCLUDE_HEADERS\", True, bool))\n    \"\"\"Whether to include document headers and titles in context.\"\"\"\n\n    include_captions: bool = field(\n        default=get_env_value(\"INCLUDE_CAPTIONS\", True, bool)\n    )\n    \"\"\"Whether to include image/table captions in context.\"\"\"\n\n    context_filter_content_types: List[str] = field(\n        default_factory=lambda: get_env_value(\n            \"CONTEXT_FILTER_CONTENT_TYPES\", \"text\", str\n        ).split(\",\")\n    )\n    \"\"\"Content types to include in context extraction (e.g., 'text', 'image', 'table').\"\"\"\n\n    content_format: str = 
field(default=get_env_value(\"CONTENT_FORMAT\", \"minerU\", str))\n    \"\"\"Default content format for context extraction when processing documents.\"\"\"\n\n    # Path Handling Configuration\n    # ---\n    use_full_path: bool = field(default=get_env_value(\"USE_FULL_PATH\", False, bool))\n    \"\"\"Whether to use full file path (True) or just basename (False) for file references in LightRAG.\"\"\"\n\n    def __post_init__(self):\n        \"\"\"Post-initialization setup for backward compatibility\"\"\"\n        # Support legacy environment variable names for backward compatibility\n        legacy_parse_method = get_env_value(\"MINERU_PARSE_METHOD\", None, str)\n        if legacy_parse_method and not get_env_value(\"PARSE_METHOD\", None, str):\n            self.parse_method = legacy_parse_method\n            import warnings\n\n            warnings.warn(\n                \"MINERU_PARSE_METHOD is deprecated. Use PARSE_METHOD instead.\",\n                DeprecationWarning,\n                stacklevel=2,\n            )\n\n    @property\n    def mineru_parse_method(self) -> str:\n        \"\"\"\n        Backward compatibility property for old code.\n\n        .. deprecated::\n           Use `parse_method` instead. This property will be removed in a future version.\n        \"\"\"\n        import warnings\n\n        warnings.warn(\n            \"mineru_parse_method is deprecated. Use parse_method instead.\",\n            DeprecationWarning,\n            stacklevel=2,\n        )\n        return self.parse_method\n\n    @mineru_parse_method.setter\n    def mineru_parse_method(self, value: str):\n        \"\"\"Setter for backward compatibility\"\"\"\n        import warnings\n\n        warnings.warn(\n            \"mineru_parse_method is deprecated. Use parse_method instead.\",\n            DeprecationWarning,\n            stacklevel=2,\n        )\n        self.parse_method = value\n"
  },
  {
    "path": "raganything/enhanced_markdown.py",
    "content": "\"\"\"\nEnhanced Markdown to PDF Conversion\n\nThis module provides improved Markdown to PDF conversion with:\n- Better formatting and styling\n- Image support\n- Table support\n- Code syntax highlighting\n- Custom templates\n- Multiple output formats\n\"\"\"\n\nimport os\nimport logging\nfrom pathlib import Path\nfrom typing import Dict, Any, Optional\nfrom dataclasses import dataclass\nimport tempfile\nimport subprocess\n\ntry:\n    import markdown\n\n    MARKDOWN_AVAILABLE = True\nexcept ImportError:\n    MARKDOWN_AVAILABLE = False\n\ntry:\n    from weasyprint import HTML\n\n    WEASYPRINT_AVAILABLE = True\nexcept ImportError:\n    WEASYPRINT_AVAILABLE = False\n\ntry:\n    # Check if pandoc module exists (not used directly, just for detection)\n    import importlib.util\n\n    spec = importlib.util.find_spec(\"pandoc\")\n    PANDOC_AVAILABLE = spec is not None\nexcept ImportError:\n    PANDOC_AVAILABLE = False\n\n\n@dataclass\nclass MarkdownConfig:\n    \"\"\"Configuration for Markdown to PDF conversion\"\"\"\n\n    # Styling options\n    css_file: Optional[str] = None\n    template_file: Optional[str] = None\n    page_size: str = \"A4\"\n    margin: str = \"1in\"\n    font_size: str = \"12pt\"\n    line_height: str = \"1.5\"\n\n    # Content options\n    include_toc: bool = True\n    syntax_highlighting: bool = True\n    image_max_width: str = \"100%\"\n    table_style: str = \"border-collapse: collapse; width: 100%;\"\n\n    # Output options\n    output_format: str = \"pdf\"  # pdf, html, docx\n    output_dir: Optional[str] = None\n\n    # Advanced options\n    custom_css: Optional[str] = None\n    metadata: Optional[Dict[str, str]] = None\n\n\nclass EnhancedMarkdownConverter:\n    \"\"\"\n    Enhanced Markdown to PDF converter with multiple backends\n\n    Supports multiple conversion methods:\n    - WeasyPrint (recommended for HTML/CSS styling)\n    - Pandoc (recommended for complex documents)\n    - ReportLab (fallback, basic styling)\n    
\"\"\"\n\n    def __init__(self, config: Optional[MarkdownConfig] = None):\n        \"\"\"\n        Initialize the converter\n\n        Args:\n            config: Configuration for conversion\n        \"\"\"\n        self.config = config or MarkdownConfig()\n        self.logger = logging.getLogger(__name__)\n\n        # Check available backends\n        self.available_backends = self._check_backends()\n        self.logger.info(f\"Available backends: {list(self.available_backends.keys())}\")\n\n    def _check_backends(self) -> Dict[str, bool]:\n        \"\"\"Check which conversion backends are available\"\"\"\n        backends = {\n            \"weasyprint\": WEASYPRINT_AVAILABLE,\n            \"pandoc\": PANDOC_AVAILABLE,\n            \"markdown\": MARKDOWN_AVAILABLE,\n        }\n\n        # Check if pandoc is installed on system\n        try:\n            subprocess.run([\"pandoc\", \"--version\"], capture_output=True, check=True)\n            backends[\"pandoc_system\"] = True\n        except (subprocess.CalledProcessError, FileNotFoundError):\n            backends[\"pandoc_system\"] = False\n\n        return backends\n\n    def _get_default_css(self) -> str:\n        \"\"\"Get default CSS styling\"\"\"\n        return \"\"\"\n        body {\n            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n            line-height: 1.6;\n            color: #333;\n            max-width: 800px;\n            margin: 0 auto;\n            padding: 20px;\n        }\n\n        h1, h2, h3, h4, h5, h6 {\n            color: #2c3e50;\n            margin-top: 1.5em;\n            margin-bottom: 0.5em;\n        }\n\n        h1 { font-size: 2em; border-bottom: 2px solid #3498db; padding-bottom: 0.3em; }\n        h2 { font-size: 1.5em; border-bottom: 1px solid #bdc3c7; padding-bottom: 0.2em; }\n        h3 { font-size: 1.3em; }\n        h4 { font-size: 1.1em; }\n\n        p { margin-bottom: 1em; }\n\n        code {\n            background-color: #f8f9fa;\n            
padding: 2px 4px;\n            border-radius: 3px;\n            font-family: 'Courier New', monospace;\n            font-size: 0.9em;\n        }\n\n        pre {\n            background-color: #f8f9fa;\n            padding: 15px;\n            border-radius: 5px;\n            overflow-x: auto;\n            border-left: 4px solid #3498db;\n        }\n\n        pre code {\n            background-color: transparent;\n            padding: 0;\n        }\n\n        blockquote {\n            border-left: 4px solid #3498db;\n            margin: 0;\n            padding-left: 20px;\n            color: #7f8c8d;\n        }\n\n        table {\n            border-collapse: collapse;\n            width: 100%;\n            margin: 1em 0;\n        }\n\n        th, td {\n            border: 1px solid #ddd;\n            padding: 8px 12px;\n            text-align: left;\n        }\n\n        th {\n            background-color: #f2f2f2;\n            font-weight: bold;\n        }\n\n        img {\n            max-width: 100%;\n            height: auto;\n            display: block;\n            margin: 1em auto;\n        }\n\n        ul, ol {\n            margin-bottom: 1em;\n        }\n\n        li {\n            margin-bottom: 0.5em;\n        }\n\n        a {\n            color: #3498db;\n            text-decoration: none;\n        }\n\n        a:hover {\n            text-decoration: underline;\n        }\n\n        .toc {\n            background-color: #f8f9fa;\n            padding: 15px;\n            border-radius: 5px;\n            margin-bottom: 2em;\n        }\n\n        .toc ul {\n            list-style-type: none;\n            padding-left: 0;\n        }\n\n        .toc li {\n            margin-bottom: 0.3em;\n        }\n\n        .toc a {\n            color: #2c3e50;\n        }\n        \"\"\"\n\n    def _process_markdown_content(self, content: str) -> str:\n        \"\"\"Process Markdown content with extensions\"\"\"\n        if not MARKDOWN_AVAILABLE:\n            raise 
RuntimeError(\n                \"Markdown library not available. Install with: pip install markdown\"\n            )\n\n        # Configure Markdown extensions\n        extensions = [\n            \"markdown.extensions.tables\",\n            \"markdown.extensions.fenced_code\",\n            \"markdown.extensions.codehilite\",\n            \"markdown.extensions.toc\",\n            \"markdown.extensions.attr_list\",\n            \"markdown.extensions.def_list\",\n            \"markdown.extensions.footnotes\",\n        ]\n\n        extension_configs = {\n            \"codehilite\": {\n                \"css_class\": \"highlight\",\n                \"use_pygments\": True,\n            },\n            \"toc\": {\n                \"title\": \"Table of Contents\",\n                \"permalink\": True,\n            },\n        }\n\n        # Convert Markdown to HTML\n        md = markdown.Markdown(\n            extensions=extensions, extension_configs=extension_configs\n        )\n\n        html_content = md.convert(content)\n\n        # Add CSS styling\n        css = self.config.custom_css or self._get_default_css()\n\n        # Create complete HTML document\n        html_doc = f\"\"\"\n        <!DOCTYPE html>\n        <html>\n        <head>\n            <meta charset=\"UTF-8\">\n            <title>Converted Document</title>\n            <style>\n                {css}\n            </style>\n        </head>\n        <body>\n            {html_content}\n        </body>\n        </html>\n        \"\"\"\n\n        return html_doc\n\n    def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:\n        \"\"\"Convert using WeasyPrint (best for styling)\"\"\"\n        if not WEASYPRINT_AVAILABLE:\n            raise RuntimeError(\n                \"WeasyPrint not available. 
Install with: pip install weasyprint\"\n            )\n\n        try:\n            # Process Markdown to HTML\n            html_content = self._process_markdown_content(markdown_content)\n\n            # Convert HTML to PDF\n            html = HTML(string=html_content)\n            html.write_pdf(output_path)\n\n            self.logger.info(\n                f\"Successfully converted to PDF using WeasyPrint: {output_path}\"\n            )\n            return True\n\n        except Exception as e:\n            self.logger.error(f\"WeasyPrint conversion failed: {str(e)}\")\n            return False\n\n    def convert_with_pandoc(\n        self, markdown_content: str, output_path: str, use_system_pandoc: bool = False\n    ) -> bool:\n        \"\"\"Convert using Pandoc (best for complex documents)\"\"\"\n        if (\n            not self.available_backends.get(\"pandoc_system\", False)\n            and not use_system_pandoc\n        ):\n            raise RuntimeError(\n                \"Pandoc not available. 
Install from: https://pandoc.org/installing.html\"\n            )\n\n        temp_md_path = None\n        try:\n            import subprocess\n\n            # Create temporary markdown file\n            with tempfile.NamedTemporaryFile(\n                mode=\"w\", suffix=\".md\", delete=False\n            ) as temp_file:\n                temp_file.write(markdown_content)\n                temp_md_path = temp_file.name\n\n            # Build pandoc command with wkhtmltopdf engine\n            cmd = [\n                \"pandoc\",\n                temp_md_path,\n                \"-o\",\n                output_path,\n                \"--pdf-engine=wkhtmltopdf\",\n                \"--standalone\",\n                \"--toc\",\n                \"--number-sections\",\n            ]\n\n            # Run pandoc\n            result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)\n\n            if result.returncode == 0:\n                self.logger.info(\n                    f\"Successfully converted to PDF using Pandoc: {output_path}\"\n                )\n                return True\n            else:\n                self.logger.error(f\"Pandoc conversion failed: {result.stderr}\")\n                return False\n\n        except Exception as e:\n            self.logger.error(f\"Pandoc conversion failed: {str(e)}\")\n            return False\n\n        finally:\n            if temp_md_path and os.path.exists(temp_md_path):\n                try:\n                    os.unlink(temp_md_path)\n                except OSError as e:\n                    self.logger.error(\n                        f\"Failed to clean up temp file {temp_md_path}: {str(e)}\"\n                    )\n\n    def convert_markdown_to_pdf(\n        self, markdown_content: str, output_path: str, method: str = \"auto\"\n    ) -> bool:\n        \"\"\"\n        Convert markdown content to PDF\n\n        Args:\n            markdown_content: Markdown content to convert\n            output_path: 
Output PDF file path\n            method: Conversion method (\"auto\", \"weasyprint\", \"pandoc\", \"pandoc_system\")\n\n        Returns:\n            True if conversion successful, False otherwise\n        \"\"\"\n        if method == \"auto\":\n            method = self._get_recommended_backend()\n\n        try:\n            if method == \"weasyprint\":\n                return self.convert_with_weasyprint(markdown_content, output_path)\n            elif method == \"pandoc\":\n                return self.convert_with_pandoc(markdown_content, output_path)\n            elif method == \"pandoc_system\":\n                return self.convert_with_pandoc(\n                    markdown_content, output_path, use_system_pandoc=True\n                )\n            else:\n                raise ValueError(f\"Unknown conversion method: {method}\")\n\n        except Exception as e:\n            self.logger.error(f\"{method.title()} conversion failed: {str(e)}\")\n            return False\n\n    def convert_file_to_pdf(\n        self, input_path: str, output_path: Optional[str] = None, method: str = \"auto\"\n    ) -> bool:\n        \"\"\"\n        Convert Markdown file to PDF\n\n        Args:\n            input_path: Input Markdown file path\n            output_path: Output PDF file path (optional)\n            method: Conversion method\n\n        Returns:\n            bool: True if conversion successful\n        \"\"\"\n        input_path_obj = Path(input_path)\n\n        if not input_path_obj.exists():\n            raise FileNotFoundError(f\"Input file not found: {input_path}\")\n\n        # Read markdown content\n        try:\n            with open(input_path_obj, \"r\", encoding=\"utf-8\") as f:\n                markdown_content = f.read()\n        except UnicodeDecodeError:\n            # Try with different encodings\n            for encoding in [\"gbk\", \"latin-1\", \"cp1252\"]:\n                try:\n                    with open(input_path_obj, \"r\", 
encoding=encoding) as f:\n                        markdown_content = f.read()\n                    break\n                except UnicodeDecodeError:\n                    continue\n            else:\n                raise RuntimeError(\n                    f\"Could not decode file {input_path} with any supported encoding\"\n                )\n\n        # Determine output path\n        if output_path is None:\n            output_path = str(input_path_obj.with_suffix(\".pdf\"))\n\n        return self.convert_markdown_to_pdf(markdown_content, output_path, method)\n\n    def get_backend_info(self) -> Dict[str, Any]:\n        \"\"\"Get information about available backends\"\"\"\n        return {\n            \"available_backends\": self.available_backends,\n            \"recommended_backend\": self._get_recommended_backend(),\n            \"config\": {\n                \"page_size\": self.config.page_size,\n                \"margin\": self.config.margin,\n                \"font_size\": self.config.font_size,\n                \"include_toc\": self.config.include_toc,\n                \"syntax_highlighting\": self.config.syntax_highlighting,\n            },\n        }\n\n    def _get_recommended_backend(self) -> str:\n        \"\"\"Get recommended backend based on availability\"\"\"\n        if self.available_backends.get(\"pandoc_system\", False):\n            return \"pandoc\"\n        elif self.available_backends.get(\"weasyprint\", False):\n            return \"weasyprint\"\n        else:\n            return \"none\"\n\n\ndef main():\n    \"\"\"Command-line interface for enhanced markdown conversion\"\"\"\n    import argparse\n\n    parser = argparse.ArgumentParser(description=\"Enhanced Markdown to PDF conversion\")\n    parser.add_argument(\"input\", nargs=\"?\", help=\"Input markdown file\")\n    parser.add_argument(\"--output\", \"-o\", help=\"Output PDF file\")\n    parser.add_argument(\n        \"--method\",\n        choices=[\"auto\", \"weasyprint\", \"pandoc\", 
\"pandoc_system\"],\n        default=\"auto\",\n        help=\"Conversion method\",\n    )\n    parser.add_argument(\"--css\", help=\"Custom CSS file\")\n    parser.add_argument(\"--info\", action=\"store_true\", help=\"Show backend information\")\n\n    args = parser.parse_args()\n\n    # Configure logging\n    logging.basicConfig(\n        level=logging.INFO,\n        format=\"%(asctime)s - %(name)s - %(levelname)s - %(message)s\",\n    )\n\n    # Create converter\n    config = MarkdownConfig()\n    if args.css:\n        config.css_file = args.css\n\n    converter = EnhancedMarkdownConverter(config)\n\n    # Show backend info if requested\n    if args.info:\n        info = converter.get_backend_info()\n        print(\"Backend Information:\")\n        for backend, available in info[\"available_backends\"].items():\n            status = \"✅\" if available else \"❌\"\n            print(f\"  {status} {backend}\")\n        print(f\"Recommended backend: {info['recommended_backend']}\")\n        return 0\n\n    # Check if input file is provided\n    if not args.input:\n        parser.error(\"Input file is required when not using --info\")\n\n    # Convert file\n    try:\n        success = converter.convert_file_to_pdf(\n            input_path=args.input, output_path=args.output, method=args.method\n        )\n\n        if success:\n            print(f\"✅ Successfully converted {args.input} to PDF\")\n            return 0\n        else:\n            print(\"❌ Conversion failed\")\n            return 1\n\n    except Exception as e:\n        print(f\"❌ Error: {str(e)}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    exit(main())\n"
  },
  {
    "path": "raganything/modalprocessors.py",
    "content": "\"\"\"\nSpecialized processors for different modalities\n\nIncludes:\n- ContextExtractor: Universal context extraction for multimodal content\n- ImageModalProcessor: Specialized processor for image content\n- TableModalProcessor: Specialized processor for table content\n- EquationModalProcessor: Specialized processor for equation content\n- GenericModalProcessor: Processor for other modal content\n\"\"\"\n\nimport re\nimport json\nimport time\nimport base64\nfrom typing import Dict, Any, Tuple, List\nfrom pathlib import Path\nfrom dataclasses import dataclass\n\nfrom lightrag.utils import (\n    logger,\n    compute_mdhash_id,\n)\nfrom lightrag.lightrag import LightRAG\nfrom dataclasses import asdict\nfrom lightrag.kg.shared_storage import get_namespace_data, get_pipeline_status_lock\nfrom lightrag.operate import extract_entities, merge_nodes_and_edges\n\n# Import prompt templates\nfrom raganything.prompt import PROMPTS\n\n\n@dataclass\nclass ContextConfig:\n    \"\"\"Configuration for context extraction\"\"\"\n\n    context_window: int = 1  # Window size for context extraction\n    context_mode: str = \"page\"  # \"page\", \"chunk\", \"token\"\n    max_context_tokens: int = 2000  # Maximum context tokens\n    include_headers: bool = True  # Whether to include headers/titles\n    include_captions: bool = True  # Whether to include image/table captions\n    filter_content_types: List[str] = None  # Content types to include\n\n    def __post_init__(self):\n        if self.filter_content_types is None:\n            self.filter_content_types = [\"text\"]\n\n\nclass ContextExtractor:\n    \"\"\"Universal context extractor supporting multiple content source formats\"\"\"\n\n    def __init__(self, config: ContextConfig = None, tokenizer=None):\n        \"\"\"Initialize context extractor\n\n        Args:\n            config: Context extraction configuration\n            tokenizer: Tokenizer for accurate token counting\n        \"\"\"\n        self.config = 
config or ContextConfig()\n        self.tokenizer = tokenizer\n\n    def extract_context(\n        self,\n        content_source: Any,\n        current_item_info: Dict[str, Any],\n        content_format: str = \"auto\",\n    ) -> str:\n        \"\"\"Extract context for current item from content source\n\n        Args:\n            content_source: Source content (list, dict, or other format)\n            current_item_info: Information about current item (page_idx, index, etc.)\n            content_format: Format hint for content source (\"minerU\", \"text_chunks\", \"auto\", etc.)\n\n        Returns:\n            Extracted context text\n        \"\"\"\n        if not content_source and not self.config.context_window:\n            return \"\"\n\n        try:\n            # Use format hint if provided, otherwise auto-detect\n            if content_format == \"minerU\" and isinstance(content_source, list):\n                return self._extract_from_content_list(\n                    content_source, current_item_info\n                )\n            elif content_format == \"text_chunks\" and isinstance(content_source, list):\n                return self._extract_from_text_chunks(content_source, current_item_info)\n            elif content_format == \"text\" and isinstance(content_source, str):\n                return self._extract_from_text_source(content_source, current_item_info)\n            else:\n                # Auto-detect content source format\n                if isinstance(content_source, list):\n                    return self._extract_from_content_list(\n                        content_source, current_item_info\n                    )\n                elif isinstance(content_source, dict):\n                    return self._extract_from_dict_source(\n                        content_source, current_item_info\n                    )\n                elif isinstance(content_source, str):\n                    return self._extract_from_text_source(\n                  
      content_source, current_item_info\n                    )\n                else:\n                    logger.warning(\n                        f\"Unsupported content source type: {type(content_source)}\"\n                    )\n                    return \"\"\n        except Exception as e:\n            logger.error(f\"Error extracting context: {e}\")\n            return \"\"\n\n    def _extract_from_content_list(\n        self, content_list: List[Dict], current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context from MinerU-style content list\n\n        Args:\n            content_list: List of content items with page_idx and type info\n            current_item_info: Current item information\n\n        Returns:\n            Context text from surrounding pages/chunks\n        \"\"\"\n        if self.config.context_mode == \"page\":\n            return self._extract_page_context(content_list, current_item_info)\n        elif self.config.context_mode == \"chunk\":\n            return self._extract_chunk_context(content_list, current_item_info)\n        else:\n            return self._extract_page_context(content_list, current_item_info)\n\n    def _extract_page_context(\n        self, content_list: List[Dict], current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context based on page boundaries\n\n        Args:\n            content_list: List of content items\n            current_item_info: Current item with page_idx\n\n        Returns:\n            Context text from surrounding pages\n        \"\"\"\n        current_page = current_item_info.get(\"page_idx\", 0)\n        window_size = self.config.context_window\n\n        start_page = max(0, current_page - window_size)\n        end_page = current_page + window_size + 1\n\n        context_texts = []\n\n        for item in content_list:\n            item_page = item.get(\"page_idx\", 0)\n            item_type = item.get(\"type\", \"\")\n\n            # Check if item is within context window and 
matches filter criteria\n            if (\n                start_page <= item_page < end_page\n                and item_type in self.config.filter_content_types\n            ):\n                text_content = self._extract_text_from_item(item)\n                if text_content and text_content.strip():\n                    # Add page marker for better context understanding\n                    if item_page != current_page:\n                        context_texts.append(f\"[Page {item_page}] {text_content}\")\n                    else:\n                        context_texts.append(text_content)\n\n        context = \"\\n\".join(context_texts)\n        return self._truncate_context(context)\n\n    def _extract_chunk_context(\n        self, content_list: List[Dict], current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context based on content chunks\n\n        Args:\n            content_list: List of content items\n            current_item_info: Current item with index info\n\n        Returns:\n            Context text from surrounding chunks\n        \"\"\"\n        current_index = current_item_info.get(\"index\", 0)\n        window_size = self.config.context_window\n\n        start_idx = max(0, current_index - window_size)\n        end_idx = min(len(content_list), current_index + window_size + 1)\n\n        context_texts = []\n\n        for i in range(start_idx, end_idx):\n            if i != current_index:\n                item = content_list[i]\n                item_type = item.get(\"type\", \"\")\n\n                if item_type in self.config.filter_content_types:\n                    text_content = self._extract_text_from_item(item)\n                    if text_content and text_content.strip():\n                        context_texts.append(text_content)\n\n        context = \"\\n\".join(context_texts)\n        return self._truncate_context(context)\n\n    def _extract_text_from_item(self, item: Dict) -> str:\n        \"\"\"Extract text content from a 
content item\n\n        Args:\n            item: Content item dictionary\n\n        Returns:\n            Extracted text content\n        \"\"\"\n        item_type = item.get(\"type\", \"\")\n\n        if item_type == \"text\":\n            text = item.get(\"text\", \"\")\n            text_level = item.get(\"text_level\", 0)\n\n            # Add header indication for structured content\n            if self.config.include_headers and text_level > 0:\n                return f\"{'#' * text_level} {text}\"\n            return text\n\n        elif item_type == \"image\" and self.config.include_captions:\n            captions = item.get(\"image_caption\", item.get(\"img_caption\", []))\n            if captions:\n                return f\"[Image: {', '.join(captions)}]\"\n\n        elif item_type == \"table\" and self.config.include_captions:\n            captions = item.get(\"table_caption\", [])\n            if captions:\n                return f\"[Table: {', '.join(captions)}]\"\n\n        return \"\"\n\n    def _extract_from_dict_source(\n        self, dict_source: Dict, current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context from dictionary-based content source\n\n        Args:\n            dict_source: Dictionary containing content\n            current_item_info: Current item information\n\n        Returns:\n            Extracted context text\n        \"\"\"\n        # Handle different dictionary structures\n        if \"content\" in dict_source:\n            context = str(dict_source[\"content\"])\n        elif \"text\" in dict_source:\n            context = str(dict_source[\"text\"])\n        else:\n            # Try to extract any string values\n            text_parts = []\n            for value in dict_source.values():\n                if isinstance(value, str):\n                    text_parts.append(value)\n            context = \"\\n\".join(text_parts)\n\n        return self._truncate_context(context)\n\n    def _extract_from_text_source(\n      
  self, text_source: str, current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context from plain text source\n\n        Args:\n            text_source: Plain text content\n            current_item_info: Current item information\n\n        Returns:\n            Truncated text context\n        \"\"\"\n        return self._truncate_context(text_source)\n\n    def _extract_from_text_chunks(\n        self, text_chunks: List[str], current_item_info: Dict\n    ) -> str:\n        \"\"\"Extract context from simple text chunks list\n\n        Args:\n            text_chunks: List of text strings\n            current_item_info: Current item information with index\n\n        Returns:\n            Context text from surrounding chunks\n        \"\"\"\n        current_index = current_item_info.get(\"index\", 0)\n        window_size = self.config.context_window\n\n        start_idx = max(0, current_index - window_size)\n        end_idx = min(len(text_chunks), current_index + window_size + 1)\n\n        context_texts = []\n        for i in range(start_idx, end_idx):\n            if i != current_index:  # Exclude current chunk\n                if i < len(text_chunks):\n                    chunk_text = str(text_chunks[i]).strip()\n                    if chunk_text:\n                        context_texts.append(chunk_text)\n\n        context = \"\\n\".join(context_texts)\n        return self._truncate_context(context)\n\n    def _truncate_context(self, context: str) -> str:\n        \"\"\"Truncate context to maximum token limit\n\n        Args:\n            context: Context text to truncate\n\n        Returns:\n            Truncated context text\n        \"\"\"\n        if not context:\n            return \"\"\n\n        # Use tokenizer if available for accurate token counting\n        if self.tokenizer:\n            tokens = self.tokenizer.encode(context)\n            if len(tokens) <= self.config.max_context_tokens:\n                return context\n\n            # Truncate 
to max tokens and decode back to text\n            truncated_tokens = tokens[: self.config.max_context_tokens]\n            truncated_text = self.tokenizer.decode(truncated_tokens)\n\n            # Try to end at a sentence boundary\n            last_period = truncated_text.rfind(\".\")\n            last_newline = truncated_text.rfind(\"\\n\")\n\n            if last_period > len(truncated_text) * 0.8:\n                return truncated_text[: last_period + 1]\n            elif last_newline > len(truncated_text) * 0.8:\n                return truncated_text[:last_newline]\n            else:\n                return truncated_text + \"...\"\n        else:\n            # Fallback to character-based truncation if no tokenizer\n            if len(context) <= self.config.max_context_tokens:\n                return context\n\n            # Simple truncation - fallback when no tokenizer available\n            truncated = context[: self.config.max_context_tokens]\n\n            # Try to end at a sentence boundary\n            last_period = truncated.rfind(\".\")\n            last_newline = truncated.rfind(\"\\n\")\n\n            if last_period > len(truncated) * 0.8:\n                return truncated[: last_period + 1]\n            elif last_newline > len(truncated) * 0.8:\n                return truncated[:last_newline]\n            else:\n                return truncated + \"...\"\n\n\nclass BaseModalProcessor:\n    \"\"\"Base class for modal processors\"\"\"\n\n    def __init__(\n        self,\n        lightrag: LightRAG,\n        modal_caption_func,\n        context_extractor: ContextExtractor = None,\n    ):\n        \"\"\"Initialize base processor\n\n        Args:\n            lightrag: LightRAG instance\n            modal_caption_func: Function for generating descriptions\n            context_extractor: Context extractor instance\n        \"\"\"\n        self.lightrag = lightrag\n        self.modal_caption_func = modal_caption_func\n\n        # Use LightRAG's storage 
instances\n        self.text_chunks_db = lightrag.text_chunks\n        self.chunks_vdb = lightrag.chunks_vdb\n        self.entities_vdb = lightrag.entities_vdb\n        self.relationships_vdb = lightrag.relationships_vdb\n        self.knowledge_graph_inst = lightrag.chunk_entity_relation_graph\n\n        # Use LightRAG's configuration and functions\n        self.embedding_func = lightrag.embedding_func\n        self.llm_model_func = lightrag.llm_model_func\n        self.global_config = asdict(lightrag)\n        self.hashing_kv = lightrag.llm_response_cache\n        self.tokenizer = lightrag.tokenizer\n\n        # Initialize context extractor with tokenizer if not provided\n        if context_extractor is None:\n            self.context_extractor = ContextExtractor(tokenizer=self.tokenizer)\n        else:\n            self.context_extractor = context_extractor\n            # Update tokenizer if context_extractor doesn't have one\n            if self.context_extractor.tokenizer is None:\n                self.context_extractor.tokenizer = self.tokenizer\n\n        # Content source for context extraction\n        self.content_source = None\n        self.content_format = \"auto\"\n\n    def set_content_source(self, content_source: Any, content_format: str = \"auto\"):\n        \"\"\"Set content source for context extraction\n\n        Args:\n            content_source: Source content for context extraction\n            content_format: Format of content source (\"minerU\", \"text_chunks\", \"auto\")\n        \"\"\"\n        self.content_source = content_source\n        self.content_format = content_format\n        logger.info(f\"Content source set with format: {content_format}\")\n\n    def _get_context_for_item(self, item_info: Dict[str, Any]) -> str:\n        \"\"\"Get context for current processing item\n\n        Args:\n            item_info: Information about current item (page_idx, index, etc.)\n\n        Returns:\n            Context text for the item\n        
\"\"\"\n        if not self.content_source:\n            return \"\"\n\n        try:\n            context = self.context_extractor.extract_context(\n                self.content_source, item_info, self.content_format\n            )\n            if context:\n                logger.debug(\n                    f\"Extracted context of length {len(context)} for item: {item_info}\"\n                )\n            return context\n        except Exception as e:\n            logger.error(f\"Error getting context for item {item_info}: {e}\")\n            return \"\"\n\n    async def generate_description_only(\n        self,\n        modal_content,\n        content_type: str,\n        item_info: Dict[str, Any] = None,\n        entity_name: str = None,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"\n        Generate text description and entity info only, without entity relation extraction.\n        Used for batch processing stage 1.\n\n        Args:\n            modal_content: Modal content to process\n            content_type: Type of modal content\n            item_info: Item information for context extraction\n            entity_name: Optional predefined entity name\n\n        Returns:\n            Tuple of (description, entity_info)\n        \"\"\"\n        # Subclasses must implement this method\n        raise NotImplementedError(\"Subclasses must implement this method\")\n\n    async def _create_entity_and_chunk(\n        self,\n        modal_chunk: str,\n        entity_info: Dict[str, Any],\n        file_path: str,\n        batch_mode: bool = False,\n        doc_id: str = None,\n        chunk_order_index: int = 0,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Create entity and text chunk\"\"\"\n        # Create chunk\n        chunk_id = compute_mdhash_id(str(modal_chunk), prefix=\"chunk-\")\n        tokens = len(self.tokenizer.encode(modal_chunk))\n\n        # Use provided doc_id or generate one from chunk_id for backward compatibility\n        
actual_doc_id = doc_id if doc_id else chunk_id\n\n        chunk_data = {\n            \"tokens\": tokens,\n            \"content\": modal_chunk,\n            \"chunk_order_index\": chunk_order_index,\n            \"full_doc_id\": actual_doc_id,  # Use proper document ID\n            \"file_path\": file_path,\n        }\n\n        # Store chunk\n        await self.text_chunks_db.upsert({chunk_id: chunk_data})\n\n        # Store chunk in vector database for retrieval\n        chunk_vdb_data = {\n            chunk_id: {\n                \"content\": modal_chunk,\n                \"full_doc_id\": actual_doc_id,\n                \"tokens\": tokens,\n                \"chunk_order_index\": chunk_order_index,\n                \"file_path\": file_path,\n            }\n        }\n        await self.chunks_vdb.upsert(chunk_vdb_data)\n\n        # Create entity node\n        node_data = {\n            \"entity_id\": entity_info[\"entity_name\"],\n            \"entity_type\": entity_info[\"entity_type\"],\n            \"description\": entity_info[\"summary\"],\n            \"source_id\": chunk_id,\n            \"file_path\": file_path,\n            \"created_at\": int(time.time()),\n        }\n\n        await self.knowledge_graph_inst.upsert_node(\n            entity_info[\"entity_name\"], node_data\n        )\n\n        # Insert entity into vector database\n        entity_vdb_data = {\n            compute_mdhash_id(entity_info[\"entity_name\"], prefix=\"ent-\"): {\n                \"entity_name\": entity_info[\"entity_name\"],\n                \"entity_type\": entity_info[\"entity_type\"],\n                \"content\": f\"{entity_info['entity_name']}\\n{entity_info['summary']}\",\n                \"source_id\": chunk_id,\n                \"file_path\": file_path,\n            }\n        }\n        await self.entities_vdb.upsert(entity_vdb_data)\n\n        # Process entity and relationship extraction\n        chunk_results = await self._process_chunk_for_extraction(\n            
chunk_id, entity_info[\"entity_name\"], batch_mode\n        )\n\n        return (\n            entity_info[\"summary\"],\n            {\n                \"entity_name\": entity_info[\"entity_name\"],\n                \"entity_type\": entity_info[\"entity_type\"],\n                \"description\": entity_info[\"summary\"],\n                \"chunk_id\": chunk_id,\n            },\n            chunk_results,\n        )\n\n    def _robust_json_parse(self, response: str) -> dict:\n        \"\"\"Robust JSON parsing with multiple fallback strategies\"\"\"\n\n        # Strategy 1: Try direct parsing first\n        for json_candidate in self._extract_all_json_candidates(response):\n            result = self._try_parse_json(json_candidate)\n            if result:\n                return result\n\n        # Strategy 2: Try with basic cleanup\n        for json_candidate in self._extract_all_json_candidates(response):\n            cleaned = self._basic_json_cleanup(json_candidate)\n            result = self._try_parse_json(cleaned)\n            if result:\n                return result\n\n        # Strategy 3: Try progressive quote fixing\n        for json_candidate in self._extract_all_json_candidates(response):\n            fixed = self._progressive_quote_fix(json_candidate)\n            result = self._try_parse_json(fixed)\n            if result:\n                return result\n\n        # Strategy 4: Fallback to regex field extraction\n        return self._extract_fields_with_regex(response)\n\n    def _extract_all_json_candidates(self, response: str) -> list:\n        \"\"\"Extract all possible JSON candidates from response\"\"\"\n        candidates = []\n\n        import re\n\n        # Pre-process: Remove thinking/reasoning tags that some models use\n        # This handles models like qwen2.5-think, deepseek-r1 that wrap reasoning in tags\n        cleaned_response = re.sub(\n            r\"<think>.*?</think>\", \"\", response, flags=re.DOTALL | re.IGNORECASE\n        )\n 
       cleaned_response = re.sub(\n            r\"<thinking>.*?</thinking>\",\n            \"\",\n            cleaned_response,\n            flags=re.DOTALL | re.IGNORECASE,\n        )\n\n        # Method 1: JSON in code blocks\n        json_blocks = re.findall(\n            r\"```(?:json)?\\s*(\\{.*?\\})\\s*```\", cleaned_response, re.DOTALL\n        )\n        candidates.extend(json_blocks)\n\n        # Method 2: Balanced braces\n        brace_count = 0\n        start_pos = -1\n\n        for i, char in enumerate(cleaned_response):\n            if char == \"{\":\n                if brace_count == 0:\n                    start_pos = i\n                brace_count += 1\n            elif char == \"}\":\n                brace_count -= 1\n                if brace_count == 0 and start_pos != -1:\n                    candidates.append(cleaned_response[start_pos : i + 1])\n\n        # Method 3: Simple regex fallback\n        simple_match = re.search(r\"\\{.*\\}\", cleaned_response, re.DOTALL)\n        if simple_match:\n            candidates.append(simple_match.group(0))\n\n        return candidates\n\n    def _try_parse_json(self, json_str: str) -> dict:\n        \"\"\"Try to parse JSON string, return None if failed\"\"\"\n        if not json_str or not json_str.strip():\n            return None\n\n        try:\n            return json.loads(json_str)\n        except (json.JSONDecodeError, ValueError):\n            return None\n\n    def _basic_json_cleanup(self, json_str: str) -> str:\n        \"\"\"Basic cleanup for common JSON issues\"\"\"\n        # Remove extra whitespace\n        json_str = json_str.strip()\n\n        # Fix common quote issues\n        json_str = json_str.replace('\"', '\"').replace('\"', '\"')  # Smart quotes\n        json_str = json_str.replace(\"\"\", \"'\").replace(\"\"\", \"'\")  # Smart apostrophes\n\n        # Fix trailing commas (simple case)\n        json_str = re.sub(r\",(\\s*[}\\]])\", r\"\\1\", json_str)\n\n        return json_str\n\n   
 def _progressive_quote_fix(self, json_str: str) -> str:\n        \"\"\"Progressive fixing of quote and escape issues\"\"\"\n        # Only escape unescaped backslashes before quotes\n        json_str = re.sub(r'(?<!\\\\)\\\\(?=\")', r\"\\\\\\\\\", json_str)\n\n        # Fix unescaped backslashes in string values (more conservative)\n        def fix_string_content(match):\n            content = match.group(1)\n            # Only escape obvious problematic patterns\n            content = re.sub(r\"\\\\(?=[a-zA-Z])\", r\"\\\\\\\\\", content)  # \\alpha -> \\\\alpha\n            return f'\"{content}\"'\n\n        json_str = re.sub(r'\"([^\"]*(?:\\\\.[^\"]*)*)\"', fix_string_content, json_str)\n        return json_str\n\n    def _extract_fields_with_regex(self, response: str) -> dict:\n        \"\"\"Extract required fields using regex as last resort\"\"\"\n        logger.warning(\"Using regex fallback for JSON parsing\")\n\n        # Extract detailed_description\n        desc_match = re.search(\n            r'\"detailed_description\":\\s*\"([^\"]*(?:\\\\.[^\"]*)*)\"', response, re.DOTALL\n        )\n        description = desc_match.group(1) if desc_match else \"\"\n\n        # Extract entity_name\n        name_match = re.search(r'\"entity_name\":\\s*\"([^\"]*(?:\\\\.[^\"]*)*)\"', response)\n        entity_name = name_match.group(1) if name_match else \"unknown_entity\"\n\n        # Extract entity_type\n        type_match = re.search(r'\"entity_type\":\\s*\"([^\"]*(?:\\\\.[^\"]*)*)\"', response)\n        entity_type = type_match.group(1) if type_match else \"unknown\"\n\n        # Extract summary\n        summary_match = re.search(\n            r'\"summary\":\\s*\"([^\"]*(?:\\\\.[^\"]*)*)\"', response, re.DOTALL\n        )\n        summary = summary_match.group(1) if summary_match else description[:100]\n\n        return {\n            \"detailed_description\": description,\n            \"entity_info\": {\n                \"entity_name\": entity_name,\n                
\"entity_type\": entity_type,\n                \"summary\": summary,\n            },\n        }\n\n    def _extract_json_from_response(self, response: str) -> str:\n        \"\"\"Legacy method - now handled by _extract_all_json_candidates\"\"\"\n        candidates = self._extract_all_json_candidates(response)\n        return candidates[0] if candidates else None\n\n    def _fix_json_escapes(self, json_str: str) -> str:\n        \"\"\"Legacy method - now handled by progressive strategies\"\"\"\n        return self._progressive_quote_fix(json_str)\n\n    async def _process_chunk_for_extraction(\n        self, chunk_id: str, modal_entity_name: str, batch_mode: bool = False\n    ):\n        \"\"\"Process chunk for entity and relationship extraction\"\"\"\n        chunk_data = await self.text_chunks_db.get_by_id(chunk_id)\n        if not chunk_data:\n            logger.error(f\"Chunk {chunk_id} not found\")\n            return\n\n        # Create text chunk for vector database\n        chunk_vdb_data = {\n            chunk_id: {\n                \"content\": chunk_data[\"content\"],\n                \"full_doc_id\": chunk_id,\n                \"tokens\": chunk_data[\"tokens\"],\n                \"chunk_order_index\": chunk_data[\"chunk_order_index\"],\n                \"file_path\": chunk_data[\"file_path\"],\n            }\n        }\n\n        await self.chunks_vdb.upsert(chunk_vdb_data)\n\n        pipeline_status = await get_namespace_data(\"pipeline_status\")\n        pipeline_status_lock = get_pipeline_status_lock()\n\n        # Prepare chunk for extraction\n        chunks = {chunk_id: chunk_data}\n\n        # Extract entities and relationships\n        chunk_results = await extract_entities(\n            chunks=chunks,\n            global_config=self.global_config,\n            pipeline_status=pipeline_status,\n            pipeline_status_lock=pipeline_status_lock,\n            llm_response_cache=self.hashing_kv,\n        )\n\n        # Add \"belongs_to\" 
relationships for all extracted entities\n        processed_chunk_results = []\n        for maybe_nodes, maybe_edges in chunk_results:\n            for entity_name in maybe_nodes.keys():\n                if entity_name != modal_entity_name:  # Skip self-relationship\n                    # Create belongs_to relationship\n                    relation_data = {\n                        \"description\": f\"Entity {entity_name} belongs to {modal_entity_name}\",\n                        \"keywords\": \"belongs_to,part_of,contained_in\",\n                        \"source_id\": chunk_id,\n                        \"weight\": 10.0,\n                        \"file_path\": chunk_data.get(\"file_path\", \"manual_creation\"),\n                    }\n                    await self.knowledge_graph_inst.upsert_edge(\n                        entity_name, modal_entity_name, relation_data\n                    )\n\n                    relation_id = compute_mdhash_id(\n                        entity_name + modal_entity_name, prefix=\"rel-\"\n                    )\n                    relation_vdb_data = {\n                        relation_id: {\n                            \"src_id\": entity_name,\n                            \"tgt_id\": modal_entity_name,\n                            \"keywords\": relation_data[\"keywords\"],\n                            \"content\": f\"{relation_data['keywords']}\\t{entity_name}\\n{modal_entity_name}\\n{relation_data['description']}\",\n                            \"source_id\": chunk_id,\n                            \"file_path\": chunk_data.get(\"file_path\", \"manual_creation\"),\n                        }\n                    }\n                    await self.relationships_vdb.upsert(relation_vdb_data)\n\n                    # Add to maybe_edges\n                    maybe_edges[(entity_name, modal_entity_name)] = [relation_data]\n\n            processed_chunk_results.append((maybe_nodes, maybe_edges))\n\n        if not batch_mode:\n            # 
Merge with correct file_path parameter\n            file_path = chunk_data.get(\"file_path\", \"manual_creation\")\n            await merge_nodes_and_edges(\n                chunk_results=chunk_results,\n                knowledge_graph_inst=self.knowledge_graph_inst,\n                entity_vdb=self.entities_vdb,\n                relationships_vdb=self.relationships_vdb,\n                global_config=self.global_config,\n                pipeline_status=pipeline_status,\n                pipeline_status_lock=pipeline_status_lock,\n                llm_response_cache=self.hashing_kv,\n                current_file_number=1,\n                total_files=1,\n                file_path=file_path,  # Pass the correct file_path\n            )\n\n            # Ensure all storage updates are complete\n            await self.lightrag._insert_done()\n\n        return processed_chunk_results\n\n\nclass ImageModalProcessor(BaseModalProcessor):\n    \"\"\"Processor specialized for image content\"\"\"\n\n    def __init__(\n        self,\n        lightrag: LightRAG,\n        modal_caption_func,\n        context_extractor: ContextExtractor = None,\n    ):\n        \"\"\"Initialize image processor\n\n        Args:\n            lightrag: LightRAG instance\n            modal_caption_func: Function for generating descriptions (supporting image understanding)\n            context_extractor: Context extractor instance\n        \"\"\"\n        super().__init__(lightrag, modal_caption_func, context_extractor)\n\n    def _encode_image_to_base64(self, image_path: str) -> str:\n        \"\"\"Encode image to base64\"\"\"\n        try:\n            with open(image_path, \"rb\") as image_file:\n                encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n            return encoded_string\n        except Exception as e:\n            logger.error(f\"Failed to encode image {image_path}: {e}\")\n            return \"\"\n\n    async def generate_description_only(\n        
self,\n        modal_content,\n        content_type: str,\n        item_info: Dict[str, Any] = None,\n        entity_name: str = None,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"\n        Generate image description and entity info only, without entity relation extraction.\n        Used for batch processing stage 1.\n\n        Args:\n            modal_content: Image content to process\n            content_type: Type of modal content (\"image\")\n            item_info: Item information for context extraction\n            entity_name: Optional predefined entity name\n\n        Returns:\n            Tuple of (enhanced_caption, entity_info)\n        \"\"\"\n        try:\n            # Parse image content (reuse existing logic)\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"description\": modal_content}\n            else:\n                content_data = modal_content\n\n            image_path = content_data.get(\"img_path\")\n            captions = content_data.get(\n                \"image_caption\", content_data.get(\"img_caption\", [])\n            )\n            footnotes = content_data.get(\n                \"image_footnote\", content_data.get(\"img_footnote\", [])\n            )\n\n            # Validate image path\n            if not image_path:\n                raise ValueError(\n                    f\"No image path provided in modal_content: {modal_content}\"\n                )\n\n            # Convert to Path object and check if it exists\n            image_path_obj = Path(image_path)\n            if not image_path_obj.exists():\n                raise FileNotFoundError(f\"Image file not found: {image_path}\")\n\n            # Extract context for current item\n            context = \"\"\n            if item_info:\n                context = self._get_context_for_item(item_info)\n\n      
      # Build detailed visual analysis prompt with context\n            if context:\n                vision_prompt = PROMPTS.get(\n                    \"vision_prompt_with_context\", PROMPTS[\"vision_prompt\"]\n                ).format(\n                    context=context,\n                    entity_name=entity_name\n                    if entity_name\n                    else \"unique descriptive name for this image\",\n                    image_path=image_path,\n                    captions=captions if captions else \"None\",\n                    footnotes=footnotes if footnotes else \"None\",\n                )\n            else:\n                vision_prompt = PROMPTS[\"vision_prompt\"].format(\n                    entity_name=entity_name\n                    if entity_name\n                    else \"unique descriptive name for this image\",\n                    image_path=image_path,\n                    captions=captions if captions else \"None\",\n                    footnotes=footnotes if footnotes else \"None\",\n                )\n\n            # Encode image to base64\n            image_base64 = self._encode_image_to_base64(image_path)\n            if not image_base64:\n                raise RuntimeError(f\"Failed to encode image to base64: {image_path}\")\n\n            # Call vision model with encoded image\n            response = await self.modal_caption_func(\n                vision_prompt,\n                image_data=image_base64,\n                system_prompt=PROMPTS[\"IMAGE_ANALYSIS_SYSTEM\"],\n            )\n\n            # Parse response (reuse existing logic)\n            enhanced_caption, entity_info = self._parse_response(response, entity_name)\n\n            return enhanced_caption, entity_info\n\n        except Exception as e:\n            logger.error(f\"Error generating image description: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if 
entity_name\n                else f\"image_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": \"image\",\n                \"summary\": f\"Image content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    async def process_multimodal_content(\n        self,\n        modal_content,\n        content_type: str,\n        file_path: str = \"manual_creation\",\n        entity_name: str = None,\n        item_info: Dict[str, Any] = None,\n        batch_mode: bool = False,\n        doc_id: str = None,\n        chunk_order_index: int = 0,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Process image content with context support\"\"\"\n        try:\n            # Generate description and entity info\n            enhanced_caption, entity_info = await self.generate_description_only(\n                modal_content, content_type, item_info, entity_name\n            )\n\n            # Build complete image content\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"description\": modal_content}\n            else:\n                content_data = modal_content\n\n            image_path = content_data.get(\"img_path\", \"\")\n            captions = content_data.get(\n                \"image_caption\", content_data.get(\"img_caption\", [])\n            )\n            footnotes = content_data.get(\n                \"image_footnote\", content_data.get(\"img_footnote\", [])\n            )\n\n            modal_chunk = PROMPTS[\"image_chunk\"].format(\n                image_path=image_path,\n                captions=\", \".join(captions) if captions else \"None\",\n                footnotes=\", \".join(footnotes) if footnotes else \"None\",\n                enhanced_caption=enhanced_caption,\n            )\n\n            return await 
self._create_entity_and_chunk(\n                modal_chunk,\n                entity_info,\n                file_path,\n                batch_mode,\n                doc_id,\n                chunk_order_index,\n            )\n\n        except Exception as e:\n            logger.error(f\"Error processing image content: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"image_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": \"image\",\n                \"summary\": f\"Image content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    def _parse_response(\n        self, response: str, entity_name: str = None\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Parse model response\"\"\"\n        try:\n            response_data = self._robust_json_parse(response)\n\n            description = response_data.get(\"detailed_description\", \"\")\n            entity_data = response_data.get(\"entity_info\", {})\n\n            if not description or not entity_data:\n                raise ValueError(\"Missing required fields in response\")\n\n            if not all(\n                key in entity_data for key in [\"entity_name\", \"entity_type\", \"summary\"]\n            ):\n                raise ValueError(\"Missing required fields in entity_info\")\n\n            entity_data[\"entity_name\"] = (\n                entity_data[\"entity_name\"] + f\" ({entity_data['entity_type']})\"\n            )\n            if entity_name:\n                entity_data[\"entity_name\"] = entity_name\n\n            return description, entity_data\n\n        except (json.JSONDecodeError, AttributeError, ValueError) as e:\n            logger.error(f\"Error parsing image analysis response: {e}\")\n            logger.debug(f\"Raw response: {response}\")\n            fallback_entity = {\n      
          \"entity_name\": entity_name\n                if entity_name\n                else f\"image_{compute_mdhash_id(response)}\",\n                \"entity_type\": \"image\",\n                \"summary\": response[:100] + \"...\" if len(response) > 100 else response,\n            }\n            return response, fallback_entity\n\n\nclass TableModalProcessor(BaseModalProcessor):\n    \"\"\"Processor specialized for table content\"\"\"\n\n    async def generate_description_only(\n        self,\n        modal_content,\n        content_type: str,\n        item_info: Dict[str, Any] = None,\n        entity_name: str = None,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"\n        Generate table description and entity info only, without entity relation extraction.\n        Used for batch processing stage 1.\n\n        Args:\n            modal_content: Table content to process\n            content_type: Type of modal content (\"table\")\n            item_info: Item information for context extraction\n            entity_name: Optional predefined entity name\n\n        Returns:\n            Tuple of (enhanced_caption, entity_info)\n        \"\"\"\n        try:\n            # Parse table content (reuse existing logic)\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"table_body\": modal_content}\n            else:\n                content_data = modal_content\n\n            table_img_path = content_data.get(\"img_path\")\n            table_caption = content_data.get(\"table_caption\", [])\n            table_body = content_data.get(\"table_body\", \"\")\n            table_footnote = content_data.get(\"table_footnote\", [])\n\n            # Extract context for current item\n            context = \"\"\n            if item_info:\n                context = self._get_context_for_item(item_info)\n\n          
  # Build table analysis prompt with context\n            if context:\n                table_prompt = PROMPTS.get(\n                    \"table_prompt_with_context\", PROMPTS[\"table_prompt\"]\n                ).format(\n                    context=context,\n                    entity_name=entity_name\n                    if entity_name\n                    else \"descriptive name for this table\",\n                    table_img_path=table_img_path,\n                    table_caption=table_caption if table_caption else \"None\",\n                    table_body=table_body,\n                    table_footnote=table_footnote if table_footnote else \"None\",\n                )\n            else:\n                table_prompt = PROMPTS[\"table_prompt\"].format(\n                    entity_name=entity_name\n                    if entity_name\n                    else \"descriptive name for this table\",\n                    table_img_path=table_img_path,\n                    table_caption=table_caption if table_caption else \"None\",\n                    table_body=table_body,\n                    table_footnote=table_footnote if table_footnote else \"None\",\n                )\n\n            # Call LLM for table analysis\n            response = await self.modal_caption_func(\n                table_prompt,\n                system_prompt=PROMPTS[\"TABLE_ANALYSIS_SYSTEM\"],\n            )\n\n            # Parse response (reuse existing logic)\n            enhanced_caption, entity_info = self._parse_table_response(\n                response, entity_name\n            )\n\n            return enhanced_caption, entity_info\n\n        except Exception as e:\n            logger.error(f\"Error generating table description: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"table_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": 
\"table\",\n                \"summary\": f\"Table content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    async def process_multimodal_content(\n        self,\n        modal_content,\n        content_type: str,\n        file_path: str = \"manual_creation\",\n        entity_name: str = None,\n        item_info: Dict[str, Any] = None,\n        batch_mode: bool = False,\n        doc_id: str = None,\n        chunk_order_index: int = 0,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Process table content with context support\"\"\"\n        try:\n            # Generate description and entity info\n            enhanced_caption, entity_info = await self.generate_description_only(\n                modal_content, content_type, item_info, entity_name\n            )\n\n            # Parse table content for building complete chunk\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"table_body\": modal_content}\n            else:\n                content_data = modal_content\n\n            table_img_path = content_data.get(\"img_path\")\n            table_caption = content_data.get(\"table_caption\", [])\n            table_body = content_data.get(\"table_body\", \"\")\n            table_footnote = content_data.get(\"table_footnote\", [])\n\n            # Build complete table content\n            modal_chunk = PROMPTS[\"table_chunk\"].format(\n                table_img_path=table_img_path,\n                table_caption=\", \".join(table_caption) if table_caption else \"None\",\n                table_body=table_body,\n                table_footnote=\", \".join(table_footnote) if table_footnote else \"None\",\n                enhanced_caption=enhanced_caption,\n            )\n\n            return await self._create_entity_and_chunk(\n                
modal_chunk,\n                entity_info,\n                file_path,\n                batch_mode,\n                doc_id,\n                chunk_order_index,\n            )\n\n        except Exception as e:\n            logger.error(f\"Error processing table content: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"table_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": \"table\",\n                \"summary\": f\"Table content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    def _parse_table_response(\n        self, response: str, entity_name: str = None\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Parse table analysis response\"\"\"\n        try:\n            response_data = self._robust_json_parse(response)\n\n            description = response_data.get(\"detailed_description\", \"\")\n            entity_data = response_data.get(\"entity_info\", {})\n\n            if not description or not entity_data:\n                raise ValueError(\"Missing required fields in response\")\n\n            if not all(\n                key in entity_data for key in [\"entity_name\", \"entity_type\", \"summary\"]\n            ):\n                raise ValueError(\"Missing required fields in entity_info\")\n\n            entity_data[\"entity_name\"] = (\n                entity_data[\"entity_name\"] + f\" ({entity_data['entity_type']})\"\n            )\n            if entity_name:\n                entity_data[\"entity_name\"] = entity_name\n\n            return description, entity_data\n\n        except (json.JSONDecodeError, AttributeError, ValueError) as e:\n            logger.error(f\"Error parsing table analysis response: {e}\")\n            logger.debug(f\"Raw response: {response}\")\n            fallback_entity = {\n                \"entity_name\": 
entity_name\n                if entity_name\n                else f\"table_{compute_mdhash_id(response)}\",\n                \"entity_type\": \"table\",\n                \"summary\": response[:100] + \"...\" if len(response) > 100 else response,\n            }\n            return response, fallback_entity\n\n\nclass EquationModalProcessor(BaseModalProcessor):\n    \"\"\"Processor specialized for equation content\"\"\"\n\n    async def generate_description_only(\n        self,\n        modal_content,\n        content_type: str,\n        item_info: Dict[str, Any] = None,\n        entity_name: str = None,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"\n        Generate equation description and entity info only, without entity relation extraction.\n        Used for batch processing stage 1.\n\n        Args:\n            modal_content: Equation content to process\n            content_type: Type of modal content (\"equation\")\n            item_info: Item information for context extraction\n            entity_name: Optional predefined entity name\n\n        Returns:\n            Tuple of (enhanced_caption, entity_info)\n        \"\"\"\n        try:\n            # Parse equation content (reuse existing logic)\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"equation\": modal_content}\n            else:\n                content_data = modal_content\n\n            equation_text = content_data.get(\"text\")\n            equation_format = content_data.get(\"text_format\", \"\")\n\n            # Extract context for current item\n            context = \"\"\n            if item_info:\n                context = self._get_context_for_item(item_info)\n\n            # Build equation analysis prompt with context\n            if context:\n                equation_prompt = PROMPTS.get(\n                    
\"equation_prompt_with_context\", PROMPTS[\"equation_prompt\"]\n                ).format(\n                    context=context,\n                    equation_text=equation_text,\n                    equation_format=equation_format,\n                    entity_name=entity_name\n                    if entity_name\n                    else \"descriptive name for this equation\",\n                )\n            else:\n                equation_prompt = PROMPTS[\"equation_prompt\"].format(\n                    equation_text=equation_text,\n                    equation_format=equation_format,\n                    entity_name=entity_name\n                    if entity_name\n                    else \"descriptive name for this equation\",\n                )\n\n            # Call LLM for equation analysis\n            response = await self.modal_caption_func(\n                equation_prompt,\n                system_prompt=PROMPTS[\"EQUATION_ANALYSIS_SYSTEM\"],\n            )\n\n            # Parse response (reuse existing logic)\n            enhanced_caption, entity_info = self._parse_equation_response(\n                response, entity_name\n            )\n\n            return enhanced_caption, entity_info\n\n        except Exception as e:\n            logger.error(f\"Error generating equation description: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"equation_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": \"equation\",\n                \"summary\": f\"Equation content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    async def process_multimodal_content(\n        self,\n        modal_content,\n        content_type: str,\n        file_path: str = \"manual_creation\",\n        entity_name: str = None,\n        item_info: Dict[str, Any] = None,\n        
batch_mode: bool = False,\n        doc_id: str = None,\n        chunk_order_index: int = 0,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Process equation content with context support\"\"\"\n        try:\n            # Generate description and entity info\n            enhanced_caption, entity_info = await self.generate_description_only(\n                modal_content, content_type, item_info, entity_name\n            )\n\n            # Parse equation content for building complete chunk\n            if isinstance(modal_content, str):\n                try:\n                    content_data = json.loads(modal_content)\n                except json.JSONDecodeError:\n                    content_data = {\"equation\": modal_content}\n            else:\n                content_data = modal_content\n\n            equation_text = content_data.get(\"text\")\n            equation_format = content_data.get(\"text_format\", \"\")\n\n            # Build complete equation content\n            modal_chunk = PROMPTS[\"equation_chunk\"].format(\n                equation_text=equation_text,\n                equation_format=equation_format,\n                enhanced_caption=enhanced_caption,\n            )\n\n            return await self._create_entity_and_chunk(\n                modal_chunk,\n                entity_info,\n                file_path,\n                batch_mode,\n                doc_id,\n                chunk_order_index,\n            )\n\n        except Exception as e:\n            logger.error(f\"Error processing equation content: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"equation_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": \"equation\",\n                \"summary\": f\"Equation content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    
def _parse_equation_response(\n        self, response: str, entity_name: str = None\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Parse equation analysis response with robust JSON handling\"\"\"\n        try:\n            response_data = self._robust_json_parse(response)\n\n            description = response_data.get(\"detailed_description\", \"\")\n            entity_data = response_data.get(\"entity_info\", {})\n\n            if not description or not entity_data:\n                raise ValueError(\"Missing required fields in response\")\n\n            if not all(\n                key in entity_data for key in [\"entity_name\", \"entity_type\", \"summary\"]\n            ):\n                raise ValueError(\"Missing required fields in entity_info\")\n\n            entity_data[\"entity_name\"] = (\n                entity_data[\"entity_name\"] + f\" ({entity_data['entity_type']})\"\n            )\n            if entity_name:\n                entity_data[\"entity_name\"] = entity_name\n\n            return description, entity_data\n\n        except (json.JSONDecodeError, AttributeError, ValueError) as e:\n            logger.error(f\"Error parsing equation analysis response: {e}\")\n            logger.debug(f\"Raw response: {response}\")\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"equation_{compute_mdhash_id(response)}\",\n                \"entity_type\": \"equation\",\n                \"summary\": response[:100] + \"...\" if len(response) > 100 else response,\n            }\n            return response, fallback_entity\n\n\nclass GenericModalProcessor(BaseModalProcessor):\n    \"\"\"Generic processor for other types of modal content\"\"\"\n\n    async def generate_description_only(\n        self,\n        modal_content,\n        content_type: str,\n        item_info: Dict[str, Any] = None,\n        entity_name: str = None,\n    ) -> Tuple[str, Dict[str, Any]]:\n        
\"\"\"\n        Generate generic modal description and entity info only, without entity relation extraction.\n        Used for batch processing stage 1.\n\n        Args:\n            modal_content: Generic modal content to process\n            content_type: Type of modal content\n            item_info: Item information for context extraction\n            entity_name: Optional predefined entity name\n\n        Returns:\n            Tuple of (enhanced_caption, entity_info)\n        \"\"\"\n        try:\n            # Extract context for current item\n            context = \"\"\n            if item_info:\n                context = self._get_context_for_item(item_info)\n\n            # Build generic analysis prompt with context\n            if context:\n                generic_prompt = PROMPTS.get(\n                    \"generic_prompt_with_context\", PROMPTS[\"generic_prompt\"]\n                ).format(\n                    context=context,\n                    content_type=content_type,\n                    entity_name=entity_name\n                    if entity_name\n                    else f\"descriptive name for this {content_type}\",\n                    content=str(modal_content),\n                )\n            else:\n                generic_prompt = PROMPTS[\"generic_prompt\"].format(\n                    content_type=content_type,\n                    entity_name=entity_name\n                    if entity_name\n                    else f\"descriptive name for this {content_type}\",\n                    content=str(modal_content),\n                )\n\n            # Call LLM for generic analysis\n            response = await self.modal_caption_func(\n                generic_prompt,\n                system_prompt=PROMPTS[\"GENERIC_ANALYSIS_SYSTEM\"].format(\n                    content_type=content_type\n                ),\n            )\n\n            # Parse response (reuse existing logic)\n            enhanced_caption, entity_info = 
self._parse_generic_response(\n                response, entity_name, content_type\n            )\n\n            return enhanced_caption, entity_info\n\n        except Exception as e:\n            logger.error(f\"Error generating {content_type} description: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"{content_type}_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": content_type,\n                \"summary\": f\"{content_type} content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    async def process_multimodal_content(\n        self,\n        modal_content,\n        content_type: str,\n        file_path: str = \"manual_creation\",\n        entity_name: str = None,\n        item_info: Dict[str, Any] = None,\n        batch_mode: bool = False,\n        doc_id: str = None,\n        chunk_order_index: int = 0,\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Process generic modal content with context support\"\"\"\n        try:\n            # Generate description and entity info\n            enhanced_caption, entity_info = await self.generate_description_only(\n                modal_content, content_type, item_info, entity_name\n            )\n\n            # Build complete content\n            modal_chunk = PROMPTS[\"generic_chunk\"].format(\n                content_type=content_type.title(),\n                content=str(modal_content),\n                enhanced_caption=enhanced_caption,\n            )\n\n            return await self._create_entity_and_chunk(\n                modal_chunk,\n                entity_info,\n                file_path,\n                batch_mode,\n                doc_id,\n                chunk_order_index,\n            )\n\n        except Exception as e:\n            logger.error(f\"Error processing {content_type} 
content: {e}\")\n            # Fallback processing\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"{content_type}_{compute_mdhash_id(str(modal_content))}\",\n                \"entity_type\": content_type,\n                \"summary\": f\"{content_type} content: {str(modal_content)[:100]}\",\n            }\n            return str(modal_content), fallback_entity\n\n    def _parse_generic_response(\n        self, response: str, entity_name: str = None, content_type: str = \"content\"\n    ) -> Tuple[str, Dict[str, Any]]:\n        \"\"\"Parse generic analysis response\"\"\"\n        try:\n            response_data = self._robust_json_parse(response)\n\n            description = response_data.get(\"detailed_description\", \"\")\n            entity_data = response_data.get(\"entity_info\", {})\n\n            if not description or not entity_data:\n                raise ValueError(\"Missing required fields in response\")\n\n            if not all(\n                key in entity_data for key in [\"entity_name\", \"entity_type\", \"summary\"]\n            ):\n                raise ValueError(\"Missing required fields in entity_info\")\n\n            entity_data[\"entity_name\"] = (\n                entity_data[\"entity_name\"] + f\" ({entity_data['entity_type']})\"\n            )\n            if entity_name:\n                entity_data[\"entity_name\"] = entity_name\n\n            return description, entity_data\n\n        except (json.JSONDecodeError, AttributeError, ValueError) as e:\n            logger.error(f\"Error parsing {content_type} analysis response: {e}\")\n            logger.debug(f\"Raw response: {response}\")\n            fallback_entity = {\n                \"entity_name\": entity_name\n                if entity_name\n                else f\"{content_type}_{compute_mdhash_id(response)}\",\n                \"entity_type\": content_type,\n                \"summary\": 
response[:100] + \"...\" if len(response) > 100 else response,\n            }\n            return response, fallback_entity\n"
  },
  {
    "path": "raganything/parser.py",
    "content": "# type: ignore\n\"\"\"\nGeneric Document Parser Utility\n\nThis module provides functionality for parsing documents using the built-in\nMinerU, Docling, and PaddleOCR parsers, and exposes a small registry for\n**in-process** custom parsers (see :func:`register_parser`).\n\nImportant notes:\n\n- The custom parser registry is primarily intended for Python usage, where your\n  application imports a parser implementation and calls :func:`register_parser`\n  before invoking RAGAnything APIs.\n- The standalone CLI (``python -m raganything.parser`` or the installed console\n  script) does **not** perform automatic plugin discovery; it will only see\n  custom parsers that have already been registered in the current process\n  (for example via a wrapper script or :mod:`sitecustomize`).\n\nMinerU 2.0 no longer includes LibreOffice document conversion module.\nFor Office documents (.doc, .docx, .ppt, .pptx), please convert them to PDF\nformat first.\n\"\"\"\n\nfrom __future__ import annotations\n\n\nimport os\nimport hashlib\nimport json\nimport argparse\nimport base64\nimport subprocess\nimport tempfile\nimport logging\nfrom pathlib import Path\nfrom typing import (\n    Dict,\n    List,\n    Optional,\n    Union,\n    Tuple,\n    Any,\n    Iterator,\n    TypeVar,\n)\n\nT = TypeVar(\"T\")\n\n\nclass MineruExecutionError(Exception):\n    \"\"\"catch mineru error\"\"\"\n\n    def __init__(self, return_code, error_msg):\n        self.return_code = return_code\n        self.error_msg = error_msg\n        super().__init__(\n            f\"Mineru command failed with return code {return_code}: {error_msg}\"\n        )\n\n\nclass Parser:\n    \"\"\"\n    Base class for document parsing utilities.\n\n    Defines common functionality and constants for parsing different document types.\n    \"\"\"\n\n    # Define common file formats\n    OFFICE_FORMATS = {\".doc\", \".docx\", \".ppt\", \".pptx\", \".xls\", \".xlsx\"}\n    IMAGE_FORMATS = {\".png\", \".jpeg\", \".jpg\", 
\".bmp\", \".tiff\", \".tif\", \".gif\", \".webp\"}\n    TEXT_FORMATS = {\".txt\", \".md\"}\n\n    # Class-level logger\n    logger = logging.getLogger(__name__)\n\n    def __init__(self) -> None:\n        \"\"\"Initialize the base parser.\"\"\"\n        pass\n\n    @staticmethod\n    def _unique_output_dir(\n        base_dir: Union[str, Path], file_path: Union[str, Path]\n    ) -> Path:\n        \"\"\"Create a unique output subdirectory for a file to prevent same-name collisions.\n\n        When multiple files share the same name (e.g. dir1/paper.pdf and dir2/paper.pdf),\n        their parser output would collide in the same output directory. This creates a\n        unique subdirectory by appending a short hash of the file's absolute path. (Fixes #51)\n\n        Args:\n            base_dir: The base output directory\n            file_path: Path to the input file\n\n        Returns:\n            Path like base_dir/paper_a1b2c3d4/ unique per absolute file path.\n        \"\"\"\n        file_path = Path(file_path).resolve()\n        stem = file_path.stem\n        path_hash = hashlib.md5(str(file_path).encode()).hexdigest()[:8]\n        return Path(base_dir) / f\"{stem}_{path_hash}\"\n\n    @classmethod\n    def convert_office_to_pdf(\n        cls, doc_path: Union[str, Path], output_dir: Optional[str] = None\n    ) -> Path:\n        \"\"\"\n        Convert Office document (.doc, .docx, .ppt, .pptx, .xls, .xlsx) to PDF.\n        Requires LibreOffice to be installed.\n\n        Args:\n            doc_path: Path to the Office document file\n            output_dir: Output directory for the PDF file\n\n        Returns:\n            Path to the generated PDF file\n        \"\"\"\n        try:\n            # Convert to Path object for easier handling\n            doc_path = Path(doc_path)\n            if not doc_path.exists():\n                raise FileNotFoundError(f\"Office document does not exist: {doc_path}\")\n\n            name_without_suff = doc_path.stem\n\n         
   # Prepare output directory\n            if output_dir:\n                base_output_dir = Path(output_dir)\n            else:\n                base_output_dir = doc_path.parent / \"libreoffice_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Create temporary directory for PDF conversion\n            with tempfile.TemporaryDirectory() as temp_dir:\n                temp_path = Path(temp_dir)\n\n                # Convert to PDF using LibreOffice\n                cls.logger.info(\n                    f\"Converting {doc_path.name} to PDF using LibreOffice...\"\n                )\n\n                # Prepare subprocess parameters to hide console window on Windows\n                import platform\n\n                # Try LibreOffice commands in order of preference\n                commands_to_try = [\"libreoffice\", \"soffice\"]\n\n                conversion_successful = False\n                for cmd in commands_to_try:\n                    try:\n                        convert_cmd = [\n                            cmd,\n                            \"--headless\",\n                            \"--convert-to\",\n                            \"pdf\",\n                            \"--outdir\",\n                            str(temp_path),\n                            str(doc_path),\n                        ]\n\n                        # Prepare conversion subprocess parameters\n                        convert_subprocess_kwargs = {\n                            \"capture_output\": True,\n                            \"text\": True,\n                            \"timeout\": 60,  # 60 second timeout\n                            \"encoding\": \"utf-8\",\n                            \"errors\": \"ignore\",\n                        }\n\n                        # Hide console window on Windows\n                        if platform.system() == \"Windows\":\n                            convert_subprocess_kwargs[\"creationflags\"] = (\n         
                       subprocess.CREATE_NO_WINDOW\n                            )\n\n                        result = subprocess.run(\n                            convert_cmd, **convert_subprocess_kwargs\n                        )\n\n                        if result.returncode == 0:\n                            conversion_successful = True\n                            cls.logger.info(\n                                f\"Successfully converted {doc_path.name} to PDF using {cmd}\"\n                            )\n                            break\n                        else:\n                            cls.logger.warning(\n                                f\"LibreOffice command '{cmd}' failed: {result.stderr}\"\n                            )\n                    except FileNotFoundError:\n                        cls.logger.warning(f\"LibreOffice command '{cmd}' not found\")\n                    except subprocess.TimeoutExpired:\n                        cls.logger.warning(f\"LibreOffice command '{cmd}' timed out\")\n                    except Exception as e:\n                        cls.logger.error(\n                            f\"LibreOffice command '{cmd}' failed with exception: {e}\"\n                        )\n\n                if not conversion_successful:\n                    raise RuntimeError(\n                        f\"LibreOffice conversion failed for {doc_path.name}. 
\"\n                        f\"Please ensure LibreOffice is installed:\\n\"\n                        \"- Windows: Download from https://www.libreoffice.org/download/download/\\n\"\n                        \"- macOS: brew install --cask libreoffice\\n\"\n                        \"- Ubuntu/Debian: sudo apt-get install libreoffice\\n\"\n                        \"- CentOS/RHEL: sudo yum install libreoffice\\n\"\n                        \"Alternatively, convert the document to PDF manually.\"\n                    )\n\n                # Find the generated PDF\n                pdf_files = list(temp_path.glob(\"*.pdf\"))\n                if not pdf_files:\n                    raise RuntimeError(\n                        f\"PDF conversion failed for {doc_path.name} - no PDF file generated. \"\n                        f\"Please check LibreOffice installation or try manual conversion.\"\n                    )\n\n                pdf_path = pdf_files[0]\n                cls.logger.info(\n                    f\"Generated PDF: {pdf_path.name} ({pdf_path.stat().st_size} bytes)\"\n                )\n\n                # Validate the generated PDF\n                if pdf_path.stat().st_size < 100:  # Very small file, likely empty\n                    raise RuntimeError(\n                        \"Generated PDF appears to be empty or corrupted. 
\"\n                        \"Original file may have issues or LibreOffice conversion failed.\"\n                    )\n\n                # Copy PDF to final output directory\n                final_pdf_path = base_output_dir / f\"{name_without_suff}.pdf\"\n                import shutil\n\n                shutil.copy2(pdf_path, final_pdf_path)\n\n                return final_pdf_path\n\n        except Exception as e:\n            cls.logger.error(f\"Error in convert_office_to_pdf: {str(e)}\")\n            raise\n\n    @classmethod\n    def convert_text_to_pdf(\n        cls, text_path: Union[str, Path], output_dir: Optional[str] = None\n    ) -> Path:\n        \"\"\"\n        Convert text file (.txt, .md) to PDF using ReportLab with full markdown support.\n\n        Args:\n            text_path: Path to the text file\n            output_dir: Output directory for the PDF file\n\n        Returns:\n            Path to the generated PDF file\n        \"\"\"\n        try:\n            text_path = Path(text_path)\n            if not text_path.exists():\n                raise FileNotFoundError(f\"Text file does not exist: {text_path}\")\n\n            # Supported text formats\n            supported_text_formats = {\".txt\", \".md\"}\n            if text_path.suffix.lower() not in supported_text_formats:\n                raise ValueError(f\"Unsupported text format: {text_path.suffix}\")\n\n            # Read the text content\n            try:\n                with open(text_path, \"r\", encoding=\"utf-8\") as f:\n                    text_content = f.read()\n            except UnicodeDecodeError:\n                # Try with different encodings\n                for encoding in [\"gbk\", \"latin-1\", \"cp1252\"]:\n                    try:\n                        with open(text_path, \"r\", encoding=encoding) as f:\n                            text_content = f.read()\n                        cls.logger.info(\n                            f\"Successfully read file with {encoding} 
encoding\"\n                        )\n                        break\n                    except UnicodeDecodeError:\n                        continue\n                else:\n                    raise RuntimeError(\n                        f\"Could not decode text file {text_path.name} with any supported encoding\"\n                    )\n\n            # Prepare output directory\n            if output_dir:\n                base_output_dir = Path(output_dir)\n            else:\n                base_output_dir = text_path.parent / \"reportlab_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n            pdf_path = base_output_dir / f\"{text_path.stem}.pdf\"\n\n            # Convert text to PDF\n            cls.logger.info(f\"Converting {text_path.name} to PDF...\")\n\n            try:\n                from reportlab.lib.pagesizes import A4\n                from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer\n                from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle\n                from reportlab.lib.units import inch\n                from reportlab.pdfbase import pdfmetrics\n                from reportlab.pdfbase.ttfonts import TTFont\n\n                support_chinese = True\n                try:\n                    if \"WenQuanYi\" not in pdfmetrics.getRegisteredFontNames():\n                        if not Path(\n                            \"/usr/share/fonts/wqy-microhei/wqy-microhei.ttc\"\n                        ).exists():\n                            support_chinese = False\n                            cls.logger.warning(\n                                \"WenQuanYi font not found at /usr/share/fonts/wqy-microhei/wqy-microhei.ttc. 
Chinese characters may not render correctly.\"\n                            )\n                        else:\n                            pdfmetrics.registerFont(\n                                TTFont(\n                                    \"WenQuanYi\",\n                                    \"/usr/share/fonts/wqy-microhei/wqy-microhei.ttc\",\n                                )\n                            )\n                except Exception as e:\n                    support_chinese = False\n                    cls.logger.warning(\n                        f\"Failed to register WenQuanYi font: {e}. Chinese characters may not render correctly.\"\n                    )\n\n                # Create PDF document\n                doc = SimpleDocTemplate(\n                    str(pdf_path),\n                    pagesize=A4,\n                    leftMargin=inch,\n                    rightMargin=inch,\n                    topMargin=inch,\n                    bottomMargin=inch,\n                )\n\n                # Get styles\n                styles = getSampleStyleSheet()\n                normal_style = styles[\"Normal\"]\n                heading_style = styles[\"Heading1\"]\n                if support_chinese:\n                    normal_style.fontName = \"WenQuanYi\"\n                    heading_style.fontName = \"WenQuanYi\"\n\n                # Try to register a font that supports Chinese characters\n                try:\n                    # Try to use system fonts that support Chinese\n                    import platform\n\n                    system = platform.system()\n                    if system == \"Windows\":\n                        # Try common Windows fonts\n                        for font_name in [\"SimSun\", \"SimHei\", \"Microsoft YaHei\"]:\n                            try:\n                                from reportlab.pdfbase.cidfonts import (\n                                    UnicodeCIDFont,\n                                )\n\n                 
               pdfmetrics.registerFont(UnicodeCIDFont(font_name))\n                                normal_style.fontName = font_name\n                                heading_style.fontName = font_name\n                                break\n                            except Exception:\n                                continue\n                    elif system == \"Darwin\":  # macOS\n                        for font_name in [\"STSong-Light\", \"STHeiti\"]:\n                            try:\n                                from reportlab.pdfbase.cidfonts import (\n                                    UnicodeCIDFont,\n                                )\n\n                                pdfmetrics.registerFont(UnicodeCIDFont(font_name))\n                                normal_style.fontName = font_name\n                                heading_style.fontName = font_name\n                                break\n                            except Exception:\n                                continue\n                except Exception:\n                    pass  # Use default fonts if Chinese font setup fails\n\n                # Build content\n                story = []\n\n                # Handle markdown or plain text\n                if text_path.suffix.lower() == \".md\":\n                    # Handle markdown content - simplified implementation\n                    lines = text_content.split(\"\\n\")\n                    for line in lines:\n                        line = line.strip()\n                        if not line:\n                            story.append(Spacer(1, 12))\n                            continue\n\n                        # Headers\n                        if line.startswith(\"#\"):\n                            level = len(line) - len(line.lstrip(\"#\"))\n                            header_text = line.lstrip(\"#\").strip()\n                            if header_text:\n                                header_style = ParagraphStyle(\n                     
               name=f\"Heading{level}\",\n                                    parent=heading_style,\n                                    fontSize=max(16 - level, 10),\n                                    spaceAfter=8,\n                                    spaceBefore=16 if level <= 2 else 12,\n                                )\n                                story.append(Paragraph(header_text, header_style))\n                        else:\n                            # Regular text\n                            story.append(Paragraph(line, normal_style))\n                            story.append(Spacer(1, 6))\n                else:\n                    # Handle plain text files (.txt)\n                    cls.logger.info(\n                        f\"Processing plain text file with {len(text_content)} characters...\"\n                    )\n\n                    # Split text into lines and process each line\n                    lines = text_content.split(\"\\n\")\n                    line_count = 0\n\n                    for line in lines:\n                        line = line.rstrip()\n                        line_count += 1\n\n                        # Empty lines\n                        if not line.strip():\n                            story.append(Spacer(1, 6))\n                            continue\n\n                        # Regular text lines\n                        # Escape special characters for ReportLab\n                        safe_line = (\n                            line.replace(\"&\", \"&amp;\")\n                            .replace(\"<\", \"&lt;\")\n                            .replace(\">\", \"&gt;\")\n                        )\n\n                        # Create paragraph\n                        story.append(Paragraph(safe_line, normal_style))\n                        story.append(Spacer(1, 3))\n\n                    cls.logger.info(f\"Added {line_count} lines to PDF\")\n\n                    # If no content was added, add a placeholder\n         
           if not story:\n                        story.append(Paragraph(\"(Empty text file)\", normal_style))\n\n                # Build PDF\n                doc.build(story)\n                cls.logger.info(\n                    f\"Successfully converted {text_path.name} to PDF ({pdf_path.stat().st_size / 1024:.1f} KB)\"\n                )\n\n            except ImportError:\n                raise RuntimeError(\n                    \"reportlab is required for text-to-PDF conversion. \"\n                    \"Please install it using: pip install reportlab\"\n                )\n            except Exception as e:\n                raise RuntimeError(\n                    f\"Failed to convert text file {text_path.name} to PDF: {str(e)}\"\n                )\n\n            # Validate the generated PDF\n            if not pdf_path.exists() or pdf_path.stat().st_size < 100:\n                raise RuntimeError(\n                    f\"PDF conversion failed for {text_path.name} - generated PDF is empty or corrupted.\"\n                )\n\n            return pdf_path\n\n        except Exception as e:\n            cls.logger.error(f\"Error in convert_text_to_pdf: {str(e)}\")\n            raise\n\n    @classmethod\n    def _process_inline_markdown(cls, text: str) -> str:\n        \"\"\"\n        Process inline markdown formatting (bold, italic, code, links)\n\n        Args:\n            text: Raw text with markdown formatting\n\n        Returns:\n            Text with ReportLab markup\n        \"\"\"\n        import re\n\n        # Escape special characters for ReportLab\n        text = text.replace(\"&\", \"&amp;\").replace(\"<\", \"&lt;\").replace(\">\", \"&gt;\")\n\n        # Bold text: **text** or __text__\n        text = re.sub(r\"\\*\\*(.*?)\\*\\*\", r\"<b>\\1</b>\", text)\n        text = re.sub(r\"__(.*?)__\", r\"<b>\\1</b>\", text)\n\n        # Italic text: *text* or _text_ (but not in the middle of words)\n        text = re.sub(r\"(?<!\\w)\\*([^*\\n]+?)\\*(?!\\w)\", 
r\"<i>\\1</i>\", text)\n        text = re.sub(r\"(?<!\\w)_([^_\\n]+?)_(?!\\w)\", r\"<i>\\1</i>\", text)\n\n        # Inline code: `code`\n        text = re.sub(\n            r\"`([^`]+?)`\",\n            r'<font name=\"Courier\" size=\"9\" color=\"darkred\">\\1</font>',\n            text,\n        )\n\n        # Links: [text](url) - convert to text with URL annotation\n        def link_replacer(match):\n            link_text = match.group(1)\n            url = match.group(2)\n            return f'<link href=\"{url}\" color=\"blue\"><u>{link_text}</u></link>'\n\n        text = re.sub(r\"\\[([^\\]]+?)\\]\\(([^)]+?)\\)\", link_replacer, text)\n\n        # Strikethrough: ~~text~~\n        text = re.sub(r\"~~(.*?)~~\", r\"<strike>\\1</strike>\", text)\n\n        return text\n\n    def parse_pdf(\n        self,\n        pdf_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        method: str = \"auto\",\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Abstract method to parse PDF document.\n        Must be implemented by subclasses.\n\n        Args:\n            pdf_path: Path to the PDF file\n            output_dir: Output directory path\n            method: Parsing method (auto, txt, ocr)\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for parser-specific command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        raise NotImplementedError(\"parse_pdf must be implemented by subclasses\")\n\n    def parse_image(\n        self,\n        image_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Abstract method to parse image document.\n        Must be implemented by subclasses.\n\n        Note: Different parsers may support different image formats.\n        
Check the specific parser's documentation for supported formats.\n\n        Args:\n            image_path: Path to the image file\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for parser-specific command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        raise NotImplementedError(\"parse_image must be implemented by subclasses\")\n\n    def parse_document(\n        self,\n        file_path: Union[str, Path],\n        method: str = \"auto\",\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Abstract method to parse a document.\n        Must be implemented by subclasses.\n\n        Args:\n            file_path: Path to the file to be parsed\n            method: Parsing method (auto, txt, ocr)\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for parser-specific command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        raise NotImplementedError(\"parse_document must be implemented by subclasses\")\n\n    def check_installation(self) -> bool:\n        \"\"\"\n        Abstract method to check if the parser is properly installed.\n        Must be implemented by subclasses.\n\n        Returns:\n            bool: True if installation is valid, False otherwise\n        \"\"\"\n        raise NotImplementedError(\n            \"check_installation must be implemented by subclasses\"\n        )\n\n\nclass MineruParser(Parser):\n    \"\"\"\n    MinerU 2.0 document parsing utility class\n\n    Supports parsing PDF and image documents, converting the content into structured data\n    and generating markdown and JSON output.\n\n    Note: Office documents are no longer 
directly supported. Please convert them to PDF first.\n    \"\"\"\n\n    __slots__ = ()\n\n    # Class-level logger\n    logger = logging.getLogger(__name__)\n\n    def __init__(self) -> None:\n        \"\"\"Initialize MineruParser\"\"\"\n        super().__init__()\n\n    @classmethod\n    def _run_mineru_command(\n        cls,\n        input_path: Union[str, Path],\n        output_dir: Union[str, Path],\n        method: str = \"auto\",\n        lang: Optional[str] = None,\n        backend: Optional[str] = None,\n        start_page: Optional[int] = None,\n        end_page: Optional[int] = None,\n        formula: bool = True,\n        table: bool = True,\n        device: Optional[str] = None,\n        source: Optional[str] = None,\n        vlm_url: Optional[str] = None,\n        **kwargs,\n    ) -> None:\n        \"\"\"\n        Run mineru command line tool\n\n        Args:\n            input_path: Path to input file or directory\n            output_dir: Output directory path\n            method: Parsing method (auto, txt, ocr)\n            lang: Document language for OCR optimization\n            backend: Parsing backend\n            start_page: Starting page number (0-based)\n            end_page: Ending page number (0-based)\n            formula: Enable formula parsing\n            table: Enable table parsing\n            device: Inference device\n            source: Model source\n            vlm_url: When the backend is `vlm-http-client`, you need to specify the server_url\n            **kwargs: Additional parameters for subprocess (e.g., env)\n        \"\"\"\n        cmd = [\n            \"mineru\",\n            \"-p\",\n            str(input_path),\n            \"-o\",\n            str(output_dir),\n            \"-m\",\n            method,\n        ]\n\n        if backend:\n            cmd.extend([\"-b\", backend])\n        if source:\n            cmd.extend([\"--source\", source])\n        if lang:\n            cmd.extend([\"-l\", lang])\n        if 
start_page is not None:\n            cmd.extend([\"-s\", str(start_page)])\n        if end_page is not None:\n            cmd.extend([\"-e\", str(end_page)])\n        if not formula:\n            cmd.extend([\"-f\", \"false\"])\n        if not table:\n            cmd.extend([\"-t\", \"false\"])\n        if device:\n            cmd.extend([\"-d\", device])\n        if vlm_url:\n            cmd.extend([\"-u\", vlm_url])\n\n        output_lines = []\n        error_lines = []\n\n        # Handle and validate environment variables\n        custom_env = kwargs.pop(\"env\", None)\n\n        # Validate env if provided\n        if custom_env is not None:\n            if not isinstance(custom_env, dict):\n                raise TypeError(\n                    f\"env must be a dictionary, got {type(custom_env).__name__}\"\n                )\n            for k, v in custom_env.items():\n                if not isinstance(k, str) or not isinstance(v, str):\n                    raise TypeError(\"env keys and values must be strings\")\n\n        # Check for unsupported arguments to fail fast\n        if kwargs:\n            unsupported = \", \".join(kwargs.keys())\n            raise TypeError(\n                f\"MineruParser._run_mineru_command received unexpected keyword argument(s): {unsupported}\"\n            )\n\n        try:\n            # Prepare subprocess parameters to hide console window on Windows\n            import platform\n            import threading\n            from queue import Queue, Empty\n\n            # Log the command being executed\n            cls.logger.info(f\"Executing mineru command: {' '.join(cmd)}\")\n\n            env = None\n            if custom_env:\n                env = os.environ.copy()\n                env.update(custom_env)\n\n            subprocess_kwargs = {\n                \"stdout\": subprocess.PIPE,\n                \"stderr\": subprocess.PIPE,\n                \"text\": True,\n                \"encoding\": \"utf-8\",\n                
\"errors\": \"ignore\",\n                \"bufsize\": 1,  # Line buffered\n                \"env\": env,\n            }\n\n            # Hide console window on Windows\n            if platform.system() == \"Windows\":\n                subprocess_kwargs[\"creationflags\"] = subprocess.CREATE_NO_WINDOW\n\n            # Function to read output from subprocess and add to queue\n            def enqueue_output(pipe, queue, prefix):\n                try:\n                    for line in iter(pipe.readline, \"\"):\n                        if line.strip():  # Only add non-empty lines\n                            queue.put((prefix, line.strip()))\n                    pipe.close()\n                except Exception as e:\n                    queue.put((prefix, f\"Error reading {prefix}: {e}\"))\n\n            # Start subprocess\n            process = subprocess.Popen(cmd, **subprocess_kwargs)\n\n            # Create queues for stdout and stderr\n            stdout_queue = Queue()\n            stderr_queue = Queue()\n\n            # Start threads to read output\n            stdout_thread = threading.Thread(\n                target=enqueue_output, args=(process.stdout, stdout_queue, \"STDOUT\")\n            )\n            stderr_thread = threading.Thread(\n                target=enqueue_output, args=(process.stderr, stderr_queue, \"STDERR\")\n            )\n\n            stdout_thread.daemon = True\n            stderr_thread.daemon = True\n            stdout_thread.start()\n            stderr_thread.start()\n\n            # Process output in real time\n            while process.poll() is None:\n                # Check stdout queue\n                try:\n                    while True:\n                        prefix, line = stdout_queue.get_nowait()\n                        output_lines.append(line)\n                        # Log mineru output with INFO level, prefixed with [MinerU]\n                        cls.logger.info(f\"[MinerU] {line}\")\n                except Empty:\n   
                 pass\n\n                # Check stderr queue\n                try:\n                    while True:\n                        prefix, line = stderr_queue.get_nowait()\n                        # Log mineru errors with WARNING level\n                        if \"warning\" in line.lower():\n                            cls.logger.warning(f\"[MinerU] {line}\")\n                        elif \"error\" in line.lower():\n                            cls.logger.error(f\"[MinerU] {line}\")\n                            error_message = line.split(\"\\n\")[0]\n                            error_lines.append(error_message)\n                        else:\n                            cls.logger.info(f\"[MinerU] {line}\")\n                except Empty:\n                    pass\n\n                # Small delay to prevent busy waiting\n                import time\n\n                time.sleep(0.1)\n\n            # Process any remaining output after process completion\n            try:\n                while True:\n                    prefix, line = stdout_queue.get_nowait()\n                    output_lines.append(line)\n                    cls.logger.info(f\"[MinerU] {line}\")\n            except Empty:\n                pass\n\n            try:\n                while True:\n                    prefix, line = stderr_queue.get_nowait()\n                    if \"warning\" in line.lower():\n                        cls.logger.warning(f\"[MinerU] {line}\")\n                    elif \"error\" in line.lower():\n                        cls.logger.error(f\"[MinerU] {line}\")\n                        error_message = line.split(\"\\n\")[0]\n                        error_lines.append(error_message)\n                    else:\n                        cls.logger.info(f\"[MinerU] {line}\")\n            except Empty:\n                pass\n\n            # Wait for process to complete and get return code\n            return_code = process.wait()\n\n            # Wait for threads to 
finish\n            stdout_thread.join(timeout=5)\n            stderr_thread.join(timeout=5)\n\n            if return_code != 0 or error_lines:\n                cls.logger.info(\"[MinerU] Command executed failed\")\n                raise MineruExecutionError(return_code, error_lines)\n            else:\n                cls.logger.info(\"[MinerU] Command executed successfully\")\n\n        except MineruExecutionError:\n            raise\n        except subprocess.CalledProcessError as e:\n            cls.logger.error(f\"Error running mineru subprocess command: {e}\")\n            cls.logger.error(f\"Command: {' '.join(cmd)}\")\n            cls.logger.error(f\"Return code: {e.returncode}\")\n            raise\n        except FileNotFoundError:\n            raise RuntimeError(\n                \"mineru command not found. Please ensure MinerU 2.0 is properly installed:\\n\"\n                \"pip install -U 'mineru[core]' or uv pip install -U 'mineru[core]'\"\n            )\n        except Exception as e:\n            error_message = f\"Unexpected error running mineru command: {e}\"\n            cls.logger.error(error_message)\n            raise RuntimeError(error_message) from e\n\n    @classmethod\n    def _read_output_files(\n        cls, output_dir: Path, file_stem: str, method: str = \"auto\"\n    ) -> Tuple[List[Dict[str, Any]], str]:\n        \"\"\"\n        Read the output files generated by mineru\n\n        Args:\n            output_dir: Output directory\n            file_stem: File name without extension\n            method: Parsing method (used as fallback if subdirectory scan fails)\n\n        Returns:\n            Tuple containing (content list JSON, Markdown text)\n        \"\"\"\n        # Look for the generated files\n        md_file = output_dir / f\"{file_stem}.md\"\n        json_file = output_dir / f\"{file_stem}_content_list.json\"\n        images_base_dir = output_dir  # Base directory for images\n\n        file_stem_subdir = output_dir / 
file_stem\n        if file_stem_subdir.is_dir():\n            # Scan for actual output subdirectory instead of assuming method name\n            found = False\n            for subdir in file_stem_subdir.iterdir():\n                if not subdir.is_dir():\n                    continue\n                # Check if this subdirectory contains the expected JSON output file\n                candidate_json = subdir / f\"{file_stem}_content_list.json\"\n                if candidate_json.exists():\n                    # Found the actual output directory\n                    md_file = subdir / f\"{file_stem}.md\"\n                    json_file = candidate_json\n                    images_base_dir = subdir\n                    found = True\n                    cls.logger.info(\n                        f\"Found MinerU output in subdirectory: {subdir.name}\"\n                    )\n                    break\n\n            # Fallback to method-based path if scanning didn't find output\n            if not found:\n                cls.logger.debug(\n                    f\"No output found by scanning, falling back to method-based path: {method}\"\n                )\n                md_file = file_stem_subdir / method / f\"{file_stem}.md\"\n                json_file = file_stem_subdir / method / f\"{file_stem}_content_list.json\"\n                images_base_dir = file_stem_subdir / method\n\n        # Read markdown content\n        md_content = \"\"\n        if md_file.exists():\n            try:\n                with open(md_file, \"r\", encoding=\"utf-8\") as f:\n                    md_content = f.read()\n            except Exception as e:\n                cls.logger.warning(f\"Could not read markdown file {md_file}: {e}\")\n\n        # Read JSON content list\n        content_list = []\n        if json_file.exists():\n            try:\n                with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                    content_list = json.load(f)\n\n                # 
Normalize MinerU 2.0 field names to expected names for backward compatibility.\n                # MinerU 2.0 renamed: img_caption -> image_caption, img_footnote -> image_footnote\n                # The codebase primarily uses image_caption/image_footnote with img_caption/img_footnote\n                # as fallback, but we ensure both fields exist so downstream code works regardless.\n                _FIELD_ALIASES = {\n                    # MinerU 1.x name -> MinerU 2.0 name (canonical)\n                    \"img_caption\": \"image_caption\",\n                    \"img_footnote\": \"image_footnote\",\n                }\n                for item in content_list:\n                    if isinstance(item, dict):\n                        for old_name, new_name in _FIELD_ALIASES.items():\n                            # If only the old field exists, copy it to the new field name\n                            if old_name in item and new_name not in item:\n                                item[new_name] = item[old_name]\n                            # If only the new field exists, copy it to the old field name (for any legacy code)\n                            elif new_name in item and old_name not in item:\n                                item[old_name] = item[new_name]\n\n                # Always fix relative paths in content_list to absolute paths\n                cls.logger.info(\n                    f\"Fixing image paths in {json_file} with base directory: {images_base_dir}\"\n                )\n                for item in content_list:\n                    if isinstance(item, dict):\n                        for field_name in [\n                            \"img_path\",\n                            \"table_img_path\",\n                            \"equation_img_path\",\n                        ]:\n                            if field_name in item and item[field_name]:\n                                img_path = item[field_name]\n                                
absolute_img_path = (\n                                    images_base_dir / img_path\n                                ).resolve()\n\n                                # Security check: ensure the image path is within the base directory\n                                resolved_base = images_base_dir.resolve()\n                                if not absolute_img_path.is_relative_to(resolved_base):\n                                    cls.logger.warning(\n                                        f\"Potential path traversal detected in {field_name}: {img_path}. Skipping.\"\n                                    )\n                                    item[field_name] = \"\"  # Clear unsafe path\n                                    continue\n\n                                item[field_name] = str(absolute_img_path)\n\n            except Exception as e:\n                cls.logger.warning(f\"Could not read JSON file {json_file}: {e}\")\n\n        return content_list, md_content\n\n    def parse_pdf(\n        self,\n        pdf_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        method: str = \"auto\",\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse PDF document using MinerU 2.0\n\n        Args:\n            pdf_path: Path to the PDF file\n            output_dir: Output directory path\n            method: Parsing method (auto, txt, ocr)\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for mineru command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert to Path object for easier handling\n            pdf_path = Path(pdf_path)\n            if not pdf_path.exists():\n                raise FileNotFoundError(f\"PDF file does not exist: {pdf_path}\")\n\n            name_without_suff = pdf_path.stem\n\n            # Prepare output directory — use unique 
subdirectory to prevent\n            # same-name file collisions when output_dir is shared (#51)\n            if output_dir:\n                base_output_dir = self._unique_output_dir(output_dir, pdf_path)\n            else:\n                base_output_dir = pdf_path.parent / \"mineru_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Run mineru command\n            self._run_mineru_command(\n                input_path=pdf_path,\n                output_dir=base_output_dir,\n                method=method,\n                lang=lang,\n                **kwargs,\n            )\n\n            # Read the generated output files\n            # Map backend to expected output directory name for better compatibility\n            # MinerU 2.7.0+ uses different directory names based on backend:\n            # - pipeline -> auto/\n            # - vlm-* -> vlm/\n            # - hybrid-* -> hybrid_auto/\n            # Note: _read_output_files() will scan subdirectories automatically,\n            # so this mapping is just for optimization and fallback\n            # Use `or \"\"` to handle both missing keys and explicit None values\n            backend = kwargs.get(\"backend\") or \"\"\n            if backend.startswith(\"vlm-\"):\n                method = \"vlm\"\n            elif backend.startswith(\"hybrid-\"):\n                method = \"hybrid_auto\"\n\n            content_list, _ = self._read_output_files(\n                base_output_dir, name_without_suff, method=method\n            )\n            return content_list\n\n        except MineruExecutionError:\n            raise\n        except Exception as e:\n            self.logger.error(f\"Error in parse_pdf: {str(e)}\")\n            raise\n\n    def parse_image(\n        self,\n        image_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse image 
document using MinerU 2.0\n\n        Note: MinerU 2.0 natively supports .png, .jpeg, .jpg formats.\n        Other formats (.bmp, .tiff, .tif, etc.) will be automatically converted to .png.\n\n        Args:\n            image_path: Path to the image file\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for mineru command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert to Path object for easier handling\n            image_path = Path(image_path)\n            if not image_path.exists():\n                raise FileNotFoundError(f\"Image file does not exist: {image_path}\")\n\n            # Supported image formats by MinerU 2.0\n            mineru_supported_formats = {\".png\", \".jpeg\", \".jpg\"}\n\n            # All supported image formats (including those we can convert)\n            all_supported_formats = {\n                \".png\",\n                \".jpeg\",\n                \".jpg\",\n                \".bmp\",\n                \".tiff\",\n                \".tif\",\n                \".gif\",\n                \".webp\",\n            }\n\n            ext = image_path.suffix.lower()\n            if ext not in all_supported_formats:\n                raise ValueError(\n                    f\"Unsupported image format: {ext}. 
Supported formats: {', '.join(all_supported_formats)}\"\n                )\n\n            # Determine the actual image file to process\n            actual_image_path = image_path\n            temp_converted_file = None\n\n            # If format is not natively supported by MinerU, convert it\n            if ext not in mineru_supported_formats:\n                self.logger.info(\n                    f\"Converting {ext} image to PNG for MinerU compatibility...\"\n                )\n\n                try:\n                    from PIL import Image\n                except ImportError:\n                    raise RuntimeError(\n                        \"PIL/Pillow is required for image format conversion. \"\n                        \"Please install it using: pip install Pillow\"\n                    )\n\n                # Create temporary directory for conversion\n                temp_dir = Path(tempfile.mkdtemp())\n                temp_converted_file = temp_dir / f\"{image_path.stem}_converted.png\"\n\n                try:\n                    # Open and convert image\n                    with Image.open(image_path) as img:\n                        # Handle different image modes\n                        if img.mode in (\"RGBA\", \"LA\", \"P\"):\n                            # For images with transparency or palette, convert to RGB first\n                            if img.mode == \"P\":\n                                img = img.convert(\"RGBA\")\n\n                            # Create white background for transparent images\n                            background = Image.new(\"RGB\", img.size, (255, 255, 255))\n                            if img.mode == \"RGBA\":\n                                background.paste(\n                                    img, mask=img.split()[-1]\n                                )  # Use alpha channel as mask\n                            else:\n                                background.paste(img)\n                            img = 
background\n                        elif img.mode not in (\"RGB\", \"L\"):\n                            # Convert other modes to RGB\n                            img = img.convert(\"RGB\")\n\n                        # Save as PNG\n                        img.save(temp_converted_file, \"PNG\", optimize=True)\n                        self.logger.info(\n                            f\"Successfully converted {image_path.name} to PNG ({temp_converted_file.stat().st_size / 1024:.1f} KB)\"\n                        )\n\n                        actual_image_path = temp_converted_file\n\n                except Exception as e:\n                    if temp_converted_file and temp_converted_file.exists():\n                        temp_converted_file.unlink()\n                    raise RuntimeError(\n                        f\"Failed to convert image {image_path.name}: {str(e)}\"\n                    )\n\n            name_without_suff = image_path.stem\n\n            # Prepare output directory — use unique subdirectory to prevent\n            # same-name file collisions when output_dir is shared (#51)\n            if output_dir:\n                base_output_dir = self._unique_output_dir(output_dir, image_path)\n            else:\n                base_output_dir = image_path.parent / \"mineru_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            try:\n                # Run mineru command (images are processed with OCR method)\n                self._run_mineru_command(\n                    input_path=actual_image_path,\n                    output_dir=base_output_dir,\n                    method=\"ocr\",  # Images require OCR method\n                    lang=lang,\n                    **kwargs,\n                )\n\n                # Read the generated output files\n                content_list, _ = self._read_output_files(\n                    base_output_dir, name_without_suff, method=\"ocr\"\n                )\n                return 
content_list\n\n            except MineruExecutionError:\n                raise\n\n            finally:\n                # Clean up temporary converted file if it was created\n                if temp_converted_file and temp_converted_file.exists():\n                    try:\n                        temp_converted_file.unlink()\n                        temp_converted_file.parent.rmdir()  # Remove temp directory if empty\n                    except Exception:\n                        pass  # Ignore cleanup errors\n\n        except Exception as e:\n            self.logger.error(f\"Error in parse_image: {str(e)}\")\n            raise\n\n    def parse_office_doc(\n        self,\n        doc_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse office document by first converting to PDF, then parsing with MinerU 2.0\n\n        Note: This method requires LibreOffice to be installed separately for PDF conversion.\n        MinerU 2.0 no longer includes built-in Office document conversion.\n\n        Supported formats: .doc, .docx, .ppt, .pptx, .xls, .xlsx\n\n        Args:\n            doc_path: Path to the document file (.doc, .docx, .ppt, .pptx, .xls, .xlsx)\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for mineru command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert Office document to PDF using base class method\n            pdf_path = self.convert_office_to_pdf(doc_path, output_dir)\n\n            # Parse the converted PDF\n            return self.parse_pdf(\n                pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs\n            )\n\n        except Exception as e:\n            self.logger.error(f\"Error in parse_office_doc: 
{str(e)}\")\n            raise\n\n    def parse_text_file(\n        self,\n        text_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse text file by first converting to PDF, then parsing with MinerU 2.0\n\n        Supported formats: .txt, .md\n\n        Args:\n            text_path: Path to the text file (.txt, .md)\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for mineru command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert text file to PDF using base class method\n            pdf_path = self.convert_text_to_pdf(text_path, output_dir)\n\n            # Parse the converted PDF\n            return self.parse_pdf(\n                pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs\n            )\n\n        except Exception as e:\n            self.logger.error(f\"Error in parse_text_file: {str(e)}\")\n            raise\n\n    def parse_document(\n        self,\n        file_path: Union[str, Path],\n        method: str = \"auto\",\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse document using MinerU 2.0 based on file extension\n\n        Args:\n            file_path: Path to the file to be parsed\n            method: Parsing method (auto, txt, ocr)\n            output_dir: Output directory path\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for mineru command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        # Convert to Path object\n        file_path = Path(file_path)\n        if not file_path.exists():\n            
raise FileNotFoundError(f\"File does not exist: {file_path}\")\n\n        # Get file extension\n        ext = file_path.suffix.lower()\n\n        # Choose appropriate parser based on file type\n        if ext == \".pdf\":\n            return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)\n        elif ext in self.IMAGE_FORMATS:\n            return self.parse_image(file_path, output_dir, lang, **kwargs)\n        elif ext in self.OFFICE_FORMATS:\n            self.logger.warning(\n                f\"Warning: Office document detected ({ext}). \"\n                f\"MinerU 2.0 requires conversion to PDF first.\"\n            )\n            return self.parse_office_doc(file_path, output_dir, lang, **kwargs)\n        elif ext in self.TEXT_FORMATS:\n            return self.parse_text_file(file_path, output_dir, lang, **kwargs)\n        else:\n            # For unsupported file types, try as PDF\n            self.logger.warning(\n                f\"Warning: Unsupported file extension '{ext}', \"\n                f\"attempting to parse as PDF\"\n            )\n            return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)\n\n    def check_installation(self) -> bool:\n        \"\"\"\n        Check if MinerU 2.0 is properly installed\n\n        Returns:\n            bool: True if installation is valid, False otherwise\n        \"\"\"\n        try:\n            # Prepare subprocess parameters to hide console window on Windows\n            import platform\n\n            subprocess_kwargs = {\n                \"capture_output\": True,\n                \"text\": True,\n                \"check\": True,\n                \"encoding\": \"utf-8\",\n                \"errors\": \"ignore\",\n            }\n\n            # Hide console window on Windows\n            if platform.system() == \"Windows\":\n                subprocess_kwargs[\"creationflags\"] = subprocess.CREATE_NO_WINDOW\n\n            result = subprocess.run([\"mineru\", \"--version\"], 
**subprocess_kwargs)\n            self.logger.debug(f\"MinerU version: {result.stdout.strip()}\")\n            return True\n        except (subprocess.CalledProcessError, FileNotFoundError):\n            self.logger.debug(\n                \"MinerU 2.0 is not properly installed. \"\n                \"Please install it using: pip install -U 'mineru[core]'\"\n            )\n            return False\n\n\nclass DoclingParser(Parser):\n    \"\"\"\n    Docling document parsing utility class.\n\n    Specialized in parsing Office documents and HTML files, converting the content\n    into structured data and generating markdown and JSON output.\n    \"\"\"\n\n    # Define Docling-specific formats\n    HTML_FORMATS = {\".html\", \".htm\", \".xhtml\"}\n\n    def __init__(self) -> None:\n        \"\"\"Initialize DoclingParser\"\"\"\n        super().__init__()\n\n    def parse_pdf(\n        self,\n        pdf_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        method: str = \"auto\",\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse PDF document using Docling\n\n        Args:\n            pdf_path: Path to the PDF file\n            output_dir: Output directory path\n            method: Parsing method (auto, txt, ocr)\n            lang: Document language for OCR optimization\n            **kwargs: Additional parameters for docling command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert to Path object for easier handling\n            pdf_path = Path(pdf_path)\n            if not pdf_path.exists():\n                raise FileNotFoundError(f\"PDF file does not exist: {pdf_path}\")\n\n            name_without_suff = pdf_path.stem\n\n            # Prepare output directory — use unique subdirectory to prevent\n            # same-name file collisions when output_dir is shared (#51)\n            if output_dir:\n   
             base_output_dir = self._unique_output_dir(output_dir, pdf_path)\n            else:\n                base_output_dir = pdf_path.parent / \"docling_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Run docling command\n            self._run_docling_command(\n                input_path=pdf_path,\n                output_dir=base_output_dir,\n                file_stem=name_without_suff,\n                **kwargs,\n            )\n\n            # Read the generated output files\n            content_list, _ = self._read_output_files(\n                base_output_dir, name_without_suff\n            )\n            return content_list\n\n        except Exception as e:\n            self.logger.error(f\"Error in parse_pdf: {str(e)}\")\n            raise\n\n    def parse_document(\n        self,\n        file_path: Union[str, Path],\n        method: str = \"auto\",\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse document using Docling based on file extension\n\n        Args:\n            file_path: Path to the file to be parsed\n            method: Parsing method\n            output_dir: Output directory path\n            lang: Document language for optimization\n            **kwargs: Additional parameters for docling command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        # Convert to Path object\n        file_path = Path(file_path)\n        if not file_path.exists():\n            raise FileNotFoundError(f\"File does not exist: {file_path}\")\n\n        # Get file extension\n        ext = file_path.suffix.lower()\n\n        # Choose appropriate parser based on file type\n        if ext == \".pdf\":\n            return self.parse_pdf(file_path, output_dir, method, lang, **kwargs)\n        elif ext in self.OFFICE_FORMATS:\n            return 
self.parse_office_doc(file_path, output_dir, lang, **kwargs)\n        elif ext in self.HTML_FORMATS:\n            return self.parse_html(file_path, output_dir, lang, **kwargs)\n        else:\n            raise ValueError(\n                f\"Unsupported file format: {ext}. \"\n                f\"Docling only supports PDF files, Office formats ({', '.join(self.OFFICE_FORMATS)}) \"\n                f\"and HTML formats ({', '.join(self.HTML_FORMATS)})\"\n            )\n\n    def _run_docling_command(\n        self,\n        input_path: Union[str, Path],\n        output_dir: Union[str, Path],\n        file_stem: str,\n        **kwargs,\n    ) -> None:\n        \"\"\"\n        Run docling command line tool\n\n        Args:\n            input_path: Path to input file or directory\n            output_dir: Output directory path\n            file_stem: File stem for creating subdirectory\n            **kwargs: Additional parameters for docling command\n        \"\"\"\n        # Create subdirectory structure similar to MinerU\n        file_output_dir = Path(output_dir) / file_stem / \"docling\"\n        file_output_dir.mkdir(parents=True, exist_ok=True)\n\n        cmd = [\n            \"docling\",\n            \"--output\",\n            str(file_output_dir),\n            \"--to\",\n            \"json\",\n            \"--to\",\n            \"md\",\n            str(input_path),\n        ]\n\n        # Handle and validate environment variables\n        custom_env = kwargs.pop(\"env\", None)\n\n        # Validate env if provided\n        if custom_env is not None:\n            if not isinstance(custom_env, dict):\n                raise TypeError(\n                    f\"env must be a dictionary, got {type(custom_env).__name__}\"\n                )\n            for k, v in custom_env.items():\n                if not isinstance(k, str) or not isinstance(v, str):\n                    raise TypeError(\"env keys and values must be strings\")\n\n        try:\n            # Prepare 
subprocess parameters to hide console window on Windows\n            import platform\n\n            env = None\n            if custom_env:\n                env = os.environ.copy()\n                env.update(custom_env)\n\n            docling_subprocess_kwargs = {\n                \"capture_output\": True,\n                \"text\": True,\n                \"check\": True,\n                \"encoding\": \"utf-8\",\n                \"errors\": \"ignore\",\n                \"env\": env,\n            }\n\n            # Hide console window on Windows\n            if platform.system() == \"Windows\":\n                docling_subprocess_kwargs[\"creationflags\"] = subprocess.CREATE_NO_WINDOW\n\n            result = subprocess.run(cmd, **docling_subprocess_kwargs)\n            self.logger.info(\"Docling command executed successfully\")\n            if result.stdout:\n                self.logger.debug(f\"JSON and Markdown cmd output: {result.stdout}\")\n        except subprocess.CalledProcessError as e:\n            self.logger.error(f\"Error running docling command: {e}\")\n            if e.stderr:\n                self.logger.error(f\"Error details: {e.stderr}\")\n            raise\n        except FileNotFoundError:\n            raise RuntimeError(\n                \"docling command not found. 
Please ensure Docling is properly installed.\"\n            )\n\n    def _read_output_files(\n        self,\n        output_dir: Path,\n        file_stem: str,\n    ) -> Tuple[List[Dict[str, Any]], str]:\n        \"\"\"\n        Read the output files generated by docling and convert to MinerU format\n\n        Args:\n            output_dir: Output directory\n            file_stem: File name without extension\n\n        Returns:\n            Tuple containing (content list JSON, Markdown text)\n        \"\"\"\n        # Use subdirectory structure similar to MinerU\n        file_subdir = output_dir / file_stem / \"docling\"\n        md_file = file_subdir / f\"{file_stem}.md\"\n        json_file = file_subdir / f\"{file_stem}.json\"\n\n        # Read markdown content\n        md_content = \"\"\n        if md_file.exists():\n            try:\n                with open(md_file, \"r\", encoding=\"utf-8\") as f:\n                    md_content = f.read()\n            except Exception as e:\n                self.logger.warning(f\"Could not read markdown file {md_file}: {e}\")\n\n        # Read JSON content and convert format\n        content_list = []\n        if json_file.exists():\n            try:\n                with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                    docling_content = json.load(f)\n                    # Convert docling format to minerU format\n                    content_list = self.read_from_block_recursive(\n                        docling_content[\"body\"],\n                        \"body\",\n                        file_subdir,\n                        0,\n                        \"0\",\n                        docling_content,\n                    )\n            except Exception as e:\n                self.logger.warning(\n                    f\"Could not read or convert JSON file {json_file}: {e}\"\n                )\n        return content_list, md_content\n\n    def read_from_block_recursive(\n        self,\n        block,\n  
      type: str,\n        output_dir: Path,\n        cnt: int,\n        num: str,\n        docling_content: Dict[str, Any],\n    ) -> List[Dict[str, Any]]:\n        content_list = []\n        if not block.get(\"children\"):\n            cnt += 1\n            content_list.append(self.read_from_block(block, type, output_dir, cnt, num))\n        else:\n            if type not in [\"groups\", \"body\"]:\n                cnt += 1\n                content_list.append(\n                    self.read_from_block(block, type, output_dir, cnt, num)\n                )\n            members = block[\"children\"]\n            for member in members:\n                cnt += 1\n                member_tag = member[\"$ref\"]\n                member_type = member_tag.split(\"/\")[1]\n                member_num = member_tag.split(\"/\")[2]\n                member_block = docling_content[member_type][int(member_num)]\n                content_list.extend(\n                    self.read_from_block_recursive(\n                        member_block,\n                        member_type,\n                        output_dir,\n                        cnt,\n                        member_num,\n                        docling_content,\n                    )\n                )\n        return content_list\n\n    def read_from_block(\n        self, block, type: str, output_dir: Path, cnt: int, num: str\n    ) -> Dict[str, Any]:\n        if type == \"texts\":\n            if block[\"label\"] == \"formula\":\n                return {\n                    \"type\": \"equation\",\n                    \"img_path\": \"\",\n                    \"text\": block[\"orig\"],\n                    \"text_format\": \"unknown\",\n                    \"page_idx\": cnt // 10,\n                }\n            else:\n                return {\n                    \"type\": \"text\",\n                    \"text\": block[\"orig\"],\n                    \"page_idx\": cnt // 10,\n                }\n        elif type == 
\"pictures\":\n            try:\n                base64_uri = block[\"image\"][\"uri\"]\n                base64_str = base64_uri.split(\",\")[1]\n                # Create images directory within the docling subdirectory\n                image_dir = output_dir / \"images\"\n                image_dir.mkdir(parents=True, exist_ok=True)  # Ensure directory exists\n                image_path = image_dir / f\"image_{num}.png\"\n                with open(image_path, \"wb\") as f:\n                    f.write(base64.b64decode(base64_str))\n                return {\n                    \"type\": \"image\",\n                    \"img_path\": str(image_path.resolve()),  # Convert to absolute path\n                    \"image_caption\": block.get(\"caption\", \"\"),\n                    \"image_footnote\": block.get(\"footnote\", \"\"),\n                    \"page_idx\": cnt // 10,\n                }\n            except Exception as e:\n                self.logger.warning(f\"Failed to process image {num}: {e}\")\n                return {\n                    \"type\": \"text\",\n                    \"text\": f\"[Image processing failed: {block.get('caption', '')}]\",\n                    \"page_idx\": cnt // 10,\n                }\n        else:\n            try:\n                return {\n                    \"type\": \"table\",\n                    \"img_path\": \"\",\n                    \"table_caption\": block.get(\"caption\", \"\"),\n                    \"table_footnote\": block.get(\"footnote\", \"\"),\n                    \"table_body\": block.get(\"data\", []),\n                    \"page_idx\": cnt // 10,\n                }\n            except Exception as e:\n                self.logger.warning(f\"Failed to process table {num}: {e}\")\n                return {\n                    \"type\": \"text\",\n                    \"text\": f\"[Table processing failed: {block.get('caption', '')}]\",\n                    \"page_idx\": cnt // 10,\n                }\n\n    def 
parse_office_doc(\n        self,\n        doc_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse office document directly using Docling\n\n        Supported formats: .doc, .docx, .ppt, .pptx, .xls, .xlsx\n\n        Args:\n            doc_path: Path to the document file\n            output_dir: Output directory path\n            lang: Document language for optimization\n            **kwargs: Additional parameters for docling command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert to Path object\n            doc_path = Path(doc_path)\n            if not doc_path.exists():\n                raise FileNotFoundError(f\"Document file does not exist: {doc_path}\")\n\n            if doc_path.suffix.lower() not in self.OFFICE_FORMATS:\n                raise ValueError(f\"Unsupported office format: {doc_path.suffix}\")\n\n            name_without_suff = doc_path.stem\n\n            # Prepare output directory — use unique subdirectory to prevent\n            # same-name file collisions when output_dir is shared (#51)\n            if output_dir:\n                base_output_dir = self._unique_output_dir(output_dir, doc_path)\n            else:\n                base_output_dir = doc_path.parent / \"docling_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Run docling command\n            self._run_docling_command(\n                input_path=doc_path,\n                output_dir=base_output_dir,\n                file_stem=name_without_suff,\n                **kwargs,\n            )\n\n            # Read the generated output files\n            content_list, _ = self._read_output_files(\n                base_output_dir, name_without_suff\n            )\n            return content_list\n\n        except Exception as e:\n       
     self.logger.error(f\"Error in parse_office_doc: {str(e)}\")\n            raise\n\n    def parse_html(\n        self,\n        html_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        \"\"\"\n        Parse HTML document using Docling\n\n        Supported formats: .html, .htm, .xhtml\n\n        Args:\n            html_path: Path to the HTML file\n            output_dir: Output directory path\n            lang: Document language for optimization\n            **kwargs: Additional parameters for docling command\n\n        Returns:\n            List[Dict[str, Any]]: List of content blocks\n        \"\"\"\n        try:\n            # Convert to Path object\n            html_path = Path(html_path)\n            if not html_path.exists():\n                raise FileNotFoundError(f\"HTML file does not exist: {html_path}\")\n\n            if html_path.suffix.lower() not in self.HTML_FORMATS:\n                raise ValueError(f\"Unsupported HTML format: {html_path.suffix}\")\n\n            name_without_suff = html_path.stem\n\n            # Prepare output directory — use unique subdirectory to prevent\n            # same-name file collisions when output_dir is shared (#51)\n            if output_dir:\n                base_output_dir = self._unique_output_dir(output_dir, html_path)\n            else:\n                base_output_dir = html_path.parent / \"docling_output\"\n\n            base_output_dir.mkdir(parents=True, exist_ok=True)\n\n            # Run docling command\n            self._run_docling_command(\n                input_path=html_path,\n                output_dir=base_output_dir,\n                file_stem=name_without_suff,\n                **kwargs,\n            )\n\n            # Read the generated output files\n            content_list, _ = self._read_output_files(\n                base_output_dir, name_without_suff\n            )\n            
return content_list\n\n        except Exception as e:\n            self.logger.error(f\"Error in parse_html: {str(e)}\")\n            raise\n\n    def check_installation(self) -> bool:\n        \"\"\"\n        Check if Docling is properly installed\n\n        Returns:\n            bool: True if installation is valid, False otherwise\n        \"\"\"\n        try:\n            # Prepare subprocess parameters to hide console window on Windows\n            import platform\n\n            subprocess_kwargs = {\n                \"capture_output\": True,\n                \"text\": True,\n                \"check\": True,\n                \"encoding\": \"utf-8\",\n                \"errors\": \"ignore\",\n            }\n\n            # Hide console window on Windows\n            if platform.system() == \"Windows\":\n                subprocess_kwargs[\"creationflags\"] = subprocess.CREATE_NO_WINDOW\n\n            result = subprocess.run([\"docling\", \"--version\"], **subprocess_kwargs)\n            self.logger.debug(f\"Docling version: {result.stdout.strip()}\")\n            return True\n        except (subprocess.CalledProcessError, FileNotFoundError):\n            self.logger.debug(\n                \"Docling is not properly installed. \"\n                \"Please ensure it is installed correctly.\"\n            )\n            return False\n\n\nclass PaddleOCRParser(Parser):\n    \"\"\"PaddleOCR document parser with optional PDF page rendering support.\"\"\"\n\n    def __init__(self, default_lang: str = \"en\") -> None:\n        super().__init__()\n        self.default_lang = default_lang\n        self._ocr_instances: Dict[str, Any] = {}\n\n    def _require_paddleocr(self):\n        try:\n            from paddleocr import PaddleOCR\n        except ImportError as exc:\n            raise ImportError(\n                \"PaddleOCR parser requires optional dependency `paddleocr`. 
\"\n                \"Install with `pip install -e '.[paddleocr]'` or \"\n                \"`uv sync --extra paddleocr`. \"\n                \"PaddleOCR also needs `paddlepaddle`; install it from \"\n                \"https://www.paddlepaddle.org.cn/install/quick.\"\n            ) from exc\n        return PaddleOCR\n\n    def _get_ocr(self, lang: Optional[str] = None):\n        PaddleOCR = self._require_paddleocr()\n        language = (lang or self.default_lang).strip() or self.default_lang\n        cached = self._ocr_instances.get(language)\n        if cached is not None:\n            return cached\n\n        init_candidates = [\n            {\"lang\": language, \"show_log\": False},\n            {\"lang\": language},\n            {},\n        ]\n        last_exception = None\n        for candidate_kwargs in init_candidates:\n            try:\n                ocr = PaddleOCR(**candidate_kwargs)\n                self._ocr_instances[language] = ocr\n                return ocr\n            except Exception as exc:  # pragma: no cover - defensive fallback\n                last_exception = exc\n                continue\n\n        raise RuntimeError(\n            f\"Unable to initialize PaddleOCR for language '{language}': {last_exception}\"\n        )\n\n    def _extract_text_lines(self, result: Any) -> List[str]:\n        lines: List[str] = []\n\n        def append_text(text: str) -> None:\n            clean_text = text.strip()\n            if clean_text:\n                lines.append(clean_text)\n\n        if isinstance(result, str):\n            append_text(result)\n            return lines\n\n        def visit(node: Any) -> None:\n            if node is None:\n                return\n\n            if hasattr(node, \"to_dict\"):\n                try:\n                    visit(node.to_dict())\n                    return\n                except Exception:\n                    pass\n\n            if isinstance(node, dict):\n                rec_texts = 
node.get(\"rec_texts\")\n                if isinstance(rec_texts, list):\n                    for item in rec_texts:\n                        if isinstance(item, str):\n                            append_text(item)\n                        else:\n                            visit(item)\n\n                text_value = node.get(\"text\")\n                if isinstance(text_value, str):\n                    append_text(text_value)\n\n                texts_value = node.get(\"texts\")\n                if isinstance(texts_value, list):\n                    for item in texts_value:\n                        if isinstance(item, str):\n                            append_text(item)\n                        else:\n                            visit(item)\n\n                # Avoid double-visiting keys we already handled above; this prevents\n                # accidental duplication without content-level deduplication.\n                for key, value in node.items():\n                    if key in {\"rec_texts\", \"text\", \"texts\"}:\n                        continue\n                    visit(value)\n                return\n\n            if isinstance(node, (list, tuple)):\n                if node and all(isinstance(item, str) for item in node):\n                    for item in node:\n                        append_text(item)\n                    return\n\n                if (\n                    len(node) >= 2\n                    and isinstance(node[1], (list, tuple))\n                    and len(node[1]) >= 1\n                    and isinstance(node[1][0], str)\n                ):\n                    append_text(node[1][0])\n                    return\n\n                if (\n                    len(node) >= 1\n                    and isinstance(node[0], str)\n                    and (len(node) == 1 or isinstance(node[1], (int, float)))\n                ):\n                    append_text(node[0])\n                    return\n\n                for item in node:\n          
          visit(item)\n                return\n\n            if isinstance(node, str):\n                append_text(node)\n                return\n\n        visit(result)\n        return lines\n\n    def _ocr_input(\n        self, input_data: Any, lang: Optional[str] = None, cls_enabled: bool = True\n    ) -> List[str]:\n        ocr = self._get_ocr(lang=lang)\n\n        if hasattr(ocr, \"ocr\"):\n            try:\n                result = ocr.ocr(input_data, cls=cls_enabled)\n            except TypeError:\n                result = ocr.ocr(input_data)\n            return self._extract_text_lines(result)\n\n        if hasattr(ocr, \"predict\"):\n            result = ocr.predict(input_data)\n            return self._extract_text_lines(result)\n\n        raise RuntimeError(\n            \"Unsupported PaddleOCR API: expected `ocr` or `predict` method.\"\n        )\n\n    def _extract_pdf_page_inputs(self, pdf_path: Path) -> Iterator[Tuple[int, Any]]:\n        try:\n            import pypdfium2 as pdfium\n        except ImportError as exc:\n            raise ImportError(\n                \"PDF parsing with parser='paddleocr' requires `pypdfium2`. 
\"\n                \"Install with `pip install -e '.[paddleocr]'` or \"\n                \"`uv sync --extra paddleocr`.\"\n            ) from exc\n\n        pdf = pdfium.PdfDocument(str(pdf_path))\n        try:\n            total_pages = len(pdf)\n            for page_idx in range(total_pages):\n                page = pdf[page_idx]\n                try:\n                    rendered = page.render(scale=2.0)\n                    if hasattr(rendered, \"to_pil\"):\n                        yield (page_idx, rendered.to_pil())\n                    elif hasattr(rendered, \"to_numpy\"):\n                        yield (page_idx, rendered.to_numpy())\n                    else:\n                        raise RuntimeError(\n                            \"Unsupported rendered page format from pypdfium2.\"\n                        )\n                finally:\n                    if hasattr(page, \"close\"):\n                        page.close()\n        finally:\n            if hasattr(pdf, \"close\"):\n                pdf.close()\n\n    def _ocr_rendered_page(\n        self, rendered_page: Any, lang: Optional[str] = None, cls_enabled: bool = True\n    ) -> List[str]:\n        if hasattr(rendered_page, \"save\"):\n            temp_image_path: Optional[Path] = None\n            try:\n                with tempfile.NamedTemporaryFile(suffix=\".png\", delete=False) as temp:\n                    temp_image_path = Path(temp.name)\n                rendered_page.save(temp_image_path)\n                return self._ocr_input(\n                    str(temp_image_path), lang=lang, cls_enabled=cls_enabled\n                )\n            finally:\n                if temp_image_path is not None and temp_image_path.exists():\n                    try:\n                        temp_image_path.unlink()\n                    except Exception:\n                        pass\n\n        return self._ocr_input(rendered_page, lang=lang, cls_enabled=cls_enabled)\n\n    def parse_pdf(\n        self,\n       
 pdf_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        method: str = \"auto\",\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        del output_dir, method\n        pdf_path = Path(pdf_path)\n        if not pdf_path.exists():\n            raise FileNotFoundError(f\"PDF file does not exist: {pdf_path}\")\n\n        cls_enabled = kwargs.get(\"cls\", True)\n        content_list: List[Dict[str, Any]] = []\n        page_inputs = self._extract_pdf_page_inputs(pdf_path)\n        try:\n            for page_idx, rendered_page in page_inputs:\n                page_lines = self._ocr_rendered_page(\n                    rendered_page, lang=lang, cls_enabled=cls_enabled\n                )\n                for text in page_lines:\n                    content_list.append(\n                        {\"type\": \"text\", \"text\": text, \"page_idx\": int(page_idx)}\n                    )\n        finally:\n            # Ensure we promptly release PDF handles even if OCR fails mid-stream.\n            close = getattr(page_inputs, \"close\", None)\n            if callable(close):\n                close()\n        return content_list\n\n    def parse_image(\n        self,\n        image_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        del output_dir\n        image_path = Path(image_path)\n        if not image_path.exists():\n            raise FileNotFoundError(f\"Image file does not exist: {image_path}\")\n\n        ext = image_path.suffix.lower()\n        if ext not in self.IMAGE_FORMATS:\n            raise ValueError(\n                f\"Unsupported image format: {ext}. 
Supported formats: {', '.join(sorted(self.IMAGE_FORMATS))}\"\n            )\n\n        cls_enabled = kwargs.get(\"cls\", True)\n        page_idx = int(kwargs.get(\"page_idx\", 0))\n        text_lines = self._ocr_input(\n            str(image_path), lang=lang, cls_enabled=cls_enabled\n        )\n        return [\n            {\"type\": \"text\", \"text\": text, \"page_idx\": page_idx} for text in text_lines\n        ]\n\n    def parse_office_doc(\n        self,\n        doc_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        pdf_path = self.convert_office_to_pdf(doc_path, output_dir)\n        return self.parse_pdf(\n            pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs\n        )\n\n    def parse_text_file(\n        self,\n        text_path: Union[str, Path],\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        pdf_path = self.convert_text_to_pdf(text_path, output_dir)\n        return self.parse_pdf(\n            pdf_path=pdf_path, output_dir=output_dir, lang=lang, **kwargs\n        )\n\n    def parse_document(\n        self,\n        file_path: Union[str, Path],\n        method: str = \"auto\",\n        output_dir: Optional[str] = None,\n        lang: Optional[str] = None,\n        **kwargs,\n    ) -> List[Dict[str, Any]]:\n        del method\n        file_path = Path(file_path)\n        if not file_path.exists():\n            raise FileNotFoundError(f\"File does not exist: {file_path}\")\n\n        ext = file_path.suffix.lower()\n        if ext == \".pdf\":\n            return self.parse_pdf(file_path, output_dir, lang=lang, **kwargs)\n        if ext in self.IMAGE_FORMATS:\n            return self.parse_image(file_path, output_dir, lang=lang, **kwargs)\n        if ext in self.OFFICE_FORMATS:\n            return self.parse_office_doc(file_path, 
output_dir, lang=lang, **kwargs)\n        if ext in self.TEXT_FORMATS:\n            return self.parse_text_file(file_path, output_dir, lang=lang, **kwargs)\n\n        raise ValueError(\n            f\"Unsupported file format: {ext}. \"\n            \"PaddleOCR parser supports PDF, image, office, and text formats.\"\n        )\n\n    def check_installation(self) -> bool:\n        try:\n            self._require_paddleocr()\n            return True\n        except ImportError:\n            return False\n\n\ndef _normalize_parser_name(name: str) -> str:\n    \"\"\"Normalize and validate a parser name for registry APIs.\"\"\"\n    if not isinstance(name, str):\n        raise TypeError(\n            f\"parser name must be a non-empty string, got {type(name).__name__}\"\n        )\n    normalized = name.strip().lower()\n    if not normalized:\n        raise ValueError(\"parser name must be a non-empty string\")\n    return normalized\n\n\n# Custom parser registry for Bring-Your-Own-Parser support (see #151)\n_CUSTOM_PARSERS: Dict[str, type] = {}\n\n\ndef register_parser(name: str, parser_class: type) -> None:\n    \"\"\"Register a custom parser class for use with RAGAnything.\n\n    This enables the Bring-Your-Own-Parser pattern: users can integrate\n    any document parser (e.g., Marker, Unstructured, Surya) by subclassing\n    ``Parser`` and registering it here.\n\n    Args:\n        name: Unique identifier for the parser (e.g., \"marker\", \"surya\").\n              Must not collide with built-in names (\"mineru\", \"docling\", \"paddleocr\").\n        parser_class: A subclass of ``Parser`` that implements at least\n                      ``parse_document``, ``check_installation``, and\n                      optionally ``parse_pdf``, ``parse_image``, ``parse_office_doc``.\n\n    Raises:\n        TypeError: If *parser_class* is not a subclass of ``Parser``.\n        ValueError: If *name* collides with a built-in parser name.\n\n    Example::\n\n        from 
raganything.parser import Parser, register_parser\n\n        class MarkerParser(Parser):\n            def check_installation(self) -> bool:\n                try:\n                    import marker\n                    return True\n                except ImportError:\n                    return False\n\n            def parse_pdf(self, pdf_path, output_dir=\"./output\", method=\"auto\", **kw):\n                import marker\n                # ... your implementation ...\n                return content_list\n\n            def parse_document(self, file_path, output_dir=\"./output\", method=\"auto\", **kw):\n                return self.parse_pdf(pdf_path=file_path, output_dir=output_dir, method=method, **kw)\n\n        register_parser(\"marker\", MarkerParser)\n    \"\"\"\n    normalized_name = _normalize_parser_name(name)\n    if not isinstance(parser_class, type) or not issubclass(parser_class, Parser):\n        raise TypeError(\n            f\"parser_class must be a subclass of Parser, got {parser_class!r}\"\n        )\n    _BUILTIN_NAMES = {\"mineru\", \"docling\", \"paddleocr\"}\n    if normalized_name in _BUILTIN_NAMES:\n        raise ValueError(\n            f\"Cannot override built-in parser '{normalized_name}'. 
\"\n            f\"Choose a different name for your custom parser.\"\n        )\n    _CUSTOM_PARSERS[normalized_name] = parser_class\n    Parser.logger.info(\n        \"Registered custom parser: '%s' -> %s\", normalized_name, parser_class.__name__\n    )\n\n\ndef unregister_parser(name: str) -> None:\n    \"\"\"Remove a previously registered custom parser.\n\n    Args:\n        name: The parser name to remove.\n\n    Raises:\n        TypeError: If *name* is not a string.\n        ValueError: If *name* is empty or only whitespace.\n        KeyError: If no custom parser with that name is registered.\n    \"\"\"\n    normalized_name = _normalize_parser_name(name)\n    if normalized_name not in _CUSTOM_PARSERS:\n        raise KeyError(f\"No custom parser registered with name '{normalized_name}'\")\n    del _CUSTOM_PARSERS[normalized_name]\n    Parser.logger.info(\"Unregistered custom parser: '%s'\", normalized_name)\n\n\ndef list_parsers() -> Dict[str, str]:\n    \"\"\"Return a mapping of all available parser names to their class names.\n\n    Returns:\n        Dict mapping parser name to the fully-qualified class name.\n        Includes both built-in and custom parsers.\n    \"\"\"\n    result: Dict[str, str] = {\n        \"mineru\": \"MineruParser\",\n        \"docling\": \"DoclingParser\",\n        \"paddleocr\": \"PaddleOCRParser\",\n    }\n    for name, cls in _CUSTOM_PARSERS.items():\n        result[name] = cls.__name__\n    return result\n\n\nSUPPORTED_PARSERS = (\"mineru\", \"docling\", \"paddleocr\")\n\n\ndef get_supported_parsers() -> tuple:\n    \"\"\"Return all supported parser names including custom registered parsers.\"\"\"\n    return SUPPORTED_PARSERS + tuple(_CUSTOM_PARSERS.keys())\n\n\ndef get_parser(parser_type: str) -> Parser:\n    \"\"\"Get a parser instance by name.\n\n    Checks built-in parsers first, then falls back to the custom parser\n    registry populated via :func:`register_parser`.\n\n    Args:\n        parser_type: Parser name (e.g., 
\"mineru\", \"docling\", \"paddleocr\",\n                     or any custom registered name).\n\n    Returns:\n        An instance of the requested parser.\n\n    Raises:\n        ValueError: If the parser name is not recognized.\n    \"\"\"\n    parser_name = (parser_type or \"mineru\").strip().lower()\n    if parser_name == \"mineru\":\n        return MineruParser()\n    if parser_name == \"docling\":\n        return DoclingParser()\n    if parser_name == \"paddleocr\":\n        return PaddleOCRParser()\n    # Check custom parser registry\n    if parser_name in _CUSTOM_PARSERS:\n        return _CUSTOM_PARSERS[parser_name]()\n    raise ValueError(\n        f\"Unsupported parser type: {parser_type}. \"\n        f\"Supported parsers: {', '.join(get_supported_parsers())}\"\n    )\n\n\ndef main():\n    \"\"\"\n    Main function to run the document parser from command line\n    \"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Parse documents using MinerU 2.0, Docling, or PaddleOCR\"\n    )\n    parser.add_argument(\"file_path\", help=\"Path to the document to parse\")\n    parser.add_argument(\"--output\", \"-o\", help=\"Output directory path\")\n    parser.add_argument(\n        \"--method\",\n        \"-m\",\n        choices=[\"auto\", \"txt\", \"ocr\"],\n        default=\"auto\",\n        help=\"Parsing method (auto, txt, ocr)\",\n    )\n    parser.add_argument(\n        \"--lang\",\n        \"-l\",\n        help=\"Document language for OCR optimization (e.g., ch, en, ja)\",\n    )\n    parser.add_argument(\n        \"--backend\",\n        \"-b\",\n        choices=[\n            \"pipeline\",\n            \"hybrid-auto-engine\",\n            \"hybrid-http-client\",\n            \"vlm-auto-engine\",\n            \"vlm-http-client\",\n        ],\n        default=\"pipeline\",\n        help=\"Parsing backend\",\n    )\n    parser.add_argument(\n        \"--device\",\n        \"-d\",\n        help=\"Inference device (e.g., cpu, cuda, cuda:0, npu, 
mps)\",\n    )\n    parser.add_argument(\n        \"--source\",\n        choices=[\"huggingface\", \"modelscope\", \"local\"],\n        default=\"huggingface\",\n        help=\"Model source\",\n    )\n    parser.add_argument(\n        \"--no-formula\",\n        action=\"store_true\",\n        help=\"Disable formula parsing\",\n    )\n    parser.add_argument(\n        \"--no-table\",\n        action=\"store_true\",\n        help=\"Disable table parsing\",\n    )\n    parser.add_argument(\n        \"--stats\", action=\"store_true\", help=\"Display content statistics\"\n    )\n    parser.add_argument(\n        \"--check\",\n        action=\"store_true\",\n        help=\"Check parser installation\",\n    )\n    parser.add_argument(\n        \"--parser\",\n        default=\"mineru\",\n        help=(\n            \"Parser selection. Built-ins: mineru, docling, paddleocr. \"\n            \"Custom parsers registered via register_parser() in the same \"\n            \"Python process are also accepted when you integrate RAGAnything \"\n            \"as a library. 
The standalone CLI itself only sees parsers that \"\n            \"have already been registered in this process.\"\n        ),\n    )\n    parser.add_argument(\n        \"--vlm_url\",\n        help=\"When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`\",\n    )\n\n    args = parser.parse_args()\n\n    # Check installation if requested\n    if args.check:\n        doc_parser = get_parser(args.parser)\n        if doc_parser.check_installation():\n            print(f\"✅ {args.parser.title()} is properly installed\")\n            return 0\n        else:\n            print(f\"❌ {args.parser.title()} installation check failed\")\n            return 1\n\n    try:\n        # Parse the document\n        doc_parser = get_parser(args.parser)\n        content_list = doc_parser.parse_document(\n            file_path=args.file_path,\n            method=args.method,\n            output_dir=args.output,\n            lang=args.lang,\n            backend=args.backend,\n            device=args.device,\n            source=args.source,\n            formula=not args.no_formula,\n            table=not args.no_table,\n            vlm_url=args.vlm_url,\n        )\n\n        print(f\"✅ Successfully parsed: {args.file_path}\")\n        print(f\"📊 Extracted {len(content_list)} content blocks\")\n\n        # Display statistics if requested\n        if args.stats:\n            print(\"\\n📈 Document Statistics:\")\n            print(f\"Total content blocks: {len(content_list)}\")\n\n            # Count different types of content\n            content_types = {}\n            for item in content_list:\n                if isinstance(item, dict):\n                    content_type = item.get(\"type\", \"unknown\")\n                    content_types[content_type] = content_types.get(content_type, 0) + 1\n\n            if content_types:\n                print(\"\\n📋 Content Type Distribution:\")\n                for content_type, count in 
sorted(content_types.items()):\n                    print(f\"  • {content_type}: {count}\")\n\n    except Exception as e:\n        print(f\"❌ Error: {str(e)}\")\n        return 1\n\n    return 0\n\n\nif __name__ == \"__main__\":\n    exit(main())\n"
  },
  {
    "path": "raganything/processor.py",
    "content": "\"\"\"\nDocument processing functionality for RAGAnything\n\nContains methods for parsing documents and processing multimodal content\n\"\"\"\n\nimport os\nimport time\nimport hashlib\nimport json\nfrom typing import Dict, List, Any, Tuple, Optional\nfrom pathlib import Path\n\nfrom raganything.base import DocStatus\nfrom raganything.parser import MineruParser, MineruExecutionError, get_parser\nfrom raganything.utils import (\n    separate_content,\n    insert_text_content,\n    insert_text_content_with_multimodal_content,\n    get_processor_for_type,\n)\nimport asyncio\nfrom lightrag.utils import compute_mdhash_id\n\n\nclass ProcessorMixin:\n    \"\"\"ProcessorMixin class containing document processing functionality for RAGAnything\"\"\"\n\n    def _get_file_reference(self, file_path: str) -> str:\n        \"\"\"\n        Get file reference based on use_full_path configuration.\n\n        Args:\n            file_path: Path to the file (can be absolute or relative)\n\n        Returns:\n            str: Full path if use_full_path is True, otherwise basename\n        \"\"\"\n        if self.config.use_full_path:\n            return str(file_path)\n        else:\n            return os.path.basename(file_path)\n\n    def _generate_cache_key(\n        self, file_path: Path, parse_method: str = None, **kwargs\n    ) -> str:\n        \"\"\"\n        Generate cache key based on file path and parsing configuration\n\n        Args:\n            file_path: Path to the file\n            parse_method: Parse method used\n            **kwargs: Additional parser parameters\n\n        Returns:\n            str: Cache key for the file and configuration\n        \"\"\"\n\n        # Get file modification time\n        mtime = file_path.stat().st_mtime\n\n        # Create configuration dict for cache key\n        config_dict = {\n            \"file_path\": str(file_path.absolute()),\n            \"mtime\": mtime,\n            \"parser\": self.config.parser,\n            
\"parse_method\": parse_method or self.config.parse_method,\n        }\n\n        # Add relevant kwargs to config\n        relevant_kwargs = {\n            k: v\n            for k, v in kwargs.items()\n            if k\n            in [\n                \"lang\",\n                \"device\",\n                \"start_page\",\n                \"end_page\",\n                \"formula\",\n                \"table\",\n                \"backend\",\n                \"source\",\n            ]\n        }\n        config_dict.update(relevant_kwargs)\n\n        # Generate hash from config\n        config_str = json.dumps(config_dict, sort_keys=True)\n        cache_key = hashlib.md5(config_str.encode()).hexdigest()\n\n        return cache_key\n\n    def _generate_content_based_doc_id(self, content_list: List[Dict[str, Any]]) -> str:\n        \"\"\"\n        Generate doc_id based on document content\n\n        Args:\n            content_list: Parsed content list\n\n        Returns:\n            str: Content-based document ID with doc- prefix\n        \"\"\"\n        from lightrag.utils import compute_mdhash_id\n\n        # Extract key content for ID generation\n        content_hash_data = []\n\n        for item in content_list:\n            if isinstance(item, dict):\n                # For text content, use the text\n                if item.get(\"type\") == \"text\" and item.get(\"text\"):\n                    content_hash_data.append(item[\"text\"].strip())\n                # For other content types, use key identifiers\n                elif item.get(\"type\") == \"image\" and item.get(\"img_path\"):\n                    content_hash_data.append(f\"image:{item['img_path']}\")\n                elif item.get(\"type\") == \"table\" and item.get(\"table_body\"):\n                    content_hash_data.append(f\"table:{item['table_body']}\")\n                elif item.get(\"type\") == \"equation\" and item.get(\"text\"):\n                    
content_hash_data.append(f\"equation:{item['text']}\")\n                else:\n                    # For other types, use string representation\n                    content_hash_data.append(str(item))\n\n        # Create a content signature\n        content_signature = \"\\n\".join(content_hash_data)\n\n        # Generate doc_id from content\n        doc_id = compute_mdhash_id(content_signature, prefix=\"doc-\")\n\n        return doc_id\n\n    async def _get_cached_result(\n        self, cache_key: str, file_path: Path, parse_method: str = None, **kwargs\n    ) -> tuple[List[Dict[str, Any]], str] | None:\n        \"\"\"\n        Get cached parsing result if available and valid\n\n        Args:\n            cache_key: Cache key to look up\n            file_path: Path to the file for mtime check\n            parse_method: Parse method used\n            **kwargs: Additional parser parameters\n\n        Returns:\n            tuple[List[Dict[str, Any]], str] | None: (content_list, doc_id) or None if not found/invalid\n        \"\"\"\n        if not hasattr(self, \"parse_cache\") or self.parse_cache is None:\n            return None\n\n        try:\n            cached_data = await self.parse_cache.get_by_id(cache_key)\n            if not cached_data:\n                return None\n\n            # Check file modification time\n            current_mtime = file_path.stat().st_mtime\n            cached_mtime = cached_data.get(\"mtime\", 0)\n\n            if current_mtime != cached_mtime:\n                self.logger.debug(f\"Cache invalid - file modified: {cache_key}\")\n                return None\n\n            # Check parsing configuration\n            cached_config = cached_data.get(\"parse_config\", {})\n            current_config = {\n                \"parser\": self.config.parser,\n                \"parse_method\": parse_method or self.config.parse_method,\n            }\n\n            # Add relevant kwargs to current config\n            relevant_kwargs = {\n           
     k: v\n                for k, v in kwargs.items()\n                if k\n                in [\n                    \"lang\",\n                    \"device\",\n                    \"start_page\",\n                    \"end_page\",\n                    \"formula\",\n                    \"table\",\n                    \"backend\",\n                    \"source\",\n                ]\n            }\n            current_config.update(relevant_kwargs)\n\n            if cached_config != current_config:\n                self.logger.debug(f\"Cache invalid - config changed: {cache_key}\")\n                return None\n\n            content_list = cached_data.get(\"content_list\", [])\n            doc_id = cached_data.get(\"doc_id\")\n\n            if content_list and doc_id:\n                self.logger.debug(\n                    f\"Found valid cached parsing result for key: {cache_key}\"\n                )\n                return content_list, doc_id\n            else:\n                self.logger.debug(\n                    f\"Cache incomplete - missing content or doc_id: {cache_key}\"\n                )\n                return None\n\n        except Exception as e:\n            self.logger.warning(f\"Error accessing parse cache: {e}\")\n\n        return None\n\n    async def _store_cached_result(\n        self,\n        cache_key: str,\n        content_list: List[Dict[str, Any]],\n        doc_id: str,\n        file_path: Path,\n        parse_method: str = None,\n        **kwargs,\n    ) -> None:\n        \"\"\"\n        Store parsing result in cache\n\n        Args:\n            cache_key: Cache key to store under\n            content_list: Content list to cache\n            doc_id: Content-based document ID\n            file_path: Path to the file for mtime storage\n            parse_method: Parse method used\n            **kwargs: Additional parser parameters\n        \"\"\"\n        if not hasattr(self, \"parse_cache\") or self.parse_cache is None:\n            
return\n\n        try:\n            # Get file modification time\n            file_mtime = file_path.stat().st_mtime\n\n            # Create parsing configuration\n            parse_config = {\n                \"parser\": self.config.parser,\n                \"parse_method\": parse_method or self.config.parse_method,\n            }\n\n            # Add relevant kwargs to config\n            relevant_kwargs = {\n                k: v\n                for k, v in kwargs.items()\n                if k\n                in [\n                    \"lang\",\n                    \"device\",\n                    \"start_page\",\n                    \"end_page\",\n                    \"formula\",\n                    \"table\",\n                    \"backend\",\n                    \"source\",\n                ]\n            }\n            parse_config.update(relevant_kwargs)\n\n            cache_data = {\n                cache_key: {\n                    \"content_list\": content_list,\n                    \"doc_id\": doc_id,\n                    \"mtime\": file_mtime,\n                    \"parse_config\": parse_config,\n                    \"cached_at\": time.time(),\n                    \"cache_version\": \"1.0\",\n                }\n            }\n            await self.parse_cache.upsert(cache_data)\n            # Ensure data is persisted to disk\n            await self.parse_cache.index_done_callback()\n            self.logger.info(f\"Stored parsing result in cache: {cache_key}\")\n        except Exception as e:\n            self.logger.warning(f\"Error storing to parse cache: {e}\")\n\n    async def parse_document(\n        self,\n        file_path: str,\n        output_dir: str = None,\n        parse_method: str = None,\n        display_stats: bool = None,\n        **kwargs,\n    ) -> tuple[List[Dict[str, Any]], str]:\n        \"\"\"\n        Parse document with caching support\n\n        Args:\n            file_path: Path to the file to parse\n            output_dir: 
Output directory (defaults to config.parser_output_dir)\n            parse_method: Parse method (defaults to config.parse_method)\n            display_stats: Whether to display content statistics (defaults to config.display_content_stats)\n            **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)\n\n        Returns:\n            tuple[List[Dict[str, Any]], str]: (content_list, doc_id)\n        \"\"\"\n        # Use config defaults if not provided\n        if output_dir is None:\n            output_dir = self.config.parser_output_dir\n        if parse_method is None:\n            parse_method = self.config.parse_method\n        if display_stats is None:\n            display_stats = self.config.display_content_stats\n\n        self.logger.info(f\"Starting document parsing: {file_path}\")\n\n        file_path = Path(file_path)\n        if not file_path.exists():\n            raise FileNotFoundError(f\"File not found: {file_path}\")\n\n        callback_file = str(file_path)\n        callback_manager = getattr(self, \"callback_manager\", None)\n        parse_start_time = time.time()\n        if callback_manager is not None:\n            callback_manager.dispatch(\n                \"on_parse_start\",\n                file_path=callback_file,\n                parser=self.config.parser,\n            )\n\n        # Generate cache key based on file and configuration\n        cache_key = self._generate_cache_key(file_path, parse_method, **kwargs)\n\n        # Check cache first\n        cached_result = await self._get_cached_result(\n            cache_key, file_path, parse_method, **kwargs\n        )\n        if cached_result is not None:\n            content_list, doc_id = cached_result\n            self.logger.info(f\"Using cached parsing result for: {file_path}\")\n            if display_stats:\n                self.logger.info(\n                    f\"* Total blocks in cached content_list: 
{len(content_list)}\"\n                )\n            if callback_manager is not None:\n                duration = time.time() - parse_start_time\n                callback_manager.dispatch(\n                    \"on_parse_complete\",\n                    file_path=callback_file,\n                    content_blocks=len(content_list),\n                    doc_id=doc_id,\n                    duration_seconds=duration,\n                )\n            return content_list, doc_id\n\n        # Choose appropriate parsing method based on file extension\n        ext = file_path.suffix.lower()\n\n        try:\n            doc_parser = getattr(self, \"doc_parser\", None)\n            if doc_parser is None:\n                doc_parser = get_parser(self.config.parser)\n                self.doc_parser = doc_parser\n\n            # Log parser and method information\n            self.logger.info(\n                f\"Using {self.config.parser} parser with method: {parse_method}\"\n            )\n\n            if ext in [\".pdf\"]:\n                self.logger.info(\"Detected PDF file, using parser for PDF...\")\n                content_list = await asyncio.to_thread(\n                    doc_parser.parse_pdf,\n                    pdf_path=file_path,\n                    output_dir=output_dir,\n                    method=parse_method,\n                    **kwargs,\n                )\n            elif ext in [\n                \".jpg\",\n                \".jpeg\",\n                \".png\",\n                \".bmp\",\n                \".tiff\",\n                \".tif\",\n                \".gif\",\n                \".webp\",\n            ]:\n                self.logger.info(\"Detected image file, using parser for images...\")\n                try:\n                    content_list = await asyncio.to_thread(\n                        doc_parser.parse_image,\n                        image_path=file_path,\n                        output_dir=output_dir,\n                        
**kwargs,\n                    )\n                except NotImplementedError:\n                    # Fallback to MinerU for image parsing if current parser doesn't support it\n                    self.logger.warning(\n                        f\"{self.config.parser} parser doesn't support image parsing, falling back to MinerU\"\n                    )\n                    content_list = await asyncio.to_thread(\n                        MineruParser().parse_image,\n                        image_path=file_path,\n                        output_dir=output_dir,\n                        **kwargs,\n                    )\n            elif ext in [\n                \".doc\",\n                \".docx\",\n                \".ppt\",\n                \".pptx\",\n                \".xls\",\n                \".xlsx\",\n                \".html\",\n                \".htm\",\n                \".xhtml\",\n            ]:\n                self.logger.info(\n                    \"Detected Office or HTML document, using parser for Office/HTML...\"\n                )\n                content_list = await asyncio.to_thread(\n                    doc_parser.parse_office_doc,\n                    doc_path=file_path,\n                    output_dir=output_dir,\n                    **kwargs,\n                )\n            else:\n                # For other or unknown formats, use generic parser\n                self.logger.info(\n                    f\"Using generic parser for {ext} file (method={parse_method})...\"\n                )\n                content_list = await asyncio.to_thread(\n                    doc_parser.parse_document,\n                    file_path=file_path,\n                    method=parse_method,\n                    output_dir=output_dir,\n                    **kwargs,\n                )\n\n        except MineruExecutionError as e:\n            self.logger.error(f\"Mineru command failed: {e}\")\n            if callback_manager is not None:\n                
callback_manager.dispatch(\n                    \"on_parse_error\",\n                    file_path=callback_file,\n                    error=e,\n                    parser=self.config.parser,\n                )\n            raise\n        except Exception as e:\n            self.logger.error(\n                f\"Error during parsing with {self.config.parser} parser: {str(e)}\"\n            )\n            if callback_manager is not None:\n                callback_manager.dispatch(\n                    \"on_parse_error\",\n                    file_path=callback_file,\n                    error=e,\n                    parser=self.config.parser,\n                )\n            raise\n\n        msg = f\"Parsing {file_path} complete! Extracted {len(content_list)} content blocks\"\n        self.logger.info(msg)\n\n        if len(content_list) == 0:\n            raise ValueError(\"Parsing failed: No content was extracted\")\n\n        # Generate doc_id based on content\n        doc_id = self._generate_content_based_doc_id(content_list)\n\n        # Store result in cache\n        await self._store_cached_result(\n            cache_key, content_list, doc_id, file_path, parse_method, **kwargs\n        )\n\n        # Display content statistics if requested\n        if display_stats:\n            self.logger.info(\"\\nContent Information:\")\n            self.logger.info(f\"* Total blocks in content_list: {len(content_list)}\")\n\n            # Count elements by type\n            block_types: Dict[str, int] = {}\n            for block in content_list:\n                if isinstance(block, dict):\n                    block_type = block.get(\"type\", \"unknown\")\n                    if isinstance(block_type, str):\n                        block_types[block_type] = block_types.get(block_type, 0) + 1\n\n            self.logger.info(\"* Content block types:\")\n            for block_type, count in block_types.items():\n                self.logger.info(f\"  - {block_type}: 
{count}\")\n\n        if callback_manager is not None:\n            duration = time.time() - parse_start_time\n            callback_manager.dispatch(\n                \"on_parse_complete\",\n                file_path=callback_file,\n                content_blocks=len(content_list),\n                doc_id=doc_id,\n                duration_seconds=duration,\n            )\n\n        return content_list, doc_id\n\n    async def _process_multimodal_content(\n        self,\n        multimodal_items: List[Dict[str, Any]],\n        file_path: str,\n        doc_id: str,\n        pipeline_status: Optional[Any] = None,\n        pipeline_status_lock: Optional[Any] = None,\n    ):\n        \"\"\"\n        Process multimodal content (using specialized processors)\n\n        Args:\n            multimodal_items: List of multimodal items\n            file_path: File path (for reference)\n            doc_id: Document ID for proper chunk association\n            pipeline_status: Pipeline status object\n            pipeline_status_lock: Pipeline status lock\n        \"\"\"\n\n        if not multimodal_items:\n            self.logger.debug(\"No multimodal content to process\")\n            return\n\n        callback_manager = getattr(self, \"callback_manager\", None)\n        mm_start_time = time.time()\n        if callback_manager is not None:\n            callback_manager.dispatch(\n                \"on_multimodal_start\",\n                file_path=file_path,\n                item_count=len(multimodal_items),\n                doc_id=doc_id,\n            )\n\n        # Check multimodal processing status - handle LightRAG's early DocStatus.PROCESSED marking\n        try:\n            existing_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n            if existing_doc_status:\n                # Check if multimodal content is already processed\n                multimodal_processed = existing_doc_status.get(\n                    \"multimodal_processed\", False\n          
      )\n\n                if multimodal_processed:\n                    self.logger.info(\n                        f\"Document {doc_id} multimodal content is already processed\"\n                    )\n                    return\n\n                # Even if status is DocStatus.PROCESSED (text processing done),\n                # we still need to process multimodal content if not yet done\n                doc_status = existing_doc_status.get(\"status\", \"\")\n                if doc_status == DocStatus.PROCESSED and not multimodal_processed:\n                    self.logger.info(\n                        f\"Document {doc_id} text processing is complete, but multimodal content still needs processing\"\n                    )\n                    # Continue with multimodal processing\n                elif doc_status == DocStatus.PROCESSED and multimodal_processed:\n                    self.logger.info(\n                        f\"Document {doc_id} is fully processed (text + multimodal)\"\n                    )\n                    return\n\n        except Exception as e:\n            self.logger.debug(f\"Error checking document status for {doc_id}: {e}\")\n            # Continue with processing if cache check fails\n\n        # Use ProcessorMixin's own batch processing that can handle multiple content types\n        log_message = \"Starting multimodal content processing...\"\n        self.logger.info(log_message)\n        if pipeline_status_lock and pipeline_status:\n            async with pipeline_status_lock:\n                pipeline_status[\"latest_message\"] = log_message\n                pipeline_status[\"history_messages\"].append(log_message)\n\n        try:\n            # Ensure LightRAG is initialized\n            await self._ensure_lightrag_initialized()\n\n            await self._process_multimodal_content_batch_type_aware(\n                multimodal_items=multimodal_items, file_path=file_path, doc_id=doc_id\n            )\n\n            # Mark multimodal 
content as processed and update final status\n            await self._mark_multimodal_processing_complete(doc_id)\n\n            log_message = \"Multimodal content processing complete\"\n            self.logger.info(log_message)\n            if pipeline_status_lock and pipeline_status:\n                async with pipeline_status_lock:\n                    pipeline_status[\"latest_message\"] = log_message\n                    pipeline_status[\"history_messages\"].append(log_message)\n\n            if callback_manager is not None:\n                duration = time.time() - mm_start_time\n                callback_manager.dispatch(\n                    \"on_multimodal_complete\",\n                    file_path=file_path,\n                    processed_count=len(multimodal_items),\n                    duration_seconds=duration,\n                    doc_id=doc_id,\n                )\n\n        except Exception as e:\n            self.logger.error(f\"Error in multimodal processing: {e}\")\n            # Fallback to individual processing if batch processing fails\n            self.logger.warning(\"Falling back to individual multimodal processing\")\n            await self._process_multimodal_content_individual(\n                multimodal_items, file_path, doc_id\n            )\n\n            # Mark multimodal content as processed even after fallback\n            await self._mark_multimodal_processing_complete(doc_id)\n\n    async def _process_multimodal_content_individual(\n        self, multimodal_items: List[Dict[str, Any]], file_path: str, doc_id: str\n    ):\n        \"\"\"\n        Process multimodal content individually (fallback method)\n\n        Args:\n            multimodal_items: List of multimodal items\n            file_path: File path (for reference)\n            doc_id: Document ID for proper chunk association\n        \"\"\"\n        # Use full path or basename based on config\n        file_name = self._get_file_reference(file_path)\n\n        # Collect all 
chunk results for batch processing (similar to text content processing)\n        all_chunk_results = []\n        multimodal_chunk_ids = []\n\n        # Get current text chunks count to set proper order indexes for multimodal chunks\n        existing_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n        existing_chunks_count = (\n            existing_doc_status.get(\"chunks_count\", 0) if existing_doc_status else 0\n        )\n\n        for i, item in enumerate(multimodal_items):\n            try:\n                content_type = item.get(\"type\", \"unknown\")\n                self.logger.info(\n                    f\"Processing item {i + 1}/{len(multimodal_items)}: {content_type} content\"\n                )\n\n                # Select appropriate processor\n                processor = get_processor_for_type(self.modal_processors, content_type)\n\n                if processor:\n                    # Prepare item info for context extraction\n                    item_info = {\n                        \"page_idx\": item.get(\"page_idx\", 0),\n                        \"index\": i,\n                        \"type\": content_type,\n                    }\n\n                    # Process content and get chunk results instead of immediately merging\n                    (\n                        enhanced_caption,\n                        entity_info,\n                        chunk_results,\n                    ) = await processor.process_multimodal_content(\n                        modal_content=item,\n                        content_type=content_type,\n                        file_path=file_name,\n                        item_info=item_info,  # Pass item info for context extraction\n                        batch_mode=True,\n                        doc_id=doc_id,  # Pass doc_id for proper association\n                        chunk_order_index=existing_chunks_count\n                        + i,  # Proper order index\n                    )\n\n                 
   # Collect chunk results for batch processing\n                    all_chunk_results.extend(chunk_results)\n\n                    # Extract chunk ID from the entity_info (actual chunk_id created by processor)\n                    if entity_info and \"chunk_id\" in entity_info:\n                        chunk_id = entity_info[\"chunk_id\"]\n                        multimodal_chunk_ids.append(chunk_id)\n\n                    self.logger.info(\n                        f\"{content_type} processing complete: {entity_info.get('entity_name', 'Unknown')}\"\n                    )\n                else:\n                    self.logger.warning(\n                        f\"No suitable processor found for {content_type} type content\"\n                    )\n\n            except Exception as e:\n                self.logger.error(f\"Error processing multimodal content: {str(e)}\")\n                self.logger.debug(\"Exception details:\", exc_info=True)\n                continue\n\n        # Update doc_status to include multimodal chunks in the standard chunks_list\n        if multimodal_chunk_ids:\n            try:\n                # Get current document status\n                current_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n\n                if current_doc_status:\n                    existing_chunks_list = current_doc_status.get(\"chunks_list\", [])\n                    existing_chunks_count = current_doc_status.get(\"chunks_count\", 0)\n\n                    # Add multimodal chunks to the standard chunks_list\n                    updated_chunks_list = existing_chunks_list + multimodal_chunk_ids\n                    updated_chunks_count = existing_chunks_count + len(\n                        multimodal_chunk_ids\n                    )\n\n                    # Update document status with integrated chunk list\n                    await self.lightrag.doc_status.upsert(\n                        {\n                            doc_id: {\n                   
             **current_doc_status,  # Keep existing fields\n                                \"chunks_list\": updated_chunks_list,  # Integrated chunks list\n                                \"chunks_count\": updated_chunks_count,  # Updated total count\n                                \"updated_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S+00:00\"),\n                            }\n                        }\n                    )\n\n                    # Ensure doc_status update is persisted to disk\n                    await self.lightrag.doc_status.index_done_callback()\n\n                    self.logger.info(\n                        f\"Updated doc_status with {len(multimodal_chunk_ids)} multimodal chunks integrated into chunks_list\"\n                    )\n\n            except Exception as e:\n                self.logger.warning(\n                    f\"Error updating doc_status with multimodal chunks: {e}\"\n                )\n\n        # Batch merge all multimodal content results (similar to text content processing)\n        if all_chunk_results:\n            from lightrag.operate import merge_nodes_and_edges\n            from lightrag.kg.shared_storage import (\n                get_namespace_data,\n                get_pipeline_status_lock,\n            )\n\n            # Get pipeline status and lock from shared storage\n            pipeline_status = await get_namespace_data(\"pipeline_status\")\n            pipeline_status_lock = get_pipeline_status_lock()\n\n            await merge_nodes_and_edges(\n                chunk_results=all_chunk_results,\n                knowledge_graph_inst=self.lightrag.chunk_entity_relation_graph,\n                entity_vdb=self.lightrag.entities_vdb,\n                relationships_vdb=self.lightrag.relationships_vdb,\n                global_config=self.lightrag.__dict__,\n                full_entities_storage=self.lightrag.full_entities,\n                full_relations_storage=self.lightrag.full_relations,\n                
doc_id=doc_id,\n                pipeline_status=pipeline_status,\n                pipeline_status_lock=pipeline_status_lock,\n                llm_response_cache=self.lightrag.llm_response_cache,\n                current_file_number=1,\n                total_files=1,\n                file_path=file_name,\n            )\n\n            await self.lightrag._insert_done()\n\n        self.logger.info(\"Individual multimodal content processing complete\")\n\n        # Mark multimodal content as processed\n        await self._mark_multimodal_processing_complete(doc_id)\n\n    async def _process_multimodal_content_batch_type_aware(\n        self, multimodal_items: List[Dict[str, Any]], file_path: str, doc_id: str\n    ):\n        \"\"\"\n        Type-aware batch processing that selects correct processors based on content type.\n        This is the corrected implementation that handles different modality types properly.\n\n        Args:\n            multimodal_items: List of multimodal items with different types\n            file_path: File path for citation\n            doc_id: Document ID for proper association\n        \"\"\"\n        if not multimodal_items:\n            self.logger.debug(\"No multimodal content to process\")\n            return\n\n        # Get existing chunks count for proper order indexing\n        try:\n            existing_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n            existing_chunks_count = (\n                existing_doc_status.get(\"chunks_count\", 0) if existing_doc_status else 0\n            )\n        except Exception:\n            existing_chunks_count = 0\n\n        # Use LightRAG's concurrency control\n        semaphore = asyncio.Semaphore(getattr(self.lightrag, \"max_parallel_insert\", 2))\n\n        # Progress tracking variables\n        total_items = len(multimodal_items)\n        completed_count = 0\n        progress_lock = asyncio.Lock()\n\n        # Log processing start\n        
self.logger.info(f\"Starting to process {total_items} multimodal content items\")\n\n        # Stage 1: Concurrent generation of descriptions using correct processors for each type\n        async def process_single_item_with_correct_processor(\n            item: Dict[str, Any], index: int, file_path: str\n        ):\n            \"\"\"Process single item using the correct processor for its type\"\"\"\n            nonlocal completed_count\n            async with semaphore:\n                try:\n                    content_type = item.get(\"type\", \"unknown\")\n\n                    # Select the correct processor based on content type\n                    processor = get_processor_for_type(\n                        self.modal_processors, content_type\n                    )\n\n                    if not processor:\n                        self.logger.warning(\n                            f\"No processor found for type: {content_type}\"\n                        )\n                        return None\n\n                    item_info = {\n                        \"page_idx\": item.get(\"page_idx\", 0),\n                        \"index\": index,\n                        \"type\": content_type,\n                    }\n\n                    # Call the correct processor's description generation method\n                    (\n                        description,\n                        entity_info,\n                    ) = await processor.generate_description_only(\n                        modal_content=item,\n                        content_type=content_type,\n                        item_info=item_info,\n                        entity_name=None,  # Let LLM auto-generate\n                    )\n\n                    # Update progress (non-blocking)\n                    async with progress_lock:\n                        completed_count += 1\n                        if (\n                            completed_count % max(1, total_items // 10) == 0\n                          
  or completed_count == total_items\n                        ):\n                            progress_percent = (completed_count / total_items) * 100\n                            self.logger.info(\n                                f\"Multimodal chunk generation progress: {completed_count}/{total_items} ({progress_percent:.1f}%)\"\n                            )\n\n                    return {\n                        \"index\": index,\n                        \"content_type\": content_type,\n                        \"description\": description,\n                        \"entity_info\": entity_info,\n                        \"original_item\": item,\n                        \"item_info\": item_info,\n                        \"chunk_order_index\": existing_chunks_count + index,\n                        \"processor\": processor,  # Keep reference to the processor used\n                        \"file_path\": file_path,  # Add file_path to the result\n                    }\n\n                except Exception as e:\n                    # Update progress even on error (non-blocking)\n                    async with progress_lock:\n                        completed_count += 1\n                        if (\n                            completed_count % max(1, total_items // 10) == 0\n                            or completed_count == total_items\n                        ):\n                            progress_percent = (completed_count / total_items) * 100\n                            self.logger.info(\n                                f\"Multimodal chunk generation progress: {completed_count}/{total_items} ({progress_percent:.1f}%)\"\n                            )\n\n                    self.logger.error(\n                        f\"Error generating description for {content_type} item {index}: {e}\"\n                    )\n                    return None\n\n        # Process all items concurrently with correct processors\n        tasks = [\n            asyncio.create_task(\n    
            process_single_item_with_correct_processor(item, i, file_path)\n            )\n            for i, item in enumerate(multimodal_items)\n        ]\n\n        results = await asyncio.gather(*tasks, return_exceptions=True)\n\n        # Filter successful results\n        multimodal_data_list = []\n        for result in results:\n            if isinstance(result, Exception):\n                self.logger.error(f\"Task failed: {result}\")\n                continue\n            if result is not None:\n                multimodal_data_list.append(result)\n\n        if not multimodal_data_list:\n            self.logger.warning(\"No valid multimodal descriptions generated\")\n            return\n\n        self.logger.info(\n            f\"Generated descriptions for {len(multimodal_data_list)}/{len(multimodal_items)} multimodal items using correct processors\"\n        )\n\n        # Stage 2: Convert to LightRAG chunks format\n        lightrag_chunks = self._convert_to_lightrag_chunks_type_aware(\n            multimodal_data_list, file_path, doc_id\n        )\n\n        # Stage 3: Store chunks to LightRAG storage\n        await self._store_chunks_to_lightrag_storage_type_aware(lightrag_chunks)\n\n        # Stage 3.5: Store multimodal main entities to entities_vdb and full_entities\n        await self._store_multimodal_main_entities(\n            multimodal_data_list, lightrag_chunks, file_path, doc_id\n        )\n\n        # Track chunk IDs for doc_status update\n        chunk_ids = list(lightrag_chunks.keys())\n\n        # Stage 4: Use LightRAG's batch entity relation extraction\n        chunk_results = await self._batch_extract_entities_lightrag_style_type_aware(\n            lightrag_chunks\n        )\n\n        # Stage 5: Add belongs_to relations (multimodal-specific)\n        enhanced_chunk_results = await self._batch_add_belongs_to_relations_type_aware(\n            chunk_results, multimodal_data_list\n        )\n\n        # Stage 6: Use LightRAG's batch 
merge\n        await self._batch_merge_lightrag_style_type_aware(\n            enhanced_chunk_results, file_path, doc_id\n        )\n\n        # Stage 7: Update doc_status with integrated chunks_list\n        await self._update_doc_status_with_chunks_type_aware(doc_id, chunk_ids)\n\n    def _convert_to_lightrag_chunks_type_aware(\n        self, multimodal_data_list: List[Dict[str, Any]], file_path: str, doc_id: str\n    ) -> Dict[str, Any]:\n        \"\"\"Convert multimodal data to LightRAG standard chunks format\"\"\"\n\n        chunks = {}\n\n        for data in multimodal_data_list:\n            description = data[\"description\"]\n            entity_info = data[\"entity_info\"]\n            chunk_order_index = data[\"chunk_order_index\"]\n            content_type = data[\"content_type\"]\n            original_item = data[\"original_item\"]\n\n            # Apply the appropriate chunk template based on content type\n            formatted_chunk_content = self._apply_chunk_template(\n                content_type, original_item, description\n            )\n\n            # Generate chunk_id\n            chunk_id = compute_mdhash_id(formatted_chunk_content, prefix=\"chunk-\")\n\n            # Calculate tokens\n            tokens = len(self.lightrag.tokenizer.encode(formatted_chunk_content))\n\n            # Use full path or basename based on config\n            file_ref = self._get_file_reference(file_path)\n\n            # Build LightRAG standard chunk format\n            chunks[chunk_id] = {\n                \"content\": formatted_chunk_content,  # Now uses the templated content\n                \"tokens\": tokens,\n                \"full_doc_id\": doc_id,\n                \"chunk_order_index\": chunk_order_index,\n                \"file_path\": file_ref,\n                \"llm_cache_list\": [],  # LightRAG will populate this field\n                # Multimodal-specific metadata\n                \"is_multimodal\": True,\n                \"modal_entity_name\": 
entity_info[\"entity_name\"],\n                \"original_type\": data[\"content_type\"],\n                \"page_idx\": data[\"item_info\"].get(\"page_idx\", 0),\n            }\n\n        self.logger.debug(\n            f\"Converted {len(chunks)} multimodal items to multimodal chunks format\"\n        )\n        return chunks\n\n    def _apply_chunk_template(\n        self, content_type: str, original_item: Dict[str, Any], description: str\n    ) -> str:\n        \"\"\"\n        Apply the appropriate chunk template based on content type\n\n        Args:\n            content_type: Type of content (image, table, equation, generic)\n            original_item: Original multimodal item data\n            description: Enhanced description generated by the processor\n\n        Returns:\n            Formatted chunk content using the appropriate template\n        \"\"\"\n        from raganything.prompt import PROMPTS\n\n        try:\n            if content_type == \"image\":\n                image_path = original_item.get(\"img_path\", \"\")\n                captions = original_item.get(\n                    \"image_caption\", original_item.get(\"img_caption\", [])\n                )\n                footnotes = original_item.get(\n                    \"image_footnote\", original_item.get(\"img_footnote\", [])\n                )\n\n                return PROMPTS[\"image_chunk\"].format(\n                    image_path=image_path,\n                    captions=\", \".join(captions) if captions else \"None\",\n                    footnotes=\", \".join(footnotes) if footnotes else \"None\",\n                    enhanced_caption=description,\n                )\n\n            elif content_type == \"table\":\n                table_img_path = original_item.get(\"img_path\", \"\")\n                table_caption = original_item.get(\"table_caption\", [])\n                table_body = original_item.get(\"table_body\", \"\")\n                table_footnote = 
original_item.get(\"table_footnote\", [])\n\n                return PROMPTS[\"table_chunk\"].format(\n                    table_img_path=table_img_path,\n                    table_caption=\", \".join(table_caption) if table_caption else \"None\",\n                    table_body=table_body,\n                    table_footnote=\", \".join(table_footnote)\n                    if table_footnote\n                    else \"None\",\n                    enhanced_caption=description,\n                )\n\n            elif content_type == \"equation\":\n                equation_text = original_item.get(\"text\", \"\")\n                equation_format = original_item.get(\"text_format\", \"\")\n\n                return PROMPTS[\"equation_chunk\"].format(\n                    equation_text=equation_text,\n                    equation_format=equation_format,\n                    enhanced_caption=description,\n                )\n\n            else:  # generic or unknown types\n                content = str(original_item.get(\"content\", original_item))\n\n                return PROMPTS[\"generic_chunk\"].format(\n                    content_type=content_type.title(),\n                    content=content,\n                    enhanced_caption=description,\n                )\n\n        except Exception as e:\n            self.logger.warning(\n                f\"Error applying chunk template for {content_type}: {e}\"\n            )\n            # Fallback to just the description if template fails\n            return description\n\n    async def _store_chunks_to_lightrag_storage_type_aware(\n        self, chunks: Dict[str, Any]\n    ):\n        \"\"\"Store chunks to storage\"\"\"\n        try:\n            # Store in text_chunks storage (required for extract_entities)\n            await self.lightrag.text_chunks.upsert(chunks)\n\n            # Store in chunks vector database for retrieval\n            await self.lightrag.chunks_vdb.upsert(chunks)\n\n            
self.logger.debug(f\"Stored {len(chunks)} multimodal chunks to storage\")\n\n        except Exception as e:\n            self.logger.error(f\"Error storing chunks to storage: {e}\")\n            raise\n\n    async def _store_multimodal_main_entities(\n        self,\n        multimodal_data_list: List[Dict[str, Any]],\n        lightrag_chunks: Dict[str, Any],\n        file_path: str,\n        doc_id: str = None,\n    ):\n        \"\"\"\n        Store multimodal main entities to entities_vdb and full_entities.\n        This ensures that entities like \"TableName (table)\" are properly indexed.\n\n        Args:\n            multimodal_data_list: List of processed multimodal data with entity info\n            lightrag_chunks: Chunks in LightRAG format (already formatted with templates)\n            file_path: File path for the entities\n            doc_id: Document ID for full_entities storage\n        \"\"\"\n        if not multimodal_data_list:\n            return\n\n        # Create entities_vdb entries for all multimodal main entities\n        entities_to_store = {}\n\n        # Use full path or basename based on config\n        file_ref = self._get_file_reference(file_path)\n\n        for data in multimodal_data_list:\n            entity_info = data[\"entity_info\"]\n            entity_name = entity_info[\"entity_name\"]\n            description = data[\"description\"]\n            content_type = data[\"content_type\"]\n            original_item = data[\"original_item\"]\n\n            # Apply the same chunk template to get the formatted content\n            formatted_chunk_content = self._apply_chunk_template(\n                content_type, original_item, description\n            )\n\n            # Generate chunk_id using the formatted content (same as in _convert_to_lightrag_chunks)\n            chunk_id = compute_mdhash_id(formatted_chunk_content, prefix=\"chunk-\")\n\n            # Generate entity_id using LightRAG's standard format\n            entity_id = 
compute_mdhash_id(entity_name, prefix=\"ent-\")\n\n            # Create entity data in LightRAG format\n            entity_data = {\n                \"entity_name\": entity_name,\n                \"entity_type\": entity_info.get(\"entity_type\", content_type),\n                \"content\": entity_info.get(\"summary\", description),\n                \"source_id\": chunk_id,\n                \"file_path\": file_ref,\n            }\n\n            entities_to_store[entity_id] = entity_data\n\n        if entities_to_store:\n            try:\n                # Store entities in knowledge graph\n                for entity_id, entity_data in entities_to_store.items():\n                    entity_name = entity_data[\"entity_name\"]\n\n                    # Create node data for knowledge graph\n                    node_data = {\n                        \"entity_id\": entity_name,\n                        \"entity_type\": entity_data[\"entity_type\"],\n                        \"description\": entity_data[\"content\"],\n                        \"source_id\": entity_data[\"source_id\"],\n                        \"file_path\": entity_data[\"file_path\"],\n                        \"created_at\": int(time.time()),\n                    }\n\n                    # Store in knowledge graph\n                    await self.lightrag.chunk_entity_relation_graph.upsert_node(\n                        entity_name, node_data\n                    )\n\n                # Store in entities_vdb\n                await self.lightrag.entities_vdb.upsert(entities_to_store)\n                await self.lightrag.entities_vdb.index_done_callback()\n\n                # NEW: Store multimodal main entities in full_entities storage\n                if doc_id and self.lightrag.full_entities:\n                    await self._store_multimodal_entities_to_full_entities(\n                        entities_to_store, doc_id\n                    )\n\n                self.logger.debug(\n                    f\"Stored 
{len(entities_to_store)} multimodal main entities to knowledge graph, entities_vdb, and full_entities\"\n                )\n\n            except Exception as e:\n                self.logger.error(f\"Error storing multimodal main entities: {e}\")\n                raise\n\n    async def _store_multimodal_entities_to_full_entities(\n        self, entities_to_store: Dict[str, Any], doc_id: str\n    ):\n        \"\"\"\n        Store multimodal main entities to full_entities storage.\n\n        Args:\n            entities_to_store: Dictionary of entities to store\n            doc_id: Document ID for grouping entities\n        \"\"\"\n        try:\n            # Get current full_entities data for this document\n            current_doc_entities = await self.lightrag.full_entities.get_by_id(doc_id)\n\n            if current_doc_entities is None:\n                # Create new document entry\n                entity_names = list(\n                    entity_data[\"entity_name\"]\n                    for entity_data in entities_to_store.values()\n                )\n                doc_entities_data = {\n                    \"entity_names\": entity_names,\n                    \"count\": len(entity_names),\n                    \"update_time\": int(time.time()),\n                }\n            else:\n                # Update existing document entry\n                existing_entity_names = set(\n                    current_doc_entities.get(\"entity_names\", [])\n                )\n                new_entity_names = [\n                    entity_data[\"entity_name\"]\n                    for entity_data in entities_to_store.values()\n                ]\n\n                # Add new multimodal entities to the list (avoid duplicates)\n                for entity_name in new_entity_names:\n                    existing_entity_names.add(entity_name)\n\n                doc_entities_data = {\n                    \"entity_names\": list(existing_entity_names),\n                    \"count\": 
len(existing_entity_names),\n                    \"update_time\": int(time.time()),\n                }\n\n            # Store updated data\n            await self.lightrag.full_entities.upsert({doc_id: doc_entities_data})\n            await self.lightrag.full_entities.index_done_callback()\n\n            self.logger.debug(\n                f\"Added {len(entities_to_store)} multimodal main entities to full_entities for doc {doc_id}\"\n            )\n\n        except Exception as e:\n            self.logger.error(\n                f\"Error storing multimodal entities to full_entities: {e}\"\n            )\n            raise\n\n    async def _batch_extract_entities_lightrag_style_type_aware(\n        self, lightrag_chunks: Dict[str, Any]\n    ) -> List[Tuple]:\n        \"\"\"Use LightRAG's extract_entities for batch entity relation extraction\"\"\"\n        from lightrag.kg.shared_storage import (\n            get_namespace_data,\n            get_pipeline_status_lock,\n        )\n        from lightrag.operate import extract_entities\n\n        # Get pipeline status (consistent with LightRAG)\n        pipeline_status = await get_namespace_data(\"pipeline_status\")\n        pipeline_status_lock = get_pipeline_status_lock()\n\n        # Directly use LightRAG's extract_entities\n        chunk_results = await extract_entities(\n            chunks=lightrag_chunks,\n            global_config=self.lightrag.__dict__,\n            pipeline_status=pipeline_status,\n            pipeline_status_lock=pipeline_status_lock,\n            llm_response_cache=self.lightrag.llm_response_cache,\n            text_chunks_storage=self.lightrag.text_chunks,\n        )\n\n        self.logger.info(\n            f\"Extracted entities from {len(lightrag_chunks)} multimodal chunks\"\n        )\n        return chunk_results\n\n    async def _batch_add_belongs_to_relations_type_aware(\n        self, chunk_results: List[Tuple], multimodal_data_list: List[Dict[str, Any]]\n    ) -> List[Tuple]:\n        
\"\"\"Add belongs_to relations for multimodal entities\"\"\"\n        # Create mapping from chunk_id to modal_entity_name\n        chunk_to_modal_entity = {}\n        chunk_to_file_path = {}\n\n        for data in multimodal_data_list:\n            description = data[\"description\"]\n            content_type = data[\"content_type\"]\n            original_item = data[\"original_item\"]\n\n            # Use the same template formatting as in _convert_to_lightrag_chunks_type_aware\n            formatted_chunk_content = self._apply_chunk_template(\n                content_type, original_item, description\n            )\n            chunk_id = compute_mdhash_id(formatted_chunk_content, prefix=\"chunk-\")\n\n            chunk_to_modal_entity[chunk_id] = data[\"entity_info\"][\"entity_name\"]\n            chunk_to_file_path[chunk_id] = data.get(\"file_path\", \"multimodal_content\")\n\n        enhanced_chunk_results = []\n        belongs_to_count = 0\n\n        for maybe_nodes, maybe_edges in chunk_results:\n            # Find corresponding modal_entity_name for this chunk\n            chunk_id = None\n            for nodes_dict in maybe_nodes.values():\n                if nodes_dict:\n                    chunk_id = nodes_dict[0].get(\"source_id\")\n                    break\n\n            if chunk_id and chunk_id in chunk_to_modal_entity:\n                modal_entity_name = chunk_to_modal_entity[chunk_id]\n                file_path = chunk_to_file_path.get(chunk_id, \"multimodal_content\")\n\n                # Add belongs_to relations for all extracted entities\n                for entity_name in maybe_nodes.keys():\n                    if entity_name != modal_entity_name:  # Avoid self-relation\n                        belongs_to_relation = {\n                            \"src_id\": entity_name,\n                            \"tgt_id\": modal_entity_name,\n                            \"description\": f\"Entity {entity_name} belongs to {modal_entity_name}\",\n           
                 \"keywords\": \"belongs_to,part_of,contained_in\",\n                            \"source_id\": chunk_id,\n                            \"weight\": 10.0,\n                            \"file_path\": file_path,\n                        }\n\n                        # Add to maybe_edges\n                        edge_key = (entity_name, modal_entity_name)\n                        if edge_key not in maybe_edges:\n                            maybe_edges[edge_key] = []\n                        maybe_edges[edge_key].append(belongs_to_relation)\n                        belongs_to_count += 1\n\n            enhanced_chunk_results.append((maybe_nodes, maybe_edges))\n\n        self.logger.info(\n            f\"Added {belongs_to_count} belongs_to relations for multimodal entities\"\n        )\n        return enhanced_chunk_results\n\n    async def _batch_merge_lightrag_style_type_aware(\n        self, enhanced_chunk_results: List[Tuple], file_path: str, doc_id: str = None\n    ):\n        \"\"\"Use LightRAG's merge_nodes_and_edges for batch merge\"\"\"\n        from lightrag.kg.shared_storage import (\n            get_namespace_data,\n            get_pipeline_status_lock,\n        )\n        from lightrag.operate import merge_nodes_and_edges\n\n        pipeline_status = await get_namespace_data(\"pipeline_status\")\n        pipeline_status_lock = get_pipeline_status_lock()\n\n        # Use full path or basename based on config\n        file_ref = self._get_file_reference(file_path)\n\n        await merge_nodes_and_edges(\n            chunk_results=enhanced_chunk_results,\n            knowledge_graph_inst=self.lightrag.chunk_entity_relation_graph,\n            entity_vdb=self.lightrag.entities_vdb,\n            relationships_vdb=self.lightrag.relationships_vdb,\n            global_config=self.lightrag.__dict__,\n            full_entities_storage=self.lightrag.full_entities,\n            full_relations_storage=self.lightrag.full_relations,\n            
doc_id=doc_id,\n            pipeline_status=pipeline_status,\n            pipeline_status_lock=pipeline_status_lock,\n            llm_response_cache=self.lightrag.llm_response_cache,\n            current_file_number=1,\n            total_files=1,\n            file_path=file_ref,\n        )\n\n        await self.lightrag._insert_done()\n\n    async def _update_doc_status_with_chunks_type_aware(\n        self, doc_id: str, chunk_ids: List[str]\n    ):\n        \"\"\"Update document status with multimodal chunks\"\"\"\n        try:\n            # Get current document status\n            current_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n\n            if current_doc_status:\n                existing_chunks_list = current_doc_status.get(\"chunks_list\", [])\n                existing_chunks_count = current_doc_status.get(\"chunks_count\", 0)\n\n                # Add multimodal chunks to the standard chunks_list\n                updated_chunks_list = existing_chunks_list + chunk_ids\n                updated_chunks_count = existing_chunks_count + len(chunk_ids)\n\n                # Update document status with integrated chunk list\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_id: {\n                            **current_doc_status,  # Keep existing fields\n                            \"chunks_list\": updated_chunks_list,  # Integrated chunks list\n                            \"chunks_count\": updated_chunks_count,  # Updated total count\n                            \"updated_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S+00:00\"),\n                        }\n                    }\n                )\n\n                # Ensure doc_status update is persisted to disk\n                await self.lightrag.doc_status.index_done_callback()\n\n                self.logger.info(\n                    f\"Updated doc_status: added {len(chunk_ids)} multimodal chunks to standard chunks_list \"\n                    
f\"(total chunks: {updated_chunks_count})\"\n                )\n\n        except Exception as e:\n            self.logger.warning(\n                f\"Error updating doc_status with multimodal chunks: {e}\"\n            )\n\n    async def _mark_multimodal_processing_complete(self, doc_id: str):\n        \"\"\"Mark multimodal content processing as complete in the document status.\"\"\"\n        try:\n            current_doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n            if current_doc_status:\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_id: {\n                            **current_doc_status,\n                            \"multimodal_processed\": True,\n                            \"updated_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S+00:00\"),\n                        }\n                    }\n                )\n                await self.lightrag.doc_status.index_done_callback()\n                self.logger.debug(\n                    f\"Marked multimodal content processing as complete for document {doc_id}\"\n                )\n        except Exception as e:\n            self.logger.warning(\n                f\"Error marking multimodal processing as complete for document {doc_id}: {e}\"\n            )\n\n    async def is_document_fully_processed(self, doc_id: str) -> bool:\n        \"\"\"\n        Check if a document is fully processed (both text and multimodal content).\n\n        Args:\n            doc_id: Document ID to check\n\n        Returns:\n            bool: True if both text and multimodal content are processed\n        \"\"\"\n        try:\n            doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n            if not doc_status:\n                return False\n\n            text_processed = doc_status.get(\"status\") == DocStatus.PROCESSED\n            multimodal_processed = doc_status.get(\"multimodal_processed\", False)\n\n            return 
text_processed and multimodal_processed\n\n        except Exception as e:\n            self.logger.error(\n                f\"Error checking document processing status for {doc_id}: {e}\"\n            )\n            return False\n\n    async def get_document_processing_status(self, doc_id: str) -> Dict[str, Any]:\n        \"\"\"\n        Get detailed processing status for a document.\n\n        Args:\n            doc_id: Document ID to check\n\n        Returns:\n            Dict with processing status details\n        \"\"\"\n        try:\n            doc_status = await self.lightrag.doc_status.get_by_id(doc_id)\n            if not doc_status:\n                return {\n                    \"exists\": False,\n                    \"text_processed\": False,\n                    \"multimodal_processed\": False,\n                    \"fully_processed\": False,\n                    \"chunks_count\": 0,\n                }\n\n            text_processed = doc_status.get(\"status\") == DocStatus.PROCESSED\n            multimodal_processed = doc_status.get(\"multimodal_processed\", False)\n            fully_processed = text_processed and multimodal_processed\n\n            return {\n                \"exists\": True,\n                \"text_processed\": text_processed,\n                \"multimodal_processed\": multimodal_processed,\n                \"fully_processed\": fully_processed,\n                \"chunks_count\": doc_status.get(\"chunks_count\", 0),\n                \"chunks_list\": doc_status.get(\"chunks_list\", []),\n                \"status\": doc_status.get(\"status\", \"\"),\n                \"updated_at\": doc_status.get(\"updated_at\", \"\"),\n                \"raw_status\": doc_status,\n            }\n\n        except Exception as e:\n            self.logger.error(\n                f\"Error getting document processing status for {doc_id}: {e}\"\n            )\n            return {\n                \"exists\": False,\n                \"error\": str(e),\n       
         \"text_processed\": False,\n                \"multimodal_processed\": False,\n                \"fully_processed\": False,\n                \"chunks_count\": 0,\n            }\n\n    async def process_document_complete(\n        self,\n        file_path: str,\n        output_dir: str = None,\n        parse_method: str = None,\n        display_stats: bool = None,\n        split_by_character: str | None = None,\n        split_by_character_only: bool = False,\n        doc_id: str | None = None,\n        file_name: str | None = None,\n        **kwargs,\n    ):\n        \"\"\"\n        Complete document processing workflow\n\n        Args:\n            file_path: Path to the file to process\n            output_dir: output directory (defaults to config.parser_output_dir)\n            parse_method: Parse method (defaults to config.parse_method)\n            display_stats: Whether to display content statistics (defaults to config.display_content_stats)\n            split_by_character: Optional character to split the text by\n            split_by_character_only: If True, split only by the specified character\n            doc_id: Optional document ID, if not provided will be generated from content\n            **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)\n        \"\"\"\n        callback_manager = getattr(self, \"callback_manager\", None)\n        doc_start_time = time.time()\n        stage = \"parse\"\n\n        try:\n            # Ensure LightRAG is initialized\n            await self._ensure_lightrag_initialized()\n\n            # Use config defaults if not provided\n            if output_dir is None:\n                output_dir = self.config.parser_output_dir\n            if parse_method is None:\n                parse_method = self.config.parse_method\n            if display_stats is None:\n                display_stats = self.config.display_content_stats\n\n            
self.logger.info(f\"Starting complete document processing: {file_path}\")\n\n            # Step 1: Parse document\n            content_list, content_based_doc_id = await self.parse_document(\n                file_path, output_dir, parse_method, display_stats, **kwargs\n            )\n\n            # Use provided doc_id or fall back to content-based doc_id\n            if doc_id is None:\n                doc_id = content_based_doc_id\n\n            # Step 2: Separate text and multimodal content\n            text_content, multimodal_items = separate_content(content_list)\n\n            # Step 2.5: Set content source for context extraction in multimodal processing\n            if hasattr(self, \"set_content_source_for_context\") and multimodal_items:\n                self.logger.info(\n                    \"Setting content source for context-aware multimodal processing...\"\n                )\n                self.set_content_source_for_context(\n                    content_list, self.config.content_format\n                )\n\n            # Step 3: Insert pure text content with all parameters\n            stage = \"text_insert\"\n            if text_content.strip():\n                if file_name is None:\n                    # Use full path or basename based on config\n                    file_name = self._get_file_reference(file_path)\n                if callback_manager is not None:\n                    callback_manager.dispatch(\n                        \"on_text_insert_start\",\n                        file_path=file_name,\n                        text_length=len(text_content),\n                        doc_id=doc_id,\n                    )\n                insert_start = time.time()\n                await insert_text_content(\n                    self.lightrag,\n                    input=text_content,\n                    file_paths=file_name,\n                    split_by_character=split_by_character,\n                    
split_by_character_only=split_by_character_only,\n                    ids=doc_id,\n                )\n                if callback_manager is not None:\n                    insert_duration = time.time() - insert_start\n                    callback_manager.dispatch(\n                        \"on_text_insert_complete\",\n                        file_path=file_name,\n                        duration_seconds=insert_duration,\n                        doc_id=doc_id,\n                    )\n            else:\n                # Determine file reference even if no text content\n                if file_name is None:\n                    file_name = self._get_file_reference(file_path)\n\n            # Step 4: Process multimodal content (using specialized processors)\n            stage = \"multimodal\"\n            if multimodal_items:\n                await self._process_multimodal_content(\n                    multimodal_items, file_name, doc_id\n                )\n            else:\n                # If no multimodal content, mark multimodal processing as complete\n                # This ensures the document status properly reflects completion of all processing\n                await self._mark_multimodal_processing_complete(doc_id)\n                self.logger.debug(\n                    f\"No multimodal content found in document {doc_id}, \"\n                    \"marked multimodal processing as complete\",\n                )\n\n        except Exception as exc:\n            if callback_manager is not None:\n                callback_manager.dispatch(\n                    \"on_document_error\",\n                    file_path=str(file_path),\n                    doc_id=doc_id,\n                    stage=stage,\n                    error=exc,\n                )\n            raise\n\n        self.logger.info(f\"Document {file_path} processing complete!\")\n        if callback_manager is not None:\n            duration = time.time() - doc_start_time\n            
callback_manager.dispatch(\n                \"on_document_complete\",\n                file_path=str(file_path),\n                doc_id=doc_id,\n                duration_seconds=duration,\n            )\n\n    async def process_document_complete_lightrag_api(\n        self,\n        file_path: str,\n        output_dir: str = None,\n        parse_method: str = None,\n        display_stats: bool = None,\n        split_by_character: str | None = None,\n        split_by_character_only: bool = False,\n        doc_id: str | None = None,\n        scheme_name: str | None = None,\n        parser: str | None = None,\n        **kwargs,\n    ):\n        \"\"\"\n        API exclusively for LightRAG calls: Complete document processing workflow\n\n        Args:\n            file_path: Path to the file to process\n            output_dir: output directory (defaults to config.parser_output_dir)\n            parse_method: Parse method (defaults to config.parse_method)\n            display_stats: Whether to display content statistics (defaults to config.display_content_stats)\n            split_by_character: Optional character to split the text by\n            split_by_character_only: If True, split only by the specified character\n            doc_id: Optional document ID, if not provided will be generated from content\n            **kwargs: Additional parameters for parser (e.g., lang, device, start_page, end_page, formula, table, backend, source)\n        \"\"\"\n        # Use full path or basename based on config\n        file_name = self._get_file_reference(file_path)\n        doc_pre_id = f\"doc-pre-{file_name}\"\n        pipeline_status = None\n        pipeline_status_lock = None\n\n        if parser:\n            self.config.parser = parser\n\n        current_doc_status = await self.lightrag.doc_status.get_by_id(doc_pre_id)\n\n        try:\n            # Ensure LightRAG is initialized\n            result = await self._ensure_lightrag_initialized()\n            if not 
result[\"success\"]:\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_pre_id: {\n                            **current_doc_status,\n                            \"status\": DocStatus.FAILED,\n                            \"error_msg\": result[\"error\"],\n                        }\n                    }\n                )\n                return False\n\n            # Use config defaults if not provided\n            if output_dir is None:\n                output_dir = self.config.parser_output_dir\n            if parse_method is None:\n                parse_method = self.config.parse_method\n            if display_stats is None:\n                display_stats = self.config.display_content_stats\n\n            self.logger.info(f\"Starting complete document processing: {file_path}\")\n\n            # Initialize doc status\n            current_doc_status = await self.lightrag.doc_status.get_by_id(doc_pre_id)\n            if not current_doc_status:\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_pre_id: {\n                            \"status\": DocStatus.READY,\n                            \"content\": \"\",\n                            \"error_msg\": \"\",\n                            \"content_summary\": \"\",\n                            \"multimodal_content\": [],\n                            \"scheme_name\": scheme_name,\n                            \"content_length\": 0,\n                            \"created_at\": \"\",\n                            \"updated_at\": \"\",\n                            \"file_path\": file_name,\n                        }\n                    }\n                )\n                current_doc_status = await self.lightrag.doc_status.get_by_id(\n                    doc_pre_id\n                )\n\n            from lightrag.kg.shared_storage import (\n                get_namespace_data,\n                
get_pipeline_status_lock,\n            )\n\n            pipeline_status = await get_namespace_data(\"pipeline_status\")\n            pipeline_status_lock = get_pipeline_status_lock()\n\n            # Set processing status\n            async with pipeline_status_lock:\n                pipeline_status.update({\"scan_disabled\": True})\n                pipeline_status[\"history_messages\"].append(\"Now is not allowed to scan\")\n\n            await self.lightrag.doc_status.upsert(\n                {\n                    doc_pre_id: {\n                        **current_doc_status,\n                        \"status\": DocStatus.HANDLING,\n                        \"error_msg\": \"\",\n                    }\n                }\n            )\n\n            content_list = []\n            content_based_doc_id = \"\"\n\n            try:\n                # Step 1: Parse document\n                content_list, content_based_doc_id = await self.parse_document(\n                    file_path, output_dir, parse_method, display_stats, **kwargs\n                )\n            except MineruExecutionError as e:\n                error_message = e.error_msg\n                if isinstance(e.error_msg, list):\n                    error_message = \"\\n\".join(e.error_msg)\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_pre_id: {\n                            **current_doc_status,\n                            \"status\": DocStatus.FAILED,\n                            \"error_msg\": error_message,\n                        }\n                    }\n                )\n                self.logger.info(\n                    f\"Error processing document {file_path}: MineruExecutionError\"\n                )\n                return False\n            except Exception as e:\n                await self.lightrag.doc_status.upsert(\n                    {\n                        doc_pre_id: {\n                            
**current_doc_status,\n                            \"status\": DocStatus.FAILED,\n                            \"error_msg\": str(e),\n                        }\n                    }\n                )\n                self.logger.info(f\"Error processing document {file_path}: {str(e)}\")\n                return False\n\n            # Use provided doc_id or fall back to content-based doc_id\n            if doc_id is None:\n                doc_id = content_based_doc_id\n\n            # Step 2: Separate text and multimodal content\n            text_content, multimodal_items = separate_content(content_list)\n\n            # Step 2.5: Set content source for context extraction in multimodal processing\n            if hasattr(self, \"set_content_source_for_context\") and multimodal_items:\n                self.logger.info(\n                    \"Setting content source for context-aware multimodal processing...\"\n                )\n                self.set_content_source_for_context(\n                    content_list, self.config.content_format\n                )\n\n            # Step 3: Insert pure text content and multimodal content with all parameters\n            if text_content.strip():\n                await insert_text_content_with_multimodal_content(\n                    self.lightrag,\n                    input=text_content,\n                    multimodal_content=multimodal_items,\n                    file_paths=file_name,\n                    split_by_character=split_by_character,\n                    split_by_character_only=split_by_character_only,\n                    ids=doc_id,\n                    scheme_name=scheme_name,\n                )\n\n            self.logger.info(f\"Document {file_path} processing completed successfully\")\n            return True\n\n        except Exception as e:\n            self.logger.error(f\"Error processing document {file_path}: {str(e)}\")\n            self.logger.debug(\"Exception details:\", exc_info=True)\n\n           
 # Update doc status to Failed\n            await self.lightrag.doc_status.upsert(\n                {\n                    doc_pre_id: {\n                        **current_doc_status,\n                        \"status\": DocStatus.FAILED,\n                        \"error_msg\": str(e),\n                    }\n                }\n            )\n            await self.lightrag.doc_status.index_done_callback()\n\n            # Update pipeline status\n            if pipeline_status_lock and pipeline_status:\n                try:\n                    async with pipeline_status_lock:\n                        pipeline_status.update({\"scan_disabled\": False})\n                        error_msg = (\n                            f\"RAGAnything processing failed for {file_name}: {str(e)}\"\n                        )\n                        pipeline_status[\"latest_message\"] = error_msg\n                        pipeline_status[\"history_messages\"].append(error_msg)\n                        pipeline_status[\"history_messages\"].append(\n                            \"Now is allowed to scan\"\n                        )\n                except Exception as pipeline_update_error:\n                    self.logger.error(\n                        f\"Failed to update pipeline status: {pipeline_update_error}\"\n                    )\n\n            return False\n\n        finally:\n            async with pipeline_status_lock:\n                pipeline_status.update({\"scan_disabled\": False})\n                pipeline_status[\"latest_message\"] = (\n                    f\"RAGAnything processing completed for {file_name}\"\n                )\n                pipeline_status[\"history_messages\"].append(\n                    f\"RAGAnything processing completed for {file_name}\"\n                )\n                pipeline_status[\"history_messages\"].append(\"Now is allowed to scan\")\n\n    async def insert_content_list(\n        self,\n        content_list: List[Dict[str, Any]],\n      
  file_path: str = \"unknown_document\",\n        split_by_character: str | None = None,\n        split_by_character_only: bool = False,\n        doc_id: str | None = None,\n        display_stats: bool = None,\n    ):\n        \"\"\"\n        Insert content list directly without document parsing\n\n        Args:\n            content_list: Pre-parsed content list containing text and multimodal items.\n                         Each item should be a dictionary with the following structure:\n                         - Text: {\"type\": \"text\", \"text\": \"content\", \"page_idx\": 0}\n                         - Image: {\"type\": \"image\", \"img_path\": \"/absolute/path/to/image.jpg\",\n                                  \"image_caption\": [\"caption\"], \"image_footnote\": [\"note\"], \"page_idx\": 1}\n                         - Table: {\"type\": \"table\", \"table_body\": \"markdown table\",\n                                  \"table_caption\": [\"caption\"], \"table_footnote\": [\"note\"], \"page_idx\": 2}\n                         - Equation: {\"type\": \"equation\", \"latex\": \"LaTeX formula\",\n                                     \"text\": \"description\", \"page_idx\": 3}\n                         - Generic: {\"type\": \"custom_type\", \"content\": \"any content\", \"page_idx\": 4}\n            file_path: Reference file path/name for citation (defaults to \"unknown_document\")\n            split_by_character: Optional character to split the text by\n            split_by_character_only: If True, split only by the specified character\n            doc_id: Optional document ID, if not provided will be generated from content\n            display_stats: Whether to display content statistics (defaults to config.display_content_stats)\n\n        Note:\n            - img_path must be an absolute path to the image file\n            - page_idx represents the page number where the content appears (0-based indexing)\n            - Items are processed in the order they 
appear in the list\n        \"\"\"\n        callback_manager = getattr(self, \"callback_manager\", None)\n        doc_start_time = time.time()\n\n        # Ensure LightRAG is initialized\n        await self._ensure_lightrag_initialized()\n\n        # Use config defaults if not provided\n        if display_stats is None:\n            display_stats = self.config.display_content_stats\n\n        self.logger.info(\n            f\"Starting direct content list insertion for: {file_path} ({len(content_list)} items)\"\n        )\n\n        # Generate doc_id based on content if not provided\n        if doc_id is None:\n            doc_id = self._generate_content_based_doc_id(content_list)\n\n        # Display content statistics if requested\n        if display_stats:\n            self.logger.info(\"\\nContent Information:\")\n            self.logger.info(f\"* Total blocks in content_list: {len(content_list)}\")\n\n            # Count elements by type\n            block_types: Dict[str, int] = {}\n            for block in content_list:\n                if isinstance(block, dict):\n                    block_type = block.get(\"type\", \"unknown\")\n                    if isinstance(block_type, str):\n                        block_types[block_type] = block_types.get(block_type, 0) + 1\n\n            self.logger.info(\"* Content block types:\")\n            for block_type, count in block_types.items():\n                self.logger.info(f\"  - {block_type}: {count}\")\n\n        # Step 1: Separate text and multimodal content\n        text_content, multimodal_items = separate_content(content_list)\n\n        # Step 1.5: Set content source for context extraction in multimodal processing\n        if hasattr(self, \"set_content_source_for_context\") and multimodal_items:\n            self.logger.info(\n                \"Setting content source for context-aware multimodal processing...\"\n            )\n            self.set_content_source_for_context(\n                content_list, 
self.config.content_format\n            )\n\n        # Step 2: Insert pure text content with all parameters\n        if text_content.strip():\n            # Use full path or basename based on config\n            file_ref = self._get_file_reference(file_path)\n            if callback_manager is not None:\n                callback_manager.dispatch(\n                    \"on_text_insert_start\",\n                    file_path=file_ref,\n                    text_length=len(text_content),\n                    doc_id=doc_id,\n                )\n            insert_start = time.time()\n            await insert_text_content(\n                self.lightrag,\n                input=text_content,\n                file_paths=file_ref,\n                split_by_character=split_by_character,\n                split_by_character_only=split_by_character_only,\n                ids=doc_id,\n            )\n            if callback_manager is not None:\n                insert_duration = time.time() - insert_start\n                callback_manager.dispatch(\n                    \"on_text_insert_complete\",\n                    file_path=file_ref,\n                    duration_seconds=insert_duration,\n                    doc_id=doc_id,\n                )\n        else:\n            # Determine file reference even if no text content\n            file_ref = self._get_file_reference(file_path)\n\n        # Step 3: Process multimodal content (using specialized processors)\n        if multimodal_items:\n            await self._process_multimodal_content(multimodal_items, file_ref, doc_id)\n        else:\n            # If no multimodal content, mark multimodal processing as complete\n            # This ensures the document status properly reflects completion of all processing\n            await self._mark_multimodal_processing_complete(doc_id)\n            self.logger.debug(\n                f\"No multimodal content found in document {doc_id}, marked multimodal processing as complete\"\n         
   )\n\n        self.logger.info(f\"Content list insertion complete for: {file_path}\")\n        if callback_manager is not None:\n            duration = time.time() - doc_start_time\n            callback_manager.dispatch(\n                \"on_document_complete\",\n                file_path=file_path,\n                doc_id=doc_id,\n                duration_seconds=duration,\n            )\n"
  },
  {
    "path": "raganything/prompt.py",
    "content": "\"\"\"\nPrompt templates for multimodal content processing\n\nContains all prompt templates used in modal processors for analyzing\ndifferent types of content (images, tables, equations, etc.)\n\"\"\"\n\nfrom __future__ import annotations\nfrom typing import Any\n\n\nPROMPTS: dict[str, Any] = {}\n\n# System prompts for different analysis types\nPROMPTS[\"IMAGE_ANALYSIS_SYSTEM\"] = (\n    \"You are an expert image analyst. Provide detailed, accurate descriptions.\"\n)\nPROMPTS[\"IMAGE_ANALYSIS_FALLBACK_SYSTEM\"] = (\n    \"You are an expert image analyst. Provide detailed analysis based on available information.\"\n)\nPROMPTS[\"TABLE_ANALYSIS_SYSTEM\"] = (\n    \"You are an expert data analyst. Provide detailed table analysis with specific insights.\"\n)\nPROMPTS[\"EQUATION_ANALYSIS_SYSTEM\"] = (\n    \"You are an expert mathematician. Provide detailed mathematical analysis.\"\n)\nPROMPTS[\"GENERIC_ANALYSIS_SYSTEM\"] = (\n    \"You are an expert content analyst specializing in {content_type} content.\"\n)\n\n# Image analysis prompt template\nPROMPTS[\n    \"vision_prompt\"\n] = \"\"\"Please analyze this image in detail and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive and detailed visual description of the image following these guidelines:\n    - Describe the overall composition and layout\n    - Identify all objects, people, text, and visual elements\n    - Explain relationships between elements\n    - Note colors, lighting, and visual style\n    - Describe any actions or activities shown\n    - Include technical details if relevant (charts, diagrams, etc.)\n    - Always use specific names instead of pronouns\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"image\",\n        \"summary\": \"concise summary of the image content and its significance (max 100 words)\"\n    }}\n}}\n\nAdditional context:\n- Image Path: {image_path}\n- 
Captions: {captions}\n- Footnotes: {footnotes}\n\nFocus on providing accurate, detailed visual analysis that would be useful for knowledge retrieval.\"\"\"\n\n# Image analysis prompt with context support\nPROMPTS[\n    \"vision_prompt_with_context\"\n] = \"\"\"Please analyze this image in detail, considering the surrounding context. Provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive and detailed visual description of the image following these guidelines:\n    - Describe the overall composition and layout\n    - Identify all objects, people, text, and visual elements\n    - Explain relationships between elements and how they relate to the surrounding context\n    - Note colors, lighting, and visual style\n    - Describe any actions or activities shown\n    - Include technical details if relevant (charts, diagrams, etc.)\n    - Reference connections to the surrounding content when relevant\n    - Always use specific names instead of pronouns\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"image\",\n        \"summary\": \"concise summary of the image content, its significance, and relationship to surrounding content (max 100 words)\"\n    }}\n}}\n\nContext from surrounding content:\n{context}\n\nImage details:\n- Image Path: {image_path}\n- Captions: {captions}\n- Footnotes: {footnotes}\n\nFocus on providing accurate, detailed visual analysis that incorporates the context and would be useful for knowledge retrieval.\"\"\"\n\n# Image analysis prompt with text fallback\nPROMPTS[\"text_prompt\"] = \"\"\"Based on the following image information, provide analysis:\n\nImage Path: {image_path}\nCaptions: {captions}\nFootnotes: {footnotes}\n\n{vision_prompt}\"\"\"\n\n# Table analysis prompt template\nPROMPTS[\n    \"table_prompt\"\n] = \"\"\"Please analyze this table content and provide a JSON response with the following structure:\n\n{{\n    
\"detailed_description\": \"A comprehensive analysis of the table including:\n    - Table structure and organization\n    - Column headers and their meanings\n    - Key data points and patterns\n    - Statistical insights and trends\n    - Relationships between data elements\n    - Significance of the data presented\n    Always use specific names and values instead of general references.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"table\",\n        \"summary\": \"concise summary of the table's purpose and key findings (max 100 words)\"\n    }}\n}}\n\nTable Information:\nImage Path: {table_img_path}\nCaption: {table_caption}\nBody: {table_body}\nFootnotes: {table_footnote}\n\nFocus on extracting meaningful insights and relationships from the tabular data.\"\"\"\n\n# Table analysis prompt with context support\nPROMPTS[\n    \"table_prompt_with_context\"\n] = \"\"\"Please analyze this table content considering the surrounding context, and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive analysis of the table including:\n    - Table structure and organization\n    - Column headers and their meanings\n    - Key data points and patterns\n    - Statistical insights and trends\n    - Relationships between data elements\n    - Significance of the data presented in relation to surrounding context\n    - How the table supports or illustrates concepts from the surrounding content\n    Always use specific names and values instead of general references.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"table\",\n        \"summary\": \"concise summary of the table's purpose, key findings, and relationship to surrounding content (max 100 words)\"\n    }}\n}}\n\nContext from surrounding content:\n{context}\n\nTable Information:\nImage Path: {table_img_path}\nCaption: {table_caption}\nBody: {table_body}\nFootnotes: 
{table_footnote}\n\nFocus on extracting meaningful insights and relationships from the tabular data in the context of the surrounding content.\"\"\"\n\n# Equation analysis prompt template\nPROMPTS[\n    \"equation_prompt\"\n] = \"\"\"Please analyze this mathematical equation and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive analysis of the equation including:\n    - Mathematical meaning and interpretation\n    - Variables and their definitions\n    - Mathematical operations and functions used\n    - Application domain and context\n    - Physical or theoretical significance\n    - Relationship to other mathematical concepts\n    - Practical applications or use cases\n    Always use specific mathematical terminology.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"equation\",\n        \"summary\": \"concise summary of the equation's purpose and significance (max 100 words)\"\n    }}\n}}\n\nEquation Information:\nEquation: {equation_text}\nFormat: {equation_format}\n\nFocus on providing mathematical insights and explaining the equation's significance.\"\"\"\n\n# Equation analysis prompt with context support\nPROMPTS[\n    \"equation_prompt_with_context\"\n] = \"\"\"Please analyze this mathematical equation considering the surrounding context, and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive analysis of the equation including:\n    - Mathematical meaning and interpretation\n    - Variables and their definitions in the context of surrounding content\n    - Mathematical operations and functions used\n    - Application domain and context based on surrounding material\n    - Physical or theoretical significance\n    - Relationship to other mathematical concepts mentioned in the context\n    - Practical applications or use cases\n    - How the equation relates to the broader discussion or 
framework\n    Always use specific mathematical terminology.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"equation\",\n        \"summary\": \"concise summary of the equation's purpose, significance, and role in the surrounding context (max 100 words)\"\n    }}\n}}\n\nContext from surrounding content:\n{context}\n\nEquation Information:\nEquation: {equation_text}\nFormat: {equation_format}\n\nFocus on providing mathematical insights and explaining the equation's significance within the broader context.\"\"\"\n\n# Generic content analysis prompt template\nPROMPTS[\n    \"generic_prompt\"\n] = \"\"\"Please analyze this {content_type} content and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive analysis of the content including:\n    - Content structure and organization\n    - Key information and elements\n    - Relationships between components\n    - Context and significance\n    - Relevant details for knowledge retrieval\n    Always use specific terminology appropriate for {content_type} content.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"{content_type}\",\n        \"summary\": \"concise summary of the content's purpose and key points (max 100 words)\"\n    }}\n}}\n\nContent: {content}\n\nFocus on extracting meaningful information that would be useful for knowledge retrieval.\"\"\"\n\n# Generic content analysis prompt with context support\nPROMPTS[\n    \"generic_prompt_with_context\"\n] = \"\"\"Please analyze this {content_type} content considering the surrounding context, and provide a JSON response with the following structure:\n\n{{\n    \"detailed_description\": \"A comprehensive analysis of the content including:\n    - Content structure and organization\n    - Key information and elements\n    - Relationships between components\n    - Context and significance in relation to surrounding 
content\n    - How this content connects to or supports the broader discussion\n    - Relevant details for knowledge retrieval\n    Always use specific terminology appropriate for {content_type} content.\",\n    \"entity_info\": {{\n        \"entity_name\": \"{entity_name}\",\n        \"entity_type\": \"{content_type}\",\n        \"summary\": \"concise summary of the content's purpose, key points, and relationship to surrounding context (max 100 words)\"\n    }}\n}}\n\nContext from surrounding content:\n{context}\n\nContent: {content}\n\nFocus on extracting meaningful information that would be useful for knowledge retrieval and understanding the content's role in the broader context.\"\"\"\n\n# Modal chunk templates\nPROMPTS[\"image_chunk\"] = \"\"\"\nImage Content Analysis:\nImage Path: {image_path}\nCaptions: {captions}\nFootnotes: {footnotes}\n\nVisual Analysis: {enhanced_caption}\"\"\"\n\nPROMPTS[\"table_chunk\"] = \"\"\"Table Analysis:\nImage Path: {table_img_path}\nCaption: {table_caption}\nStructure: {table_body}\nFootnotes: {table_footnote}\n\nAnalysis: {enhanced_caption}\"\"\"\n\nPROMPTS[\"equation_chunk\"] = \"\"\"Mathematical Equation Analysis:\nEquation: {equation_text}\nFormat: {equation_format}\n\nMathematical Analysis: {enhanced_caption}\"\"\"\n\nPROMPTS[\"generic_chunk\"] = \"\"\"{content_type} Content Analysis:\nContent: {content}\n\nAnalysis: {enhanced_caption}\"\"\"\n\n# Query-related prompts\nPROMPTS[\"QUERY_IMAGE_DESCRIPTION\"] = (\n    \"Please briefly describe the main content, key elements, and important information in this image.\"\n)\n\nPROMPTS[\"QUERY_IMAGE_ANALYST_SYSTEM\"] = (\n    \"You are a professional image analyst who can accurately describe image content.\"\n)\n\nPROMPTS[\n    \"QUERY_TABLE_ANALYSIS\"\n] = \"\"\"Please analyze the main content, structure, and key information of the following table data:\n\nTable data:\n{table_data}\n\nTable caption: {table_caption}\n\nPlease briefly summarize the main content, data 
characteristics, and important findings of the table.\"\"\"\n\nPROMPTS[\"QUERY_TABLE_ANALYST_SYSTEM\"] = (\n    \"You are a professional data analyst who can accurately analyze table data.\"\n)\n\nPROMPTS[\n    \"QUERY_EQUATION_ANALYSIS\"\n] = \"\"\"Please explain the meaning and purpose of the following mathematical formula:\n\nLaTeX formula: {latex}\nFormula caption: {equation_caption}\n\nPlease briefly explain the mathematical meaning, application scenarios, and importance of this formula.\"\"\"\n\nPROMPTS[\"QUERY_EQUATION_ANALYST_SYSTEM\"] = (\n    \"You are a mathematics expert who can clearly explain mathematical formulas.\"\n)\n\nPROMPTS[\n    \"QUERY_GENERIC_ANALYSIS\"\n] = \"\"\"Please analyze the following {content_type} type content and extract its main information and key features:\n\nContent: {content_str}\n\nPlease briefly summarize the main characteristics and important information of this content.\"\"\"\n\nPROMPTS[\"QUERY_GENERIC_ANALYST_SYSTEM\"] = (\n    \"You are a professional content analyst who can accurately analyze {content_type} type content.\"\n)\n\nPROMPTS[\"QUERY_ENHANCEMENT_SUFFIX\"] = (\n    \"\\n\\nPlease provide a comprehensive answer based on the user query and the provided multimodal content information.\"\n)\n"
  },
  {
    "path": "raganything/query.py",
    "content": "\"\"\"\nQuery functionality for RAGAnything\n\nContains all query-related methods for both text and multimodal queries\n\"\"\"\n\nimport json\nimport hashlib\nimport re\nimport time\nfrom typing import Dict, List, Any\nfrom pathlib import Path\nfrom lightrag import QueryParam\nfrom lightrag.utils import always_get_an_event_loop\nfrom raganything.prompt import PROMPTS\nfrom raganything.utils import (\n    get_processor_for_type,\n    encode_image_to_base64,\n    validate_image_file,\n)\n\n\nclass QueryMixin:\n    \"\"\"QueryMixin class containing query functionality for RAGAnything\"\"\"\n\n    def _generate_multimodal_cache_key(\n        self, query: str, multimodal_content: List[Dict[str, Any]], mode: str, **kwargs\n    ) -> str:\n        \"\"\"\n        Generate cache key for multimodal query\n\n        Args:\n            query: Base query text\n            multimodal_content: List of multimodal content\n            mode: Query mode\n            **kwargs: Additional parameters\n\n        Returns:\n            str: Cache key hash\n        \"\"\"\n        # Create a normalized representation of the query parameters\n        cache_data = {\n            \"query\": query.strip(),\n            \"mode\": mode,\n        }\n\n        # Normalize multimodal content for stable caching\n        normalized_content = []\n        if multimodal_content:\n            for item in multimodal_content:\n                if isinstance(item, dict):\n                    normalized_item = {}\n                    for key, value in item.items():\n                        # For file paths, use basename to make cache more portable\n                        if key in [\n                            \"img_path\",\n                            \"image_path\",\n                            \"file_path\",\n                        ] and isinstance(value, str):\n                            normalized_item[key] = Path(value).name\n                        # For large content, create a hash 
instead of storing directly\n                        elif (\n                            key in [\"table_data\", \"table_body\"]\n                            and isinstance(value, str)\n                            and len(value) > 200\n                        ):\n                            normalized_item[f\"{key}_hash\"] = hashlib.md5(\n                                value.encode()\n                            ).hexdigest()\n                        else:\n                            normalized_item[key] = value\n                    normalized_content.append(normalized_item)\n                else:\n                    normalized_content.append(item)\n\n        cache_data[\"multimodal_content\"] = normalized_content\n\n        # Add relevant kwargs to cache data\n        relevant_kwargs = {\n            k: v\n            for k, v in kwargs.items()\n            if k\n            in [\n                \"stream\",\n                \"response_type\",\n                \"top_k\",\n                \"max_tokens\",\n                \"temperature\",\n                # \"only_need_context\",\n                # \"only_need_prompt\",\n            ]\n        }\n        cache_data.update(relevant_kwargs)\n\n        # Generate hash from the cache data\n        cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)\n        cache_hash = hashlib.md5(cache_str.encode()).hexdigest()\n\n        return f\"multimodal_query:{cache_hash}\"\n\n    async def aquery(\n        self, query: str, mode: str = \"mix\", system_prompt: str | None = None, **kwargs\n    ) -> str:\n        \"\"\"\n        Pure text query - directly calls LightRAG's query functionality\n\n        Args:\n            query: Query text\n            mode: Query mode (\"local\", \"global\", \"hybrid\", \"naive\", \"mix\", \"bypass\")\n            system_prompt: Optional system prompt to include.\n            **kwargs: Other query parameters, will be passed to QueryParam\n                - vlm_enhanced: bool, 
default True when vision_model_func is available.\n                  If True, will parse image paths in retrieved context and replace them\n                  with base64 encoded images for VLM processing.\n\n        Returns:\n            str: Query result\n        \"\"\"\n        if self.lightrag is None:\n            raise ValueError(\n                \"No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance.\"\n            )\n\n        # Check if VLM enhanced query should be used\n        vlm_enhanced = kwargs.pop(\"vlm_enhanced\", None)\n\n        # Auto-determine VLM enhanced based on availability\n        if vlm_enhanced is None:\n            vlm_enhanced = (\n                hasattr(self, \"vision_model_func\")\n                and self.vision_model_func is not None\n            )\n\n        # Use VLM enhanced query if enabled and available\n        if (\n            vlm_enhanced\n            and hasattr(self, \"vision_model_func\")\n            and self.vision_model_func\n        ):\n            return await self.aquery_vlm_enhanced(\n                query, mode=mode, system_prompt=system_prompt, **kwargs\n            )\n        elif vlm_enhanced and (\n            not hasattr(self, \"vision_model_func\") or not self.vision_model_func\n        ):\n            self.logger.warning(\n                \"VLM enhanced query requested but vision_model_func is not available, falling back to normal query\"\n            )\n\n        callback_manager = getattr(self, \"callback_manager\", None)\n        query_start_time = time.time()\n\n        if callback_manager is not None:\n            callback_manager.dispatch(\n                \"on_query_start\",\n                query=query,\n                mode=mode,\n            )\n\n        # Create query parameters\n        query_param = QueryParam(mode=mode, **kwargs)\n\n        self.logger.info(f\"Executing text query: {query[:100]}...\")\n        
self.logger.info(f\"Query mode: {mode}\")\n\n        try:\n            # Call LightRAG's query method\n            result = await self.lightrag.aquery(\n                query, param=query_param, system_prompt=system_prompt\n            )\n        except Exception as exc:\n            if callback_manager is not None:\n                callback_manager.dispatch(\n                    \"on_query_error\",\n                    query=query,\n                    mode=mode,\n                    error=exc,\n                )\n            raise\n\n        self.logger.info(\"Text query completed\")\n        if callback_manager is not None:\n            duration = time.time() - query_start_time\n            result_len = len(result) if isinstance(result, str) else 0\n            callback_manager.dispatch(\n                \"on_query_complete\",\n                query=query,\n                mode=mode,\n                duration_seconds=duration,\n                result_length=result_len,\n            )\n        return result\n\n    async def aquery_with_multimodal(\n        self,\n        query: str,\n        multimodal_content: List[Dict[str, Any]] = None,\n        mode: str = \"mix\",\n        **kwargs,\n    ) -> str:\n        \"\"\"\n        Multimodal query - combines text and multimodal content for querying\n\n        Args:\n            query: Base query text\n            multimodal_content: List of multimodal content, each element contains:\n                - type: Content type (\"image\", \"table\", \"equation\", etc.)\n                - Other fields depend on type (e.g., img_path, table_data, latex, etc.)\n            mode: Query mode (\"local\", \"global\", \"hybrid\", \"naive\", \"mix\", \"bypass\")\n            **kwargs: Other query parameters, will be passed to QueryParam\n\n        Returns:\n            str: Query result\n\n        Examples:\n            # Pure text query\n            result = await rag.aquery_with_multimodal(\"What is machine learning?\")\n\n          
  # Image query\n            result = await rag.aquery_with_multimodal(\n                \"Analyze the content in this image\",\n                multimodal_content=[{\n                    \"type\": \"image\",\n                    \"img_path\": \"./image.jpg\"\n                }]\n            )\n\n            # Table query\n            result = await rag.aquery_with_multimodal(\n                \"Analyze the data trends in this table\",\n                multimodal_content=[{\n                    \"type\": \"table\",\n                    \"table_data\": \"Name,Age\\nAlice,25\\nBob,30\"\n                }]\n            )\n        \"\"\"\n        # Ensure LightRAG is initialized\n        await self._ensure_lightrag_initialized()\n\n        self.logger.info(f\"Executing multimodal query: {query[:100]}...\")\n        self.logger.info(f\"Query mode: {mode}\")\n\n        # If no multimodal content, fallback to pure text query\n        if not multimodal_content:\n            self.logger.info(\"No multimodal content provided, executing text query\")\n            return await self.aquery(query, mode=mode, **kwargs)\n\n        # Generate cache key for multimodal query\n        cache_key = self._generate_multimodal_cache_key(\n            query, multimodal_content, mode, **kwargs\n        )\n\n        # Check cache if available and enabled\n        cached_result = None\n        if (\n            hasattr(self, \"lightrag\")\n            and self.lightrag\n            and hasattr(self.lightrag, \"llm_response_cache\")\n            and self.lightrag.llm_response_cache\n        ):\n            if self.lightrag.llm_response_cache.global_config.get(\n                \"enable_llm_cache\", True\n            ):\n                try:\n                    cached_result = await self.lightrag.llm_response_cache.get_by_id(\n                        cache_key\n                    )\n                    if cached_result and isinstance(cached_result, dict):\n                        result_content 
= cached_result.get(\"return\")\n                        if result_content:\n                            self.logger.info(\n                                f\"Multimodal query cache hit: {cache_key[:16]}...\"\n                            )\n                            return result_content\n                except Exception as e:\n                    self.logger.debug(f\"Error accessing multimodal query cache: {e}\")\n\n        # Process multimodal content to generate enhanced query text\n        enhanced_query = await self._process_multimodal_query_content(\n            query, multimodal_content\n        )\n\n        self.logger.info(\n            f\"Generated enhanced query length: {len(enhanced_query)} characters\"\n        )\n\n        # Execute enhanced query\n        result = await self.aquery(enhanced_query, mode=mode, **kwargs)\n\n        # Save to cache if available and enabled\n        if (\n            hasattr(self, \"lightrag\")\n            and self.lightrag\n            and hasattr(self.lightrag, \"llm_response_cache\")\n            and self.lightrag.llm_response_cache\n        ):\n            if self.lightrag.llm_response_cache.global_config.get(\n                \"enable_llm_cache\", True\n            ):\n                try:\n                    # Create cache entry for multimodal query\n                    cache_entry = {\n                        \"return\": result,\n                        \"cache_type\": \"multimodal_query\",\n                        \"original_query\": query,\n                        \"multimodal_content_count\": len(multimodal_content),\n                        \"mode\": mode,\n                    }\n\n                    await self.lightrag.llm_response_cache.upsert(\n                        {cache_key: cache_entry}\n                    )\n                    self.logger.info(\n                        f\"Saved multimodal query result to cache: {cache_key[:16]}...\"\n                    )\n                except Exception as 
e:\n                    self.logger.debug(f\"Error saving multimodal query to cache: {e}\")\n\n        # Ensure cache is persisted to disk\n        if (\n            hasattr(self, \"lightrag\")\n            and self.lightrag\n            and hasattr(self.lightrag, \"llm_response_cache\")\n            and self.lightrag.llm_response_cache\n        ):\n            try:\n                await self.lightrag.llm_response_cache.index_done_callback()\n            except Exception as e:\n                self.logger.debug(f\"Error persisting multimodal query cache: {e}\")\n\n        self.logger.info(\"Multimodal query completed\")\n        return result\n\n    async def aquery_vlm_enhanced(\n        self,\n        query: str,\n        mode: str = \"mix\",\n        system_prompt: str | None = None,\n        extra_safe_dirs: List[str] = None,\n        **kwargs,\n    ) -> str:\n        \"\"\"\n        VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing\n\n        Args:\n            query: User query\n            mode: Underlying LightRAG query mode\n            system_prompt: Optional system prompt to include\n            extra_safe_dirs: Optional list of additional safe directories to allow images from\n            **kwargs: Other query parameters\n\n        Returns:\n            str: VLM query result\n        \"\"\"\n        # Ensure VLM is available\n        if not hasattr(self, \"vision_model_func\") or not self.vision_model_func:\n            raise ValueError(\n                \"VLM enhanced query requires vision_model_func. 
\"\n                \"Please provide a vision model function when initializing RAGAnything.\"\n            )\n\n        # Ensure LightRAG is initialized\n        await self._ensure_lightrag_initialized()\n\n        self.logger.info(f\"Executing VLM enhanced query: {query[:100]}...\")\n\n        # Clear previous image cache\n        if hasattr(self, \"_current_images_base64\"):\n            delattr(self, \"_current_images_base64\")\n\n        # 1. Get original retrieval prompt (without generating final answer)\n        query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)\n        raw_prompt = await self.lightrag.aquery(query, param=query_param)\n\n        self.logger.debug(\"Retrieved raw prompt from LightRAG\")\n\n        # 2. Extract and process image paths\n        enhanced_prompt, images_found = await self._process_image_paths_for_vlm(\n            raw_prompt, extra_safe_dirs=extra_safe_dirs\n        )\n\n        if not images_found:\n            self.logger.info(\"No valid images found, falling back to normal query\")\n            # Fallback to normal query\n            query_param = QueryParam(mode=mode, **kwargs)\n            return await self.lightrag.aquery(\n                query, param=query_param, system_prompt=system_prompt\n            )\n\n        self.logger.info(f\"Processed {images_found} images for VLM\")\n\n        # 3. Build VLM message format\n        messages = self._build_vlm_messages_with_images(\n            enhanced_prompt, query, system_prompt\n        )\n\n        # 4. 
Call VLM for question answering\n        result = await self._call_vlm_with_multimodal_content(messages)\n\n        self.logger.info(\"VLM enhanced query completed\")\n        return result\n\n    async def _process_multimodal_query_content(\n        self, base_query: str, multimodal_content: List[Dict[str, Any]]\n    ) -> str:\n        \"\"\"\n        Process multimodal query content to generate enhanced query text\n\n        Args:\n            base_query: Base query text\n            multimodal_content: List of multimodal content\n\n        Returns:\n            str: Enhanced query text\n        \"\"\"\n        self.logger.info(\"Starting multimodal query content processing...\")\n\n        enhanced_parts = [f\"User query: {base_query}\"]\n\n        for i, content in enumerate(multimodal_content):\n            content_type = content.get(\"type\", \"unknown\")\n            self.logger.info(\n                f\"Processing {i+1}/{len(multimodal_content)} multimodal content: {content_type}\"\n            )\n\n            try:\n                # Get appropriate processor\n                processor = get_processor_for_type(self.modal_processors, content_type)\n\n                if processor:\n                    # Generate content description\n                    description = await self._generate_query_content_description(\n                        processor, content, content_type\n                    )\n                    enhanced_parts.append(\n                        f\"\\nRelated {content_type} content: {description}\"\n                    )\n                else:\n                    # If no appropriate processor, use basic description\n                    basic_desc = str(content)[:200]\n                    enhanced_parts.append(\n                        f\"\\nRelated {content_type} content: {basic_desc}\"\n                    )\n\n            except Exception as e:\n                self.logger.error(f\"Error processing multimodal content: {str(e)}\")\n          
      # Continue processing other content\n                continue\n\n        enhanced_query = \"\\n\".join(enhanced_parts)\n        enhanced_query += PROMPTS[\"QUERY_ENHANCEMENT_SUFFIX\"]\n\n        self.logger.info(\"Multimodal query content processing completed\")\n        return enhanced_query\n\n    async def _generate_query_content_description(\n        self, processor, content: Dict[str, Any], content_type: str\n    ) -> str:\n        \"\"\"\n        Generate content description for query\n\n        Args:\n            processor: Multimodal processor\n            content: Content data\n            content_type: Content type\n\n        Returns:\n            str: Content description\n        \"\"\"\n        try:\n            if content_type == \"image\":\n                return await self._describe_image_for_query(processor, content)\n            elif content_type == \"table\":\n                return await self._describe_table_for_query(processor, content)\n            elif content_type == \"equation\":\n                return await self._describe_equation_for_query(processor, content)\n            else:\n                return await self._describe_generic_for_query(\n                    processor, content, content_type\n                )\n\n        except Exception as e:\n            self.logger.error(f\"Error generating {content_type} description: {str(e)}\")\n            return f\"{content_type} content: {str(content)[:100]}\"\n\n    async def _describe_image_for_query(\n        self, processor, content: Dict[str, Any]\n    ) -> str:\n        \"\"\"Generate image description for query\"\"\"\n        image_path = content.get(\"img_path\")\n        captions = content.get(\"image_caption\", content.get(\"img_caption\", []))\n        footnotes = content.get(\"image_footnote\", content.get(\"img_footnote\", []))\n\n        if image_path and Path(image_path).exists():\n            # If image exists, use vision model to generate description\n            
image_base64 = processor._encode_image_to_base64(image_path)\n            if image_base64:\n                prompt = PROMPTS[\"QUERY_IMAGE_DESCRIPTION\"]\n                description = await processor.modal_caption_func(\n                    prompt,\n                    image_data=image_base64,\n                    system_prompt=PROMPTS[\"QUERY_IMAGE_ANALYST_SYSTEM\"],\n                )\n                return description\n\n        # If image doesn't exist or processing failed, use existing information\n        parts = []\n        if image_path:\n            parts.append(f\"Image path: {image_path}\")\n        if captions:\n            parts.append(f\"Image captions: {', '.join(captions)}\")\n        if footnotes:\n            parts.append(f\"Image footnotes: {', '.join(footnotes)}\")\n\n        return \"; \".join(parts) if parts else \"Image content information incomplete\"\n\n    async def _describe_table_for_query(\n        self, processor, content: Dict[str, Any]\n    ) -> str:\n        \"\"\"Generate table description for query\"\"\"\n        table_data = content.get(\"table_data\", \"\")\n        table_caption = content.get(\"table_caption\", \"\")\n\n        prompt = PROMPTS[\"QUERY_TABLE_ANALYSIS\"].format(\n            table_data=table_data, table_caption=table_caption\n        )\n\n        description = await processor.modal_caption_func(\n            prompt, system_prompt=PROMPTS[\"QUERY_TABLE_ANALYST_SYSTEM\"]\n        )\n\n        return description\n\n    async def _describe_equation_for_query(\n        self, processor, content: Dict[str, Any]\n    ) -> str:\n        \"\"\"Generate equation description for query\"\"\"\n        latex = content.get(\"latex\", \"\")\n        equation_caption = content.get(\"equation_caption\", \"\")\n\n        prompt = PROMPTS[\"QUERY_EQUATION_ANALYSIS\"].format(\n            latex=latex, equation_caption=equation_caption\n        )\n\n        description = await processor.modal_caption_func(\n            prompt, 
system_prompt=PROMPTS[\"QUERY_EQUATION_ANALYST_SYSTEM\"]\n        )\n\n        return description\n\n    async def _describe_generic_for_query(\n        self, processor, content: Dict[str, Any], content_type: str\n    ) -> str:\n        \"\"\"Generate generic content description for query\"\"\"\n        content_str = str(content)\n\n        prompt = PROMPTS[\"QUERY_GENERIC_ANALYSIS\"].format(\n            content_type=content_type, content_str=content_str\n        )\n\n        description = await processor.modal_caption_func(\n            prompt,\n            system_prompt=PROMPTS[\"QUERY_GENERIC_ANALYST_SYSTEM\"].format(\n                content_type=content_type\n            ),\n        )\n\n        return description\n\n    async def _process_image_paths_for_vlm(\n        self, prompt: str, extra_safe_dirs: List[str] = None\n    ) -> tuple[str, int]:\n        \"\"\"\n        Process image paths in prompt, keeping original paths and adding VLM markers\n\n        Args:\n            prompt: Original prompt\n            extra_safe_dirs: Optional list of additional safe directories\n\n        Returns:\n            tuple: (processed prompt, image count)\n        \"\"\"\n        enhanced_prompt = prompt\n        images_processed = 0\n\n        # Initialize image cache\n        self._current_images_base64 = []\n\n        # Enhanced regex pattern for matching image paths\n        # Matches only the path ending with image file extensions\n        image_path_pattern = (\n            r\"Image Path:\\s*([^\\r\\n]*?\\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))\"\n        )\n\n        # First, let's see what matches we find\n        matches = re.findall(image_path_pattern, prompt)\n        self.logger.info(f\"Found {len(matches)} image path matches in prompt\")\n\n        def replace_image_path(match):\n            nonlocal images_processed\n\n            image_path = match.group(1).strip()\n            self.logger.debug(f\"Processing image path: '{image_path}'\")\n\n            
# Validate path format (basic check)\n            if not image_path or len(image_path) < 3:\n                self.logger.warning(f\"Invalid image path format: {image_path}\")\n                return match.group(0)  # Keep original\n\n            # Use utility function to validate image file\n            is_valid = validate_image_file(image_path)\n\n            # Security check: only allow images from the workspace or output directories\n            # to prevent indirect prompt injection from reading arbitrary system files.\n            if is_valid:\n                abs_image_path = Path(image_path).resolve()\n                # Check if it's in the current working directory or subdirectories\n                try:\n                    is_in_cwd = abs_image_path.is_relative_to(Path.cwd())\n                except ValueError:\n                    is_in_cwd = False\n\n                # If a config is available, check against working_dir and parser_output_dir\n                is_in_safe_dir = is_in_cwd\n                if hasattr(self, \"config\") and self.config:\n                    try:\n                        is_in_working = abs_image_path.is_relative_to(\n                            Path(self.config.working_dir).resolve()\n                        )\n                        is_in_output = abs_image_path.is_relative_to(\n                            Path(self.config.parser_output_dir).resolve()\n                        )\n                        is_in_safe_dir = is_in_safe_dir or is_in_working or is_in_output\n                    except Exception:\n                        pass\n\n                # Check against extra safe directories if provided\n                if not is_in_safe_dir and extra_safe_dirs:\n                    for safe_dir in extra_safe_dirs:\n                        try:\n                            if abs_image_path.is_relative_to(Path(safe_dir).resolve()):\n                                is_in_safe_dir = True\n                                break\n  
                      except Exception:\n                            continue\n\n                if not is_in_safe_dir:\n                    self.logger.warning(\n                        f\"Blocking image path outside safe directories: {image_path}\"\n                    )\n                    is_valid = False\n\n            if not is_valid:\n                self.logger.warning(\n                    f\"Image validation failed or path unsafe for: {image_path}\"\n                )\n                return match.group(0)  # Keep original if validation fails\n\n            try:\n                # Encode image to base64 using utility function\n                self.logger.debug(f\"Attempting to encode image: {image_path}\")\n                image_base64 = encode_image_to_base64(image_path)\n                if image_base64:\n                    images_processed += 1\n                    # Save base64 to instance variable for later use\n                    self._current_images_base64.append(image_base64)\n\n                    # Keep original path info and add VLM marker\n                    result = f\"Image Path: {image_path}\\n[VLM_IMAGE_{images_processed}]\"\n                    self.logger.debug(\n                        f\"Successfully processed image {images_processed}: {image_path}\"\n                    )\n                    return result\n                else:\n                    self.logger.error(f\"Failed to encode image: {image_path}\")\n                    return match.group(0)  # Keep original if encoding failed\n\n            except Exception as e:\n                self.logger.error(f\"Failed to process image {image_path}: {e}\")\n                return match.group(0)  # Keep original\n\n        # Execute replacement\n        enhanced_prompt = re.sub(\n            image_path_pattern, replace_image_path, enhanced_prompt\n        )\n\n        return enhanced_prompt, images_processed\n\n    def _build_vlm_messages_with_images(\n        self, enhanced_prompt: 
str, user_query: str, system_prompt: str\n    ) -> List[Dict]:\n        \"\"\"\n        Build VLM message format, using markers to correspond images with text positions\n\n        Args:\n            enhanced_prompt: Enhanced prompt with image markers\n            user_query: User query\n\n        Returns:\n            List[Dict]: VLM message format\n        \"\"\"\n        images_base64 = getattr(self, \"_current_images_base64\", [])\n\n        if not images_base64:\n            # Pure text mode\n            return [\n                {\n                    \"role\": \"user\",\n                    \"content\": f\"Context:\\n{enhanced_prompt}\\n\\nUser Question: {user_query}\",\n                }\n            ]\n\n        # Build multimodal content\n        content_parts = []\n\n        # Split text at image markers and insert images\n        text_parts = enhanced_prompt.split(\"[VLM_IMAGE_\")\n\n        for i, text_part in enumerate(text_parts):\n            if i == 0:\n                # First text part\n                if text_part.strip():\n                    content_parts.append({\"type\": \"text\", \"text\": text_part})\n            else:\n                # Find marker number and insert corresponding image\n                marker_match = re.match(r\"(\\d+)\\](.*)\", text_part, re.DOTALL)\n                if marker_match:\n                    image_num = (\n                        int(marker_match.group(1)) - 1\n                    )  # Convert to 0-based index\n                    remaining_text = marker_match.group(2)\n\n                    # Insert corresponding image\n                    if 0 <= image_num < len(images_base64):\n                        content_parts.append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/jpeg;base64,{images_base64[image_num]}\"\n                                },\n              
              }\n                        )\n\n                    # Insert remaining text\n                    if remaining_text.strip():\n                        content_parts.append({\"type\": \"text\", \"text\": remaining_text})\n\n        # Add user question\n        content_parts.append(\n            {\n                \"type\": \"text\",\n                \"text\": f\"\\n\\nUser Question: {user_query}\\n\\nPlease answer based on the context and images provided.\",\n            }\n        )\n        base_system_prompt = \"You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.\"\n\n        if system_prompt:\n            full_system_prompt = base_system_prompt + \" \" + system_prompt\n        else:\n            full_system_prompt = base_system_prompt\n\n        return [\n            {\n                \"role\": \"system\",\n                \"content\": full_system_prompt,\n            },\n            {\n                \"role\": \"user\",\n                \"content\": content_parts,\n            },\n        ]\n\n    async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:\n        \"\"\"\n        Call VLM to process multimodal content\n\n        Args:\n            messages: VLM message format\n\n        Returns:\n            str: VLM response result\n        \"\"\"\n        try:\n            user_message = messages[1]\n            content = user_message[\"content\"]\n            system_prompt = messages[0][\"content\"]\n\n            if isinstance(content, str):\n                # Pure text mode\n                result = await self.vision_model_func(\n                    content, system_prompt=system_prompt\n                )\n            else:\n                # Multimodal mode - pass complete messages directly to VLM\n                result = await self.vision_model_func(\n                    \"\",  # Empty prompt since we're using messages format\n                    
messages=messages,\n                )\n\n            return result\n\n        except Exception as e:\n            self.logger.error(f\"VLM call failed: {e}\")\n            raise\n\n    # Synchronous versions of query methods\n    def query(self, query: str, mode: str = \"mix\", **kwargs) -> str:\n        \"\"\"\n        Synchronous version of pure text query\n\n        Args:\n            query: Query text\n            mode: Query mode (\"local\", \"global\", \"hybrid\", \"naive\", \"mix\", \"bypass\")\n            **kwargs: Other query parameters, will be passed to QueryParam\n                - vlm_enhanced: bool, default True when vision_model_func is available.\n                  If True, will parse image paths in retrieved context and replace them\n                  with base64 encoded images for VLM processing.\n\n        Returns:\n            str: Query result\n        \"\"\"\n        loop = always_get_an_event_loop()\n        return loop.run_until_complete(self.aquery(query, mode=mode, **kwargs))\n\n    def query_with_multimodal(\n        self,\n        query: str,\n        multimodal_content: List[Dict[str, Any]] = None,\n        mode: str = \"mix\",\n        **kwargs,\n    ) -> str:\n        \"\"\"\n        Synchronous version of multimodal query\n\n        Args:\n            query: Base query text\n            multimodal_content: List of multimodal content, each element contains:\n                - type: Content type (\"image\", \"table\", \"equation\", etc.)\n                - Other fields depend on type (e.g., img_path, table_data, latex, etc.)\n            mode: Query mode (\"local\", \"global\", \"hybrid\", \"naive\", \"mix\", \"bypass\")\n            **kwargs: Other query parameters, will be passed to QueryParam\n\n        Returns:\n            str: Query result\n        \"\"\"\n        loop = always_get_an_event_loop()\n        return loop.run_until_complete(\n            self.aquery_with_multimodal(query, multimodal_content, mode=mode, **kwargs)\n   
     )\n"
  },
  {
    "path": "raganything/raganything.py",
    "content": "\"\"\"\nComplete document parsing + multimodal content insertion Pipeline\n\nThis script integrates:\n1. Document parsing (using configurable parsers)\n2. Pure text content LightRAG insertion\n3. Specialized processing for multimodal content (using different processors)\n\"\"\"\n\nimport os\nfrom typing import Dict, Any, Optional, Callable\nimport sys\nimport asyncio\nimport atexit\nfrom dataclasses import dataclass, field\nfrom pathlib import Path\nfrom dotenv import load_dotenv\n\n# Add project root directory to Python path\nsys.path.insert(0, str(Path(__file__).parent.parent))\n\n# Load environment variables from .env file BEFORE importing LightRAG\n# This is critical for TIKTOKEN_CACHE_DIR to work properly in offline environments\n# The OS environment variables take precedence over the .env file\nload_dotenv(dotenv_path=\".env\", override=False)\n\nfrom lightrag import LightRAG\nfrom lightrag.utils import logger\n\n# Import configuration and modules\nfrom raganything.config import RAGAnythingConfig\nfrom raganything.query import QueryMixin\nfrom raganything.processor import ProcessorMixin\nfrom raganything.batch import BatchMixin\nfrom raganything.utils import get_processor_supports\nfrom raganything.parser import MineruParser, SUPPORTED_PARSERS, get_parser\nfrom raganything.callbacks import CallbackManager\n\n# Import specialized processors\nfrom raganything.modalprocessors import (\n    ImageModalProcessor,\n    TableModalProcessor,\n    EquationModalProcessor,\n    GenericModalProcessor,\n    ContextExtractor,\n    ContextConfig,\n)\n\n\n@dataclass\nclass RAGAnything(QueryMixin, ProcessorMixin, BatchMixin):\n    \"\"\"Multimodal Document Processing Pipeline - Complete document parsing and insertion pipeline\"\"\"\n\n    # Core Components\n    # ---\n    lightrag: Optional[LightRAG] = field(default=None)\n    \"\"\"Optional pre-initialized LightRAG instance.\"\"\"\n\n    llm_model_func: Optional[Callable] = field(default=None)\n    \"\"\"LLM 
model function for text analysis.\"\"\"\n\n    vision_model_func: Optional[Callable] = field(default=None)\n    \"\"\"Vision model function for image analysis.\"\"\"\n\n    embedding_func: Optional[Callable] = field(default=None)\n    \"\"\"Embedding function for text vectorization.\"\"\"\n\n    config: Optional[RAGAnythingConfig] = field(default=None)\n    \"\"\"Configuration object, if None will create with environment variables.\"\"\"\n\n    # LightRAG Configuration\n    # ---\n    lightrag_kwargs: Dict[str, Any] = field(default_factory=dict)\n    \"\"\"Additional keyword arguments for LightRAG initialization when lightrag is not provided.\n    This allows passing all LightRAG configuration parameters like:\n    - kv_storage, vector_storage, graph_storage, doc_status_storage\n    - top_k, chunk_top_k, max_entity_tokens, max_relation_tokens, max_total_tokens\n    - cosine_threshold, related_chunk_number\n    - chunk_token_size, chunk_overlap_token_size, tokenizer, tiktoken_model_name\n    - embedding_batch_num, embedding_func_max_async, embedding_cache_config\n    - llm_model_name, llm_model_max_token_size, llm_model_max_async, llm_model_kwargs\n    - rerank_model_func, vector_db_storage_cls_kwargs, enable_llm_cache\n    - max_parallel_insert, max_graph_nodes, addon_params, etc.\n    \"\"\"\n\n    # Internal State\n    # ---\n    modal_processors: Dict[str, Any] = field(default_factory=dict, init=False)\n    \"\"\"Dictionary of multimodal processors.\"\"\"\n\n    context_extractor: Optional[ContextExtractor] = field(default=None, init=False)\n    \"\"\"Context extractor for providing surrounding content to modal processors.\"\"\"\n\n    parse_cache: Optional[Any] = field(default=None, init=False)\n    \"\"\"Parse result cache storage using LightRAG KV storage.\"\"\"\n\n    callback_manager: CallbackManager = field(\n        default_factory=CallbackManager, init=False, repr=False\n    )\n    \"\"\"Processing callbacks manager (optional hooks for observability and 
metrics).\"\"\"\n\n    _parser_installation_checked: bool = field(default=False, init=False)\n    \"\"\"Flag to track if parser installation has been checked.\"\"\"\n\n    def __post_init__(self):\n        \"\"\"Post-initialization setup following LightRAG pattern\"\"\"\n        # Initialize configuration if not provided\n        if self.config is None:\n            self.config = RAGAnythingConfig()\n\n        # Set working directory\n        self.working_dir = self.config.working_dir\n\n        # Set up logger (use existing logger, don't configure it)\n        self.logger = logger\n\n        # Set up document parser\n        self.doc_parser = get_parser(self.config.parser)\n\n        # Register close method for cleanup\n        atexit.register(self.close)\n\n        # Create working directory if needed\n        if not os.path.exists(self.working_dir):\n            os.makedirs(self.working_dir)\n            self.logger.info(f\"Created working directory: {self.working_dir}\")\n\n        # Log configuration info\n        self.logger.info(\"RAGAnything initialized with config:\")\n        self.logger.info(f\"  Working directory: {self.config.working_dir}\")\n        self.logger.info(f\"  Parser: {self.config.parser}\")\n        self.logger.info(f\"  Parse method: {self.config.parse_method}\")\n        self.logger.info(\n            f\"  Multimodal processing - Image: {self.config.enable_image_processing}, \"\n            f\"Table: {self.config.enable_table_processing}, \"\n            f\"Equation: {self.config.enable_equation_processing}\"\n        )\n        self.logger.info(f\"  Max concurrent files: {self.config.max_concurrent_files}\")\n\n    def close(self):\n        \"\"\"Cleanup resources when object is destroyed\"\"\"\n        try:\n            import asyncio\n\n            # Check if there's a running event loop using get_running_loop()\n            # This is the proper way in Python 3.10+ to avoid DeprecationWarning\n            try:\n                
asyncio.get_running_loop()\n                # If we're in an async context, schedule cleanup\n                asyncio.create_task(self.finalize_storages())\n            except RuntimeError:\n                # No running event loop, run cleanup synchronously\n                asyncio.run(self.finalize_storages())\n        except Exception as e:\n            # Use print instead of logger since logger might be cleaned up already\n            print(f\"Warning: Failed to finalize RAGAnything storages: {e}\")\n\n    def _create_context_config(self) -> ContextConfig:\n        \"\"\"Create context configuration from RAGAnything config\"\"\"\n        return ContextConfig(\n            context_window=self.config.context_window,\n            context_mode=self.config.context_mode,\n            max_context_tokens=self.config.max_context_tokens,\n            include_headers=self.config.include_headers,\n            include_captions=self.config.include_captions,\n            filter_content_types=self.config.context_filter_content_types,\n        )\n\n    def _create_context_extractor(self) -> ContextExtractor:\n        \"\"\"Create context extractor with tokenizer from LightRAG\"\"\"\n        if self.lightrag is None:\n            raise ValueError(\n                \"LightRAG must be initialized before creating context extractor\"\n            )\n\n        context_config = self._create_context_config()\n        return ContextExtractor(\n            config=context_config, tokenizer=self.lightrag.tokenizer\n        )\n\n    def _initialize_processors(self):\n        \"\"\"Initialize multimodal processors with appropriate model functions\"\"\"\n        if self.lightrag is None:\n            raise ValueError(\n                \"LightRAG instance must be initialized before creating processors\"\n            )\n\n        # Create context extractor\n        self.context_extractor = self._create_context_extractor()\n\n        # Create different multimodal processors based on 
configuration\n        self.modal_processors = {}\n\n        if self.config.enable_image_processing:\n            self.modal_processors[\"image\"] = ImageModalProcessor(\n                lightrag=self.lightrag,\n                modal_caption_func=self.vision_model_func or self.llm_model_func,\n                context_extractor=self.context_extractor,\n            )\n\n        if self.config.enable_table_processing:\n            self.modal_processors[\"table\"] = TableModalProcessor(\n                lightrag=self.lightrag,\n                modal_caption_func=self.llm_model_func,\n                context_extractor=self.context_extractor,\n            )\n\n        if self.config.enable_equation_processing:\n            self.modal_processors[\"equation\"] = EquationModalProcessor(\n                lightrag=self.lightrag,\n                modal_caption_func=self.llm_model_func,\n                context_extractor=self.context_extractor,\n            )\n\n        # Always include generic processor as fallback\n        self.modal_processors[\"generic\"] = GenericModalProcessor(\n            lightrag=self.lightrag,\n            modal_caption_func=self.llm_model_func,\n            context_extractor=self.context_extractor,\n        )\n\n        self.logger.info(\"Multimodal processors initialized with context support\")\n        self.logger.info(f\"Available processors: {list(self.modal_processors.keys())}\")\n        self.logger.info(f\"Context configuration: {self._create_context_config()}\")\n\n    def update_config(self, **kwargs):\n        \"\"\"Update configuration with new values\"\"\"\n        for key, value in kwargs.items():\n            if hasattr(self.config, key):\n                setattr(self.config, key, value)\n                self.logger.debug(f\"Updated config: {key} = {value}\")\n            else:\n                self.logger.warning(f\"Unknown config parameter: {key}\")\n\n    async def _ensure_lightrag_initialized(self):\n        \"\"\"Ensure LightRAG 
instance is initialized, create if necessary\"\"\"\n        try:\n            # Check parser installation first\n            if not self._parser_installation_checked:\n                if not self.doc_parser.check_installation():\n                    error_msg = (\n                        f\"Parser '{self.config.parser}' is not properly installed. \"\n                        \"Please install it using 'pip install' or 'uv pip install'.\"\n                    )\n                    self.logger.error(error_msg)\n                    return {\"success\": False, \"error\": error_msg}\n\n                self._parser_installation_checked = True\n                self.logger.info(f\"Parser '{self.config.parser}' installation verified\")\n\n            if self.lightrag is not None:\n                # LightRAG was pre-provided, but we need to ensure it's properly initialized\n                # Inherit model functions from LightRAG if not explicitly provided\n                if self.llm_model_func is None and hasattr(\n                    self.lightrag, \"llm_model_func\"\n                ):\n                    self.llm_model_func = self.lightrag.llm_model_func\n                    self.logger.debug(\"Inherited llm_model_func from LightRAG instance\")\n\n                if self.embedding_func is None and hasattr(\n                    self.lightrag, \"embedding_func\"\n                ):\n                    self.embedding_func = self.lightrag.embedding_func\n                    self.logger.debug(\"Inherited embedding_func from LightRAG instance\")\n\n                try:\n                    # Ensure LightRAG storages are initialized\n                    if (\n                        not hasattr(self.lightrag, \"_storages_status\")\n                        or self.lightrag._storages_status.name != \"INITIALIZED\"\n                    ):\n                        self.logger.info(\n                            \"Initializing storages for pre-provided LightRAG instance\"\n          
              )\n                        await self.lightrag.initialize_storages()\n                        from lightrag.kg.shared_storage import (\n                            initialize_pipeline_status,\n                        )\n\n                        await initialize_pipeline_status()\n\n                    # Initialize parse cache if not already done\n                    if self.parse_cache is None:\n                        self.logger.info(\n                            \"Initializing parse cache for pre-provided LightRAG instance\"\n                        )\n                        self.parse_cache = (\n                            self.lightrag.key_string_value_json_storage_cls(\n                                namespace=\"parse_cache\",\n                                workspace=self.lightrag.workspace,\n                                global_config=self.lightrag.__dict__,\n                                embedding_func=self.embedding_func,\n                            )\n                        )\n                        await self.parse_cache.initialize()\n\n                    # Initialize processors if not already done\n                    if not self.modal_processors:\n                        self._initialize_processors()\n\n                    return {\"success\": True}\n\n                except Exception as e:\n                    error_msg = (\n                        f\"Failed to initialize pre-provided LightRAG instance: {str(e)}\"\n                    )\n                    self.logger.error(error_msg, exc_info=True)\n                    return {\"success\": False, \"error\": error_msg}\n\n            # Validate required functions for creating new LightRAG instance\n            if self.llm_model_func is None:\n                error_msg = \"llm_model_func must be provided when LightRAG is not pre-initialized\"\n                self.logger.error(error_msg)\n                return {\"success\": False, \"error\": error_msg}\n\n            if 
self.embedding_func is None:\n                error_msg = \"embedding_func must be provided when LightRAG is not pre-initialized\"\n                self.logger.error(error_msg)\n                return {\"success\": False, \"error\": error_msg}\n\n            from lightrag.kg.shared_storage import initialize_pipeline_status\n\n            # Prepare LightRAG initialization parameters\n            lightrag_params = {\n                \"working_dir\": self.working_dir,\n                \"llm_model_func\": self.llm_model_func,\n                \"embedding_func\": self.embedding_func,\n            }\n\n            # Merge user-provided lightrag_kwargs, which can override defaults\n            lightrag_params.update(self.lightrag_kwargs)\n\n            # Log the parameters being used for initialization (excluding sensitive data)\n            log_params = {\n                k: v\n                for k, v in lightrag_params.items()\n                if not callable(v)\n                and k not in [\"llm_model_kwargs\", \"vector_db_storage_cls_kwargs\"]\n            }\n            self.logger.info(f\"Initializing LightRAG with parameters: {log_params}\")\n\n            try:\n                # Create LightRAG instance with merged parameters\n                self.lightrag = LightRAG(**lightrag_params)\n                await self.lightrag.initialize_storages()\n                await initialize_pipeline_status()\n\n                # Initialize parse cache storage using LightRAG's KV storage\n                self.parse_cache = self.lightrag.key_string_value_json_storage_cls(\n                    namespace=\"parse_cache\",\n                    workspace=self.lightrag.workspace,\n                    global_config=self.lightrag.__dict__,\n                    embedding_func=self.embedding_func,\n                )\n                await self.parse_cache.initialize()\n\n                # Initialize processors after LightRAG is ready\n                self._initialize_processors()\n\n    
            self.logger.info(\n                    \"LightRAG, parse cache, and multimodal processors initialized\"\n                )\n                return {\"success\": True}\n\n            except Exception as e:\n                error_msg = f\"Failed to initialize LightRAG instance: {str(e)}\"\n                self.logger.error(error_msg, exc_info=True)\n                return {\"success\": False, \"error\": error_msg}\n\n        except Exception as e:\n            error_msg = f\"Unexpected error during LightRAG initialization: {str(e)}\"\n            self.logger.error(error_msg, exc_info=True)\n            return {\"success\": False, \"error\": error_msg}\n\n    async def finalize_storages(self):\n        \"\"\"Finalize all storages including parse cache and LightRAG storages\n\n        This method should be called when shutting down to properly clean up resources\n        and persist any cached data. It will finalize both the parse cache and LightRAG's\n        internal storages.\n\n        Example usage:\n            try:\n                rag_anything = RAGAnything(...)\n                await rag_anything.process_file(\"document.pdf\")\n                # ... 
other operations ...\n            finally:\n                # Always finalize storages to clean up resources\n                if rag_anything:\n                    await rag_anything.finalize_storages()\n\n        Note:\n            - This method is automatically called in __del__ when the object is destroyed\n            - Manual calling is recommended in production environments\n            - All finalization tasks run concurrently for better performance\n        \"\"\"\n        try:\n            tasks = []\n\n            # Finalize parse cache if it exists\n            if self.parse_cache is not None:\n                tasks.append(self.parse_cache.finalize())\n                self.logger.debug(\"Scheduled parse cache finalization\")\n\n            # Finalize LightRAG storages if LightRAG is initialized\n            if self.lightrag is not None:\n                tasks.append(self.lightrag.finalize_storages())\n                self.logger.debug(\"Scheduled LightRAG storages finalization\")\n\n            # Run all finalization tasks concurrently\n            if tasks:\n                await asyncio.gather(*tasks)\n                self.logger.info(\"Successfully finalized all RAGAnything storages\")\n            else:\n                self.logger.debug(\"No storages to finalize\")\n\n        except Exception as e:\n            self.logger.error(f\"Error during storage finalization: {e}\")\n            raise\n\n    def check_parser_installation(self) -> bool:\n        \"\"\"\n        Check if the configured parser is properly installed\n\n        Returns:\n            bool: True if the configured parser is properly installed\n        \"\"\"\n        return self.doc_parser.check_installation()\n\n    def verify_parser_installation_once(self) -> bool:\n        if not self._parser_installation_checked:\n            if not self.doc_parser.check_installation():\n                raise RuntimeError(\n                    f\"Parser '{self.config.parser}' is not properly 
installed. \"\n                    \"Please install it using pip install or uv pip install.\"\n                )\n            self._parser_installation_checked = True\n            self.logger.info(f\"Parser '{self.config.parser}' installation verified\")\n        return True\n\n    def get_config_info(self) -> Dict[str, Any]:\n        \"\"\"Get current configuration information\"\"\"\n        config_info = {\n            \"directory\": {\n                \"working_dir\": self.config.working_dir,\n                \"parser_output_dir\": self.config.parser_output_dir,\n            },\n            \"parsing\": {\n                \"parser\": self.config.parser,\n                \"parse_method\": self.config.parse_method,\n                \"display_content_stats\": self.config.display_content_stats,\n            },\n            \"multimodal_processing\": {\n                \"enable_image_processing\": self.config.enable_image_processing,\n                \"enable_table_processing\": self.config.enable_table_processing,\n                \"enable_equation_processing\": self.config.enable_equation_processing,\n            },\n            \"context_extraction\": {\n                \"context_window\": self.config.context_window,\n                \"context_mode\": self.config.context_mode,\n                \"max_context_tokens\": self.config.max_context_tokens,\n                \"include_headers\": self.config.include_headers,\n                \"include_captions\": self.config.include_captions,\n                \"filter_content_types\": self.config.context_filter_content_types,\n            },\n            \"batch_processing\": {\n                \"max_concurrent_files\": self.config.max_concurrent_files,\n                \"supported_file_extensions\": self.config.supported_file_extensions,\n                \"recursive_folder_processing\": self.config.recursive_folder_processing,\n            },\n            \"logging\": {\n                \"note\": \"Logging fields have been 
removed - configure logging externally\",\n            },\n        }\n\n        # Add LightRAG configuration if available\n        if self.lightrag_kwargs:\n            # Filter out sensitive data and callable objects for display\n            safe_kwargs = {\n                k: v\n                for k, v in self.lightrag_kwargs.items()\n                if not callable(v)\n                and k not in [\"llm_model_kwargs\", \"vector_db_storage_cls_kwargs\"]\n            }\n            config_info[\"lightrag_config\"] = {\n                \"custom_parameters\": safe_kwargs,\n                \"note\": \"LightRAG will be initialized with these additional parameters\",\n            }\n        else:\n            config_info[\"lightrag_config\"] = {\n                \"custom_parameters\": {},\n                \"note\": \"Using default LightRAG parameters\",\n            }\n\n        return config_info\n\n    def set_content_source_for_context(\n        self, content_source, content_format: str = \"auto\"\n    ):\n        \"\"\"Set content source for context extraction in all modal processors\n\n        Args:\n            content_source: Source content for context extraction (e.g., MinerU content list)\n            content_format: Format of content source (\"minerU\", \"text_chunks\", \"auto\")\n        \"\"\"\n        if not self.modal_processors:\n            self.logger.warning(\n                \"Modal processors not initialized. 
Content source will be set when processors are created.\"\n            )\n            return\n\n        for processor_name, processor in self.modal_processors.items():\n            try:\n                processor.set_content_source(content_source, content_format)\n                self.logger.debug(f\"Set content source for {processor_name} processor\")\n            except Exception as e:\n                self.logger.error(\n                    f\"Failed to set content source for {processor_name}: {e}\"\n                )\n\n        self.logger.info(\n            f\"Content source set for context extraction (format: {content_format})\"\n        )\n\n    def update_context_config(self, **context_kwargs):\n        \"\"\"Update context extraction configuration\n\n        Args:\n            **context_kwargs: Context configuration parameters to update\n                (context_window, context_mode, max_context_tokens, etc.)\n        \"\"\"\n        # Update the main config\n        for key, value in context_kwargs.items():\n            if hasattr(self.config, key):\n                setattr(self.config, key, value)\n                self.logger.debug(f\"Updated context config: {key} = {value}\")\n            else:\n                self.logger.warning(f\"Unknown context config parameter: {key}\")\n\n        # Recreate context extractor with new config if processors are initialized\n        if self.lightrag and self.modal_processors:\n            try:\n                self.context_extractor = self._create_context_extractor()\n                # Update all processors with new context extractor\n                for processor_name, processor in self.modal_processors.items():\n                    processor.context_extractor = self.context_extractor\n\n                self.logger.info(\n                    \"Context configuration updated and applied to all processors\"\n                )\n                self.logger.info(\n                    f\"New context configuration: 
{self._create_context_config()}\"\n                )\n            except Exception as e:\n                self.logger.error(f\"Failed to update context configuration: {e}\")\n\n    def get_processor_info(self) -> Dict[str, Any]:\n        \"\"\"Get processor information\"\"\"\n        base_info = {\n            \"mineru_installed\": MineruParser.check_installation(MineruParser()),\n            \"parser_installation\": {\n                parser_name: get_parser(parser_name).check_installation()\n                for parser_name in SUPPORTED_PARSERS\n            },\n            \"config\": self.get_config_info(),\n            \"models\": {\n                \"llm_model\": \"External function\"\n                if self.llm_model_func\n                else \"Not provided\",\n                \"vision_model\": \"External function\"\n                if self.vision_model_func\n                else \"Not provided\",\n                \"embedding_model\": \"External function\"\n                if self.embedding_func\n                else \"Not provided\",\n            },\n        }\n\n        if not self.modal_processors:\n            base_info[\"status\"] = \"Not initialized\"\n            base_info[\"processors\"] = {}\n        else:\n            base_info[\"status\"] = \"Initialized\"\n            base_info[\"processors\"] = {}\n\n            for proc_type, processor in self.modal_processors.items():\n                base_info[\"processors\"][proc_type] = {\n                    \"class\": processor.__class__.__name__,\n                    \"supports\": get_processor_supports(proc_type),\n                    \"enabled\": True,\n                }\n\n        return base_info\n"
  },
  {
    "path": "raganything/utils.py",
    "content": "\"\"\"\nUtility functions for RAGAnything\n\nContains helper functions for content separation, text insertion, and other utilities\n\"\"\"\n\nimport base64\nfrom typing import Dict, List, Any, Tuple\nfrom pathlib import Path\nfrom lightrag.utils import logger\n\n\ndef separate_content(\n    content_list: List[Dict[str, Any]],\n) -> Tuple[str, List[Dict[str, Any]]]:\n    \"\"\"\n    Separate text content and multimodal content\n\n    Args:\n        content_list: Content list from MinerU parsing\n\n    Returns:\n        (text_content, multimodal_items): Pure text content and multimodal items list\n    \"\"\"\n    text_parts = []\n    multimodal_items = []\n\n    for item in content_list:\n        content_type = item.get(\"type\", \"text\")\n\n        if content_type == \"text\":\n            # Text content\n            text = item.get(\"text\", \"\")\n            if text.strip():\n                text_parts.append(text)\n        else:\n            # Multimodal content (image, table, equation, etc.)\n            multimodal_items.append(item)\n\n    # Merge all text content\n    text_content = \"\\n\\n\".join(text_parts)\n\n    logger.info(\"Content separation complete:\")\n    logger.info(f\"  - Text content length: {len(text_content)} characters\")\n    logger.info(f\"  - Multimodal items count: {len(multimodal_items)}\")\n\n    # Count multimodal types\n    modal_types = {}\n    for item in multimodal_items:\n        modal_type = item.get(\"type\", \"unknown\")\n        modal_types[modal_type] = modal_types.get(modal_type, 0) + 1\n\n    if modal_types:\n        logger.info(f\"  - Multimodal type distribution: {modal_types}\")\n\n    return text_content, multimodal_items\n\n\ndef encode_image_to_base64(image_path: str) -> str:\n    \"\"\"\n    Encode image file to base64 string\n\n    Args:\n        image_path: Path to the image file\n\n    Returns:\n        str: Base64 encoded string, empty string if encoding fails\n    \"\"\"\n    try:\n        with 
open(image_path, \"rb\") as image_file:\n            encoded_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n        return encoded_string\n    except Exception as e:\n        logger.error(f\"Failed to encode image {image_path}: {e}\")\n        return \"\"\n\n\ndef validate_image_file(image_path: str, max_size_mb: int = 50) -> bool:\n    \"\"\"\n    Validate if a file is a valid image file\n\n    Args:\n        image_path: Path to the image file\n        max_size_mb: Maximum file size in MB\n\n    Returns:\n        bool: True if valid, False otherwise\n    \"\"\"\n    try:\n        path = Path(image_path)\n\n        logger.debug(f\"Validating image path: {image_path}\")\n        logger.debug(f\"Resolved path object: {path}\")\n        logger.debug(f\"Path exists check: {path.exists()}\")\n\n        # Check if file exists and is not a symlink (for security)\n        if not path.exists():\n            logger.warning(f\"Image file not found: {image_path}\")\n            return False\n\n        if path.is_symlink():\n            logger.warning(f\"Blocking symlink for security: {image_path}\")\n            return False\n\n        # Check file extension\n        image_extensions = [\n            \".jpg\",\n            \".jpeg\",\n            \".png\",\n            \".gif\",\n            \".bmp\",\n            \".webp\",\n            \".tiff\",\n            \".tif\",\n        ]\n\n        path_lower = str(path).lower()\n        has_valid_extension = any(path_lower.endswith(ext) for ext in image_extensions)\n        logger.debug(\n            f\"File extension check - path: {path_lower}, valid: {has_valid_extension}\"\n        )\n\n        if not has_valid_extension:\n            logger.warning(f\"File does not appear to be an image: {image_path}\")\n            return False\n\n        # Check file size\n        file_size = path.stat().st_size\n        max_size = max_size_mb * 1024 * 1024\n        logger.debug(\n            f\"File size check - size: 
{file_size} bytes, max: {max_size} bytes\"\n        )\n\n        if file_size > max_size:\n            logger.warning(f\"Image file too large ({file_size} bytes): {image_path}\")\n            return False\n\n        logger.debug(f\"Image validation successful: {image_path}\")\n        return True\n\n    except Exception as e:\n        logger.error(f\"Error validating image file {image_path}: {e}\")\n        return False\n\n\nasync def insert_text_content(\n    lightrag,\n    input: str | list[str],\n    split_by_character: str | None = None,\n    split_by_character_only: bool = False,\n    ids: str | list[str] | None = None,\n    file_paths: str | list[str] | None = None,\n):\n    \"\"\"\n    Insert pure text content into LightRAG\n\n    Args:\n        lightrag: LightRAG instance\n        input: Single document string or list of document strings\n        split_by_character: if split_by_character is not None, split the string by character, if chunk longer than\n        chunk_token_size, it will be split again by token size.\n        split_by_character_only: if split_by_character_only is True, split the string by character only, when\n        split_by_character is None, this parameter is ignored.\n        ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated\n        file_paths: single string of the file path or list of file paths, used for citation\n    \"\"\"\n    logger.info(\"Starting text content insertion into LightRAG...\")\n\n    # Use LightRAG's insert method with all parameters\n    await lightrag.ainsert(\n        input=input,\n        file_paths=file_paths,\n        split_by_character=split_by_character,\n        split_by_character_only=split_by_character_only,\n        ids=ids,\n    )\n\n    logger.info(\"Text content insertion complete\")\n\n\nasync def insert_text_content_with_multimodal_content(\n    lightrag,\n    input: str | list[str],\n    multimodal_content: list[dict[str, any]] | 
None = None,\n    split_by_character: str | None = None,\n    split_by_character_only: bool = False,\n    ids: str | list[str] | None = None,\n    file_paths: str | list[str] | None = None,\n    scheme_name: str | None = None,\n):\n    \"\"\"\n    Insert pure text content into LightRAG\n\n    Args:\n        lightrag: LightRAG instance\n        input: Single document string or list of document strings\n        multimodal_content: Multimodal content list (optional)\n        split_by_character: if split_by_character is not None, split the string by character, if chunk longer than\n        chunk_token_size, it will be split again by token size.\n        split_by_character_only: if split_by_character_only is True, split the string by character only, when\n        split_by_character is None, this parameter is ignored.\n        ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated\n        file_paths: single string of the file path or list of file paths, used for citation\n        scheme_name: scheme name (optional)\n    \"\"\"\n    logger.info(\"Starting text content insertion into LightRAG...\")\n\n    # Use LightRAG's insert method with all parameters\n    try:\n        await lightrag.ainsert(\n            input=input,\n            multimodal_content=multimodal_content,\n            file_paths=file_paths,\n            split_by_character=split_by_character,\n            split_by_character_only=split_by_character_only,\n            ids=ids,\n            scheme_name=scheme_name,\n        )\n    except Exception as e:\n        logger.info(f\"Error: {e}\")\n        logger.info(\n            \"If the error is caused by the ainsert function not having a multimodal content parameter, please update the raganything branch of lightrag\"\n        )\n\n    logger.info(\"Text content insertion complete\")\n\n\ndef get_processor_for_type(modal_processors: Dict[str, Any], content_type: str):\n    \"\"\"\n    Get 
appropriate processor based on content type\n\n    Args:\n        modal_processors: Dictionary of available processors\n        content_type: Content type\n\n    Returns:\n        Corresponding processor instance\n    \"\"\"\n    # Direct mapping to corresponding processor\n    if content_type == \"image\":\n        return modal_processors.get(\"image\")\n    elif content_type == \"table\":\n        return modal_processors.get(\"table\")\n    elif content_type == \"equation\":\n        return modal_processors.get(\"equation\")\n    else:\n        # For other types, use generic processor\n        return modal_processors.get(\"generic\")\n\n\ndef get_processor_supports(proc_type: str) -> List[str]:\n    \"\"\"Get processor supported features\"\"\"\n    supports_map = {\n        \"image\": [\n            \"Image content analysis\",\n            \"Visual understanding\",\n            \"Image description generation\",\n            \"Image entity extraction\",\n        ],\n        \"table\": [\n            \"Table structure analysis\",\n            \"Data statistics\",\n            \"Trend identification\",\n            \"Table entity extraction\",\n        ],\n        \"equation\": [\n            \"Mathematical formula parsing\",\n            \"Variable identification\",\n            \"Formula meaning explanation\",\n            \"Formula entity extraction\",\n        ],\n        \"generic\": [\n            \"General content analysis\",\n            \"Structured processing\",\n            \"Entity extraction\",\n        ],\n    }\n    return supports_map.get(proc_type, [\"Basic processing\"])\n"
  },
  {
    "path": "requirements.txt",
    "content": "huggingface_hub\n# LightRAG packages\nlightrag-hku\n# MinerU 2.0 packages (replaces magic-pdf)\nmineru[core]\n# Progress bars for batch processing\ntqdm\n# Note: Optional dependencies are now defined in setup.py extras_require:\n# - [image]: Pillow>=10.0.0 (for BMP, TIFF, GIF, WebP format conversion)\n# - [text]: reportlab>=4.0.0 (for TXT, MD to PDF conversion)\n# - [paddleocr]: paddleocr + pypdfium2 (for parser='paddleocr')\n# - [office]: requires LibreOffice (external program, not Python package)\n# - [all]: includes all optional dependencies\n#\n# Install with: pip install raganything[image,text] or pip install raganything[all]\n"
  },
  {
    "path": "scripts/create_tiktoken_cache.py",
    "content": "import tiktoken\nimport os\n\n# Define the directory where you want to store the cache\ncache_dir = \"./tiktoken_cache\"\nif \"TIKTOKEN_CACHE_DIR\" not in os.environ:\n    os.environ[\"TIKTOKEN_CACHE_DIR\"] = cache_dir\n\n# Create the directory if it doesn't exist\nif not os.path.exists(cache_dir):\n    os.makedirs(cache_dir)\n\nprint(\"Downloading and caching tiktoken models...\")\ntiktoken.get_encoding(\"cl100k_base\")\n# tiktoken.get_encoding(\"p50k_base\")\n\nprint(f\"tiktoken models have been cached in '{cache_dir}'\")\n"
  },
  {
    "path": "setup.py",
    "content": "import setuptools\nfrom pathlib import Path\n\n\n# Reading the long description from README.md\ndef read_long_description():\n    try:\n        return Path(\"README.md\").read_text(encoding=\"utf-8\")\n    except FileNotFoundError:\n        return \"A description of RAGAnything is currently unavailable.\"\n\n\n# Retrieving metadata from __init__.py\ndef retrieve_metadata():\n    vars2find = [\"__author__\", \"__version__\", \"__url__\"]\n    vars2readme = {}\n    try:\n        with open(\"./raganything/__init__.py\") as f:\n            for line in f.readlines():\n                for v in vars2find:\n                    if line.startswith(v):\n                        line = (\n                            line.replace(\" \", \"\")\n                            .replace('\"', \"\")\n                            .replace(\"'\", \"\")\n                            .strip()\n                        )\n                        vars2readme[v] = line.split(\"=\")[1]\n    except FileNotFoundError:\n        raise FileNotFoundError(\"Metadata file './raganything/__init__.py' not found.\")\n\n    # Checking if all required variables are found\n    missing_vars = [v for v in vars2find if v not in vars2readme]\n    if missing_vars:\n        raise ValueError(\n            f\"Missing required metadata variables in __init__.py: {missing_vars}\"\n        )\n\n    return vars2readme\n\n\n# Reading dependencies from requirements.txt\ndef read_requirements():\n    deps = []\n    try:\n        with open(\"./requirements.txt\") as f:\n            deps = [\n                line.strip() for line in f if line.strip() and not line.startswith(\"#\")\n            ]\n    except FileNotFoundError:\n        print(\n            \"Warning: 'requirements.txt' not found. 
No dependencies will be installed.\"\n        )\n    return deps\n\n\nmetadata = retrieve_metadata()\nlong_description = read_long_description()\nrequirements = read_requirements()\n\n# Define extras_require for optional features\nextras_require = {\n    \"image\": [\"Pillow>=10.0.0\"],  # For image format conversion (BMP, TIFF, GIF, WebP)\n    \"text\": [\"reportlab>=4.0.0\"],  # For text file to PDF conversion (TXT, MD)\n    \"office\": [],  # Office document processing requires LibreOffice (external program)\n    \"paddleocr\": [\"paddleocr>=2.7.0\", \"pypdfium2>=4.25.0\"],  # PaddleOCR parser\n    \"all\": [\n        \"Pillow>=10.0.0\",\n        \"reportlab>=4.0.0\",\n        \"paddleocr>=2.7.0\",\n        \"pypdfium2>=4.25.0\",\n        \"markdown>=3.4.0\",\n        \"weasyprint>=60.0\",\n        \"pygments>=2.10.0\",\n    ],  # All optional features\n    \"markdown\": [\n        \"markdown>=3.4.0\",\n        \"weasyprint>=60.0\",\n        \"pygments>=2.10.0\",\n    ],  # Enhanced markdown conversion\n}\n\nsetuptools.setup(\n    name=\"raganything\",\n    url=metadata[\"__url__\"],\n    version=metadata[\"__version__\"],\n    author=metadata[\"__author__\"],\n    description=\"RAGAnything: All-in-One RAG System\",\n    long_description=long_description,\n    long_description_content_type=\"text/markdown\",\n    packages=setuptools.find_packages(\n        exclude=(\"tests*\", \"docs*\")\n    ),  # Automatically find packages\n    classifiers=[\n        \"Development Status :: 4 - Beta\",\n        \"Programming Language :: Python :: 3\",\n        \"License :: OSI Approved :: MIT License\",\n        \"Operating System :: OS Independent\",\n        \"Intended Audience :: Developers\",\n        \"Topic :: Software Development :: Libraries :: Python Modules\",\n    ],\n    python_requires=\">=3.9\",\n    install_requires=requirements,\n    extras_require=extras_require,\n    include_package_data=True,  # Includes non-code files from MANIFEST.in\n    project_urls={  
# Additional project metadata\n        \"Documentation\": metadata.get(\"__url__\", \"\"),\n        \"Source\": metadata.get(\"__url__\", \"\"),\n        \"Tracker\": f\"{metadata.get('__url__', '')}/issues\"\n        if metadata.get(\"__url__\")\n        else \"\",\n    },\n)\n"
  },
  {
    "path": "tests/testpaddleocr_parser.py",
    "content": "import importlib\nimport sys\n\nimport pytest\n\nimport raganything.parser as parser_module\nfrom raganything.parser import PaddleOCRParser, SUPPORTED_PARSERS, get_parser\n\n\ndef test_supported_parsers_include_paddleocr():\n    assert \"paddleocr\" in SUPPORTED_PARSERS\n\n\ndef test_get_parser_returns_paddleocr_parser():\n    parser = get_parser(\"paddleocr\")\n    assert isinstance(parser, PaddleOCRParser)\n\n\ndef test_get_parser_rejects_unknown_parser():\n    with pytest.raises(ValueError, match=\"Unsupported parser type\"):\n        get_parser(\"unknown-parser\")\n\n\ndef test_parser_module_import_does_not_import_paddleocr():\n    sys.modules.pop(\"paddleocr\", None)\n    importlib.reload(parser_module)\n    assert \"paddleocr\" not in sys.modules\n\n\ndef test_check_installation_false_when_dependency_missing(monkeypatch):\n    parser = PaddleOCRParser()\n\n    def missing_dependency():\n        raise ImportError(\"missing paddleocr\")\n\n    monkeypatch.setattr(parser, \"_require_paddleocr\", missing_dependency)\n    assert parser.check_installation() is False\n\n\ndef test_check_installation_true_when_pdf_renderer_missing(monkeypatch):\n    parser = PaddleOCRParser()\n\n    monkeypatch.setattr(parser, \"_require_paddleocr\", lambda: object())\n\n    import builtins\n\n    real_import = builtins.__import__\n\n    def fake_import(name, globals=None, locals=None, fromlist=(), level=0):\n        if name == \"pypdfium2\":\n            raise ImportError(\"missing pypdfium2\")\n        return real_import(name, globals, locals, fromlist, level)\n\n    monkeypatch.setattr(builtins, \"__import__\", fake_import)\n\n    assert parser.check_installation() is True\n\n\ndef test_parse_pdf_raises_import_error_when_pdf_renderer_missing(monkeypatch, tmp_path):\n    parser = PaddleOCRParser()\n    fake_pdf = tmp_path / \"sample.pdf\"\n    fake_pdf.write_bytes(b\"%PDF-1.4\\n\")\n\n    monkeypatch.setattr(parser, \"_require_paddleocr\", lambda: object())\n\n    
import builtins\n\n    real_import = builtins.__import__\n\n    def fake_import(name, globals=None, locals=None, fromlist=(), level=0):\n        if name == \"pypdfium2\":\n            raise ImportError(\"missing pypdfium2\")\n        return real_import(name, globals, locals, fromlist, level)\n\n    monkeypatch.setattr(builtins, \"__import__\", fake_import)\n\n    with pytest.raises(ImportError, match=\"pypdfium2\"):\n        parser.parse_pdf(fake_pdf)\n\n\ndef test_parse_image_raises_import_error_with_install_hint(monkeypatch, tmp_path):\n    parser = PaddleOCRParser()\n    fake_image = tmp_path / \"sample.png\"\n    fake_image.write_bytes(b\"not-a-real-image\")\n\n    def missing_dependency():\n        raise ImportError(\"missing paddleocr\")\n\n    monkeypatch.setattr(parser, \"_require_paddleocr\", missing_dependency)\n\n    with pytest.raises(ImportError, match=\"paddleocr\"):\n        parser.parse_image(fake_image)\n\n\ndef test_parse_image_returns_content_list_schema(monkeypatch, tmp_path):\n    parser = PaddleOCRParser()\n    fake_image = tmp_path / \"sample.png\"\n    fake_image.write_bytes(b\"image-bytes\")\n\n    class FakeOCR:\n        def ocr(self, input_data, cls=True):\n            return [\n                [\n                    [[[0, 0], [1, 0], [1, 1], [0, 1]], (\"First line\", 0.99)],\n                    [[[0, 2], [1, 2], [1, 3], [0, 3]], (\"Second line\", 0.95)],\n                ]\n            ]\n\n    monkeypatch.setattr(parser, \"_get_ocr\", lambda lang=None: FakeOCR())\n\n    content_list = parser.parse_image(fake_image, page_idx=7)\n\n    assert content_list == [\n        {\"type\": \"text\", \"text\": \"First line\", \"page_idx\": 7},\n        {\"type\": \"text\", \"text\": \"Second line\", \"page_idx\": 7},\n    ]\n\n\ndef test_parse_image_preserves_repeated_ocr_lines(monkeypatch, tmp_path):\n    parser = PaddleOCRParser()\n    fake_image = tmp_path / \"sample.png\"\n    fake_image.write_bytes(b\"image-bytes\")\n\n    class FakeOCR:\n     
   def ocr(self, input_data, cls=True):\n            return [\n                [\n                    [[[0, 0], [1, 0], [1, 1], [0, 1]], (\"Same\", 0.99)],\n                    [[[0, 2], [1, 2], [1, 3], [0, 3]], (\"Same\", 0.95)],\n                ]\n            ]\n\n    monkeypatch.setattr(parser, \"_get_ocr\", lambda lang=None: FakeOCR())\n\n    content_list = parser.parse_image(fake_image, page_idx=1)\n\n    assert content_list == [\n        {\"type\": \"text\", \"text\": \"Same\", \"page_idx\": 1},\n        {\"type\": \"text\", \"text\": \"Same\", \"page_idx\": 1},\n    ]\n\n\ndef test_parse_pdf_assigns_page_index(monkeypatch, tmp_path):\n    parser = PaddleOCRParser()\n    fake_pdf = tmp_path / \"sample.pdf\"\n    fake_pdf.write_bytes(b\"%PDF-1.4\\n\")\n\n    monkeypatch.setattr(\n        parser,\n        \"_extract_pdf_page_inputs\",\n        lambda pdf_path: [(0, \"page0\"), (1, \"page1\")],\n    )\n    monkeypatch.setattr(\n        parser,\n        \"_ocr_rendered_page\",\n        lambda rendered_page, lang=None, cls_enabled=True: [f\"{rendered_page}-text\"],\n    )\n\n    content_list = parser.parse_pdf(fake_pdf)\n\n    assert content_list == [\n        {\"type\": \"text\", \"text\": \"page0-text\", \"page_idx\": 0},\n        {\"type\": \"text\", \"text\": \"page1-text\", \"page_idx\": 1},\n    ]\n"
  },
  {
    "path": "tests/testparser_kwargs.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nParser Validation Test Script for RAG-Anything (Pytest)\n\nThis script validates the environment variable propagation and\nargument validation logic for both MineruParser and DoclingParser.\nIt ensures that environment variables are correctly passed to subprocesses\nand that invalid inputs are handled properly (fail-fast).\n\nRequirements:\n- RAG-Anything package\n- pytest\n\nUsage:\n    pytest tests/testparser_kwargs.py\n\"\"\"\n\nimport pytest\nfrom unittest.mock import patch, MagicMock\nimport os\nfrom raganything.parser import MineruParser, DoclingParser\n\n\n@pytest.fixture\ndef mineru_parser():\n    return MineruParser()\n\n\n@pytest.fixture\ndef docling_parser():\n    return DoclingParser()\n\n\n@pytest.fixture\ndef dummy_path():\n    return \"dummy.pdf\"\n\n\n@patch(\"subprocess.Popen\")\n@patch(\"pathlib.Path.exists\")\n@patch(\"pathlib.Path.mkdir\")\ndef test_mineru_env_propagation(\n    mock_mkdir, mock_exists, mock_popen, mineru_parser, dummy_path\n):\n    mock_exists.return_value = True\n    mock_process = MagicMock()\n    mock_process.poll.return_value = 0\n    mock_process.wait.return_value = 0\n    mock_process.stdout.readline.return_value = \"\"\n    mock_process.stderr.readline.return_value = \"\"\n    mock_popen.return_value = mock_process\n\n    custom_env = {\"MY_VAR\": \"test_value\"}\n\n    # Test env propagation\n    try:\n        mineru_parser._run_mineru_command(dummy_path, \"out\", env=custom_env)\n    except Exception:\n        pass\n\n    args, kwargs = mock_popen.call_args\n    assert \"env\" in kwargs\n    assert kwargs[\"env\"][\"MY_VAR\"] == \"test_value\"\n    assert kwargs[\"env\"][\"PATH\"] == os.environ[\"PATH\"]\n\n\n@patch(\"subprocess.run\")\ndef test_docling_env_propagation(mock_run, docling_parser, dummy_path):\n    mock_run.return_value = MagicMock(returncode=0, stdout=\"\")\n\n    custom_env = {\"DOCLING_VAR\": \"docling_value\"}\n\n    # Test env propagation\n    
docling_parser._run_docling_command(dummy_path, \"out\", \"stem\", env=custom_env)\n\n    args, kwargs = mock_run.call_args\n    assert \"env\" in kwargs\n    assert kwargs[\"env\"][\"DOCLING_VAR\"] == \"docling_value\"\n    assert kwargs[\"env\"][\"PATH\"] == os.environ[\"PATH\"]\n\n\ndef test_mineru_unknown_kwargs(mineru_parser, dummy_path):\n    # Mineru should fail fast on unknown kwargs\n    with pytest.raises(TypeError) as excinfo:\n        mineru_parser._run_mineru_command(dummy_path, \"out\", unknown_arg=\"fail\")\n    assert \"unexpected keyword argument(s): unknown_arg\" in str(excinfo.value)\n\n\n@patch(\"subprocess.run\")\ndef test_docling_unknown_kwargs(mock_run, docling_parser, dummy_path):\n    mock_run.return_value = MagicMock(returncode=0, stdout=\"\")\n    # Docling should NOT fail on unknown kwargs as per user request\n    docling_parser._run_docling_command(dummy_path, \"out\", \"stem\", unknown_arg=\"allow\")\n    # No exception means success\n\n\ndef test_invalid_env_type(mineru_parser, docling_parser, dummy_path):\n    # Test non-dict env\n    with pytest.raises(TypeError, match=\"env must be a dictionary\"):\n        mineru_parser._run_mineru_command(dummy_path, \"out\", env=[\"not\", \"a\", \"dict\"])\n\n    with pytest.raises(TypeError, match=\"env must be a dictionary\"):\n        docling_parser._run_docling_command(dummy_path, \"out\", \"stem\", env=\"string\")\n\n\ndef test_invalid_env_contents(mineru_parser, docling_parser, dummy_path):\n    # Test non-string keys/values\n    with pytest.raises(TypeError, match=\"env keys and values must be strings\"):\n        mineru_parser._run_mineru_command(dummy_path, \"out\", env={1: \"string_val\"})\n\n    with pytest.raises(TypeError, match=\"env keys and values must be strings\"):\n        docling_parser._run_docling_command(dummy_path, \"out\", \"stem\", env={\"key\": 123})\n"
  },
  {
    "path": "tests/testparser_wiring.py",
    "content": "import pytest\n\nfrom raganything.batch_parser import BatchParser\n\n\ndef test_batch_parser_uses_paddleocr_parser():\n    batch_parser = BatchParser(\n        parser_type=\"paddleocr\",\n        show_progress=False,\n        skip_installation_check=True,\n    )\n    assert batch_parser.parser.__class__.__name__ == \"PaddleOCRParser\"\n\n\ndef test_raganything_initializes_selected_parser(monkeypatch, tmp_path):\n    pytest.importorskip(\"lightrag\")\n\n    import raganything.raganything as rag_module\n    from raganything.config import RAGAnythingConfig\n\n    class StubParser:\n        def check_installation(self):\n            return True\n\n    captured = {}\n\n    def fake_get_parser(parser_name):\n        captured[\"parser_name\"] = parser_name\n        return StubParser()\n\n    monkeypatch.setattr(rag_module, \"get_parser\", fake_get_parser)\n    monkeypatch.setattr(rag_module.atexit, \"register\", lambda *args, **kwargs: None)\n\n    config = RAGAnythingConfig(\n        working_dir=str(tmp_path / \"rag_workdir\"),\n        parser=\"paddleocr\",\n    )\n    rag = rag_module.RAGAnything(config=config)\n\n    assert captured[\"parser_name\"] == \"paddleocr\"\n    assert isinstance(rag.doc_parser, StubParser)\n\n\n@pytest.mark.asyncio\nasync def test_processor_parse_document_uses_selected_parser(monkeypatch, tmp_path):\n    import raganything.processor as processor_module\n\n    class FakeLogger:\n        def info(self, *args, **kwargs):\n            pass\n\n        def warning(self, *args, **kwargs):\n            pass\n\n        def error(self, *args, **kwargs):\n            pass\n\n        def debug(self, *args, **kwargs):\n            pass\n\n    class FakeParser:\n        def parse_pdf(self, **kwargs):\n            return [{\"type\": \"text\", \"text\": \"parsed by fake parser\", \"page_idx\": 0}]\n\n        def parse_image(self, **kwargs):\n            return [{\"type\": \"text\", \"text\": \"image parsed\", \"page_idx\": 0}]\n\n        def 
parse_office_doc(self, **kwargs):\n            return [{\"type\": \"text\", \"text\": \"office parsed\", \"page_idx\": 0}]\n\n        def parse_document(self, **kwargs):\n            return [{\"type\": \"text\", \"text\": \"generic parsed\", \"page_idx\": 0}]\n\n    selected = {\"calls\": 0}\n\n    def fake_get_parser(parser_name):\n        selected[\"parser_name\"] = parser_name\n        selected[\"calls\"] += 1\n        return FakeParser()\n\n    monkeypatch.setattr(processor_module, \"get_parser\", fake_get_parser)\n\n    class DummyProcessor(processor_module.ProcessorMixin):\n        pass\n\n    dummy = DummyProcessor()\n    dummy.config = type(\n        \"Config\",\n        (),\n        {\n            \"parser\": \"paddleocr\",\n            \"parser_output_dir\": str(tmp_path / \"output\"),\n            \"parse_method\": \"auto\",\n            \"display_content_stats\": False,\n            \"use_full_path\": False,\n        },\n    )()\n    dummy.logger = FakeLogger()\n    dummy.parse_cache = None\n\n    async def fake_store_cached_result(*args, **kwargs):\n        return None\n\n    monkeypatch.setattr(\n        DummyProcessor,\n        \"_store_cached_result\",\n        fake_store_cached_result,\n        raising=False,\n    )\n    monkeypatch.setattr(\n        DummyProcessor,\n        \"_generate_content_based_doc_id\",\n        lambda self, content_list: \"doc-fixed\",\n        raising=False,\n    )\n\n    fake_pdf = tmp_path / \"sample.pdf\"\n    fake_pdf.write_bytes(b\"%PDF-1.4\\n\")\n\n    content_list, doc_id = await dummy.parse_document(str(fake_pdf))\n    content_list_2, doc_id_2 = await dummy.parse_document(str(fake_pdf))\n\n    assert selected[\"parser_name\"] == \"paddleocr\"\n    assert selected[\"calls\"] == 1\n    assert doc_id == \"doc-fixed\"\n    assert doc_id_2 == \"doc-fixed\"\n    assert content_list == [\n        {\"type\": \"text\", \"text\": \"parsed by fake parser\", \"page_idx\": 0}\n    ]\n    assert content_list_2 == [\n        
{\"type\": \"text\", \"text\": \"parsed by fake parser\", \"page_idx\": 0}\n    ]\n"
  }
]