Repository: Cinnamon/kotaemon Branch: main Commit: 155e590720f9 Files: 363 Total size: 2.3 MB Directory structure: gitextract_waa09c9u/ ├── .commitlintrc ├── .dockerignore ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── auto-bump-and-release.yaml │ ├── build-push-docker.yaml │ ├── pr-lint.yaml │ ├── style-check.yaml │ └── unit-test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── README.md ├── app.py ├── doc_env_reqs.txt ├── docs/ │ ├── about.md │ ├── development/ │ │ ├── contributing.md │ │ ├── create-a-component.md │ │ ├── data-components.md │ │ ├── index.md │ │ └── utilities.md │ ├── extra/ │ │ └── css/ │ │ └── code_select.css │ ├── index.md │ ├── local_model.md │ ├── online_install.md │ ├── pages/ │ │ └── app/ │ │ ├── customize-flows.md │ │ ├── ext/ │ │ │ └── user-management.md │ │ ├── features.md │ │ ├── functional-description.md │ │ ├── index/ │ │ │ └── file.md │ │ └── settings/ │ │ ├── overview.md │ │ └── user-settings.md │ ├── scripts/ │ │ ├── generate_examples_docs.py │ │ └── generate_reference_docs.py │ ├── theme/ │ │ ├── assets/ │ │ │ └── pymdownx-extras/ │ │ │ ├── extra-fb5a2a1c86.css │ │ │ ├── extra-loader-MCFnu0Wd.js │ │ │ ├── material-extra-3rdparty-E-i8w1WA.js │ │ │ └── material-extra-theme-TVq-kNRT.js │ │ ├── main.html │ │ └── partials/ │ │ ├── footer.html │ │ ├── header.html │ │ └── libs.html │ └── usage.md ├── flowsettings.py ├── fly.toml ├── launch.sh ├── libs/ │ ├── kotaemon/ │ │ ├── README.md │ │ ├── kotaemon/ │ │ │ ├── __init__.py │ │ │ ├── agents/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── io/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── base.py │ │ │ │ ├── langchain_based.py │ │ │ │ ├── react/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── prompt.py │ │ │ │ ├── rewoo/ │ │ │ │ │ ├── __init__.py 
│ │ │ │ │ ├── agent.py │ │ │ │ │ ├── planner.py │ │ │ │ │ ├── prompt.py │ │ │ │ │ └── solver.py │ │ │ │ ├── tools/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── google.py │ │ │ │ │ ├── llm.py │ │ │ │ │ ├── mcp.py │ │ │ │ │ └── wikipedia.py │ │ │ │ └── utils.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ ├── component.py │ │ │ │ └── schema.py │ │ │ ├── chatbot/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── simple_respondent.py │ │ │ ├── cli.py │ │ │ ├── contribs/ │ │ │ │ ├── __init__.py │ │ │ │ ├── docs.py │ │ │ │ └── promptui/ │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cli.py │ │ │ │ ├── config.py │ │ │ │ ├── export.py │ │ │ │ ├── logs.py │ │ │ │ ├── themes.py │ │ │ │ ├── tunnel.py │ │ │ │ └── ui/ │ │ │ │ ├── __init__.py │ │ │ │ ├── blocks.py │ │ │ │ ├── chat.py │ │ │ │ └── pipeline.py │ │ │ ├── embeddings/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── endpoint_based.py │ │ │ │ ├── fastembed.py │ │ │ │ ├── langchain_based.py │ │ │ │ ├── openai.py │ │ │ │ ├── tei_endpoint_embed.py │ │ │ │ └── voyageai.py │ │ │ ├── indices/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── extractors/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── doc_parsers.py │ │ │ │ ├── ingests/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── files.py │ │ │ │ ├── qa/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── citation.py │ │ │ │ │ ├── citation_qa.py │ │ │ │ │ ├── citation_qa_inline.py │ │ │ │ │ ├── format_context.py │ │ │ │ │ └── utils.py │ │ │ │ ├── rankings/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cohere.py │ │ │ │ │ ├── llm.py │ │ │ │ │ ├── llm_scoring.py │ │ │ │ │ └── llm_trulens.py │ │ │ │ ├── retrievers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── jina_web_search.py │ │ │ │ │ └── tavily_web_search.py │ │ │ │ ├── splitters/ │ │ │ │ │ └── __init__.py │ │ │ │ └── vectorindex.py │ │ │ ├── llms/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── branching.py │ │ │ │ ├── chats/ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ ├── base.py │ │ │ │ │ ├── endpoint_based.py │ │ │ │ │ ├── langchain_based.py │ │ │ │ │ ├── llamacpp.py │ │ │ │ │ └── openai.py │ │ │ │ ├── completions/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── langchain_based.py │ │ │ │ ├── cot.py │ │ │ │ ├── linear.py │ │ │ │ └── prompts/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── template.py │ │ │ ├── loaders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── adobe_loader.py │ │ │ │ ├── azureai_document_intelligence_loader.py │ │ │ │ ├── base.py │ │ │ │ ├── composite_loader.py │ │ │ │ ├── docling_loader.py │ │ │ │ ├── docx_loader.py │ │ │ │ ├── excel_loader.py │ │ │ │ ├── html_loader.py │ │ │ │ ├── mathpix_loader.py │ │ │ │ ├── ocr_loader.py │ │ │ │ ├── pdf_loader.py │ │ │ │ ├── txt_loader.py │ │ │ │ ├── unstructured_loader.py │ │ │ │ ├── utils/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── adobe.py │ │ │ │ │ ├── box.py │ │ │ │ │ ├── gpt4v.py │ │ │ │ │ ├── pdf_ocr.py │ │ │ │ │ └── table.py │ │ │ │ └── web_loader.py │ │ │ ├── parsers/ │ │ │ │ ├── __init__.py │ │ │ │ └── regex_extractor.py │ │ │ ├── rerankings/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── cohere.py │ │ │ │ ├── tei_fast_rerank.py │ │ │ │ └── voyageai.py │ │ │ └── storages/ │ │ │ ├── __init__.py │ │ │ ├── docstores/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── elasticsearch.py │ │ │ │ ├── in_memory.py │ │ │ │ ├── lancedb.py │ │ │ │ └── simple_file.py │ │ │ └── vectorstores/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── chroma.py │ │ │ ├── in_memory.py │ │ │ ├── lancedb.py │ │ │ ├── milvus.py │ │ │ ├── qdrant.py │ │ │ └── simple_file.py │ │ ├── pyproject.toml │ │ ├── pytest.ini │ │ └── tests/ │ │ ├── __init__.py │ │ ├── _test_multimodal_reader.py │ │ ├── conftest.py │ │ ├── resources/ │ │ │ ├── dummy.docx │ │ │ ├── dummy.mhtml │ │ │ ├── dummy.xlsx │ │ │ ├── embedding_openai.json │ │ │ ├── embedding_openai_batch.json │ │ │ ├── fullocr_sample_output.json │ │ │ ├── ggml-vocab-llama.gguf │ │ │ ├── html/ │ │ │ │ └── dummy.html │ │ 
│ └── policy.md │ │ ├── simple_pipeline.py │ │ ├── test_agent.py │ │ ├── test_composite.py │ │ ├── test_cot.py │ │ ├── test_docstores.py │ │ ├── test_documents.py │ │ ├── test_embedding_models.py │ │ ├── test_indexing_retrieval.py │ │ ├── test_ingestor.py │ │ ├── test_llms_chat_models.py │ │ ├── test_llms_completion_models.py │ │ ├── test_mcp_manager.py │ │ ├── test_mcp_tools.py │ │ ├── test_post_processing.py │ │ ├── test_prompt.py │ │ ├── test_promptui.py │ │ ├── test_reader.py │ │ ├── test_reranking.py │ │ ├── test_splitter.py │ │ ├── test_table_reader.py │ │ ├── test_telemetry.py │ │ ├── test_template.py │ │ ├── test_tools.py │ │ └── test_vectorstore.py │ └── ktem/ │ ├── .gitignore │ ├── MANIFEST.in │ ├── alembic.ini │ ├── ktem/ │ │ ├── __init__.py │ │ ├── app.py │ │ ├── assets/ │ │ │ ├── __init__.py │ │ │ ├── css/ │ │ │ │ └── main.css │ │ │ ├── js/ │ │ │ │ ├── main.js │ │ │ │ └── pdf_viewer.js │ │ │ ├── md/ │ │ │ │ ├── about.md │ │ │ │ ├── changelogs.md │ │ │ │ └── usage.md │ │ │ └── theme.py │ │ ├── components.py │ │ ├── db/ │ │ │ ├── __init__.py │ │ │ ├── base_models.py │ │ │ ├── engine.py │ │ │ └── models.py │ │ ├── embeddings/ │ │ │ ├── __init__.py │ │ │ ├── db.py │ │ │ ├── manager.py │ │ │ └── ui.py │ │ ├── exceptions.py │ │ ├── extension_protocol.py │ │ ├── index/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── file/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── graph/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── graph_index.py │ │ │ │ │ ├── light_graph_index.py │ │ │ │ │ ├── lightrag_pipelines.py │ │ │ │ │ ├── nano_graph_index.py │ │ │ │ │ ├── nano_pipelines.py │ │ │ │ │ ├── pipelines.py │ │ │ │ │ └── visualize.py │ │ │ │ ├── index.py │ │ │ │ ├── knet/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── knet_index.py │ │ │ │ │ └── pipelines.py │ │ │ │ ├── pipelines.py │ │ │ │ ├── ui.py │ │ │ │ └── utils.py │ │ │ ├── manager.py │ │ │ ├── models.py │ │ │ └── ui.py │ │ ├── llms/ │ │ │ ├── __init__.py │ │ │ ├── db.py │ │ │ ├── 
manager.py │ │ │ └── ui.py │ │ ├── main.py │ │ ├── mcp/ │ │ │ ├── __init__.py │ │ │ ├── db.py │ │ │ ├── manager.py │ │ │ └── ui.py │ │ ├── pages/ │ │ │ ├── __init__.py │ │ │ ├── chat/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chat_panel.py │ │ │ │ ├── chat_suggestion.py │ │ │ │ ├── common.py │ │ │ │ ├── control.py │ │ │ │ ├── demo_hint.py │ │ │ │ ├── paper_list.py │ │ │ │ └── report.py │ │ │ ├── help.py │ │ │ ├── login.py │ │ │ ├── resources/ │ │ │ │ ├── __init__.py │ │ │ │ └── user.py │ │ │ ├── settings.py │ │ │ └── setup.py │ │ ├── reasoning/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── prompt_optimization/ │ │ │ │ ├── __init__.py │ │ │ │ ├── decompose_question.py │ │ │ │ ├── fewshot_rewrite_question.py │ │ │ │ ├── mindmap.py │ │ │ │ ├── rephrase_question_train.json │ │ │ │ ├── rewrite_question.py │ │ │ │ ├── suggest_conversation_name.py │ │ │ │ └── suggest_followup_chat.py │ │ │ ├── react.py │ │ │ ├── rewoo.py │ │ │ └── simple.py │ │ ├── rerankings/ │ │ │ ├── __init__.py │ │ │ ├── db.py │ │ │ ├── manager.py │ │ │ └── ui.py │ │ ├── settings.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── conversation.py │ │ ├── file.py │ │ ├── generator.py │ │ ├── hf_papers.py │ │ ├── lang.py │ │ ├── plantuml.py │ │ ├── rate_limit.py │ │ ├── render.py │ │ └── visualize_cited.py │ ├── ktem_tests/ │ │ ├── __init__.py │ │ ├── resources/ │ │ │ └── embedding_openai.json │ │ └── test_qa.py │ ├── migrations/ │ │ ├── README │ │ ├── env.py │ │ ├── script.py.mako │ │ └── versions/ │ │ └── .keep │ ├── pyproject.toml │ └── requirements.txt ├── mkdocs.yml ├── pyproject.toml ├── scripts/ │ ├── download_pdfjs.sh │ ├── migrate/ │ │ ├── __init__.py │ │ └── migrate_chroma_db.py │ ├── run_linux.sh │ ├── run_macos.sh │ ├── run_windows.bat │ ├── serve_local.py │ ├── server_llamacpp_linux.sh │ ├── server_llamacpp_macos.sh │ ├── server_llamacpp_windows.bat │ ├── update_linux.sh │ ├── update_macos.sh │ └── update_windows.bat ├── settings.yaml.example ├── sso_app.py ├── sso_app_demo.py 
└── templates/ ├── component-default/ │ └── README.md └── project-default/ ├── cookiecutter.json └── {{cookiecutter.project_name}}/ ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── setup.py ├── tests/ │ └── __init__.py └── {{cookiecutter.project_name}}/ ├── __init__.py └── pipeline.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .commitlintrc ================================================ { "extends": ["@commitlint/config-conventional"], "defaultIgnores": true, "rules": { "body-leading-blank": [1, "always"], "body-max-line-length": [2, "always", 100], "footer-leading-blank": [1, "always"], "footer-max-line-length": [2, "always", 10000], "header-max-length": [2, "always", 200], "subject-case": [ 2, "never", [] ], "subject-empty": [2, "never"], "subject-full-stop": [2, "never", "."], "type-case": [2, "always", "lower-case"], "type-empty": [2, "never"], "type-enum": [ 2, "always", [ "build", "chore", "ci", "docs", "feat", "fix", "perf", "refactor", "revert", "style", "test" ] ] } } ================================================ FILE: .dockerignore ================================================ .github/ .git/ .mypy_cache/ __pycache__/ ktem_app_data/ env/ .pre-commit-config.yaml .commitlintrc .gitignore .gitattributes README.md *.zip *.sh !/launch.sh ================================================ FILE: .gitattributes ================================================ *.bat text eol=crlf ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: "Bug Report" description: Report something that is not working as expected title: "[BUG] " labels: ["bug"] body: - type: markdown attributes: value: | *Please fill this form with as much information as possible.* - type: textarea id: description attributes: label: "Description" 
description: Please enter an explicit description of your issue placeholder: Short and explicit description of your incident... validations: required: true - type: textarea id: reprod attributes: label: "Reproduction steps" description: Please enter an explicit description of your issue value: | 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error render: bash validations: required: true - type: textarea id: screenshot attributes: label: "Screenshots" description: If applicable, add screenshots to help explain your problem. value: | ![DESCRIPTION](LINK.png) render: bash validations: required: false - type: textarea id: logs attributes: label: "Logs" description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. render: bash validations: required: false - type: dropdown id: browsers attributes: label: "Browsers" description: What browsers are you seeing the problem on ? multiple: true options: - Firefox - Chrome - Safari - Microsoft Edge - Opera - Brave - Other validations: required: false - type: dropdown id: os attributes: label: "OS" description: What is the impacted environment ? multiple: true options: - Windows - MacOS - Linux - Other validations: required: false - type: textarea id: additional_information attributes: label: "Additional information" description: Add any relevant information or context. 
placeholder: validations: required: false ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: "Feature Request" description: Brainstorm and propose new features for the project title: "[REQUEST] " labels: ["enhancement"] body: - type: markdown attributes: value: | *Please fill this form with as much information as possible.* - type: textarea id: reference_issues attributes: label: "Reference Issues" description: Common issues placeholder: "#Issues IDs" validations: required: false - type: textarea id: summary attributes: label: "Summary" description: Provide a brief explanation of the feature placeholder: Describe in a few lines your feature request validations: required: true - type: textarea id: basic_example attributes: label: "Basic Example" description: Indicate here some basic examples of your feature. placeholder: A few specific words about your feature request. validations: required: true - type: textarea id: drawbacks attributes: label: "Drawbacks" description: What are the drawbacks/impacts of your feature request ? placeholder: Identify the drawbacks and impacts while being neutral on your feature request validations: required: true - type: textarea id: additional_information attributes: label: "Additional information" description: Add any additional information that you think is important for your feature request placeholder: validations: required: false ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Description - Please include a summary of the changes and the related issue. - Fixes # (issue) ## Type of change - [ ] New features (non-breaking change). - [ ] Bug fix (non-breaking change). 
- [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected). ## Checklist - [ ] I have performed a self-review of my code. - [ ] I have added thorough tests if it is a core feature. - [ ] There is a reference to the original bug report and related work. - [ ] I have commented on my code, particularly in hard-to-understand areas. - [ ] The feature is well documented. ================================================ FILE: .github/workflows/auto-bump-and-release.yaml ================================================ name: Auto Bump and Release on: push: branches: - main jobs: auto-bump-and-release: runs-on: ubuntu-latest steps: - name: Clone the repo uses: actions/checkout@v4 with: fetch-depth: 0 - name: Update Application Version id: update-version uses: anothrNick/github-tag-action@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} WITH_V: true DEFAULT_BUMP: patch MAJOR_STRING_TOKEN: "bump:major" MINOR_STRING_TOKEN: "bump:minor" PATCH_STRING_TOKEN: "bump:patch" - name: Create release for ${{ steps.update-version.outputs.new_tag }} # need to repeat this if statement because Github Action doesn't support early # stopping for steps if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} run: | echo Create release folder mkdir kotaemon-app echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION cp LICENSE.txt kotaemon-app/ cp flowsettings.py kotaemon-app/ cp app.py kotaemon-app/ cp .env.example kotaemon-app/.env cp -r scripts kotaemon-app/ mkdir -p kotaemon-app/libs/ktem/ktem/ cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/ tree kotaemon-app zip -r kotaemon-app.zip kotaemon-app - name: Release ${{ steps.update-version.outputs.new_tag }} if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} uses: softprops/action-gh-release@v2 with: files: kotaemon-app.zip fail_on_unmatched_files: true token: ${{ secrets.GITHUB_TOKEN }} 
generate_release_notes: true tag_name: ${{ steps.update-version.outputs.new_tag }} make_latest: true - name: Setup latest branch locally without switching current branch if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} run: git fetch origin latest:latest - name: Update latest branch if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} run: | git branch -f latest tags/${{ steps.update-version.outputs.new_tag }} git checkout latest git push -f -u origin latest ================================================ FILE: .github/workflows/build-push-docker.yaml ================================================ name: Build and Push Docker Image on: release: types: - created push: tags: - "v[0-9]+.[0-9]+.[0-9]+" workflow_dispatch: env: REGISTRY: ghcr.io jobs: build: name: Build and push container runs-on: ubuntu-latest permissions: contents: read packages: write attestations: write id-token: write strategy: matrix: target: - lite - full - ollama # The maximum number of jobs that can run simultaneously max-parallel: 1 steps: - name: Free Disk Space (Ubuntu) uses: jlumbroso/free-disk-space@main with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB tool-cache: true # all of these default to true, but feel free to set to # "false" if necessary for your workflow android: true dotnet: true haskell: true large-packages: true docker-images: true swap-storage: true - name: Set repository and image name run: | echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>${GITHUB_ENV} env: IMAGE_NAME: "${{ github.repository }}" - name: Checkout uses: actions/checkout@v4 - name: Set up QEMU uses: docker/setup-qemu-action@v3 with: image: tonistiigi/binfmt:latest platforms: arm64,arm - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v3 - name: Set up Docker meta id: meta uses: docker/metadata-action@v5 with: images: ${{ env.FULL_IMAGE_NAME }} tags: | # 
branch type=ref,event=branch,suffix=-${{ matrix.target }} # semver with suffix for lite/full targets type=semver,pattern={{version}},suffix=-${{ matrix.target }} # latest tag with suffix for lite/full targets type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, 'pre') }},suffix=-${{ matrix.target }} flavor: | # This is disabled here so we can use the raw form above latest=false # Suffix is not used here since there's no way to disable it above - name: Log in to the Container registry uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build docker image uses: docker/build-push-action@v6 with: file: Dockerfile context: . push: true platforms: linux/amd64, linux/arm64 tags: | ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} target: ${{ matrix.target }} cache-from: type=gha cache-to: type=gha,mode=max ================================================ FILE: .github/workflows/pr-lint.yaml ================================================ name: "Lint PR" on: pull_request: types: - opened - edited - synchronize permissions: pull-requests: write jobs: pr-title: name: Validate PR title runs-on: ubuntu-latest permissions: write-all steps: - uses: amannn/action-semantic-pull-request@v5 id: lint_pr_title env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - uses: marocchino/sticky-pull-request-comment@v2 # When the previous steps fails, the workflow would stop. By adding this # condition you can continue the execution with the populated error message. if: always() && (steps.lint_pr_title.outputs.error_message != null) with: header: pr-title-lint-error message: | Hey there and thank you for opening this pull request! 👋🏼 We require pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted. 
Details: ``` ${{ steps.lint_pr_title.outputs.error_message }} ``` # Delete a previous comment when the issue has been resolved - if: ${{ steps.lint_pr_title.outputs.error_message == null }} uses: marocchino/sticky-pull-request-comment@v2 with: header: pr-title-lint-error delete: true commitlint: if: false # Disable this job for now name: Validate commit messages runs-on: ubuntu-latest permissions: write-all steps: - uses: actions/checkout@v4 - uses: wagoid/commitlint-github-action@v6 id: commitlint env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: configFile: ./.commitlintrc - uses: buildingcash/json-to-markdown-table-action@v1 if: always() && (steps.commitlint.outcome != 'success') id: table with: json: ${{ steps.commitlint.outputs.results }} - uses: marocchino/sticky-pull-request-comment@v2 if: always() && (steps.commitlint.outcome != 'success') with: header: commitlint-error message: | **All commits** in this PR need to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and [.commitlintrc](${{ github.server_url }}/${{ github.repository }}/blob/${{ github.head_ref || github.ref_name }}/.commitlintrc). 
Details: ${{ steps.table.outputs.table }} - if: ${{ steps.commitlint.outcome == 'success' }} uses: marocchino/sticky-pull-request-comment@v2 with: header: commitlint-error delete: true ================================================ FILE: .github/workflows/style-check.yaml ================================================ name: style-check on: pull_request: branches: [main, develop] push: branches: [main, develop] jobs: pre-commit: runs-on: ubuntu-latest steps: - name: Clone the repo uses: actions/checkout@v4 - name: Setup python uses: actions/setup-python@v4 with: python-version: "3.10" - name: run pre-commit uses: pre-commit/action@v3.0.0 ================================================ FILE: .github/workflows/unit-test.yaml ================================================ name: unit-test on: pull_request: branches: [main] push: branches: [main] env: THEFLOW_TEMP_PATH: ./tmp jobs: unit-test: # if: false # temporary disable this job due to legacy interface #TODO: enable this job after the new interface is ready if: ${{ !cancelled() }} runs-on: ${{ matrix.os }} timeout-minutes: 20 defaults: run: shell: ${{ matrix.shell }} strategy: matrix: python-version: ["3.10", "3.11"] include: - os: ubuntu-latest shell: bash ACTIVATE_ENV: ". 
env/bin/activate" GITHUB_OUTPUT: "$GITHUB_OUTPUT" # - os: windows-latest # shell: pwsh # ACTIVATE_ENV: env/Scripts/activate.ps1 # GITHUB_OUTPUT: "$env:GITHUB_OUTPUT" name: unit testing with python ${{ matrix.python-version }} steps: - name: Clone the repo uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: Get Head Commit Message id: get-head-commit-message run: echo "message=$(git show -s --format=%s)" | tee -a ${{ matrix.GITHUB_OUTPUT }} - name: Check ignore caching id: check-ignore-cache run: | ignore_cache=${{ contains(steps.get-head-commit-message.outputs.message, '[ignore cache]') }} echo "check=$ignore_cache" | tee -a ${{ matrix.GITHUB_OUTPUT }} - name: Set up Python ${{ matrix.python-version }} on ${{ runner.os }} uses: actions/setup-python@v4 id: setup_python with: python-version: ${{ matrix.python-version }} architecture: x64 - name: Install uv uses: astral-sh/setup-uv@v5 with: python-version: ${{ matrix.python-version }} enable-cache: true - name: Get cache key id: get-cache-key run: | pip install "setuptools-git-versioning>=2.0,<3" package_version=$(setuptools-git-versioning) cache_key="${{ runner.os }}-py${{ matrix.python-version }}-v${package_version}" echo "key=$cache_key" | tee -a ${{ matrix.GITHUB_OUTPUT }} - name: Try to restore dependencies from ${{ steps.get-cache-key.outputs.key }} id: restore-dependencies if: steps.check-ignore-cache.outputs.check != 'true' uses: actions/cache/restore@v3 with: path: ${{ env.pythonLocation }} key: ${{ steps.get-cache-key.outputs.key }} # could using cache of previous ver to reuse unchanged packages restore-keys: ${{ runner.os }}-py${{ matrix.python-version }} - name: Check cache hit id: check-cache-hit run: | echo "cache-hit=${{ steps.restore-dependencies.outputs.cache-hit }}" echo "cache-matched-key=${{ steps.restore-dependencies.outputs.cache-matched-key }}" cache_hit=${{ steps.restore-dependencies.outputs.cache-primary-key == 
steps.restore-dependencies.outputs.cache-matched-key }} echo "check=$cache_hit" | tee -a ${{ matrix.GITHUB_OUTPUT }} - name: Install additional dependencies (if any) run: | uv sync --frozen --no-cache - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} if: | steps.check-ignore-cache.outputs.check != 'true' && steps.check-cache-hit.outputs.check != 'true' uses: actions/cache/save@v3 with: path: ${{ env.pythonLocation }} key: ${{ steps.restore-dependencies.outputs.cache-primary-key }} - name: Install OS-based packages run: | sudo apt update -qqy sudo apt install -y poppler-utils libpoppler-dev tesseract-ocr - name: Test kotaemon with pytest run: | source .venv/bin/activate uv pip show pytest cd libs/kotaemon pytest ================================================ FILE: .gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm activate* activate/* kotaemon-env* .env ### Emacs ### # -*- mode: gitignore; -*- *~ \#*\# /.emacs.desktop /.emacs.desktop.lock *.elc auto-save-list tramp .\#* # Org-mode .org-id-locations *_archive # flymake-mode *_flymake.* # eshell files /eshell/history /eshell/lastdir # elpa packages /elpa/ # reftex files *.rel # AUCTeX auto folder /auto/ # cask packages .cask/ dist/ # Flycheck flycheck_*.el # server auth directory /server/ # projectiles files .projectile # directory configuration .dir-locals.el # network security /network-security.data ### Linux ### # temporary files which can be created if a process still has a handle open of a deleted file .fuse_hidden* # KDE directory preferences .directory # Linux trash folder which might appear on any partition or disk .Trash-* # .nfs files are created when an open file is removed but is still being accessed 
.nfs* ### macOS ### # General .DS_Store .AppleDouble .LSOverride # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk ### macOS Patch ### # iCloud generated files *.icloud ### PyCharm ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # AWS User-specific .idea/**/aws.xml # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. 
# .idea/artifacts # .idea/compiler.xml # .idea/jarRepositories.xml # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # SonarLint plugin .idea/sonarlint/ # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser ### PyCharm Patch ### # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 # *.iml # modules.xml # .idea/misc.xml # *.ipr # Sonarlint plugin # https://plugins.jetbrains.com/plugin/7973-sonarlint .idea/**/sonarlint/ # SonarQube Plugin # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin .idea/**/sonarIssues.xml # Markdown Navigator plugin # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced .idea/**/markdown-navigator.xml .idea/**/markdown-navigator-enh.xml .idea/**/markdown-navigator/ # Cache file creation bug # See https://youtrack.jetbrains.com/issue/JBR-2257 .idea/$CACHE_FILE$ # CodeStream plugin # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml # Azure Toolkit for IntelliJ plugin # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij .idea/**/azureSettings.xml ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller 
builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ### Python Patch ### # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration poetry.toml # ruff .ruff_cache/ # LSP config files pyrightconfig.json ### Vim ### # Swap [._]*.s[a-v][a-z] !*.svg # comment out if you don't need vector files [._]*.sw[a-p] [._]s[a-rt-v][a-z] [._]ss[a-gi-z] [._]sw[a-p] # Session Session.vim Sessionx.vim # Temporary .netrwhist # Auto-generated tag files tags # Persistent undo [._]*.un~ ### VisualStudioCode ### .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json !.vscode/*.code-snippets # Local History for Visual Studio Code .history/ # Built Visual Studio Code Extensions *.vsix ### VisualStudioCode Patch ### # Ignore all local history of files .history .ionide ### Windows ### # Windows thumbnail cache files Thumbs.db Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db # Dump file *.stackdump # Folder config file [Dd]esktop.ini # Recycle Bin used on file shares $RECYCLE.BIN/ # Windows Installer files *.cab *.msi *.msix *.msm *.msp # Windows shortcuts *.lnk # PDF files *.pdf !libs/kotaemon/tests/resources/*.pdf 
.theflow/ # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm *.py[coid] logs/ .gitsecret/keys/random_seed !*.secret .envrc .env S.gpg-agent* .vscode/settings.json examples/example1/assets storage/* # Conda and env storages *install_dir/ doc_env/ # application data ktem_app_data/ gradio_tmp/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: - id: check-yaml args: ["--unsafe"] - id: check-toml - id: end-of-file-fixer - id: trailing-whitespace - id: mixed-line-ending - id: detect-aws-credentials args: ["--allow-missing-credentials"] - id: detect-private-key - id: check-added-large-files args: ["--maxkb=750"] - id: debug-statements - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - id: black language_version: python3 - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: - id: isort args: ["--profile", "black"] language_version: python3.10 - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 args: ["--max-line-length", "88", "--extend-ignore", "E203"] - repo: https://github.com/myint/autoflake rev: v1.4 hooks: - id: autoflake args: [ "--in-place", "--remove-unused-variables", "--remove-all-unused-imports", "--ignore-init-module-imports", "--exclude=tests/*", ] - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.7.1 hooks: - id: prettier types_or: [markdown, yaml] - repo: https://github.com/pre-commit/mirrors-mypy rev: "v1.7.1" hooks: - id: mypy additional_dependencies: [ types-PyYAML==6.0.12.11, "types-requests", "sqlmodel", "types-Markdown", "types-cachetools", types-tzlocal, ] args: ["--check-untyped-defs", "--ignore-missing-imports"] exclude: "^templates/" - repo: https://github.com/codespell-project/codespell rev: v2.2.4 hooks: - id: codespell additional_dependencies: - tomli 
================================================ FILE: .python-version ================================================ 3.10 ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: - Demonstrating empathy and kindness toward other people - Being respectful of differing opinions, viewpoints, and experiences - Giving and gracefully accepting constructive feedback - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience - Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: - The use of sexualized language or imagery, and sexual attention or advances of any kind - Trolling, insulting or derogatory comments, and personal or political attacks - Public or private harassment - Publishing others' private information, such as a physical or email address, without their explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem 
inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at . All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. 
Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Kotaemon Welcome 👋 to the Kotaemon project! We're thrilled that you're interested in contributing. Whether you're fixing bugs, adding new features, or improving documentation, your efforts are highly appreciated. This guide aims to help you get started with contributing to Kotaemon. ### Table of Contents 1. [📖 Code of Conduct](#code-of-conduct) 2. 
[🔁 Contributing via Pull Requests](#-contributing-via-pull-requests) 3. [📥 Opening an Issue](#-opening-an-issue) 4. [📝 Commit Messages](#-commit-messages) 5. [🧾 License](#-license) ## 📖 Code of Conduct Please review our [code of conduct](./CODE_OF_CONDUCT.md), which is in effect at all times. We expect everyone who contributes to this project to honor it. ## 🔁 Contributing via Pull Requests 1. [**Fork the repository**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo): Click on the [Fork](https://github.com/Cinnamon/kotaemon/fork) button on the repository's page to create a copy of Kotaemon under your GitHub account. 2. [**Clone your code**](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository): Clone your forked repository to your local machine. 3. [**Create a new branch**](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop): Create a new branch in your forked repo with a descriptive name that reflects your changes. ```sh git checkout -b descriptive-name-for-your-changes ``` 4. **Set up the development environment**: If you are working on the code, make sure to install the necessary dependencies for development. ```sh pip install -e "libs/kotaemon[dev]" ``` 5. **Make your changes**: Ensure your code follows the project's coding style and passes all test cases. - Check the coding style ```sh pre-commit run --all-files ``` - Run the tests ```sh pytest libs/kotaemon/tests/ ``` 6. [**Commit your changes**](https://docs.github.com/en/desktop/making-changes-in-a-branch/committing-and-reviewing-changes-to-your-project-in-github-desktop): Once you are done with your changes, add and commit them with clear messages. ```sh git add your_changes.py git commit -m "clear message describing your changes." git push -u origin descriptive-name-for-your-changes ``` 7.
[**Create a pull request**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request): When you are satisfied with your changes, [submit a pull request](https://github.com/Cinnamon/kotaemon/compare) from your forked repository to the Kotaemon repository. In the pull request, provide a clear description of your changes and any related issues. For the title of the pull request, please refer to our [commit messages convention](#-commit-messages). 8. **Wait for reviews**: Wait for the maintainers to review your pull request. If everything is okay, your changes will be merged into the Kotaemon project. ### GitHub Actions CI Tests All pull requests must pass the [GitHub Actions Continuous Integration (CI)](https://docs.github.com/en/actions/about-github-actions/about-continuous-integration-with-github-actions) tests before they can be merged. These tests include coding-style checks, PR title validation, unit tests, etc. to ensure that your changes meet the project's quality standards. Please review and fix any CI failures that arise. ## 📥 Opening an Issue Before [creating an issue](https://github.com/Cinnamon/kotaemon/issues/new/choose), search through existing issues to ensure you are not opening a duplicate. If you are reporting a bug or issue, please provide a reproducible example to help us quickly identify the problem. ## 📝 Commit Messages ### Overview We use the [Angular convention](https://www.conventionalcommits.org/en/) for commit messages to maintain consistency and clarity in our project history. Please take a moment to familiarize yourself with this convention before making your first commit. _For the sake of simplicity, we use [squash merging](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) with pull requests.
Therefore, if you contribute via a pull request, just make sure your PR's title, rather than each individual commit, follows this convention._ Commit format: ```sh <type>(<scope>): <subject>
``` Examples: ```sh docs(api): update api doc ``` ### Commit types | Types | Description | | :--------- | :------------------------------------------------------------ | | `feat` | New features | | `fix` | Bug fix | | `docs` | Documentation only changes | | `build` | Changes that affect the build system or external dependencies | | `chore` | Something that doesn’t fit the other types | | `ci` | Changes to our CI configuration files and scripts | | `perf` | Improve performance | | `refactor` | Refactor code | | `revert` | Revert a previous commit | | `style` | Improve structure/format of the code | | `test` | Add, update or pass tests | ## 🧾 License All contributions will be licensed under the project's license: [Apache License 2.0](https://github.com/Cinnamon/kotaemon/blob/main/LICENSE.txt). ================================================ FILE: Dockerfile ================================================ # Lite version FROM python:3.10-slim AS lite # Common dependencies RUN apt-get update -qqy && \ apt-get install -y --no-install-recommends \ ssh \ git \ gcc \ g++ \ poppler-utils \ libpoppler-dev \ unzip \ curl \ cargo \ && \ apt-get autoremove && apt-get clean && rm -rf /var/lib/apt/lists/* # Setup args ARG TARGETPLATFORM ARG TARGETARCH # Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 ENV PYTHONIOENCODING=UTF-8 ENV TARGETARCH=${TARGETARCH} # Create working directory WORKDIR /app # Download pdfjs COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh RUN chmod +x /app/scripts/download_pdfjs.sh ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist" RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR # Install uv dependencies RUN pip install --no-cache-dir "uv" # Copy contents COPY . 
/app COPY launch.sh /app/launch.sh COPY .env.example /app/.env # Install pip packages RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ uv sync --frozen --no-cache \ && uv pip install --python .venv "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ if [ "$TARGETARCH" = "amd64" ]; then uv pip install --python .venv "graphrag<=0.3.6" future; fi ENTRYPOINT ["sh", "/app/launch.sh"] # Full version FROM lite AS full # Additional dependencies for full version RUN apt-get update -qqy && \ apt-get install -y --no-install-recommends \ tesseract-ocr \ tesseract-ocr-jpn \ libsm6 \ libxext6 \ libreoffice \ ffmpeg \ libmagic-dev \ && \ apt-get autoremove && apt-get clean && rm -rf /var/lib/apt/lists/* # Install torch and torchvision for unstructured RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --python .venv torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu # Install additional pip packages RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --python .venv "libs/kotaemon[adv]" \ && uv pip install --python .venv unstructured[all-docs] # Install lightRAG ENV USE_LIGHTRAG=true RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --python .venv aioboto3 nano-vectordb ollama xxhash "lightrag-hku<=1.3.0" RUN --mount=type=ssh \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --python .venv "docling<=2.5.2" # Download NLTK data from LlamaIndex RUN /app/.venv/bin/python -c "from llama_index.core.readers.base import BaseReader" ENTRYPOINT ["sh", "/app/launch.sh"] # Ollama-bundled version FROM full AS ollama # Install ollama RUN curl -fsSL https://ollama.com/install.sh | sh # RUN nohup bash -c "ollama serve &" && sleep 4 && ollama pull qwen2.5:7b RUN nohup bash -c "ollama serve &" && sleep 4 && ollama pull 
nomic-embed-text ENTRYPOINT ["sh", "/app/launch.sh"] ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================
# kotaemon An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and developers in mind. ![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview-graph.png) Cinnamon%2Fkotaemon | Trendshift [Live Demo #1](https://huggingface.co/spaces/cin-model/kotaemon) | [Live Demo #2](https://huggingface.co/spaces/cin-model/kotaemon-demo) | [Online Install](https://cinnamon.github.io/kotaemon/online_install/) | [Colab Notebook (Local RAG)](https://colab.research.google.com/drive/1eTfieec_UOowNizTJA1NjawBJH9y_1nn) [User Guide](https://cinnamon.github.io/kotaemon/) | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | [Feedback](https://github.com/Cinnamon/kotaemon/issues) | [Contact](mailto:kotaemon.support@cinnamon.is) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-31013/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) docker pull ghcr.io/cinnamon/kotaemon:latest ![download](https://img.shields.io/github/downloads/Cinnamon/kotaemon/total.svg?label=downloads&color=blue) Featured|HelloGitHub
## Introduction This project serves as a functional RAG UI for both end users who want to do QA on their documents and developers who want to build their own RAG pipeline.
```yml +----------------------------------------------------------------------------+ | End users: Those who use apps built with `kotaemon`. | | (You use an app like the one in the demo above) | | +----------------------------------------------------------------+ | | | Developers: Those who built with `kotaemon`. | | | | (You have `import kotaemon` somewhere in your project) | | | | +----------------------------------------------------+ | | | | | Contributors: Those who make `kotaemon` better. | | | | | | (You make PR to this repo) | | | | | +----------------------------------------------------+ | | | +----------------------------------------------------------------+ | +----------------------------------------------------------------------------+ ``` ### For end users - **Clean & Minimalistic UI**: A user-friendly interface for RAG-based QA. - **Support for Various LLMs**: Compatible with LLM API providers (OpenAI, AzureOpenAI, Cohere, etc.) and local LLMs (via `ollama` and `llama-cpp-python`). - **Easy Installation**: Simple scripts to get you started quickly. ### For developers - **Framework for RAG Pipelines**: Tools to build your own RAG-based document QA pipeline. - **Customizable UI**: See your RAG pipeline in action with the provided UI, built with Gradio . - **Gradio Theme**: If you use Gradio for development, check out our theme here: [kotaemon-gradio-theme](https://github.com/lone17/kotaemon-gradio-theme). ## Key Features - **Host your own document QA (RAG) web-UI**: Support multi-user login, organize your files in private/public collections, collaborate and share your favorite chat with others. - **Organize your LLM & Embedding models**: Support both local LLMs & popular API providers (OpenAI, Azure, Ollama, Groq). - **Hybrid RAG pipeline**: Sane default RAG pipeline with hybrid (full-text & vector) retriever and re-ranking to ensure best retrieval quality. 
- **Multi-modal QA support**: Perform Question Answering on multiple documents with figures and tables support. Support multi-modal document parsing (selectable options on UI). - **Advanced citations with document preview**: By default the system will provide detailed citations to ensure the correctness of LLM answers. View your citations (incl. relevance score) directly in the _in-browser PDF viewer_ with highlights. A warning is shown when the retrieval pipeline returns articles with low relevance. - **Support complex reasoning methods**: Use question decomposition to answer your complex/multi-hop question. Support agent-based reasoning with `ReAct`, `ReWOO` and other agents. - **Configurable settings UI**: You can adjust the most important aspects of the retrieval & generation process on the UI (incl. prompts). - **Extensible**: Because it is built on Gradio, you are free to customize or add any UI elements as you like. Also, we aim to support multiple strategies for document indexing & retrieval. The `GraphRAG` indexing pipeline is provided as an example. ![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview.png) ## Installation > If you are not a developer and just want to use the app, please check out our easy-to-follow [User Guide](https://cinnamon.github.io/kotaemon/). Download the `.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest) to get all the newest features and bug fixes. ### System requirements 1. [Python](https://www.python.org/downloads/) >= 3.10 2. [Docker](https://www.docker.com/): optional, if you [install with Docker](#with-docker-recommended) 3. [Unstructured](https://docs.unstructured.io/open-source/installation/full-installation#full-installation) if you want to process files other than `.pdf`, `.html`, `.mhtml`, and `.xlsx` documents. Installation steps differ depending on your operating system. Please visit the link and follow the specific instructions provided there. ### With Docker (recommended) 1.
We support both `lite` & `full` version of Docker images. With `full` version, the extra packages of `unstructured` will be installed, which can support additional file types (`.doc`, `.docx`, ...) but the cost is larger docker image size. For most users, the `lite` image should work well in most cases. - To use the `full` version. ```bash docker run \ -e GRADIO_SERVER_NAME=0.0.0.0 \ -e GRADIO_SERVER_PORT=7860 \ -v ./ktem_app_data:/app/ktem_app_data \ -p 7860:7860 -it --rm \ ghcr.io/cinnamon/kotaemon:main-full ``` - To use the `full` version with bundled **Ollama** for _local / private RAG_. ```bash # change image name to docker run <...> ghcr.io/cinnamon/kotaemon:main-ollama ``` - To use the `lite` version. ```bash # change image name to docker run <...> ghcr.io/cinnamon/kotaemon:main-lite ``` 2. We currently support and test two platforms: `linux/amd64` and `linux/arm64` (for newer Mac). You can specify the platform by passing `--platform` in the `docker run` command. For example: ```bash # To run docker with platform linux/arm64 docker run \ -e GRADIO_SERVER_NAME=0.0.0.0 \ -e GRADIO_SERVER_PORT=7860 \ -v ./ktem_app_data:/app/ktem_app_data \ -p 7860:7860 -it --rm \ --platform linux/arm64 \ ghcr.io/cinnamon/kotaemon:main-lite ``` 3. Once everything is set up correctly, you can go to `http://localhost:7860/` to access the WebUI. 4. We use [GHCR](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) to store docker images, all images can be found [here.](https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon) ### Without Docker #### Option 1: Using uv (Recommended for faster installation) 1. 
Clone the repository and run the uv installation script: ```shell # clone this repo git clone https://github.com/Cinnamon/kotaemon cd kotaemon # run the uv installation script (installs uv automatically if not present) bash scripts/run_uv.sh ``` This script will: - Install uv package manager if not present - Create a virtual environment with Python 3.10 - Install all dependencies using uv (significantly faster than conda/pip) - Set up PDF.js viewer - Launch the application #### Option 2: Using conda (Traditional method) 1. Clone and install required packages on a fresh python environment. ```shell # optional (setup env) conda create -n kotaemon python=3.10 conda activate kotaemon # clone this repo git clone https://github.com/Cinnamon/kotaemon cd kotaemon pip install -e "libs/kotaemon[all]" pip install -e "libs/ktem" ``` 2. Create a `.env` file in the root of this project. Use `.env.example` as a template The `.env` file is there to serve use cases where users want to pre-config the models before starting up the app (e.g. deploy the app on HF hub). The file will only be used to populate the db once upon the first run, it will no longer be used in consequent runs. 3. (Optional) To enable in-browser `PDF_JS` viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) then extract it to `libs/ktem/ktem/assets/prebuilt` pdf-setup 4. Start the web server: ```shell python app.py ``` - The app will be automatically launched in your browser. - Default username and password are both `admin`. You can set up additional users directly through the UI. ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) 5. Check the `Resources` tab and `LLMs and Embeddings` and ensure that your `api_key` value is set correctly from your `.env` file. If it is not set, you can set it there. ### Setup GraphRAG > [!NOTE] > Official MS GraphRAG indexing only works with OpenAI or Ollama API. 
> We recommend that most users use the NanoGraphRAG implementation for straightforward integration with Kotaemon.
Setup Nano GRAPHRAG - Install nano-GraphRAG: `pip install nano-graphrag` - Installing `nano-graphrag` might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440) - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib` - Launch Kotaemon with the `USE_NANO_GRAPHRAG=true` environment variable. - Set your default LLM & Embedding models in the Resources settings and they will be recognized automatically by NanoGraphRAG.
Setup LIGHTRAG - Install LightRAG: `pip install git+https://github.com/HKUDS/LightRAG.git` - Installing `LightRAG` might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440) - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib` - Launch Kotaemon with the `USE_LIGHTRAG=true` environment variable. - Set your default LLM & Embedding models in the Resources settings and they will be recognized automatically by LightRAG.
Setup MS GRAPHRAG - **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command: ```shell pip install "graphrag<=0.3.6" future ``` - **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file. - **Using Local Models and Custom Settings**: If you want to use GraphRAG with local models (like `Ollama`) or customize the default LLM and other configurations, set the `USE_CUSTOMIZED_GRAPHRAG_SETTING` environment variable to true. Then, adjust your settings in the `settings.yaml.example` file.
### Setup Local Models (for local/private RAG) See [Local model setup](docs/local_model.md). ### Setup multimodal document parsing (OCR, table parsing, figure extraction) These options are available: - [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) - [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/) - [Docling (local, open-source)](https://github.com/DS4SD/docling) - To use Docling, first install required dependencies: `pip install docling` Select corresponding loaders in `Settings -> Retrieval Settings -> File loader` ### Customize your application - By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine. - For advanced users or specific use cases, you can customize these files: - `flowsettings.py` - `.env` #### `flowsettings.py` This file contains the configuration of your application. You can use the example [here](flowsettings.py) as the starting point.
Notable settings ```python # setup your preferred document store (with full-text search capabilities) KH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore) # setup your preferred vectorstore (for vector-based search) KH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Milvus | Qdrant) # Enable / disable multimodal QA KH_REASONINGS_USE_MULTIMODAL=True # Setup your new reasoning pipeline or modify existing one. KH_REASONINGS = [ "ktem.reasoning.simple.FullQAPipeline", "ktem.reasoning.simple.FullDecomposeQAPipeline", "ktem.reasoning.react.ReactAgentPipeline", "ktem.reasoning.rewoo.RewooAgentPipeline", ] ```
#### `.env` This file provides another way to configure your models and credentials.
Configure model via the .env file - Alternatively, you can configure the models via the `.env` file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one. - Currently, the following providers are supported: - **OpenAI** In the `.env` file, set the `OPENAI_API_KEY` variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified, please feel free to edit them to fit your case. Otherwise, the default parameters should work for most people. ```shell OPENAI_API_BASE=https://api.openai.com/v1 OPENAI_API_KEY= OPENAI_CHAT_MODEL=gpt-3.5-turbo OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002 ``` - **Azure OpenAI** For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide your deployments' names for the chat model and the embedding model depending on how you set up your Azure deployment. ```shell AZURE_OPENAI_ENDPOINT= AZURE_OPENAI_API_KEY= OPENAI_API_VERSION=2024-02-15-preview AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 ``` - **Local Models** - Using `ollama` OpenAI compatible server: - Install [ollama](https://github.com/ollama/ollama) and start the application. - Pull your model, for example: ```shell ollama pull llama3.1:8b ollama pull nomic-embed-text ``` - Set the model names on the web UI and make them the default: ![Models](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/models.png) - Using `GGUF` with `llama-cpp-python` You can search and download an LLM to be run locally from the [Hugging Face Hub](https://huggingface.co/models). Currently, these model formats are supported: - GGUF You should choose a model whose size is less than your device's memory and should leave about 2 GB. 
For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time. Here are some recommendations and their size in memory: - [Qwen1.5-1.8B-Chat-GGUF](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true): around 2 GB Add a new LlamaCpp model with the provided model name on the web UI.
### Adding your own RAG pipeline #### Custom Reasoning Pipeline 1. Check the default pipeline implementation [here](libs/ktem/ktem/reasoning/simple.py). You can make quick adjustments to how the default QA pipeline works. 2. Add a new `.py` implementation in `libs/ktem/ktem/reasoning/` and later include it in `flowsettings.py` to enable it on the UI. #### Custom Indexing Pipeline - Check the sample implementation in `libs/ktem/ktem/index/file/graph` > (more instructions WIP). ## Citation Please cite this project as ```BibTeX @misc{kotaemon2024, title = {Kotaemon - An open-source RAG-based tool for chatting with any content.}, author = {The Kotaemon Team}, year = {2024}, howpublished = {\url{https://github.com/Cinnamon/kotaemon}}, } ``` ## Star History Star History Chart ## Contribution Since our project is actively being developed, we greatly value your feedback and contributions. Please see our [Contributing Guide](https://github.com/Cinnamon/kotaemon/blob/main/CONTRIBUTING.md) to get started. Thank you to all our contributors! 
================================================ FILE: app.py ================================================ import os from theflow.settings import settings as flowsettings KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") KH_GRADIO_SHARE = getattr(flowsettings, "KH_GRADIO_SHARE", False) GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) # override GRADIO_TEMP_DIR if it's not set if GRADIO_TEMP_DIR is None: GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR from ktem.main import App # noqa app = App() demo = app.make() demo.queue().launch( favicon_path=app._favicon, inbrowser=True, allowed_paths=[ "libs/ktem/ktem/assets", GRADIO_TEMP_DIR, ], share=KH_GRADIO_SHARE, ) ================================================ FILE: doc_env_reqs.txt ================================================ mkdocs mkdocstrings[python] mkdocs-material mkdocs-gen-files mkdocs-literate-nav mkdocs-git-revision-date-localized-plugin mkdocs-section-index mkdocs-include-markdown-plugin[cache] mdx_truly_sane_lists ================================================ FILE: docs/about.md ================================================ # About Kotaemon An open-source tool for chatting with your documents. Built with both end users and developers in mind. 
[Source Code](https://github.com/Cinnamon/kotaemon) | [HF Space](https://huggingface.co/spaces/cin-model/kotaemon-demo) [Installation Guide](https://cinnamon.github.io/kotaemon/) | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | [Feedback](https://github.com/Cinnamon/kotaemon/issues) ================================================ FILE: docs/development/contributing.md ================================================ # Contributing ## Setting up - Clone the repo ```shell git clone git@github.com:Cinnamon/kotaemon.git cd kotaemon ``` - Install the environment - Create a conda environment (python >= 3.10 is recommended) ```shell conda create -n kotaemon python=3.10 conda activate kotaemon # install dependencies cd libs/kotaemon pip install -e ".[all]" ``` - Or run the installer (one of the `scripts/run_*` scripts depends on your OS), then you will have all the dependencies installed as a conda environment at `install_dir/env`. ```shell conda activate install_dir/env ``` - Pre-commit ```shell pre-commit install ``` - Test ```shell pytest tests ``` ## Package overview `kotaemon` library focuses on the AI building blocks to implement a RAG-based QA application. It consists of base interfaces, core components and a list of utilities: - Base interfaces: `kotaemon` defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly define this interface, a pipeline of steps can be easily constructed and orchestrated. - Core components: `kotaemon` implements (or wraps 3rd-party libraries like Langchain, llama-index,... when possible) commonly used components in kotaemon use cases. Some of these components are: LLM, vector store, document store, retriever... For a detailed list and description of these components, please refer to the [API Reference](../reference/Summary.md) section. - List of utilities: `kotaemon` provides utilities and tools that are usually needed in client project. 
For example, it provides a prompt engineering UI for AI developers in a project to quickly create a prompt engineering tool for DMs and QALs. It also provides a command to quickly spin up a project code base. For a full list and description of these utilities, please refer to the [Utilities](utilities.md) section. ```mermaid mindmap root((kotaemon)) Base Interfaces Document LLMInterface RetrievedDocument BaseEmbeddings BaseChat BaseCompletion ... Core Components LLMs AzureOpenAI OpenAI Embeddings AzureOpenAI OpenAI HuggingFaceEmbedding VectorStore InMemoryVectorstore ChromaVectorstore Agent Tool DocumentStore ... Utilities Scaffold project PromptUI Documentation Support ``` ## Common conventions - PR title: One-line description (example: Feat: Declare BaseComponent and decide LLM call interface). - [Encouraged] Provide a quick description in the PR, so that: - Reviewers can quickly understand the direction of the PR. - It will be included in the commit message when the PR is merged. ## Environment caching on PR - To speed up CI, environments are cached based on the version specified in `__init__.py`. - Since dependency versions in `setup.py` are not pinned, you need to bump the version in order to use a new environment. That environment will then be cached and used by your subsequent commits within the PR, until you bump the version again. - The new environment created during your PR is cached and will be available to others once the PR is merged. - If you are experimenting with new dependencies and want a fresh environment every time, add `[ignore cache]` in your commit message. The CI will create a fresh environment to run your commit and then discard it. - If your PR includes updated dependencies, the recommended workflow would be: - Doing development as usual. - When you want to run the CI, push a commit with the message containing `[ignore cache]`. 
- Once the PR is final, bump the version in `__init__.py` and push a final commit not containing `[ignore cache]`. ## Merge PR guideline - Use squash and merge option - 1st line message is the PR title. - The text area is the PR description. ================================================ FILE: docs/development/create-a-component.md ================================================ # Creating a component A fundamental concept in kotaemon is "component". Anything that isn't data or data structure is a "component". A component can be thought of as a step within a pipeline. It takes in some input, processes it, and returns an output, just the same as a Python function! The output will then become an input for the next component in a pipeline. In fact, a pipeline is just a component. More appropriately, a nested component: a component that makes use of one or more other components in the processing step. So in reality, there isn't a difference between a pipeline and a component! Because of that, in kotaemon, we will consider them the same as "component". To define a component, you will: 1. Create a class that subclasses from `kotaemon.base.BaseComponent` 2. Declare init params with type annotation 3. Declare nodes (nodes are just other components!) with type annotation 4. Implement the processing logic in `run`. 
The syntax of a component is as follows: ```python from kotaemon.base import BaseComponent from kotaemon.llms import LCAzureChatOpenAI from kotaemon.parsers import RegexExtractor class FancyPipeline(BaseComponent): param1: str = "This is param1" param2: int = 10 param3: float node1: BaseComponent # this is a node because of BaseComponent type annotation node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent node3: RegexExtractor # this is also a node because RegexExtractor subclasses BaseComponent def run(self, some_text: str): prompt = (self.param1 + some_text) * int(self.param2 + self.param3) llm_pred = self.node2(prompt).text matches = self.node3(llm_pred) return matches ``` Then this component can be used as follows: ```python llm = LCAzureChatOpenAI(endpoint="some-endpoint") extractor = RegexExtractor(pattern=["yes", "Yes"]) component = FancyPipeline( param1="Hello", param3=1.5, node1=llm, node2=llm, node3=extractor ) component("goodbye") ``` This way, we can define each operation as a reusable component, and use them to compose larger reusable components! ## Benefits of component By defining a component as above, we formally encapsulate all the necessary information inside a single class. This introduces several benefits: 1. Allow tools like promptui to inspect the inner workings of a component in order to automatically generate the promptui. 2. Allow visualizing a pipeline for debugging purposes. ================================================ FILE: docs/development/data-components.md ================================================ # Data & Data Structure Components The data & data structure components include: - The `Document` class. - The document store. - The vector store. 
## Data Loader - PdfLoader - Layout-aware with table parsing PdfLoader - MathPixLoader: To use this loader, you need MathPix API key, refer to [mathpix docs](https://docs.mathpix.com/#introduction) for more information - OCRLoader: This loader uses lib-table and Flax pipeline to perform OCR and read table structure from PDF file (TODO: add more info about deployment of this module). - Output: - Document: text + metadata to identify whether it is table or not ``` - "source": source file name - "type": "table" or "text" - "table_origin": original table in markdown format (to be feed to LLM or visualize using external tools) - "page_label": page number in the original PDF document ``` ## Document Store - InMemoryDocumentStore ## Vector Store - ChromaVectorStore - InMemoryVectorStore ================================================ FILE: docs/development/index.md ================================================ {% include-markdown "../../README.md" start="" end="" %} ================================================ FILE: docs/development/utilities.md ================================================ # Utilities ## Prompt engineering UI ![chat-ui](images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png) **_Important:_** despite the name prompt engineering UI, this tool allows testers to test any kind of parameters that are exposed by developers. Prompt is one kind of param. There can be other type of params that testers can tweak (e.g. top_k, temperature...). In the development process, developers typically build the pipeline. However, for use cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more effective. To facilitate this, `kotaemon` offers a user-friendly prompt engineering UI that developers integrate into their pipelines. This enables non-technical members to adjust prompts and parameters, run experiments, and export results for optimization. 
As of Sept 2023, there are 2 kinds of prompt engineering UI: - Simple pipeline: run one-way from start to finish. - Chat pipeline: interactive back-and-forth. ### Simple pipeline For simple pipeline, the supported client project workflow looks as follow: 1. [tech] Build pipeline 2. [tech] Export pipeline to config: `$ kotaemon promptui export --output ` 3. [tech] Customize the config 4. [tech] Spin up prompt engineering UI: `$ kotaemon promptui run ` 5. [non-tech] Change params, run inference 6. [non-tech] Export to Excel 7. [non-tech] Select the set of params that achieve the best output The prompt engineering UI prominently involves from step 2 to step 7 (step 1 is normally done by the developers, while step 7 happens exclusively in Excel file). #### Step 2 - Export pipeline to config Command: ```shell $ kotaemon promptui export --output ``` where: - `` is a dot-separated path to the pipeline. For example, if your pipeline can be accessed with `from projectA.pipelines import AnsweringPipeline`, then this value is `projectA.pipelines.AnswerPipeline`. - `` is the target file path that the config will be exported to. If the config file already exists, and contains information of other pipelines, the config of current pipeline will additionally be added. If it contains information of the current pipeline (in the past), the old information will be replaced. By default, all params in a pipeline (including nested params) will be export to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those param with `ignore_ui=True`, and they will be ignored in the config generation process. Example: ```python class Pipeline(BaseComponent): param1: str = Param(default="hello") param2: str = Param(default="goodbye", ignore_ui=True) ``` Declared as above, and `param1` will show up in the config YAML file, while `param2` will not. 
#### Step 3 - Customize the config developers can further edit the config file in this step to get the most suitable UI (step 4) with their tasks. The exported config will have this overall schema: ```yml : params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.) inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.) outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.) logs: ... (Detail what information should show up in the log.) ``` ##### Input and params The inputs section have the overall schema as follow: ```yml inputs: : component: params: # this section is optional) value: : ... # similar to above params: : ... # similar to those in the inputs ``` The list of supported prompt UI and their corresponding gradio UI components: ```python COMPONENTS_CLASS = { "text": gr.components.Textbox, "checkbox": gr.components.CheckboxGroup, "dropdown": gr.components.Dropdown, "file": gr.components.File, "image": gr.components.Image, "number": gr.components.Number, "radio": gr.components.Radio, "slider": gr.components.Slider, } ``` ##### Outputs The outputs are a list of variables that we wish to show in the UI. Since in Python, the function output doesn't have variable name, so output declaration is a little bit different than input and param declaration: ```yml outputs: - component: step: item: - ... # similar to above ``` where: - component: the same text string and corresponding Gradio UI as in inputs & params - step: the pipeline step that we wish to look fetch and show output on the UI - item: the jsonpath mechanism to get the targeted variable from the step above ##### Logs The logs show a list of sheetname and how to retrieve the desired information. ```yml logs: : inputs: - name: step: variable: - ... 
outputs: - name: step: item: ``` #### Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering Command: ```shell $ kotaemon promptui run ``` This will generate an UI as follow: ![Screenshot from 2023-09-20 12-20-31](images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png) where: - The tabs at the top of the UI corresponds to the pipeline to do prompt engineering. - The inputs and params tabs allow users to edit (these corresponds to the inputs and params in the config file). - The outputs panel holds the UI elements to show the outputs defined in config file. - The Run button: will execute pipeline with the supplied inputs and params, and render result in the outputs panel. - The Export button: will export the logs of all the run to an Excel files users to inspect for best set of params. #### Step 6 - Export to Excel Upon clicking export, the users can download Excel file. ### Chat pipeline Chat pipeline workflow is different from simple pipeline workflow. In simple pipeline, each Run creates a set of output, input and params for users to compare. In chat pipeline, each Run is not a one-off run, but a long interactive session. Hence, the workflow is as follow: 1. Set the desired parameters. 2. Click "New chat" to start a chat session with the supplied parameters. This set of parameters will persist until the end of the chat session. During an ongoing chat session, changing the parameters will not take any effect. 3. Chat and interact with the chat bot on the right panel. You can add any additional input (if any), and they will be supplied to the chatbot. 4. During chat, the log of the chat will show up in the "Output" tabs. This is empty by default, so if you want to show the log here, tell the AI developers to configure the UI settings. 5. When finishing chat, select your preference in the radio box. Click "End chat". This will save the chat log and the preference to disk. 6. 
To compare the result of different run, click "Export" to get an Excel spreadsheet summary of different run. ================================================ FILE: docs/extra/css/code_select.css ================================================ .language-pycon .gp, .language-pycon .go { /* Generic.Prompt, Generic.Output */ user-select: none; } ================================================ FILE: docs/index.md ================================================ # Getting Started with Kotaemon ![type:video](https://github.com/Cinnamon/kotaemon/assets/25688648/815ecf68-3a02-4914-a0dd-3f8ec7e75cd9) This page is intended for **end users** who want to use the `kotaemon` tool for Question Answering on local documents. If you are a **developer** who wants contribute to the project, please visit the [development](development/index.md) page. ## Installation (Online HuggingFace Space) - easy (10 mins) Visit this [guide](online_install.md). ## Installation (Offline) - intermediate (20 mins) ### Download Download the `kotaemon-app.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest/). ### Run setup script 0. Unzip the downloaded file. 1. Navigate to the `scripts` folder and start an installer that matches your OS: - Windows: `run_windows.bat`. Just double click the file. - macOS: `run_macos.sh` 1. Right click on your file and select Open with and Other. 2. Enable All Applications and choose Terminal. 3. NOTE: If you always want to open that file with Terminal, then check Always Open With. 4. From now on, double click on your file and it should work. - Linux: `run_linux.sh`. Please run the script using `bash run_linux.sh` in your terminal. 2. After the installation, the installer will ask to launch the ktem's UI, answer to continue. 3. If launched, the application will be open automatically in your browser. 4. Default login information is: `username: admin / password: admin`. 
You should change this credential right after the first login on the UI. ## Launch To launch the app after initial setup or any change, simply run the `run_*` script again. A browser window will be opened and greets you with this screen: ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) ## Usage For how to use the application, see [Usage](usage.md). This page will also be available to you within the application. ## Feedback Feel free to create a bug report or a feature request on our [repo](https://github.com/Cinnamon/kotaemon/issues). ================================================ FILE: docs/local_model.md ================================================ # Setup local LLMs & Embedding models ## Prepare local models #### NOTE In the case of using Docker image, please replace `http://localhost` with `http://host.docker.internal` to correctly communicate with service on the host machine. See [more detail](https://stackoverflow.com/questions/31324981/how-to-access-host-port-from-docker-container). ### Ollama OpenAI compatible server (recommended) Install [ollama](https://github.com/ollama/ollama) and start the application. Pull your model (e.g): ``` ollama pull llama3.1:8b ollama pull nomic-embed-text ``` Setup LLM and Embedding model on Resources tab with type OpenAI. Set these model parameters to connect to Ollama: ``` api_key: ollama base_url: http://localhost:11434/v1/ model: gemma2:2b (for llm) | nomic-embed-text (for embedding) ``` ![Models](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/models.png) ### oobabooga/text-generation-webui OpenAI compatible server Install [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui/). Follow the setup guide to download your models (GGUF, HF). Also take a look at [OpenAI compatible server](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API) for detail instructions. 
Here is a short version ``` # install sentence-transformer for embeddings creation pip install sentence_transformers # change to text-generation-webui src dir python server.py --api ``` Use the `Models` tab to download new model and press Load. Setup LLM and Embedding model on Resources tab with type OpenAI. Set these model parameters to connect to `text-generation-webui`: ``` api_key: dummy base_url: http://localhost:5000/v1/ model: any ``` ### llama-cpp-python server (LLM only) See [llama-cpp-python OpenAI server](https://llama-cpp-python.readthedocs.io/en/latest/server/). Download any GGUF model weight on HuggingFace or other source. Place it somewhere on your local machine. Run ``` LOCAL_MODEL= python scripts/serve_local.py ``` Setup LLM model on Resources tab with type OpenAI. Set these model parameters to connect to `llama-cpp-python`: ``` api_key: dummy base_url: http://localhost:8000/v1/ model: model_name ``` ## Use local models for RAG - Set default LLM and Embedding model to a local variant. ![Models](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/llm-default.png) - Set embedding model for the File Collection to a local model (e.g: `ollama`) ![Index](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/index-embedding.png) - Go to Retrieval settings and choose LLM relevant scoring model as a local model (e.g: `ollama`). Or, you can choose to disable this feature if your machine cannot handle a lot of parallel LLM requests at the same time. ![Settings](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/retrieval-setting.png) You are set! Start a new conversation to test your local RAG pipeline. ================================================ FILE: docs/online_install.md ================================================ ## Installation (Online HuggingFace Space) 1. Go to [HF kotaemon_template](https://huggingface.co/spaces/cin-model/kotaemon_template). 2. 
Use Duplicate function to create your own space. Or use this [direct link](https://huggingface.co/spaces/cin-model/kotaemon_template?duplicate=true). ![Duplicate space](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/duplicate_space.png) ![Change space params](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/change_space_params.png) 3. Wait for the build to complete and start up (apprx 10 mins). ![Wait space build](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/space_build.png) ![Close space build](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/close_logs_space.png) 4. Follow the first setup instructions (and register for Cohere API key if needed). ![Cohere API](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/cohere_api_key.png) 5. Complete the setup and use your own private space! ![App Startup](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/initial_startup.png) ================================================ FILE: docs/pages/app/customize-flows.md ================================================ # Add new indexing and reasoning pipeline to the application @trducng At high level, to add new indexing and reasoning pipeline: 1. You define your indexing or reasoning pipeline as a class from `BaseComponent`. 2. You declare that class in the setting files `flowsettings.py`. Then when `python app.py`, the application will dynamically load those pipelines. The below sections talk in more detail about how the pipelines should be constructed. ## Define a pipeline as a class In essence, a pipeline will subclass from `kotaemon.base.BaseComponent`. Each pipeline has 2 main parts: - All declared arguments and sub-pipelines. - The logic inside the pipeline. 
An example pipeline: ```python from kotaemon.base import BaseComponent class SoSimple(BaseComponent): arg1: int arg2: str def run(self, arg3: str): return self.arg1 * self.arg2 + arg3 ``` This pipeline is simple for demonstration purpose, but we can imagine pipelines with much more arguments, that can take other pipelines as arguments, and have more complicated logic in the `run` method. **_An indexing or reasoning pipeline is just a class subclass from `BaseComponent` like above._** For more detail on this topic, please refer to [Creating a Component](/create-a-component/) ## Run signatures **Note**: this section is tentative at the moment. We will finalize `def run` function signature by latest early April. The indexing pipeline: ```python def run( self, file_paths: str | Path | list[str | Path], reindex: bool = False, **kwargs, ): """Index files to intermediate representation (e.g. vector, database...) Args: file_paths: the list of paths to files reindex: if True, files in `file_paths` that already exists in database should be reindex. """ ``` The reasoning pipeline: ```python def run(self, question: str, history: list, **kwargs) -> Document: """Answer the question Args: question: the user input history: the chat history [(user_msg1, bot_msg1), (user_msg2, bot_msg2)...] Returns: kotaemon.base.Document: the final answer """ ``` ## Register your pipeline to ktem To register your pipelines to ktem, you declare it in the `flowsettings.py` file. This file locates at the current working directory where you start the ktem. In most use cases, it is this [one](https://github.com/Cinnamon/kotaemon/blob/main/flowsettings.py). ```python KH_REASONING = [""] KH_INDEX = "" ``` You can register multiple reasoning pipelines to ktem by populating the `KH_REASONING` list. The user can select which reasoning pipeline to use in their Settings page. For now, there's only one supported index option for `KH_INDEX`. Make sure that your class is discoverable by Python. 
## Allow users to customize your pipeline in the app settings To allow the users to configure your pipeline, you need to declare what you allow the users to configure as a dictionary. `ktem` will include them into the application settings. In your pipeline class, add a classmethod `get_user_settings` that returns a setting dictionary, add a classmethod `get_info` that returns an info dictionary. Example: ```python class SoSimple(BaseComponent): ... # as above @classmethod def get_user_settings(cls) -> dict: """The settings to the user""" return { "setting_1": { "name": "Human-friendly name", "value": "Default value", "choices": [("Human-friendly Choice 1", "choice1-id"), ("HFC 2", "choice2-id")], # optional "component": "Which Gradio UI component to render, can be: text, number, checkbox, dropdown, radio, checkboxgroup" }, "setting_2": { # follow the same rule as above } } @classmethod def get_info(cls) -> dict: """Pipeline information for bookkeeping purpose""" return { "id": "a unique id to differentiate this pipeline from other pipeline", "name": "Human-friendly name of the pipeline", "description": "Can be a short description of this pipeline" } ``` Once you add these methods to your pipeline class, `ktem` will automatically extract and add them to the settings. ## Construct to pipeline object Once `ktem` runs your pipeline, it will call your classmethod `get_pipeline` with the full user settings and expect to obtain the pipeline object. Within this `get_pipeline` method, you implement all the necessary logic to initialize the pipeline object. Example: ```python class SoSimple(BaseComponent): ... # as above @classmethod def get_pipeline(cls, setting): obj = cls(arg1=setting["reasoning.id.setting1"]) return obj ``` ## Reasoning: Stream output to UI For fast user experience, you can stream the output directly to UI. 
This way, the user can start observing the output as soon as the LLM model generates the 1st token, rather than having to wait for the pipeline to finish before reading the whole message. To stream the output, you need to: 1. Turn the `run` function to async. 2. Pass in the output to a special queue with `self.report_output`. ```python async def run(self, question: str, history: list, **kwargs) -> Document: for char in "This is a long message": self.report_output({"output": char}) ``` The argument to `self.report_output` is a dictionary that contains either or both of these 2 keys: "output", "evidence". The "output" string will be streamed to the chat message, and the "evidence" string will be streamed to the information panel. ## Access application LLMs, Embeddings You can access users' collections of LLMs and embedding models with: ```python from ktem.embeddings.manager import embeddings from ktem.llms.manager import llms llm = llms.get_default() embedding_model = embeddings.get_default() ``` You can also allow the users to specifically select which llms or embedding models they want to use through the settings. ```python @classmethod def get_user_settings(cls) -> dict: from ktem.llms.manager import llms return { "citation_llm": { "name": "LLM for citation", "value": llms.get_default(), "component": "dropdown", "choices": list(llms.options().keys()), }, ... } ``` ## Optional: Access application data You can access the user's application database, vector store as follows: ```python # get the database that contains the source files from ktem.db.models import Source, Index, Conversation, User # get the vector store ``` ================================================ FILE: docs/pages/app/ext/user-management.md ================================================ `ktem` provides user management as an extension. To enable user management, in your `flowsettings.py`, set the following variables: - `KH_FEATURE_USER_MANAGEMENT`: True to enable. 
- `KH_FEATURE_USER_MANAGEMENT_ADMIN`: the admin username. This user will be created when the app 1st start. - `KH_FEATURE_USER_MANAGEMENT_PASSWORD`: the admin password. This value accompanies the admin username. Once enabled, you have access to the following features: - User login/logout (located in Settings Tab) - User changing password (located in Settings Tab) - Create / List / Edit / Delete user (located in Resources > Users Tab) ================================================ FILE: docs/pages/app/features.md ================================================ ## Chat The kotaemon focuses on question and answering over a corpus of data. Below is the gentle introduction about the chat functionality. - Users can upload corpus of files. - Users can converse to the chatbot to ask questions about the corpus of files. - Users can view the reference in the files. ================================================ FILE: docs/pages/app/functional-description.md ================================================ ## User group / tenant management ### Create new user group (6 man-days) **Description**: each client has a dedicated user group. Each user group has an admin user who can do administrative tasks (e.g. creating user account in that user group...). The workflow for creating new user group is as follow: 1. Cinnamon accesses the user group management UI. 2. On "Create user group" panel, we supply: a. Client name: e.g. Apple. b. Sub-domain name: e.g. apple. c. Admin email, username & password. 3. The system will: a. An Aurora Platform deployment with the specified sub-domain. b. Send an email to the admin, with the username & password. **Expectation**: - The admin can go to the deployed Aurora Platform. - The admin can login with the specified username & password. **Condition**: - When sub-domain name already exists, raise error. - If error sending email to the client, raise the error, and delete the newly-created user-group. - Password rule: - Have at least 8 characters. 
- Must contain uppercase, lowercase, number and symbols. --- ### Delete user group (2 man-days) **Description**: in the tenant management page, we can delete the selected user group. The user flow is as follow: 1. Cinnamon accesses the user group management UI, 2. View list of user groups. 3. Next to target user group, click delete. 4. Confirm whether to delete. 5. If Yes, delete the user group. If No, cancel the operation. **Expectation**: when a user group is deleted, we expect to delete everything related to the user groups: domain, files, databases, caches, deployments. ## User management --- ### Create user account (for admin user) (1 man-day) **Description**: the admin user in the client's account can create user account for that user group. To create the new user, the client admin do: 1. Navigate to "Admin" > "Users" 2. In the "Create user" panel, supply: - Username - Password - Confirm password 3. Click "Create" **Expectation**: - The user can create the account. - The username: - Is case-insensitive (e.g. Moon and moon will be the same) - Can only contains these characters: a-z A-Z 0-9 \_ + - . - Has maximum length of 32 characters - The password is subjected to the following rule: - 8-character minimum length - Contains at least 1 number - Contains at least 1 lowercase letter - Contains at least 1 uppercase letter - Contains at least 1 special character from the following set, or a non-leading, non-trailing space character: `^ $ * . [ ] { } ( ) ? - " ! @ # % & / \ , > < ' : ; | _ ~ ` + = --- ### Delete user account (for admin user) **Description**: the admin user in the client's account can delete user account. Once an user account is deleted, he/she cannot login to Aurora Platform. 1. The admin user navigates to "Admin" > "Users". 2. In the user list panel, next to the username, the admin click on the "Delete" button. The Confirmation dialog appears. 3. If "Delete", the user account is deleted. If "Cancel", do nothing. The Confirmation dialog disappears. 
**Expectation**: - Once the user is deleted, the following information relating to the user will be deleted: - His/her personal setting. - His/her conversations. - The following information relating to the user will still be retained: - His/her uploaded files. --- ### Edit user account (for admin user) **Description**: the admin user can change any information about the user account, including password. To change user information: 1. The admin user navigates to "Admin" > "Users". 2. In the user list panel, next to the username, the admin click on the "Edit" button. 3. The user list disappears, the user detail appears, with the following information show up: - Username: (prefilled the username) - Password: (blank) - Confirm password: (blank) 4. The admin can edit any of the information, and click "Save" or "Cancel". - If "Save": the information will be updated to the database, or show error per Expectation below. - If "Cancel": skip. 5. If Save success or Cancel, transfer back to the user list UI, where the user information is updated accordingly. **Expectation**: - If the "Password" & "Confirm password" are different from each other, show error: "Password mismatch". - If both "Password" & \*"Confirm password" are blank, don't change the user password. - If changing password, the password rule is subjected to the same rule when creating user. - It's possible to change username. If changing username, the target user has to use the new username. --- ### Sign-in (3 man-days) **Description**: the users can sign-in to Aurora Platform as follow: 1. User navigates to the URL. 2. If the user is not logged in, the UI just shows the login screen. 3. User types username & password. 4. If correct, the user will proceed to normal working UI. 5. If incorrect, the login screen shows text error. --- ### Sign-out (1 man-day) **Description**: the user can sign-out of Aurora Platform as follow: 1. User navigates to the Settings > User page. 2. User click on logout. 3. 
The user is signed out to the UI login screen. **Expectation**: the user is completely signed out. Next time he/she uses the Aurora Platform, he/she has to login again. --- ### Change password **Description**: the user can change their password as follows: 1. User navigates to the Settings > User page. 2. In the change password section, the user provides this info and clicks Change: - Current password - New password - Confirm new password 3. If changing successfully, then the password is changed. Otherwise, show the error on the UI. **Expectation**: - If changing password succeeds, next time they logout/login to the system, they can use the new password. - Password rule (Same as normal password rule when creating user) - Errors: - Password does not match. - Violated password rules. --- ## Chat ### Chat to the bot **Description**: the Aurora Platform focuses on question and answering over the uploaded data. Each chat has the following components: - Chat message: show the exchange between bots and humans. - Text input + send button: for the user to input the message. - Data source panel: for selecting the files that will scope the context for the bot. - Information panel: showing evidence as the bot answers user's questions. The chat workflow looks as follows: 1. [Optional] User selects files that they want to scope the context for the bot. If the user doesn't select any files, then all files on Aurora Platform will be the context for the bot. - The user can type multi-line messages, using "Shift + Enter" for line-break. 2. User sends the message (either clicking the Send button or hitting the Enter key). 3. The bot in the chat conversation will return "Thinking..." while it processes. 4. The information panel on the right begins to show data related to the user message. 5. The bot begins to generate answer. The "Thinking..." placeholder disappears. **Expectation**: - Messages: - User can send multi-line messages, using "Shift + Enter" for line-break. 
- User can thumbs up, thumbs down the AI response. This information is recorded in the database. - User can click on a copy button on the chat message to copy the content to clipboard. - Information panel: - The information panel shows the latest evidence. - The user can click on the message, and the reference for that message will show up on the "Reference panel" (feature in-planning). - The user can click on the title to show/hide the content. - The whole information panel can be collapsed. - Chatbot quality: - The user can converse with the bot. The bot answers the user's requests in a natural manner. - The bot message should be streamed to the UI. The bot doesn't wait to gather all the text response and then dump it all at once. ### Conversation - switch **Description**: users can jump around between different conversations. They can see the list of all conversations, can select an old conversation, and continue the chat under the context of the old conversation. The switching workflow is like this: 1. Users click on the conversation dropdown. It will show a list of conversations. 2. Within that dropdown, the user selects one conversation. 3. The chat messages, information panel, and selected data will show the content in that old chat. 4. The user can continue chatting as normal under the context of this old chat. **Expectation**: - In the conversation drop down list, the conversations are ordered in created date order. - When there is no conversation, the conversation list is empty. - When there is no conversation, the user can still converse with the chat bot. When doing so, it automatically creates a new conversation. ### Conversation - create **Description**: the user can explicitly start a new conversation with the chatbot: 1. User clicks on the "New" button. 2. The new conversation is automatically created. **Expectation**: - The default conversation name is the current datetime. - It becomes selected. - It is added to the conversation list. 
### Conversation - rename **Description**: user can rename the chatbot by typing the name, and click on the Rename button next to it. - If rename succeeds: the name shown in the 1st dropdown will change accordingly - If rename doesn't succeed: show error message in red color below the rename section **Condition**: - Name constraint: - Min characters: 1 - Max characters: 40 - Could not having the same name with an existing conversation of the same user. ### Conversation - delete **Description**: user can delete the existing conversation as follow: 1. Click on Delete button. 2. The UI show confirmation with 2 buttons: - Delete - Cancel. 3. If Delete, delete the conversation, switch to the next oldest conversation, close the confirmation panel. 4. If cancel, just close the confirmation panel. ## File management The file management allows users to upload, list and delete files that they upload to the Aurora Platform ### Upload file **Description**: the user can upload files to the Aurora Platform. The uploaded files will be served as context for our chatbot to refer to when it converses with the user. To upload file, the user: 1. Navigate to the File tab. 2. Within the File tab, there is an Upload section. 3. User can add files to the Upload section through drag & drop, and or by click on the file browser. 4. User can select some options relating to uploading and indexing. Depending on the project, these options can be different. Nevertheless, they will discuss below. 5. User click on "Upload and Index" button. 6. The app show notifications when indexing starts and finishes, and when errors happen on the top right corner. **Options**: - Force re-index file. When user tries to upload files that already exists on the system: - If this option is True: will re-index those files. - If this option is False: will skip indexing those files. **Condition**: - Max number of files: 100 files. 
- Max number of pages per file: 500 pages - Max file size: 10 MB ### List all files **Description**: the user can know which files are on the system by: 1. Navigate to the File tab. 2. By default, it will show all the uploaded files, each with the following information: file name, file size, number of pages, uploaded date 3. The UI also shows total number of pages, and total number of sizes in MB. ### Delete file **Description**: users can delete files from this UI to free up the space, or to remove outdated information. To remove the files: 1. User navigate to the File tab. 2. In the list of file, next to each file, there is a Delete button. 3. The user clicks on the Delete button. Confirmation dialog appear. 4. If Delete, delete the file. If Cancel, close the confirmation dialog. **Expectation**: once the file is deleted: - The database entry of that file is deleted. - The file is removed from "Chat - Data source". - The total number of pages and MB sizes are reduced accordingly. - The reference to the file in the information panel is still retained. ================================================ FILE: docs/pages/app/index/file.md ================================================ The file index stores files in a local folder and index them for retrieval. This file index provides the following infrastructure to support the indexing: - SQL table Source: store the list of files that are indexed by the system - Vector store: contain the embedding of segments of the files - Document store: contain the text of segments of the files. Each text stored in this document store is associated with a vector in the vector store. - SQL table Index: store the relationship between (1) the source and the docstore, and (2) the source and the vector store. The indexing and retrieval pipelines are encouraged to use the above software infrastructure. ## Indexing pipeline The ktem has default indexing pipeline: `ktem.index.file.pipelines.IndexDocumentPipeline`. 
This default pipeline works as follow: - **Input**: list of file paths - **Output**: list of nodes that are indexed into database - **Process**: - Read files into texts. Different file types has different ways to read texts. - Split text files into smaller segments - Run each segments into embeddings. - Store the embeddings into vector store. Store the texts of each segment into docstore. Store the list of files in Source. Store the linking between Sources and docstore + vectorstore in Index table. You can customize this default pipeline if your indexing process is close to the default pipeline. You can create your own indexing pipeline if there are too much different logic. ### Customize the default pipeline The default pipeline provides the contact points in `flowsettings.py`. 1. `FILE_INDEX_PIPELINE_FILE_EXTRACTORS`. Supply overriding file extractor, based on file extension. Example: `{".pdf": "path.to.PDFReader", ".xlsx": "path.to.ExcelReader"}` 2. `FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE`. The expected number of characters of each text segment. Example: 1024. 3. `FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP`. The expected number of characters that consecutive text segments should overlap with each other. Example: 256. ### Create your own indexing pipeline Your indexing pipeline will subclass `BaseFileIndexIndexing`. You should define the following methods: - `run(self, file_paths)`: run the indexing given the pipeline - `get_pipeline(cls, user_settings, index_settings)`: return the fully-initialized pipeline, ready to be used by ktem. - `user_settings`: is a dictionary contains user settings (e.g. `{"pdf_mode": True, "num_retrieval": 5}`). You can declare these settings in the `get_user_settings` classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your `get_pipeline` method. - `index_settings`: is a dictionary. Currently it's empty for File Index. 
- `get_user_settings`: to declare user settings, return a dictionary. By subclassing `BaseFileIndexIndexing`, You will have access to the following resources: - `self._Source`: the source table - `self._Index`: the index table - `self._VS`: the vector store - `self._DS`: the docstore Once you have prepared your pipeline, register it in `flowsettings.py`: `FILE_INDEX_PIPELINE = ""`. ## Retrieval pipeline The ktem has default retrieval pipeline: `ktem.index.file.pipelines.DocumentRetrievalPipeline`. This pipeline works as follow: - Input: user text query & optionally a list of source file ids - Output: the output segments that match the user text query - Process: - If a list of source file ids is given, get the list of vector ids that associate with those file ids. - Embed the user text query. - Query the vector store. Provide a list of vector ids to limit query scope if the user restrict. - Return the matched text segments ### Create your own retrieval pipeline Your retrieval pipeline will subclass `BaseFileIndexRetriever`. The retriever has the same database, vectorstore and docstore accesses like the indexing pipeline. You should define the following methods: - `run(self, query, file_ids)`: retrieve relevant documents relating to the query. If `file_ids` is given, you should restrict your search within these `file_ids`. - `get_pipeline(cls, user_settings, index_settings, selected)`: return the fully-initialized pipeline, ready to be used by ktem. - `user_settings`: is a dictionary contains user settings (e.g. `{"pdf_mode": True, "num_retrieval": 5}`). You can declare these settings in the `get_user_settings` classmethod. ktem will collect these settings into the app Settings page, and will supply these user settings to your `get_pipeline` method. - `index_settings`: is a dictionary. Currently it's empty for File Index. - `selected`: a list of file ids selected by user. If user doesn't select anything, this variable will be None. 
- `get_user_settings`: to declare user settings, return a dictionary. Once you build the retrieval pipeline class, you can register it in `flowsettings.py`: `FILE_INDEXING_RETRIEVER_PIPELINES = ["path.to.retrieval.pipeline"]`. Because there can be multiple parallel pipelines within an index, this variable takes a list of strings rather than a string. ## Software infrastructure | Infra | Access | Schema | Ref | | ---------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | | SQL table Source | self.\_Source | - id (int): id of the source (auto)
- name (str): the name of the file
- path (str): the path of the file
- size (int): the file size in bytes
- note (dict): allow extra optional information about the file
- date_created (datetime): the time the file is created (auto) | This is SQLAlchemy ORM class. Consult the SQLAlchemy ORM documentation for usage | | SQL table Index | self.\_Index | - id (int): id of the index entry (auto)
- source_id (int): the id of a file in the Source table
- target_id: the id of the segment in docstore or vector store
- relation_type (str): if the link is "document" or "vector" | This is SQLAlchemy ORM class | | Vector store | self.\_VS | - self.\_VS.add: add the list of embeddings to the vector store (optionally associate metadata and ids)
- self.\_VS.delete: delete vector entries based on ids
- self.\_VS.query: get embeddings based on embeddings. | kotaemon > storages > vectorstores > BaseVectorStore | | Doc store | self.\_DS | - self.\_DS.add: add the segments to document stores
- self.\_DS.get: get the segments based on id
- self.\_DS.get_all: get all segments
- self.\_DS.delete: delete segments based on id | kotaemon > storages > docstores > base > BaseDocumentStore | ================================================ FILE: docs/pages/app/settings/overview.md ================================================ # Overview There are 3 kinds of settings in `ktem`, geared towards different stakeholders for different use cases: - Developer settings. These settings are meant for very basic app customization, such as database URL, cloud config, logging config, which features to enable... You will be interested in the developer settings if you deploy `ktem` to your customers, or if you build extension for `ktem` for developers. These settings are declared inside `flowsettings.py`. - Admin settings. These settings show up in the Admin page, and are meant to allow admin-level user to customize low level features, such as which credentials to connect to data sources, which keys to use for LLM... - [User settings](/pages/app/settings/user-settings/). These settings are meant for run-time users to tweak ktem to their personal needs, such as which output languages the chatbot should generate, which reasoning type to use... ================================================ FILE: docs/pages/app/settings/user-settings.md ================================================ # User settings `ktem` allows developers to extend the index and the reasoning pipeline. In many cases, these components can have settings that should be modified by users at run-time, (e.g. `topk`, `chunksize`...). These are the user settings. `ktem` allows developers to declare such user settings in their code. Once declared, `ktem` will render them in a Settings page. There are 2 places that `ktem` looks for declared user settings. You can refer to the respective pages. - In the index. - In the reasoning pipeline. 
# ================================================
# FILE: docs/scripts/generate_examples_docs.py
# ================================================
# import shutil
from pathlib import Path
from typing import Any, Iterable

import mkdocs_gen_files

# Walk up from this script until a directory named "docs" is found; reaching
# the filesystem root (a path that is its own parent) means it is not there.
doc_dir_name = "docs"
doc_dir = Path(__file__)
while doc_dir.name != doc_dir_name and doc_dir != doc_dir.parent:
    doc_dir = doc_dir.parent

if doc_dir == doc_dir.parent:
    raise ValueError(f"root_name ({doc_dir_name}) not in path ({str(Path(__file__))}).")


def generate_docs_for_examples_readme(
    examples_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Generate one doc page per example README plus a navigation file.

    Every file matching ``*README.md`` below ``examples_dir`` gets a page at
    ``<target_doc_folder>/<example directory>.md`` whose only content is a
    ``--8<-- "<path>"`` include line pointing at the README (snippet-include
    syntax; presumably resolved by the pymdownx snippets extension).
    The navigation built from the example directory names is written to
    ``<target_doc_folder>/NAV.md``.

    Skipped READMEs:
      * file names starting with ``_`` or ``test``;
      * a README located directly in ``examples_dir`` (it has no example
        name to build a page path or navigation entry from);
      * anything with a ``tests`` directory in its relative path;
      * examples whose dotted identifier starts with one of
        ``ignored_modules``.

    Args:
        examples_dir: Directory searched recursively for README files.
        target_doc_folder: Folder, relative to the generated docs, that
            receives the pages and ``NAV.md``.
        ignored_modules: Identifier prefixes to skip (plain prefix match).

    Raises:
        ModuleNotFoundError: If ``examples_dir`` is not a directory.
    """
    if not examples_dir.is_dir():
        raise ModuleNotFoundError(str(examples_dir))

    # Materialize once: the prefixes are checked for every README, so a
    # one-shot iterator would otherwise be exhausted after the first file.
    ignored = tuple(ignored_modules)
    nav = mkdocs_gen_files.Nav()

    for path in sorted(examples_dir.rglob("*README.md")):
        if path.name.startswith(("_", "test")):
            continue

        example_dir = path.parent.relative_to(examples_dir)
        parts = list(example_dir.parts)
        if not parts or "tests" in parts:
            continue

        identifier = ".".join(parts)
        if any(identifier.startswith(prefix) for prefix in ignored):
            continue

        # Append ".md" to the directory name. with_suffix() would replace
        # everything after the last dot of names such as "v1.5_demo", making
        # different examples collide on the same page.
        doc_path = example_dir.with_name(f"{example_dir.name}.md")
        full_doc_path = Path(target_doc_folder, doc_path)

        nav_titles = [name.replace("_", " ").title() for name in parts]
        nav[nav_titles] = doc_path.as_posix()

        readme_rel = path.relative_to(examples_dir.parent)
        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            # POSIX separators keep the include path identical on every OS.
            f.write(f'--8<-- "{readme_rel.as_posix()}"')

        mkdocs_gen_files.set_edit_path(full_doc_path, Path("..") / readme_rel)

    with mkdocs_gen_files.open(f"{target_doc_folder}/NAV.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_examples_readme(
    examples_dir=doc_dir.parent / "examples",
    target_doc_folder="examples",
)


# ================================================
# FILE: docs/scripts/generate_reference_docs.py
# ================================================
# import shutil
from pathlib import Path
from typing import Any, Iterable

import mkdocs_gen_files

# Walk up from this script until a directory named "docs" is found; reaching
# the filesystem root (a path that is its own parent) means it is not there.
doc_dir_name = "docs"
doc_dir = Path(__file__)
while doc_dir.name != doc_dir_name and doc_dir != doc_dir.parent:
    doc_dir = doc_dir.parent

if doc_dir == doc_dir.parent:
    raise ValueError(f"root_name ({doc_dir_name}) not in path ({str(Path(__file__))}).")


# Navigation titles that must not go through the generic
# ``name.replace("_", " ").title()`` conversion.
nav_title_map = {"cli": "CLI", "llms": "LLMs"}


def generate_docs_for_src_code(
    code_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = ()
):
    """Generate one reference page per Python module plus a navigation file.

    Every ``*.py`` file below ``code_dir`` gets a page under
    ``target_doc_folder`` that mirrors the source layout and contains the
    single line ``::: <dotted.module.path>`` (an autodoc directive,
    presumably consumed by mkdocstrings). A package's ``__init__.py`` is
    written as that package's ``index.md``. The navigation is written to
    ``<target_doc_folder>/Summary.md``.

    Skipped modules:
      * the ``__init__.py`` directly in ``code_dir`` (no module name left);
      * anything with ``tests`` among its path components;
      * modules whose dotted identifier starts with one of
        ``ignored_modules``.

    Files whose name starts with ``_`` or ``test`` are NOT filtered by name.

    Args:
        code_dir: Root directory of the source code to document.
        target_doc_folder: Folder, relative to the generated docs, that
            receives the pages and ``Summary.md``.
        ignored_modules: Identifier prefixes to skip (plain prefix match).

    Raises:
        ModuleNotFoundError: If ``code_dir`` is not a directory.
    """
    if not code_dir.is_dir():
        raise ModuleNotFoundError(str(code_dir))

    # Materialize once: the prefixes are checked for every module, so a
    # one-shot iterator would otherwise be exhausted after the first file.
    ignored = tuple(ignored_modules)
    nav = mkdocs_gen_files.Nav()

    for path in sorted(code_dir.rglob("*.py")):
        relative_path = path.relative_to(code_dir)
        doc_path = relative_path.with_suffix(".md")
        full_doc_path = Path(target_doc_folder, doc_path)

        parts = list(relative_path.with_suffix("").parts)
        if parts[-1] == "__init__":
            # A package is documented on its index page.
            doc_path = doc_path.with_name("index.md")
            full_doc_path = full_doc_path.with_name("index.md")
            parts.pop()

        if not parts or "tests" in parts:
            continue

        identifier = ".".join(parts)
        if any(identifier.startswith(prefix) for prefix in ignored):
            continue

        nav_titles = [
            nav_title_map.get(name, name.replace("_", " ").title()) for name in parts
        ]
        nav[nav_titles] = doc_path.as_posix()

        with mkdocs_gen_files.open(full_doc_path, "w") as f:
            f.write(f"::: {identifier}")

        # The edit path is given relative to the docs folder, hence "..".
        mkdocs_gen_files.set_edit_path(
            full_doc_path, Path("..") / path.relative_to(code_dir.parent)
        )

    with mkdocs_gen_files.open(f"{target_doc_folder}/Summary.md", "w") as nav_file:
        nav_file.writelines(nav.build_literate_nav())


generate_docs_for_src_code(
    code_dir=doc_dir.parent / "libs" / "kotaemon" / "kotaemon",
    target_doc_folder="reference",
    ignored_modules={"contribs"},
)
0);--md-code-title-bg-color:var(--md-code-bg-color);--md-code-inline-bg-color:var(--md-code-bg-color);--md-code-special-bg-color:#e8e8e8;--md-code-alternate-bg-color:var(--md-code-bg-color);--md-code-hl-punctuation-color:var(--md-code-fg-color);--md-code-hl-namespace-color:var(--md-code-fg-color);--md-code-hl-entity-color:var(--md-code-hl-keyword-color);--md-code-hl-tag-color:var(--md-code-hl-keyword-color);--md-code-hl-builtin-color:var(--md-code-hl-constant-color);--md-code-hl-class-color:var(--md-code-hl-function-color);--md-typeset-a-color:#00bcd4;--md-progress-stripe:var(--md-default-bg-color--lighter);--md-progress-100:#00e676;--md-progress-80:#00e676;--md-progress-60:#fbc02d;--md-progress-40:#ff9100;--md-progress-20:#ff5252;--md-progress-0:#ff1744;--md-typeset-kbd-color:#ebebeb;--md-typeset-kbd-border-color:#b8b8b8;--md-typeset-kbd-accent-color:hsla(0, 100%, 100%, 1)}:root>[data-md-color-scheme=slate]{--md-code-link-bg-color:hsla(232, 15%, 15%, 1);--md-code-link-accent-bg-color:var(--md-code-link-bg-color);--md-code-special-bg-color:#2b2d3b;--md-default-bg-color--trans:hsla(232,15%,15%, 0);--md-typeset-kbd-color:var(--md-default-fg-color--lightest);--md-typeset-kbd-border-color:#1a1c24;--md-typeset-kbd-accent-color:var(--md-default-fg-color--lighter)}:root>[data-md-color-scheme=dracula]{--md-default-fg-color:rgba(248, 248, 242, 0.87);--md-default-fg-color--light:rgba(248, 248, 242, 0.54);--md-default-fg-color--lighter:rgba(248, 248, 242, 0.16);--md-default-fg-color--lightest:rgba(248, 248, 242, 0.07);--md-default-autocomplete-fg-color:rgba(248, 248, 242, 0.4);--md-shadow-z2:0 0.2rem 0.5rem hsla(0, 0%, 0%, 0.3),0 0 0.05rem hsla(0, 0%, 0%, 0.2);--md-default-bg-color:var(--md-default-bg-color--darkest);--md-default-bg-color--light:rgba(50, 52, 67, 0.7);--md-default-bg-color--lighter:rgba(50, 52, 67, 0.3);--md-default-bg-color--lightest:rgba(50, 52, 67, 0.12);--md-default-bg-color--trans:rgba(50, 52, 67, 
0);--md-default-bg-color--dark:#2b2e3b;--md-default-bg-color--darker:#252732;--md-default-bg-color--darkest:#1e2029;--md-default-bg-color--ultra-dark:#111217;--md-text-color:var(--md-default-fg-color);--md-typeset-color:var(--md-default-fg-color);--md-admonition-fg-color:var(--md-default-fg-color);--md-code-fg-color:hsl(60, 30%, 96%);--md-code-bg-color:hsl(231, 15%, 18%);--md-code-title-bg-color:var(--md-default-bg-color--ultra-dark);--md-code-inline-bg-color:#323443;--md-code-hl-operator-color:hsl(326, 100%, 74%);--md-code-hl-punctuation-color:hsl(60, 30%, 96%);--md-code-hl-string-color:hsl(65, 92%, 76%);--md-code-hl-special-color:hsl(265, 89%, 78%);--md-code-hl-number-color:hsl(265, 89%, 78%);--md-code-hl-keyword-color:hsl(326, 100%, 74%);--md-code-hl-name-color:hsl(60, 30%, 96%);--md-code-hl-constant-color:hsl(265, 89%, 78%);--md-code-hl-function-color:hsl(135, 94%, 65%);--md-code-hl-comment-color:hsl(225, 27%, 51%);--md-code-hl-variable-color:hsl(31, 100%, 71%);--md-code-hl-generic-color:hsl(225, 27%, 51%);--md-code-hl-color:hsl(231, 25%, 25%);--md-code-hl-entity-color:hsl(135, 94%, 65%);--md-code-hl-tag-color:hsl(326, 100%, 74%);--md-code-hl-namespace-color:hsl(60, 30%, 96%);--md-code-hl-builtin-color:hsl(191, 97%, 77%);--md-code-hl-class-color:hsl(191, 97%, 77%);--md-code-special-bg-color:#1c1e26;--md-code-alternate-bg-color:#3d3e49;--md-code-link-bg-color:#364653;--md-typeset-a-color:hsl(191, 97%, 77%);--md-typeset-mark-color:#6e7252;--md-typeset-del-color:#734568;--md-typeset-ins-color:#36724e;--md-progress-stripe:var(--md-default-bg-color--lightest);--md-progress-100:hsl(135, 94%, 65%);--md-progress-80:hsl(135, 92%, 79%);--md-progress-60:hsl(65, 92%, 76%);--md-progress-40:hsl(31, 100%, 71%);--md-progress-20:hsl(326, 100%, 74%);--md-progress-0:hsl(0, 100%, 
67%);--md-typeset-kbd-color:var(--md-default-fg-color--lightest);--md-typeset-kbd-border-color:var(--md-default-bg-color--ultra-dark);--md-typeset-kbd-accent-color:var(--md-default-fg-color--lighter)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=red],[data-md-color-scheme=dracula][data-md-color-primary=red]{--md-primary-code-bg-color:#47303a;--md-primary-fg-color:hsla(0deg, 100%, 67%, 1);--md-primary-fg-color--transparent:hsla(0deg, 100%, 67%, 0.1);--md-primary-fg-color--light:hsla(0deg, 100%, 72%, 1);--md-primary-fg-color--dark:hsla(0deg, 100%, 62%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=pink],[data-md-color-scheme=dracula][data-md-color-primary=pink]{--md-primary-code-bg-color:#47354b;--md-primary-fg-color:hsla(326deg, 100%, 74%, 1);--md-primary-fg-color--transparent:hsla(326deg, 100%, 74%, 0.1);--md-primary-fg-color--light:hsla(326deg, 100%, 79%, 1);--md-primary-fg-color--dark:hsla(326deg, 100%, 69%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=purple],[data-md-color-scheme=dracula][data-md-color-primary=purple]{--md-primary-code-bg-color:#3e3952;--md-primary-fg-color:hsla(265deg, 89%, 78%, 1);--md-primary-fg-color--transparent:hsla(265deg, 89%, 78%, 0.1);--md-primary-fg-color--light:hsla(265deg, 89%, 83%, 1);--md-primary-fg-color--dark:hsla(265deg, 89%, 73%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=deep-purple],[data-md-color-scheme=dracula][data-md-color-primary=deep-purple]{--md-primary-code-bg-color:#3e3952;--md-primary-fg-color:hsla(265deg, 89%, 78%, 
1);--md-primary-fg-color--transparent:hsla(265deg, 89%, 78%, 0.1);--md-primary-fg-color--light:hsla(265deg, 89%, 83%, 1);--md-primary-fg-color--dark:hsla(265deg, 89%, 73%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=blue],[data-md-color-scheme=dracula][data-md-color-primary=blue]{--md-primary-code-bg-color:#303446;--md-primary-fg-color:hsla(225deg, 27%, 51%, 1);--md-primary-fg-color--transparent:hsla(225deg, 27%, 51%, 0.1);--md-primary-fg-color--light:hsla(225deg, 27%, 56%, 1);--md-primary-fg-color--dark:hsla(225deg, 27%, 46%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=indigo],[data-md-color-scheme=dracula][data-md-color-primary=indigo]{--md-primary-code-bg-color:#303446;--md-primary-fg-color:hsla(225deg, 27%, 51%, 1);--md-primary-fg-color--transparent:hsla(225deg, 27%, 51%, 0.1);--md-primary-fg-color--light:hsla(225deg, 27%, 56%, 1);--md-primary-fg-color--dark:hsla(225deg, 27%, 46%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=light-blue],[data-md-color-scheme=dracula][data-md-color-primary=light-blue]{--md-primary-code-bg-color:#303446;--md-primary-fg-color:hsla(225deg, 27%, 51%, 1);--md-primary-fg-color--transparent:hsla(225deg, 27%, 51%, 0.1);--md-primary-fg-color--light:hsla(225deg, 27%, 56%, 1);--md-primary-fg-color--dark:hsla(225deg, 27%, 46%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] 
:not([data-md-color-scheme])[data-md-color-primary=cyan],[data-md-color-scheme=dracula][data-md-color-primary=cyan]{--md-primary-code-bg-color:#364653;--md-primary-fg-color:hsla(191deg, 97%, 77%, 1);--md-primary-fg-color--transparent:hsla(191deg, 97%, 77%, 0.1);--md-primary-fg-color--light:hsla(191deg, 97%, 82%, 1);--md-primary-fg-color--dark:hsla(191deg, 97%, 72%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=teal],[data-md-color-scheme=dracula][data-md-color-primary=teal]{--md-primary-code-bg-color:#364653;--md-primary-fg-color:hsla(191deg, 97%, 77%, 1);--md-primary-fg-color--transparent:hsla(191deg, 97%, 77%, 0.1);--md-primary-fg-color--light:hsla(191deg, 97%, 82%, 1);--md-primary-fg-color--dark:hsla(191deg, 97%, 72%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=green],[data-md-color-scheme=dracula][data-md-color-primary=green]{--md-primary-code-bg-color:#2d4840;--md-primary-fg-color:hsla(135deg, 94%, 65%, 1);--md-primary-fg-color--transparent:hsla(135deg, 94%, 65%, 0.1);--md-primary-fg-color--light:hsla(135deg, 94%, 70%, 1);--md-primary-fg-color--dark:hsla(135deg, 94%, 60%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=light-green],[data-md-color-scheme=dracula][data-md-color-primary=light-green]{--md-primary-code-bg-color:#2d4840;--md-primary-fg-color:hsla(135deg, 94%, 65%, 1);--md-primary-fg-color--transparent:hsla(135deg, 94%, 65%, 0.1);--md-primary-fg-color--light:hsla(135deg, 94%, 70%, 1);--md-primary-fg-color--dark:hsla(135deg, 94%, 60%, 
1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=lime],[data-md-color-scheme=dracula][data-md-color-primary=lime]{--md-primary-code-bg-color:#2d4840;--md-primary-fg-color:hsla(135deg, 94%, 65%, 1);--md-primary-fg-color--transparent:hsla(135deg, 94%, 65%, 0.1);--md-primary-fg-color--light:hsla(135deg, 94%, 70%, 1);--md-primary-fg-color--dark:hsla(135deg, 94%, 60%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=yellow],[data-md-color-scheme=dracula][data-md-color-primary=yellow]{--md-primary-code-bg-color:#454842;--md-primary-fg-color:hsla(65deg, 92%, 76%, 1);--md-primary-fg-color--transparent:hsla(65deg, 92%, 76%, 0.1);--md-primary-fg-color--light:hsla(65deg, 92%, 81%, 1);--md-primary-fg-color--dark:hsla(65deg, 92%, 71%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=amber],[data-md-color-scheme=dracula][data-md-color-primary=amber]{--md-primary-code-bg-color:#454842;--md-primary-fg-color:hsla(65deg, 92%, 76%, 1);--md-primary-fg-color--transparent:hsla(65deg, 92%, 76%, 0.1);--md-primary-fg-color--light:hsla(65deg, 92%, 81%, 1);--md-primary-fg-color--dark:hsla(65deg, 92%, 71%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=orange],[data-md-color-scheme=dracula][data-md-color-primary=orange]{--md-primary-code-bg-color:#473e3d;--md-primary-fg-color:hsla(31deg, 100%, 71%, 1);--md-primary-fg-color--transparent:hsla(31deg, 100%, 71%, 0.1);--md-primary-fg-color--light:hsla(31deg, 100%, 
76%, 1);--md-primary-fg-color--dark:hsla(31deg, 100%, 66%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=deep-orange],[data-md-color-scheme=dracula][data-md-color-primary=deep-orange]{--md-primary-code-bg-color:#473e3d;--md-primary-fg-color:hsla(31deg, 100%, 71%, 1);--md-primary-fg-color--transparent:hsla(31deg, 100%, 71%, 0.1);--md-primary-fg-color--light:hsla(31deg, 100%, 76%, 1);--md-primary-fg-color--dark:hsla(31deg, 100%, 66%, 1);--md-primary-bg-color:var(--md-default-bg-color);--md-primary-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=red],[data-md-color-scheme=dracula][data-md-color-accent=red]{--md-code-link-accent-bg-color:#472c36;--md-accent-fg-color:hsla(0deg, 100%, 62%, 1);--md-accent-fg-color--transparent:hsla(0deg, 100%, 62%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=pink],[data-md-color-scheme=dracula][data-md-color-accent=pink]{--md-code-link-accent-bg-color:#473149;--md-accent-fg-color:hsla(326deg, 100%, 69%, 1);--md-accent-fg-color--transparent:hsla(326deg, 100%, 69%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=purple],[data-md-color-scheme=dracula][data-md-color-accent=purple]{--md-code-link-accent-bg-color:#3c3652;--md-accent-fg-color:hsla(265deg, 89%, 73%, 1);--md-accent-fg-color--transparent:hsla(265deg, 89%, 73%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] 
:not([data-md-color-scheme])[data-md-color-primary=deep-purple],[data-md-color-scheme=dracula][data-md-color-accent=deep-purple]{--md-code-link-accent-bg-color:#3c3652;--md-accent-fg-color:hsla(265deg, 89%, 73%, 1);--md-accent-fg-color--transparent:hsla(265deg, 89%, 73%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=blue],[data-md-color-scheme=dracula][data-md-color-accent=blue]{--md-code-link-accent-bg-color:#2e3243;--md-accent-fg-color:hsla(225deg, 27%, 46%, 1);--md-accent-fg-color--transparent:hsla(225deg, 27%, 46%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=indigo],[data-md-color-scheme=dracula][data-md-color-accent=indigo]{--md-code-link-accent-bg-color:#2e3243;--md-accent-fg-color:hsla(225deg, 27%, 46%, 1);--md-accent-fg-color--transparent:hsla(225deg, 27%, 46%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=light-blue],[data-md-color-scheme=dracula][data-md-color-accent=light-blue]{--md-code-link-accent-bg-color:#2e3243;--md-accent-fg-color:hsla(225deg, 27%, 46%, 1);--md-accent-fg-color--transparent:hsla(225deg, 27%, 46%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=cyan],[data-md-color-scheme=dracula][data-md-color-accent=cyan]{--md-code-link-accent-bg-color:#324553;--md-accent-fg-color:hsla(191deg, 97%, 72%, 1);--md-accent-fg-color--transparent:hsla(191deg, 97%, 72%, 
0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=teal],[data-md-color-scheme=dracula][data-md-color-accent=teal]{--md-code-link-accent-bg-color:#324553;--md-accent-fg-color:hsla(191deg, 97%, 72%, 1);--md-accent-fg-color--transparent:hsla(191deg, 97%, 72%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=green],[data-md-color-scheme=dracula][data-md-color-accent=green]{--md-code-link-accent-bg-color:#2a483d;--md-accent-fg-color:hsla(135deg, 94%, 60%, 1);--md-accent-fg-color--transparent:hsla(135deg, 94%, 60%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=light-green],[data-md-color-scheme=dracula][data-md-color-accent=light-green]{--md-code-link-accent-bg-color:#2a483d;--md-accent-fg-color:hsla(135deg, 94%, 60%, 1);--md-accent-fg-color--transparent:hsla(135deg, 94%, 60%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=lime],[data-md-color-scheme=dracula][data-md-color-accent=lime]{--md-code-link-accent-bg-color:#2a483d;--md-accent-fg-color:hsla(135deg, 94%, 60%, 1);--md-accent-fg-color--transparent:hsla(135deg, 94%, 60%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=yellow],[data-md-color-scheme=dracula][data-md-color-accent=yellow]{--md-code-link-accent-bg-color:#45483e;--md-accent-fg-color:hsla(65deg, 92%, 71%, 
1);--md-accent-fg-color--transparent:hsla(65deg, 92%, 71%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=amber],[data-md-color-scheme=dracula][data-md-color-accent=amber]{--md-code-link-accent-bg-color:#45483e;--md-accent-fg-color:hsla(65deg, 92%, 71%, 1);--md-accent-fg-color--transparent:hsla(65deg, 92%, 71%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=orange],[data-md-color-scheme=dracula][data-md-color-accent=orange]{--md-code-link-accent-bg-color:#473d39;--md-accent-fg-color:hsla(31deg, 100%, 66%, 1);--md-accent-fg-color--transparent:hsla(31deg, 100%, 66%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] :not([data-md-color-scheme])[data-md-color-primary=deep-orange],[data-md-color-scheme=dracula][data-md-color-accent=deep-orange]{--md-code-link-accent-bg-color:#473d39;--md-accent-fg-color:hsla(31deg, 100%, 66%, 1);--md-accent-fg-color--transparent:hsla(31deg, 100%, 66%, 0.1);--md-accent-bg-color:var(--md-default-bg-color);--md-accent-bg-color--light:var(--md-default-bg-color--light)}:root{--md-heart:#ff5252;--md-heart-big:#ff1744}:root :focus-visible{outline-style:solid}:root [data-md-color-scheme=dracula]{--md-heart:hsl(326, 100%, 74%);--md-heart-big:hsl(0, 100%, 67%)}.md-typeset h4{margin:2em 0 1em}.md-typeset a.source-link{position:relative;top:-.6rem;float:right;color:var(--md-default-fg-color--lighter);transition:color 125ms}.md-typeset a.source-link:hover{color:var(--md-accent-fg-color)}.md-typeset a.source-link .twemoji{height:1.2rem}.md-typeset a.source-link .twemoji svg{width:1.2rem;height:1.2rem}.md-typeset div.highlight.md-max-height 
pre>code{max-height:15rem}.twemoji.heart-throb svg,.twemoji.heart-throb-hover svg{position:relative;color:var(--md-heart);animation:pulse 1.5s ease infinite}@keyframes pulse{0%{transform:scale(1)}40%{color:var(--md-heart-big);transform:scale(1.3)}50%{transform:scale(1.2)}60%{color:var(--md-heart-big);transform:scale(1.3)}100%{transform:scale(1)}}footer.sponsorship{text-align:center}footer.sponsorship hr{display:inline-block;width:1.6rem;margin:0 .7rem;vertical-align:middle;border-bottom:2px solid var(--md-default-fg-color--lighter)}footer.sponsorship:hover hr{border-color:var(--md-accent-fg-color)}footer.sponsorship:not(:hover) .twemoji.heart-throb-hover svg{color:var(--md-default-fg-color--lighter)!important}body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=dracula] .md-icon .light-mode,body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=dracula] .md-icon .system-mode,body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=dracula] .md-icon .unknown-mode{display:none}body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=default] .md-icon .dark-mode,body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=default] .md-icon .system-mode,body:not([data-md-prefers-color-scheme=true])[data-md-color-scheme=default] .md-icon .unknown-mode{display:none}body:not([data-md-prefers-color-scheme=true]):not([data-md-color-scheme=default]):not([data-md-color-scheme=dracula]) .md-icon .dark-mode,body:not([data-md-prefers-color-scheme=true]):not([data-md-color-scheme=default]):not([data-md-color-scheme=dracula]) .md-icon .light-mode,body:not([data-md-prefers-color-scheme=true]):not([data-md-color-scheme=default]):not([data-md-color-scheme=dracula]) .md-icon .system-mode{display:none}body[data-md-prefers-color-scheme=true] .md-icon .dark-mode,body[data-md-prefers-color-scheme=true] .md-icon .light-mode,body[data-md-prefers-color-scheme=true] .md-icon 
.unknown-mode{display:none}.md-header-nav__scheme{z-index:0}[data-md-toggle=search]:checked~.md-header .md-header-nav__scheme{display:none}.md-typeset .admonition,.md-typeset details{border-width:0;border-left-width:4px}:root>*{--md-admonition-bg-color:transparent;--md-admonition-icon--settings:url('data:image/svg+xml;charset=utf-8,');--md-admonition-bg-color--settings:rgba(170, 0, 255, 0.1);--md-admonition-icon-color--settings:#aa00ff;--md-admonition-shadow-color--settings:rgba(170, 0, 255, 0.1);--md-admonition-icon--new:url('data:image/svg+xml;charset=utf-8,');--md-admonition-bg-color--new:rgba(255, 214, 0, 0.1);--md-admonition-icon-color--new:#ffd600;--md-admonition-shadow-color--new:rgba(255, 214, 0, 0.1);--md-admonition-bg-color--note:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--note:hsl(51, 94%, 73%);--md-admonition-shadow-color--note:rgba(251, 231, 121, 0.1);--md-admonition-bg-color--abstract:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--abstract:hsl(191, 97%, 77%);--md-admonition-shadow-color--abstract:rgba(139, 232, 253, 0.1);--md-admonition-bg-color--info:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--info:hsl(190, 94%, 87%);--md-admonition-shadow-color--info:rgba(191, 243, 253, 0.1);--md-admonition-bg-color--tip:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--tip:hsl(161, 97%, 77%);--md-admonition-shadow-color--tip:rgba(139, 253, 217, 0.1);--md-admonition-bg-color--success:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--success:hsl(135, 94%, 65%);--md-admonition-shadow-color--success:rgba(82, 250, 124, 0.1);--md-admonition-bg-color--question:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--question:hsl(135, 92%, 79%);--md-admonition-shadow-color--question:rgba(152, 251, 177, 0.1);--md-admonition-bg-color--warning:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--warning:hsl(31, 100%, 
71%);--md-admonition-shadow-color--warning:rgba(255, 184, 107, 0.1);--md-admonition-bg-color--failure:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--failure:hsl(0, 100%, 59%);--md-admonition-shadow-color--failure:rgba(255, 46, 46, 0.1);--md-admonition-bg-color--danger:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--danger:hsl(0, 100%, 67%);--md-admonition-shadow-color--danger:rgba(255, 87, 87, 0.1);--md-admonition-bg-color--bug:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--bug:hsl(325, 100%, 64%);--md-admonition-shadow-color--bug:rgba(255, 71, 179, 0.1);--md-admonition-bg-color--example:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--example:hsl(265, 89%, 78%);--md-admonition-shadow-color--example:rgba(191, 149, 249, 0.1);--md-admonition-bg-color--quote:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--quote:hsl(225, 8%, 51%);--md-admonition-shadow-color--quote:rgba(120, 125, 140, 0.1)}:root>[data-md-color-scheme=dracula]{--md-admonition-icon-color:$drac-dark-yellow}:root>[data-md-color-scheme=dracula]{--md-admonition-bg-color--settings:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--settings:hsl(326, 100%, 74%);--md-admonition-shadow-color--settings:rgba(255, 122, 198, 0.1)}:root>[data-md-color-scheme=dracula]{--md-admonition-bg-color--new:var(--md-default-bg-color--ultra-dark);--md-admonition-icon-color--new:hsl(65, 92%, 76%);--md-admonition-shadow-color--new:rgba(241, 250, 137, 0.1)}[data-md-color-scheme=dracula] .md-typeset .admonition,[data-md-color-scheme=dracula] .md-typeset details{border-color:var(--md-admonition-icon-color--note);box-shadow:var(--md-shadow-z2)}[data-md-color-scheme=dracula] .md-typeset .admonition:focus-within,[data-md-color-scheme=dracula] .md-typeset details:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--note)}[data-md-color-scheme=dracula] .md-typeset 
.admonition>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details>summary{background-color:var(--md-admonition-bg-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details>summary::before{background-color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details>summary::after{color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.note,[data-md-color-scheme=dracula] .md-typeset details.note{border-color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.note:focus-within,[data-md-color-scheme=dracula] .md-typeset details.note:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.note>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.note>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.note>summary{background-color:var(--md-admonition-bg-color--note);border-color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.note>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.note>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.note>summary::before{background-color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.note>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.note>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset 
details.note>summary::after{color:var(--md-admonition-icon-color--note)}[data-md-color-scheme=dracula] .md-typeset .admonition.abstract,[data-md-color-scheme=dracula] .md-typeset details.abstract{border-color:var(--md-admonition-icon-color--abstract)}[data-md-color-scheme=dracula] .md-typeset .admonition.abstract:focus-within,[data-md-color-scheme=dracula] .md-typeset details.abstract:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--abstract)}[data-md-color-scheme=dracula] .md-typeset .admonition.abstract>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.abstract>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.abstract>summary{background-color:var(--md-admonition-bg-color--abstract);border-color:var(--md-admonition-icon-color--abstract)}[data-md-color-scheme=dracula] .md-typeset .admonition.abstract>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.abstract>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.abstract>summary::before{background-color:var(--md-admonition-icon-color--abstract)}[data-md-color-scheme=dracula] .md-typeset .admonition.abstract>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.abstract>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.abstract>summary::after{color:var(--md-admonition-icon-color--abstract)}[data-md-color-scheme=dracula] .md-typeset .admonition.info,[data-md-color-scheme=dracula] .md-typeset details.info{border-color:var(--md-admonition-icon-color--info)}[data-md-color-scheme=dracula] .md-typeset .admonition.info:focus-within,[data-md-color-scheme=dracula] .md-typeset details.info:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--info)}[data-md-color-scheme=dracula] .md-typeset .admonition.info>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.info>.admonition-title,[data-md-color-scheme=dracula] .md-typeset 
details.info>summary{background-color:var(--md-admonition-bg-color--info);border-color:var(--md-admonition-icon-color--info)}[data-md-color-scheme=dracula] .md-typeset .admonition.info>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.info>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.info>summary::before{background-color:var(--md-admonition-icon-color--info)}[data-md-color-scheme=dracula] .md-typeset .admonition.info>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.info>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.info>summary::after{color:var(--md-admonition-icon-color--info)}[data-md-color-scheme=dracula] .md-typeset .admonition.tip,[data-md-color-scheme=dracula] .md-typeset details.tip{border-color:var(--md-admonition-icon-color--tip)}[data-md-color-scheme=dracula] .md-typeset .admonition.tip:focus-within,[data-md-color-scheme=dracula] .md-typeset details.tip:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--tip)}[data-md-color-scheme=dracula] .md-typeset .admonition.tip>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.tip>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.tip>summary{background-color:var(--md-admonition-bg-color--tip);border-color:var(--md-admonition-icon-color--tip)}[data-md-color-scheme=dracula] .md-typeset .admonition.tip>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.tip>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.tip>summary::before{background-color:var(--md-admonition-icon-color--tip)}[data-md-color-scheme=dracula] .md-typeset .admonition.tip>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.tip>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.tip>summary::after{color:var(--md-admonition-icon-color--tip)}[data-md-color-scheme=dracula] .md-typeset 
.admonition.success,[data-md-color-scheme=dracula] .md-typeset details.success{border-color:var(--md-admonition-icon-color--success)}/* The remaining "success" rules set the focus-within shadow, title background and title ::before/::after colors from the --md-admonition-*--success custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.success:focus-within,[data-md-color-scheme=dracula] .md-typeset details.success:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--success)}[data-md-color-scheme=dracula] .md-typeset .admonition.success>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.success>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.success>summary{background-color:var(--md-admonition-bg-color--success);border-color:var(--md-admonition-icon-color--success)}[data-md-color-scheme=dracula] .md-typeset .admonition.success>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.success>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.success>summary::before{background-color:var(--md-admonition-icon-color--success)}[data-md-color-scheme=dracula] .md-typeset .admonition.success>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.success>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.success>summary::after{color:var(--md-admonition-icon-color--success)}/* Dracula scheme, "question" admonitions/details: same overrides using the --md-admonition-*--question custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.question,[data-md-color-scheme=dracula] .md-typeset details.question{border-color:var(--md-admonition-icon-color--question)}[data-md-color-scheme=dracula] .md-typeset .admonition.question:focus-within,[data-md-color-scheme=dracula] .md-typeset details.question:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--question)}[data-md-color-scheme=dracula] .md-typeset .admonition.question>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.question>.admonition-title,[data-md-color-scheme=dracula] .md-typeset 
details.question>summary{background-color:var(--md-admonition-bg-color--question);border-color:var(--md-admonition-icon-color--question)}[data-md-color-scheme=dracula] .md-typeset .admonition.question>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.question>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.question>summary::before{background-color:var(--md-admonition-icon-color--question)}[data-md-color-scheme=dracula] .md-typeset .admonition.question>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.question>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.question>summary::after{color:var(--md-admonition-icon-color--question)}/* Dracula scheme, "warning" admonitions/details: border, focus-within shadow, title background and title ::before/::after colors from the --md-admonition-*--warning custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.warning,[data-md-color-scheme=dracula] .md-typeset details.warning{border-color:var(--md-admonition-icon-color--warning)}[data-md-color-scheme=dracula] .md-typeset .admonition.warning:focus-within,[data-md-color-scheme=dracula] .md-typeset details.warning:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--warning)}[data-md-color-scheme=dracula] .md-typeset .admonition.warning>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.warning>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.warning>summary{background-color:var(--md-admonition-bg-color--warning);border-color:var(--md-admonition-icon-color--warning)}[data-md-color-scheme=dracula] .md-typeset .admonition.warning>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.warning>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.warning>summary::before{background-color:var(--md-admonition-icon-color--warning)}[data-md-color-scheme=dracula] .md-typeset .admonition.warning>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.warning>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset 
details.warning>summary::after{color:var(--md-admonition-icon-color--warning)}/* Dracula scheme, "failure" admonitions/details: border, focus-within shadow, title background and title ::before/::after colors from the --md-admonition-*--failure custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.failure,[data-md-color-scheme=dracula] .md-typeset details.failure{border-color:var(--md-admonition-icon-color--failure)}[data-md-color-scheme=dracula] .md-typeset .admonition.failure:focus-within,[data-md-color-scheme=dracula] .md-typeset details.failure:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--failure)}[data-md-color-scheme=dracula] .md-typeset .admonition.failure>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.failure>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.failure>summary{background-color:var(--md-admonition-bg-color--failure);border-color:var(--md-admonition-icon-color--failure)}[data-md-color-scheme=dracula] .md-typeset .admonition.failure>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.failure>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.failure>summary::before{background-color:var(--md-admonition-icon-color--failure)}[data-md-color-scheme=dracula] .md-typeset .admonition.failure>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.failure>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.failure>summary::after{color:var(--md-admonition-icon-color--failure)}/* Same set of overrides for "danger", using the --md-admonition-*--danger custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.danger,[data-md-color-scheme=dracula] .md-typeset details.danger{border-color:var(--md-admonition-icon-color--danger)}[data-md-color-scheme=dracula] .md-typeset .admonition.danger:focus-within,[data-md-color-scheme=dracula] .md-typeset details.danger:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--danger)}[data-md-color-scheme=dracula] .md-typeset .admonition.danger>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.danger>.admonition-title,[data-md-color-scheme=dracula] .md-typeset 
details.danger>summary{background-color:var(--md-admonition-bg-color--danger);border-color:var(--md-admonition-icon-color--danger)}[data-md-color-scheme=dracula] .md-typeset .admonition.danger>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.danger>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.danger>summary::before{background-color:var(--md-admonition-icon-color--danger)}[data-md-color-scheme=dracula] .md-typeset .admonition.danger>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.danger>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.danger>summary::after{color:var(--md-admonition-icon-color--danger)}/* Dracula scheme, "bug" admonitions/details: border, focus-within shadow, title background and title ::before/::after colors from the --md-admonition-*--bug custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.bug,[data-md-color-scheme=dracula] .md-typeset details.bug{border-color:var(--md-admonition-icon-color--bug)}[data-md-color-scheme=dracula] .md-typeset .admonition.bug:focus-within,[data-md-color-scheme=dracula] .md-typeset details.bug:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--bug)}[data-md-color-scheme=dracula] .md-typeset .admonition.bug>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.bug>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.bug>summary{background-color:var(--md-admonition-bg-color--bug);border-color:var(--md-admonition-icon-color--bug)}[data-md-color-scheme=dracula] .md-typeset .admonition.bug>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.bug>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.bug>summary::before{background-color:var(--md-admonition-icon-color--bug)}[data-md-color-scheme=dracula] .md-typeset .admonition.bug>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.bug>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset 
details.bug>summary::after{color:var(--md-admonition-icon-color--bug)}/* Dracula scheme, "example" admonitions/details: border, focus-within shadow, title background and title ::before/::after colors from the --md-admonition-*--example custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.example,[data-md-color-scheme=dracula] .md-typeset details.example{border-color:var(--md-admonition-icon-color--example)}[data-md-color-scheme=dracula] .md-typeset .admonition.example:focus-within,[data-md-color-scheme=dracula] .md-typeset details.example:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--example)}[data-md-color-scheme=dracula] .md-typeset .admonition.example>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.example>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.example>summary{background-color:var(--md-admonition-bg-color--example);border-color:var(--md-admonition-icon-color--example)}[data-md-color-scheme=dracula] .md-typeset .admonition.example>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.example>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.example>summary::before{background-color:var(--md-admonition-icon-color--example)}[data-md-color-scheme=dracula] .md-typeset .admonition.example>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.example>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.example>summary::after{color:var(--md-admonition-icon-color--example)}/* Same set of overrides for "quote", using the --md-admonition-*--quote custom properties. */[data-md-color-scheme=dracula] .md-typeset .admonition.quote,[data-md-color-scheme=dracula] .md-typeset details.quote{border-color:var(--md-admonition-icon-color--quote)}[data-md-color-scheme=dracula] .md-typeset .admonition.quote:focus-within,[data-md-color-scheme=dracula] .md-typeset details.quote:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--quote)}[data-md-color-scheme=dracula] .md-typeset .admonition.quote>.admonition-title,[data-md-color-scheme=dracula] .md-typeset details.quote>.admonition-title,[data-md-color-scheme=dracula] .md-typeset 
details.quote>summary{background-color:var(--md-admonition-bg-color--quote);border-color:var(--md-admonition-icon-color--quote)}[data-md-color-scheme=dracula] .md-typeset .admonition.quote>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.quote>.admonition-title::before,[data-md-color-scheme=dracula] .md-typeset details.quote>summary::before{background-color:var(--md-admonition-icon-color--quote)}[data-md-color-scheme=dracula] .md-typeset .admonition.quote>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.quote>.admonition-title::after,[data-md-color-scheme=dracula] .md-typeset details.quote>summary::after{color:var(--md-admonition-icon-color--quote)}/* "config" and "settings" admonitions/details (not scheme-specific): both class names share the --md-admonition-*--settings custom properties. */.md-typeset .admonition.config,.md-typeset .admonition.settings,.md-typeset details.config,.md-typeset details.settings{border-color:var(--md-admonition-icon-color--settings)}.md-typeset .admonition.config:focus-within,.md-typeset .admonition.settings:focus-within,.md-typeset details.config:focus-within,.md-typeset details.settings:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--settings)}.md-typeset .admonition.config>.admonition-title,.md-typeset .admonition.settings>.admonition-title,.md-typeset details.config>.admonition-title,.md-typeset details.config>summary,.md-typeset details.settings>.admonition-title,.md-typeset details.settings>summary{background-color:var(--md-admonition-bg-color--settings);border-color:var(--md-admonition-icon-color--settings)}.md-typeset .admonition.config>.admonition-title::before,.md-typeset .admonition.settings>.admonition-title::before,.md-typeset details.config>.admonition-title::before,.md-typeset details.config>summary::before,.md-typeset details.settings>.admonition-title::before,.md-typeset 
details.settings>summary::before{width:1rem;height:1rem;background-color:var(--md-admonition-icon-color--settings);background-size:1rem;-webkit-mask-image:var(--md-admonition-icon--settings);mask-image:var(--md-admonition-icon--settings);content:" "}.md-typeset .admonition.config>.admonition-title::after,.md-typeset .admonition.settings>.admonition-title::after,.md-typeset details.config>.admonition-title::after,.md-typeset details.config>summary::after,.md-typeset details.settings>.admonition-title::after,.md-typeset details.settings>summary::after{color:var(--md-admonition-icon-color--settings)}/* "new" admonitions/details: colors from the --md-admonition-*--new custom properties; the title ::before is a 1rem box masked with --md-admonition-icon--new. */.md-typeset .admonition.new,.md-typeset details.new{border-color:var(--md-admonition-icon-color--new)}.md-typeset .admonition.new:focus-within,.md-typeset details.new:focus-within{box-shadow:0 0 0 .2rem var(--md-admonition-shadow-color--new)}.md-typeset .admonition.new>.admonition-title,.md-typeset details.new>.admonition-title,.md-typeset details.new>summary{background-color:var(--md-admonition-bg-color--new);border-color:var(--md-admonition-icon-color--new)}.md-typeset .admonition.new>.admonition-title::before,.md-typeset details.new>.admonition-title::before,.md-typeset details.new>summary::before{width:1rem;height:1rem;background-color:var(--md-admonition-icon-color--new);background-size:1rem;-webkit-mask-image:var(--md-admonition-icon--new);mask-image:var(--md-admonition-icon--new);content:" "}.md-typeset .admonition.new>.admonition-title::after,.md-typeset details.new>.admonition-title::after,.md-typeset details.new>summary::after{color:var(--md-admonition-icon-color--new)}/* mjx-container font size: 120% when display=true, 100% when there is no display attribute. */mjx-container[display=true]{font-size:120%!important}mjx-container:not([display]){font-size:100%!important}/* .CtxtMenu_* elements under the dracula and slate schemes: text inside the info content/signature fields is forced to black. */[data-md-color-scheme=dracula] .CtxtMenu_InfoContent pre,[data-md-color-scheme=dracula] .CtxtMenu_InfoSignature input,[data-md-color-scheme=slate] .CtxtMenu_InfoContent pre,[data-md-color-scheme=slate] .CtxtMenu_InfoSignature input{color:#000}[data-md-color-scheme=dracula] 
.CtxtMenu_Info,[data-md-color-scheme=dracula] .CtxtMenu_Menu,[data-md-color-scheme=slate] .CtxtMenu_Info,[data-md-color-scheme=slate] .CtxtMenu_Menu{box-shadow:0 10px 20px rgba(0,0,0,.5)}/* Math output: .arithmatex scrolls horizontally only; a KaTeX display row is laid out as a non-wrapping flex row with the tag positioned inline after the base. */.md-typeset .arithmatex{overflow-x:auto!important;overflow-y:hidden!important}.katex-display .katex-html{display:flex!important;flex-direction:row;flex-wrap:nowrap;align-items:baseline;justify-content:space-between}.katex-display .katex-html .base{display:inline!important}.katex-display .katex-html .tag{position:relative!important;display:inline!important;margin-left:var(--margin-small)}/* Critic markup (del/ins/mark.critic) and details elements. */.md-typeset del.critic,.md-typeset ins.critic,.md-typeset mark.critic{padding:0 .25em;color:unset;box-shadow:none}.md-typeset .critic.break{margin:0}.md-typeset details{overflow:hidden}.md-typeset details>summary:focus{outline-style:none}/* Syntax-highlight token classes mapped to the --md-code-hl-* custom properties. */.highlight .kc{color:var(--md-code-hl-constant-color)}.highlight .nc,.highlight .ne{color:var(--md-code-hl-class-color)}.highlight .mb{color:var(--md-code-hl-number-color)}.highlight .bp,.highlight .nb{color:var(--md-code-hl-builtin-color)}.highlight .nn{color:var(--md-code-hl-namespace-color)}.highlight .na,.highlight .nd,.highlight .ni{color:var(--md-code-hl-entity-color)}.highlight .nl,.highlight .nt{color:var(--md-code-hl-tag-color)}/* Inline code (code that is not a direct child of pre). */.md-typeset :not(pre)>code{margin:0;padding:0 .2941176471em;color:var(--md-code-fg-color);background-color:var(--md-code-inline-bg-color);border-radius:.1rem;box-shadow:none}/* Code inside links. Both transitions are listed in one declaration: two separate "transition" declarations would leave only the last one (background-color) in effect. */.md-typeset a>code{color:inherit!important;background-color:var(--md-code-link-bg-color)!important;transition:color 125ms,background-color 125ms}.md-typeset a>code *{color:var(--md-typeset-a-color)!important}.md-typeset a>code:hover{background-color:var(--md-code-link-accent-bg-color)!important}.md-typeset a>code:hover *{color:var(--md-accent-fg-color)!important}.md-typeset pre>code{outline:0}.md-typeset td code{word-break:normal}.md-typeset .highlight{-moz-tab-size:8;-o-tab-size:8;tab-size:8}.md-typeset 
.highlight+.result{border-width:.1rem}/* Line numbers marked .special get the --md-code-special-bg-color background, both in inline ([data-linenos]) and table (.linenodiv) layouts. */.md-typeset .highlight [data-linenos].special::before{background-color:var(--md-code-special-bg-color)}.md-typeset .highlighttable .linenodiv .special{margin-right:-.5882352941em;margin-left:-1.1764705882em;padding-right:.5882352941em;padding-left:1.1764705882em;background-color:var(--md-code-special-bg-color)}/* span.filename inside a highlight block: a bold block-level title bar; the pre that follows it loses its top margin and top corner radii. The ::before is a masked icon box. */.md-typeset .highlight span.filename{position:relative;display:block;margin-top:1em;padding:.5em 1.1764705882em .5em 2.9411764706em;font-weight:700;font-size:.68rem;background-color:var(--md-code-title-bg-color);border-top-left-radius:.1rem;border-top-right-radius:.1rem}.md-typeset .highlight span.filename+pre{margin-top:0}.md-typeset .highlight span.filename+pre code{border-top-left-radius:0;border-top-right-radius:0}.md-typeset .highlight span.filename::before{position:absolute;left:.8823529412em;width:1.4705882353em;height:1.4705882353em;background-color:var(--md-default-fg-color);-webkit-mask-image:url('data:image/svg+xml;charset=utf-8,');mask-image:url('data:image/svg+xml;charset=utf-8,');-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;content:""}/* .collapse-code wrapper: the input is hidden and the labels in .code-footer are styled as clickable buttons. */.md-typeset .collapse-code{position:relative;margin-top:1em;margin-bottom:1em}.md-typeset .collapse-code pre{margin-top:0;margin-bottom:0}.md-typeset .collapse-code input{display:none}.md-typeset .collapse-code input~.code-footer{width:100%;margin:0;padding:.25em .5em .25em 0}.md-typeset .collapse-code input~.code-footer label{position:relative;margin:.05em;padding:.15em .8em;color:var(--md-primary-bg-color);font-size:90%;background-color:var(--md-primary-fg-color);-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;border-radius:.1rem;cursor:pointer;content:""}.md-typeset .collapse-code input~.code-footer label:hover{background-color:var(--md-accent-fg-color)}.md-typeset .collapse-code input~.code-footer 
label::before{position:absolute;top:.15em;left:.15em;display:block;box-sizing:border-box;width:1.25em;height:1.25em;background-color:var(--md-primary-bg-color);background-size:1.25em;content:""}/* Only one of the expand/collapse labels is displayed, depending on whether the input is checked. While checked, the code is capped at 9.375em with hidden overflow and the footer is positioned over its bottom edge with a fade gradient. */.md-typeset .collapse-code input~.code-footer label.expand{display:none}.md-typeset .collapse-code input~.code-footer label.expand::before{-webkit-mask-image:url('data:image/svg+xml;charset=utf-8,');mask-image:url('data:image/svg+xml;charset=utf-8,')}.md-typeset .collapse-code input~.code-footer label.collapse::before{-webkit-mask-image:url('data:image/svg+xml;charset=utf-8,');mask-image:url('data:image/svg+xml;charset=utf-8,')}.md-typeset .collapse-code input:checked~.code-footer label.expand{display:inline}.md-typeset .collapse-code input:checked~.code-footer label.collapse{display:none}.md-typeset .collapse-code input:checked+div.highlight code{max-height:9.375em;overflow:hidden}.md-typeset .collapse-code input:checked~.code-footer{position:absolute;bottom:0;left:0;padding:2em .5em .5em .8rem;background-image:linear-gradient(to bottom,transparent,var(--md-default-bg-color) 80% 100%)}/* Glyphs prepended to the power and fingerprint keys. */.md-typeset .keys .key-power::before{padding-right:.4em;content:"⏻"}.md-typeset .keys .key-fingerprint::before{padding-right:.4em;content:"☝"}/* Custom properties holding the magiclink icon URLs used as mask images. NOTE(review): the data: URIs on this line carry no payload after the comma - confirm against the upstream asset. */:root>*{--magiclink-email-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-github-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-bitbucket-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-gitlab-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-commit-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-compare-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-pull-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-issue-icon:url('data:image/svg+xml;charset=utf-8,');--magiclink-discussion-icon:url('data:image/svg+xml;charset=utf-8,')}.md-typeset a[href^="mailto:"]:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-email-icon);mask-image:var(--magiclink-email-icon)}.md-typeset 
.magiclink-commit:not(.magiclink-ignore),.md-typeset .magiclink-compare:not(.magiclink-ignore),.md-typeset .magiclink-discussion:not(.magiclink-ignore),.md-typeset .magiclink-issue:not(.magiclink-ignore),.md-typeset .magiclink-pull:not(.magiclink-ignore),.md-typeset .magiclink-repository:not(.magiclink-ignore),.md-typeset a[href^="mailto:"]:not(.magiclink-ignore){position:relative;padding-left:1.375em}/* ::before of each magiclink/mailto link (unless .magiclink-ignore): an absolutely positioned 1.25em box filled with the link color and shaped by a mask image; it switches to the accent color on hover. */.md-typeset .magiclink-commit:not(.magiclink-ignore)::before,.md-typeset .magiclink-compare:not(.magiclink-ignore)::before,.md-typeset .magiclink-discussion:not(.magiclink-ignore)::before,.md-typeset .magiclink-issue:not(.magiclink-ignore)::before,.md-typeset .magiclink-pull:not(.magiclink-ignore)::before,.md-typeset .magiclink-repository:not(.magiclink-ignore)::before,.md-typeset a[href^="mailto:"]:not(.magiclink-ignore)::before{position:absolute;top:0;left:0;display:block;box-sizing:border-box;width:1.25em;height:1.25em;background-color:var(--md-typeset-a-color);background-size:1.25em;transition:background-color 125ms;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;-webkit-mask-size:contain;mask-size:contain;content:""}.md-typeset .magiclink-commit:not(.magiclink-ignore):hover::before,.md-typeset .magiclink-compare:not(.magiclink-ignore):hover::before,.md-typeset .magiclink-discussion:not(.magiclink-ignore):hover::before,.md-typeset .magiclink-issue:not(.magiclink-ignore):hover::before,.md-typeset .magiclink-pull:not(.magiclink-ignore):hover::before,.md-typeset .magiclink-repository:not(.magiclink-ignore):hover::before,.md-typeset a[href^="mailto:"]:not(.magiclink-ignore):hover::before{background-color:var(--md-accent-fg-color)}/* Per-link-type mask image, read from the matching --magiclink-*-icon custom property. */.md-typeset .magiclink-commit:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-commit-icon);mask-image:var(--magiclink-commit-icon)}.md-typeset .magiclink-compare:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-compare-icon);mask-image:var(--magiclink-compare-icon)}.md-typeset 
.magiclink-pull:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-pull-icon);mask-image:var(--magiclink-pull-icon)}.md-typeset .magiclink-issue:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-issue-icon);mask-image:var(--magiclink-issue-icon)}.md-typeset .magiclink-discussion:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-discussion-icon);mask-image:var(--magiclink-discussion-icon)}/* Repository links pick their mask by provider class (github, gitlab, bitbucket). */.md-typeset .magiclink-repository.magiclink-github:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-github-icon);mask-image:var(--magiclink-github-icon)}.md-typeset .magiclink-repository.magiclink-gitlab:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-gitlab-icon);mask-image:var(--magiclink-gitlab-icon)}.md-typeset .magiclink-repository.magiclink-bitbucket:not(.magiclink-ignore)::before{-webkit-mask-image:var(--magiclink-bitbucket-icon);mask-image:var(--magiclink-bitbucket-icon)}.md-typeset mark:not(.critic){box-shadow:none}/* Progress bars: .progress is the relatively positioned full-width container, .progress-bar is floated left inside it, and .progress-label is an absolutely positioned, centered text overlay. The .thin variant reduces the height to .4rem. */.md-typeset .progress-label{position:absolute;width:100%;margin:0;color:var(--md-text-color);font-weight:700;line-height:1.4rem;white-space:nowrap;text-align:center;text-shadow:-.0625em -.0625em .375em var(--md-default-bg-color--light),.0625em -.0625em .375em var(--md-default-bg-color--light),-.0625em .0625em .375em var(--md-default-bg-color--light),.0625em .0625em .375em var(--md-default-bg-color--light)}.md-typeset .progress-bar{float:left;height:1.2rem;background-color:#2979ff}.md-typeset .candystripe-animate .progress-bar{animation:animate-stripes 3s linear infinite}.md-typeset .progress{position:relative;display:block;width:100%;height:1.2rem;margin:.5rem 0;background-color:var(--md-default-fg-color--lightest)}.md-typeset .progress.thin{height:.4rem;margin-top:.9rem}.md-typeset .progress.thin .progress-label{margin-top:-.4rem}.md-typeset .progress.thin .progress-bar{height:.4rem}.md-typeset .progress.candystripe 
.progress-bar{background-image:linear-gradient(135deg,var(--md-progress-stripe) 27%,transparent 27%,transparent 52%,var(--md-progress-stripe) 52%,var(--md-progress-stripe) 77%,transparent 77%,transparent);background-size:2rem 2rem}/* Bar color per .progress-NNplus class, read from the matching --md-progress-NN custom property. */.md-typeset .progress-100plus .progress-bar{background-color:var(--md-progress-100)}.md-typeset .progress-80plus .progress-bar{background-color:var(--md-progress-80)}.md-typeset .progress-60plus .progress-bar{background-color:var(--md-progress-60)}.md-typeset .progress-40plus .progress-bar{background-color:var(--md-progress-40)}.md-typeset .progress-20plus .progress-bar{background-color:var(--md-progress-20)}.md-typeset .progress-0plus .progress-bar{background-color:var(--md-progress-0)}/* Keyframes that move background-position horizontally from 0 to 6rem. */@keyframes animate-stripes{0%{background-position:0 0}100%{background-position:6rem 0}}[data-md-color-scheme=dracula] .md-typeset .tabbed-set>.tabbed-labels{box-shadow:0 -.05rem var(--md-default-fg-color--lighter) inset}/* .tabbed-alternate tab sets: control widths, .6rem side padding on each block, and negative side margins with matching padding for a code-like element that is the block's only child. */.md-typeset .tabbed-alternate.tabbed-set .tabbed-control{width:2rem}.md-typeset .tabbed-alternate.tabbed-set .tabbed-control[hidden]{width:1.2rem;opacity:0}.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block{padding:0 .6rem}.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.codehilite:only-child,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.codehilitetable:only-child,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.highlight:only-child,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.highlighttable:only-child,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>pre:only-child{margin-right:-1.2rem;margin-left:-1.2rem;padding-right:.6rem;padding-left:.6rem}.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.codehilite:only-child span.filename,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.codehilitetable:only-child span.filename,.md-typeset 
.tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.highlight:only-child span.filename,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.highlighttable:only-child span.filename,.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>pre:only-child span.filename{margin-top:0}/* The same negative-margin treatment for an only-child .collapse-code or diagram-div inside a tabbed block. */.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.collapse-code:only-child{margin-top:0;margin-right:-1.2rem;margin-left:-1.2rem;padding-right:.6rem;padding-left:.6rem}.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>.collapse-code:only-child>.code-footer{left:.6rem}.md-typeset .tabbed-alternate.tabbed-set>.tabbed-content>.tabbed-block>diagram-div:only-child{margin-right:-1.2rem;margin-left:-1.2rem;padding-right:.6rem;padding-left:.6rem}.js .md-typeset .tabbed-labels::before{background-color:var(--md-accent-fg-color)}/* Dracula scheme: tables without a class attribute (shadow, row hover, header and cell borders) and task-list indicator colors. */[data-md-color-scheme=dracula] .md-typeset table:not([class]){box-shadow:var(--md-shadow-z2)}[data-md-color-scheme=dracula] .md-typeset table:not([class]) tr:hover{background-color:rgba(0,0,0,.08)}[data-md-color-scheme=dracula] .md-typeset table:not([class]) th{color:var(--md-text-color);background-color:var(--md-default-bg-color--ultra-dark);border-bottom:.05rem solid var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-typeset table:not([class]) td{border-top:.05rem solid var(--md-default-fg-color--lighter)}[data-md-color-scheme=dracula] .md-typeset .task-list-control .task-list-indicator::before{background-color:var(--md-default-fg-color--lighter)}[data-md-color-scheme=dracula] .md-typeset .task-list-control [type=checkbox]:checked+.task-list-indicator::before{background-color:#51f97b}.md-typeset 
.headerlink{width:1em;height:1em;vertical-align:middle;background-color:var(--md-default-fg-color--lighter);background-size:1em;-webkit-mask-size:1em;mask-size:1em;-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat;visibility:visible;-webkit-mask-image:url('data:image/svg+xml;charset=utf-8,');mask-image:url('data:image/svg+xml;charset=utf-8,')}/* The headerlink above is a 1em masked box; it takes the accent color on hover or when its [id] ancestor is the :target. */.md-typeset .headerlink:hover,.md-typeset [id]:target .headerlink{background-color:var(--md-accent-fg-color)}diagram-div{overflow:auto}html{background-color:transparent}[data-md-component=announce] .twemoji{color:var(--md-primary-fg-color)}/* Dracula scheme: scheme-level custom properties (text, footer and header colors), followed by colors for .md-header, .md-top, .md-tabs, .md-hero and navigation elements. */[data-md-color-scheme=dracula]{--md-text-color:var(--md-default-fg-color);background-color:var(--md-default-bg-color);--md-footer-bg-color:transparent;--md-footer-bg-color--dark:var(--md-default-bg-color--darkest);--md-header-fg-color:var(--md-text-color);--md-header-bg-color:var(--md-default-bg-color--darkest)}[data-md-color-scheme=dracula] .md-header{color:var(--md-text-color);background-color:var(--md-header-bg-color);border-bottom:.05rem solid var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-header[data-md-state=shadow]{box-shadow:0 0 .2rem rgba(0,0,0,.15),0 0 .2rem .4rem rgba(0,0,0,.2)}[data-md-color-scheme=dracula] .md-top{background-color:var(--md-default-bg-color--dark)}[data-md-color-scheme=dracula] .md-top:hover{background-color:var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-tabs{color:var(--md-text-color);background-color:var(--md-primary-fg-color--transparent)}[data-md-color-scheme=dracula] .md-tabs__link--active{color:var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-tabs__link:hover{color:var(--md-accent-fg-color)}[data-md-color-scheme=dracula] .md-hero{color:var(--md-text-color);background-color:var(--md-primary-fg-color--transparent)}[data-md-color-scheme=dracula] .md-nav__source{color:var(--md-text-color)}[data-md-color-scheme=dracula] 
.md-nav__link[data-md-state=blur]{color:var(--md-default-fg-color--light)}/* Dracula scheme: navigation link states, search input/placeholder/suggestion colors, overlays, footer and announce bar. */[data-md-color-scheme=dracula] .md-nav__item .md-nav__link--active{color:var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-nav__link:focus,[data-md-color-scheme=dracula] .md-nav__link:hover{color:var(--md-accent-fg-color)}[data-md-color-scheme=dracula] .md-search__input{color:var(--md-text-color);background-color:var(--md-accent-bg-color--light)}[data-md-color-scheme=dracula] .md-search__input:hover{background-color:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] .md-search__input~.md-search__icon{color:var(--md-text-color)}[data-md-color-scheme=dracula] .md-search__input::-moz-placeholder{color:var(--md-default-fg-color--light)}[data-md-color-scheme=dracula] .md-search__input::placeholder{color:var(--md-default-fg-color--light)}[data-md-color-scheme=dracula] [data-md-toggle=search]:checked~.md-header .md-search__input{background-color:transparent}[data-md-color-scheme=dracula] .md-search__suggest{color:var(--md-default-autocomplete-fg-color)}[data-md-color-scheme=dracula] .md-overlay,[data-md-color-scheme=dracula] .md-search__overlay{background-color:var(--md-default-bg-color--light)}[data-md-color-scheme=dracula] .md-footer-nav__direction{color:var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-footer-meta{border-top:.05rem solid var(--md-primary-fg-color)}[data-md-color-scheme=dracula] [data-md-component=announce]{background-color:var(--md-default-bg-color--ultra-dark)}.md-typeset h5{color:var(--md-text-color);text-transform:none}/* Scrollbars for the listed scroll containers: thin, colored with --md-default-fg-color--lighter on a transparent track (standard scrollbar-* properties). */.md-search__scrollwrap,.md-sidebar__scrollwrap,.md-typeset diagram-div,.md-typeset div.arithmatex,.md-typeset div.diagram,.md-typeset div.mermaid,.md-typeset mermaid-div,.md-typeset pre.arithmatex,.md-typeset pre>code,.md-typeset__scrollwrap{scrollbar-color:var(--md-default-fg-color--lighter) transparent;scrollbar-width:thin}.md-search__scrollwrap::-webkit-scrollbar,.md-sidebar__scrollwrap::-webkit-scrollbar,.md-typeset 
diagram-div::-webkit-scrollbar,.md-typeset div.arithmatex::-webkit-scrollbar,.md-typeset div.diagram::-webkit-scrollbar,.md-typeset div.mermaid::-webkit-scrollbar,.md-typeset mermaid-div::-webkit-scrollbar,.md-typeset pre.arithmatex::-webkit-scrollbar,.md-typeset pre>code::-webkit-scrollbar,.md-typeset__scrollwrap::-webkit-scrollbar{width:.2rem;height:.2rem}/* ::-webkit-scrollbar parts for the same containers: transparent corner, thumb colored with --md-default-fg-color--lighter, accent color while the thumb is hovered. */.md-search__scrollwrap::-webkit-scrollbar-corner,.md-sidebar__scrollwrap::-webkit-scrollbar-corner,.md-typeset diagram-div::-webkit-scrollbar-corner,.md-typeset div.arithmatex::-webkit-scrollbar-corner,.md-typeset div.diagram::-webkit-scrollbar-corner,.md-typeset div.mermaid::-webkit-scrollbar-corner,.md-typeset mermaid-div::-webkit-scrollbar-corner,.md-typeset pre.arithmatex::-webkit-scrollbar-corner,.md-typeset pre>code::-webkit-scrollbar-corner,.md-typeset__scrollwrap::-webkit-scrollbar-corner{background-color:transparent}.md-search__scrollwrap::-webkit-scrollbar-thumb,.md-sidebar__scrollwrap::-webkit-scrollbar-thumb,.md-typeset diagram-div::-webkit-scrollbar-thumb,.md-typeset div.arithmatex::-webkit-scrollbar-thumb,.md-typeset div.diagram::-webkit-scrollbar-thumb,.md-typeset div.mermaid::-webkit-scrollbar-thumb,.md-typeset mermaid-div::-webkit-scrollbar-thumb,.md-typeset pre.arithmatex::-webkit-scrollbar-thumb,.md-typeset pre>code::-webkit-scrollbar-thumb,.md-typeset__scrollwrap::-webkit-scrollbar-thumb{background-color:var(--md-default-fg-color--lighter)}.md-search__scrollwrap::-webkit-scrollbar-thumb:hover,.md-sidebar__scrollwrap::-webkit-scrollbar-thumb:hover,.md-typeset diagram-div::-webkit-scrollbar-thumb:hover,.md-typeset div.arithmatex::-webkit-scrollbar-thumb:hover,.md-typeset div.diagram::-webkit-scrollbar-thumb:hover,.md-typeset div.mermaid::-webkit-scrollbar-thumb:hover,.md-typeset mermaid-div::-webkit-scrollbar-thumb:hover,.md-typeset pre.arithmatex::-webkit-scrollbar-thumb:hover,.md-typeset 
pre>code::-webkit-scrollbar-thumb:hover,.md-typeset__scrollwrap::-webkit-scrollbar-thumb:hover{background-color:var(--md-accent-fg-color)}.md-search__scrollwrap:hover,.md-sidebar__scrollwrap:hover,.md-typeset diagram-div:hover,.md-typeset div.arithmatex:hover,.md-typeset div.diagram:hover,.md-typeset div.mermaid:hover,.md-typeset mermaid-div:hover,.md-typeset pre.arithmatex:hover,.md-typeset pre>code:hover,.md-typeset__scrollwrap:hover{scrollbar-color:var(--md-accent-fg-color) transparent}@media screen and (max-width:59.9375em){.md-header-nav__scheme{padding-right:0}label[for=__search]{padding-left:0}[data-md-color-scheme=dracula] .md-nav__source{color:var(--md-text-color);background-color:var(--md-primary-fg-color--transparent)}[data-md-color-scheme=dracula] .md-nav .md-nav__title{color:var(--md-text-color);background-color:var(--md-header-bg-color);border-bottom:.05rem solid var(--md-primary-fg-color)}}@media screen and (max-width:44.9375em){.md-typeset>diagram-div{margin-right:-.8rem;margin-left:-.8rem}.md-typeset>.collapse-code{margin-right:-.8rem;margin-left:-.8rem}.md-typeset>.collapse-code label.collapse{left:.8rem}[dir=ltr] .md-content__inner>.tabbed-set .tabbed-labels{padding-left:0}.md-content__inner>.tabbed-set .tabbed-labels{max-width:100%;margin:0;padding-inline-start:0;scroll-padding-inline-start:0}.md-content__inner>.tabbed-set .tabbed-labels::after{padding-inline-end:0;content:none}.md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--prev{margin-inline-start:0;padding-inline-start:0}.md-content__inner>.tabbed-set .tabbed-labels~.tabbed-control--next{margin-inline-end:0;padding-inline-end:0}}@media screen and (max-width:76.1875em){[data-md-color-scheme=dracula] .md-nav--primary .md-nav__item--active>.md-nav__link:not(:hover){color:var(--md-primary-fg-color)}[data-md-color-scheme=dracula] .md-nav--primary .md-nav__title{color:var(--md-text-color);background-color:var(--md-header-bg-color);border-bottom:.05rem solid 
var(--md-primary-fg-color)}} /*# sourceMappingURL=extra-fb5a2a1c86.css.map */ ================================================ FILE: docs/theme/assets/pymdownx-extras/extra-loader-MCFnu0Wd.js ================================================ function _typeof(t){return _typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},_typeof(t)}!function(){"use strict";function t(){t=function(){return r};var e,r={},n=Object.prototype,o=n.hasOwnProperty,i=Object.defineProperty||function(t,e,r){t[e]=r.value},a="function"==typeof Symbol?Symbol:{},c=a.iterator||"@@iterator",u=a.asyncIterator||"@@asyncIterator",l=a.toStringTag||"@@toStringTag";function f(t,e,r){return Object.defineProperty(t,e,{value:r,enumerable:!0,configurable:!0,writable:!0}),t[e]}try{f({},"")}catch(e){f=function(t,e,r){return t[e]=r}}function s(t,e,r,n){var o=e&&e.prototype instanceof g?e:g,a=Object.create(o.prototype),c=new N(n||[]);return i(a,"_invoke",{value:S(t,r,c)}),a}function h(t,e,r){try{return{type:"normal",arg:t.call(e,r)}}catch(t){return{type:"throw",arg:t}}}r.wrap=s;var d="suspendedStart",p="suspendedYield",y="executing",m="completed",v={};function g(){}function b(){}function w(){}var x={};f(x,c,(function(){return this}));var E=Object.getPrototypeOf,L=E&&E(E(C([])));L&&L!==n&&o.call(L,c)&&(x=L);var O=w.prototype=g.prototype=Object.create(x);function _(t){["next","throw","return"].forEach((function(e){f(t,e,(function(t){return this._invoke(e,t)}))}))}function j(t,e){function r(n,i,a,c){var u=h(t[n],t,i);if("throw"!==u.type){var l=u.arg,f=l.value;return f&&"object"==_typeof(f)&&o.call(f,"__await")?e.resolve(f.__await).then((function(t){r("next",t,a,c)}),(function(t){r("throw",t,a,c)})):e.resolve(f).then((function(t){l.value=t,a(l)}),(function(t){return r("throw",t,a,c)}))}c(u.arg)}var n;i(this,"_invoke",{value:function(t,o){function i(){return new 
e((function(e,n){r(t,o,e,n)}))}return n=n?n.then(i,i):i()}})}function S(t,r,n){var o=d;return function(i,a){if(o===y)throw new Error("Generator is already running");if(o===m){if("throw"===i)throw a;return{value:e,done:!0}}for(n.method=i,n.arg=a;;){var c=n.delegate;if(c){var u=P(c,n);if(u){if(u===v)continue;return u}}if("next"===n.method)n.sent=n._sent=n.arg;else if("throw"===n.method){if(o===d)throw o=m,n.arg;n.dispatchException(n.arg)}else"return"===n.method&&n.abrupt("return",n.arg);o=y;var l=h(t,r,n);if("normal"===l.type){if(o=n.done?m:p,l.arg===v)continue;return{value:l.arg,done:n.done}}"throw"===l.type&&(o=m,n.method="throw",n.arg=l.arg)}}}function P(t,r){var n=r.method,o=t.iterator[n];if(o===e)return r.delegate=null,"throw"===n&&t.iterator.return&&(r.method="return",r.arg=e,P(t,r),"throw"===r.method)||"return"!==n&&(r.method="throw",r.arg=new TypeError("The iterator does not provide a '"+n+"' method")),v;var i=h(o,t.iterator,r.arg);if("throw"===i.type)return r.method="throw",r.arg=i.arg,r.delegate=null,v;var a=i.arg;return a?a.done?(r[t.resultName]=a.value,r.next=t.nextLoc,"return"!==r.method&&(r.method="next",r.arg=e),r.delegate=null,v):a:(r.method="throw",r.arg=new TypeError("iterator result is not an object"),r.delegate=null,v)}function k(t){var e={tryLoc:t[0]};1 in t&&(e.catchLoc=t[1]),2 in t&&(e.finallyLoc=t[2],e.afterLoc=t[3]),this.tryEntries.push(e)}function M(t){var e=t.completion||{};e.type="normal",delete e.arg,t.completion=e}function N(t){this.tryEntries=[{tryLoc:"root"}],t.forEach(k,this),this.reset(!0)}function C(t){if(t||""===t){var r=t[c];if(r)return r.call(t);if("function"==typeof t.next)return t;if(!isNaN(t.length)){var n=-1,i=function r(){for(;++n=0;--i){var a=this.tryEntries[i],c=a.completion;if("root"===a.tryLoc)return n("end");if(a.tryLoc<=this.prev){var u=o.call(a,"catchLoc"),l=o.call(a,"finallyLoc");if(u&&l){if(this.prev=0;--r){var n=this.tryEntries[r];if(n.tryLoc<=this.prev&&o.call(n,"finallyLoc")&&this.prev=0;--e){var 
r=this.tryEntries[e];if(r.finallyLoc===t)return this.complete(r.completion,r.afterLoc),M(r),v}},catch:function(t){for(var e=this.tryEntries.length-1;e>=0;--e){var r=this.tryEntries[e];if(r.tryLoc===t){var n=r.completion;if("throw"===n.type){var o=n.arg;M(r)}return o}}throw new Error("illegal catch attempt")},delegateYield:function(t,r,n){return this.delegate={iterator:C(t),resultName:r,nextLoc:n},"next"===this.method&&(this.arg=e),v}},r}function e(t,e,r,n,o,i,a){try{var c=t[i](a),u=c.value}catch(t){return void r(t)}c.done?e(u):Promise.resolve(u).then(n,o)}function r(t,e){if(!(t instanceof e))throw new TypeError("Cannot call a class as a function")}function n(t,e){for(var r=0;r .label text { fill: var(--drac-purple-fg); } .edgeLabel text { fill: var(--drac-purple-fg) !important; } .edgeLabel rect { opacity: 0.75 !important; fill: var(--drac-purple-bg) !important; } .grey rect.label-container { fill: var(--drac-purple-bg) !important; stroke: var(--drac-purple-fg) !important; } /* Sequence */ line[id^='actor'] { stroke: var(--drac-blue-fg); } .noteText { fill: var(--drac-yellow-fg); } /* Gantt */ .sectionTitle { fill: var(--drac-purple-fg) !important; } .grid .tick line { stroke: var(--drac-blue-fg) !important; } .grid .tick text { fill: var(--drac-purple-fg); } /* Class Diagram */ .statediagram-state rect.divider { fill: transparent !important; } /* State Diagram */ .stateGroup circle[style$=\"fill: black;\"] { fill: var(--drac-purple-bg) !important; stroke: var(--drac-purple-bg) !important; } .stateGroup circle[style$=\"fill: white;\"] { fill: var(--drac-purple-bg) !important; stroke: var(--drac-purple-fg) !important; } .stateGroup .composit { fill: var(--drac-page-bg); } /* Pie */ text.slice { fill: var(--drac-white-fg) !important; } /* Git Graph */ .commit-bullets .commit-reverse, .commit-bullets .commit-merge, .commit-bullets .commit-highlight-inner { fill: var(--drac-page-bg) !important; stroke: var(--drac-page-bg) !important; } 
",themeVariables:{darkMode:!0,background:"#323443",mainBkg:"#604b7d",textColor:"#bf95f9",lineColor:"#bf95f9",errorBkgColor:"#802c2c",errorTextColor:"#ff5757",primaryColor:"#604b7d",primaryTextColor:"#bf95f9",primaryBorderColor:"#bf95f9",secondaryColor:"#297d3e",secondaryTextColor:"#52fa7c",secondaryBorderColor:"#52fa7c",tertiaryColor:"#303952",tertiaryTextColor:"#6071a4",tertiaryBorderColor:"#6071a4",noteBkgColor:"#797d45",noteTextColor:"#f1fa89",noteBorderColor:"#f1fa89",edgeLabelBackground:"#604b7d",edgeLabelText:"#604b7d",actorLineColor:"#6071a4",activeTaskBkgColor:"#803d63",activeTaskBorderColor:"#ff7ac6",doneTaskBkgColor:"#297d3e",doneTaskBorderColor:"#52fa7c",critBkgColor:"#802c2c",critBorderColor:"#ff5757",taskTextColor:"#bf95f9",taskTextOutsideColor:"#bf95f9",taskTextLightColor:"#bf95f9",sectionBkgColor:"#bf95f9b3",sectionBkgColor2:"#bf95f966",altSectionBkgColor:"#323443",todayLineColor:"#ff7ac6",gridColor:"#6071a4",defaultLinkColor:"#8be8fd",altBackground:"#bf95f9",classText:"#bf95f9",fillType0:"#406080",fillType1:"#46747f",fillType2:"#297d3e",fillType3:"#805c36",fillType4:"#803d63",fillType5:"#604b7d",fillType6:"#802c2c",fillType7:"#797d45",fillType8:"#7c7c79",git0:"#ff5555",git1:"#ffb86c",git2:"#f1fa8c",git3:"#50fa7b",git4:"#8be9fd",git5:"#809fff",git6:"#ff79c6",git7:"#bd93f9",gitInv0:"#ff5555",gitInv1:"#ffb86c",gitInv2:"#f1fa8c",gitInv3:"#50fa7b",gitInv4:"#8be9fd",gitInv5:"#809fff",gitInv6:"#ff79c6",gitInv7:"#bd93f9",gitBranchLabel0:"#323443",gitBranchLabel1:"#323443",gitBranchLabel2:"#323443",gitBranchLabel3:"#323443",gitBranchLabel4:"#323443",gitBranchLabel5:"#323443",gitBranchLabel6:"#323443",gitBranchLabel7:"#323443",commitLabelColor:"#52fa7c",commitLabelBackground:"#297d3e"},flowchart:{htmlLabels:!1,useMaxWidth:!1},er:{useMaxWidth:!1},sequence:{useMaxWidth:!1,noteFontWeight:"14px",actorFontSize:"14px",messageFontSize:"16px"},journey:{useMaxWidth:!1},pie:{useMaxWidth:!1},gantt:{useMaxWidth:!1},gitGraph:{useMaxWidth:!1}},default:{startOnLoad:!1,theme:
"default",flowchart:{htmlLabels:!1,useMaxWidth:!1},er:{useMaxWidth:!1},sequence:{useMaxWidth:!1,noteFontWeight:"14px",actorFontSize:"14px",messageFontSize:"16px"},journey:{useMaxWidth:!1},pie:{useMaxWidth:!1},gantt:{useMaxWidth:!1},gitGraph:{useMaxWidth:!1}},slate:{startOnLoad:!1,theme:"dark",flowchart:{htmlLabels:!1,useMaxWidth:!1},er:{useMaxWidth:!1},sequence:{useMaxWidth:!1,noteFontWeight:"14px",actorFontSize:"14px",messageFontSize:"16px"},journey:{useMaxWidth:!1},pie:{useMaxWidth:!1},gantt:{useMaxWidth:!1},gitGraph:{useMaxWidth:!1}}})}(); //# sourceMappingURL=material-extra-3rdparty-E-i8w1WA.js.map ================================================ FILE: docs/theme/assets/pymdownx-extras/material-extra-theme-TVq-kNRT.js ================================================ !function(){"use strict";var e;e=function(e){"true"===localStorage.getItem("data-md-prefers-color-scheme")&&document.querySelector("body").setAttribute("data-md-color-scheme",e.matches?"dracula":"default")},new MutationObserver((function(t){t.forEach((function(t){if("childList"===t.type&&t.addedNodes.length)for(var a=0;a {% if page.previous_page or page.next_page %} {% endif %}
================================================ FILE: docs/theme/partials/header.html ================================================ {% set site_url = config.site_url | d(nav.homepage.url, true) | url %} {% if not config.use_directory_urls and site_url[0] == site_url[-1] == "." %} {% set site_url = site_url ~ "/index.html" %} {% endif %}
================================================ FILE: docs/theme/partials/libs.html ================================================ ================================================ FILE: docs/usage.md ================================================ ## 1. Add your AI models ![resources tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/resources-tab.png) - The tool uses Large Language Models (LLMs) to perform various tasks in a QA pipeline. So, you need to provide the application with access to the LLMs you want to use. - You need to provide at least one. However, it is recommended that you include all the LLMs that you have access to; you will be able to switch between them while using the application. To add a model: 1. Navigate to the `Resources` tab. 2. Select the `LLMs` sub-tab. 3. Select the `Add` sub-tab. 4. Configure the model to add: - Give it a name. - Pick a vendor/provider (e.g. `ChatOpenAI`). - Provide the specifications. - (Optional) Set the model as default. 5. Click `Add` to add the model. 6. Select the `Embedding Models` sub-tab and repeat steps 3 to 5 to add an embedding model.
(Optional) Configure model via the .env file Alternatively, you can configure the models via the `.env` file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one. Currently, the following providers are supported: ### OpenAI In the `.env` file, set the `OPENAI_API_KEY` variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified; please feel free to edit them to fit your case. Otherwise, the default parameters should work for most people. ```shell OPENAI_API_BASE=https://api.openai.com/v1 OPENAI_API_KEY= OPENAI_CHAT_MODEL=gpt-3.5-turbo OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002 ``` ### Azure OpenAI For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide your deployments' names for the chat model and the embedding model, depending on how you set up your Azure deployment. ```shell AZURE_OPENAI_ENDPOINT= AZURE_OPENAI_API_KEY= OPENAI_API_VERSION=2024-02-15-preview # could be different for you AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo # change to your deployment name AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 # change to your deployment name ``` ### Local models Pros: - Privacy. Your documents will be stored and processed locally. - Choices. There is a wide range of LLMs in terms of size, domain, language to choose from. - Cost. It's free. Cons: - Quality. Local models are much smaller and thus have lower generative quality than paid APIs. - Speed. Local models are deployed using your machine so the processing speed is limited by your hardware. #### Find and download a LLM You can search and download a LLM to be run locally from the [Hugging Face Hub](https://huggingface.co/models). Currently, these model formats are supported: - GGUF You should choose a model whose size is less than your device's available memory, leaving about 2 GB free.
For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time. Here are some recommendations and their size in memory: - [Qwen1.5-1.8B-Chat-GGUF](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true): around 2 GB #### Enable local models To add a local model to the model pool, set the `LOCAL_MODEL` variable in the `.env` file to the path of the model file. ```shell LOCAL_MODEL= ``` Here is how to get the full path of your model file: - On Windows 11: right-click the file and select `Copy as Path`.
## 2. Upload your documents ![file index tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/file-index-tab.png) In order to do QA on your documents, you need to upload them to the application first. Navigate to the `File Index` tab and you will see 2 sections: 1. File upload: - Drag and drop your file to the UI or select it from your file system. Then click `Upload and Index`. - The application will take some time to process the file and show a message once it is done. 2. File list: - This section shows the list of files that have been uploaded to the application and allows users to delete them. ## 3. Chat with your documents ![chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) Now navigate back to the `Chat` tab. The chat tab is divided into 3 regions: 1. Conversation Settings Panel - Here you can select, create, rename, and delete conversations. - By default, a new conversation is created automatically if no conversation is selected. - Below that you have the file index, where you can choose whether to disable, select all files, or select which files to retrieve references from. - If you choose "Disabled", no files will be considered as context during chat. - If you choose "Search All", all files will be considered during chat. - If you choose "Select", a dropdown will appear for you to select the files to be considered during chat. If no files are selected, then no files will be considered during chat. 2. Chat Panel - This is where you can chat with the chatbot. 3. Information Panel ![information panel](https://raw.githubusercontent.com/Cinnamon/kotaemon/develop/docs/images/info-panel-scores.png) - Supporting information such as the retrieved evidence and reference will be displayed here. - Direct citation for the answer produced by the LLM is highlighted. 
- The confidence score of the answer and relevant scores of evidences are displayed to quickly assess the quality of the answer and retrieved content. - Meaning of the score displayed: - **Answer confidence**: answer confidence level from the LLM model. - **Relevance score**: overall relevant score between evidence and user question. - **Vectorstore score**: relevant score from vector embedding similarity calculation (show `full-text search` if retrieved from full-text search DB). - **LLM relevant score**: relevant score from LLM model (which judge relevancy between question and evidence using specific prompt). - **Reranking score**: relevant score from Cohere [reranking model](https://cohere.com/rerank). Generally, the score quality is `LLM relevant score` > `Reranking score` > `Vectorscore`. By default, overall relevance score is taken directly from LLM relevant score. Evidences are sorted based on their overall relevance score and whether they have citation or not. ================================================ FILE: flowsettings.py ================================================ import os from importlib.metadata import version from inspect import currentframe, getframeinfo from pathlib import Path from decouple import config from ktem.utils.lang import SUPPORTED_LANGUAGE_MAP from theflow.settings.default import * # noqa cur_frame = currentframe() if cur_frame is None: raise ValueError("Cannot get the current frame.") this_file = getframeinfo(cur_frame).filename this_dir = Path(this_file).parent # change this if your app use a different name KH_PACKAGE_NAME = "kotaemon_app" KH_APP_VERSION = config("KH_APP_VERSION", None) if not KH_APP_VERSION: try: # Caution: This might produce the wrong version # https://stackoverflow.com/a/59533071 KH_APP_VERSION = version(KH_PACKAGE_NAME) except Exception: KH_APP_VERSION = "local" KH_GRADIO_SHARE = config("KH_GRADIO_SHARE", default=False, cast=bool) KH_ENABLE_FIRST_SETUP = config("KH_ENABLE_FIRST_SETUP", default=True, 
cast=bool) KH_DEMO_MODE = config("KH_DEMO_MODE", default=False, cast=bool) KH_OLLAMA_URL = config("KH_OLLAMA_URL", default="http://localhost:11434/v1/") # App can be ran from anywhere and it's not trivial to decide where to store app data. # So let's use the same directory as the flowsetting.py file. KH_APP_DATA_DIR = this_dir / "ktem_app_data" KH_APP_DATA_EXISTS = KH_APP_DATA_DIR.exists() KH_APP_DATA_DIR.mkdir(parents=True, exist_ok=True) # User data directory KH_USER_DATA_DIR = KH_APP_DATA_DIR / "user_data" KH_USER_DATA_DIR.mkdir(parents=True, exist_ok=True) # markdown output directory KH_MARKDOWN_OUTPUT_DIR = KH_APP_DATA_DIR / "markdown_cache_dir" KH_MARKDOWN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # chunks output directory KH_CHUNKS_OUTPUT_DIR = KH_APP_DATA_DIR / "chunks_cache_dir" KH_CHUNKS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # zip output directory KH_ZIP_OUTPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir" KH_ZIP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # zip input directory KH_ZIP_INPUT_DIR = KH_APP_DATA_DIR / "zip_cache_dir_in" KH_ZIP_INPUT_DIR.mkdir(parents=True, exist_ok=True) # HF models can be big, let's store them in the app data directory so that it's easier # for users to manage their storage. 
# ref: https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache os.environ["HF_HOME"] = str(KH_APP_DATA_DIR / "huggingface") os.environ["HF_HUB_CACHE"] = str(KH_APP_DATA_DIR / "huggingface") # doc directory KH_DOC_DIR = this_dir / "docs" KH_MODE = "dev" KH_SSO_ENABLED = config("KH_SSO_ENABLED", default=False, cast=bool) KH_FEATURE_CHAT_SUGGESTION = config( "KH_FEATURE_CHAT_SUGGESTION", default=False, cast=bool ) KH_FEATURE_USER_MANAGEMENT = config( "KH_FEATURE_USER_MANAGEMENT", default=True, cast=bool ) KH_USER_CAN_SEE_PUBLIC = None KH_FEATURE_USER_MANAGEMENT_ADMIN = str( config("KH_FEATURE_USER_MANAGEMENT_ADMIN", default="admin") ) KH_FEATURE_USER_MANAGEMENT_PASSWORD = str( config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="admin") ) KH_ENABLE_ALEMBIC = False KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}" KH_FILESTORAGE_PATH = str(KH_USER_DATA_DIR / "files") KH_WEB_SEARCH_BACKEND = ( "kotaemon.indices.retrievers.tavily_web_search.WebSearch" # "kotaemon.indices.retrievers.jina_web_search.WebSearch" ) KH_DOCSTORE = { # "__type__": "kotaemon.storages.ElasticsearchDocumentStore", # "__type__": "kotaemon.storages.SimpleFileDocumentStore", "__type__": "kotaemon.storages.LanceDBDocumentStore", "path": str(KH_USER_DATA_DIR / "docstore"), } KH_VECTORSTORE = { # "__type__": "kotaemon.storages.LanceDBVectorStore", "__type__": "kotaemon.storages.ChromaVectorStore", # "__type__": "kotaemon.storages.MilvusVectorStore", # "__type__": "kotaemon.storages.QdrantVectorStore", "path": str(KH_USER_DATA_DIR / "vectorstore"), } KH_LLMS = {} KH_EMBEDDINGS = {} KH_RERANKINGS = {} # populate options from config if config("AZURE_OPENAI_API_KEY", default="") and config( "AZURE_OPENAI_ENDPOINT", default="" ): if config("AZURE_OPENAI_CHAT_DEPLOYMENT", default=""): KH_LLMS["azure"] = { "spec": { "__type__": "kotaemon.llms.AzureChatOpenAI", "temperature": 0, "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""), "api_key": config("AZURE_OPENAI_API_KEY", 
default=""), "api_version": config("OPENAI_API_VERSION", default="") or "2024-02-15-preview", "azure_deployment": config("AZURE_OPENAI_CHAT_DEPLOYMENT", default=""), "timeout": 20, }, "default": False, } if config("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default=""): KH_EMBEDDINGS["azure"] = { "spec": { "__type__": "kotaemon.embeddings.AzureOpenAIEmbeddings", "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""), "api_key": config("AZURE_OPENAI_API_KEY", default=""), "api_version": config("OPENAI_API_VERSION", default="") or "2024-02-15-preview", "azure_deployment": config( "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default="" ), "timeout": 10, }, "default": False, } OPENAI_DEFAULT = "" OPENAI_API_KEY = config("OPENAI_API_KEY", default=OPENAI_DEFAULT) GOOGLE_API_KEY = config("GOOGLE_API_KEY", default="your-key") IS_OPENAI_DEFAULT = len(OPENAI_API_KEY) > 0 and OPENAI_API_KEY != OPENAI_DEFAULT if OPENAI_API_KEY: KH_LLMS["openai"] = { "spec": { "__type__": "kotaemon.llms.ChatOpenAI", "temperature": 0, "base_url": config("OPENAI_API_BASE", default="") or "https://api.openai.com/v1", "api_key": OPENAI_API_KEY, "model": config("OPENAI_CHAT_MODEL", default="gpt-4o-mini"), "timeout": 20, }, "default": IS_OPENAI_DEFAULT, } KH_EMBEDDINGS["openai"] = { "spec": { "__type__": "kotaemon.embeddings.OpenAIEmbeddings", "base_url": config("OPENAI_API_BASE", default="https://api.openai.com/v1"), "api_key": OPENAI_API_KEY, "model": config( "OPENAI_EMBEDDINGS_MODEL", default="text-embedding-3-large" ), "timeout": 10, "context_length": 8191, }, "default": IS_OPENAI_DEFAULT, } VOYAGE_API_KEY = config("VOYAGE_API_KEY", default="") if VOYAGE_API_KEY: KH_EMBEDDINGS["voyageai"] = { "spec": { "__type__": "kotaemon.embeddings.VoyageAIEmbeddings", "api_key": VOYAGE_API_KEY, "model": config("VOYAGE_EMBEDDINGS_MODEL", default="voyage-3-large"), }, "default": False, } KH_RERANKINGS["voyageai"] = { "spec": { "__type__": "kotaemon.rerankings.VoyageAIReranking", "model_name": "rerank-2", "api_key": 
VOYAGE_API_KEY, }, "default": False, } if config("LOCAL_MODEL", default=""): KH_LLMS["ollama"] = { "spec": { "__type__": "kotaemon.llms.ChatOpenAI", "base_url": KH_OLLAMA_URL, "model": config("LOCAL_MODEL", default="qwen2.5:7b"), "api_key": "ollama", }, "default": False, } KH_LLMS["ollama-long-context"] = { "spec": { "__type__": "kotaemon.llms.LCOllamaChat", "base_url": KH_OLLAMA_URL.replace("v1/", ""), "model": config("LOCAL_MODEL", default="qwen2.5:7b"), "num_ctx": 8192, }, "default": False, } KH_EMBEDDINGS["ollama"] = { "spec": { "__type__": "kotaemon.embeddings.OpenAIEmbeddings", "base_url": KH_OLLAMA_URL, "model": config("LOCAL_MODEL_EMBEDDINGS", default="nomic-embed-text"), "api_key": "ollama", }, "default": False, } KH_EMBEDDINGS["fast_embed"] = { "spec": { "__type__": "kotaemon.embeddings.FastEmbedEmbeddings", "model_name": "BAAI/bge-base-en-v1.5", }, "default": False, } # additional LLM configurations KH_LLMS["claude"] = { "spec": { "__type__": "kotaemon.llms.chats.LCAnthropicChat", "model_name": "claude-3-5-sonnet-20240620", "api_key": "your-key", }, "default": False, } KH_LLMS["google"] = { "spec": { "__type__": "kotaemon.llms.chats.LCGeminiChat", "model_name": "gemini-1.5-flash", "api_key": GOOGLE_API_KEY, }, "default": not IS_OPENAI_DEFAULT, } KH_LLMS["groq"] = { "spec": { "__type__": "kotaemon.llms.ChatOpenAI", "base_url": "https://api.groq.com/openai/v1", "model": "llama-3.1-8b-instant", "api_key": "your-key", }, "default": False, } KH_LLMS["cohere"] = { "spec": { "__type__": "kotaemon.llms.chats.LCCohereChat", "model_name": "command-r-plus-08-2024", "api_key": config("COHERE_API_KEY", default="your-key"), }, "default": False, } KH_LLMS["mistral"] = { "spec": { "__type__": "kotaemon.llms.ChatOpenAI", "base_url": "https://api.mistral.ai/v1", "model": "ministral-8b-latest", "api_key": config("MISTRAL_API_KEY", default="your-key"), }, "default": False, } # additional embeddings configurations KH_EMBEDDINGS["cohere"] = { "spec": { "__type__": 
"kotaemon.embeddings.LCCohereEmbeddings", "model": "embed-multilingual-v3.0", "cohere_api_key": config("COHERE_API_KEY", default="your-key"), "user_agent": "default", }, "default": False, } KH_EMBEDDINGS["google"] = { "spec": { "__type__": "kotaemon.embeddings.LCGoogleEmbeddings", "model": "models/text-embedding-004", "google_api_key": GOOGLE_API_KEY, }, "default": not IS_OPENAI_DEFAULT, } KH_EMBEDDINGS["mistral"] = { "spec": { "__type__": "kotaemon.embeddings.LCMistralEmbeddings", "model": "mistral-embed", "api_key": config("MISTRAL_API_KEY", default="your-key"), }, "default": False, } # KH_EMBEDDINGS["huggingface"] = { # "spec": { # "__type__": "kotaemon.embeddings.LCHuggingFaceEmbeddings", # "model_name": "sentence-transformers/all-mpnet-base-v2", # }, # "default": False, # } # default reranking models KH_RERANKINGS["cohere"] = { "spec": { "__type__": "kotaemon.rerankings.CohereReranking", "model_name": "rerank-multilingual-v2.0", "cohere_api_key": config("COHERE_API_KEY", default=""), }, "default": True, } KH_REASONINGS = [ "ktem.reasoning.simple.FullQAPipeline", "ktem.reasoning.simple.FullDecomposeQAPipeline", "ktem.reasoning.react.ReactAgentPipeline", "ktem.reasoning.rewoo.RewooAgentPipeline", ] KH_REASONINGS_USE_MULTIMODAL = config("USE_MULTIMODAL", default=False, cast=bool) KH_VLM_ENDPOINT = "{0}/openai/deployments/{1}/chat/completions?api-version={2}".format( config("AZURE_OPENAI_ENDPOINT", default=""), config("OPENAI_VISION_DEPLOYMENT_NAME", default="gpt-4o"), config("OPENAI_API_VERSION", default=""), ) SETTINGS_APP: dict[str, dict] = {} SETTINGS_REASONING = { "use": { "name": "Reasoning options", "value": None, "choices": [], "component": "radio", }, "lang": { "name": "Language", "value": "en", "choices": [(lang, code) for code, lang in SUPPORTED_LANGUAGE_MAP.items()], "component": "dropdown", }, "max_context_length": { "name": "Max context length (LLM)", "value": 32000, "component": "number", }, } USE_GLOBAL_GRAPHRAG = config("USE_GLOBAL_GRAPHRAG", 
default=True, cast=bool) USE_NANO_GRAPHRAG = config("USE_NANO_GRAPHRAG", default=False, cast=bool) USE_LIGHTRAG = config("USE_LIGHTRAG", default=True, cast=bool) USE_MS_GRAPHRAG = config("USE_MS_GRAPHRAG", default=True, cast=bool) GRAPHRAG_INDEX_TYPES = [] if USE_MS_GRAPHRAG: GRAPHRAG_INDEX_TYPES.append("ktem.index.file.graph.GraphRAGIndex") if USE_NANO_GRAPHRAG: GRAPHRAG_INDEX_TYPES.append("ktem.index.file.graph.NanoGraphRAGIndex") if USE_LIGHTRAG: GRAPHRAG_INDEX_TYPES.append("ktem.index.file.graph.LightRAGIndex") KH_INDEX_TYPES = [ "ktem.index.file.FileIndex", *GRAPHRAG_INDEX_TYPES, ] GRAPHRAG_INDICES = [ { "name": graph_type.split(".")[-1].replace("Index", "") + " Collection", # get last name "config": { "supported_file_types": ( ".png, .jpeg, .jpg, .tiff, .tif, .pdf, .xls, .xlsx, .doc, .docx, " ".pptx, .csv, .html, .mhtml, .txt, .md, .zip" ), "private": True, }, "index_type": graph_type, } for graph_type in GRAPHRAG_INDEX_TYPES ] KH_INDICES = [ { "name": "File Collection", "config": { "supported_file_types": ( ".png, .jpeg, .jpg, .tiff, .tif, .pdf, .xls, .xlsx, .doc, .docx, " ".pptx, .csv, .html, .mhtml, .txt, .md, .zip" ), "private": True, }, "index_type": "ktem.index.file.FileIndex", }, *GRAPHRAG_INDICES, ] ================================================ FILE: fly.toml ================================================ # fly.toml app configuration file generated for kotaemon on 2024-12-24T20:56:32+07:00 # # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 
# app = 'kotaemon' primary_region = 'sin' [build] [mounts] destination = "/app/ktem_app_data" source = "ktem_volume" [http_service] internal_port = 7860 force_https = true auto_stop_machines = 'suspend' auto_start_machines = true min_machines_running = 0 processes = ['app'] [[vm]] memory = '4gb' cpu_kind = 'shared' cpus = 4 ================================================ FILE: launch.sh ================================================ #!/bin/bash if [ -z "$GRADIO_SERVER_NAME" ]; then export GRADIO_SERVER_NAME="0.0.0.0" fi if [ -z "$GRADIO_SERVER_PORT" ]; then export GRADIO_SERVER_PORT="7860" fi # Check if environment variable KH_DEMO_MODE is set to true if [ "$KH_DEMO_MODE" = "true" ]; then echo "KH_DEMO_MODE is true. Launching in demo mode..." # Command to launch in demo mode GR_FILE_ROOT_PATH="/app" KH_FEATURE_USER_MANAGEMENT=false USE_LIGHTRAG=false .venv/bin/uvicorn sso_app_demo:app --host "$GRADIO_SERVER_NAME" --port "$GRADIO_SERVER_PORT" else if [ "$KH_SSO_ENABLED" = "true" ]; then echo "KH_SSO_ENABLED is true. Launching in SSO mode..." 
GR_FILE_ROOT_PATH="/app" KH_SSO_ENABLED=true .venv/bin/uvicorn sso_app:app --host "$GRADIO_SERVER_NAME" --port "$GRADIO_SERVER_PORT" else ollama serve & .venv/bin/python app.py fi fi ================================================ FILE: libs/kotaemon/README.md ================================================ # kotaemon Quick and easy AI components to build Kotaemon ## Documentation TBD ## Install ```shell pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git ``` ## Contribute ### Setup - Create conda environment (suggest 3.10) ```shell conda create -n kotaemon python=3.10 conda activate kotaemon ``` - Clone the repo ```shell git clone git@github.com:Cinnamon/kotaemon.git cd kotaemon ``` - Install all ```shell pip install -e ".[dev]" ``` - Pre-commit ```shell pre-commit install ``` - Test ```shell pytest tests ``` ### Credential sharing This repo uses [git-secret](https://sobolevn.me/git-secret/) to share credentials, which internally uses `gpg` to encrypt and decrypt secret files. This repo uses `python-dotenv` to manage credentials stored as environment variables. Please note that `python-dotenv` and these credentials are for development purposes only. Thus, they should not be used in the main source code (i.e. `kotaemon/` and `tests/`), but can be used in `examples/`. #### Install git-secret Please follow the [official guide](https://sobolevn.me/git-secret/installation) to install git-secret. For Windows users, see [For Windows users](#for-windows-users). For users who don't have sudo privilege to install packages, follow the `Manual Installation` in the [official guide](https://sobolevn.me/git-secret/installation) and set `PREFIX` to a path that you have access to. And please don't forget to add `PREFIX` to your `PATH`. #### Gaining access In order to gain access to the secret files, you must provide your gpg public key to anyone who has access and ask them to add your key to the keyring.
For a quick tutorial on generating your gpg key pair, you can refer to the `Using gpg` section from the [git-secret main page](https://sobolevn.me/git-secret/). #### Decrypt the secret file The credentials are encrypted in the `.env.secret` file. To print the decrypted content to stdout, run ```shell git-secret cat [filename] ``` Or to get the decrypted `.env` file, run ```shell git-secret reveal [filename] ``` #### For Windows users git-secret is currently not available for Windows, thus the easiest way is to use it in WSL (please use the latest version of WSL2). From there you have 2 options: 1. Using the gpg of WSL. This is the most straight-forward option since you would use WSL just like any other unix environment. However, the downside is that you have to make WSL your main environment, which means WSL must have write permission on your repo. To achieve this, you must either: - Clone and store your repo inside WSL's file system. - Provide WSL with necessary permission on your Windows file system. This can be achieve by setting `automount` options for WSL. To do that, add these content to `/etc/wsl.conf` and then restart your sub-system. ```shell [automount] options = "metadata,umask=022,fmask=011" ``` This enables all permissions for user owner. 2. Using the gpg of Windows but with git-secret from WSL. For those who use Windows as the main environment, having to switch back and forth between Windows and WSL will be inconvenient. You can instead stay within your Windows environment and apply some tricks to use `git-secret` from WSL. - Install and setup `gpg` on Windows. - Install `git-secret` on WSL. Now in Windows, you can invoke `git-secret` using `wsl git-secret`. - Alternatively you can setup alias in CMD to shorten the syntax. Please refer to [this SO answer](https://stackoverflow.com/a/65823225) for the instruction. 
Some recommended aliases are: ```bat @echo off :: Commands DOSKEY ls=dir /B $* DOSKEY ll=dir /a $* DOSKEY git-secret=wsl git-secret $* DOSKEY gs=wsl git-secret $* ``` Now you can invoke `git-secret` in CMD using `git-secret` or `gs`. - For Powershell users, similar behaviours can be achieved using `Set-Alias` and `profile.ps1`. Please refer this [SO thread](https://stackoverflow.com/questions/61081434/how-do-i-create-a-permanent-alias-file-in-powershell-core) as an example. ### Code base structure - documents: define document - loaders ================================================ FILE: libs/kotaemon/kotaemon/__init__.py ================================================ # Disable telemetry with monkey patching import logging logger = logging.getLogger(__name__) try: import posthog def capture(*args, **kwargs): logger.info("posthog.capture called with args: %s, kwargs: %s", args, kwargs) posthog.capture = capture except ImportError: pass try: import os os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False" import haystack.telemetry haystack.telemetry.telemetry = None except ImportError: pass ================================================ FILE: libs/kotaemon/kotaemon/agents/__init__.py ================================================ from .base import BaseAgent from .io import AgentFinish, AgentOutput, AgentType, BaseScratchPad from .langchain_based import LangchainAgent from .react.agent import ReactAgent from .rewoo.agent import RewooAgent from .tools import ( BaseTool, ComponentTool, GoogleSearchTool, LLMTool, MCPTool, WikipediaTool, ) __all__ = [ # agent "BaseAgent", "ReactAgent", "RewooAgent", "LangchainAgent", # tool "BaseTool", "ComponentTool", "GoogleSearchTool", "WikipediaTool", "LLMTool", "MCPTool", # io "AgentType", "AgentOutput", "AgentFinish", "BaseScratchPad", ] ================================================ FILE: libs/kotaemon/kotaemon/agents/base.py ================================================ from typing import Optional, Union from 
kotaemon.base import BaseComponent, Node, Param from kotaemon.llms import BaseLLM, PromptTemplate from .io import AgentOutput, AgentType from .tools import BaseTool class BaseAgent(BaseComponent): """Define base agent interface""" name: str = Param(help="Name of the agent.") agent_type: AgentType = Param(help="Agent type, must be one of AgentType") description: str = Param( help=( "Description used to tell the model how/when/why to use the agent. You can" " provide few-shot examples as a part of the description. This will be" " input to the prompt of LLM." ) ) llm: Optional[BaseLLM] = Node( help=( "LLM to be used for the agent (optional). LLM must implement BaseLLM" " interface." ) ) prompt_template: Optional[Union[PromptTemplate, dict[str, PromptTemplate]]] = Param( help="A prompt template or a dict to supply different prompt to the agent" ) plugins: list[BaseTool] = Param( default_callback=lambda _: [], help="List of plugins / tools to be used in the agent", ) @staticmethod def safeguard_run(run_func, *args, **kwargs): def wrapper(self, *args, **kwargs): try: return run_func(self, *args, **kwargs) except Exception as e: return AgentOutput( text="", agent_type=self.agent_type, status="failed", error=str(e), ) return wrapper def add_tools(self, tools: list[BaseTool]) -> None: """Helper method to add tools and update agent state if needed""" self.plugins.extend(tools) def run(self, *args, **kwargs) -> AgentOutput | list[AgentOutput]: """Run the component.""" raise NotImplementedError() ================================================ FILE: libs/kotaemon/kotaemon/agents/io/__init__.py ================================================ from .base import AgentAction, AgentFinish, AgentOutput, AgentType, BaseScratchPad __all__ = ["AgentOutput", "AgentFinish", "BaseScratchPad", "AgentType", "AgentAction"] ================================================ FILE: libs/kotaemon/kotaemon/agents/io/base.py ================================================ import json import 
logging import os from dataclasses import dataclass from enum import Enum from typing import Any, Dict, Literal, NamedTuple, Optional, Union from pydantic import ConfigDict from kotaemon.base import LLMInterface def check_log(): """ Checks if logging has been enabled. :return: True if logging has been enabled, False otherwise. :rtype: bool """ return os.environ.get("LOG_PATH", None) is not None class AgentType(Enum): """ Enumerated type for agent types. """ openai = "openai" openai_multi = "openai_multi" openai_tool = "openai_tool" self_ask = "self_ask" react = "react" rewoo = "rewoo" vanilla = "vanilla" class BaseScratchPad: """ Base class for output handlers. Attributes: ----------- logger : logging.Logger The logger object to log messages. Methods: -------- stop(): Stop the output. update_status(output: str, **kwargs): Update the status of the output. thinking(name: str): Log that a process is thinking. done(_all=False): Log that the process is done. stream_print(item: str): Not implemented. json_print(item: Dict[str, Any]): Log a JSON object. panel_print(item: Any, title: str = "Output", stream: bool = False): Log a panel output. clear(): Not implemented. print(content: str, **kwargs): Log arbitrary content. format_json(json_obj: str): Format a JSON object. debug(content: str, **kwargs): Log a debug message. info(content: str, **kwargs): Log an informational message. warning(content: str, **kwargs): Log a warning message. error(content: str, **kwargs): Log an error message. critical(content: str, **kwargs): Log a critical message. """ def __init__(self): """ Initialize the BaseOutput object. """ self.logger = logging self.log = [] def stop(self): """ Stop the output. """ def update_status(self, output: str, **kwargs): """ Update the status of the output. """ if check_log(): self.logger.info(output) def thinking(self, name: str): """ Log that a process is thinking. 
""" if check_log(): self.logger.info(f"{name} is thinking...") def done(self, _all=False): """ Log that the process is done. """ if check_log(): self.logger.info("Done") def stream_print(self, item: str): """ Stream print. """ def json_print(self, item: Dict[str, Any]): """ Log a JSON object. """ if check_log(): self.logger.info(json.dumps(item, indent=2)) def panel_print(self, item: Any, title: str = "Output", stream: bool = False): """ Log a panel output. Args: item : Any The item to log. title : str, optional The title of the panel, defaults to "Output". stream : bool, optional """ if not stream: self.log.append(item) if check_log(): self.logger.info("-" * 20) self.logger.info(item) self.logger.info("-" * 20) def clear(self): """ Not implemented. """ def print(self, content: str, **kwargs): """ Log arbitrary content. """ self.log.append(content) if check_log(): self.logger.info(content) def format_json(self, json_obj: str): """ Format a JSON object. """ formatted_json = json.dumps(json_obj, indent=2) return formatted_json def debug(self, content: str, **kwargs): """ Log a debug message. """ if check_log(): self.logger.debug(content, **kwargs) def info(self, content: str, **kwargs): """ Log an informational message. """ if check_log(): self.logger.info(content, **kwargs) def warning(self, content: str, **kwargs): """ Log a warning message. """ if check_log(): self.logger.warning(content, **kwargs) def error(self, content: str, **kwargs): """ Log an error message. """ if check_log(): self.logger.error(content, **kwargs) def critical(self, content: str, **kwargs): """ Log a critical message. """ if check_log(): self.logger.critical(content, **kwargs) @dataclass class AgentAction: """Agent's action to take. Args: tool: The tool to invoke. tool_input: The input to the tool. log: The log message. """ tool: str tool_input: Union[str, dict] log: str class AgentFinish(NamedTuple): """Agent's return value when finishing execution. 
Args: return_values: The return values of the agent. log: The log message. """ return_values: dict log: str class AgentOutput(LLMInterface): """Output from an agent. Args: text: The text output from the agent. agent_type: The type of agent. status: The status after executing the agent. error: The error message if any. """ model_config = ConfigDict(extra="allow") text: str type: str = "agent" agent_type: AgentType status: Literal["thinking", "finished", "stopped", "failed"] error: Optional[str] = None intermediate_steps: Optional[list] = None ================================================ FILE: libs/kotaemon/kotaemon/agents/langchain_based.py ================================================ from typing import List, Optional from langchain.agents import AgentType as LCAgentType from langchain.agents import initialize_agent from langchain.agents.agent import AgentExecutor as LCAgentExecutor from kotaemon.llms import LLM, ChatLLM from .base import BaseAgent from .io import AgentOutput, AgentType from .tools import BaseTool class LangchainAgent(BaseAgent): """Wrapper for Langchain Agent""" name: str = "LangchainAgent" agent_type: AgentType description: str = "LangchainAgent for answering multi-step reasoning questions" AGENT_TYPE_MAP = { AgentType.openai: LCAgentType.OPENAI_FUNCTIONS, AgentType.openai_multi: LCAgentType.OPENAI_MULTI_FUNCTIONS, AgentType.react: LCAgentType.ZERO_SHOT_REACT_DESCRIPTION, AgentType.self_ask: LCAgentType.SELF_ASK_WITH_SEARCH, } agent: Optional[LCAgentExecutor] = None def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.agent_type not in self.AGENT_TYPE_MAP: raise NotImplementedError( f"AgentType {self.agent_type } not supported by Langchain wrapper" ) self.update_agent_tools() def update_agent_tools(self): assert isinstance(self.llm, (ChatLLM, LLM)) langchain_plugins = [tool.to_langchain_format() for tool in self.plugins] # a fix for search_doc tool name: # use "Intermediate Answer" for self-ask agent 
found_search_tool = False if self.agent_type == AgentType.self_ask: for plugin in langchain_plugins: if plugin.name == "search_doc": plugin.name = "Intermediate Answer" langchain_plugins = [plugin] found_search_tool = True break if self.agent_type != AgentType.self_ask or found_search_tool: # reinit Langchain AgentExecutor self.agent = initialize_agent( langchain_plugins, self.llm.to_langchain_format(), agent=self.AGENT_TYPE_MAP[self.agent_type], handle_parsing_errors=True, verbose=True, ) def add_tools(self, tools: List[BaseTool]) -> None: super().add_tools(tools) self.update_agent_tools() return def run(self, instruction: str) -> AgentOutput: assert ( self.agent is not None ), "Lanchain AgentExecutor is not correctly initialized" # Langchain AgentExecutor call output = self.agent(instruction)["output"] return AgentOutput( text=output, agent_type=self.agent_type, status="finished", ) ================================================ FILE: libs/kotaemon/kotaemon/agents/react/__init__.py ================================================ from .agent import ReactAgent __all__ = ["ReactAgent"] ================================================ FILE: libs/kotaemon/kotaemon/agents/react/agent.py ================================================ import logging import re from functools import partial from typing import Optional import tiktoken from kotaemon.agents.base import BaseAgent, BaseLLM from kotaemon.agents.io import AgentAction, AgentFinish, AgentOutput, AgentType from kotaemon.agents.tools import BaseTool from kotaemon.base import Document, Param from kotaemon.indices.splitters import TokenSplitter from kotaemon.llms import PromptTemplate FINAL_ANSWER_ACTION = "Final Answer:" class ReactAgent(BaseAgent): """ Sequential ReactAgent class inherited from BaseAgent. 
Implementing ReAct agent paradigm https://arxiv.org/pdf/2210.03629.pdf """ name: str = "ReactAgent" agent_type: AgentType = AgentType.react description: str = "ReactAgent for answering multi-step reasoning questions" llm: BaseLLM prompt_template: Optional[PromptTemplate] = None output_lang: str = "English" plugins: list[BaseTool] = Param( default_callback=lambda _: [], help="List of tools to be used in the agent. " ) examples: dict[str, str | list[str]] = Param( default_callback=lambda _: {}, help="Examples to be used in the agent. " ) intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = Param( default_callback=lambda _: [], help="List of AgentAction and observation (tool) output", ) max_iterations: int = 5 strict_decode: bool = False max_context_length: int = Param( default=3000, help="Max context length for each tool output.", ) trim_func: TokenSplitter | None = None def _compose_plugin_description(self) -> str: """ Compose the worker prompt from the workers. Example: toolname1[input]: tool1 description toolname2[input]: tool2 description """ prompt = "" try: for plugin in self.plugins: prompt += f"{plugin.name}[input]: {plugin.description}\n" except Exception: raise ValueError("Worker must have a name and description.") return prompt def _construct_scratchpad( self, intermediate_steps: list[tuple[AgentAction | AgentFinish, str]] = [] ) -> str: """Construct the scratchpad that lets the agent continue its thought process.""" thoughts = "" for action, observation in intermediate_steps: thoughts += action.log thoughts += f"\nObservation: {observation}\nThought:" return thoughts def _parse_output(self, text: str) -> Optional[AgentAction | AgentFinish]: """ Parse text output from LLM for the next Action or Final Answer Using Regex to parse "Action:\n Action Input:\n" for the next Action Using FINAL_ANSWER_ACTION to parse Final Answer Args: text[str]: input text to parse """ includes_answer = FINAL_ANSWER_ACTION in text regex = ( 
r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)" ) action_match = re.search(regex, text, re.DOTALL) action_output: Optional[AgentAction | AgentFinish] = None if action_match: if includes_answer: raise Exception( "Parsing LLM output produced both a final answer " f"and a parse-able action: {text}" ) action = action_match.group(1).strip() action_input = action_match.group(2) tool_input = action_input.strip(" ") # ensure if its a well formed SQL query we don't remove any trailing " chars if tool_input.startswith("SELECT ") is False: tool_input = tool_input.strip('"') action_output = AgentAction(action, tool_input, text) elif includes_answer: action_output = AgentFinish( {"output": text.split(FINAL_ANSWER_ACTION)[-1].strip()}, text ) else: if self.strict_decode: raise Exception(f"Could not parse LLM output: `{text}`") else: action_output = AgentFinish({"output": text}, text) return action_output def _compose_prompt(self, instruction) -> str: """ Compose the prompt from template, worker description, examples and instruction. """ agent_scratchpad = self._construct_scratchpad(self.intermediate_steps) tool_description = self._compose_plugin_description() tool_names = ", ".join([plugin.name for plugin in self.plugins]) if self.prompt_template is None: from .prompt import zero_shot_react_prompt self.prompt_template = zero_shot_react_prompt return self.prompt_template.populate( instruction=instruction, agent_scratchpad=agent_scratchpad, tool_description=tool_description, tool_names=tool_names, lang=self.output_lang, ) def _format_function_map(self) -> dict[str, BaseTool]: """Format the function map for the open AI function API. Return: Dict[str, Callable]: The function map. """ # Map the function name to the real function object. function_map = {} for plugin in self.plugins: function_map[plugin.name] = plugin return function_map def _trim(self, text: str | Document) -> str: """ Trim the text to the maximum token length. 
""" evidence_trim_func = ( self.trim_func if self.trim_func else TokenSplitter( chunk_size=self.max_context_length, chunk_overlap=0, separator=" ", tokenizer=partial( tiktoken.encoding_for_model("gpt-3.5-turbo").encode, allowed_special=set(), disallowed_special="all", ), ) ) if isinstance(text, str): texts = evidence_trim_func([Document(text=text)]) elif isinstance(text, Document): texts = evidence_trim_func([text]) else: raise ValueError("Invalid text type to trim") trim_text = texts[0].text logging.info(f"len (trimmed): {len(trim_text)}") return trim_text def clear(self): """ Clear and reset the agent. """ self.intermediate_steps = [] def run(self, instruction, max_iterations=None) -> AgentOutput: """ Run the agent with the given instruction. Args: instruction: Instruction to run the agent with. max_iterations: Maximum number of iterations of reasoning steps, defaults to 10. Return: AgentOutput object. """ if not max_iterations: max_iterations = self.max_iterations assert max_iterations > 0 self.clear() logging.info(f"Running {self.name} with instruction: {instruction}") total_cost = 0.0 total_token = 0 status = "failed" response_text = None for step_count in range(1, max_iterations + 1): prompt = self._compose_prompt(instruction) logging.info(f"Prompt: {prompt}") response = self.llm( prompt, stop=["Observation:"] ) # could cause bugs if llm doesn't have `stop` as a parameter response_text = response.text logging.info(f"Response: {response_text}") action_step = self._parse_output(response_text) if action_step is None: raise ValueError("Invalid action") is_finished_chain = isinstance(action_step, AgentFinish) if is_finished_chain: result = "" else: assert isinstance(action_step, AgentAction) action_name = action_step.tool tool_input = action_step.tool_input logging.info(f"Action: {action_name}") logging.info(f"Tool Input: {tool_input}") function_map = self._format_function_map() if action_name not in function_map: available = ", ".join(function_map.keys()) result 
= ( f"Tool '{action_name}' not found. " f"Available tools: {available}" ) else: result = function_map[action_name](tool_input) # trim the worker output to 1000 tokens, as we are appending # all workers' logs and it can exceed the token limit if we # don't limit each. Fix this number regarding to the LLM capacity. result = self._trim(result) logging.info(f"Result: {result}") self.intermediate_steps.append((action_step, result)) if is_finished_chain: logging.info(f"Finished after {step_count} steps.") status = "finished" break else: status = "stopped" return AgentOutput( text=response_text, agent_type=self.agent_type, status=status, total_tokens=total_token, total_cost=total_cost, intermediate_steps=self.intermediate_steps, max_iterations=max_iterations, ) def stream(self, instruction, max_iterations=None): """ Stream the agent with the given instruction. Args: instruction: Instruction to run the agent with. max_iterations: Maximum number of iterations of reasoning steps, defaults to 10. Return: AgentOutput object. 
""" if not max_iterations: max_iterations = self.max_iterations assert max_iterations > 0 self.clear() logging.info(f"Running {self.name} with instruction: {instruction}") print(f"Running {self.name} with instruction: {instruction}") total_cost = 0.0 total_token = 0 status = "failed" response_text = None for step_count in range(1, max_iterations + 1): prompt = self._compose_prompt(instruction) logging.info(f"Prompt: {prompt}") print(f"Prompt: {prompt}") response = self.llm( prompt, stop=["Observation:"] ) # TODO: could cause bugs if llm doesn't have `stop` as a parameter response_text = response.text logging.info(f"Response: {response_text}") print(f"Response: {response_text}") action_step = self._parse_output(response_text) if action_step is None: raise ValueError("Invalid action") is_finished_chain = isinstance(action_step, AgentFinish) if is_finished_chain: result = response_text if "Final Answer:" in response_text: result = response_text.split("Final Answer:")[-1].strip() else: assert isinstance(action_step, AgentAction) action_name = action_step.tool tool_input = action_step.tool_input logging.info(f"Action: {action_name}") print(f"Action: {action_name}") logging.info(f"Tool Input: {tool_input}") print(f"Tool Input: {tool_input}") function_map = self._format_function_map() if action_name not in function_map: available = ", ".join(function_map.keys()) result = ( f"Tool '{action_name}' not found. " f"Available tools: {available}" ) else: result = function_map[action_name](tool_input) # trim the worker output to 1000 tokens, as we are appending # all workers' logs and it can exceed the token limit if we # don't limit each. Fix this number regarding to the LLM capacity. 
result = self._trim(result) logging.info(f"Result: {result}") print(f"Result: {result}") self.intermediate_steps.append((action_step, result)) if is_finished_chain: logging.info(f"Finished after {step_count} steps.") status = "finished" yield AgentOutput( text=result, agent_type=self.agent_type, status=status, intermediate_steps=self.intermediate_steps[-1], ) break else: yield AgentOutput( text="", agent_type=self.agent_type, status="thinking", intermediate_steps=self.intermediate_steps[-1], ) else: status = "stopped" yield AgentOutput( text="", agent_type=self.agent_type, status=status, intermediate_steps=self.intermediate_steps[-1], ) return AgentOutput( text=response_text, agent_type=self.agent_type, status=status, total_tokens=total_token, total_cost=total_cost, intermediate_steps=self.intermediate_steps, max_iterations=max_iterations, ) ================================================ FILE: libs/kotaemon/kotaemon/agents/react/prompt.py ================================================ # flake8: noqa from kotaemon.llms import PromptTemplate zero_shot_react_prompt = PromptTemplate( template="""Answer the following questions as best you can. Give answer in {lang}. You have access to the following tools: {tool_description} Use the following format: Question: the input question you must answer Thought: you should always think about what to do Action: the action to take, should be one of [{tool_names}] Action Input: the input to the action, should be different from the action input of the same action in previous steps. Observation: the result of the action ... (this Thought/Action/Action Input/Observation can repeat N times) #Thought: I now know the final answer Final Answer: the final answer to the original input question Begin! After each Action Input. 
Question: {instruction} Thought:{agent_scratchpad} """ ) ================================================ FILE: libs/kotaemon/kotaemon/agents/rewoo/__init__.py ================================================ from .agent import RewooAgent __all__ = ["RewooAgent"] ================================================ FILE: libs/kotaemon/kotaemon/agents/rewoo/agent.py ================================================ import logging import re from concurrent.futures import ThreadPoolExecutor from functools import partial from typing import Any import tiktoken from kotaemon.agents.base import BaseAgent from kotaemon.agents.io import AgentOutput, AgentType, BaseScratchPad from kotaemon.agents.tools import BaseTool from kotaemon.agents.utils import get_plugin_response_content from kotaemon.base import Document, Node, Param from kotaemon.indices.qa.citation import CitationPipeline from kotaemon.indices.splitters import TokenSplitter from kotaemon.llms import BaseLLM, PromptTemplate from .planner import Planner from .solver import Solver class RewooAgent(BaseAgent): """Distributive RewooAgent class inherited from BaseAgent. Implementing ReWOO paradigm https://arxiv.org/pdf/2305.18323.pdf""" name: str = "RewooAgent" agent_type: AgentType = AgentType.rewoo description: str = "RewooAgent for answering multi-step reasoning questions" output_lang: str = "English" planner_llm: BaseLLM solver_llm: BaseLLM prompt_template: dict[str, PromptTemplate] = Param( default_callback=lambda _: {}, help="A dict to supply different prompt to the agent.", ) plugins: list[BaseTool] = Param( default_callback=lambda _: [], help="A list of plugins to be used in the model." ) examples: dict[str, str | list[str]] = Param( default_callback=lambda _: {}, help="Examples to be used in the agent." 
) max_context_length: int = Param( default=3000, help="Max context length for each tool output.", ) trim_func: TokenSplitter | None = None @Node.auto(depends_on=["planner_llm", "plugins", "prompt_template", "examples"]) def planner(self): return Planner( model=self.planner_llm, plugins=self.plugins, prompt_template=self.prompt_template.get("Planner", None), examples=self.examples.get("Planner", None), ) @Node.auto(depends_on=["solver_llm", "prompt_template", "examples"]) def solver(self): return Solver( model=self.solver_llm, prompt_template=self.prompt_template.get("Solver", None), examples=self.examples.get("Solver", None), output_lang=self.output_lang, ) def _parse_plan_map( self, planner_response: str ) -> tuple[dict[str, list[str]], dict[str, str]]: """ Parse planner output. It should be an n-to-n mapping from Plans to #Es. This is because sometimes LLM cannot follow the strict output format. Example: #Plan1 #E1 #E2 should result in: {"#Plan1": ["#E1", "#E2"]} Or: #Plan1 #Plan2 #E1 should result in: {"#Plan1": [], "#Plan2": ["#E1"]} This function should also return a plan map. Returns: tuple[Dict[str, List[str]], Dict[str, str]]: A list of plan map """ valid_chunk = [ line for line in planner_response.splitlines() if line.startswith("#Plan") or line.startswith("#E") ] plan_to_es: dict[str, list[str]] = dict() plans: dict[str, str] = dict() prev_key = "" for line in valid_chunk: key, description = line.split(":", 1) key = key.strip() if key.startswith("#Plan"): plans[key] = description.strip() plan_to_es[key] = [] prev_key = key elif key.startswith("#E"): plan_to_es[prev_key].append(key) return plan_to_es, plans def _parse_planner_evidences( self, planner_response: str ) -> tuple[dict[str, str], list[list[str]]]: """ Parse planner output. This should return a mapping from #E to tool call. It should also identify the level of each #E in dependency map. 
Example: { "#E1": "Tool1", "#E2": "Tool2", "#E3": "Tool3", "#E4": "Tool4" }, [[#E1, #E2], [#E3, #E4]] Returns: tuple[dict[str, str], List[List[str]]]: A mapping from #E to tool call and a list of levels. """ evidences: dict[str, str] = dict() dependence: dict[str, list[str]] = dict() for line in planner_response.splitlines(): if line.startswith("#E") and line[2].isdigit(): e, tool_call = line.split(":", 1) e, tool_call = e.strip(), tool_call.strip() if len(e) == 3: dependence[e] = [] evidences[e] = tool_call for var in re.findall(r"#E\d+", tool_call): if var in evidences: dependence[e].append(var) else: evidences[e] = "No evidence found" level = [] while dependence: select = [i for i in dependence if not dependence[i]] if len(select) == 0: raise ValueError("Circular dependency detected.") level.append(select) for item in select: dependence.pop(item) for item in dependence: for i in select: if i in dependence[item]: dependence[item].remove(i) return evidences, level def _run_plugin( self, e: str, planner_evidences: dict[str, str], worker_evidences: dict[str, str], output=BaseScratchPad(), ): """ Run a plugin for a given evidence. This function should also cumulate the cost and tokens. 
""" result = dict(e=e, plugin_cost=0, plugin_token=0, evidence="") tool_call = planner_evidences[e] if "[" not in tool_call: result["evidence"] = tool_call else: tool, tool_input = tool_call.split("[", 1) tool_input = tool_input[:-1] # find variables in input and replace with previous evidences for var in re.findall(r"#E\d+", tool_input): print("Tool input: ", tool_input) print("Var: ", var) print("Worker evidences: ", worker_evidences) if var in worker_evidences: tool_input = tool_input.replace( var, worker_evidences.get(var, "") or "" ) try: selected_plugin = self._find_plugin(tool) if selected_plugin is None: raise ValueError("Invalid plugin detected") tool_response = selected_plugin(tool_input) result["evidence"] = get_plugin_response_content(tool_response) except ValueError: result["evidence"] = "No evidence found." finally: output.panel_print( result["evidence"], f"[green] Function Response of [blue]{tool}: " ) return result def _get_worker_evidence( self, planner_evidences: dict[str, str], evidences_level: list[list[str]], output=BaseScratchPad(), ) -> Any: """ Parallel execution of plugins in DAG for speedup. This is one of core benefits of ReWOO agents. Args: planner_evidences: A mapping from #E to tool call. evidences_level: A list of levels of evidences. Calculated from DAG of plugin calls. output: Output object, defaults to BaseOutput(). Returns: A mapping from #E to tool call. 
""" worker_evidences: dict[str, str] = dict() plugin_cost, plugin_token = 0.0, 0.0 with ThreadPoolExecutor() as pool: for level in evidences_level: results = [] for e in level: results.append( pool.submit( self._run_plugin, e, planner_evidences, worker_evidences, output, ) ) if len(results) > 1: output.update_status(f"Running tasks {level} in parallel.") else: output.update_status(f"Running task {level[0]}.") for r in results: resp = r.result() plugin_cost += resp["plugin_cost"] plugin_token += resp["plugin_token"] worker_evidences[resp["e"]] = self._trim_evidence(resp["evidence"]) output.done() return worker_evidences, plugin_cost, plugin_token def _find_plugin(self, name: str): for p in self.plugins: if p.name == name: return p def _trim_evidence(self, evidence: str): evidence_trim_func = ( self.trim_func if self.trim_func else TokenSplitter( chunk_size=self.max_context_length, chunk_overlap=0, separator=" ", tokenizer=partial( tiktoken.encoding_for_model("gpt-3.5-turbo").encode, allowed_special=set(), disallowed_special="all", ), ) ) if evidence: texts = evidence_trim_func([Document(text=evidence)]) evidence = texts[0].text logging.info(f"len (trimmed): {len(evidence)}") return evidence @BaseAgent.safeguard_run def run(self, instruction: str, use_citation: bool = False) -> AgentOutput: """ Run the agent with a given instruction. 
def stream(self, instruction: str, use_citation: bool = False):
    """
    Stream the agent with a given instruction.

    This is a generator. It yields ``AgentOutput`` objects with
    ``status="thinking"`` in three phases: one for the planner text, one per
    plan with that plan's worker log, and one per chunk produced by
    ``self.solver.stream``. The final ``AgentOutput`` (``status="finished"``)
    is *returned*, not yielded, so it is only reachable through
    ``StopIteration.value`` (or ``yield from``); a plain ``for`` loop over
    this generator never sees it.

    Args:
        instruction: the task to plan, execute and solve.
        use_citation: when true, run a ``CitationPipeline`` over the worker
            log after the solver has finished.
    """
    logging.info(f"Streaming {self.name} with instruction: {instruction}")
    # NOTE(review): these totals are never updated below, so the final
    # AgentOutput always reports 0 cost / 0 tokens — confirm whether
    # plugin_cost / plugin_token were meant to be added.
    total_cost = 0.0
    total_token = 0

    # Plan: run the planner once (non-streaming) and parse its text output
    planner_output = self.planner(instruction)
    planner_text_output = planner_output.text
    plan_to_es, plans = self._parse_plan_map(planner_text_output)
    planner_evidences, evidence_level = self._parse_planner_evidences(
        planner_text_output
    )

    print("Planner output:", planner_text_output)
    # output planner to info panel
    yield AgentOutput(
        text="",
        agent_type=self.agent_type,
        status="thinking",
        intermediate_steps=[{"planner_log": planner_text_output}],
    )

    # Work: all evidences are collected before anything is reported
    worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(
        planner_evidences, evidence_level
    )
    # worker_log accumulates every plan (it is the solver's input);
    # current_progress is rebuilt for each plan and reported on its own.
    worker_log = ""
    for plan in plan_to_es:
        worker_log += f"{plan}: {plans[plan]}\n"
        current_progress = f"{plan}: {plans[plan]}\n"
        for e in plan_to_es[plan]:
            worker_log += f"#Action: {planner_evidences.get(e, None)}\n"
            worker_log += f"{e}: {worker_evidences[e]}\n"
            current_progress += f"#Action: {planner_evidences.get(e, None)}\n"
            current_progress += f"{e}: {worker_evidences[e]}\n"

        yield AgentOutput(
            text="",
            agent_type=self.agent_type,
            status="thinking",
            intermediate_steps=[{"worker_log": current_progress}],
        )

    # Solve: forward each solver chunk as it arrives
    # (solver_response is accumulated but not read afterwards)
    solver_response = ""
    for solver_output in self.solver.stream(instruction, worker_log):
        solver_output_text = solver_output.text
        solver_response += solver_output_text
        yield AgentOutput(
            text=solver_output_text,
            agent_type=self.agent_type,
            status="thinking",
        )
    if use_citation:
        citation_pipeline = CitationPipeline(llm=self.solver_llm)
        citation = citation_pipeline.invoke(
            context=worker_log, question=instruction
        )
    else:
        citation = None

    # Generator return value: carried by StopIteration, text is left empty
    # because the answer was already streamed chunk by chunk above.
    return AgentOutput(
        text="",
        agent_type=self.agent_type,
        status="finished",
        total_tokens=total_token,
        total_cost=total_cost,
        citation=citation,
        metadata={"citation": citation, "worker_log": worker_log},
    )
class Planner(BaseComponent):
    """Build the ReWOO planning prompt and query the planner LLM with it.

    Attributes (declared below):
        model: LLM used to produce the plan.
        prompt_template: optional custom template; when ``None`` the
            zero-shot or few-shot planner prompt of this package is used.
        examples: optional few-shot example(s), a string or list of strings.
        plugins: tools whose ``name``/``description`` are advertised to the LLM.
    """

    model: BaseLLM
    prompt_template: Optional[PromptTemplate] = None
    examples: Optional[Union[str, List[str]]] = None
    plugins: List[BaseTool]

    def _compose_worker_description(self) -> str:
        """
        Compose the worker prompt from the workers.

        Example:
        toolname1[input]: tool1 description
        toolname2[input]: tool2 description

        Raises:
            ValueError: if reading a plugin's name/description fails; the
                original exception is kept as ``__cause__``.
        """
        try:
            return "".join(
                f"{worker.name}[input]: {worker.description}\n"
                for worker in self.plugins
            )
        except Exception as e:
            # Chain the cause so the real failure is not lost in the traceback
            raise ValueError("Worker must have a name and description.") from e

    def _compose_fewshot_prompt(self) -> str:
        """Return the few-shot examples as one string ("" when there are none)."""
        if self.examples is None:
            return ""
        if isinstance(self.examples, str):
            return self.examples
        return "\n\n".join([e.strip("\n") for e in self.examples])

    def _compose_prompt(self, instruction) -> str:
        """
        Compose the prompt from template, worker description, examples and
        instruction.

        A custom ``prompt_template`` wins; the ``fewshot`` value is only passed
        when the chosen template has (or is known to have) that placeholder.
        """
        worker_description = self._compose_worker_description()
        fewshot = self._compose_fewshot_prompt()

        if self.prompt_template is not None:
            template = self.prompt_template
            with_fewshot = "fewshot" in template.placeholders
        elif self.examples is not None:
            template, with_fewshot = few_shot_planner_prompt, True
        else:
            template, with_fewshot = zero_shot_planner_prompt, False

        if with_fewshot:
            return template.populate(
                tool_description=worker_description,
                fewshot=fewshot,
                task=instruction,
            )
        return template.populate(
            tool_description=worker_description, task=instruction
        )

    def run(self, instruction: str, output: BaseScratchPad = BaseScratchPad()) -> Any:
        """Ask the LLM for a plan and return its response.

        Raises:
            ValueError: when the LLM call raises ``ValueError``.
        """
        response = None
        output.info("Running Planner")
        prompt = self._compose_prompt(instruction)
        output.debug(f"Prompt: {prompt}")
        try:
            response = self.model(prompt)
            self.log_progress(".planner", response=response)
            output.info("Planner run successful.")
        except ValueError as e:
            output.error("Planner failed to retrieve response from LLM")
            raise ValueError("Planner failed to retrieve response from LLM") from e

        return response

    def stream(self, instruction: str, output: BaseScratchPad = BaseScratchPad()):
        """Stream the plan chunk by chunk.

        Yields each chunk produced by ``self.model.stream`` unchanged. If the
        model raises ``NotImplementedError`` the method falls back to a single
        blocking call and yields that one response. The accumulated response
        is the generator's return value.

        Raises:
            ValueError: when the LLM call raises ``ValueError``.
        """
        output.info("Running Planner")
        prompt = self._compose_prompt(instruction)
        output.debug(f"Prompt: {prompt}")

        response = ""
        try:
            for text in self.model.stream(prompt):
                # Chunks may be plain strings or message objects exposing
                # ``.text`` (Solver.stream accumulates ``text.text``); take the
                # text when present so ``str +=`` does not fail on objects.
                response += getattr(text, "text", text)
                yield text
            self.log_progress(".planner", response=response)
            output.info("Planner run successful.")
        except NotImplementedError:
            print("Streaming is not supported, falling back to normal run")
            response = self.model(prompt)
            yield response
        except ValueError as e:
            output.error("Planner failed to retrieve response from LLM")
            raise ValueError("Planner failed to retrieve response from LLM") from e

        return response
================================================ # flake8: noqa from kotaemon.llms import PromptTemplate zero_shot_planner_prompt = PromptTemplate( template="""You are an AI agent who makes step-by-step plans to solve a problem under the help of external tools. For each step, make one plan followed by one tool-call, which will be executed later to retrieve evidence for that step. You should store each evidence into a distinct variable #E1, #E2, #E3 ... that can be referred to in later tool-call inputs. ##Available Tools## {tool_description} ##Output Format (Replace '<...>')## #Plan1: #E1: [] (eg. Search[What is Python]) #Plan2: #E2: [] And so on... ##Your Task## {task} ##Now Begin## """ ) one_shot_planner_prompt = PromptTemplate( template="""You are an AI agent who makes step-by-step plans to solve a problem under the help of external tools. For each step, make one plan followed by one tool-call, which will be executed later to retrieve evidence for that step. You should store each evidence into a distinct variable #E1, #E2, #E3 ... that can be referred to in later tool-call inputs. ##Available Tools## {tool_description} ##Output Format## #Plan1: #E1: [] #Plan2: #E2: [] And so on... ##Example## Task: What is the 4th root of 64 to the power of 3? #Plan1: Find the 4th root of 64 #E1: Calculator[64^(1/4)] #Plan2: Raise the result from #Plan1 to the power of 3 #E2: Calculator[#E1^3] ##Your Task## {task} ##Now Begin## """ ) few_shot_planner_prompt = PromptTemplate( template="""You are an AI agent who makes step-by-step plans to solve a problem under the help of external tools. For each step, make one plan followed by one tool-call, which will be executed later to retrieve evidence for that step. You should store each evidence into a distinct variable #E1, #E2, #E3 ... that can be referred to in later tool-call inputs. ##Available Tools## {tool_description} ##Output Format (Replace '<...>')## #Plan1: #E1: [] #Plan2: #E2: [] And so on... 
class Solver(BaseComponent):
    """Build the ReWOO solver prompt from plans/evidences and query the LLM.

    Attributes (declared below):
        model: LLM used to produce the final answer.
        prompt_template: optional custom template; when ``None`` the
            zero-shot or few-shot solver prompt of this package is used.
        examples: optional few-shot example(s), a string or list of strings.
        output_lang: language name substituted into the prompt's ``lang`` slot.
    """

    model: BaseLLM
    prompt_template: Optional[PromptTemplate] = None
    examples: Optional[Union[str, List[str]]] = None
    output_lang: str = "English"

    def _compose_fewshot_prompt(self) -> str:
        """Return the few-shot examples as one string ("" when there are none)."""
        if self.examples is None:
            return ""
        if isinstance(self.examples, str):
            return self.examples
        return "\n\n".join([e.strip("\n") for e in self.examples])

    def _compose_prompt(self, instruction, plan_evidence, output_lang) -> str:
        """
        Compose the prompt from template, plan&evidence, examples and
        instruction.

        A custom ``prompt_template`` wins; the ``fewshot`` value is only passed
        when the chosen template has (or is known to have) that placeholder.
        """
        fewshot = self._compose_fewshot_prompt()

        if self.prompt_template is not None:
            template = self.prompt_template
            with_fewshot = "fewshot" in template.placeholders
        elif self.examples is not None:
            template, with_fewshot = few_shot_solver_prompt, True
        else:
            template, with_fewshot = zero_shot_solver_prompt, False

        if with_fewshot:
            return template.populate(
                plan_evidence=plan_evidence,
                fewshot=fewshot,
                task=instruction,
                lang=output_lang,
            )
        return template.populate(
            plan_evidence=plan_evidence,
            task=instruction,
            lang=output_lang,
        )

    def run(
        self,
        instruction: str,
        plan_evidence: str,
        output: BaseScratchPad = BaseScratchPad(),
    ) -> Any:
        """Ask the LLM for the final answer.

        Returns:
            The LLM response, or ``None`` when the call raised ``ValueError``
            (the failure is reported through ``output.error`` only).
        """
        response = None
        output.info("Running Solver")
        output.debug(f"Instruction: {instruction}")
        output.debug(f"Plan Evidence: {plan_evidence}")
        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)
        output.debug(f"Prompt: {prompt}")
        try:
            response = self.model(prompt)
            output.info("Solver run successful.")
        except ValueError:
            output.error("Solver failed to retrieve response from LLM")

        return response

    def stream(
        self,
        instruction: str,
        plan_evidence: str,
        output: BaseScratchPad = BaseScratchPad(),
    ) -> Any:
        """Stream the final answer chunk by chunk.

        Yields each chunk produced by ``self.model.stream`` unchanged. If the
        model raises ``NotImplementedError`` the method falls back to one
        blocking call and yields that single response, so consumers always
        receive the answer. The accumulated text is the generator's return
        value. A ``ValueError`` from the LLM is reported through
        ``output.error`` and ends the stream.
        """
        response = ""
        output.info("Running Solver")
        output.debug(f"Instruction: {instruction}")
        output.debug(f"Plan Evidence: {plan_evidence}")
        prompt = self._compose_prompt(instruction, plan_evidence, self.output_lang)
        output.debug(f"Prompt: {prompt}")
        try:
            for text in self.model.stream(prompt):
                response += text.text
                yield text
            output.info("Solver run successful.")
        except NotImplementedError:
            # Streaming unsupported: do a blocking call and still emit the
            # result, otherwise the caller would iterate over nothing.
            fallback = self.model(prompt)
            response = fallback.text
            output.info("Solver run successful.")
            yield fallback
        except ValueError:
            output.error("Solver failed to retrieve response from LLM")

        return response
class BaseTool(BaseComponent):
    """Base class for tools callable by agents.

    Subclasses implement ``_run_tool``; callers use ``run``, which validates
    the input against ``args_schema`` and converts ``ToolException`` into an
    observation according to ``handle_tool_error``.
    """

    name: str
    """The unique name of the tool that clearly communicates its purpose."""
    description: str
    """Description used to tell the model how/when/why to use the tool.
    You can provide few-shot examples as a part of the description. This will be
    input to the prompt of LLM.
    """
    args_schema: Optional[Type[BaseModel]] = None
    """Pydantic model class to validate and parse the tool's input arguments."""
    verbose: bool = False
    """Whether to log the tool's progress."""
    handle_tool_error: Optional[
        Union[bool, str, Callable[[ToolException], str]]
    ] = False
    """Handle the content of the ToolException thrown."""

    def _parse_input(
        self,
        tool_input: Union[str, Dict],
    ) -> Union[str, Dict[str, Any]]:
        """Validate *tool_input* against ``args_schema`` (when one is set).

        A string is validated as the value of the schema's first field and
        returned unchanged. A dict is validated as a whole and returned
        restricted to the keys the caller actually supplied. Without a schema
        the input is returned as is.
        """
        args_schema = self.args_schema
        if args_schema is None:
            return tool_input

        if isinstance(tool_input, str):
            key_ = next(iter(args_schema.model_fields.keys()))
            # pydantic v2 API, matching the ``model_fields`` access above
            args_schema.model_validate({key_: tool_input})
            return tool_input

        result = args_schema.model_validate(tool_input)
        return {k: v for k, v in result.model_dump().items() if k in tool_input}

    def _run_tool(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        """Call tool. Must be overridden by subclasses."""
        raise NotImplementedError(f"_run_tool is not implemented for {self.name}")

    def _to_args_and_kwargs(self, tool_input: Union[str, Dict]) -> Tuple[Tuple, Dict]:
        """Split the parsed input into positional and keyword arguments."""
        # For backwards compatibility, if run_input is a string,
        # pass as a positional argument.
        if isinstance(tool_input, str):
            return (tool_input,), {}
        return (), tool_input

    def _handle_tool_error(self, e: ToolException) -> Any:
        """Turn a ``ToolException`` into an observation.

        Behaviour depends on ``handle_tool_error``: falsy re-raises *e*;
        ``True`` uses the exception's first argument (or a generic message);
        a string is returned verbatim; a callable is invoked with *e*.

        Raises:
            ToolException: when error handling is disabled.
            ValueError: when ``handle_tool_error`` has an unsupported type.
        """
        observation = None
        if not self.handle_tool_error:
            raise e
        elif isinstance(self.handle_tool_error, bool):
            if e.args:
                observation = e.args[0]
            else:
                observation = "Tool execution error"
        elif isinstance(self.handle_tool_error, str):
            observation = self.handle_tool_error
        elif callable(self.handle_tool_error):
            observation = self.handle_tool_error(e)
        else:
            raise ValueError(
                f"Got unexpected type of `handle_tool_error`. Expected bool, str "
                f"or callable. Received: {self.handle_tool_error}"
            )
        return observation

    def to_langchain_format(self) -> LCTool:
        """Convert this tool to Langchain format to use with its agent"""
        return LCTool(name=self.name, description=self.description, func=self.run)

    def run(
        self,
        tool_input: Union[str, Dict],
        verbose: Optional[bool] = None,
        **kwargs: Any,
    ) -> Any:
        """Run the tool.

        Args:
            tool_input: a single string (passed positionally to ``_run_tool``)
                or a dict of keyword arguments.
            verbose: currently unused.
            **kwargs: extra keyword arguments for ``_run_tool``; keys from
                *tool_input* take precedence on conflict.

        Returns:
            The tool's observation, or the handled error observation when a
            ``ToolException`` is raised and error handling is enabled.
        """
        parsed_input = self._parse_input(tool_input)
        # TODO (verbose_): Add logging
        try:
            tool_args, tool_kwargs = self._to_args_and_kwargs(parsed_input)
            call_kwargs = {**kwargs, **tool_kwargs}
            return self._run_tool(*tool_args, **call_kwargs)
        except ToolException as e:
            return self._handle_tool_error(e)

    @classmethod
    def from_langchain_format(cls, langchain_tool: LCTool) -> "BaseTool":
        """Wrapper for Langchain Tool"""
        # Deliberately builds a plain BaseTool (not ``cls``) and delegates
        # execution to the Langchain tool's ``_run``.
        new_tool = BaseTool(
            name=langchain_tool.name, description=langchain_tool.description
        )
        new_tool._run_tool = langchain_tool._run  # type: ignore
        return new_tool
class SerpTool(BaseTool):
    # Google search tool that delegates to langchain's SerpAPIWrapper.
    name = "google_search"
    description = (
        "Worker that searches results from Google. Useful when you need to find short "
        "and succinct answers about a specific topic. Input should be a search query."
    )
    args_schema: Optional[Type[BaseModel]] = GoogleSearchArgs

    def _run_tool(self, query: AnyStr) -> str:
        """Run *query* through a freshly created ``SerpAPIWrapper``."""
        return SerpAPIWrapper().run(query)
) llm: BaseLLM args_schema: Optional[Type[BaseModel]] = LLMArgs dummy_mode: bool = True def _run_tool(self, query: AnyStr) -> str: output = None try: if not self.dummy_mode: response = self.llm(query) else: response = None except ValueError: raise ToolException("LLM Tool call failed") output = response.text if response else "<->" return output ================================================ FILE: libs/kotaemon/kotaemon/agents/tools/mcp.py ================================================ """MCP Tool for kotaemon agents. Bridges the MCP SDK's tool schema with kotaemon's BaseTool abstraction so MCP tools can be seamlessly used by ReAct/ReWOO agents. This module contains: - MCPTool: BaseTool wrapper for individual MCP server tools - Tool discovery/creation functions for building MCPTool instances from config - Config parsing utilities """ import asyncio import json import logging import shlex from typing import Any, Optional, Type from pydantic import BaseModel, Field, create_model from .base import BaseTool logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # JSON Schema → Pydantic helpers # --------------------------------------------------------------------------- def _json_schema_type_to_python(json_type: str) -> type: """Map JSON Schema types to Python types.""" mapping: dict[str, type] = { "string": str, "integer": int, "number": float, "boolean": bool, "object": dict, "array": list, } return mapping.get(json_type, str) def build_args_model(tool_name: str, input_schema: dict) -> Type[BaseModel]: """Build a Pydantic model from MCP tool's JSON Schema input_schema.""" properties = input_schema.get("properties", {}) required = set(input_schema.get("required", [])) fields: dict[str, Any] = {} for prop_name, prop_info in properties.items(): python_type = _json_schema_type_to_python(prop_info.get("type", "string")) description = prop_info.get("description", "") if prop_name in required: fields[prop_name] = 
def parse_mcp_config(config: dict) -> dict:
    """Parse a JSON config into normalised transport/command/args/env.

    Handles the case where the user puts the full command string
    (e.g. ``"npx -y mcp-remote https://..."``) into the command field.
    Keys that are missing or explicitly ``null`` fall back to their defaults.

    Args:
        config: MCP server config; recognised keys are ``transport``,
            ``command``, ``args``, ``env`` and ``url``.

    Returns:
        A dict with keys: transport, command, args, env. For the ``stdio``
        transport ``command`` is the executable; for any other transport it
        holds the configured ``url``.

    Raises:
        ValueError: propagated from ``shlex.split`` when the command string
            has unbalanced quotes.
    """

    def _value(key: str, default: Any) -> Any:
        # JSON ``null`` arrives as None; treat it like a missing key
        found = config.get(key)
        return default if found is None else found

    transport = _value("transport", "stdio")
    command = _value("command", "")
    args = _value("args", [])
    env = _value("env", {})
    url = _value("url", "")

    # If stdio and args is empty but command has spaces, split it
    if transport == "stdio" and not args and " " in command:
        parts = shlex.split(command)
        if parts:
            command, args = parts[0], parts[1:]
        else:
            # Whitespace-only command: nothing to execute
            command = ""

    return {
        "transport": transport,
        "command": command if transport == "stdio" else url,
        "args": args,
        "env": env,
    }
_async_discover_tools(parsed: dict) -> list["MCPTool"]: """Async: connect to an MCP server and return MCPTool wrappers.""" from mcp import ClientSession from mcp.client.sse import sse_client from mcp.client.stdio import StdioServerParameters, stdio_client tools: list[MCPTool] = [] transport = parsed["transport"] if transport == "stdio": server_params = StdioServerParameters( command=parsed["command"], args=parsed.get("args", []), env=parsed.get("env") or None, ) async with stdio_client(server_params) as (read, write): async with ClientSession(read, write) as session: await session.initialize() result = await session.list_tools() for tool_info in result.tools: tools.append(_make_tool(parsed, tool_info)) elif transport == "sse": async with sse_client(url=parsed["command"]) as (read, write): async with ClientSession(read, write) as session: await session.initialize() result = await session.list_tools() for tool_info in result.tools: tools.append(_make_tool(parsed, tool_info)) return tools def _run_async(coro: Any) -> Any: """Run an async coroutine from a sync context, handling event loops.""" try: loop = asyncio.get_event_loop() if loop.is_running(): import concurrent.futures with concurrent.futures.ThreadPoolExecutor() as pool: return pool.submit(asyncio.run, coro).result() else: return loop.run_until_complete(coro) except RuntimeError: return asyncio.run(coro) def create_tools_from_config( config: dict, enabled_tools: Optional[list[str]] = None, ) -> list["MCPTool"]: """Create MCPTool instances from an MCP server config dict. Args: config: MCP server JSON config with keys like transport, command, etc. enabled_tools: If provided, only return tools whose names are in this list. If ``None`` or empty, return all discovered tools. Returns: List of MCPTool instances ready for use by agents. 
async def async_discover_tools_info(config: dict) -> list[dict]:
    """Connect to an MCP server and list the tools it offers.

    The config is normalised with ``parse_mcp_config``; ``stdio`` and ``sse``
    transports are supported, any other transport yields an empty list.

    Returns:
        One dict per tool with keys ``name`` and ``description`` (an empty
        string when the server gives no description). Useful for UI display
        without instantiating full MCPTool objects.
    """
    from mcp import ClientSession
    from mcp.client.sse import sse_client
    from mcp.client.stdio import StdioServerParameters, stdio_client

    parsed = parse_mcp_config(config)
    transport = parsed["transport"]

    async def _list_tools(read: Any, write: Any) -> list[dict]:
        # Open a session on the given streams and summarise its tools
        async with ClientSession(read, write) as session:
            await session.initialize()
            listing = await session.list_tools()
            return [
                {"name": tool.name, "description": tool.description or ""}
                for tool in listing.tools
            ]

    if transport == "stdio":
        server_params = StdioServerParameters(
            command=parsed["command"],
            args=parsed.get("args", []),
            env=parsed.get("env") or None,
        )
        async with stdio_client(server_params) as (read, write):
            return await _list_tools(read, write)

    if transport == "sse":
        async with sse_client(url=parsed["command"]) as (read, write):
            return await _list_tools(read, write)

    return []
"] for t in tool_infos: desc = (t.get("description") or "No description")[:120] if enabled_tools is not None: check = "✅" if t["name"] in enabled_tools else "⬜" lines.append(f"  {check} {t['name']} — {desc}
") else: lines.append(f"  • {t['name']} — {desc}
") if enabled_tools is not None: enabled_count = sum(1 for t in tool_infos if t["name"] in enabled_tools) lines.append( f"
{enabled_count}/{len(tool_infos)} tool(s) enabled. " 'Add "enabled_tools": ["tool_name", ...] ' "to your config JSON to limit tools." ) else: lines.append( "
class MCPTool(BaseTool):
    """A kotaemon BaseTool wrapper around a single MCP server tool.

    This tool holds the MCP server configuration and establishes a
    connection to invoke the tool on demand.

    Example usage::

        tool = MCPTool(
            name="search",
            description="Search the web",
            server_transport="stdio",
            server_command="uvx",
            server_args=["mcp-server-fetch"],
            mcp_tool_name="fetch",
        )
        result = tool.run("https://example.com")
    """

    name: str = ""
    description: str = ""
    args_schema: Optional[Type[BaseModel]] = None

    # MCP server connection details
    server_transport: str = "stdio"
    server_command: str = ""
    server_args: list[str] = []
    server_env: dict[str, str] = {}

    # The original MCP tool name (on the server)
    mcp_tool_name: str = ""

    def _run_tool(self, *args: Any, **kwargs: Any) -> str:
        """Invoke the MCP tool by establishing a session."""
        return _run_async(self._arun_tool(*args, **kwargs))

    def _build_tool_args(self, args: tuple, kwargs: dict) -> dict:
        """Turn the positional/keyword call arguments into MCP tool arguments.

        A leading string argument is used as-is when it is a JSON *object*.
        Any other string (plain text, or JSON that decodes to a number, bool,
        list, ...) is wrapped under the first field of ``args_schema``, or
        under ``"input"`` when no schema/field is available. Without a leading
        string the keyword arguments are used.
        """
        if not (args and isinstance(args[0], str)):
            return kwargs

        raw = args[0]
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            return decoded

        # Not a JSON object: treat the raw text as the single string argument.
        # (A query such as "2024" or "true" is valid JSON but not an argument
        # mapping, so it must not be forwarded in decoded form.)
        first_field = "input"
        if self.args_schema:
            first_field = next(iter(self.args_schema.model_fields.keys()), "input")
        return {first_field: raw}

    async def _arun_tool(self, *args: Any, **kwargs: Any) -> str:
        """Async implementation that connects to the MCP server and calls the tool.

        Returns:
            The formatted tool result, or an ``Unsupported transport`` message
            when ``server_transport`` is neither ``stdio`` nor ``sse``.
        """
        from mcp import ClientSession
        from mcp.client.sse import sse_client
        from mcp.client.stdio import StdioServerParameters, stdio_client

        tool_args = self._build_tool_args(args, kwargs)

        if self.server_transport == "stdio":
            cmd = self.server_command
            cmd_args = self.server_args
            # Auto-split if full command string with no separate args
            if not cmd_args and " " in cmd:
                parts = shlex.split(cmd)
                cmd = parts[0]
                cmd_args = parts[1:]

            server_params = StdioServerParameters(
                command=cmd,
                args=cmd_args,
                env=self.server_env if self.server_env else None,
            )
            async with stdio_client(server_params) as (read, write):
                async with ClientSession(read, write) as session:
                    await session.initialize()
                    result = await session.call_tool(self.mcp_tool_name, tool_args)
                    return self._format_result(result)
        elif self.server_transport == "sse":
            # For SSE the server URL is stored in ``server_command``
            async with sse_client(url=self.server_command) as (read, write):
                async with ClientSession(read, write) as session:
                    await session.initialize()
                    result = await session.call_tool(self.mcp_tool_name, tool_args)
                    return self._format_result(result)
        else:
            return f"Unsupported transport: {self.server_transport}"

    def _format_result(self, result: Any) -> str:
        """Format MCP CallToolResult into a string.

        Error results are rendered as ``MCP Tool Error: ...``. Otherwise each
        content item contributes its ``text``, a placeholder naming its
        ``mimeType`` when it carries ``data``, or its ``str()`` form; items
        are joined with newlines.
        """
        if result.isError:
            return f"MCP Tool Error: {result.content}"

        parts = []
        for content in result.content:
            if hasattr(content, "text"):
                parts.append(content.text)
            elif hasattr(content, "data"):
                parts.append(f"[Binary data: {content.mimeType}]")
            else:
                parts.append(str(content))
        return "\n".join(parts)
def get_plugin_response_content(output) -> str:
    """
    Return the textual content of a plugin result.

    A ``Document`` contributes its ``text`` attribute; any other value is
    converted with ``str()``.
    """
    return output.text if isinstance(output, Document) else str(output)
class BaseComponent(Function):
    """A component is a class that can be used to compose a pipeline.

    !!! tip "Benefits of component"
        - Auto caching, logging
        - Allow deployment

    !!! tip "For each component, the spirit is"
        - Tolerate multiple input types, e.g. str, Document, List[str],
            List[Document]
        - Enforce single output type. Hence, the output type of a component
            should be as generic as possible.
    """

    # Upstream component whose ``flow()`` result is fed into this one
    inflow = None

    def flow(self):
        """Run the upstream chain and call this component on its result.

        Raises:
            ValueError: if ``inflow`` is unset or is not a ``BaseComponent``.
        """
        if self.inflow is None:
            raise ValueError("No inflow provided.")

        if not isinstance(self.inflow, BaseComponent):
            raise ValueError(
                f"inflow must be a BaseComponent, found {type(self.inflow)}"
            )

        return self.__call__(self.inflow.flow())

    def set_output_queue(self, queue):
        """Store *queue* on this component and on every child component.

        Children are the attributes named in ``self._ff_nodes`` (not defined
        in this class — presumably supplied by the ``Function`` base; verify)
        that are themselves ``BaseComponent`` instances; the call recurses
        into them.
        """
        self._queue = queue
        for name in self._ff_nodes:
            node = getattr(self, name)
            if isinstance(node, BaseComponent):
                node.set_output_queue(queue)

    def report_output(self, output: Optional[Document]):
        """Push *output* onto the output queue without blocking.

        Does nothing when the queue is ``None``.
        NOTE(review): ``_queue`` is only assigned in ``set_output_queue``
        within this class — confirm it is initialised elsewhere before this
        can be called.
        """
        if self._queue is not None:
            self._queue.put_nowait(output)

    # The four methods below are placeholders: their bodies are ``...`` and
    # therefore return ``None`` unless a subclass overrides them.
    def invoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    async def ainvoke(self, *args, **kwargs) -> Document | list[Document] | None:
        ...

    def stream(self, *args, **kwargs) -> Iterator[Document] | None:
        ...

    def astream(self, *args, **kwargs) -> AsyncGenerator[Document, None] | None:
        ...

    @abstractmethod
    def run(
        self, *args, **kwargs
    ) -> Document | list[Document] | Iterator[Document] | None | Any:
        """Run the component."""
        ...
class DocumentWithEmbedding(Document):
    """Document variant whose constructor requires an embedding.

    Use it to declare component inputs/outputs that must carry an embedding.
    """

    def __init__(self, embedding: list[float], *args, **kwargs):
        # `embedding` is a named parameter, so it can never also appear in
        # `kwargs`; forwarding it as a keyword is equivalent to inserting it
        # into the dict before the call.
        super().__init__(*args, **kwargs, embedding=embedding)
""" matches: list[str] ================================================ FILE: libs/kotaemon/kotaemon/chatbot/__init__.py ================================================ from .base import BaseChatBot, ChatConversation from .simple_respondent import SimpleRespondentChatbot __all__ = ["BaseChatBot", "SimpleRespondentChatbot", "ChatConversation"] ================================================ FILE: libs/kotaemon/kotaemon/chatbot/base.py ================================================ from abc import abstractmethod from typing import List, Optional from theflow import SessionFunction from kotaemon.base import BaseComponent, LLMInterface from kotaemon.base.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage class BaseChatBot(BaseComponent): @abstractmethod def run(self, messages: List[BaseMessage]) -> LLMInterface: ... def session_chat_storage(obj): """Store using the bot location rather than the session location""" return obj._store_result class ChatConversation(SessionFunction): """Base implementation of a chat bot component A chatbot component should: - handle internal state, including history messages - return output for a given input """ class Config: store_result = session_chat_storage system_message: str = "" bot: BaseChatBot def __init__(self, *args, **kwargs): self._history: List[BaseMessage] = [] self._store_result = ( f"{self.__module__}.{self.__class__.__name__},uninitiated_bot" ) super().__init__(*args, **kwargs) def run(self, message: HumanMessage) -> Optional[BaseMessage]: """Chat, given a message, return a response Args: message: The message to respond to Returns: The response to the message. If None, no response is sent. 
""" user_message = ( HumanMessage(content=message) if isinstance(message, str) else message ) self.history.append(user_message) output = self.bot(self.history).text output_message = None if output is not None: output_message = AIMessage(content=output) self.history.append(output_message) return output_message def start_session(self): self._store_result = self.bot.config.store_result super().start_session() if not self.history and self.system_message: system_message = SystemMessage(content=self.system_message) self.history.append(system_message) def end_session(self): super().end_session() self._history = [] def check_end( self, history: Optional[List[BaseMessage]] = None, user_message: Optional[HumanMessage] = None, bot_message: Optional[AIMessage] = None, ) -> bool: """Check if a conversation should end""" if user_message is not None and user_message.content == "": return True return False def terminal_session(self): """Create a terminal session""" self.start_session() print(">> Start chat:") while True: human = HumanMessage(content=input("Human: ")) if self.check_end(history=self.history, user_message=human): break output = self(human) if output is None: print("AI: ") else: print("AI:", output.content) if self.check_end(history=self.history, bot_message=output): break self.end_session() @property def history(self): return self._history @history.setter def history(self, value): self._history = value self._variablex() ================================================ FILE: libs/kotaemon/kotaemon/chatbot/simple_respondent.py ================================================ from ..llms import ChatLLM from .base import BaseChatBot class SimpleRespondentChatbot(BaseChatBot): """Simple text respondent chatbot that essentially wraps around a chat LLM""" llm: ChatLLM def _get_message(self) -> str: return self.llm(self.history).text ================================================ FILE: libs/kotaemon/kotaemon/cli.py ================================================ import os 
def check_config_format(config):
    """Check that `config` is a path and, if it exists, that it parses as YAML.

    The type check runs first: previously `os.path.exists` was evaluated
    before it, so a non-path argument never reached the intended
    ``ValueError`` (``None`` raised ``TypeError`` and an integer was probed
    as a file descriptor). ``os.PathLike`` values are accepted in addition
    to ``str``.

    A path that does not exist passes without any further check. An existing
    file is loaded with ``yaml.safe_load`` only to surface syntax errors; the
    parsed content is discarded.

    Args:
        config: path of the YAML config file (``str`` or ``os.PathLike``).

    Raises:
        ValueError: if `config` is not a ``str`` or ``os.PathLike``.
        yaml.YAMLError: propagated unchanged when the file is not valid YAML.
    """
    if not isinstance(config, (str, os.PathLike)):
        raise ValueError("config must be yaml format.")

    if os.path.exists(config):
        with open(config) as f:
            yaml.safe_load(f)
# CLI command registered on the `main` group; `module` is a required
# positional argument, the two options default to "docs.md" and 1.
@main.command()
@click.argument("module", required=True)
@click.option(
    "--output", default="docs.md", required=False, help="The output markdown file"
)
@click.option(
    "--separation-level", required=False, default=1, help="Organize markdown layout"
)
def makedoc(module, output, separation_level):
    """Make documentation for module `module`

    Example:

        \b
        # Make component documentation for kotaemon library
        $ kh makedoc kotaemon
    """
    # Function-scope import: the docs helper is imported only when this
    # command is actually invoked.
    from kotaemon.contribs.docs import make_doc

    # All three values are forwarded unchanged; `make_doc` uses
    # `separation_level` as the index into each dotted component name that
    # selects the section heading.
    make_doc(module, output, separation_level)
    print(f"Documentation exported to {output}")
def _type_label(obj):
    """Return a class's ``__name__``; any other object is returned unchanged."""
    return obj.__name__ if inspect.isclass(obj) else obj


def from_definition_to_markdown(definition: dict) -> str:
    """Render one component definition as a markdown fragment.

    Args:
        definition: mapping with the keys ``"desc"`` (docstring text),
            ``"params"`` (name -> dict with ``type``, ``desc``, ``default``)
            and ``"nodes"`` (name -> dict with ``type``, ``desc``, ``input``,
            ``output``).

    Returns:
        The cleaned description followed by a "Params" and a "Nodes" section;
        each section is a markdown table, or " N/A" when the mapping is empty.
    """
    # Params section
    param_defs = definition["params"]
    if param_defs:
        param_rows = [
            "\n| Name | Description | Type | Default |\n",
            "| --- | --- | --- | --- |\n",
        ]
        for name, spec in param_defs.items():
            kind = _type_label(spec["type"])
            param_rows.append(
                f"| {name} | {spec['desc']} | {kind} | {spec['default']} |\n"
            )
        params = "".join(param_rows)
    else:
        params = " N/A\n"

    # Nodes section
    node_defs = definition["nodes"]
    if node_defs:
        node_rows = [
            "\n| Name | Description | Type | Input | Output |\n",
            "| --- | --- | --- | --- | --- |\n",
        ]
        for name, spec in node_defs.items():
            kind = str(_type_label(spec["type"]))
            input_ = str(_type_label(spec["input"]))
            output_ = str(_type_label(spec["output"]))
            node_rows.append(f"|{name}|{spec['desc']}|{kind}|{input_}|{output_}|\n")
        nodes = "".join(node_rows)
    else:
        nodes = " N/A\n"

    description = inspect.cleandoc(definition["desc"])
    return f"{description}\n\n_**Params:**_{params}\n_**Nodes:**_{nodes}"
def config_from_value(value: Any) -> dict:
    """Build a promptui component config for a default value.

    The component is looked up in `DEFAULT_COMPONENT_BY_TYPES` by the name of
    the value's type and falls back to "text" for unlisted types.

    Args:
        value (Any): default value

    Returns:
        dict: config with the chosen "component" and the value under
        "params" -> "value"
    """
    type_name = type(value).__name__
    return {
        "component": DEFAULT_COMPONENT_BY_TYPES.get(type_name, "text"),
        "params": {"value": value},
    }
def handle_input(pipeline: Union[BaseComponent, Type[BaseComponent]]) -> dict:
    """Derive promptui input definitions from the signature of `pipeline.run`.

    Every regular parameter of `run` becomes a "text" input. A parameter
    that declares a default also gets ``"params": {"value": <default>}``.

    A default of ``None`` used to produce ``"component": None``, which
    `get_component` rejects as an unsupported component; it now falls back to
    "text" like every other input. Variadic parameters are skipped whatever
    they are called, not only when named ``args`` / ``kwargs``.

    Args:
        pipeline: a pipeline class or instance exposing a `run` method.

    Returns:
        dict: parameter name -> input definition, in signature order.
    """
    variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)

    inputs: Dict[str, Dict] = {}
    for name, param in inspect.signature(pipeline.run).parameters.items():
        # `*xs` / `**xs` cannot be rendered as a single widget. The literal
        # names are still skipped so existing configs keep the same keys.
        if param.kind in variadic or name in ("self", "args", "kwargs"):
            continue

        input_def: Dict[str, Optional[Any]] = {"component": "text"}
        if param.default is not param.empty:
            input_def["params"] = {"value": param.default}
        inputs[name] = input_def

    return inputs
outputs = [] if hasattr(pipeline, "_promptui_outputs"): outputs = pipeline._promptui_outputs config_obj: dict = { "ui-type": ui_type, "params": params, "inputs": {}, "outputs": outputs, "logs": { "full_pipeline": { "input": { "step": ".", "getter": "_get_input", }, "output": { "step": ".", "getter": "_get_output", }, "preference": { "step": "preference", }, } }, } else: outputs = [{"step": ".", "getter": "_get_output", "component": "text"}] if hasattr(pipeline, "_promptui_outputs"): outputs = pipeline._promptui_outputs config_obj = { "ui-type": ui_type, "params": handle_node(pipeline_def), "inputs": handle_input(pipeline), "outputs": outputs, "logs": { "full_pipeline": { "input": { "step": ".", "getter": "_get_input", }, "output": { "step": ".", "getter": "_get_output", }, }, }, } config = {f"{pipeline.__module__}.{pipeline.__class__.__name__}": config_obj} if path is not None: old_config = config if Path(path).is_file(): with open(path) as f: old_config = yaml.safe_load(f) old_config.update(config) with open(path, "w") as f: yaml.safe_dump(old_config, f, sort_keys=False) return config ================================================ FILE: libs/kotaemon/kotaemon/contribs/promptui/export.py ================================================ """Export logs into Excel file""" import os import pickle from pathlib import Path from typing import Any, Dict, List, Type, Union import pandas as pd import yaml from theflow.storage import storage from theflow.utils.modules import import_dotted_string from kotaemon.base import BaseComponent from .logs import ResultLog def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict: """Export the log to panda dataframes Args: pipeline_cls (Type[BaseComponent]): Pipeline class log_config (dict): Log config Returns: dataframe """ # get the directory pipeline_log_path = storage.url(pipeline_cls().config.store_result) dirs = list(sorted([f.path for f in os.scandir(pipeline_log_path) if f.is_dir()])) # get resultlog 
def export(config: dict, pipeline_def, output_path):
    """Write every configured log of a pipeline to one Excel workbook.

    Args:
        config: promptui config of a single pipeline; its "logs" mapping
            associates a sheet name with that log's column definitions.
        pipeline_def: pipeline class whose stored runs are read by
            `from_log_to_dict`.
        output_path: destination passed to `pd.ExcelWriter`.

    Raises:
        ValueError: if the config has no "logs" entry or it is empty.
    """
    pipeline_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"

    log_defs = config.get("logs", {})
    if not log_defs:
        raise ValueError(f"Pipeline {pipeline_name} has no logs to export")

    # Build all frames first; the workbook is only opened once every log has
    # been read successfully.
    frames = {
        sheet: pd.DataFrame(from_log_to_dict(pipeline_def, log_def))
        for sheet, log_def in log_defs.items()
    }

    # One sheet per log, named after the log.
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
        for sheet, frame in frames.items():
            frame.to_excel(writer, sheet_name=sheet)
class ResultLog:
    """Callback getter to get the desired log result

    The callback resolution will be as follow:
        1. Explicit string name
        2. Implicitly by: `get_<name>`
        3. Pass through
    """

    # NOTE: `from_log_to_dict` builds its set of allowed getters from
    # `dir()` of this class (or of a pipeline's `_promptui_resultlog`
    # override), keeping every name that does not start with "__". Any
    # attribute added here therefore becomes callable from a log config.

    @staticmethod
    def _get_input(obj):
        # Returns the value stored under the "input" key of `obj`.
        return obj["input"]

    @staticmethod
    def _get_output(obj):
        # Returns the value stored under the "output" key of `obj`.
        return obj["output"]
class Tunnel:
    """Expose a local port through the frpc tunneling binary.

    Typical use is ``Tunnel(appname, username, port).run()``, which downloads
    the binary on first use, starts it in the background and returns the
    public URL. The process is terminated by `kill()`, which is also
    registered with `atexit` when the tunnel starts.
    """

    # Seconds to wait for the binary download before giving up; without a
    # timeout an unresponsive server would block the CLI forever.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self, appname, username, local_port):
        self.proc = None  # running frpc process, None while not started
        self.url = None  # public URL, set by `run()`
        self.appname = appname
        self.username = username
        self.local_port = local_port

    @staticmethod
    def download_binary():
        """Download the tunneling binary to `BINARY_PATH` unless it exists.

        Raises:
            OSError: on HTTP 404 (no binary for this platform) or 403 (IP not
                allowed), and for connection errors or timeouts raised by
                `requests`.
            requests.HTTPError: for any other unsuccessful status code.
        """
        if Path(BINARY_PATH).exists():
            return

        print("First time setting tunneling...")
        resp = requests.get(BINARY_URL, timeout=Tunnel._DOWNLOAD_TIMEOUT)

        if resp.status_code == 404:
            raise OSError(
                f"Cannot set up a share link as this platform is incompatible. "
                "Please create a GitHub issue with information about your "
                f"platform: {platform.uname()}"
            )

        if resp.status_code == 403:
            raise OSError(
                "You do not have permission to setup the tunneling. Please "
                "make sure that you are within Cinnamon VPN or within other "
                "approved IPs. If this is new server, please contact @channel "
                "at #llm-productization to add your IP address"
            )

        resp.raise_for_status()

        # Save file data to local copy and mark it executable
        with open(BINARY_PATH, "wb") as file:
            file.write(resp.content)
        st = os.stat(BINARY_PATH)
        os.chmod(BINARY_PATH, st.st_mode | stat.S_IEXEC)

    def run(self) -> str:
        """Download the binary if needed, start the tunnel, return its URL."""
        if platform.system().lower() == "windows":
            logger.warning("Tunneling is not fully supported on Windows.")

        self.download_binary()
        self.url = self._start_tunnel(BINARY_PATH)
        return self.url

    def kill(self):
        """Terminate the tunnel process, if one is running, and reap it."""
        if self.proc is None:
            return

        print(f"Killing tunnel 127.0.0.1:{self.local_port} <> {self.url}")
        proc, self.proc = self.proc, None
        proc.terminate()
        try:
            # Wait so the child does not linger as a zombie.
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    def _start_tunnel(self, binary: str) -> str:
        """Start `binary` in the background and return the public URL.

        Args:
            binary: path of the tunneling executable.

        Returns:
            The ``https://<appname>.promptui.dm.cinnamon.is`` URL.
        """
        # NOTE(security): the server address and token below are hard-coded
        # credentials committed to source; they should be supplied through
        # configuration and the committed token rotated.
        command = [
            binary,
            "http",
            "-l",
            str(self.local_port),
            "-i",
            "127.0.0.1",
            "--uc",
            "--sd",
            str(self.appname),
            "-n",
            # f-string instead of `+` so non-string app/user names are joined
            # as text rather than added or rejected.
            f"{self.appname}{self.username}",
            "--server_addr",
            "44.229.38.9:7000",
            "--token",
            "Wz807/DyC;#t;#/",
            "--disable_log_color",
        ]
        # The output is never read, so it is discarded: an unread PIPE fills
        # up and eventually blocks the child process.
        self.proc = subprocess.Popen(
            command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        atexit.register(self.kill)
        return f"https://{self.appname}.promptui.dm.cinnamon.is"
class ChatBlock(ChatInterface):
    """The ChatBlock subclasses ChatInterface to provide extra functionalities:

    - Show additional outputs to the chat interface
    - Disallow blank user message
    """

    def __init__(
        self,
        *args,
        additional_outputs: str | Component | list[str | Component] | None = None,
        **kwargs,
    ):
        """Create the chat block

        Args:
            *args: forwarded to ``ChatInterface``
            additional_outputs: component(s), or component names, that receive
                the extra values returned by the chat function after the text
            **kwargs: forwarded to ``ChatInterface``
        """
        if additional_outputs:
            if not isinstance(additional_outputs, list):
                additional_outputs = [additional_outputs]
            self.additional_outputs = [
                get_component_instance(i) for i in additional_outputs  # type: ignore
            ]
        else:
            self.additional_outputs = []

        super().__init__(*args, **kwargs)

    async def _submit_fn(
        self,
        message: str,
        history_with_input: list[list[str | None]],
        request: Request,
        *args,
    ) -> tuple[Any, ...]:
        """Run the chat function and update the history and additional outputs

        Args:
            message: the user message
            history_with_input: the chat history, the last item being the
                message currently submitted
            request: the gradio request
            *args: values of the additional inputs followed by the current
                values of the additional outputs (see ``_setup_events``)

        Returns:
            the new history (twice: for the chatbot and for its state) followed
            by one value per additional output
        """
        # Split by position counted from the front. Slicing with a negative
        # length breaks when there is no additional output: `args[:-0]` is
        # empty and `args[-0:]` is everything, which would drop every
        # additional input and emit them as outputs instead.
        split_at = max(len(args) - len(self.additional_outputs), 0)
        input_args = args[:split_at]
        output_args = args[split_at:]

        if not message:
            # blank message: leave the history and the outputs untouched
            return history_with_input, history_with_input, *output_args

        history = history_with_input[:-1]
        inputs, _, _ = special_args(
            self.fn, inputs=[message, history, *input_args], request=request
        )

        if self.is_async:
            response = await self.fn(*inputs)
        else:
            response = await anyio.to_thread.run_sync(
                self.fn, *inputs, limiter=self.limiter
            )

        output = []
        if self.additional_outputs:
            # the chat function returns (text, *additional_output_values)
            text = response[0]
            output = response[1:]
        else:
            text = response

        history.append([message, text])
        return history, history, *output

    async def _stream_fn(
        self,
        message: str,
        history_with_input: list[list[str | None]],
        *args,
    ) -> AsyncGenerator:
        """Streaming is not supported by ChatBlock"""
        raise NotImplementedError("Stream function not implemented for ChatBlock")

    def _display_input(
        self, message: str, history: list[list[str | None]]
    ) -> tuple[list[list[str | None]], list[list[str | None]]]:
        """Stop displaying the input message if the message is a blank string"""
        if not message:
            return history, history
        return super()._display_input(message, history)

    def _setup_events(self) -> None:
        """Include additional outputs in the submit event"""
        submit_fn = self._stream_fn if self.is_generator else self._submit_fn
        submit_triggers = (
            [self.textbox.submit, self.submit_btn.click]
            if self.submit_btn
            else [self.textbox.submit]
        )
        # The additional outputs are passed as inputs too, so that their
        # current values can be returned unchanged on a blank message.
        submit_inputs = (
            [self.saved_input, self.chatbot_state]
            + self.additional_inputs
            + self.additional_outputs
        )
        submit_outputs = [self.chatbot, self.chatbot_state] + self.additional_outputs

        submit_event = (
            on(
                submit_triggers,
                self._clear_and_save_textbox,
                [self.textbox],
                [self.textbox, self.saved_input],
                api_name=False,
                queue=False,
            )
            .then(
                self._display_input,
                [self.saved_input, self.chatbot_state],
                [self.chatbot, self.chatbot_state],
                api_name=False,
                queue=False,
            )
            .then(
                submit_fn,
                submit_inputs,
                submit_outputs,
                api_name=False,
            )
        )
        self._setup_stop_events(submit_triggers, submit_event)

        if self.retry_btn:
            retry_event = (
                self.retry_btn.click(
                    self._delete_prev_fn,
                    [self.chatbot_state],
                    [self.chatbot, self.saved_input, self.chatbot_state],
                    api_name=False,
                    queue=False,
                )
                .then(
                    self._display_input,
                    [self.saved_input, self.chatbot_state],
                    [self.chatbot, self.chatbot_state],
                    api_name=False,
                    queue=False,
                )
                .then(
                    submit_fn,
                    submit_inputs,
                    submit_outputs,
                    api_name=False,
                )
            )
            self._setup_stop_events([self.retry_btn.click], retry_event)

        if self.undo_btn:
            # remove the last exchange, then put its message back in the textbox
            self.undo_btn.click(
                self._delete_prev_fn,
                [self.chatbot_state],
                [self.chatbot, self.saved_input, self.chatbot_state],
                api_name=False,
                queue=False,
            ).then(
                lambda x: x,
                [self.saved_input],
                [self.textbox],
                api_name=False,
                queue=False,
            )

        if self.clear_btn:
            self.clear_btn.click(
                lambda: ([], [], None),
                None,
                [self.chatbot, self.chatbot_state, self.saved_input],
                queue=False,
                api_name=False,
            )

    def _setup_api(self) -> None:
        """Register the "chat" API endpoint on the hidden API button"""
        api_fn = self._api_stream_fn if self.is_generator else self._api_submit_fn

        self.fake_api_btn.click(
            api_fn,
            [self.textbox, self.chatbot_state] + self.additional_inputs,
            [self.textbox, self.chatbot_state] + self.additional_outputs,
            api_name="chat",
        )
During an ongoing chat session, changing the parameters will not take any effect. 3. Chat and interact with the chat bot on the right panel. You can add any additional input (if any), and they will be supplied to the chatbot. 4. During chat, the log of the chat will show up in the "Output" tabs. This is empty by default, so if you want to show the log here, tell the AI developers to configure the UI settings. 5. When finishing chat, select your preference in the radio box. Click "End chat". This will save the chat log and the preference to disk. 6. To compare the result of different run, click "Export" to get an Excel spreadsheet summary of different run. ## Support: In case of errors, you can: - PromptUI instruction: https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui - Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon - Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization ## Contribute: - Follow installation at: https://github.com/Cinnamon/kotaemon/ """ def construct_chat_ui( config, func_new_chat, func_chat, func_end_chat, func_export_to_excel ) -> gr.Blocks: """Construct the prompt engineering UI for chat Args: config: the UI config func_new_chat: the function for starting a new chat session func_chat: the function for chatting interaction func_end_chat: the function for ending and saving the chat func_export_to_excel: the function to export the logs to excel Returns: the UI object """ inputs, outputs, params = [], [], [] for name, component_def in config.get("inputs", {}).items(): if "params" not in component_def: component_def["params"] = {} component_def["params"]["interactive"] = True component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = name # type: ignore inputs.append(component) for name, component_def in config.get("params", {}).items(): if "params" not in component_def: component_def["params"] = {} 
component_def["params"]["interactive"] = True component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = name # type: ignore params.append(component) for idx, component_def in enumerate(config.get("outputs", [])): if "params" not in component_def: component_def["params"] = {} component_def["params"]["interactive"] = False component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = f"Output {idx}" # type: ignore outputs.append(component) sess = gr.State(value=None) chatbot = gr.Chatbot(label="Chatbot", show_copy_button=True) chat = ChatBlock( func_chat, chatbot=chatbot, additional_inputs=[sess], additional_outputs=outputs ) param_state = gr.Textbox(interactive=False) with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo: sess.render() with gr.Accordion(label="HOW TO", open=False): gr.Markdown(USAGE_INSTRUCTION) with gr.Row(): run_btn = gr.Button("New chat") run_btn.click( func_new_chat, inputs=params, outputs=[ chat.chatbot, chat.chatbot_state, chat.saved_input, param_state, sess, *outputs, ], ) with gr.Accordion(label="End chat", open=False): likes = gr.Radio(["like", "dislike", "neutral"], value="neutral") save_log = gr.Checkbox( value=True, label="Save log", info="If saved, log can be exported later", show_label=True, ) end_btn = gr.Button("End chat") end_btn.click( func_end_chat, inputs=[likes, save_log, sess], outputs=[param_state, sess], ) with gr.Accordion(label="Export", open=False): exported_file = gr.File( label="Output file", show_label=True, height=100 ) export_btn = gr.Button("Export") export_btn.click(func_export_to_excel, inputs=[], outputs=exported_file) with gr.Row(): with gr.Column(): with gr.Tab("Params"): for component in params: component.render() with gr.Accordion(label="Session state", open=False): param_state.render() with gr.Tab("Outputs"): for component in outputs: 
def build_chat_ui(config, pipeline_def):
    """Build the chat UI

    Args:
        config: the UI config
        pipeline_def: the pipeline definition

    Returns:
        the UI object
    """
    output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
    exported_dir = output_dir.parent / "exported"
    exported_dir.mkdir(parents=True, exist_ok=True)

    resultlog = getattr(pipeline_def, "_promptui_resultlog", ResultLog)
    # only public/non-dunder attributes of the result log can be used as getter
    allowed_resultlog_callbacks = {i for i in dir(resultlog) if not i.startswith("__")}

    def new_chat(*args):
        """Start a new chat function

        Args:
            *args: the pipeline init params

        Returns:
            new empty states
        """
        gr.Info("Starting new session...")
        # "params" is optional in the config (the UI builder reads it with
        # `.get`), so it must not be required here either.
        params_def = config.get("params", {})
        param_dicts = dict(zip(params_def.keys(), args))
        for key, component_def in params_def.items():
            is_file = str(component_def.get("component", "")).lower() == "file"
            # an empty file component yields None, which has no `.name`
            if is_file and param_dicts.get(key) is not None:
                param_dicts[key] = param_dicts[key].name

        # TODO: currently hard-code as ChatConversation
        pipeline = pipeline_def()
        session = ChatConversation(bot=pipeline)
        session.set(param_dicts)
        session.start_session()

        param_state_str = "\n".join(
            f"- {name}: {value}" for name, value in param_dicts.items()
        )

        gr.Info("New chat session started.")
        return (
            [],
            [],
            None,
            param_state_str,
            session,
            *[None] * len(config.get("outputs", [])),
        )

    def chat(message, history, session, *args):
        """The chat interface

        # TODO: wrap the input and output of this chat function so that it
        work with more types of chat conversation than simple text

        Args:
            message: the message from the user
            history: the gradio history of the chat
            session: the chat object session
            *args: the additional inputs

        Returns:
            the response from the chatbot
        """
        if session is None:
            raise gr.Error(
                "No active chat session. Please set the params and click New chat"
            )

        pred = session(message)
        text_response = pred.content

        additional_outputs = []
        for output_def in config.get("outputs", []):
            value = session.last_run.logs(output_def["step"])
            getter = output_def.get("getter", None)
            if getter and getter in allowed_resultlog_callbacks:
                value = getattr(resultlog, getter)(value)
            additional_outputs.append(value)

        return text_response, *additional_outputs

    def end_chat(preference: str, save_log: bool, session):
        """End the chat session

        Args:
            preference: the preference of the user
            save_log: whether to save the result
            session: the chat object session

        Returns:
            the new empty state
        """
        if session is None:
            # "End chat" clicked before "New chat": report it the same way as
            # `chat` does rather than failing with an AttributeError
            raise gr.Error(
                "No active chat session. Please set the params and click New chat"
            )

        gr.Info("Ending session...")
        session.end_session()
        output_dir: Path = (
            Path(storage.url(session.config.store_result)) / session.last_run.id()
        )

        if not save_log:
            if output_dir.exists():
                import shutil

                shutil.rmtree(output_dir)

            session = None
            param_state = ""
            gr.Info("End session without saving log.")
            return param_state, session

        # add preference result to progress
        # NOTE(review): pickle is only safe here as long as the result folder
        # is written by this application alone.
        with (output_dir / "progress.pkl").open("rb") as fi:
            progress = pickle.load(fi)
            progress["preference"] = preference
        with (output_dir / "progress.pkl").open("wb") as fo:
            pickle.dump(progress, fo)

        # get the original params
        param_dicts = {
            name: session.getx(name) for name in config.get("params", {}).keys()
        }
        with (output_dir / "params.pkl").open("wb") as fo:
            pickle.dump(param_dicts, fo)

        session = None
        param_state = ""
        gr.Info("End session and save log.")
        return param_state, session

    def export_func():
        """Export the saved logs to an Excel file and return its path"""
        name = (
            f"{pipeline_def.__module__}.{pipeline_def.__name__}_{datetime.now()}.xlsx"
        )
        path = str(exported_dir / name)
        gr.Info(f"Begin exporting {name}...")
        try:
            export(config=config, pipeline_def=pipeline_def, output_path=path)
        except Exception as e:
            # keep the original traceback attached to the user-facing error
            raise gr.Error(
                f"Failed to export. Please contact project's AIR: {e}"
            ) from e
        gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
        return path

    demo = construct_chat_ui(
        config=config,
        func_new_chat=new_chat,
        func_chat=chat,
        func_end_chat=end_chat,
        func_export_to_excel=export_func,
    )
    return demo
""" inputs, outputs, params = [], [], [] for name, component_def in config.get("inputs", {}).items(): if "params" not in component_def: component_def["params"] = {} component_def["params"]["interactive"] = True component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = name # type: ignore inputs.append(component) for name, component_def in config.get("params", {}).items(): if "params" not in component_def: component_def["params"] = {} component_def["params"]["interactive"] = True component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = name # type: ignore params.append(component) for idx, component_def in enumerate(config.get("outputs", [])): if "params" not in component_def: component_def["params"] = {} component_def["params"]["interactive"] = False component = get_component(component_def) if hasattr(component, "label") and not component.label: # type: ignore component.label = f"Output {idx}" # type: ignore outputs.append(component) exported_file = gr.File(label="Output file", show_label=True) history_dataframe = gr.DataFrame(wrap=True) temp = gr.Tab with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo: with gr.Accordion(label="HOW TO", open=False): gr.Markdown(USAGE_INSTRUCTION) with gr.Accordion(label="Params History", open=False): with gr.Row(): save_btn = gr.Button("Save params") save_btn.click(func_save, inputs=params, outputs=history_dataframe) load_params_btn = gr.Button("Reload params") load_params_btn.click( func_load_params, inputs=[], outputs=history_dataframe ) history_dataframe.render() history_dataframe.select( func_activate_params, inputs=params, outputs=params ) with gr.Row(): run_btn = gr.Button("Run") run_btn.click(func_run, inputs=inputs + params, outputs=outputs) export_btn = gr.Button( "Export (Result will be in Exported file next to Output)" ) export_btn.click(func_export, 
def load_saved_params(path: str) -> Dict:
    """Collect every saved param file in a folder into a column-oriented dict

    Args:
        path: folder that contains the ``*.pkl`` param files

    Returns:
        a dict mapping each column name to a list with one entry per file
        (files sorted by path); ``_id`` holds the file stems and a param that
        is missing from a file is None
    """
    pickle_paths = sorted(Path(path).glob("*.pkl"))
    n_files = len(pickle_paths)

    table: Dict[str, Any] = {"_id": [each.stem for each in pickle_paths]}
    for row, pickle_path in enumerate(pickle_paths):
        with open(pickle_path, "rb") as fi:
            saved = pickle.load(fi)
        for column, value in saved.items():
            # first time a column is seen: start it as all-blank
            table.setdefault(column, [None] * n_files)[row] = value

    return table
if outputs_def: outputs = [] for output_def in outputs_def: output = pipeline.last_run.logs(output_def["step"]) getter = output_def.get("getter", None) if getter and getter in allowed_resultlog_callbacks: output = getattr(resultlog, getter)(output) outputs.append(output) if len(outputs_def) == 1: return outputs[0] return outputs def save_func(*args): params = {name: value for name, value in zip(params_name, args)} filename = save_dir / f"{int(time.time())}.pkl" with open(filename, "wb") as f: pickle.dump(params, f) gr.Info("Params saved") data = load_saved_params(str(save_dir)) return pd.DataFrame(data) def load_params_func(): data = load_saved_params(str(save_dir)) return pd.DataFrame(data) def activate_params_func(ev: gr.SelectData, *args): data = load_saved_params(str(save_dir)) output_args = [each for each in args] if ev.value is None: gr.Info(f'Blank value: "{ev.value}". Skip') return output_args column = list(data.keys())[ev.index[1]] if column not in params_name: gr.Info(f'Column "{column}" not in params. Skip') return output_args value = data[column][ev.index[0]] if value is None: gr.Info(f'Blank value: "{ev.value}". Skip') return output_args output_args[params_name.index(column)] = value return output_args def export_func(): name = ( f"{pipeline_def.__module__}.{pipeline_def.__name__}_{datetime.now()}.xlsx" ) path = str(exported_dir / name) gr.Info(f"Begin exporting {name}...") try: export(config=config, pipeline_def=pipeline_def, output_path=path) except Exception as e: raise gr.Error(f"Failed to export. Please contact project's AIR: {e}") gr.Info(f"Exported {name}. 
Please go to the `Exported file` tab to download") return path return construct_pipeline_ui( config, run_func, save_func, load_params_func, activate_params_func, export_func ) ================================================ FILE: libs/kotaemon/kotaemon/embeddings/__init__.py ================================================ from .base import BaseEmbeddings from .endpoint_based import EndpointEmbeddings from .fastembed import FastEmbedEmbeddings from .langchain_based import ( LCAzureOpenAIEmbeddings, LCCohereEmbeddings, LCGoogleEmbeddings, LCHuggingFaceEmbeddings, LCMistralEmbeddings, LCOpenAIEmbeddings, ) from .openai import AzureOpenAIEmbeddings, OpenAIEmbeddings from .tei_endpoint_embed import TeiEndpointEmbeddings from .voyageai import VoyageAIEmbeddings __all__ = [ "BaseEmbeddings", "EndpointEmbeddings", "TeiEndpointEmbeddings", "LCOpenAIEmbeddings", "LCAzureOpenAIEmbeddings", "LCCohereEmbeddings", "LCHuggingFaceEmbeddings", "LCGoogleEmbeddings", "LCMistralEmbeddings", "OpenAIEmbeddings", "AzureOpenAIEmbeddings", "FastEmbedEmbeddings", "VoyageAIEmbeddings", ] ================================================ FILE: libs/kotaemon/kotaemon/embeddings/base.py ================================================ from __future__ import annotations from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding class BaseEmbeddings(BaseComponent): def run( self, text: str | list[str] | Document | list[Document], *args, **kwargs ) -> list[DocumentWithEmbedding]: return self.invoke(text, *args, **kwargs) def invoke( self, text: str | list[str] | Document | list[Document], *args, **kwargs ) -> list[DocumentWithEmbedding]: raise NotImplementedError async def ainvoke( self, text: str | list[str] | Document | list[Document], *args, **kwargs ) -> list[DocumentWithEmbedding]: raise NotImplementedError def prepare_input( self, text: str | list[str] | Document | list[Document] ) -> list[Document]: if isinstance(text, (str, Document)): return [Document(content=text)] elif 
class EndpointEmbeddings(BaseEmbeddings):
    """
    An Embeddings component that uses an OpenAI API compatible endpoint.

    Attributes:
        endpoint_url (str): The url of an OpenAI API compatible endpoint.
    """

    endpoint_url: str

    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[DocumentWithEmbedding]:
        """
        Generate embeddings from text. One request is sent per input item.

        Args:
            text (str | list[str] | Document | list[Document]): text to generate
                embeddings from

        Returns:
            list[DocumentWithEmbedding]: embeddings, in the order of the input

        Raises:
            requests.HTTPError: if the endpoint answers with an error status
        """
        if not isinstance(text, list):
            text = [text]

        outputs = []

        for item in text:
            content = str(item)
            http_response = requests.post(self.endpoint_url, json={"input": content})
            # Fail with the actual HTTP error: an error payload has no "data"
            # key and would otherwise surface as an unrelated KeyError or JSON
            # decoding error.
            http_response.raise_for_status()
            response = http_response.json()

            outputs.append(
                DocumentWithEmbedding(
                    text=content,
                    embedding=response["data"][0]["embedding"],
                    total_tokens=response["usage"]["total_tokens"],
                    prompt_tokens=response["usage"]["prompt_tokens"],
                )
            )

        return outputs
class LCEmbeddingMixin:
    """Mixin that wraps a Langchain embedding class

    The init params are kept in ``_kwargs`` and the wrapped Langchain object in
    ``_obj``. Setting an attribute that is a field of the Langchain class
    updates ``_kwargs`` and rebuilds ``_obj``; reading an unknown attribute
    falls back to ``_kwargs`` and then to ``_obj``.

    Subclasses must implement ``_get_lc_class``.
    """

    def _get_lc_class(self):
        """Return the Langchain class to wrap"""
        raise NotImplementedError(
            "Please return the relevant Langchain class in _get_lc_class"
        )

    def __init__(self, **params):
        # `_lc_class` must be set first: `__setattr__` consults it
        self._lc_class = self._get_lc_class()
        self._obj = self._lc_class(**params)
        self._kwargs: dict = params

        super().__init__()

    def run(self, text):
        """Embed the input and return one DocumentWithEmbedding per document

        ``prepare_input`` is not defined on the mixin; it is expected from the
        class this mixin is combined with.
        """
        input_docs = self.prepare_input(text)
        input_ = [doc.text for doc in input_docs]

        embeddings = self._obj.embed_documents(input_)

        return [
            DocumentWithEmbedding(content=doc, embedding=each_embedding)
            for doc, each_embedding in zip(input_docs, embeddings)
        ]

    def __repr__(self):
        kwargs = []
        for key, value_obj in self._kwargs.items():
            value = repr(value_obj)
            kwargs.append(f"{key}={value}")
        kwargs_repr = ", ".join(kwargs)
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __str__(self):
        kwargs = []
        for key, value_obj in self._kwargs.items():
            value = str(value_obj)
            # keep long values (e.g. API keys) short in the printed form
            if len(value) > 20:
                value = f"{value[:15]}..."
            kwargs.append(f"{key}={value}")
        kwargs_repr = ", ".join(kwargs)
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __setattr__(self, name, value):
        if name == "_lc_class":
            return super().__setattr__(name, value)

        if name in self._lc_class.__fields__:
            # a Langchain param: record it and rebuild the wrapped object
            self._kwargs[name] = value
            self._obj = self._lc_class(**self._kwargs)
        else:
            super().__setattr__(name, value)

    def __getattr__(self, name):
        # `__getattr__` only runs when the normal lookup failed. If one of the
        # attributes used below is itself missing (instance created without
        # `__init__`, e.g. while being copied or unpickled), reading it here
        # would re-enter `__getattr__` forever and end in a RecursionError.
        if name in ("_lc_class", "_obj", "_kwargs"):
            raise AttributeError(
                f"{self.__class__.__name__!r} object has no attribute {name!r}"
            )

        if name in self._kwargs:
            return self._kwargs[name]
        return getattr(self._obj, name)

    def dump(self, *args, **kwargs):
        """Serialize the component type and its init params to a dict"""
        from theflow.utils.modules import serialize

        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
            **params,
        }

    def specs(self, path: str):
        """Return the param specs of a Langchain field

        Args:
            path: the field name; leading and trailing dots are ignored

        Raises:
            ValueError: if the path is nested or is not a Langchain field
        """
        path = path.strip(".")
        if "." in path:
            raise ValueError("path should not contain '.'")

        if path in self._lc_class.__fields__:
            return {
                "__type__": "theflow.base.ParamAttr",
                "refresh_on_set": True,
                "strict_type": True,
            }

        raise ValueError(f"Invalid param {path}")
class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
    """Langchain-backed HuggingFace embedding exposing only the key parameters

    The wrapped Langchain class is ``HuggingFaceBgeEmbeddings``.
    """

    model_name: str = Param(
        default=None,
        required=True,
        help=(
            "Model name to use (https://huggingface.co/models?"
            "pipeline_tag=sentence-similarity&sort=trending)"
        ),
    )

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
        **params,
    ):
        # the model name goes first, followed by any extra Langchain params
        init_kwargs = {"model_name": model_name, **params}
        super().__init__(**init_kwargs)

    def _get_lc_class(self):
        """Return the Langchain class, preferring the langchain_community package"""
        try:
            from langchain_community.embeddings import (
                HuggingFaceBgeEmbeddings as lc_class,
            )
        except ImportError:
            from langchain.embeddings import HuggingFaceBgeEmbeddings as lc_class

        return lc_class
class BaseOpenAIEmbeddings(BaseEmbeddings):
    """Base interface for OpenAI embedding model, using the openai library.

    This class exposes the parameters in resources.Chat. To subclass this class:

        - Implement the `prepare_client` method to return the OpenAI client
        - Implement the `openai_response` method to return the OpenAI response
        - Implement the params relate to the OpenAI client
    """

    _dependencies = ["openai"]

    api_key: str = Param(None, help="API key", required=True)
    timeout: Optional[float] = Param(None, help="Timeout for the API request.")
    max_retries: Optional[int] = Param(
        None, help="Maximum number of retries for the API request."
    )

    dimensions: Optional[int] = Param(
        None,
        help=(
            "The number of dimensions the resulting output embeddings should have. "
            "Only supported in `text-embedding-3` and later models."
        ),
    )
    context_length: Optional[int] = Param(
        None, help="The maximum context length of the embedding model"
    )

    @Param.auto(depends_on=["max_retries"])
    def max_retries_(self):
        """Resolve the retry count, using the openai default when unset."""
        if self.max_retries is None:
            from openai._constants import DEFAULT_MAX_RETRIES

            return DEFAULT_MAX_RETRIES
        return self.max_retries

    def prepare_client(self, async_version: bool = False):
        """Get the OpenAI client

        Args:
            async_version (bool): Whether to get the async version of the client
        """
        raise NotImplementedError

    def openai_response(self, client, **kwargs):
        """Get the openai response"""
        raise NotImplementedError

    def invoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the input synchronously.

        When `context_length` is set, each document is split into token chunks
        of at most that size; the chunk embeddings are then combined into one
        vector per document by a length-weighted average followed by L2
        normalization.

        Args:
            text: a string, a Document, or a list of either.
            **kwargs: forwarded to `openai_response`.

        Returns:
            One DocumentWithEmbedding per input document, in input order.
        """
        input_doc = self.prepare_input(text)
        client = self.prepare_client(async_version=False)

        input_: list[str | list[int]] = []
        # document index -> [start, end) range of its entries inside `input_`
        splitted_indices: dict[int, tuple[int, int]] = {}
        for idx, doc in enumerate(input_doc):
            # Empty text is replaced with a single space in every branch, the
            # same fallback `ainvoke` applies.
            content = doc.text or " "
            if self.context_length:
                chunks = split_text_by_chunk_size(content, self.context_length)
                splitted_indices[idx] = (len(input_), len(input_) + len(chunks))
                input_.extend(chunks)
            else:
                splitted_indices[idx] = (len(input_), len(input_) + 1)
                input_.append(content)

        resp = self.openai_response(client, input=input_, **kwargs).dict()
        output_ = sorted(resp["data"], key=lambda x: x["index"])

        output = []
        for idx, doc in enumerate(input_doc):
            start, end = splitted_indices[idx]
            embs = output_[start:end]
            if len(embs) == 1:
                output.append(
                    DocumentWithEmbedding(embedding=embs[0]["embedding"], content=doc)
                )
                continue

            # weight every chunk embedding by the number of tokens in the chunk
            chunk_lens = [len(chunk) for chunk in input_[start:end]]
            vs: list[list[float]] = [item["embedding"] for item in embs]
            emb = np.average(vs, axis=0, weights=chunk_lens)
            norm = np.linalg.norm(emb)
            if norm > 0:  # avoid producing NaNs for an all-zero average
                emb = emb / norm
            output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))

        return output

    async def ainvoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the input with the async client.

        Unlike `invoke`, no chunking by `context_length` is performed here.

        Args:
            text: a string, a Document, or a list of either.
            **kwargs: forwarded to `openai_response`.

        Returns:
            One DocumentWithEmbedding per input document, in input order.
        """
        input_ = self.prepare_input(text)
        client = self.prepare_client(async_version=True)
        # `await` binds looser than attribute access, so the awaitable has to
        # be resolved before `.dict()` is called on the response.
        response = await self.openai_response(
            client, input=[_.text if _.text else " " for _ in input_], **kwargs
        )
        resp = response.dict()
        output_ = sorted(resp["data"], key=lambda x: x["index"])
        return [
            DocumentWithEmbedding(embedding=o["embedding"], content=i)
            for i, o in zip(input_, output_)
        ]
class AzureOpenAIEmbeddings(BaseOpenAIEmbeddings):
    """Embedding component that talks to an Azure OpenAI deployment."""

    azure_endpoint: str = Param(
        None,
        help=(
            "HTTPS endpoint for the Azure OpenAI model. The azure_endpoint, "
            "azure_deployment, and api_version parameters are used to construct "
            "the full URL for the Azure OpenAI model."
        ),
        required=True,
    )
    azure_deployment: str = Param(None, help="Azure deployment name", required=True)
    api_version: str = Param(None, help="Azure model version", required=True)
    azure_ad_token: Optional[str] = Param(None, help="Azure AD token")
    azure_ad_token_provider: Optional[str] = Param(None, help="Azure AD token provider")

    @Param.auto(depends_on=["azure_ad_token_provider"])
    def azure_ad_token_provider_(self):
        """Import the provider named by a dotted path; None for non-string values."""
        provider_path = self.azure_ad_token_provider
        if not isinstance(provider_path, str):
            return None
        return import_dotted_string(provider_path, safe=False)

    def prepare_client(self, async_version: bool = False):
        """Build the Azure OpenAI client from the component parameters.

        Args:
            async_version (bool): Whether to get the async version of the client
        """
        client_kwargs = dict(
            azure_endpoint=self.azure_endpoint,
            api_version=self.api_version,
            api_key=self.api_key,
            azure_ad_token=self.azure_ad_token,
            azure_ad_token_provider=self.azure_ad_token_provider_,
            timeout=self.timeout,
            max_retries=self.max_retries_,
        )

        if async_version:
            from openai import AsyncAzureOpenAI as client_cls
        else:
            from openai import AzureOpenAI as client_cls

        return client_cls(**client_kwargs)

    @retry(
        retry=retry_if_not_exception_type(
            (openai.NotFoundError, openai.BadRequestError)
        ),
        wait=wait_random_exponential(min=1, max=40),
        stop=stop_after_attempt(6),
    )
    def openai_response(self, client, **kwargs):
        """Call the embeddings endpoint, using the deployment name as the model.

        Caller-supplied keyword arguments take precedence over the defaults
        assembled here.
        """
        request: dict = {"model": self.azure_deployment}
        if self.dimensions:
            request["dimensions"] = self.dimensions
        request = {**request, **kwargs}

        return client.embeddings.create(**request)
Ref: https://github.com/huggingface/text-embeddings-inference Attributes: endpoint_url (str): The url of an TEI (Text-Embedding-Inference) API compatible endpoint. normalize (bool): Whether to normalize embeddings to unit length. truncate (bool): Whether to truncate embeddings to a fixed/default length. """ endpoint_url: str = Param(None, help="TEI embedding service api base URL") normalize: bool = Param( True, help="Normalize embeddings to unit length", ) truncate: bool = Param( True, help="Truncate embeddings to a fixed/default length", ) async def client_(self, inputs: list[str]): async with aiohttp.ClientSession() as session: async with session.post( url=self.endpoint_url, json={ "inputs": inputs, "normalize": self.normalize, "truncate": self.truncate, }, ) as resp: embeddings = await resp.json() return embeddings async def ainvoke( self, text: str | list[str] | Document | list[Document], *args, **kwargs ) -> list[DocumentWithEmbedding]: if not isinstance(text, list): text = [text] text = self.prepare_input(text) outputs = [] batch_size = 6 num_batch = max(len(text) // batch_size, 1) for i in range(num_batch): if i == num_batch - 1: mini_batch = text[batch_size * i :] else: mini_batch = text[batch_size * i : batch_size * (i + 1)] mini_batch = [x.content for x in mini_batch] embeddings = await self.client_(mini_batch) # type: ignore outputs.extend( [ DocumentWithEmbedding(content=doc, embedding=embedding) for doc, embedding in zip(mini_batch, embeddings) ] ) return outputs def invoke( self, text: str | list[str] | Document | list[Document], *args, **kwargs ) -> list[DocumentWithEmbedding]: if not isinstance(text, list): text = [text] text = self.prepare_input(text) outputs = [] batch_size = 6 num_batch = max(len(text) // batch_size, 1) for i in range(num_batch): if i == num_batch - 1: mini_batch = text[batch_size * i :] else: mini_batch = text[batch_size * i : batch_size * (i + 1)] mini_batch = [x.content for x in mini_batch] embeddings = session.post( 
class VoyageAIEmbeddings(BaseEmbeddings):
    """Voyage AI provides best-in-class embedding models and rerankers."""

    api_key: str = Param(None, help="Voyage API key", required=False)
    model: str = Param(
        "voyage-3",
        help=(
            "Model name to use. The Voyage "
            "[documentation](https://docs.voyageai.com/docs/embeddings) "
            "provides a list of all available embedding models."
        ),
        required=True,
    )

    def __init__(self, *args, **kwargs):
        """Create the sync and async Voyage clients.

        Raises:
            ValueError: if no API key was provided.
        """
        super().__init__(*args, **kwargs)
        if not self.api_key:
            raise ValueError("API key must be provided for VoyageAIEmbeddings.")

        voyageai = _import_voyageai()
        self._client = voyageai.Client(api_key=self.api_key)
        self._aclient = voyageai.AsyncClient(api_key=self.api_key)

    def invoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the input with the synchronous client.

        Returns:
            One DocumentWithEmbedding per input text, in input order.
        """
        texts = [t.content for t in self.prepare_input(text)]
        embeddings = self._client.embed(texts, model=self.model).embeddings
        return _format_output(texts, embeddings)

    async def ainvoke(
        self, text: str | list[str] | Document | list[Document], *args, **kwargs
    ) -> list[DocumentWithEmbedding]:
        """Embed the input with the asynchronous client.

        Returns:
            One DocumentWithEmbedding per input text, in input order.
        """
        texts = [t.content for t in self.prepare_input(text)]
        # Await the call first: `await x.embed(...).embeddings` would read
        # `.embeddings` from the un-awaited result and fail.
        response = await self._aclient.embed(texts, model=self.model)
        return _format_output(texts, response.embeddings)
class LlamaIndexDocTransformerMixin:
    """Allow automatically wrapping a Llama-index component into kotaemon component

    Example:
        class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
            def _get_li_class(self):
                from llama_index.core.text_splitter import TokenTextSplitter
                return TokenTextSplitter

    To use this mixin, please:
        1. Use this class as the 1st parent class, so that Python will prefer to use
        the attributes and methods of this class whenever possible.
        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.
    """

    def _get_li_class(self) -> Type[NodeParser]:
        raise NotImplementedError(
            "Please return the relevant LlamaIndex class in _get_li_class"
        )

    def __init__(self, **params):
        """Instantiate the wrapped LlamaIndex object with `params`."""
        self._li_cls = self._get_li_class()
        self._obj = self._li_cls(**params)
        self._kwargs = params
        super().__init__()

    def __repr__(self):
        kwargs_repr = ", ".join(
            f"{key}={value_obj!r}" for key, value_obj in self._kwargs.items()
        )
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __str__(self):
        kwargs = []
        for key, value_obj in self._kwargs.items():
            value = str(value_obj)
            # keep the summary short: long values are cut to 15 chars + ellipsis
            if len(value) > 20:
                value = f"{value[:15]}..."
            kwargs.append(f"{key}={value}")

        kwargs_repr = ", ".join(kwargs)
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __setattr__(self, name: str, value: Any) -> None:
        """Store private/protected names on self; mirror others onto the wrapped object."""
        if name.startswith("_") or name in self._protected_keywords():
            return super().__setattr__(name, value)

        self._kwargs[name] = value
        return setattr(self._obj, name, value)

    def __getattr__(self, name: str) -> Any:
        """Resolve attributes missing on self from the stored kwargs or wrapped object.

        Raises:
            AttributeError: if the name cannot be resolved, including when the
                mixin's own state has not been set yet.
        """
        # `__getattr__` only runs when normal lookup fails. If the mixin's own
        # state is what is missing (bare instance, copy/unpickle, hasattr
        # probes), reading `self._kwargs` below would re-enter this method and
        # recurse without bound, so report the attribute as absent instead.
        if name in ("_kwargs", "_obj", "_li_cls"):
            raise AttributeError(name)

        if name in self._kwargs:
            return self._kwargs[name]
        return getattr(self._obj, name)

    def dump(self, *args, **kwargs):
        """Serialize the component as its dotted type path plus its parameters."""
        from theflow.utils.modules import serialize

        params = {key: serialize(value) for key, value in self._kwargs.items()}
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
            **params,
        }

    def run(
        self,
        documents: list[Document],
        **kwargs,
    ) -> list[Document]:
        """Run Llama-index node parser and convert the output to Document from
        kotaemon
        """
        docs = self._obj(documents, **kwargs)  # type: ignore
        return [Document.from_dict(doc.to_dict()) for doc in docs]
class SummaryExtractor(LlamaIndexDocTransformerMixin, BaseDocParser):
    """Wrap LlamaIndex's `SummaryExtractor` as a kotaemon document parser."""

    def __init__(
        self,
        llm=None,
        summaries: list[str] | None = None,
        **params,
    ):
        """Forward the parameters to the wrapped LlamaIndex extractor.

        Args:
            llm: passed through to the LlamaIndex class.
            summaries: passed through to the LlamaIndex class; defaults to
                `["self"]` when omitted.
            **params: any additional keyword arguments for the LlamaIndex class.
        """
        # Build a fresh list per instance: the mixin keeps the params it is
        # given, so a shared default list would leak edits across instances.
        if summaries is None:
            summaries = ["self"]
        super().__init__(llm=llm, summaries=summaries, **params)

    def _get_li_class(self):
        from llama_index.core.extractors import SummaryExtractor

        return SummaryExtractor
class DocumentIngestor(BaseComponent):
    """Read files, split them into nodes and optionally post-process the nodes.

    Readers are chosen per file extension, starting from
    `KH_DEFAULT_FILE_EXTRACTORS`.

    Args:
        pdf_mode: selects the reader used for `.pdf` files
            - "normal": PDFReader
            - "ocr": OCRReader
            - "multimodal": AdobeReader
            - any other value (e.g. "mathpix"): MathpixPDFReader
        doc_parsers: parsers applied, in order, to the nodes after splitting
        text_splitter: splitter that turns the loaded documents into nodes
        override_file_extractors: reader classes keyed by file extension; each is
            instantiated and replaces the default reader for that extension
    """

    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr", "multimodal"
    doc_parsers: list[BaseDocParser] = Param(default_callback=lambda _: [])
    text_splitter: BaseSplitter = TokenSplitter.withx(
        chunk_size=1024,
        chunk_overlap=256,
        separator="\n\n",
        backup_separators=["\n", ".", " ", "\u200B"],
    )
    override_file_extractors: dict[str, Type[BaseReader]] = {}

    def _get_reader(self, input_files: list[str | Path]):
        """Build a DirectoryReader configured with one reader per extension."""
        extractors: dict[str, BaseReader] = dict(KH_DEFAULT_FILE_EXTRACTORS)
        for extension, reader_cls in self.override_file_extractors.items():
            extractors[extension] = reader_cls()

        # the pdf reader is decided last, so `pdf_mode` always wins for ".pdf"
        pdf_readers = {
            "normal": PDFReader,
            "ocr": OCRReader,
            "multimodal": AdobeReader,
        }
        pdf_reader_cls = pdf_readers.get(self.pdf_mode, MathpixPDFReader)
        extractors[".pdf"] = pdf_reader_cls()

        return DirectoryReader(
            input_files=input_files,
            file_extractor=extractors,
        )

    def run(self, file_paths: list[str | Path] | str | Path) -> list[Document]:
        """Load the given files and return the resulting nodes.

        Args:
            file_paths: a single path or a list of paths

        Returns:
            the nodes produced by the splitter, after every doc parser has run
        """
        paths = file_paths if isinstance(file_paths, list) else [file_paths]

        reader = self._get_reader(input_files=paths)
        documents = reader()
        print(f"Read {len(paths)} files into {len(documents)} documents.")

        nodes = self.text_splitter(documents)
        print(f"Transform {len(documents)} documents into {len(nodes)} nodes.")
        self.log_progress(".num_docs", num_docs=len(nodes))

        # run the optional document parsers one after another
        for parser in self.doc_parsers or []:
            nodes = parser(nodes)

        return nodes
class CitationPipeline(BaseComponent):
    """Ask the LLM, through a forced tool call, for quotes from the context that
    support an answer to the question"""

    llm: BaseLLM

    def run(self, context: str, question: str):
        """Alias of `invoke`."""
        return self.invoke(context, question)

    def prepare_llm(self, context: str, question: str):
        """Build the chat messages and the tool-calling kwargs for the LLM.

        Returns:
            a `(messages, llm_kwargs)` tuple
        """
        cite_schema = CiteEvidence.schema()
        tool_spec = {
            "type": "function",
            "function": {
                "name": cite_schema["title"],
                "description": cite_schema["description"],
                "parameters": cite_schema,
            },
        }
        llm_kwargs = {
            "tools": [tool_spec],
            "tool_choice": "required",
            "tools_pydantic": [CiteEvidence],
        }

        system_message = SystemMessage(
            content=(
                "You are a world class algorithm to answer "
                "questions with correct and exact citations."
            )
        )
        human_contents = [
            (
                "Answer question using the following context. "
                "Use the provided function CiteEvidence() to cite your sources."
            ),
            context,
            f"Question: {question}",
            (
                "Tips: Make sure to cite your sources, "
                "and use the exact words from the context."
            ),
        ]
        messages = [system_message]
        messages.extend(HumanMessage(content=content) for content in human_contents)
        return messages, llm_kwargs

    def invoke(self, context: str, question: str):
        """Call the LLM and parse the first tool call into a `CiteEvidence`.

        Returns:
            the parsed `CiteEvidence`, or None when the LLM made no tool call or
            when any exception occurs (the exception is printed, not raised)
        """
        messages, llm_kwargs = self.prepare_llm(context, question)
        try:
            print("CitationPipeline: invoking LLM")
            llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
            print("CitationPipeline: finish invoking LLM")

            tool_calls = llm_output.additional_kwargs.get("tool_calls")
            if not tool_calls:
                return None

            first_call = tool_calls[0]
            if "function" in first_call:
                # openai and cohere format
                raw_arguments = first_call["function"]["arguments"]
            else:
                # anthropic format
                raw_arguments = first_call["args"]

            print("CitationPipeline:", raw_arguments)

            # string payloads are parsed as raw text, anything else as an object
            if isinstance(raw_arguments, str):
                return CiteEvidence.parse_raw(raw_arguments)
            return CiteEvidence.parse_obj(raw_arguments)
        except Exception as e:
            print(e)
            return None

    async def ainvoke(self, context: str, question: str):
        raise NotImplementedError()
class AnswerWithContextPipeline(BaseComponent):
    """Answer the question based on the evidence

    Args:
        llm: the language model to generate the answer
        citation_pipeline: generates citation from the evidence
        qa_template: the prompt template for LLM to generate answer (refer to
            evidence_mode)
        qa_table_template: the prompt template for LLM to generate answer for table
            (refer to evidence_mode)
        qa_chatbot_template: the prompt template for LLM to generate answer for
            pre-made scenarios (refer to evidence_mode)
        qa_figure_template: the prompt template used for figure evidence when
            `use_multimodal` is enabled
        lang: the language of the answer. Currently support English and Japanese
        n_last_interactions: number of most recent (human, ai) history pairs that
            are replayed to the LLM; 0 disables history
    """

    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    vlm_endpoint: str = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
    use_multimodal: bool = getattr(flowsettings, "KH_REASONINGS_USE_MULTIMODAL", True)
    citation_pipeline: CitationPipeline = Node(
        default_callback=lambda _: CitationPipeline(llm=llms.get_default())
    )
    create_mindmap_pipeline: CreateMindmapPipeline = Node(
        default_callback=lambda _: CreateMindmapPipeline(llm=llms.get_default())
    )

    qa_template: str = DEFAULT_QA_TEXT_PROMPT
    qa_table_template: str = DEFAULT_QA_TABLE_PROMPT
    qa_chatbot_template: str = DEFAULT_QA_CHATBOT_PROMPT
    qa_figure_template: str = DEFAULT_QA_FIGURE_PROMPT

    enable_citation: bool = False
    enable_mindmap: bool = False
    enable_citation_viz: bool = False

    system_prompt: str = ""
    lang: str = "English"  # support English and Japanese
    n_last_interactions: int = 5

    def get_prompt(self, question, evidence, evidence_mode: int):
        """Prepare the prompt and other information for LLM

        The template is picked from `evidence_mode`: text, table, figure (which
        falls back to the text template when `use_multimodal` is off), and the
        chatbot template for any other value.

        Returns:
            a `(prompt, evidence)` tuple
        """
        if evidence_mode == EVIDENCE_MODE_TEXT:
            prompt_template = PromptTemplate(self.qa_template)
        elif evidence_mode == EVIDENCE_MODE_TABLE:
            prompt_template = PromptTemplate(self.qa_table_template)
        elif evidence_mode == EVIDENCE_MODE_FIGURE:
            if self.use_multimodal:
                prompt_template = PromptTemplate(self.qa_figure_template)
            else:
                prompt_template = PromptTemplate(self.qa_template)
        else:
            prompt_template = PromptTemplate(self.qa_chatbot_template)

        prompt = prompt_template.populate(
            context=evidence,
            question=question,
            lang=self.lang,
        )

        return prompt, evidence

    def run(
        self, question: str, evidence: str, evidence_mode: int = 0, **kwargs
    ) -> Document:
        """Alias of `invoke`."""
        return self.invoke(question, evidence, evidence_mode, **kwargs)

    def invoke(
        self,
        question: str,
        evidence: str,
        evidence_mode: int = 0,
        images: list[str] = [],
        **kwargs,
    ) -> Document:
        """Not implemented; use `stream`."""
        raise NotImplementedError

    async def ainvoke(  # type: ignore
        self,
        question: str,
        evidence: str,
        evidence_mode: int = 0,
        images: list[str] = [],
        **kwargs,
    ) -> Document:
        """Answer the question based on the evidence (not implemented)

        In addition to the question and the evidence, this method also take into
        account evidence_mode. The evidence_mode tells which kind of evidence is.
        The kind of evidence affects:
            1. How the evidence is represented.
            2. The prompt to generate the answer.

        By default, the evidence_mode is 0, which means the evidence is plain text
        with no particular semantic representation. The evidence_mode can be:
            1. "table": There will be HTML markup telling that there is a table
                within the evidence.
            2. "chatbot": There will be HTML markup telling that there is a chatbot.
                This chatbot is a scenario, extracted from an Excel file, where each
                row corresponds to an interaction.

        Args:
            question: the original question posed by user
            evidence: the text that contain relevant information to answer the
                question (determined by retrieval pipeline)
            evidence_mode: the mode of evidence, 0 for text, 1 for table, 2 for
                chatbot (NOTE(review): numeric values are not visible here, confirm
                against the EVIDENCE_MODE_* constants)
        """
        raise NotImplementedError

    def stream(  # type: ignore
        self,
        question: str,
        evidence: str,
        evidence_mode: int = 0,
        images: list[str] = [],
        **kwargs,
    ) -> Generator[Document, None, Document]:
        """Stream the answer, then return it together with citation metadata.

        Citation and mindmap extraction (when enabled and evidence is present)
        run in background threads while the answer is generated, and are waited
        for at most `CITATION_TIMEOUT` seconds each.

        Args:
            question: the user question
            evidence: retrieved context; when empty the question is sent as-is
            evidence_mode: selects the prompt template (see `get_prompt`)
            images: image URLs attached for figure evidence in multimodal mode
                (at most `MAX_IMAGES` are used)
            **kwargs: `history` may hold a list of (human, ai) message pairs

        Yields:
            `Document(channel="chat")` pieces of the answer

        Returns:
            the final answer Document, whose metadata carries `citation`,
            `mindmap`, `citation_viz` and `qa_score`
        """
        history = kwargs.get("history", [])
        print(f"Got {len(images)} images")
        # check if evidence exists, use QA prompt
        if evidence:
            prompt, evidence = self.get_prompt(question, evidence, evidence_mode)
        else:
            prompt = question

        # retrieve the citation
        citation = None
        mindmap = None

        def citation_call():
            nonlocal citation
            citation = self.citation_pipeline(context=evidence, question=question)

        def mindmap_call():
            nonlocal mindmap
            mindmap = self.create_mindmap_pipeline(context=evidence, question=question)

        citation_thread = None
        mindmap_thread = None

        # execute function call in thread
        if evidence:
            if self.enable_citation:
                citation_thread = threading.Thread(target=citation_call)
                citation_thread.start()

            if self.enable_mindmap:
                mindmap_thread = threading.Thread(target=mindmap_call)
                mindmap_thread.start()

        output = ""
        logprobs = []

        messages = []
        if self.system_prompt:
            messages.append(SystemMessage(content=self.system_prompt))

        # `history[-0:]` would be the WHOLE history, so a setting of 0 has to be
        # handled explicitly to really mean "no history".
        if self.n_last_interactions > 0:
            recent_history = history[-self.n_last_interactions :]
        else:
            recent_history = []
        for human, ai in recent_history:
            messages.append(HumanMessage(content=human))
            messages.append(AIMessage(content=ai))

        if self.use_multimodal and evidence_mode == EVIDENCE_MODE_FIGURE:
            # create image message:
            messages.append(
                HumanMessage(
                    content=[
                        {"type": "text", "text": prompt},
                    ]
                    + [
                        {
                            "type": "image_url",
                            "image_url": {"url": image},
                        }
                        for image in images[:MAX_IMAGES]
                    ],
                )
            )
        else:
            # append main prompt
            messages.append(HumanMessage(content=prompt))

        try:
            # try streaming first
            print("Trying LLM streaming")
            for out_msg in self.llm.stream(messages):
                output += out_msg.text
                logprobs += out_msg.logprobs
                yield Document(channel="chat", content=out_msg.text)
        except NotImplementedError:
            print("Streaming is not supported, falling back to normal processing")
            output = self.llm(messages).text
            yield Document(channel="chat", content=output)

        if logprobs:
            qa_score = np.exp(np.average(logprobs))
        else:
            qa_score = None

        if citation_thread:
            citation_thread.join(timeout=CITATION_TIMEOUT)
        if mindmap_thread:
            mindmap_thread.join(timeout=CITATION_TIMEOUT)

        answer = Document(
            text=output,
            metadata={
                "citation_viz": self.enable_citation_viz,
                "mindmap": mindmap,
                "citation": citation,
                "qa_score": qa_score,
            },
        )

        return answer

    def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
        """Match the evidence with the context

        Returns:
            a mapping from doc id to the `{"start", "end"}` character spans where
            a cited quote was found; matches containing "|" are ignored
        """
        spans: dict[str, list[dict]] = defaultdict(list)

        if not answer.metadata["citation"]:
            return spans

        evidences = answer.metadata["citation"].evidences

        for quote in evidences:
            for doc in docs:
                matches = find_text(quote, doc.text)

                for start, end in matches:
                    if "|" not in doc.text[start:end]:
                        spans[doc.doc_id].append(
                            {
                                "start": start,
                                "end": end,
                            }
                        )

        return spans

    def prepare_citations(self, answer, docs) -> tuple[list[Document], list[Document]]:
        """Prepare the citations to show on the UI

        Returns:
            a `(with_citation, without_citation)` tuple of info Documents: the
            first holds docs with highlighted cited spans, the second the
            remaining docs ordered by descending `llm_trulens_score`
        """
        with_citation, without_citation = [], []
        has_llm_score = any("llm_trulens_score" in doc.metadata for doc in docs)

        spans = self.match_evidence_with_context(answer, docs)
        id2docs = {doc.doc_id: doc for doc in docs}
        not_detected = set(id2docs.keys()) - set(spans.keys())

        # render highlight spans
        for _id, ss in spans.items():
            if not ss:
                not_detected.add(_id)
                continue
            cur_doc = id2docs[_id]
            highlight_text = ""

            ss = sorted(ss, key=lambda x: x["start"])
            last_end = 0
            text = cur_doc.text[: ss[0]["start"]]

            for idx, span in enumerate(ss):
                # prevent overlapping between span
                span_start = max(last_end, span["start"])
                span_end = max(last_end, span["end"])

                to_highlight = cur_doc.text[span_start:span_end]
                last_end = span_end

                # append to highlight on PDF viewer
                highlight_text += (" " if highlight_text else "") + to_highlight

                span_idx = span.get("idx", None)
                if span_idx is not None:
                    to_highlight = f"【{span_idx}】" + to_highlight

                text += Render.highlight(
                    to_highlight,
                    elem_id=str(span_idx) if span_idx is not None else None,
                )
                if idx < len(ss) - 1:
                    # Continue from the clamped end so text already emitted by
                    # an enclosing span is not repeated when spans are nested.
                    next_start = max(last_end, ss[idx + 1]["start"])
                    text += cur_doc.text[last_end:next_start]

            # same reasoning for the tail: resume after everything emitted so far
            text += cur_doc.text[last_end:]
            # add to display list
            with_citation.append(
                Document(
                    channel="info",
                    content=Render.collapsible_with_header_score(
                        cur_doc,
                        override_text=text,
                        highlight_text=highlight_text,
                        open_collapsible=True,
                    ),
                )
            )

        print("Got {} cited docs".format(len(with_citation)))

        sorted_not_detected_items_with_scores = [
            (id_, id2docs[id_].metadata.get("llm_trulens_score", 0.0))
            for id_ in not_detected
        ]
        sorted_not_detected_items_with_scores.sort(key=lambda x: x[1], reverse=True)

        for id_, _ in sorted_not_detected_items_with_scores:
            doc = id2docs[id_]
            doc_score = doc.metadata.get("llm_trulens_score", 0.0)
            # open by default unless a score exists and falls below the warning level
            is_open = not has_llm_score or (
                doc_score
                > CONTEXT_RELEVANT_WARNING_SCORE
                # and len(with_citation) == 0
            )
            without_citation.append(
                Document(
                    channel="info",
                    content=Render.collapsible_with_header_score(
                        doc, open_collapsible=is_open
                    ),
                )
            )
        return with_citation, without_citation
def answer_to_citations(self, answer) -> list[InlineEvidence]:
    """Parse the citation list emitted by the LLM into evidences.

    The answer is scanned line by line. A line matching ``CITATION_PATTERN``
    opens a new evidence (flushing the one in progress, if any); a line
    starting with ``START_ANSWER_PATTERN`` / ``END_ANSWER_PATTERN`` fills the
    corresponding phrase. An evidence is emitted as soon as both phrases are
    non-empty, and whatever is still pending after the last line is emitted
    as-is, so returned evidences may have missing phrases.

    Args:
        answer: raw LLM output text.

    Returns:
        The evidences in the order they appear in the text.
    """
    phrase_fields = (
        (START_ANSWER_PATTERN, "start_phrase"),
        (END_ANSWER_PATTERN, "end_phrase"),
    )
    collected: list[InlineEvidence] = []
    pending = None

    for raw_line in answer.split("\n"):
        lowered = raw_line.lower()

        header = re.match(CITATION_PATTERN, lowered)
        if header:
            try:
                citation_idx = int(header.group(1))
            except ValueError:
                citation_idx = None

            # a new citation header closes the evidence in progress
            if pending:
                collected.append(pending)
            pending = InlineEvidence(idx=citation_idx)
            continue

        for keyword, field_name in phrase_fields:
            if not lowered.startswith(keyword):
                continue
            if not pending:
                # phrase seen without a preceding citation header
                pending = InlineEvidence(idx=None)
            setattr(pending, field_name, raw_line[len(keyword) :].strip())
            break

        if pending and pending.end_phrase and pending.start_phrase:
            collected.append(pending)
            pending = None

    if pending:
        collected.append(pending)

    return collected
self.enable_mindmap: mindmap_thread = threading.Thread(target=mindmap_call) mindmap_thread.start() messages = [] if self.system_prompt: messages.append(SystemMessage(content=self.system_prompt)) for human, ai in history[-self.n_last_interactions :]: messages.append(HumanMessage(content=human)) messages.append(AIMessage(content=ai)) if self.use_multimodal and evidence_mode == EVIDENCE_MODE_FIGURE: # create image message: messages.append( HumanMessage( content=[ {"type": "text", "text": prompt}, ] + [ { "type": "image_url", "image_url": {"url": image}, } for image in images[:MAX_IMAGES] ], ) ) else: # append main prompt messages.append(HumanMessage(content=prompt)) final_answer = "" try: # try streaming first print("Trying LLM streaming") for out_msg in self.llm.stream(messages): if evidence: if START_ANSWER in output: if not final_answer: try: left_over_answer = output.split(START_ANSWER)[ 1 ].lstrip() except IndexError: left_over_answer = "" if left_over_answer: out_msg.text = left_over_answer + out_msg.text final_answer += ( out_msg.text.lstrip() if not final_answer else out_msg.text ) yield Document(channel="chat", content=out_msg.text) # check for the edge case of citation list is repeated # with smaller LLMs if START_CITATION in out_msg.text: break else: yield Document(channel="chat", content=out_msg.text) output += out_msg.text logprobs += out_msg.logprobs except NotImplementedError: print("Streaming is not supported, falling back to normal processing") output = self.llm(messages).text yield Document(channel="chat", content=output) if logprobs: qa_score = np.exp(np.average(logprobs)) else: qa_score = None citation = self.answer_to_citations(output) if mindmap_thread: mindmap_thread.join(timeout=CITATION_TIMEOUT) # convert citation to link answer = Document( text=final_answer, metadata={ "citation_viz": self.enable_citation_viz, "mindmap": mindmap, "citation": citation, "qa_score": qa_score, }, ) # yield the final answer final_answer = 
def match_evidence_with_context(self, answer, docs) -> dict[str, list[dict]]:
    """Match each inline evidence with the best-fitting retrieved document.

    Args:
        answer: object whose ``metadata["citation"]`` is a (possibly empty)
            sequence of evidences exposing ``start_phrase``, ``end_phrase``
            and ``idx``.
        docs: retrieved documents; each must expose ``text`` and ``doc_id``.

    Returns:
        A mapping ``doc_id -> [{"start", "end", "idx"}, ...]``. Every
        evidence contributes at most one span, placed in the document with
        the longest match. ``idx`` falls back to the evidence's 1-based
        position when the parsed citation number is missing.
    """
    spans: dict[str, list[dict]] = defaultdict(list)

    evidences = answer.metadata["citation"]
    if not evidences:
        return spans

    for e_id, evidence in enumerate(evidences):
        # A phrase is None when the LLM omitted its START_PHRASE/END_PHRASE
        # line. find_start_end_phrase lower-cases both phrases, so None would
        # raise AttributeError; normalise to "" (which can never match).
        start_phrase = evidence.start_phrase or ""
        end_phrase = evidence.end_phrase or ""
        if not start_phrase and not end_phrase:
            # nothing to search for
            continue

        evidence_idx = evidence.idx
        if evidence_idx is None:
            evidence_idx = e_id + 1

        best_match = None
        best_match_length = 0
        best_match_doc_idx = None

        for doc in docs:
            match, match_length = find_start_end_phrase(
                start_phrase, end_phrase, doc.text
            )
            if best_match is None or (
                match is not None and match_length > best_match_length
            ):
                best_match = match
                best_match_length = match_length
                best_match_doc_idx = doc.doc_id

        if best_match is not None and best_match_doc_idx is not None:
            spans[best_match_doc_idx].append(
                {
                    "start": best_match[0],
                    "end": best_match[1],
                    "idx": evidence_idx,
                }
            )

    return spans
""" max_context_length: int = 32000 trim_func: TokenSplitter | None = None def run(self, docs: list[RetrievedDocument]) -> Document: evidence = "" images = [] table_found = 0 evidence_modes = [] evidence_trim_func = ( self.trim_func if self.trim_func else TokenSplitter( chunk_size=self.max_context_length, chunk_overlap=0, separator=" ", tokenizer=partial( tiktoken.encoding_for_model("gpt-3.5-turbo").encode, allowed_special=set(), disallowed_special="all", ), ) ) for _, retrieved_item in enumerate(docs): retrieved_content = "" page = retrieved_item.metadata.get("page_label", None) source = filename = retrieved_item.metadata.get("file_name", "-") if page: source += f" (Page {page})" if retrieved_item.metadata.get("type", "") == "table": evidence_modes.append(EVIDENCE_MODE_TABLE) if table_found < 5: retrieved_content = retrieved_item.metadata.get( "table_origin", retrieved_item.text ) if retrieved_content not in evidence: table_found += 1 evidence += ( f"
Table from {source}\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "chatbot": evidence_modes.append(EVIDENCE_MODE_CHATBOT) retrieved_content = retrieved_item.metadata["window"] evidence += ( f"
Chatbot scenario from {filename} (Row {page})\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "image": evidence_modes.append(EVIDENCE_MODE_FIGURE) retrieved_content = retrieved_item.metadata.get("image_origin", "") retrieved_caption = html.escape(retrieved_item.get_content()) evidence += ( f"
Figure from {source}\n" + "" + "\n
" ) images.append(retrieved_content) else: if "window" in retrieved_item.metadata: retrieved_content = retrieved_item.metadata["window"] else: retrieved_content = retrieved_item.text retrieved_content = retrieved_content.replace("\n", " ") if retrieved_content not in evidence: evidence += ( f"
Content from {source}: " + retrieved_content + " \n
def find_start_end_phrase(
    start_phrase, end_phrase, context, min_length=5, max_excerpt_length=300
):
    """Find the excerpt of ``context`` delimited by two phrases.

    Each phrase is matched case-insensitively against ``context`` (with
    newlines replaced by spaces) using the longest common substring. A phrase
    counts as found only when that substring is longer than both
    ``min_length`` and 35% of the phrase.

    Args:
        start_phrase: phrase expected at the start of the excerpt, or None.
        end_phrase: phrase expected at the end of the excerpt, or None.
        context: text to search in.
        min_length: minimum matched length for a phrase to be accepted.
        max_excerpt_length: maximum length of the returned excerpt; longer
            excerpts are truncated from the end.

    Returns:
        ``(span, matched_length)`` where ``span`` is ``(start, end)`` offsets
        into ``context`` or None when neither phrase was found, and
        ``matched_length`` is the total number of matched characters over
        both phrases.
    """
    context = context.lower().replace("\n", " ")

    matches = []
    matched_length = 0
    for sentence in (start_phrase, end_phrase):
        # Skip a missing phrase *before* touching it: lower-casing None
        # up-front would raise AttributeError and make this guard unreachable.
        if sentence is None:
            continue
        sentence = sentence.lower()

        match = SequenceMatcher(
            None, sentence, context, autojunk=False
        ).find_longest_match(0, len(sentence), 0, len(context))
        if match.size > max(len(sentence) * 0.35, min_length):
            matches.append((match.b, match.b + match.size))
            matched_length += match.size

    # if the second match starts before the first one, keep only the first
    if len(matches) == 2 and matches[1][0] < matches[0][0]:
        matches = [matches[0]]

    if not matches:
        return None, matched_length

    start_idx = min(start for start, _ in matches)
    end_idx = max(end for _, end in matches)
    # cap the excerpt length
    if end_idx - start_idx > max_excerpt_length:
        end_idx = start_idx + max_excerpt_length

    return (start_idx, end_idx), matched_length
Thought
', # noqa ) text = text.replace("", "
") return text def strip_think_tag(text): if "
class CohereReranking(BaseReranking):
    """Re-order documents by relevance using the Cohere rerank API."""

    # name of the Cohere rerank model passed to the API
    model_name: str = "rerank-multilingual-v2.0"
    # API key; read from the COHERE_API_KEY setting, empty when unset
    cohere_api_key: str = config("COHERE_API_KEY", "")
    # when True and no key is configured, try the key of ktem's "cohere"
    # embedding model
    use_key_from_ktem: bool = False

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use Cohere Reranker model to re-order documents
        with their relevance score

        Args:
            documents: documents to re-rank.
            query: the query the documents are scored against.

        Returns:
            The documents in the order returned by Cohere, each with
            ``metadata["reranking_score"]`` set. When no API key can be
            found the input list is returned unchanged.

        Raises:
            ImportError: if the ``cohere`` package is not installed.
        """
        try:
            import cohere
        except ImportError as exc:
            # keep the original failure attached for easier debugging
            raise ImportError(
                "Please install Cohere `pip install cohere` to use Cohere Reranking"
            ) from exc

        # try to get COHERE_API_KEY from embeddings
        if not self.cohere_api_key and self.use_key_from_ktem:
            try:
                from ktem.embeddings.manager import (
                    embedding_models_manager as embeddings,
                )

                cohere_model = embeddings.get("cohere")
                ktem_cohere_api_key = cohere_model._kwargs.get(  # type: ignore
                    "cohere_api_key"
                )
                # NOTE(review): "your-key" looks like the placeholder value
                # of an unconfigured model -- confirm against ktem defaults.
                if ktem_cohere_api_key != "your-key":
                    self.cohere_api_key = ktem_cohere_api_key
            except Exception as e:
                # best effort only: fall through to the "no key" path below
                print("Cannot get Cohere API key from `ktem`", e)

        if not self.cohere_api_key:
            print("Cohere API key not found. Skipping rerankings.")
            return documents

        compressed_docs: list[Document] = []
        if not documents:
            # avoid building a client and making an empty API call
            return compressed_docs

        cohere_client = cohere.Client(self.cohere_api_key)
        response = cohere_client.rerank(
            model=self.model_name,
            query=query,
            documents=[d.content for d in documents],
        )
        for r in response.results:
            doc = documents[r.index]
            doc.metadata["reranking_score"] = r.relevance_score
            compressed_docs.append(doc)

        return compressed_docs
class LLMReranking(BaseReranking):
    """Filter documents by asking an LLM whether each one is relevant."""

    llm: BaseLLM
    prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE)
    # number of leading documents returned when the LLM rejects everything
    top_k: int = 3
    # issue the LLM calls from a thread pool instead of sequentially
    concurrent: bool = True

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Filter down documents based on their relevance to the query.

        Args:
            documents: candidate documents.
            query: the question the documents are judged against.

        Returns:
            The documents the LLM answered YES for, in input order. If none
            is accepted, the first ``top_k`` input documents are returned so
            the result is never empty for non-empty input.
        """
        output_parser = BooleanOutputParser()

        prompts = [
            self.prompt_template.populate(question=query, context=doc.get_content())
            for doc in documents
        ]

        def ask(prompt):
            return self.llm(prompt).text

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                # Pass the prompt as an argument. A closure over the loop
                # variable is evaluated when the worker runs, so queued tasks
                # would all see a later (typically the last) prompt.
                futures = [executor.submit(ask, prompt) for prompt in prompts]
                results = [future.result() for future in futures]
        else:
            results = [ask(prompt) for prompt in prompts]

        # use Boolean parser to extract relevancy output from LLM
        verdicts = [output_parser.parse(result) for result in results]
        filtered_docs = [
            doc for include_doc, doc in zip(verdicts, documents) if include_doc
        ]

        # prevent returning empty result
        if not filtered_docs:
            filtered_docs = documents[: self.top_k]

        return filtered_docs
self.prompt_template.populate( question=query, context=doc.get_content() ) futures.append(executor.submit(lambda: self.llm(_prompt))) results = [future.result() for future in futures] else: results = [] for doc in documents: _prompt = self.prompt_template.populate( question=query, context=doc.get_content() ) results.append(self.llm(_prompt)) for result, doc in zip(results, documents): score = np.exp(np.average(result.logprobs)) include_doc = output_parser.parse(result.text) if include_doc: doc.metadata["llm_reranking_score"] = score else: doc.metadata["llm_reranking_score"] = 1 - score filtered_docs.append(doc) # prevent returning empty result if len(filtered_docs) == 0: filtered_docs = documents[: self.top_k] return filtered_docs ================================================ FILE: libs/kotaemon/kotaemon/indices/rankings/llm_trulens.py ================================================ from __future__ import annotations import re from concurrent.futures import ThreadPoolExecutor from functools import partial import tiktoken from kotaemon.base import Document, HumanMessage, SystemMessage from kotaemon.indices.splitters import TokenSplitter from kotaemon.llms import BaseLLM, PromptTemplate from .llm import LLMReranking SYSTEM_PROMPT_TEMPLATE = PromptTemplate( """You are a RELEVANCE grader; providing the relevance of the given CONTEXT to the given QUESTION. Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant. A few additional scoring guidelines: - Long CONTEXTS should score equally well as short CONTEXTS. - RELEVANCE score should increase as the CONTEXTS provides more RELEVANT context to the QUESTION. - RELEVANCE score should increase as the CONTEXTS provides RELEVANT context to more parts of the QUESTION. - CONTEXT that is RELEVANT to some of the QUESTION should score of 2, 3 or 4. Higher score indicates more RELEVANCE. - CONTEXT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. 
def validate_rating(rating) -> int:
    """Return ``rating`` unchanged when it lies in the closed range 0..10.

    Raises:
        ValueError: if ``rating`` is outside that range.
    """
    if 0 <= rating <= 10:
        return rating
    raise ValueError("Rating must be between 0 and 10")
class LLMTrulensScoring(LLMReranking):
    """Score documents for relevance with an LLM grader (0-10 scale)."""

    llm: BaseLLM
    system_prompt_template: PromptTemplate = SYSTEM_PROMPT_TEMPLATE
    user_prompt_template: PromptTemplate = USER_PROMPT_TEMPLATE
    # issue the LLM calls from a thread pool instead of sequentially
    concurrent: bool = True
    # divisor applied to the extracted 0-10 rating
    normalize: float = 10
    # trims each document to MAX_CONTEXT_LEN tokens before it is scored
    trim_func: TokenSplitter = TokenSplitter.withx(
        chunk_size=MAX_CONTEXT_LEN,
        chunk_overlap=0,
        separator=" ",
        tokenizer=partial(
            tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
            allowed_special=set(),
            disallowed_special="all",
        ),
    )

    def run(
        self,
        documents: list[Document],
        query: str,
    ) -> list[Document]:
        """Score every document against the query and sort by score.

        Args:
            documents: candidate documents.
            query: the question the documents are graded against.

        Returns:
            All documents, ordered by descending score, each with
            ``metadata["llm_trulens_score"]`` set to the extracted rating
            divided by ``normalize``.

        Raises:
            AssertionError: propagated from ``re_0_10_rating`` when an LLM
                reply contains no integer between 0 and 10.
        """
        documents = sorted(documents, key=lambda doc: doc.get_content())

        def build_messages(doc):
            # Wrap only the content in a fresh Document: metadata is skipped
            # because it causes trouble in the splitter.
            trimmed_content = self.trim_func([Document(content=doc.get_content())])[
                0
            ].text
            return [
                SystemMessage(self.system_prompt_template.populate()),
                HumanMessage(
                    self.user_prompt_template.populate(
                        question=query, context=trimmed_content
                    )
                ),
            ]

        def ask(messages):
            return self.llm(messages).text

        # Both execution modes share the same trimmed system + human message
        # pair, so they grade identical prompts.
        prompts = [build_messages(doc) for doc in documents]

        if self.concurrent:
            with ThreadPoolExecutor() as executor:
                # Pass the messages as an argument. A closure over the loop
                # variable is evaluated when the worker runs, so queued tasks
                # would all see a later document's messages.
                futures = [executor.submit(ask, messages) for messages in prompts]
                results = [future.result() for future in futures]
        else:
            results = [ask(messages) for messages in prompts]

        # extract the 0-10 rating from each reply and normalise it
        scored = [
            (r_idx, float(re_0_10_rating(result)) / self.normalize)
            for r_idx, result in enumerate(results)
        ]
        scored.sort(key=lambda x: x[1], reverse=True)

        filtered_docs = []
        for r_idx, score in scored:
            doc = documents[r_idx]
            doc.metadata["llm_trulens_score"] = score
            filtered_docs.append(doc)

        print(
            "LLM rerank scores",
            [doc.metadata["llm_trulens_score"] for doc in filtered_docs],
        )

        return filtered_docs
class WebSearch(BaseComponent):
    """WebSearch component for fetching data from the web
    using Jina API
    """

    def run(
        self,
        text: str,
        *args,
        **kwargs,
    ) -> list[RetrievedDocument]:
        """Search the web for ``text`` and wrap each hit as a document.

        Args:
            text: the search query.

        Returns:
            One ``RetrievedDocument`` per item in the response's ``data``
            list, rendered as markdown (URL, title, description, content).

        Raises:
            ValueError: if ``JINA_API_KEY`` is not configured.
            requests.HTTPError: if the API answers with an error status.
        """
        from urllib.parse import quote

        if JINA_API_KEY == "":
            raise ValueError(
                "This feature requires JINA_API_KEY "
                "(get free one from https://jina.ai/reader)"
            )

        # The query is part of the URL path, so it must be percent-encoded:
        # a raw "?", "#" or "/" would otherwise be parsed as URL structure
        # and truncate or corrupt the search.
        api_url = f"https://s.jina.ai/{quote(text, safe='')}"
        headers = {
            "X-With-Generated-Alt": "true",
            "Accept": "application/json",
            # the key is guaranteed non-empty by the check above
            "Authorization": f"Bearer {JINA_API_KEY}",
        }

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()
        response_dict = response.json()

        return [
            RetrievedDocument(
                text=(
                    "###URL: [{url}]({url})\n\n"
                    "####{title}\n\n"
                    "{description}\n"
                    "{content}"
                ).format(
                    url=item["url"],
                    title=item["title"],
                    description=item["description"],
                    content=item["content"],
                ),
                metadata={
                    "file_name": "Web search",
                    "type": "table",
                    "llm_trulens_score": 1.0,
                },
            )
            for item in response_dict["data"]
        ]

    def generate_relevant_scores(self, text, documents: list[RetrievedDocument]):
        """Return ``documents`` unchanged; no scoring is performed here."""
        return documents
""" def run( self, text: str, *args, **kwargs, ) -> list[RetrievedDocument]: if TAVILY_API_KEY == "": raise ValueError( "This feature requires TAVILY_API_KEY " "(get free one from https://app.tavily.com/)" ) try: from tavily import TavilyClient except ImportError: raise ImportError( "Please install `pip install tavily-python` to use this feature" ) tavily_client = TavilyClient(api_key=TAVILY_API_KEY) results = tavily_client.search( query=text, search_depth="advanced", )["results"] context = "\n\n".join( "###URL: [{url}]({url})\n\n{content}".format( url=result["url"], content=result["content"], ) for result in results ) return [ RetrievedDocument( text=context, metadata={ "file_name": "Web search", "type": "table", "llm_trulens_score": 1.0, }, ) ] def generate_relevant_scores(self, text, documents: list[RetrievedDocument]): return documents ================================================ FILE: libs/kotaemon/kotaemon/indices/splitters/__init__.py ================================================ from ..base import DocTransformer, LlamaIndexDocTransformerMixin class BaseSplitter(DocTransformer): """Represent base splitter class""" ... 
class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Splitter that delegates to llama-index's `TokenTextSplitter`."""

    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 20,
        separator: str = " ",
        **params,
    ):
        # named options first, then any extra keyword arguments from the caller
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "separator": separator,
        }
        super().__init__(**splitter_options, **params)

    def _get_li_class(self):
        """Return the llama-index class wrapped by this splitter."""
        # imported lazily so llama-index is only needed when actually used
        from llama_index.core.text_splitter import TokenTextSplitter as li_class

        return li_class


class SentenceWindowSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Splitter that delegates to llama-index's `SentenceWindowNodeParser`."""

    def __init__(
        self,
        window_size: int = 3,
        window_metadata_key: str = "window",
        original_text_metadata_key: str = "original_text",
        **params,
    ):
        # named options first, then any extra keyword arguments from the caller
        parser_options = {
            "window_size": window_size,
            "window_metadata_key": window_metadata_key,
            "original_text_metadata_key": original_text_metadata_key,
        }
        super().__init__(**parser_options, **params)

    def _get_li_class(self):
        """Return the llama-index class wrapped by this splitter."""
        # imported lazily so llama-index is only needed when actually used
        from llama_index.core.node_parser import SentenceWindowNodeParser as li_class

        return li_class
This pipeline supports the following set of inputs: - List of documents - List of texts """ cache_dir: Optional[str] = getattr(flowsettings, "KH_CHUNKS_OUTPUT_DIR", None) vector_store: BaseVectorStore doc_store: Optional[BaseDocumentStore] = None embedding: BaseEmbeddings count_: int = 0 def to_retrieval_pipeline(self, *args, **kwargs): """Convert the indexing pipeline to a retrieval pipeline""" return VectorRetrieval( vector_store=self.vector_store, doc_store=self.doc_store, embedding=self.embedding, **kwargs, ) def write_chunk_to_file(self, docs: list[Document]): # save the chunks content into markdown format if self.cache_dir: file_name = docs[0].metadata.get("file_name") if not file_name: return file_name = Path(file_name) for i in range(len(docs)): markdown_content = "" if "page_label" in docs[i].metadata: page_label = str(docs[i].metadata["page_label"]) markdown_content += f"Page label: {page_label}" if "file_name" in docs[i].metadata: filename = docs[i].metadata["file_name"] markdown_content += f"\nFile name: {filename}" if "section" in docs[i].metadata: section = docs[i].metadata["section"] markdown_content += f"\nSection: {section}" if "type" in docs[i].metadata: if docs[i].metadata["type"] == "image": image_origin = docs[i].metadata["image_origin"] image_origin = f'

def run(
    self, text: str | Document, top_k: Optional[int] = None, **kwargs
) -> list[RetrievedDocument]:
    """Retrieve a list of documents from vector store

    Args:
        text: the text to retrieve similar documents
        top_k: number of top similar documents to return; defaults to
            `self.top_k` when None
        **kwargs: the keys `do_extend` (default False), `thumbnail_count`
            (default 3) and `scope` (default None) are consumed here; the
            remaining keys are forwarded to `vector_store.query`.

    Returns:
        list[RetrievedDocument]: list of retrieved documents

    Raises:
        ValueError: if `doc_store` is not set.
    """
    if top_k is None:
        top_k = self.top_k

    do_extend = kwargs.pop("do_extend", False)
    thumbnail_count = kwargs.pop("thumbnail_count", 3)

    # over-fetch in the first round when the caller asks for an extended search
    if do_extend:
        top_k_first_round = top_k * self.first_round_top_k_mult
    else:
        top_k_first_round = top_k

    if self.doc_store is None:
        raise ValueError(
            "doc_store is not provided. Please provide a doc_store to "
            "retrieve the documents"
        )

    result: list[RetrievedDocument] = []
    # TODO: should declare scope directly in the run params
    scope = kwargs.pop("scope", None)
    emb: list[float]

    if self.retrieval_mode == "vector":
        emb = self.embedding(text)[0].embedding
        _, scores, ids = self.vector_store.query(
            embedding=emb, top_k=top_k_first_round, doc_ids=scope, **kwargs
        )
        docs = self.doc_store.get(ids)
        result = [
            RetrievedDocument(**doc.to_dict(), score=score)
            for doc, score in zip(docs, scores)
        ]
    elif self.retrieval_mode == "text":
        query = text.text if isinstance(text, Document) else text
        docs = []
        # full-text search only runs when a scope is given
        if scope:
            docs = self.doc_store.query(
                query, top_k=top_k_first_round, doc_ids=scope
            )
        result = [RetrievedDocument(**doc.to_dict(), score=-1.0) for doc in docs]
    elif self.retrieval_mode == "hybrid":
        # similarity search section
        emb = self.embedding(text)[0].embedding
        vs_docs: list[RetrievedDocument] = []
        vs_ids: list[str] = []
        vs_scores: list[float] = []

        def query_vectorstore():
            nonlocal vs_docs
            nonlocal vs_scores
            nonlocal vs_ids

            assert self.doc_store is not None
            _, vs_scores, vs_ids = self.vector_store.query(
                embedding=emb, top_k=top_k_first_round, doc_ids=scope, **kwargs
            )
            if vs_ids:
                vs_docs = self.doc_store.get(vs_ids)

        # full-text search section
        ds_docs: list[RetrievedDocument] = []

        def query_docstore():
            nonlocal ds_docs

            assert self.doc_store is not None
            query = text.text if isinstance(text, Document) else text
            if scope:
                ds_docs = self.doc_store.query(
                    query, top_k=top_k_first_round, doc_ids=scope
                )

        # run both lookups concurrently and wait for both to finish
        vs_query_thread = threading.Thread(target=query_vectorstore)
        ds_query_thread = threading.Thread(target=query_docstore)

        vs_query_thread.start()
        ds_query_thread.start()

        vs_query_thread.join()
        ds_query_thread.join()

        # Drop full-text hits that the vector search already returned.
        # The ids must be compared with the document's id: testing the
        # document object itself against a list of id strings never matches,
        # which let duplicates through.
        vector_hit_ids = set(vs_ids)
        result = [
            RetrievedDocument(**doc.to_dict(), score=-1.0)
            for doc in ds_docs
            if doc.doc_id not in vector_hit_ids
        ]
        result += [
            RetrievedDocument(**doc.to_dict(), score=score)
            for doc, score in zip(vs_docs, vs_scores)
        ]
        print(f"Got {len(vs_docs)} from vectorstore")
        print(f"Got {len(ds_docs)} from docstore")

    # use additional reranker to re-order the document list
    if self.rerankers and text:
        for reranker in self.rerankers:
            # if reranker is LLMReranking, limit the document with top_k items only
            if isinstance(reranker, LLMReranking):
                result = self._filter_docs(result, top_k=top_k)
            result = reranker.run(documents=result, query=text)

    result = self._filter_docs(result, top_k=top_k)
    print(f"Got raw {len(result)} retrieved documents")

    # add page thumbnails to the result if exists
    thumbnail_doc_ids: set[str] = set()
    # we should copy the text from retrieved text chunk
    # to the thumbnail to get relevant LLM score correctly
    text_thumbnail_docs: dict[str, RetrievedDocument] = {}

    non_thumbnail_docs = []
    raw_thumbnail_docs = []
    for doc in result:
        if doc.metadata.get("type") == "thumbnail":
            # change type to image to display on UI
            doc.metadata["type"] = "image"
            raw_thumbnail_docs.append(doc)
            continue
        if (
            "thumbnail_doc_id" in doc.metadata
            and len(thumbnail_doc_ids) < thumbnail_count
        ):
            thumbnail_id = doc.metadata["thumbnail_doc_id"]
            thumbnail_doc_ids.add(thumbnail_id)
            text_thumbnail_docs[thumbnail_id] = doc
        else:
            non_thumbnail_docs.append(doc)

    linked_thumbnail_docs = self.doc_store.get(list(thumbnail_doc_ids))
    print(
        "thumbnail docs",
        len(linked_thumbnail_docs),
        "non-thumbnail docs",
        len(non_thumbnail_docs),
        "raw-thumbnail docs",
        len(raw_thumbnail_docs),
    )
    additional_docs = []

    for thumbnail_doc in linked_thumbnail_docs:
        # the thumbnail takes over id, content and score of the text chunk
        # that referenced it, and inherits any metadata key it lacks
        text_doc = text_thumbnail_docs[thumbnail_doc.doc_id]
        doc_dict = thumbnail_doc.to_dict()
        doc_dict["_id"] = text_doc.doc_id
        doc_dict["content"] = text_doc.content
        doc_dict["metadata"]["type"] = "image"
        for key in text_doc.metadata:
            if key not in doc_dict["metadata"]:
                doc_dict["metadata"][key] = text_doc.metadata[key]

        additional_docs.append(RetrievedDocument(**doc_dict, score=text_doc.score))

    result = additional_docs + non_thumbnail_docs

    if not result:
        # return output from raw retrieved thumbnails
        result = self._filter_docs(raw_thumbnail_docs, top_k=thumbnail_count)

    return result
chain-of-thoughts "ManualSequentialChainOfThought", "Thought", ] ================================================ FILE: libs/kotaemon/kotaemon/llms/base.py ================================================ from typing import AsyncGenerator, Iterator from langchain_core.language_models.base import BaseLanguageModel from kotaemon.base import BaseComponent, LLMInterface class BaseLLM(BaseComponent): def to_langchain_format(self) -> BaseLanguageModel: raise NotImplementedError def invoke(self, *args, **kwargs) -> LLMInterface: raise NotImplementedError async def ainvoke(self, *args, **kwargs) -> LLMInterface: raise NotImplementedError def stream(self, *args, **kwargs) -> Iterator[LLMInterface]: raise NotImplementedError def astream(self, *args, **kwargs) -> AsyncGenerator[LLMInterface, None]: raise NotImplementedError def run(self, *args, **kwargs): return self.invoke(*args, **kwargs) ================================================ FILE: libs/kotaemon/kotaemon/llms/branching.py ================================================ from typing import List, Optional from kotaemon.base import BaseComponent, Document, Param from .linear import GatedLinearPipeline class SimpleBranchingPipeline(BaseComponent): """ A simple branching pipeline for executing multiple branches. Attributes: branches (List[BaseComponent]): The list of branches to be executed. 
def run(self, **prompt_kwargs):
    """
    Run every branch in order and collect what each one returns.

    Args:
        **prompt_kwargs: Keyword arguments passed unchanged to every branch.

    Returns:
        List: One output per branch, in branch order.
    """

    def _execute(position, component):
        # register the branch as child "branch-<position>" before calling it
        self._prepare_child(component, name=f"branch-{position}")
        return component(**prompt_kwargs)

    return [_execute(idx, child) for idx, child in enumerate(self.branches)]
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):
    """
    Execute the branches in order and return the output of the first branch
    whose output is truthy. Branches after that one are not executed.

    Args:
        condition_text (str): The condition text handed to each branch.
            Default to None, which is rejected.
        **prompt_kwargs: Keyword arguments for the branches.

    Returns:
        The output of the first branch that yields a truthy output, or
        `Document(None)` (not `None`) if no branch does.

    Raises:
        ValueError: If condition_text is None
    """
    if condition_text is None:
        raise ValueError("`condition_text` must be provided.")

    for i, branch in enumerate(self.branches):
        self._prepare_child(branch, name=f"branch-{i}")
        output = branch(condition_text=condition_text, **prompt_kwargs)
        # the first truthy output wins
        if output:
            return output

    # no branch matched: an empty Document is returned rather than None
    return Document(None)
class EndpointChatLLM(ChatLLM):
    """
    A ChatLLM that uses an endpoint to generate responses. This expects an OpenAI API
    compatible endpoint.

    Attributes:
        endpoint_url (str): The url of a OpenAI API compatible endpoint.
    """

    endpoint_url: str = Param(
        help="URL of the OpenAI API compatible endpoint", required=True
    )

    def run(
        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
    ) -> LLMInterface:
        """
        Generate response from messages

        Args:
            messages (str | BaseMessage | list[BaseMessage]): history of messages to
                generate response from
            **kwargs: accepted for interface compatibility; currently NOT sent
                to the endpoint.

        Returns:
            LLMInterface: generated response. `content` is the first non-empty
                choice, or "" when the endpoint returns no usable choice.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        if isinstance(messages, str):
            input_ = [HumanMessage(content=messages)]
        elif isinstance(messages, BaseMessage):
            input_ = [messages]
        else:
            input_ = messages

        def decide_role(message: BaseMessage):
            # anything that is neither a system nor an AI message is sent as "user"
            if isinstance(message, SystemMessage):
                return "system"
            elif isinstance(message, AIMessage):
                return "assistant"
            else:
                return "user"

        request_json = {
            "messages": [{"content": m.text, "role": decide_role(m)} for m in input_]
        }

        http_response = requests.post(self.endpoint_url, json=request_json)
        # fail with the HTTP error itself instead of a KeyError on the payload
        http_response.raise_for_status()
        response = http_response.json()

        content = ""
        candidates = []
        if response["choices"]:
            candidates = [
                each["message"]["content"]
                for each in response["choices"]
                if each["message"]["content"]
            ]
            # every choice may be empty, so guard before indexing
            if candidates:
                content = candidates[0]

        return LLMInterface(
            content=content,
            candidates=candidates,
            completion_tokens=response["usage"]["completion_tokens"],
            total_tokens=response["usage"]["total_tokens"],
            prompt_tokens=response["usage"]["prompt_tokens"],
        )

    def invoke(
        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
    ) -> LLMInterface:
        """Same as run"""
        return self.run(messages, **kwargs)

    async def ainvoke(
        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
    ) -> LLMInterface:
        """Async entry point; it delegates to the blocking `invoke`."""
        return self.invoke(messages, **kwargs)
def stream(
    self, messages: str | BaseMessage | list[BaseMessage], **kwargs
) -> Iterator[LLMInterface]:
    """Stream the response chunk by chunk.

    Args:
        messages: history of messages to generate response from
        **kwargs: additional arguments to pass to the langchain chat model

    Yields:
        LLMInterface: one item per streamed chunk, carrying the chunk content
    """
    # Normalise the input the same way `invoke`/`ainvoke` do, so a single
    # message object is wrapped in a list before it reaches langchain.
    input_ = self.prepare_message(messages)
    for response in self._obj.stream(input=input_, **kwargs):
        yield LLMInterface(content=response.content)
LLMInterface(content=response.content) def to_langchain_format(self): return self._obj def __repr__(self): kwargs = [] for key, value_obj in self._kwargs.items(): value = repr(value_obj) kwargs.append(f"{key}={value}") kwargs_repr = ", ".join(kwargs) return f"{self.__class__.__name__}({kwargs_repr})" def __str__(self): kwargs = [] for key, value_obj in self._kwargs.items(): value = str(value_obj) if len(value) > 20: value = f"{value[:15]}..." kwargs.append(f"{key}={value}") kwargs_repr = ", ".join(kwargs) return f"{self.__class__.__name__}({kwargs_repr})" def __setattr__(self, name, value): if name == "_lc_class": return super().__setattr__(name, value) if name in self._lc_class.__fields__: self._kwargs[name] = value self._obj = self._lc_class(**self._kwargs) else: super().__setattr__(name, value) def __getattr__(self, name): if name in self._kwargs: return self._kwargs[name] return getattr(self._obj, name) def dump(self, *args, **kwargs): from theflow.utils.modules import serialize params = {key: serialize(value) for key, value in self._kwargs.items()} return { "__type__": f"{self.__module__}.{self.__class__.__qualname__}", **params, } def specs(self, path: str): path = path.strip(".") if "." 
in path: raise ValueError("path should not contain '.'") if path in self._lc_class.__fields__: return { "__type__": "theflow.base.ParamAttr", "refresh_on_set": True, "strict_type": True, } raise ValueError(f"Invalid param {path}") class LCChatOpenAI(LCChatMixin, ChatLLM): # type: ignore def __init__( self, openai_api_base: str | None = None, openai_api_key: str | None = None, model: str | None = None, temperature: float = 0.7, request_timeout: float | None = None, **params, ): super().__init__( openai_api_base=openai_api_base, openai_api_key=openai_api_key, model=model, temperature=temperature, request_timeout=request_timeout, **params, ) def _get_lc_class(self): try: from langchain_openai import ChatOpenAI except ImportError: from langchain.chat_models import ChatOpenAI return ChatOpenAI class LCAzureChatOpenAI(LCChatMixin, ChatLLM): # type: ignore def __init__( self, azure_endpoint: str | None = None, openai_api_key: str | None = None, openai_api_version: str = "", deployment_name: str | None = None, temperature: float = 0.7, request_timeout: float | None = None, **params, ): super().__init__( azure_endpoint=azure_endpoint, openai_api_key=openai_api_key, openai_api_version=openai_api_version, deployment_name=deployment_name, temperature=temperature, request_timeout=request_timeout, **params, ) def _get_lc_class(self): try: from langchain_openai import AzureChatOpenAI except ImportError: from langchain.chat_models import AzureChatOpenAI return AzureChatOpenAI class LCAnthropicChat(LCChatMixin, ChatLLM): # type: ignore api_key: str = Param( help="API key (https://console.anthropic.com/settings/keys)", required=True ) model_name: str = Param( help=( "Model name to use " "(https://docs.anthropic.com/en/docs/about-claude/models)" ), required=True, ) def _get_tool_call_kwargs(self): return {"tool_choice": {"type": "any"}} def __init__( self, api_key: str | None = None, model_name: str | None = None, temperature: float = 0.7, **params, ): super().__init__( 
api_key=api_key, model_name=model_name, temperature=temperature, **params, ) def _get_lc_class(self): try: from langchain_anthropic import ChatAnthropic except ImportError: raise ImportError("Please install langchain-anthropic") return ChatAnthropic class LCGeminiChat(LCChatMixin, ChatLLM): # type: ignore api_key: str = Param( help="API key (https://aistudio.google.com/app/apikey)", required=True ) model_name: str = Param( help=( "Model name to use (https://cloud.google" ".com/vertex-ai/generative-ai/docs/learn/models)" ), required=True, ) def _get_tool_call_kwargs(self): return { "tool_config": { "function_calling_config": { "mode": "ANY", } } } def __init__( self, api_key: str | None = None, model_name: str | None = None, temperature: float = 0.7, **params, ): super().__init__( google_api_key=api_key, model=model_name, temperature=temperature, **params, ) def _get_lc_class(self): try: from langchain_google_genai import ChatGoogleGenerativeAI except ImportError: raise ImportError("Please install langchain-google-genai") return ChatGoogleGenerativeAI class LCCohereChat(LCChatMixin, ChatLLM): # type: ignore api_key: str = Param( help="API key (https://dashboard.cohere.com/api-keys)", required=True ) model_name: str = Param( help=("Model name to use (https://dashboard.cohere.com/playground/chat)"), required=True, ) def __init__( self, api_key: str | None = None, model_name: str | None = None, temperature: float = 0.7, **params, ): super().__init__( cohere_api_key=api_key, model_name=model_name, temperature=temperature, **params, ) def _get_lc_class(self): try: from langchain_cohere import ChatCohere except ImportError: raise ImportError("Please install langchain-cohere") return ChatCohere class LCOllamaChat(LCChatMixin, ChatLLM): # type: ignore base_url: str = Param( help="Base Ollama URL. 
class LlamaCppChat(ChatLLM):
    """Wrapper around the llama-cpp-python's Llama model"""

    model_path: Optional[str] = Param(
        help="Path to the model file. This is required to load the model.",
    )
    repo_id: Optional[str] = Param(
        help="Id of a repo on the HuggingFace Hub in the form of `user_name/repo_name`."
    )
    filename: Optional[str] = Param(
        help="A filename or glob pattern to match the model file in the repo."
    )
    # NOTE(review): the help text says a blank value is auto-inferred, but
    # `client_object` below rejects a blank `chat_format` -- confirm which is intended.
    chat_format: str = Param(
        help=(
            "Chat format to use. Please refer to llama_cpp.llama_chat_format for a "
            "list of supported formats. If blank, the chat format will be auto-"
            "inferred."
        ),
        required=True,
    )
    lora_base: Optional[str] = Param(None, help="Path to the base Lora model")
    n_ctx: Optional[int] = Param(512, help="Text context, 0 = from model")
    n_gpu_layers: Optional[int] = Param(
        0,
        help="Number of layers to offload to GPU. If -1, all layers are offloaded",
    )
    use_mmap: Optional[bool] = Param(
        True,
        help="Use mmap if possible",
    )
    vocab_only: Optional[bool] = Param(
        False,
        help="If True, only the vocabulary is loaded. This is useful for debugging.",
    )

    # maps a message's `type` to the role name sent to llama-cpp
    _role_mapper: dict[str, str] = {
        "human": "user",
        "system": "system",
        "ai": "assistant",
    }

    @Param.auto()
    def client_object(self) -> "Llama":
        """Get the llama-cpp-python client object

        The model is loaded from `model_path` when it is set, otherwise from
        the HuggingFace Hub using `repo_id` and `filename`.

        Raises:
            ImportError: if llama-cpp-python is not installed.
            ValueError: if no model location is configured or `chat_format`
                is blank.
        """
        try:
            from llama_cpp import Llama
        except ImportError as err:
            raise ImportError(
                "llama-cpp-python is not installed. "
                "Please install it using `pip install llama-cpp-python`"
            ) from err

        # collect every configuration problem so they are reported together
        errors = []
        if not self.model_path and (not self.repo_id or not self.filename):
            errors.append(
                "- `model_path` or `repo_id` and `filename` are required to load the"
                " model"
            )

        if not self.chat_format:
            errors.append(
                "- `chat_format` is required to know how to format the chat messages. "
                "Please refer to llama_cpp.llama_chat_format for a list of supported "
                "formats."
            )
        if errors:
            raise ValueError("\n".join(errors))

        # options shared by both loading paths
        shared_options = dict(
            chat_format=self.chat_format,
            lora_base=self.lora_base,
            n_ctx=self.n_ctx,
            n_gpu_layers=self.n_gpu_layers,
            use_mmap=self.use_mmap,
            vocab_only=self.vocab_only,
        )

        if self.model_path:
            return Llama(model_path=cast(str, self.model_path), **shared_options)

        return Llama.from_pretrained(
            repo_id=self.repo_id,
            filename=self.filename,
            **shared_options,
        )

    def prepare_message(
        self, messages: str | BaseMessage | list[BaseMessage]
    ) -> list[dict]:
        """Convert the input into a list of `{"role", "content"}` dicts.

        A plain string is treated as a single human message.
        """
        input_: list[BaseMessage] = []

        if isinstance(messages, str):
            input_ = [HumanMessage(content=messages)]
        elif isinstance(messages, BaseMessage):
            input_ = [messages]
        else:
            input_ = messages

        output_ = [
            {"role": self._role_mapper[each.type], "content": each.content}
            for each in input_
        ]

        return output_

    def invoke(
        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
    ) -> LLMInterface:
        """Run one non-streaming chat completion.

        `**kwargs` is accepted but not forwarded to llama-cpp.
        """
        pred: "CCCR" = self.client_object.create_chat_completion(
            messages=self.prepare_message(messages),
            stream=False,
        )

        return LLMInterface(
            content=pred["choices"][0]["message"]["content"] if pred["choices"] else "",
            candidates=[
                c["message"]["content"]
                for c in pred["choices"]
                if c["message"]["content"]
            ],
            completion_tokens=pred["usage"]["completion_tokens"],
            total_tokens=pred["usage"]["total_tokens"],
            prompt_tokens=pred["usage"]["prompt_tokens"],
        )

    def stream(
        self, messages: str | BaseMessage | list[BaseMessage], **kwargs
    ) -> Iterator[LLMInterface]:
        """Stream a chat completion, yielding one LLMInterface per text delta.

        `**kwargs` is accepted but not forwarded to llama-cpp.
        """
        pred = self.client_object.create_chat_completion(
            messages=self.prepare_message(messages),
            stream=True,
        )
        for chunk in pred:
            # skip chunks that carry no choice or no text delta
            if not chunk["choices"]:
                continue

            if "content" not in chunk["choices"][0]["delta"]:
                continue

            yield LLMInterface(content=chunk["choices"][0]["delta"]["content"])
To subclass this class: - Implement the `prepare_client` method to return the OpenAI client - Implement the `openai_response` method to return the OpenAI response - Implement the params relate to the OpenAI client """ _dependencies = ["openai"] _capabilities = ["chat", "text"] # consider as mixin api_key: str = Param(help="API key", required=True) timeout: Optional[float] = Param(None, help="Timeout for the API request") max_retries: Optional[int] = Param( None, help="Maximum number of retries for the API request" ) temperature: Optional[float] = Param( None, help=( "Number between 0 and 2 that controls the randomness of the generated " "tokens. Lower values make the model more deterministic, while higher " "values make the model more random." ), ) max_tokens: Optional[int] = Param( None, help=( "Maximum number of tokens to generate. The total length of input tokens " "and generated tokens is limited by the model's context length." ), ) n: int = Param( 1, help=( "Number of completions to generate. The API will generate n completion " "for each prompt." ), ) stop: Optional[str | list[str]] = Param( None, help=( "Stop sequence. If a stop sequence is detected, generation will stop " "at that point. If not specified, generation will continue until the " "maximum token length is reached." ), ) frequency_penalty: Optional[float] = Param( None, help=( "Number between -2.0 and 2.0. Positive values penalize new tokens " "based on their existing frequency in the text so far, decrearsing the " "model's likelihood of repeating the same text." ), ) presence_penalty: Optional[float] = Param( None, help=( "Number between -2.0 and 2.0. Positive values penalize new tokens " "based on their existing presence in the text so far, decrearsing the " "model's likelihood of repeating the same text." ), ) tool_choice: Optional[str] = Param( None, help=( "Choice of tool to use for the completion. Available choices are: " "auto, default." 
def prepare_output(self, resp: dict) -> LLMInterface:
    """Build an LLMInterface from an OpenAI chat completion given as a dict

    The first choice provides `content`, `tool_calls` and `logprobs`; every
    choice contributes one entry to `candidates` and `messages`.
    """
    choices = resp["choices"]
    first_choice = choices[0]
    first_message = first_choice["message"]

    extras = {}
    if "tool_calls" in first_message:
        extras["tool_calls"] = first_message["tool_calls"]

    # logprobs stay empty unless the first choice carries a non-empty list
    token_logprobs = []
    if first_choice.get("logprobs") is not None:
        entries = first_choice["logprobs"].get("content")
        if entries:
            token_logprobs = [entry["logprob"] for entry in entries]

    return LLMInterface(
        candidates=[(choice["message"]["content"] or "") for choice in choices],
        content=first_message["content"] or "",
        total_tokens=resp["usage"]["total_tokens"],
        prompt_tokens=resp["usage"]["prompt_tokens"],
        completion_tokens=resp["usage"]["completion_tokens"],
        additional_kwargs=extras,
        messages=[
            AIMessage(content=(choice["message"]["content"]) or "")
            for choice in choices
        ],
        logprobs=token_logprobs,
    )
async def astream(
    self, messages: str | BaseMessage | list[BaseMessage], *args, **kwargs
) -> AsyncGenerator[LLMInterface, None]:
    """Asynchronously stream the completion, yielding one item per content delta

    Args:
        messages: a prompt string, a single message or a list of messages.
        **kwargs: extra parameters forwarded to `aopenai_response`.

    Yields:
        LLMInterface holding only the text of each chunk; chunks without
        choices or without content are skipped.
    """
    client = self.prepare_client(async_version=True)
    input_messages = self.prepare_message(messages)
    # With the async client the request is a coroutine: it has to be awaited
    # (through `aopenai_response`) to obtain the stream. Calling the sync
    # `openai_response` here handed a coroutine to `async for`, which fails.
    resp = await self.aopenai_response(
        client, messages=input_messages, stream=True, **kwargs
    )

    async for chunk in resp:
        if not chunk.choices:
            continue
        if chunk.choices[0].delta.content is not None:
            yield LLMInterface(content=chunk.choices[0].delta.content)
def prepare_params(self, **kwargs):
    """Assemble the request parameters for a structured-output completion

    Parameters configured on the component are used unless they are None;
    call-time `**kwargs` override them. `tools_pydantic` and `stream` are
    never sent.

    Args:
        **kwargs: call-time overrides / extra request parameters.

    Returns:
        dict: the keyword arguments for the completion request.
    """
    kwargs.pop("tools_pydantic", None)

    params_ = {
        "model": self.model,
        "temperature": self.temperature,
        "max_tokens": self.max_tokens,
        "n": self.n,
        "stop": self.stop,
        "frequency_penalty": self.frequency_penalty,
        "presence_penalty": self.presence_penalty,
        "tool_choice": self.tool_choice,
        "tools": self.tools,
        "logprobs": self.logprobs,
        "logit_bias": self.logit_bias,
        "top_logprobs": self.top_logprobs,
        "top_p": self.top_p,
        "response_format": self.response_schema,
    }
    params = {k: v for k, v in params_.items() if v is not None}
    params.update(kwargs)

    # doesn't do streaming; tolerate callers that never passed `stream`
    params.pop("stream", None)

    return params
def prepare_params(self, **kwargs):
    """Assemble the request parameters for an Azure chat completion

    The deployment name is sent as `model`. Configured values that are None
    are left out, and call-time `**kwargs` take precedence over them.
    `tools_pydantic` is dropped.
    """
    kwargs.pop("tools_pydantic", None)

    option_names = (
        "temperature",
        "max_tokens",
        "n",
        "stop",
        "frequency_penalty",
        "presence_penalty",
        "tool_choice",
        "tools",
        "logprobs",
        "logit_bias",
        "top_logprobs",
        "top_p",
    )
    configured = [("model", self.azure_deployment)]
    configured.extend((name, getattr(self, name)) for name in option_names)

    request = {}
    for name, value in configured:
        if value is not None:
            request[name] = value
    request.update(kwargs)

    return request
def run(self, text: str) -> LLMInterface:
    """Generate completions for `text` with the wrapped Langchain object

    Token counts default to 0 and are filled from `llm_output["token_usage"]`
    when available; a failure to read them is logged, not raised.
    """
    pred = self._obj.generate([text])
    generated = [each.text for each in pred.generations[0]]

    # same read order as the fields are declared; values read before a
    # failure are kept
    usage = {"completion_tokens": 0, "total_tokens": 0, "prompt_tokens": 0}
    try:
        if pred.llm_output is not None:
            for field in tuple(usage):
                usage[field] = pred.llm_output["token_usage"][field]
    except Exception:
        logger.warning(
            f"Cannot get token usage from LLM output for {self._lc_class.__name__}"
        )

    first_text = generated[0] if generated else ""
    return LLMInterface(
        text=first_text,
        candidates=generated,
        completion_tokens=usage["completion_tokens"],
        total_tokens=usage["total_tokens"],
        prompt_tokens=usage["prompt_tokens"],
        logits=[],
    )
value = repr(value_obj) kwargs.append(f"{key}={value}") kwargs_repr = ", ".join(kwargs) return f"{self.__class__.__name__}({kwargs_repr})" def __str__(self): kwargs = [] for key, value_obj in self._kwargs.items(): value = str(value_obj) if len(value) > 20: value = f"{value[:15]}..." kwargs.append(f"{key}={value}") kwargs_repr = ", ".join(kwargs) return f"{self.__class__.__name__}({kwargs_repr})" def __setattr__(self, name, value): if name == "_lc_class": return super().__setattr__(name, value) if name in self._lc_class.__fields__: self._kwargs[name] = value self._obj = self._lc_class(**self._kwargs) else: super().__setattr__(name, value) def __getattr__(self, name): if name in self._kwargs: return self._kwargs[name] return getattr(self._obj, name) def dump(self, *args, **kwargs): from theflow.utils.modules import serialize params = {key: serialize(value) for key, value in self._kwargs.items()} return { "__type__": f"{self.__module__}.{self.__class__.__qualname__}", **params, } def specs(self, path: str): path = path.strip(".") if "." 
class OpenAI(LCCompletionMixin, LLM):
    """Wrapper around Langchain's OpenAI class, focusing on key parameters"""

    def __init__(
        self,
        openai_api_key: Optional[str] = None,
        openai_api_base: Optional[str] = None,
        model_name: str = "text-davinci-003",
        temperature: float = 0.7,
        max_tokens: int = 256,
        top_p: float = 1,
        frequency_penalty: float = 0,
        n: int = 1,
        best_of: int = 1,
        request_timeout: Optional[float] = None,
        max_retries: int = 2,
        streaming: bool = False,
        **params,
    ):
        # the named parameters come first, followed by any extra ones
        explicit = dict(
            openai_api_key=openai_api_key,
            openai_api_base=openai_api_base,
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            n=n,
            best_of=best_of,
            request_timeout=request_timeout,
            max_retries=max_retries,
            streaming=streaming,
        )
        super().__init__(**explicit, **params)

    def _get_lc_class(self):
        """Return Langchain's OpenAI class, preferring `langchain_openai`"""
        try:
            from langchain_openai import OpenAI as lc_class
        except ImportError:
            from langchain.llms import OpenAI as lc_class

        return lc_class
class LlamaCpp(LCCompletionMixin, LLM):
    """Wrapper around Langchain's LlamaCpp class, focusing on key parameters"""

    def __init__(
        self,
        model_path: str,
        lora_base: Optional[str] = None,
        n_ctx: int = 512,
        n_gpu_layers: Optional[int] = None,
        use_mmap: bool = True,
        **params,
    ):
        # the named parameters come first, followed by any extra ones
        explicit = dict(
            model_path=model_path,
            lora_base=lora_base,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            use_mmap=use_mmap,
        )
        super().__init__(**explicit, **params)

    def _get_lc_class(self):
        """Return Langchain's LlamaCpp class, preferring `langchain_community`"""
        try:
            from langchain_community.llms import LlamaCpp as lc_class
        except ImportError:
            from langchain.llms import LlamaCpp as lc_class

        return lc_class
class ManualSequentialChainOfThought(BaseComponent):
    """Perform sequential chain-of-thought with manual pre-defined prompts

    This method supports a variable number of steps. Each step corresponds to a
    `kotaemon.pipelines.cot.Thought`. Please refer to that section for the
    details of a Thought. This section is about chaining thoughts together.

    _**Usage:**_

    **Create and run a chain of thought without "+" operator:**

    ```pycon
    >>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought
    >>> llm = LCAzureChatOpenAI(...)
    >>> thought1 = Thought(
    >>>    prompt="Word {word} in {language} is ",
    >>>    post_process=lambda string: {"translated": string},
    >>> )
    >>> thought2 = Thought(
    >>>     prompt="Translate {translated} to Japanese",
    >>>     post_process=lambda string: {"output": string},
    >>> )
    >>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)
    >>> thought(word="hello", language="French")
    {'word': 'hello',
    'language': 'French',
    'translated': '"Bonjour"',
    'output': 'こんにちは (Konnichiwa)'}
    ```

    **Create and run a chain of thought with "+" operator:** Please refer to the
    `kotaemon.pipelines.cot.Thought` section for examples.

    This chain-of-thought optionally takes a termination check callback function.
    This function will be called after each thought is executed. It takes in a
    dictionary of all thought outputs so far, and it returns True or False. If
    True, the chain-of-thought will terminate. If unset, the default callback always
    returns False.
    """

    thoughts: List[Thought] = Param(
        default_callback=lambda *_: [], help="List of Thought"
    )
    llm: LLM = Param(help="The LLM model to use (base of kotaemon.llms.BaseLLM)")
    terminate: Callable = Param(
        default=lambda _: False,
        help="Callback on terminate condition. Default to always return False",
    )

    def run(self, **kwargs) -> Document:
        """Run the manual chain of thought

        Args:
            **kwargs: initial values for the prompt placeholders.

        Returns:
            Document wrapping the dictionary of the initial values merged
            with the output of every executed thought.
        """

        # work on a copy so the caller's values are not modified
        inputs = deepcopy(kwargs)
        for idx, thought in enumerate(self.thoughts):
            # when set, the chain's llm replaces the thought's own llm
            if self.llm:
                thought.llm = self.llm
            self._prepare_child(thought, f"thought{idx}")

            output = thought(**inputs)
            # each output becomes available to the following thoughts
            inputs.update(output.content)
            if self.terminate(inputs):
                break

        return Document(inputs)

    def __add__(self, next_thought: Thought) -> "ManualSequentialChainOfThought":
        """Return a new chain with `next_thought` appended, keeping the same llm"""
        return ManualSequentialChainOfThought(
            thoughts=self.thoughts + [next_thought], llm=self.llm
        )
def run(
    self,
    *,
    llm_kwargs: Optional[dict] = None,
    post_processor_kwargs: Optional[dict] = None,
    **prompt_kwargs,
):
    """
    Run the function with the given arguments and return the final output as a
    Document object.

    Args:
        llm_kwargs (dict): Keyword arguments for the llm call. None (the
            default) means no extra arguments.
        post_processor_kwargs (dict): Keyword arguments for the post_processor.
            None (the default) means no extra arguments.
        **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:
        Document: The final output of the function as a Document object.
    """
    # None defaults avoid sharing one mutable dict across calls, and make an
    # explicit None (allowed by the Optional annotation) behave like "empty"
    llm_kwargs = {} if llm_kwargs is None else llm_kwargs
    post_processor_kwargs = (
        {} if post_processor_kwargs is None else post_processor_kwargs
    )

    prompt = self.prompt(**prompt_kwargs)
    llm_output = self.llm(prompt.text, **llm_kwargs)
    if self.post_processor is not None:
        # only the first element of the post-processor result is kept
        final_output = self.post_processor(llm_output, **post_processor_kwargs)[0]
    else:
        final_output = llm_output

    return Document(final_output)
def run(
    self,
    *,
    condition_text: Optional[str] = None,
    llm_kwargs: Optional[dict] = None,
    post_processor_kwargs: Optional[dict] = None,
    **prompt_kwargs,
) -> Document:
    """
    Run the pipeline with the given arguments and return the final output as a
    Document object.

    Args:
        condition_text (str): The condition text to evaluate. Default to None.
        llm_kwargs (dict): Additional keyword arguments for the language model
            call. None (the default) means no extra arguments.
        post_processor_kwargs (dict): Additional keyword arguments for the
            post-processor. None (the default) means no extra arguments.
        **prompt_kwargs: Keyword arguments for populating the prompt.

    Returns:
        Document: The final output of the pipeline as a Document object, or
            `Document(None)` when the condition is not met.

    Raises:
        ValueError: If condition_text is None
    """
    if condition_text is None:
        raise ValueError("`condition_text` must be provided")

    # the first element of the condition result decides whether to run
    if self.condition(condition_text)[0]:
        # normalize here so the parent always receives dicts, and no mutable
        # default is shared across calls
        return super().run(
            llm_kwargs={} if llm_kwargs is None else llm_kwargs,
            post_processor_kwargs=(
                {} if post_processor_kwargs is None else post_processor_kwargs
            ),
            **prompt_kwargs,
        )

    return Document(None)
def __validate_value_type(self, **kwargs):
    """
    Ensure every given value has a type the prompt can handle.

    Keys starting with "template" are not checked.

    Parameters:
        **kwargs (dict): The keyword arguments to validate.

    Raises:
        ValueError: If a value is not a str, int, Document or Callable.

    Returns:
        None
    """
    supported = (str, int, Document, Callable)
    offenders = [
        (name, type(value))
        for name, value in kwargs.items()
        if not name.startswith("template")
        and not isinstance(value, supported)  # type: ignore
    ]

    if not offenders:
        return

    raise ValueError(
        "Type of values must be either int, str, Document, Callable, "
        f"found unsupported type for (key, type): {offenders}"
    )
class PromptTemplate:
    """
    A ``str.format``-style template that knows its own placeholders.

    The template is parsed once with ``string.Formatter``. Every replacement
    field whose name is a valid Python identifier becomes a placeholder that
    can later be filled through keyword arguments.

    Args:
        template (str): The template text, using ``{name}`` fields.
        ignore_invalid (bool): How to treat a field whose name is not a valid
            identifier (e.g. ``{0day}`` or ``{}``). When True (default) a
            ``UserWarning`` is issued and the field is left out of
            ``placeholders``, so it is copied to the output untouched. When
            False a ``ValueError`` is raised.

    Attributes:
        template (str): The original template text.
        placeholders (set[str]): Names of the fillable fields.
    """

    def __init__(self, template: str, ignore_invalid=True):
        formatter = Formatter()
        parsed_template = list(formatter.parse(template))

        placeholders = set()
        for _, key, _, _ in parsed_template:
            if key is None:
                # pure literal text, no replacement field in this chunk
                continue
            if not key.isidentifier():
                if not ignore_invalid:
                    raise ValueError(
                        "Placeholder name must be a valid Python identifier, found:"
                        f" {key}."
                    )
                warnings.warn(f"Ignore invalid placeholder: {key}.", UserWarning)
                # Really ignore it: an invalid name can never be supplied as a
                # keyword argument, so registering it would make `populate`
                # fail with "Missing keys" for every such template.
                continue
            placeholders.add(key)

        self.template = template
        self.placeholders = placeholders
        self.__formatter = formatter
        self.__parsed_template = parsed_template

    @staticmethod
    def _render_unfilled(field_name, format_spec, conversion) -> str:
        """Rebuild the source text of a replacement field that was not filled."""
        field = field_name
        if conversion:
            field += f"!{conversion}"
        if format_spec:
            field += f":{format_spec}"
        return "{" + field + "}"

    def check_missing_kwargs(self, **kwargs):
        """
        Check that every placeholder of the template has a value.

        Parameters:
            **kwargs: The values that are about to be used for populating.

        Raises:
            ValueError: If one or more placeholders have no matching keyword;
                the message lists the missing names in sorted order.

        Returns:
            None
        """
        missing_keys = self.placeholders.difference(kwargs.keys())
        if missing_keys:
            raise ValueError(
                f"Missing keys in template: {','.join(sorted(missing_keys))}"
            )

    def check_redundant_kwargs(self, **kwargs):
        """
        Warn about keyword arguments that match no placeholder.

        Unlike `check_missing_kwargs` this never raises: extra keys are
        harmless, so only a ``UserWarning`` listing them (sorted) is issued.

        Parameters:
            **kwargs: The values that are about to be used for populating.

        Returns:
            None
        """
        redundant_keys = set(kwargs.keys()) - self.placeholders
        if redundant_keys:
            warnings.warn(
                "Keys provided but not in template: "
                f"{','.join(sorted(redundant_keys))}",
                UserWarning,
            )

    def populate(self, safe=True, **kwargs) -> str:
        """
        Populate the template with the given keyword arguments.

        Args:
            safe (bool): When True (default), require a value for every
                placeholder before populating.
            **kwargs: The keyword arguments to populate the template.
                Each keyword corresponds to a placeholder in the template.

        Returns:
            The populated template.

        Raises:
            ValueError: If `safe` is True and a placeholder has no value.
        """
        if safe:
            self.check_missing_kwargs(**kwargs)

        return self.partial_populate(**kwargs)

    def partial_populate(self, **kwargs):
        """
        Partially populate the template with the given keyword arguments.

        Fields without a matching keyword are written back in their original
        ``{name!conversion:format_spec}`` form. Keywords that match no
        placeholder trigger a ``UserWarning``.

        Args:
            **kwargs: The keyword arguments to populate the template.
                Each keyword corresponds to a placeholder in the template.

        Returns:
            str: The populated template.
        """
        self.check_redundant_kwargs(**kwargs)

        prompt = []
        for literal_text, field_name, format_spec, conversion in self.__parsed_template:
            prompt.append(literal_text)

            if field_name is None:
                continue

            if field_name not in kwargs:
                prompt.append(
                    self._render_unfilled(field_name, format_spec, conversion)
                )
                continue

            value = kwargs[field_name]
            if conversion is not None:
                value = self.__formatter.convert_field(value, conversion)
            if format_spec is not None:
                # also turns the value into `str`, even for an empty spec
                value = self.__formatter.format_field(value, format_spec)
            prompt.append(value)

        return "".join(prompt)

    def __add__(self, other):
        """
        Concatenate two templates, separated by a newline.

        Parameters:
            other (PromptTemplate): Another PromptTemplate object.

        Returns:
            PromptTemplate: A new PromptTemplate object with the concatenated
                templates.
        """
        return PromptTemplate(self.template + "\n" + other.template)
def load_data(
    self, file: Path, extra_info: Optional[Dict] = None, **kwargs
) -> List[Document]:
    """Load data by calling to the Adobe's API

    The PDF is sent to the Adobe service, the resulting
    ``structuredData.json`` is walked element by element, and each element
    is routed to one of three buckets: plain text (grouped per page), tables
    and figures. Figures are then captioned through the VLM endpoint.

    Args:
        file (Path): Path to the PDF file
        extra_info (Optional[Dict]): Extra metadata merged into the metadata
            of every returned document.

    Returns:
        List[Document]: list of documents extracted from the PDF file,
            includes 3 types: text, table, and image. Empty when the
            service produced no ``structuredData.json``.
    """
    from .utils.adobe import (
        generate_figure_captions,
        load_json,
        parse_figure_paths,
        parse_table_paths,
        request_adobe_service,
    )

    # Accept plain string paths as well: `.name` below needs a Path object.
    file = Path(file)
    filename = file.name
    filepath = str(file.resolve())
    extra_metadata = extra_info or {}

    output_path = request_adobe_service(file_path=str(file), output_path="")
    results_path = os.path.join(output_path, "structuredData.json")

    if not os.path.exists(results_path):
        # No exception is being handled here, so `logger.exception` would
        # only append a meaningless "NoneType: None" traceback.
        logger.error("Fail to parse the document.")
        return []

    data = load_json(results_path)

    texts = defaultdict(list)
    tables = []
    figures = []

    elements = data["elements"]
    for item_id, item in enumerate(elements):
        page_number = item.get("Page", -1) + 1
        item_path = item["Path"]
        item_text = item.get("Text", "")

        file_paths = [
            Path(output_path) / path for path in item.get("filePaths", [])
        ]

        # The text of the preceding element serves as the title. The first
        # element has no predecessor; indexing with -1 would silently pick
        # the LAST element of the document instead.
        title = elements[item_id - 1].get("Text", "") if item_id > 0 else ""

        if re.search(self.table_regex, item_path):
            table_content = parse_table_paths(file_paths)
            if not table_content:
                continue
            table_caption = (
                table_content.replace("|", "").replace("---", "")
                + f"\n(Table in Page {page_number}. {title})"
            )
            tables.append((page_number, table_content, table_caption))

        elif re.search(self.figure_regex, item_path):
            figure_caption = (
                item_text + f"\n(Figure in Page {page_number}. {title})"
            )
            figure_content = parse_figure_paths(file_paths)
            if not figure_content:
                continue
            # a list (not a tuple) because the caption is extended below
            figures.append([page_number, figure_content, figure_caption])

        elif item_text and "Table" not in item_path and "Figure" not in item_path:
            texts[page_number].append(item_text)

    # get figure caption using GPT-4V
    figure_captions = generate_figure_captions(
        self.vlm_endpoint,
        [item[1] for item in figures],
        self.max_figures_to_caption,
    )
    for item, caption in zip(figures, figure_captions):
        # update figure caption
        item[2] += " " + caption

    # Wrap elements with Document
    documents = []

    # join plain text elements
    for page_number, txts in texts.items():
        documents.append(
            Document(
                text="\n".join(txts),
                metadata={
                    "page_label": page_number,
                    "file_name": filename,
                    "file_path": filepath,
                    **extra_metadata,
                },
            )
        )

    # table elements
    # NOTE(review): `table_caption` is built above but the table text is used
    # as the document text here - confirm whether that is intended.
    for page_number, table_content, table_caption in tables:
        documents.append(
            Document(
                text=table_content,
                metadata={
                    "table_origin": table_content,
                    "type": "table",
                    "page_label": page_number,
                    "file_name": filename,
                    "file_path": filepath,
                    **extra_metadata,
                },
                metadata_template="",
                metadata_seperator="",
            )
        )

    # figure elements
    for page_number, figure_content, figure_caption in figures:
        documents.append(
            Document(
                text=figure_caption,
                metadata={
                    "image_origin": figure_content,
                    "type": "image",
                    "page_label": page_number,
                    "file_name": filename,
                    "file_path": filepath,
                    **extra_metadata,
                },
                metadata_template="",
                metadata_seperator="",
            )
        )
    return documents
def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:
    """Crop the image based on the bounding box

    PDF pages are rasterised at 150 dpi with PyMuPDF; for TIFF files the
    requested frame is selected; any other file is opened as a single image.
    The source document / file handle is closed before returning.

    Args:
        file_path (Path): path to the image file
        bbox (list[float]): bounding box of the image (in percentage [x0, y0, x1, y1])
        page_number (int, optional): page number of the image. Defaults to 0.

    Returns:
        Image.Image: cropped image

    Raises:
        ImportError: if the file is a PDF and PyMuPDF is not installed.
    """
    left, upper, right, lower = bbox

    # tolerate boxes whose corners are given in the opposite order
    left, right = min(left, right), max(left, right)
    upper, lower = min(upper, lower), max(upper, lower)

    def _crop(img: Image.Image) -> Image.Image:
        """Convert the fractional box to pixels and cut it out of `img`."""
        return img.crop(
            (
                int(left * img.width),
                int(upper * img.height),
                int(right * img.width),
                int(lower * img.height),
            )
        )

    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        try:
            import fitz
        except ImportError as exc:
            raise ImportError(
                "Please install PyMuPDF: 'pip install PyMuPDF'"
            ) from exc

        # `frombytes` copies the pixel data, so the PDF can be closed as soon
        # as the page has been rendered.
        with fitz.open(file_path) as doc:
            pm = doc.load_page(page_number).get_pixmap(dpi=150)
            page_img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
        return _crop(page_img)

    # `crop` loads the pixel data and returns an independent image, so the
    # file handle can be released when the `with` block exits.
    with Image.open(file_path) as img:
        if suffix in (".tif", ".tiff"):
            img.seek(page_number)
        return _crop(img)
""" _dependencies = ["azure-ai-documentintelligence", "PyMuPDF", "Pillow"] endpoint: str = Param( os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT", None), help="Endpoint of Azure AI Document Intelligence", ) credential: str = Param( os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL", None), help="Credential of Azure AI Document Intelligence", ) model: str = Param( "prebuilt-layout", help=( "Model to use for document analysis. Default is prebuilt-layout. " "As of April 24, you can view the supported models [here]" "(https://learn.microsoft.com/en-us/azure/ai-services/" "document-intelligence/concept-model-overview?view=doc-intel-4.0.0" "#model-analysis-features)" ), ) output_content_format: str = Param( "markdown", help="Output content format. Can be 'markdown' or 'text'.Default is markdown", ) vlm_endpoint: str = Param( help=( "Default VLM endpoint for figure captioning. If not provided, will not " "caption the figures" ) ) figure_friendly_filetypes: list[str] = Param( [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"], help=( "File types that we can reliably open and extract figures. " "For files like .docx or .html, the visual layout may be different " "when viewed from different tools, hence we cannot use Azure DI " "location to extract figures." ), ) cache_dir: str = Param( None, help="Directory to cache the downloaded files. 
def load_data(
    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> list[Document]:
    """Extract the input file, allowing multi-modal extraction

    The file is analysed by Azure AI Document Intelligence. Figures (only
    when a VLM endpoint is configured and the file type is figure-friendly)
    and tables become their own documents, and their spans are cut out of
    the main text so the content is not indexed twice.

    Args:
        file_path (Path): path to the file to analyse.
        extra_info (Optional[dict]): metadata attached to every returned
            document; its keys override the generated ones.

    Returns:
        list[Document]: the remaining text document, followed by the figure
            documents and then the table documents.
    """
    metadata = extra_info or {}
    # `run` already passes a Path, but direct callers may hand in a str and
    # `.suffix` / `.stem` are needed below.
    file_path = Path(file_path)

    with open(file_path, "rb") as fi:
        poller = self.client_.begin_analyze_document(
            self.model,
            body=fi,
            content_type="application/octet-stream",
            output_content_format=self.output_content_format,
        )
        result = poller.result()

    # the total text content of the document in `output_content_format` format
    text_content = result.content
    removed_spans: list[dict] = []

    # extract the figures
    figures = []
    for figure_desc in result.get("figures", []):
        if not self.vlm_endpoint:
            continue
        if file_path.suffix.lower() not in self.figure_friendly_filetypes:
            continue

        # read & crop the image
        region = figure_desc["boundingRegions"][0]
        page_number = region["pageNumber"]
        page = result.pages[page_number - 1]
        page_width = page["width"]
        page_height = page["height"]
        # the polygon is a flat [x0, y0, x1, y1, ...] list
        polygon = region["polygon"]
        xs = polygon[0::2]
        ys = polygon[1::2]
        bbox = [
            min(xs) / page_width,
            min(ys) / page_height,
            max(xs) / page_width,
            max(ys) / page_height,
        ]
        img = crop_image(file_path, bbox, page_number - 1)

        # convert the image into base64
        img_bytes = BytesIO()
        img.save(img_bytes, format="PNG")
        img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
        img_base64 = f"data:image/png;base64,{img_base64}"

        # caption the image
        caption = generate_single_figure_caption(
            figure=img_base64, vlm_endpoint=self.vlm_endpoint
        )

        # store the image into document
        figure_metadata = {
            "image_origin": img_base64,
            "type": "image",
            "page_label": page_number,
        }
        figure_metadata.update(metadata)

        figures.append(
            Document(
                text=caption,
                metadata=figure_metadata,
            )
        )
        removed_spans += figure_desc["spans"]

    # extract the tables
    tables = []
    for table_desc in result.get("tables", []):
        if not table_desc["spans"]:
            continue

        # convert the tables into markdown format
        boundingRegions = table_desc["boundingRegions"]
        if boundingRegions:
            page_number = boundingRegions[0]["pageNumber"]
        else:
            page_number = 1

        # store the tables into document
        offset = table_desc["spans"][0]["offset"]
        length = table_desc["spans"][0]["length"]
        table_text = text_content[offset : offset + length]
        table_metadata = {
            "type": "table",
            "page_label": page_number,
            "table_origin": table_text,
        }
        table_metadata.update(metadata)

        tables.append(
            Document(
                text=table_text,
                metadata=table_metadata,
            )
        )
        removed_spans += table_desc["spans"]

    # save the text content into markdown format
    if self.cache_dir is not None:
        with open(
            Path(self.cache_dir) / f"{file_path.stem}.md", "w", encoding="utf-8"
        ) as f:
            f.write(text_content)

    # Cut the figure/table spans out of the text in a single left-to-right
    # pass. Deleting them one by one corrupts the result when spans nest or
    # overlap (e.g. a figure inside a table): after the inner span is gone
    # the outer span's length is stale and eats the text that follows it.
    kept_pieces = []
    cursor = 0
    for span in sorted(removed_spans, key=lambda x: x["offset"]):
        start = span["offset"]
        if start > cursor:
            kept_pieces.append(text_content[cursor:start])
        cursor = max(cursor, start + span["length"])
    kept_pieces.append(text_content[cursor:])
    text_content = "".join(kept_pieces)

    return [Document(content=text_content, metadata=metadata)] + figures + tables
class AutoReader(BaseReader):
    """General auto reader for a variety of files. (based on llama-hub)"""

    def __init__(self, reader_type: Union[str, Type["LIBaseReader"]]) -> None:
        """Build the wrapped reader from a class or from its llama-hub name.

        A reader class is simply instantiated. For a string, the name is
        first looked up in ``llama_index.readers.file`` (this works in
        uv/pip-less venvs where the package is already installed); the
        deprecated ``download_loader`` is used only when that lookup or the
        instantiation fails with ``ImportError`` / ``AttributeError``.
        """
        import importlib

        if not isinstance(reader_type, str):
            self._reader = reader_type()
        else:
            # The direct import comes first because download_loader tries to
            # pip-install, which fails in venvs without pip (e.g. uv).
            try:
                file_readers = importlib.import_module("llama_index.readers.file")
                self._reader = getattr(file_readers, reader_type)()
            except (ImportError, AttributeError):
                from llama_index.core import download_loader

                self._reader = download_loader(reader_type)()

        super().__init__()

    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Read `file` with the wrapped reader and return kotaemon Documents."""
        loaded = self._reader.load_data(file=file, **kwargs)

        # round-trip through dicts to rebuild each item as kotaemon's Document
        return [Document.from_dict(item.to_dict()) for item in loaded]

    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Component entry point; same as `load_data`."""
        return self.load_data(file=file, **kwargs)
def __getattr__(self, name: str) -> Any:
    """Forward attribute lookups that fail on the wrapper to the wrapped reader.

    Python calls this hook only after normal attribute lookup has failed,
    so attributes stored on the wrapper itself never reach it.

    Args:
        name: the attribute being looked up.

    Returns:
        The attribute of the same name on the wrapped reader.

    Raises:
        AttributeError: if the wrapped reader has no such attribute, or if
            the wrapped reader itself has not been set yet.
    """
    if name == "_reader":
        # `_reader` is missing on the instance (object created without
        # running __init__, e.g. by copy/pickle, or __init__ failed before
        # assigning it). Evaluating `self._reader` below would re-enter this
        # method forever and end in a RecursionError.
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}"
        )
    return getattr(self._reader, name)
errors (str): how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open recursive (bool): Whether to recursively search in subdirectories. False by default. filename_as_id (bool): Whether to use the filename as the document id. False by default. required_exts (Optional[List[str]]): List of required extensions. Default is None. file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. If not specified, use default from DEFAULT_FILE_READER_CLS. num_files_limit (Optional[int]): Maximum number of files to read. Default is None. file_metadata (Optional[Callable[str, Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. """ input_dir: Optional[str] = None input_files: Optional[List] = None exclude: Optional[List] = None exclude_hidden: bool = True errors: str = "ignore" recursive: bool = False encoding: str = "utf-8" filename_as_id: bool = False required_exts: Optional[list[str]] = None file_extractor: Optional[dict[str, "LIBaseReader"]] = None num_files_limit: Optional[int] = None file_metadata: Optional[Callable[[str], dict]] = None def _get_wrapped_class(self) -> Type["LIBaseReader"]: from llama_index.core import SimpleDirectoryReader return SimpleDirectoryReader ================================================ FILE: libs/kotaemon/kotaemon/loaders/docling_loader.py ================================================ import base64 from collections import defaultdict from io import BytesIO from pathlib import Path from typing import List, Optional from kotaemon.base import Document, Param from .azureai_document_intelligence_loader import crop_image from .base import BaseReader from .utils.adobe import generate_single_figure_caption, make_markdown_table class DoclingReader(BaseReader): """Using Docling to extract document structure and content""" _dependencies = ["docling"] 
def load_data(
    self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
    """Extract the input file, allowing multi-modal extraction

    The file is converted with Docling and exported to a dict. Pictures
    (only when a VLM endpoint is configured and the file type is
    figure-friendly), tables and per-page text are each wrapped in
    Documents.

    Args:
        file_path (str | Path): path to the file to convert.
        extra_info (Optional[dict]): metadata attached to every returned
            document; its keys override the generated ones.

    Returns:
        List[Document]: text documents (one per page), then table
            documents, then figure documents.
    """
    metadata = extra_info or {}

    result = self.converter_.convert(file_path)
    result_dict = result.document.export_to_dict()

    file_path = Path(file_path)
    file_name = file_path.name
    suffix = file_path.suffix.lower()

    def first_page_no(item: dict) -> int:
        """Page of the item's first provenance entry; 1 when it has none."""
        prov = item.get("prov") or [{}]
        return prov[0].get("page_no", 1)

    def extractive_captions_of(item: dict) -> list:
        """Resolve the item's caption references to the referenced texts."""
        captions = []
        for caption in item["captions"]:
            # references end with the index into `texts`
            text_id = caption["$ref"].split("/")[-1]
            try:
                captions.append(result_dict["texts"][int(text_id)]["text"])
            except (ValueError, TypeError, IndexError) as e:
                print(e)
        return captions

    # extract the figures
    figures = []
    gen_caption_count = 0
    for figure_obj in result_dict.get("pictures", []):
        if not self.vlm_endpoint:
            continue
        if suffix not in self.figure_friendly_filetypes:
            continue

        # retrieve extractive captions provided by docling
        extractive_captions = extractive_captions_of(figure_obj)

        # read & crop image
        prov = figure_obj.get("prov") or []
        if not prov:
            # without provenance the figure cannot be located on a page
            continue
        page_number = prov[0]["page_no"]

        try:
            page_size = result_dict["pages"][str(page_number)]["size"]
            page_width = page_size["width"]
            page_height = page_size["height"]

            bbox_obj = prov[0]["bbox"]
            bbox: list[float] = [
                bbox_obj["l"],
                bbox_obj["t"],
                bbox_obj["r"],
                bbox_obj["b"],
            ]
            if bbox_obj["coord_origin"] == "BOTTOMLEFT":
                bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)
            else:
                # crop_image expects fractions of the page size, and only the
                # bottom-left conversion above normalises; do it here too.
                bbox = [
                    bbox[0] / page_width,
                    bbox[1] / page_height,
                    bbox[2] / page_width,
                    bbox[3] / page_height,
                ]

            img = crop_image(file_path, bbox, page_number - 1)
        except KeyError as e:
            print(e, list(result_dict["pages"].keys()))
            continue

        # convert img to base64
        img_bytes = BytesIO()
        img.save(img_bytes, format="PNG")
        img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
        img_base64 = f"data:image/png;base64,{img_base64}"

        # generate the generative caption
        if gen_caption_count >= self.max_figure_to_caption:
            gen_caption = ""
        else:
            gen_caption_count += 1
            gen_caption = generate_single_figure_caption(
                figure=img_base64, vlm_endpoint=self.vlm_endpoint
            )

        # join the extractive and generative captions
        caption = "\n".join(extractive_captions + [gen_caption])

        # store the image into document
        figure_metadata = {
            "image_origin": img_base64,
            "type": "image",
            "page_label": page_number,
            "file_name": file_name,
            "file_path": file_path,
        }
        figure_metadata.update(metadata)

        figures.append(
            Document(
                text=caption,
                metadata=figure_metadata,
            )
        )

    # extract the tables
    tables = []
    for table_obj in result_dict.get("tables", []):
        # convert the tables into markdown format, prefixed by the captions
        markdown_table = self._parse_table(table_obj)
        caption = "\n".join(extractive_captions_of(table_obj))
        markdown_table = f"{caption}\n{markdown_table}"

        table_metadata = {
            "type": "table",
            "page_label": first_page_no(table_obj),
            "table_origin": markdown_table,
            "file_name": file_name,
            "file_path": file_path,
        }
        table_metadata.update(metadata)

        tables.append(
            Document(
                text=markdown_table,
                metadata=table_metadata,
            )
        )

    # join plain text elements
    page_number_to_text = defaultdict(list)
    for text_obj in result_dict["texts"]:
        page_number_to_text[first_page_no(text_obj)].append(text_obj["text"])

    texts = [
        Document(
            text="\n".join(txts),
            metadata={
                "page_label": page_number,
                "file_name": file_name,
                "file_path": file_path,
                **metadata,
            },
        )
        for page_number, txts in page_number_to_text.items()
    ]

    return texts + tables + figures
def load_data(
    self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
    """Load data using Docx reader

    Every table becomes one Document whose text is the table rendered as
    CSV (first row used as header). All paragraphs are joined into a single
    Document that is appended after the tables.

    Args:
        file_path (Path): Path to .docx file
        extra_info (Optional[dict]): Extra metadata merged into the metadata
            of every returned document.

    Returns:
        List[Document]: list of documents extracted from the .docx file
    """
    import docx

    file_path = Path(file_path).resolve()
    doc = docx.Document(str(file_path))
    all_text = "\n".join(
        unicodedata.normalize("NFKC", p.text) for p in doc.paragraphs
    )
    extra_info = extra_info or {}

    documents = []
    for t in doc.tables:
        # list of columns, each a list of cell strings
        columns = self._load_single_table(t)
        if columns and not columns[0]:
            # the table has columns but no rows, hence no header row
            continue

        # Build the frame from rows with an explicit header list. Keying a
        # dict by header text drops every column whose header repeats, which
        # is exactly what merged header cells produce.
        header = [column[0] for column in columns]
        body_rows = list(zip(*(column[1:] for column in columns)))
        csv_text = pd.DataFrame(body_rows, columns=header).to_csv(index=False)

        documents.append(
            Document(
                text=csv_text.strip(),
                metadata={
                    "table_origin": csv_text,
                    "type": "table",
                    **extra_info,
                },
                metadata_template="",
                metadata_seperator="",
            )
        )

    # the non-table text is treated as a single page
    documents.append(
        Document(
            text=all_text.strip(),
            metadata={"page_label": 1, **extra_info},
        )
    )

    return documents
class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel reader that flattens all sheets into one Document.

    Sheets are read with ``pandas.read_excel``. If special parameters are
    required, use the `pandas_config` dict.

    Args:
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Set to empty dict by default,
            this means defaults will be used.
        row_joiner (str): Separator placed between rows. Falls back to
            ``"\n"`` when empty.
        col_joiner (str): Separator placed between the cells of a row. Falls
            back to ``" "`` when empty.
    """

    def __init__(
        self,
        *args: Any,
        pandas_config: Optional[dict] = None,
        row_joiner: str = "\n",
        col_joiner: str = " ",
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._pandas_config = pandas_config or {}
        self._row_joiner = row_joiner if row_joiner else "\n"
        self._col_joiner = col_joiner if col_joiner else " "

    def load_data(
        self,
        file: Path,
        include_sheetname: bool = False,
        sheet_name: Optional[Union[str, int, list]] = None,
        extra_info: Optional[dict] = None,
        **kwargs,
    ) -> List[Document]:
        """Parse the Excel file into a single text Document.

        Fully empty rows are dropped, remaining empty cells become empty
        strings, and all rows of all selected sheets are joined with the
        configured row/column separators.

        Args:
            file (Path): The path to the Excel file to read.
            include_sheetname (bool): Whether to include the sheet name in the output.
            sheet_name (Union[str, int, list, None]): The specific sheet(s) to
                read from, default is None which reads all sheets.
            extra_info (Optional[dict]): Metadata of the returned document.

        Returns:
            List[Document]: A one-element list holding the flattened content.

        Raises:
            ImportError: if pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            raise ImportError(
                "install pandas using `pip3 install pandas` to use this loader"
            ) from exc

        if sheet_name is not None:
            # a list makes read_excel return a dict even for a single sheet
            sheet_name = (
                [sheet_name] if not isinstance(sheet_name, list) else sheet_name
            )

        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)

        rows: list = []
        for key, frame in dfs.items():
            if include_sheetname:
                # keys are ints when sheets are selected by position, and
                # `str.join` below only accepts strings
                rows.append([str(key)])
            frame = frame.dropna(axis=0, how="all")
            # Cast to object before filling, as ExcelReader does: writing ""
            # into numeric columns relies on implicit upcasting, which newer
            # pandas versions deprecate.
            frame = frame.astype("object").fillna("")
            rows.extend(frame.values.astype(str).tolist())

        text = self._row_joiner.join(self._col_joiner.join(row) for row in rows)

        return [Document(text=text, metadata=extra_info or {})]
class HtmlReader(BaseReader):
    """Read HTML using html2text

    Reader behavior:
        - HTML is read with html2text.
        - All of the texts will be split by `page_break_pattern`
        - Each page is extracted as a Document
        - The output is a list of Documents

    Args:
        page_break_pattern (str): Pattern to split the HTML into pages
    """

    def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):
        try:
            import html2text  # noqa
        except ImportError:
            raise ImportError(
                "html2text is not installed. "
                "Please install it using `pip install html2text`"
            )

        self._page_break_pattern: Optional[str] = page_break_pattern
        super().__init__()

    def load_data(
        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Load data using Html reader

        Args:
            file_path: path to HTML file
            extra_info: extra information passed to this reader during
                extracting data; merged into every document's metadata

        Returns:
            list[Document]: one document per page, with a 1-based
                `page_label`; a single document when no page break pattern
                is configured
        """
        import html2text

        file_path = Path(file_path).resolve()

        with file_path.open("r") as f:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally truncated a final line that has no newline.
            html_text = "".join(line.rstrip("\n") for line in f)

        # read HTML
        all_text = html2text.html2text(html_text)
        pages = (
            all_text.split(self._page_break_pattern)
            if self._page_break_pattern
            else [all_text]
        )

        extra_info = extra_info or {}

        # create one Document per page
        documents = [
            Document(
                text=page.strip(),
                metadata={"page_label": page_id + 1, **extra_info},
            )
            for page_id, page in enumerate(pages)
        ]

        return documents
def load_data(
    self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
) -> list[Document]:
    """Load MHTML document into document objects.

    Every ``text/html`` part of the MIME message is converted to text with
    BeautifulSoup; blank lines are dropped and the rest joined by blank lines.

    Args:
        file_path: path to the MHTML file
        extra_info: extra metadata merged into the document metadata

    Returns:
        list[Document]: a single document whose text joins all extracted
            parts. Its metadata carries ``source`` and ``title`` of the last
            HTML part seen, or only ``extra_info`` when there was none.
    """
    from bs4 import BeautifulSoup

    extra_info = extra_info or {}
    metadata: dict = extra_info
    page = []
    file_name = Path(file_path)
    with open(file_path, "r", encoding=self.open_encoding) as f:
        message = email.message_from_string(f.read())
        parts = message.get_payload()

        # a non-multipart message is itself the only part
        if not isinstance(parts, list):
            parts = [message]

        for part in parts:
            if part.get_content_type() == "text/html":
                html = part.get_payload(decode=True).decode()

                soup = BeautifulSoup(html, **self.bs_kwargs)
                text = soup.get_text(self.get_text_separator)

                if soup.title:
                    title = str(soup.title.string)
                else:
                    title = ""

                metadata = {
                    "source": str(file_path),
                    "title": title,
                    **extra_info,
                }
                lines = [line for line in text.split("\n") if line.strip()]
                text = "\n\n".join(lines)
                if text:
                    page.append(text)

    # Save the first extracted part in markdown format. Skip when nothing was
    # extracted: indexing an empty list here used to raise IndexError.
    if self.cache_dir is not None and page:
        with open(Path(self.cache_dir) / f"{file_name.stem}.md", "w") as f:
            f.write(page[0])

    return [Document(text="\n\n".join(page), metadata=metadata)]
def __init__(
    self,
    processed_file_format: str = "md",
    max_wait_time_seconds: int = 900,
    should_clean_pdf: bool = True,
    **kwargs: Any,
) -> None:
    """Store the reader options and resolve the Mathpix credentials.

    Args:
        processed_file_format: format requested for the processed file.
            Default is "md".
        max_wait_time_seconds: maximum time to wait for the server to finish
            processing. Default is 900.
        should_clean_pdf: whether the retrieved content is passed through
            `clean_pdf`. Default is True.
        **kwargs: may carry `mathpix_api_key` and `mathpix_api_id`; when a
            key is absent the `MATHPIX_API_KEY` / `MATHPIX_API_ID`
            environment variable is looked up, with "empty" as the default.
    """
    credential_sources = (
        ("mathpix_api_key", "MATHPIX_API_KEY"),
        ("mathpix_api_id", "MATHPIX_API_ID"),
    )
    for attribute, env_name in credential_sources:
        resolved = get_from_dict_or_env(
            kwargs, attribute, env_name, default="empty"
        )
        setattr(self, attribute, resolved)

    self.processed_file_format = processed_file_format
    self.max_wait_time_seconds = max_wait_time_seconds
    self.should_clean_pdf = should_clean_pdf
    super().__init__()
def get_processed_pdf(self, pdf_id: str) -> str:
    """Wait for Mathpix to finish a PDF, then download its converted content.

    Args:
        pdf_id: a PDF id, as returned by `send_pdf`.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        ValueError: when the download does not answer with HTTP 200.
    """
    self.wait_for_processing(pdf_id)

    download_url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
    result = requests.get(download_url, headers=self._mathpix_headers)

    if result.status_code == 200:
        text = result.content.decode("utf-8")
        print(f"Retrieved content length: {len(text)}")  # Debug print
        return text

    raise ValueError(f"Failed to get processed PDF: {result.text}")
def parse_markdown_text_to_tables(
    self, content: str
) -> tuple[list[tuple[int, str]], list[tuple[int, str]]]:
    """Parse markdown text to get tables and texts separately.

    Pages are delimited by lines of the form ``# Page <n>``. The page number
    is taken from the marker itself; text before the first marker, or all of
    the content when there is no marker, is attributed to page 1.

    Args:
        content: markdown text.

    Returns:
        Tuple of (tables, texts) where each is a list of (page_num, content)
        tuples. Tables are runs of consecutive ``|...|`` lines; texts are the
        remaining blank-line separated chunks.
    """
    print("Starting markdown parsing...")
    print(f"Content length: {len(content)}")

    table_pattern = re.compile(r"(\|[^\n]+\|(?:\n\|[^\n]+\|)*)")

    # With a capturing group re.split returns
    # [preamble, "1", body of page 1, "2", body of page 2, ...].
    # Enumerating the pieces instead shifted every page number by one,
    # because the piece before the first marker counted as a page.
    pieces = re.split(r"(?m)^# Page (\d+)\n", content)
    pages = [(1, pieces[0])]
    pages.extend(
        (int(number), body) for number, body in zip(pieces[1::2], pieces[2::2])
    )

    tables: list[tuple[int, str]] = []
    texts: list[tuple[int, str]] = []

    for page_num, page_content in pages:
        if not page_content.strip():
            continue

        # Extract tables from the page, then remove them from its content
        table_matches = table_pattern.findall(page_content)
        if table_matches:
            tables.extend((page_num, table.strip()) for table in table_matches)
            page_content = table_pattern.sub("", page_content)

        # Split remaining content into meaningful chunks
        for chunk in re.split(r"\n\s*\n", page_content):
            if chunk.strip():
                texts.append((page_num, chunk.strip()))

    print(f"Found {len(tables)} tables and {len(texts)} text sections")
    return tables, texts
def lazy_load_data(
    self,
    file: Union[str, List[str], Path],
    extra_info: Optional[Dict] = None,
    **load_kwargs: Any,
) -> Generator[Document, None, None]:
    """Lazy load data from file path.

    Yields one document per table, then one per text section, and a single
    whole-content document when nothing was parsed but the content is not
    blank. When `response_content` is given in `load_kwargs` it is used as
    the content and no PDF is sent to Mathpix.
    """
    file_path = file if not isinstance(file, str) else Path(file)

    if "response_content" not in load_kwargs:
        pdf_id = self.send_pdf(file_path)
        print(f"PDF ID: {pdf_id}")
        content = self.get_processed_pdf(pdf_id)
    else:
        content = load_kwargs["response_content"]

    if self.should_clean_pdf:
        content = self.clean_pdf(content)

    tables, texts = self.parse_markdown_text_to_tables(content)

    def text_metadata(page: int) -> dict:
        # metadata shared by text sections and the fallback document
        meta = {
            "source": str(file_path),
            "type": "text",
            "page_label": page,
            "page_number": page,
        }
        if extra_info:
            meta.update(extra_info)
        return meta

    # Tables first
    for page_num, table_content in tables:
        table_meta = {
            "table_origin": table_content,
            "type": "table",
            "page_label": page_num,
            "page_number": page_num,
        }
        if extra_info:
            table_meta.update(extra_info)
        yield Document(
            text=strip_special_chars_markdown(table_content),
            metadata=table_meta,
            metadata_template="",
            metadata_seperator="",
        )

    # Then the text sections, skipping blank ones
    for page_num, text_content in texts:
        if text_content.strip():
            yield Document(text=text_content, metadata=text_metadata(page_num))

    # Fallback if no content was parsed
    nothing_parsed = not tables and not texts
    if nothing_parsed and content.strip():
        yield Document(text=content.strip(), metadata=text_metadata(1))

    print(f"Completed processing PDF: {file_path}")
class OCRReader(BaseReader):
    """PDF reader backed by an OCR service, focused on table extraction

    Example:
        ```python
        >> from kotaemon.loaders import OCRReader
        >> reader = OCRReader()
        >> documents = reader.load_data("path/to/pdf")
        ```

    Args:
        endpoint: URL to FullOCR endpoint. When not given, the environment
            variable `OCR_READER_ENDPOINT` is used, and failing that
            `kotaemon.loaders.ocr_loader.DEFAULT_OCR_ENDPOINT`
            (http://127.0.0.1:8000/v2/ai/infer/)
        use_ocr: whether to use OCR to read text (e.g: from images, tables) in
            the PDF. The service is called with `table_only` set to the
            negation of this flag.
    """

    def __init__(self, endpoint: Optional[str] = None, use_ocr=True):
        """Init the OCR reader with OCR endpoint (FullOCR pipeline)"""
        super().__init__()
        if not endpoint:
            endpoint = os.getenv("OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT)
        self.ocr_endpoint = endpoint
        self.use_ocr = use_ocr

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Load data using OCR reader

        Args:
            file_path (Path): Path to PDF file
            extra_info (Optional[dict]): metadata merged into every document
            response_content: optional keyword; when present it is used as
                the OCR result and the endpoint is not called
            debug_path (Path): Path to store debug image output
            artifact_path (Path): Path to OCR endpoints artifacts directory

        Returns:
            List[Document]: table documents followed by text documents
                extracted from the PDF file
        """
        pdf_path = Path(file_path).resolve()

        if "response_content" not in kwargs:
            # no override supplied: ask the FullOCR endpoint
            response = tenacious_api_post(
                url=self.ocr_endpoint,
                file_path=pdf_path,
                table_only=not self.use_ocr,
            )
            ocr_results = response.json()["result"]
        else:
            ocr_results = kwargs["response_content"]

        debug_path = kwargs.pop("debug_path", None)
        artifact_path = kwargs.pop("artifact_path", None)

        # plain PDF text, then merged with the OCR output
        pdf_page_items = read_pdf_unstructured(pdf_path)
        tables, texts = parse_ocr_output(
            ocr_results,
            pdf_page_items,
            debug_path=debug_path,
            artifact_path=artifact_path,
        )

        shared_meta = extra_info or {}
        documents = []

        # tables keep their raw markdown under "table_origin"
        for page_id, table_text in tables:
            table_meta = {
                "table_origin": table_text,
                "type": "table",
                "page_label": page_id + 1,
                **shared_meta,
            }
            documents.append(
                Document(
                    text=strip_special_chars_markdown(table_text),
                    metadata=table_meta,
                    metadata_template="",
                    metadata_seperator="",
                )
            )

        # everything that is not a table
        for page_id, non_table_text in texts:
            documents.append(
                Document(
                    text=non_table_text,
                    metadata={"page_label": page_id + 1, **shared_meta},
                )
            )

        return documents
""" def __init__(self, endpoint: Optional[str] = None): """Init the OCR reader with OCR endpoint (FullOCR pipeline)""" super().__init__() self.ocr_endpoint = endpoint or os.getenv( "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT ) def load_data( self, file_path: Path, extra_info: Optional[dict] = None, **kwargs ) -> List[Document]: """Load data using OCR reader Args: file_path (Path): Path to PDF file debug_path (Path): Path to store debug image output artifact_path (Path): Path to OCR endpoints artifacts directory Returns: List[Document]: list of documents extracted from the PDF file """ file_path = Path(file_path).resolve() # call the API from FullOCR endpoint if "response_content" in kwargs: # overriding response content if specified ocr_results = kwargs["response_content"] else: # call original API resp = tenacious_api_post( url=self.ocr_endpoint, file_path=file_path, table_only=False ) ocr_results = resp.json()["result"] extra_info = extra_info or {} result = [] for ocr_result in ocr_results: result.append( Document( content=ocr_result["csv_string"], metadata=extra_info, ) ) return result ================================================ FILE: libs/kotaemon/kotaemon/loaders/pdf_loader.py ================================================ import base64 from io import BytesIO from pathlib import Path from typing import Dict, List, Optional from decouple import config from fsspec import AbstractFileSystem from llama_index.readers.file import PDFReader from PIL import Image from kotaemon.base import Document PDF_LOADER_DPI = config("PDF_LOADER_DPI", default=40, cast=int) def get_page_thumbnails( file_path: Path, pages: list[int], dpi: int = PDF_LOADER_DPI ) -> List[Image.Image]: """Get image thumbnails of the pages in the PDF file. 
Args: file_path (Path): path to the image file page_number (list[int]): list of page numbers to extract Returns: list[Image.Image]: list of page thumbnails """ img: Image.Image suffix = file_path.suffix.lower() assert suffix == ".pdf", "This function only supports PDF files." try: import fitz except ImportError: raise ImportError("Please install PyMuPDF: 'pip install PyMuPDF'") doc = fitz.open(file_path) output_imgs = [] for page_number in pages: page = doc.load_page(page_number) pm = page.get_pixmap(dpi=dpi) img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples) output_imgs.append(convert_image_to_base64(img)) return output_imgs def convert_image_to_base64(img: Image.Image) -> str: # convert the image into base64 img_bytes = BytesIO() img.save(img_bytes, format="PNG") img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8") img_base64 = f"data:image/png;base64,{img_base64}" return img_base64 class PDFThumbnailReader(PDFReader): """PDF parser with thumbnail for each page.""" def __init__(self) -> None: """ Initialize PDFReader. 
""" super().__init__(return_full_document=False) def load_data( self, file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None, ) -> List[Document]: """Parse file.""" documents = super().load_data(file, extra_info, fs) page_numbers_str = [] filtered_docs = [] is_int_page_number: dict[str, bool] = {} for doc in documents: if "page_label" in doc.metadata: page_num_str = doc.metadata["page_label"] page_numbers_str.append(page_num_str) try: _ = int(page_num_str) is_int_page_number[page_num_str] = True filtered_docs.append(doc) except ValueError: is_int_page_number[page_num_str] = False continue documents = filtered_docs page_numbers = list(range(len(page_numbers_str))) print("Page numbers:", len(page_numbers)) page_thumbnails = get_page_thumbnails(file, page_numbers) documents.extend( [ Document( text="Page thumbnail", metadata={ "image_origin": page_thumbnail, "type": "thumbnail", "page_label": page_number, **(extra_info if extra_info is not None else {}), }, ) for (page_thumbnail, page_number) in zip( page_thumbnails, page_numbers_str ) if is_int_page_number[page_number] ] ) return documents ================================================ FILE: libs/kotaemon/kotaemon/loaders/txt_loader.py ================================================ from pathlib import Path from typing import Optional from kotaemon.base import Document from .base import BaseReader class TxtReader(BaseReader): def run( self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs ) -> list[Document]: return self.load_data(Path(file_path), extra_info=extra_info, **kwargs) def load_data( self, file_path: Path, extra_info: Optional[dict] = None, **kwargs ) -> list[Document]: with open(file_path, "r", encoding="utf-8") as f: text = f.read() metadata = extra_info or {} return [Document(text=text, metadata=metadata)] ================================================ FILE: libs/kotaemon/kotaemon/loaders/unstructured_loader.py 
================================================ """Unstructured file reader. A parser for unstructured text files using Unstructured.io. Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. To use .doc and .xls parser, install sudo apt-get install -y libmagic-dev poppler-utils libreoffice pip install xlrd """ from pathlib import Path from typing import Any, Dict, List, Optional from llama_index.core.readers.base import BaseReader from kotaemon.base import Document class UnstructuredReader(BaseReader): """General unstructured text reader for a variety of files.""" def __init__(self, *args: Any, **kwargs: Any) -> None: """Init params.""" super().__init__(*args) # not passing kwargs to parent bc it cannot accept it self.api = False # we default to local if "url" in kwargs: self.server_url = str(kwargs["url"]) self.api = True # is url was set, switch to api else: self.server_url = "http://localhost:8000" if "api" in kwargs: self.api = kwargs["api"] self.api_key = "" if "api_key" in kwargs: self.api_key = kwargs["api_key"] """ Loads data using Unstructured.io Depending on the construction if url is set or api = True it'll parse file using API call, else parse it locally additional_metadata is extended by the returned metadata if split_documents is True Returns list of documents """ def load_data( self, file: Path, extra_info: Optional[Dict] = None, split_documents: Optional[bool] = False, **kwargs, ) -> List[Document]: """If api is set, parse through api""" file_path_str = str(file) if self.api: from unstructured.partition.api import partition_via_api elements = partition_via_api( filename=file_path_str, api_key=self.api_key, api_url=self.server_url + "/general/v0/general", ) else: """Parse file locally""" from unstructured.partition.auto import partition elements = partition(filename=file_path_str) """ Process elements """ docs = [] file_name = Path(file).name file_path = str(Path(file).resolve()) if split_documents: for node in elements: metadata 
= {"file_name": file_name, "file_path": file_path} if hasattr(node, "metadata"): """Load metadata fields""" for field, val in vars(node.metadata).items(): if field == "_known_field_names": continue # removing coordinates because it does not serialize # and dont want to bother with it if field == "coordinates": continue # removing bc it might cause interference if field == "parent_id": continue metadata[field] = val if extra_info is not None: metadata.update(extra_info) metadata["file_name"] = file_name docs.append(Document(text=node.text, metadata=metadata)) else: text_chunks = [" ".join(str(el).split()) for el in elements] metadata = {"file_name": file_name, "file_path": file_path} if extra_info is not None: metadata.update(extra_info) # Create a single document by joining all the texts docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata)) return docs ================================================ FILE: libs/kotaemon/kotaemon/loaders/utils/__init__.py ================================================ ================================================ FILE: libs/kotaemon/kotaemon/loaders/utils/adobe.py ================================================ # need pip install pdfservices-sdk==2.3.0 import base64 import json import logging import os import tempfile import zipfile from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Union import pandas as pd from decouple import config from kotaemon.loaders.utils.gpt4v import generate_gpt4v def request_adobe_service(file_path: str, output_path: str = "") -> str: """Main function to call the adobe service, and unzip the results. 
Args: file_path (str): path to the pdf file output_path (str): path to store the results Returns: output_path (str): path to the results """ try: from adobe.pdfservices.operation.auth.credentials import Credentials from adobe.pdfservices.operation.exception.exceptions import ( SdkException, ServiceApiException, ServiceUsageException, ) from adobe.pdfservices.operation.execution_context import ExecutionContext from adobe.pdfservices.operation.io.file_ref import FileRef from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ( ExtractPDFOperation, ) from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ( # noqa: E501 ExtractElementType, ) from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ( # noqa: E501 ExtractPDFOptions, ) from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import ( # noqa: E501 ExtractRenditionsElementType, ) except ImportError: raise ImportError( "pdfservices-sdk is not installed. " "Please install it by running `pip install pdfservices-sdk" "@git+https://github.com/niallcm/pdfservices-python-sdk.git" "@bump-and-unfreeze-requirements`" ) if not output_path: output_path = tempfile.mkdtemp() try: # Initial setup, create credentials instance. credentials = ( Credentials.service_principal_credentials_builder() .with_client_id(config("PDF_SERVICES_CLIENT_ID", default="")) .with_client_secret(config("PDF_SERVICES_CLIENT_SECRET", default="")) .build() ) # Create an ExecutionContext using credentials # and create a new operation instance. execution_context = ExecutionContext.create(credentials) extract_pdf_operation = ExtractPDFOperation.create_new() # Set operation input from a source file. 
def make_markdown_table(table_as_list: List[List[str]]) -> str:
    """
    Convert table from python list representation to markdown format.
    The input list consists of rows of tables, the first row is the header.

    Args:
        table_as_list: list of table rows
            Example: [["Name", "Age", "Height"],
                    ["Jake", 20, 5'10],
                    ["Mary", 21, 5'7]]
    Returns:
        markdown representation of the table, wrapped in a leading newline
        and a trailing blank line; an empty string when there are no rows
    """
    if not table_as_list:
        # nothing to render; indexing the header row would raise IndexError
        return ""

    header, *rows = table_as_list

    # The exact spacing (double space after the first pipe, trailing space on
    # separator and body lines) is kept as callers already receive it.
    lines = [
        "| " + "".join(" " + str(cell) + " |" for cell in header),
        "| " + "--- | " * len(header),
    ]
    lines.extend(
        "| " + "".join(str(cell) + " | " for cell in row) for row in rows
    )

    return "\n" + "\n".join(lines) + "\n\n"
def generate_figure_captions(
    vlm_endpoint: str, figures: List, max_figures_to_process: int
) -> List:
    """Summarize several figures using GPT-4V.

    Args:
        vlm_endpoint (str): endpoint to the vision language model service
        figures (List): list of base64 images
        max_figures_to_process (int): the maximum number of figures will be
            summarized, the rest are ignored.

    Returns:
        results (List[str]): list of all figure captions and empty strings for
            ignored figures.
    """
    to_gen_figures = figures[:max_figures_to_process]
    other_figures = figures[max_figures_to_process:]

    with ThreadPoolExecutor() as executor:
        # Pass the figure as an argument instead of capturing the loop
        # variable in a lambda: a late-binding closure could run after the
        # loop has advanced and caption the wrong figure.
        futures = [
            executor.submit(generate_single_figure_caption, vlm_endpoint, figure)
            for figure in to_gen_figures
        ]

    # Futures are read in submission order so captions line up with figures.
    results = [future.result() for future in futures]
    return results + [""] * len(other_figures)
def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> float:
    """Intersection over union on layout rectangle

    Args:
        gt_box: List[tuple]
            A list contains bounding box coordinates of ground truth
        pd_box: List[tuple]
            A list contains bounding box coordinates of prediction
        iou_type: int
            0: intersection / union, normal IOU
            1: intersection / min(areas), useful when boxes are under/over-segmented

        Input format: [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        Annotation for each element in bbox:
        (x1, y1)    (x2, y1)
            +-------+
            |       |
            |       |
            +-------+
        (x1, y2)    (x2, y2)

    Returns:
        Intersection over union value. Returns 0.0 when the union area is
        not positive (e.g. both boxes are degenerate) instead of dividing
        by zero.
    """
    assert iou_type in [0, 1], "Only support 0: origin iou, 1: intersection / min(area)"

    # Only the top-left (index 0) and bottom-right (index 2) corners are used.
    x_left = max(gt_box[0][0], pd_box[0][0])
    y_top = max(gt_box[0][1], pd_box[0][1])
    x_right = min(gt_box[2][0], pd_box[2][0])
    y_bottom = min(gt_box[2][1], pd_box[2][1])

    # Clamp at zero so disjoint boxes give an empty intersection.
    inter_area = max(0, x_right - x_left) * max(0, y_bottom - y_top)

    gt_area = (gt_box[2][0] - gt_box[0][0]) * (gt_box[2][1] - gt_box[0][1])
    pd_area = (pd_box[2][0] - pd_box[0][0]) * (pd_box[2][1] - pd_box[0][1])

    if iou_type == 0:
        union_area = gt_area + pd_area - inter_area
        if union_area <= 0:
            return 0.0
        return inter_area / float(union_area)

    # iou_type == 1: the denominator is floored at 1 to avoid division by zero.
    return inter_area / max(min(gt_area, pd_area), 1)


def sort_funsd_reading_order(lines: List[dict], box_key_name: str = "box"):
    """Sort cell list to create the right reading order using their locations

    The input list is not modified; a new list is returned (an empty input
    is returned as is).

    Args:
        lines: list of cells to sort, each holding an ``[x1, y1, x2, y2]`` box
        box_key_name: key under which each cell stores its box

    Returns:
        the cells in reading order
    """
    if len(lines) == 0:
        return lines

    # Work on a copy so the caller's list is not emptied as a side effect.
    remaining = list(lines)
    sorted_list = []

    while len(remaining) > 1:
        # Selection step: scan for the cell that should be read next.
        topleft_line = remaining[0]
        for line in remaining[1:]:
            topleft_line_pos = topleft_line[box_key_name]
            topleft_line_center_y = (topleft_line_pos[1] + topleft_line_pos[3]) / 2
            x1, y1, x2, y2 = line[box_key_name]
            box_center_x = (x1 + x2) / 2
            box_center_y = (y1 + y2) / 2
            cell_h = y2 - y1

            # Candidate sits at least half its own height above the current pick.
            if box_center_y <= topleft_line_center_y - cell_h / 2:
                topleft_line = line
                continue
            # Candidate's centre is left of the pick's right edge and above
            # its bottom edge.
            if (
                box_center_x < topleft_line_pos[2]
                and box_center_y < topleft_line_pos[3]
            ):
                topleft_line = line
                continue

        sorted_list.append(topleft_line)
        remaining.remove(topleft_line)

    sorted_list.append(remaining[0])
    return sorted_list
def stream_gpt4v(
    endpoint: str,
    images: str | List[str],
    prompt: str,
    max_tokens: int = 512,
    max_images: int = 10,
) -> Any:
    """Stream a chat completion for a prompt plus images from the endpoint.

    This is a generator: it yields ``(text_delta, logprobs)`` tuples, one
    per streamed chunk that carries choices. Any error raised while
    requesting or reading the stream is logged and ends the stream; it is
    not propagated to the caller.

    Args:
        endpoint: URL the request is POSTed to
        images: one image URL / data URL, or a list of them
        prompt: text prompt sent along with the images
        max_tokens: value of the ``max_tokens`` request field
        max_images: only the first ``max_images`` images are sent

    Yields:
        Tuple of the content delta (str) and the list of token logprobs
        reported for that chunk (empty when the chunk has none).
    """
    # OpenAI API Key
    api_key = config("AZURE_OPENAI_API_KEY", default="")
    headers = {"Content-Type": "application/json", "api-key": api_key}

    if isinstance(images, str):
        images = [images]

    payload = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                ]
                + [
                    {
                        "type": "image_url",
                        "image_url": {"url": image},
                    }
                    for image in images[:max_images]
                ],
            }
        ],
        "max_tokens": max_tokens,
        "stream": True,
        "logprobs": True,
        "temperature": 0,
    }
    if len(images) > max_images:
        print(f"Truncated to {max_images} images (original {len(images)} images")

    try:
        response = requests.post(endpoint, headers=headers, json=payload, stream=True)
        # Explicit check instead of `assert`, which is stripped under -O.
        if response.status_code != 200:
            raise RuntimeError(str(response.content))

        output = ""
        logprobs = []
        for line in response.iter_lines():
            if not line:
                continue

            # Drop the "data: " prefix (6 bytes), plus a UTF-8 BOM (3 bytes)
            # when the line starts with one.
            if line.startswith(b"\xef\xbb\xbf"):
                line = line[9:]
            else:
                line = line[6:]

            # iter_lines() yields bytes, so the end-of-stream sentinel must
            # be compared as bytes.
            if line.strip() == b"[DONE]":
                break

            try:
                chunk = json.loads(line.decode("utf-8"))
            except Exception:
                # An undecodable line ends the stream (best effort).
                break

            if len(chunk["choices"]):
                choice = chunk["choices"][0]
                if choice.get("logprobs") is None:
                    _logprobs = []
                else:
                    _logprobs = [
                        logprob["logprob"]
                        for logprob in choice["logprobs"].get("content", [])
                    ]

                output += choice["delta"].get("content", "")
                logprobs += _logprobs
                yield choice["delta"].get("content", ""), _logprobs

    except Exception as e:
        logger.error(f"Error streaming gpt4v {e}")
        logprobs = []
        output = ""

    # Becomes the generator's StopIteration value; plain iteration ignores it.
    return output, logprobs
def merge_ocr_and_pdf_texts(
    ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None
):
    """Combine PDF text items with the OCR items that no PDF item covers.

    An OCR item counts as covered when its overlap with any PDF item
    (intersection over the smaller area) exceeds ``IOU_THRES``. Uncovered
    OCR items get ``"matched": False`` set on them and are appended after
    the PDF items.

    Args:
        ocr_list: List of OCR items {"text", "box", "location"}
        pdf_text_list: List of PDF items {"text", "box", "location"}
        debug_info: optional ``(cv2, image)`` pair; when given, the boxes are
            drawn on the image

    Returns:
        Combined list of PDF text and non-overlap OCR text
    """
    draw_debug = debug_info is not None
    if draw_debug:
        cv2, debug_im = debug_info

    unmatched_ocr = []
    for ocr_item in ocr_list:
        # any() stops at the first overlapping PDF item.
        overlaps_pdf = any(
            get_rect_iou(ocr_item["location"], pdf_item["location"], iou_type=1)
            > IOU_THRES
            for pdf_item in pdf_text_list
        )

        if overlaps_pdf:
            color = (255, 0, 0)
        else:
            ocr_item["matched"] = False
            unmatched_ocr.append(ocr_item)
            color = (0, 255, 255)

        if draw_debug:
            cv2.rectangle(
                debug_im,
                ocr_item["location"][0],
                ocr_item["location"][2],
                color=color,
                thickness=1,
            )

    if draw_debug:
        for pdf_item in pdf_text_list:
            cv2.rectangle(
                debug_im,
                pdf_item["location"][0],
                pdf_item["location"][2],
                color=(0, 255, 0),
                thickness=2,
            )

    return pdf_text_list + unmatched_ocr
def parse_ocr_output(
    ocr_page_items: List[dict],
    pdf_page_items: Dict[int, List[dict]],
    artifact_path: Optional[str] = None,
    debug_path: Optional[str] = None,
):
    """Main function to combine OCR output and PDF text to
    form list of table / non-table regions

    Args:
        ocr_page_items: List of OCR items by page
        pdf_page_items: Dict of PDF texts (page number as key)
        artifact_path: folder holding the page images named by each page's
            ``"image"`` entry; needed to produce debug images
        debug_path: If specified, use OpenCV to plot debug image and save to debug_path.
            Debug images are only written when ``artifact_path`` is given too.

    Returns:
        Tuple ``(all_tables, all_texts)``: a list of ``(page_id, markdown)``
        per table and a list of ``(page_id, text)`` per page.

    Raises:
        ImportError: debug mode is requested but OpenCV is not installed
    """
    all_tables = []
    all_texts = []

    for page_id, page in enumerate(ocr_page_items):
        ocr_list = page["json"]["ocr"]
        table_list = page["json"]["table"]
        page_shape = page["image_shape"]
        pdf_item_list = pdf_page_items[page_id]

        # create bbox additional information
        for item in ocr_list:
            item["box"] = points_to_bbox(item["location"])

        # re-scale pdf items according to new image size
        for item in pdf_item_list:
            scale_factor = page_shape[0] / item["page_shape"][0]
            item["box"] = scale_box(item["box"], scale_factor=scale_factor)
            item["location"] = scale_points(item["location"], scale_factor=scale_factor)

        # if using debug mode, openCV must be installed
        if debug_path and artifact_path is not None:
            try:
                import cv2
            except ImportError as err:
                raise ImportError(
                    "Please install openCV first to use OCRReader debug mode"
                ) from err
            image_path = Path(artifact_path) / page["image"]
            image = cv2.imread(str(image_path))
            debug_info = (cv2, image)
        else:
            debug_info = None

        new_pdf_list = merge_ocr_and_pdf_texts(
            ocr_list, pdf_item_list, debug_info=debug_info
        )

        # sort by reading order
        ocr_list = sort_funsd_reading_order(ocr_list)
        new_pdf_list = sort_funsd_reading_order(new_pdf_list)

        all_table_cells, non_table_text_list = merge_table_cell_and_ocr(
            table_list, ocr_list, new_pdf_list, debug_info=debug_info
        )

        table_texts = [table_cells_to_markdown(cells) for cells in all_table_cells]
        all_tables.extend([(page_id, text) for text in table_texts])
        all_texts.append(
            (page_id, " ".join(item["text"] for item in non_table_text_list))
        )

        # export debug image to debug_path
        # Only when a debug image was actually loaded: with debug_path set but
        # no artifact_path, cv2/image are never bound and writing would fail.
        if debug_path and debug_info is not None:
            cv2.imwrite(str(Path(debug_path) / "page_{}.png".format(page_id)), image)

    return all_tables, all_texts
def merge_cols(col_a: List[str], col_b: List[str]) -> List[str]:
    """Append the non-empty cells of column B onto column A, row by row.

    Column A is updated in place and also returned.

    Args:
        col_a: column A (list of str)
        col_b: column B (list of str)

    Returns:
        merged column (the same list object as ``col_a``)
    """
    for row_idx, left in enumerate(col_a):
        right = col_b[row_idx]
        if right:
            col_a[row_idx] = left + " " + right
    return col_a


def add_index_col(csv_rows: List[List[str]]) -> List[List[str]]:
    """Prepend a header row and a 1-based row-id column to the table.

    The new header row is ``"row id"`` followed by one empty cell per column
    of the first input row.

    Args:
        csv_rows: input table

    Returns:
        output table with index column
    """
    header = ["row id"] + [""] * len(csv_rows[0])
    numbered = [[str(number)] + row for number, row in enumerate(csv_rows, start=1)]
    return [header] + numbered


def compress_csv(csv_rows: List[List[str]]) -> List[List[str]]:
    """Shrink a sparse table by folding non-conflicting columns together.

    Walking left to right, each column is merged into the last kept column
    unless ``check_col_conflicts`` reports a conflict, in which case it is
    kept as a new column.

    Args:
        csv_rows: input table

    Returns:
        output: compressed table
    """
    # Transpose into columns; width is taken from the first row.
    columns = [[row[col_idx] for row in csv_rows] for col_idx in range(len(csv_rows[0]))]

    kept = [columns[0]]
    for column in columns[1:]:
        if check_col_conflicts(kept[-1], column):
            kept.append(column)
        else:
            kept[-1] = merge_cols(kept[-1], column)

    # Transpose back into rows.
    return [list(row) for row in zip(*kept)]
def make_markdown_table(array: List[List[str]]) -> str:
    """Render table rows as a markdown table string.

    The rows are first compressed with ``compress_csv`` and given an index
    column with ``add_index_col``; the first resulting row is the header.

    Args:
        array: Python list with rows of table as lists
            Example Input:
                [["Name", "Age", "Height"],
                ["Jake", 20, 5'10],
                ["Mary", 21, 5'7]]

    Returns:
        String to put into a .md file
    """
    rows = add_index_col(compress_csv(array))
    header, body = rows[0], rows[1:]

    # Collect the pieces and join once; spacing matches the existing format.
    pieces = [
        "\n| ",
        "".join(f" {cell} |" for cell in header),
        "\n| ",
        "--- | " * len(header),
        "\n",
    ]
    for row in body:
        pieces.append("| " + "".join(f"{cell} | " for cell in row) + "\n")
    pieces.append("\n")

    return "".join(pieces)
def extract_tables_from_csv_string(
    csv_content: str, table_texts: List[List[str]]
) -> Tuple[List[str], str]:
    """Extract list of table from FullOCR output
    (csv_content) with the specified table_texts

    A CSV row is assigned to a table when more than half of its non-empty
    cells contain one of the table's texts. Each row is assigned to at most
    one table; rows without any non-empty cell never match.

    Args:
        csv_content: CSV output from FullOCR pipeline
        table_texts: list of table texts extracted
        from get_table_from_ocr()

    Returns:
        List of tables (markdown) and the remaining non-table text
    """
    rows = parse_csv_string_to_list(csv_content)
    used_row_ids = set()
    table_csv_list = []

    for table in table_texts:
        cur_rows = []
        for row_id, row in enumerate(rows):
            if row_id in used_row_ids:
                continue
            scores = [
                any(cell in cell_reference for cell in table)
                for cell_reference in row
                if cell_reference
            ]
            # A row with only empty cells has nothing to score; skipping it
            # avoids dividing by zero.
            if not scores:
                continue
            if sum(scores) / len(scores) > 0.5:
                used_row_ids.add(row_id)
                cur_rows.append([format_cell(cell) for cell in row])

        if cur_rows:
            table_csv_list.append(make_markdown_table(cur_rows))
        else:
            print("table not matched", table)

    non_table_rows = [
        row for row_id, row in enumerate(rows) if row_id not in used_row_ids
    ]
    non_table_text = "\n".join(
        " ".join(format_cell(cell) for cell in row) for row in non_table_rows
    )
    return table_csv_list, non_table_text


def strip_special_chars_markdown(text: str) -> str:
    """Strip special characters from input text in markdown table format"""
    return text.replace("|", "").replace(":---:", "").replace("---", "")


def parse_markdown_text_to_tables(text: str) -> Tuple[List[str], List[str]]:
    """Convert markdown text to list of non-table spans and table spans

    A line belongs to a table when, after stripping, it starts with ``|``.
    Consecutive table lines form one table span and consecutive other lines
    form one text span; a span that runs to the end of the input is
    included as well.

    Args:
        text: input markdown text

    Returns:
        list of table spans and non-table spans
    """
    # init empty tables and texts list
    tables = []
    texts = []

    cur_table: List[str] = []
    cur_text: List[str] = []

    # split input by line break
    for line in text.split("\n"):
        line = line.strip()
        if line.startswith("|"):
            # a table line closes the text span in progress
            if cur_text:
                texts.append(cur_text)
                cur_text = []
            cur_table.append(line)
        else:
            # a text line closes the table span in progress
            if cur_table:
                tables.append(cur_table)
                cur_table = []
            cur_text.append(line)

    # Flush whichever span is still open at the end of the input; otherwise
    # the last table or text span would be dropped.
    if cur_table:
        tables.append(cur_table)
    if cur_text:
        texts.append(cur_text)

    table_texts = ["\n".join(table) for table in tables]
    non_table_texts = ["\n".join(span) for span in texts]
    return table_texts, non_table_texts
class RegexExtractor(BaseComponent):
    """
    Extract text from documents with one or more regex patterns.

    Args:
        pattern (List[str]): The regex pattern(s) to use. A single string is
            wrapped into a one-element list.
        output_map (dict, optional): A mapping (or callable) from extracted
            text to the desired output. Defaults to an empty mapping.
    """

    class Config:
        middleware_switches = {"theflow.middleware.CachingMiddleware": False}

    pattern: list[str]
    output_map: dict[str, str] | Callable[[str], str] = Param(
        default_callback=lambda *_: {}
    )

    def __init__(self, pattern: str | list[str], **kwargs):
        patterns = [pattern] if isinstance(pattern, str) else pattern
        super().__init__(pattern=patterns, **kwargs)

    @staticmethod
    def run_raw_static(pattern: str, text: str) -> list[str]:
        """
        Return every non-overlapping match of `pattern` in `text`.

        Parameters:
            pattern (str): The regular expression pattern to search for.
            text (str): The input string to search in.

        Returns:
            List[str]: The result of `re.findall(pattern, text)`.
        """
        return re.findall(pattern, text)

    @staticmethod
    def map_output(text, output_map) -> str:
        """
        Translate `text` through `output_map`.

        Parameters:
            text (str): The input text to be mapped.
            output_map (dict | callable): Mapping from input text to output
                value, or a callable applied to the text.

        Returns:
            str: `text` unchanged when `output_map` is empty or, for a dict,
                has no entry for it; otherwise the mapped value.
        """
        if not output_map:
            return text
        if not isinstance(output_map, dict):
            return output_map(text)
        return output_map.get(text, text)

    def run_raw(self, text: str) -> ExtractorOutput:
        """
        Match the raw text against every pattern and apply the output
        mapping to each match.

        Args:
            text (str): The raw text to be processed.

        Returns:
            ExtractorOutput: `matches` holds all mapped matches in pattern
                order; `text` is the first of them, or "" when none matched.
        """
        raw_matches: list[str] = [
            match
            for single_pattern in self.pattern
            for match in self.run_raw_static(single_pattern, text)
        ]
        mapped = [self.map_output(match, self.output_map) for match in raw_matches]

        first = mapped[0] if mapped else ""
        return ExtractorOutput(
            text=first,
            matches=mapped,
            metadata={"origin": "RegexExtractor"},
        )

    def run(
        self, text: str | list[str] | Document | list[Document]
    ) -> list[ExtractorOutput]:
        """Match the input against a pattern and return the output for each input

        Parameters:
            text: contains the input string to be processed

        Returns:
            A list contains the output ExtractorOutput for each input

        Raises:
            ValueError: an input item is neither a str nor a Document

        Example:
            ```pycon
            >>> document1 = Document(...)
            >>> document2 = Document(...)
            >>> document_batch = [document1, document2]
            >>> batch_output = self(document_batch)
            >>> print(batch_output)
            [output1_document1, output1_document2]
            ```
        """
        # TODO: this conversion seems common
        items = text if isinstance(text, list) else [text]

        raw_texts: list[str] = []
        for item in items:
            if isinstance(item, str):
                raw_texts.append(item)
            elif isinstance(item, Document):
                raw_texts.append(item.text)
            else:
                raise ValueError(
                    f"Invalid input type {type(item)}, should be str or Document"
                )

        return [self.run_raw(raw_text) for raw_text in raw_texts]


class FirstMatchRegexExtractor(RegexExtractor):
    """Regex extractor that stops at the first pattern producing a match."""

    pattern: list[str]

    def run_raw(self, text: str) -> ExtractorOutput:
        """Return the mapped matches of the first pattern that matches.

        When no pattern matches, the output has `text=None` and no matches.
        """
        for single_pattern in self.pattern:
            found = self.run_raw_static(single_pattern, text)
            if not found:
                continue
            mapped = [self.map_output(match, self.output_map) for match in found]
            return ExtractorOutput(
                text=mapped[0],
                matches=mapped,
                metadata={"origin": "FirstMatchRegexExtractor"},
            )

        return ExtractorOutput(
            text=None, matches=[], metadata={"origin": "FirstMatchRegexExtractor"}
        )
================================================ FILE: libs/kotaemon/kotaemon/rerankings/cohere.py ================================================ from __future__ import annotations import os from decouple import config from kotaemon.base import Document, Param from .base import BaseReranking class CohereReranking(BaseReranking): """Cohere Reranking model""" model_name: str = Param( "rerank-multilingual-v2.0", help=( "ID of the model to use. You can go to [Supported Models]" "(https://docs.cohere.com/docs/rerank-2) to see the supported models" ), required=True, ) cohere_api_key: str = Param( config("COHERE_API_KEY", ""), help="Cohere API key", required=True, ) base_url: str = Param( None, help="Rerank API base url. Default is https://api.cohere.com", required=False, ) def run(self, documents: list[Document], query: str) -> list[Document]: """Use Cohere Reranker model to re-order documents with their relevance score""" try: import cohere except ImportError: raise ImportError( "Please install Cohere " "`pip install cohere` to use Cohere Reranking" ) if not self.cohere_api_key or "COHERE_API_KEY" in self.cohere_api_key: print("Cohere API key not found. 
class TeiFastReranking(BaseReranking):
    """Text Embeddings Inference (TEI) Reranking model
    (https://huggingface.co/docs/text-embeddings-inference/en/index)
    """

    endpoint_url: str = Param(
        None, help="TEI Reranking service api base URL", required=True
    )
    model_name: Optional[str] = Param(
        None,
        help=(
            "ID of the model to use. You can go to [Supported Models]"
            "(https://github.com/huggingface"
            "/text-embeddings-inference?tab=readme-ov-file"
            "#supported-models) to see the supported models"
        ),
    )
    is_truncated: Optional[bool] = Param(True, help="Whether to truncate the inputs")
    max_tokens: Optional[int] = Param(
        512,
        help=(
            "This option is used to specify the "
            "maximum number of tokens supported by the reranker model."
        ),
    )

    def client(self, query, texts):
        """POST the query and texts to the rerank endpoint.

        Args:
            query: the query string
            texts: list of candidate texts

        Returns:
            The decoded JSON body of the response.
        """
        if self.is_truncated:
            # NOTE(review): this slices characters, not tokens, so it is only
            # a rough cap on what is sent - confirm this is intended.
            max_tokens = self.max_tokens  # default is 512 tokens.
            texts = [text[:max_tokens] for text in texts]

        # With truncation disabled the texts are sent unchanged; previously
        # the payload referenced a variable that was only bound when
        # truncation was enabled.
        response = session.post(
            url=self.endpoint_url,
            json={
                "query": query,
                "texts": texts,
                "is_truncated": self.is_truncated,  # default is True
            },
        ).json()
        return response

    def run(self, documents: list[Document], query: str) -> list[Document]:
        """Use the deployed TEI rerankings service to re-order documents
        with their relevance score

        Args:
            documents: documents to rerank
            query: the query to score the documents against

        Returns:
            The documents sorted by descending ``reranking_score`` (stored
            in each document's metadata). The input is returned unchanged
            when no endpoint URL is configured.
        """
        if not self.endpoint_url:
            print("TEI API reranking URL not found. Skipping rerankings.")
            return documents

        compressed_docs: list[Document] = []

        if not documents:  # to avoid empty api call
            return compressed_docs

        if isinstance(documents[0], str):
            # NOTE(review): prepare_input is not defined in this class;
            # presumably inherited - confirm.
            documents = self.prepare_input(documents)

        batch_size = 6
        # The final batch takes every remaining document, so it can hold
        # more than batch_size items.
        num_batch = max(len(documents) // batch_size, 1)
        for i in range(num_batch):
            if i == num_batch - 1:
                mini_batch = documents[batch_size * i :]
            else:
                mini_batch = documents[batch_size * i : batch_size * (i + 1)]

            _docs = [d.content for d in mini_batch]
            rerank_resp = self.client(query, _docs)
            for r in rerank_resp:
                # "index" refers to the position inside this mini batch.
                doc = mini_batch[r["index"]]
                doc.metadata["reranking_score"] = r["score"]
                compressed_docs.append(doc)

        compressed_docs = sorted(
            compressed_docs, key=lambda x: x.metadata["reranking_score"], reverse=True
        )
        return compressed_docs
You can go to [Supported Models]" "(https://docs.voyageai.com/docs/reranker) to see the supported models" ), required=True, ) api_key: str = Param( config("VOYAGE_API_KEY", ""), help="VoyageAI API key", required=True, ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if not self.api_key: raise ValueError("API key must be provided for VoyageAIEmbeddings.") self._client = _import_voyageai().Client(api_key=self.api_key) self._aclient = _import_voyageai().AsyncClient(api_key=self.api_key) def run(self, documents: list[Document], query: str) -> list[Document]: """Use VoyageAI Reranker model to re-order documents with their relevance score""" compressed_docs: list[Document] = [] if not documents: # to avoid empty api call return compressed_docs _docs = [d.content for d in documents] response = self._client.rerank( model=self.model_name, query=query, documents=_docs ) for r in response.results: doc = documents[r.index] doc.metadata["reranking_score"] = r.relevance_score compressed_docs.append(doc) return compressed_docs ================================================ FILE: libs/kotaemon/kotaemon/storages/__init__.py ================================================ from .docstores import ( BaseDocumentStore, ElasticsearchDocumentStore, InMemoryDocumentStore, LanceDBDocumentStore, SimpleFileDocumentStore, ) from .vectorstores import ( BaseVectorStore, ChromaVectorStore, InMemoryVectorStore, LanceDBVectorStore, MilvusVectorStore, QdrantVectorStore, SimpleFileVectorStore, ) __all__ = [ # Document stores "BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore", "SimpleFileDocumentStore", "LanceDBDocumentStore", # Vector stores "BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore", "SimpleFileVectorStore", "LanceDBVectorStore", "MilvusVectorStore", "QdrantVectorStore", ] ================================================ FILE: libs/kotaemon/kotaemon/storages/docstores/__init__.py ================================================ 
class BaseDocumentStore(ABC):
    """A document store is in charge of storing and managing documents

    This is the abstract interface: every method below must be implemented
    by a concrete store.
    """

    @abstractmethod
    def __init__(self, *args, **kwargs):
        ...

    @abstractmethod
    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: Document or list of documents
            ids: List of ids of the documents. Optional, if not set will use
                doc.doc_id
            **kwargs: store-specific options
        """
        ...

    @abstractmethod
    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get documents by a single id or a list of ids"""
        ...

    @abstractmethod
    def get_all(self) -> List[Document]:
        """Get all documents"""
        ...

    @abstractmethod
    def count(self) -> int:
        """Count number of documents"""
        ...

    @abstractmethod
    def query(
        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
    ) -> List[Document]:
        """Search document store using search query

        Args:
            query: search text
            top_k: maximum number of documents to return
            doc_ids: optional list of document ids
                (presumably restricts the search to these ids - confirm
                against the concrete stores)
        """
        ...

    @abstractmethod
    def delete(self, ids: Union[List[str], str]):
        """Delete documents by a single id or a list of ids"""
        ...

    @abstractmethod
    def drop(self):
        """Drop the document store"""
        ...
class ElasticsearchDocumentStore(BaseDocumentStore):
    """Document store backed by an Elasticsearch index with BM25 text search"""

    def __init__(
        self,
        collection_name: str = "docstore",
        elasticsearch_url: str = "http://localhost:9200",
        k1: float = 2.0,
        b: float = 0.75,
        **kwargs,
    ):
        """Connect to Elasticsearch and create the index if it is missing.

        Args:
            collection_name: name of the Elasticsearch index to use
            elasticsearch_url: URL passed to the Elasticsearch client
            k1: BM25 ``k1`` parameter of the custom similarity
            b: BM25 ``b`` parameter of the custom similarity
            **kwargs: forwarded to the ``Elasticsearch`` client constructor

        Raises:
            ImportError: if the ``elasticsearch`` package is not installed
        """
        try:
            from elasticsearch import Elasticsearch
            from elasticsearch.helpers import bulk
        except ImportError as exc:
            raise ImportError(
                "To use ElasticsearchDocumentStore please install "
                "`pip install elasticsearch`"
            ) from exc

        self.elasticsearch_url = elasticsearch_url
        self.index_name = collection_name
        self.k1 = k1
        self.b = b

        # Create an Elasticsearch client instance
        self.client = Elasticsearch(elasticsearch_url, **kwargs)
        self.es_bulk = bulk
        # Define the index settings and mappings
        settings = {
            "analysis": {"analyzer": {"default": {"type": "standard"}}},
            "similarity": {
                "custom_bm25": {
                    "type": "BM25",
                    "k1": k1,
                    "b": b,
                }
            },
        }
        mappings = {
            "properties": {
                "content": {
                    "type": "text",
                    "similarity": "custom_bm25",  # Use the custom BM25 similarity
                }
            }
        }

        # Create the index with the specified settings and mappings
        if not self.client.indices.exists(index=self.index_name):
            self.client.indices.create(
                index=self.index_name, mappings=mappings, settings=settings
            )

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        refresh_indices: bool = True,
        **kwargs,
    ):
        """Add document into document store

        Args:
            docs: list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            refresh_indices: request Elasticsearch to update
                its index (default to True)
        """
        if ids and not isinstance(ids, list):
            ids = [ids]
        if not isinstance(docs, list):
            docs = [docs]
        doc_ids = ids if ids else [doc.doc_id for doc in docs]

        # one bulk "index" action per document (creates or overwrites by _id)
        requests = [
            {
                "_op_type": "index",
                "_index": self.index_name,
                "content": doc.text,
                "metadata": doc.metadata,
                "_id": doc_id,
            }
            for doc_id, doc in zip(doc_ids, docs)
        ]

        success, failed = self.es_bulk(self.client, requests)
        print("Added/Updated documents to index", success)
        print("Failed documents to index", failed)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

    def query_raw(self, query: dict) -> List[Document]:
        """Query Elasticsearch store using query format of ES client

        Args:
            query (dict): Elasticsearch query format

        Returns:
            List[Document]: List of result documents
        """
        res = self.client.search(index=self.index_name, body=query)
        return [
            Document(
                id_=hit["_id"],
                text=hit["_source"]["content"],
                metadata=hit["_source"]["metadata"],
            )
            for hit in res["hits"]["hits"]
        ]

    def query(
        self, query: str, top_k: int = 10, doc_ids: Optional[list] = None
    ) -> List[Document]:
        """Search Elasticsearch docstore using search query (BM25)

        Args:
            query (str): query text
            top_k (int, optional): number of
                top documents to return. Defaults to 10.
            doc_ids (list, optional): when given, only documents whose ``_id``
                is in this list can match.

        Returns:
            List[Document]: List of result documents
        """
        query_dict: dict = {"match": {"content": query}}
        if doc_ids is not None:
            query_dict = {"bool": {"must": [query_dict, {"terms": {"_id": doc_ids}}]}}
        query_dict = {"query": query_dict, "size": top_k}
        return self.query_raw(query_dict)

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get document by id"""
        if not isinstance(ids, list):
            ids = [ids]
        # same cap as get_all(), instead of a separate hard-coded 10000
        query_dict = {"query": {"terms": {"_id": ids}}, "size": MAX_DOCS_TO_GET}
        return self.query_raw(query_dict)

    def count(self) -> int:
        """Count number of documents"""
        count = int(
            self.client.cat.count(index=self.index_name, format="json")[0]["count"]
        )
        return count

    def get_all(self) -> List[Document]:
        """Get all documents (at most ``MAX_DOCS_TO_GET``)"""
        query_dict = {"query": {"match_all": {}}, "size": MAX_DOCS_TO_GET}
        return self.query_raw(query_dict)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id"""
        if not isinstance(ids, list):
            ids = [ids]

        query = {"query": {"terms": {"_id": ids}}}
        self.client.delete_by_query(index=self.index_name, body=query)
        self.client.indices.refresh(index=self.index_name)

    def drop(self):
        """Drop the document store"""
        # The index is gone after this call, so it must not be refreshed:
        # refreshing a missing index fails with an index-not-found error.
        self.client.indices.delete(index=self.index_name)

    def __persist_flow__(self):
        return {
            "index_name": self.index_name,
            "elasticsearch_url": self.elasticsearch_url,
            "k1": self.k1,
            "b": self.b,
        }
def save(self, path: Union[str, Path]):
    """Write every stored document, converted with ``to_dict``, as JSON to ``path``"""
    serialised = {}
    for doc_id, doc in self._store.items():
        serialised[doc_id] = doc.to_dict()

    with open(path, "w") as handle:
        json.dump(serialised, handle)
def add(
    self,
    docs: Union[Document, List[Document]],
    ids: Optional[Union[List[str], str]] = None,
    refresh_indices: bool = True,
    **kwargs,
):
    """Load documents into lancedb storage.

    Args:
        docs: a single document or a list of documents to add
        ids: a single id or a list of ids to store the documents under;
            when not set, each document's ``doc_id`` is used
        refresh_indices: rebuild the full-text-search index on the ``text``
            column after writing (default to True)
    """
    # accept the single-item forms allowed by the signature
    if isinstance(ids, str):
        ids = [ids]
    if not isinstance(docs, (list, tuple)):
        docs = [docs]

    doc_ids = ids if ids else [doc.doc_id for doc in docs]
    data = [
        {
            "id": doc_id,
            "text": doc.text,
            "attributes": json.dumps(doc.metadata),
        }
        for doc_id, doc in zip(doc_ids, docs)
    ]

    if self.collection_name not in self.db_connection.table_names():
        if not data:
            # no table exists and there is nothing to create it from, so
            # there is also nothing to index
            return
        document_collection = self.db_connection.create_table(
            self.collection_name, data=data, mode="overwrite"
        )
    else:
        # add data to existing table
        document_collection = self.db_connection.open_table(self.collection_name)
        if data:
            document_collection.add(data)

    if refresh_indices:
        document_collection.create_fts_index(
            "text",
            tokenizer_name="en_stem",
            replace=True,
        )
def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):
    """Delete document by id

    Args:
        ids: a single id or a list of ids to delete
        refresh_indices: rebuild the full-text-search index on the ``text``
            column after deleting (default to True)
    """
    if not isinstance(ids, list):
        ids = [ids]
    if not ids:
        # an empty list would produce the filter "id in ()"; like get(),
        # treat it as a no-op instead of sending that to the table
        return

    document_collection = self.db_connection.open_table(self.collection_name)
    id_filter = ", ".join([f"'{_id}'" for _id in ids])
    query_filter = f"id in ({id_filter})"
    document_collection.delete(query_filter)

    if refresh_indices:
        document_collection.create_fts_index(
            "text",
            tokenizer_name="en_stem",
            replace=True,
        )
class SimpleFileDocumentStore(InMemoryDocumentStore):
    """In-memory document store that saves itself to a JSON file on every change"""

    def __init__(self, path: str | Path, collection_name: str = "default"):
        """Create the storage folder and load ``<collection_name>.json`` if present"""
        super().__init__()
        self._path = path
        self._collection_name = collection_name

        folder = Path(path)
        folder.mkdir(parents=True, exist_ok=True)
        self._save_path = folder / f"{collection_name}.json"

        if self._save_path.is_file():
            self.load(self._save_path)

    def get(self, ids: Union[List[str], str]) -> List[Document]:
        """Get document by id, reloading the file once if any id is unknown"""
        if not isinstance(ids, list):
            ids = [ids]

        has_unknown_id = any(doc_id not in self._store for doc_id in ids)
        if has_unknown_id:
            self.load(self._save_path)

        return [self._store[doc_id] for doc_id in ids]

    def add(
        self,
        docs: Union[Document, List[Document]],
        ids: Optional[Union[List[str], str]] = None,
        **kwargs,
    ):
        """Add document into document store, then save the store to file

        Args:
            docs: list of documents to add
            ids: specify the ids of documents to add or
                use existing doc.doc_id
            exist_ok: raise error when duplicate doc-id
                found in the docstore (default to False)
        """
        super().add(docs=docs, ids=ids, **kwargs)
        self.save(self._save_path)

    def delete(self, ids: Union[List[str], str]):
        """Delete document by id, then save the store to file"""
        super().delete(ids=ids)
        self.save(self._save_path)

    def drop(self):
        """Drop the document store and remove its file if it exists"""
        super().drop()
        self._save_path.unlink(missing_ok=True)

    def __persist_flow__(self):
        from theflow.utils.modules import serialize

        persisted = {"path": serialize(self._path)}
        persisted["collection_name"] = self._collection_name
        return persisted
================================================ FILE: libs/kotaemon/kotaemon/storages/vectorstores/__init__.py ================================================ from .base import BaseVectorStore from .chroma import ChromaVectorStore from .in_memory import InMemoryVectorStore from .lancedb import LanceDBVectorStore from .milvus import MilvusVectorStore from .qdrant import QdrantVectorStore from .simple_file import SimpleFileVectorStore __all__ = [ "BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore", "SimpleFileVectorStore", "LanceDBVectorStore", "MilvusVectorStore", "QdrantVectorStore", ] ================================================ FILE: libs/kotaemon/kotaemon/storages/vectorstores/base.py ================================================ from __future__ import annotations from abc import ABC, abstractmethod from typing import Any, Optional from llama_index.core.schema import NodeRelationship, RelatedNodeInfo from llama_index.core.vector_stores.types import BasePydanticVectorStore from llama_index.core.vector_stores.types import VectorStore as LIVectorStore from llama_index.core.vector_stores.types import VectorStoreQuery from kotaemon.base import DocumentWithEmbedding class BaseVectorStore(ABC): @abstractmethod def __init__(self, *args, **kwargs): ... @abstractmethod def add( self, embeddings: list[list[float]] | list[DocumentWithEmbedding], metadatas: Optional[list[dict]] = None, ids: Optional[list[str]] = None, ) -> list[str]: """Add vector embeddings to vector stores Args: embeddings: List of embeddings metadatas: List of metadata of the embeddings ids: List of ids of the embeddings kwargs: meant for vectorstore-specific parameters Returns: List of ids of the embeddings """ ... @abstractmethod def delete(self, ids: list[str], **kwargs): """Delete vector embeddings from vector stores Args: ids: List of ids of the embeddings to be deleted kwargs: meant for vectorstore-specific parameters """ ... 
def __getattr__(self, name: str) -> Any:
    """Forward lookups of attributes not found on this object to the client.

    Python calls this only after normal attribute lookup has failed.

    Raises:
        AttributeError: if ``_li_class`` or ``_client`` is requested but has
            not been set, or if the client does not have the attribute.
    """
    # `_client` must never be forwarded to `self._client`: when the client
    # has not been created yet that lookup would re-enter this method without
    # end. Let the default lookup raise a plain AttributeError instead.
    if name in ("_li_class", "_client"):
        return super().__getattribute__(name)

    return getattr(self._client, name)
def query(
    self,
    embedding: list[float],
    top_k: int = 1,
    ids: Optional[list[str]] = None,
    **kwargs,
) -> tuple[list[list[float]], list[float], list[str]]:
    """Return the top k most similar vector embeddings

    Args:
        embedding: List of embeddings
        top_k: Number of most similar embeddings to return
        ids: List of ids of the embeddings to be queried
        kwargs: extra query parameters. A parameter whose name is in
            ``self._vsq_kwargs`` is given to the VectorStoreQuery object;
            every other parameter is given to the client's ``query`` call.

    Returns:
        the matched embeddings, the similarity scores, and the ids
    """
    query_fields = {}
    store_options = {}
    for key, value in kwargs.items():
        target = query_fields if key in self._vsq_kwargs else store_options
        target[key] = value

    vs_query = VectorStoreQuery(
        query_embedding=embedding,
        similarity_top_k=top_k,
        node_ids=ids,
        **query_fields,
    )
    result = self._client.query(query=vs_query, **store_options)

    # each part of the result may be missing or empty; normalise to lists
    embeddings = [node.embedding for node in result.nodes] if result.nodes else []
    similarities = result.similarities or []
    out_ids = result.ids or []

    return embeddings, similarities, out_ids
def drop(self):
    """Delete entire collection from vector stores"""
    # `self._client.client` is the object that carries the collection name;
    # its own `_client` performs the deletion
    collection = self._client.client
    collection._client.delete_collection(collection.name)
def save(
    self,
    save_path: str,
    fs: Optional[fsspec.AbstractFileSystem] = None,
    **kwargs,
):
    """Persist the wrapped vector store to ``save_path``.

    Args:
        save_path: Path of saving vector to disk.
        fs: An abstract super-class for pythonic file-systems
        **kwargs: accepted but not used
    """
    store = self._client
    store.persist(persist_path=save_path, fs=fs)
def custom_to_lance_filter(
    standard_filters: MetadataFilters, metadata_keys: list
) -> Any:
    """Quote string values of list-valued filters, then build the LanceDB filter.

    Args:
        standard_filters: filters to convert; this object is left unmodified
        metadata_keys: passed through to the original ``_to_lance_filter``

    Returns:
        Whatever the original ``_to_lance_filter`` returns for the quoted
        filters.
    """
    import copy

    # Quote on a copy: rewriting the values in place would quote them a
    # second time if the caller reused the same filters object.
    quoted_filters = copy.deepcopy(standard_filters)
    for metadata_filter in quoted_filters.filters:
        value = metadata_filter.value
        # quote string values if filter are list of strings
        if isinstance(value, list) and value and isinstance(value[0], str):
            metadata_filter.value = [f"'{v}'" for v in value]

    return original_to_lance_filter(quoted_filters, metadata_keys)
def drop(self):
    """Delete entire collection from vector stores"""
    # Use the table name stored by __init__. The un-prefixed
    # `self.collection_name` is not set on this object, so it would be
    # looked up on the wrapped LlamaIndex client instead.
    self._client.client.drop_table(self._collection_name)
def drop(self):
    """Drop the collection from the Milvus server or database file"""
    # The client is only created by _lazy_init(), so make sure it exists
    # before using it, as delete() and count() already do.
    self._lazy_init()
    self._client.client.drop_collection(self._collection_name)
def delete(self, ids: List[str], **kwargs):
    """Delete vector embeddings from vector stores

    Args:
        ids: List of ids of the embeddings to be deleted
        kwargs: forwarded to the Qdrant client's ``delete`` call
    """
    from qdrant_client import models

    selector = models.PointIdsList(points=ids)
    qdrant = self._client.client
    qdrant.delete(
        collection_name=self._collection_name,
        points_selector=selector,
        **kwargs,
    )
class SimpleFileVectorStore(LlamaIndexVectorStore):
    """Similar to InMemoryVectorStore but is backed by file by default"""

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    store_text: bool = False

    def __init__(
        self,
        path: str | Path,
        collection_name: str = "default",
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Create the store and load ``path / collection_name`` if that file exists"""
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        self._collection_name = collection_name
        self._path = path
        self._save_path = Path(path) / collection_name

        super().__init__(data=data, fs=fs, **kwargs)

        # a previously persisted store replaces the freshly created client
        if self._save_path.is_file():
            self._client = self._li_class.from_persist_path(
                persist_path=str(self._save_path), fs=self._fs
            )

    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ):
        """Add embeddings, then persist the store to the save path"""
        added = super().add(embeddings, metadatas, ids)
        self._client.persist(str(self._save_path), self._fs)
        return added

    def delete(self, ids: list[str], **kwargs):
        """Delete embeddings, then persist the store to the save path"""
        outcome = super().delete(ids, **kwargs)
        self._client.persist(str(self._save_path), self._fs)
        return outcome

    def drop(self):
        """Reset the stored data object and remove the file if it exists"""
        self._data = SimpleVectorStoreData()
        self._save_path.unlink(missing_ok=True)

    def __persist_flow__(self):
        serialised = self._data.to_dict()
        data_cls = self._data.__class__
        serialised["__type__"] = f"{self._data.__module__}.{data_cls.__qualname__}"
        return {
            "data": serialised,
            "collection_name": self._collection_name,
            "path": str(self._path),
            # "fs": self._fs,
        }
[tool.setuptools-git-versioning]
enabled = true
dev_template = "{tag}"
dirty_template = "{tag}"
tag_filter = "v?\\d+(\\.\\d+)*.*"

# metadata and dependencies
[project]
name = "kotaemon"
dynamic = ["version"]
requires-python = ">= 3.10"
description = "Kotaemon core library for AI development."
dependencies = [
    "azure-ai-documentintelligence",
    "beautifulsoup4>=4.12.3,<4.13",
    "click>=8.1.7,<9",
    "cohere>=5.3.2,<6",
    "cookiecutter>=2.6.0,<2.7",
    "fast_langdetect",
    "fastapi<=0.112.1",
    "gradio>=4.31.0,<5",
    "html2text==2024.2.26",
    "langchain<2",
    "langchain-community<1",
    "langchain-openai<2",
    "langchain-google-genai<5",
    "langchain-anthropic<2",
    "langchain-ollama<2",
    "langchain-mistralai<2",
    "langchain-cohere<1",
    "llama-hub>=0.0.79,<0.1.0",
    "llama-index>=0.10.40,<0.11.0",
    "chromadb<=0.5.16",
    "llama-index-vector-stores-chroma>=0.1.9",
    "llama-index-vector-stores-lancedb",
    "openai>=1.23.6,<2",
    "matplotlib",
    "matplotlib-inline",
    "openpyxl>=3.1.2,<3.2",
    "opentelemetry-exporter-otlp-proto-grpc>=1.25.0", # https://github.com/chroma-core/chroma/issues/2571
    "pandas>=2.2.2,<2.3",
    "plotly<6.0.0",
    "PyMuPDF>=1.23,<=1.24.11",
    "pypdf>=4.2.0,<4.3",
    "pylance",
    "python-decouple", # for theflow
    "python-docx>=1.1.0,<1.2",
    "python-dotenv>=1.0.1,<1.1",
    "tenacity>=8.2.3,<8.3",
    "theflow>=0.8.6,<0.9.0",
    "trogon>=0.5.0,<0.6",
    "umap-learn==0.5.5",
    "tavily-python>=0.4.0",
    "pydantic<=2.10.6",
]
readme = "README.md"
authors = [
    { name = "@trducng", email = "john@cinnamon.is" },
    { name = "@lone17", email = "ian@cinnamon.is" },
    { name = "@taprosoft", email = "tadashi@cinnamon.is" },
    { name = "@cin-albert", email = "albert@cinnamon.is" },
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]

[project.optional-dependencies]
adv = [
    "duckduckgo-search>=6.1.0,<6.2",
    "elasticsearch>=8.13.0,<8.14",
    "fastembed",
    "onnxruntime<v1.20",
    "googlesearch-python>=1.2.4,<1.3",
    "llama-cpp-python<0.2.8",
    "llama-index>=0.10.40,<0.11.0",
    "llama-index-vector-stores-milvus",
    "llama-index-vector-stores-qdrant",
"mcp[cli]>=1.0.0", "sentence-transformers", "tabulate", "unstructured>=0.15.8,<0.16", "wikipedia>=1.4.0,<1.5", "voyageai>=0.3.0", ] dev = [ "black", "coverage", "flake8", "ipython", "pre-commit", "pytest", "pytest-mock", "sphinx", ] all = ["kotaemon[adv,dev]"] [project.scripts] kotaemon = "kotaemon.cli:main" ================================================ FILE: libs/kotaemon/pytest.ini ================================================ [pytest] minversion = 7.4.0 testpaths = tests addopts = -ra -q log_cli=true log_level=WARNING log_format = %(asctime)s %(levelname)s %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_file = logs/pytest-logs.txt ================================================ FILE: libs/kotaemon/tests/__init__.py ================================================ ================================================ FILE: libs/kotaemon/tests/_test_multimodal_reader.py ================================================ # TODO: This test is broken and should be rewritten from pathlib import Path from kotaemon.loaders import AdobeReader # from dotenv import load_dotenv input_file = Path(__file__).parent / "resources" / "multimodal.pdf" # load_dotenv() def test_adobe_reader(): reader = AdobeReader() documents = reader.load_data(input_file) table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] assert len(table_docs) == 2 figure_docs = [doc for doc in documents if doc.metadata.get("type", "") == "image"] assert len(figure_docs) == 2 ================================================ FILE: libs/kotaemon/tests/conftest.py ================================================ import pytest @pytest.fixture(scope="function") def mock_google_search(monkeypatch): import googlesearch def result(*args, **kwargs): yield googlesearch.SearchResult( url="https://www.cinnamon.is/en/", title="Cinnamon AI", description="Cinnamon AI is an enterprise AI company.", ) monkeypatch.setattr(googlesearch, "search", result) def if_haystack_not_installed(): try: 
import haystack # noqa: F401 except ImportError: return True else: return False def if_sentence_bert_not_installed(): try: import sentence_transformers # noqa: F401 except ImportError: return True else: return False def if_sentence_fastembed_not_installed(): try: import fastembed # noqa: F401 except ImportError: return True else: return False def if_unstructured_pdf_not_installed(): try: import unstructured # noqa: F401 from unstructured.partition.pdf import partition_pdf # noqa: F401 except ImportError: return True else: return False def if_cohere_not_installed(): try: import cohere # noqa: F401 except ImportError: return True else: return False def if_llama_cpp_not_installed(): try: import llama_cpp # noqa: F401 except ImportError: return True else: return False def if_voyageai_not_installed(): try: import voyageai # noqa: F401 except ImportError: return True else: return False skip_when_haystack_not_installed = pytest.mark.skipif( if_haystack_not_installed(), reason="Haystack is not installed" ) skip_when_sentence_bert_not_installed = pytest.mark.skipif( if_sentence_bert_not_installed(), reason="SBert is not installed" ) skip_when_fastembed_not_installed = pytest.mark.skipif( if_sentence_fastembed_not_installed(), reason="fastembed is not installed" ) skip_when_unstructured_pdf_not_installed = pytest.mark.skipif( if_unstructured_pdf_not_installed(), reason="unstructured is not installed" ) skip_when_cohere_not_installed = pytest.mark.skipif( if_cohere_not_installed(), reason="cohere is not installed" ) skip_openai_lc_wrapper_test = pytest.mark.skipif( True, reason="OpenAI LC wrapper test is skipped" ) skip_llama_cpp_not_installed = pytest.mark.skipif( if_llama_cpp_not_installed(), reason="llama_cpp is not installed" ) skip_when_voyageai_not_installed = pytest.mark.skipif( if_voyageai_not_installed(), reason="voyageai is not installed" ) ================================================ FILE: libs/kotaemon/tests/resources/dummy.mhtml 
================================================ MIME-Version: 1.0 Content-Type: multipart/related; boundary="----=_NextPart_01CF5AE5.5C24CD00" This document is a Single File Web Page, also known as a Web Archive file. If you are seeing this message, your browser or editor doesn't support Web Archive files. Please download a browser that supports Web Archive, such as Windows® Internet Explorer®. ------=_NextPart_01CF5AE5.5C24CD00 Content-Location: file:///C:/D16BB227/testing.htm Content-Transfer-Encoding: quoted-printable Content-Type: text/html; charset="us-ascii"

This is a test.

This is bold,= italic, and underlined.=

asdakl fskljf sklf jkslaf; djks dlkfa sk sdjkl ksjkl jsjk skdjjks i w ie sjkfksd fjisdf jks fjs kdj fsk dfjskd fjskd fjsd kfjsk f jskdf jskd fjsk dfjskdf jsifj sifj sk fjks fjksd fjskdf kjs jdfksk fdjs fksj fks dfjs dfks fdjsk fjskdfjskdf <= span class=3DSpellE>sjkf skjf sjkdf skfjsfjk s

The end.

------=_NextPart_01CF5AE5.5C24CD00 Content-Location: file:///C:/D16BB227/testing_files/themedata.thmx Content-Transfer-Encoding: base64 Content-Type: application/vnd.ms-officetheme UEsDBBQABgAIAAAAIQCCirwT+gAAABwCAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbKyRy2rDMBBF 94X+g9C22HK6KKXYzqJJd30s0g8Y5LEtao+ENAnJ33fsuFC6CC10IxBizpl7Va6P46AOGJPzVOlV XmiFZH3jqKv0++4pu9cqMVADgyes9AmTXtfXV+XuFDApmaZU6Z45PBiTbI8jpNwHJHlpfRyB5Ro7 E8B+QIfmtijujPXESJzxxNB1+SoLRNegeoPILzCKx7Cg8Pv5DCSAmAtYq8czYVqi0hDC4CywRDAH an7oM9+2zmLj7X4UaT6DF9jNBDO/XGD1P+ov5wZb2A+stkfp4lx/xCH9LdtSay6Tc/7Uu5AuGC6X t7Rh5r+tPwEAAP//AwBQSwMEFAAGAAgAAAAhAKXWp+fAAAAANgEAAAsAAABfcmVscy8ucmVsc4SP z2rDMAyH74W9g9F9UdLDGCV2L6WQQy+jfQDhKH9oIhvbG+vbT8cGCrsIhKTv96k9/q6L+eGU5yAW mqoGw+JDP8to4XY9v3+CyYWkpyUIW3hwhqN727VfvFDRozzNMRulSLYwlRIPiNlPvFKuQmTRyRDS SkXbNGIkf6eRcV/XH5ieGeA2TNP1FlLXN2Cuj6jJ/7PDMMyeT8F/ryzlRQRuN5RMaeRioagv41O9 kKhlqtQe0LW4+db9AQAA//8DAFBLAwQUAAYACAAAACEAa3mWFoMAAACKAAAAHAAAAHRoZW1lL3Ro ZW1lL3RoZW1lTWFuYWdlci54bWwMzE0KwyAQQOF9oXeQ2TdjuyhFYrLLrrv2AEOcGkHHoNKf29fl 44M3zt8U1ZtLDVksnAcNimXNLoi38Hwspxuo2kgcxSxs4ccV5ul4GMm0jRPfSchzUX0j1ZCFrbXd INa1K9Uh7yzdXrkkaj2LR1fo0/cp4kXrKyYKAjj9AQAA//8DAFBLAwQUAAYACAAAACEAlrWt4pYG AABQGwAAFgAAAHRoZW1lL3RoZW1lL3RoZW1lMS54bWzsWU9v2zYUvw/YdyB0b2MndhoHdYrYsZst TRvEboceaYmW2FCiQNJJfRva44ABw7phhxXYbYdhW4EW2KX7NNk6bB3Qr7BHUpLFWF6SNtiKrT4k Evnj+/8eH6mr1+7HDB0SISlP2l79cs1DJPF5QJOw7d0e9i+teUgqnASY8YS0vSmR3rWN99+7itdV RGKCYH0i13Hbi5RK15eWpA/DWF7mKUlgbsxFjBW8inApEPgI6MZsablWW12KMU08lOAYyN4aj6lP 0FCT9DZy4j0Gr4mSesBnYqBJE2eFwQYHdY2QU9llAh1i1vaAT8CPhuS+8hDDUsFE26uZn7e0cXUJ r2eLmFqwtrSub37ZumxBcLBseIpwVDCt9xutK1sFfQNgah7X6/W6vXpBzwCw74OmVpYyzUZ/rd7J aZZA9nGedrfWrDVcfIn+ypzMrU6n02xlsliiBmQfG3P4tdpqY3PZwRuQxTfn8I3OZre76uANyOJX 5/D9K63Vhos3oIjR5GAOrR3a72fUC8iYs+1K+BrA12oZfIaCaCiiS7MY80QtirUY3+OiDwANZFjR BKlpSsbYhyju4ngkKNYM8DrBpRk75Mu5Ic0LSV/QVLW9D1MMGTGj9+r596+eP0XHD54dP/jp+OHD 4wc/WkLOqm2chOVVL7/97M/HH6M/nn7z8tEX1XhZxv/6wye//Px5NRDSZybOiy+f/PbsyYuvPv39 
u0cV8E2BR2X4kMZEopvkCO3zGBQzVnElJyNxvhXDCNPyis0klDjBmksF/Z6KHPTNKWaZdxw5OsS1 4B0B5aMKeH1yzxF4EImJohWcd6LYAe5yzjpcVFphR/MqmXk4ScJq5mJSxu1jfFjFu4sTx7+9SQp1 Mw9LR/FuRBwx9xhOFA5JQhTSc/yAkArt7lLq2HWX+oJLPlboLkUdTCtNMqQjJ5pmi7ZpDH6ZVukM /nZss3sHdTir0nqLHLpIyArMKoQfEuaY8TqeKBxXkRzimJUNfgOrqErIwVT4ZVxPKvB0SBhHvYBI WbXmlgB9S07fwVCxKt2+y6axixSKHlTRvIE5LyO3+EE3wnFahR3QJCpjP5AHEKIY7XFVBd/lbobo d/ADTha6+w4ljrtPrwa3aeiINAsQPTMR2pdQqp0KHNPk78oxo1CPbQxcXDmGAvji68cVkfW2FuJN 2JOqMmH7RPldhDtZdLtcBPTtr7lbeJLsEQjz+Y3nXcl9V3K9/3zJXZTPZy20s9oKZVf3DbYpNi1y vLBDHlPGBmrKyA1pmmQJ+0TQh0G9zpwOSXFiSiN4zOq6gwsFNmuQ4OojqqJBhFNosOueJhLKjHQo UcolHOzMcCVtjYcmXdljYVMfGGw9kFjt8sAOr+jh/FxQkDG7TWgOnzmjFU3grMxWrmREQe3XYVbX Qp2ZW92IZkqdw61QGXw4rxoMFtaEBgRB2wJWXoXzuWYNBxPMSKDtbvfe3C3GCxfpIhnhgGQ+0nrP +6hunJTHirkJgNip8JE+5J1itRK3lib7BtzO4qQyu8YCdrn33sRLeQTPvKTz9kQ6sqScnCxBR22v 1VxuesjHadsbw5kWHuMUvC51z4dZCBdDvhI27E9NZpPlM2+2csXcJKjDNYW1+5zCTh1IhVRbWEY2 NMxUFgIs0Zys/MtNMOtFKWAj/TWkWFmDYPjXpAA7uq4l4zHxVdnZpRFtO/ualVI+UUQMouAIjdhE 7GNwvw5V0CegEq4mTEXQL3CPpq1tptzinCVd+fbK4Ow4ZmmEs3KrUzTPZAs3eVzIYN5K4oFulbIb 5c6vikn5C1KlHMb/M1X0fgI3BSuB9oAP17gCI52vbY8LFXGoQmlE/b6AxsHUDogWuIuFaQgquEw2 /wU51P9tzlkaJq3hwKf2aYgEhf1IRYKQPShLJvpOIVbP9i5LkmWETESVxJWpFXtEDgkb6hq4qvd2 D0UQ6qaaZGXA4E7Gn/ueZdAo1E1OOd+cGlLsvTYH/unOxyYzKOXWYdPQ5PYvRKzYVe16szzfe8uK 6IlZm9XIswKYlbaCVpb2rynCObdaW7HmNF5u5sKBF+c1hsGiIUrhvgfpP7D/UeEz+2VCb6hDvg+1 FcGHBk0Mwgai+pJtPJAukHZwBI2THbTBpElZ02atk7ZavllfcKdb8D1hbC3ZWfx9TmMXzZnLzsnF izR2ZmHH1nZsoanBsydTFIbG+UHGOMZ80ip/deKje+DoLbjfnzAlTTDBNyWBofUcmDyA5LcczdKN vwAAAP//AwBQSwMEFAAGAAgAAAAhAA3RkJ+2AAAAGwEAACcAAAB0aGVtZS90aGVtZS9fcmVscy90 aGVtZU1hbmFnZXIueG1sLnJlbHOEj00KwjAUhPeCdwhvb9O6EJEm3YjQrdQDhOQ1DTY/JFHs7Q2u LAguh2G+mWm7l53JE2My3jFoqhoIOumVcZrBbbjsjkBSFk6J2TtksGCCjm837RVnkUsoTSYkUigu MZhyDidKk5zQilT5gK44o49W5CKjpkHIu9BI93V9oPGbAXzFJL1iEHvVABmWUJr/s/04GolnLx8W Xf5RQXPZhQUoosbM4CObqkwEylu6usTfAAAA//8DAFBLAQItABQABgAIAAAAIQCCirwT+gAAABwC 
AAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0AFAAGAAgAAAAhAKXW p+fAAAAANgEAAAsAAAAAAAAAAAAAAAAAKwEAAF9yZWxzLy5yZWxzUEsBAi0AFAAGAAgAAAAhAGt5 lhaDAAAAigAAABwAAAAAAAAAAAAAAAAAFAIAAHRoZW1lL3RoZW1lL3RoZW1lTWFuYWdlci54bWxQ SwECLQAUAAYACAAAACEAlrWt4pYGAABQGwAAFgAAAAAAAAAAAAAAAADRAgAAdGhlbWUvdGhlbWUv dGhlbWUxLnhtbFBLAQItABQABgAIAAAAIQAN0ZCftgAAABsBAAAnAAAAAAAAAAAAAAAAAJsJAAB0 aGVtZS90aGVtZS9fcmVscy90aGVtZU1hbmFnZXIueG1sLnJlbHNQSwUGAAAAAAUABQBdAQAAlgoA AAAA ------=_NextPart_01CF5AE5.5C24CD00 Content-Location: file:///C:/D16BB227/testing_files/colorschememapping.xml Content-Transfer-Encoding: quoted-printable Content-Type: text/xml ------=_NextPart_01CF5AE5.5C24CD00 Content-Location: file:///C:/D16BB227/testing_files/filelist.xml Content-Transfer-Encoding: quoted-printable Content-Type: text/xml; charset="utf-8" ------=_NextPart_01CF5AE5.5C24CD00-- ================================================ FILE: libs/kotaemon/tests/resources/embedding_openai.json ================================================ { "object": "list", "data": [ { "object": "embedding", "index": 0, "embedding": [ 0.006555966101586819, 0.003670461941510439, -0.011642491444945335, -0.026776473969221115, -0.012383491732180119, -0.0014341175556182861, -0.013375678099691868, 0.009356695227324963, -0.006364436354488134, -0.0294390507042408, 0.023950627073645592, 0.0029859787318855524, -0.023234745487570763, -0.009205983020365238, 0.006744355894625187, 0.0011790062999352813, 0.02607315219938755, -0.018437083810567856, 0.008904559537768364, 0.009620440192520618, -0.01306169480085373, -0.0011358336778357625, 0.007253008428961039, 0.00875384733080864, -0.012710033915936947, 0.0037206991109997034, 0.005419347435235977, -0.017243949696421623, 0.036246202886104584, -0.0266759991645813, 0.012647237628698349, -0.008552898652851582, -0.00762350857257843, -0.012546762824058533, 0.007083457428961992, -0.014078999869525433, 0.0048761568032205105, -0.013689660467207432, 0.018211016431450844, 
-0.014367864467203617, 0.008307991549372673, 0.006022194866091013, 0.005457025486975908, -0.006320478860288858, -0.03609549254179001, 0.012986338697373867, 0.00017592862423043698, -0.016427593305706978, -0.004734864458441734, 0.0257717277854681, 0.01944182999432087, -0.00211938563734293, -0.0147195253521204, 0.01332544069737196, -0.017670966684818268, 0.0013956546317785978, -0.04024006798863411, 0.024754423648118973, 0.032051388174295425, -0.018713390454649925, 0.011265711858868599, 0.012370931915938854, -0.013576626777648926, 0.0040974789299070835, -0.002153923735022545, 0.009915584698319435, 0.006637601647526026, 0.011353626847267151, -0.019919084385037422, 0.030142372474074364, 0.01966789737343788, -0.008213796652853489, -0.004213652573525906, -0.009162025526165962, 0.012446288019418716, 0.008979915641248226, -0.02098662778735161, 0.003113141981884837, 0.0018838982796296477, -0.0008289152756333351, 0.02722861059010029, -0.03355850651860237, -0.004408321809023619, 0.017859356477856636, 0.019893966615200043, -0.0014137086691334844, -0.0021602034103125334, 0.017959831282496452, -0.0114603815600276, -0.020069796591997147, 0.007604669313877821, 0.001281835837289691, 0.01132222916930914, 0.006668999791145325, -0.01743233948945999, 0.0048761568032205105, 0.0016923686489462852, 0.02356128767132759, -0.005796127021312714, -0.041470881551504135, -0.01292354241013527, -0.005824385676532984, -0.010700542479753494, -0.017005322501063347, -0.007950050756335258, 0.011560855433344841, -0.005485283676534891, -0.0002590360236354172, 0.028434304520487785, 0.007447678130120039, -0.026198744773864746, 0.009306457825005054, -0.00500488979741931, -0.03107176162302494, -0.0007555217016488314, -0.008056805469095707, 0.007479076273739338, 0.003978165332227945, -0.005984516814351082, -0.02778122015297413, 0.015272135846316814, 0.029338575899600983, 0.010964288376271725, -0.023548727855086327, 0.007014381233602762, 0.0014137086691334844, -0.03380969539284706, -0.016741575673222542, 
-0.005230957642197609, -0.004794521257281303, 0.0315992534160614, 0.0041100382804870605, 0.010644025169312954, -0.0014294078573584557, -0.035367049276828766, 0.0246916264295578, -0.007096016779541969, 0.02096150815486908, -0.017030440270900726, -0.011906237341463566, 0.006273381412029266, 0.021237812936306, 0.01757049188017845, -0.013224965892732143, -0.005092805251479149, 0.01288586389273405, 0.004486817866563797, -0.016339678317308426, 0.008917118422687054, -0.015083746053278446, 0.008377067744731903, 0.0185626782476902, 0.012961219996213913, -0.002535413019359112, 0.0050143091939389706, 0.016025694087147713, 0.004618690814822912, 0.0205470509827137, 0.008000288158655167, -0.013074253685772419, -0.0007264782907441258, -0.0004474258457776159, 0.0033125211484730244, -0.03084569424390793, 0.009400652721524239, 0.015787066891789436, 0.02607315219938755, 0.003689300734549761, -0.013940847478806973, -0.02604803442955017, -0.011454101651906967, 0.007340923883020878, -0.043178949505090714, 0.02660064399242401, -0.01593777909874916, 0.00514304218813777, -0.001305384561419487, 0.022820288315415382, -0.012697474099695683, -0.017269067466259003, -0.03024284727871418, 0.011617372743785381, 0.008992474526166916, 0.025897322222590446, 0.00629850011318922, -0.00014511904737446457, 0.009890465997159481, -0.006364436354488134, 0.00384629238396883, -0.017030440270900726, 0.016666220501065254, 0.03333244100213051, 0.021011745557188988, 0.01844964362680912, -0.6892555952072144, -0.007196491584181786, 0.025319593027234077, 0.02024562656879425, 0.006245122756808996, 0.009909304790198803, 0.012176262214779854, 0.027806337922811508, 0.006518288049846888, 0.0274546779692173, -0.02107454277575016, 0.013011457398533821, -0.015083746053278446, -0.010719381272792816, -0.0002598209830466658, -0.00823263544589281, 0.0055763390846550465, -0.012339534237980843, -0.011881118640303612, 0.015196779742836952, -0.010141652077436447, 0.03129782900214195, -0.022104406729340553, -0.014568813145160675, 
0.009928143583238125, 0.011730406433343887, 0.0025102945510298014, -0.009739753790199757, -0.006964143831282854, 0.030192609876394272, -0.0021272350568324327, 0.0114603815600276, 9.394961671205238e-05, 0.014154355973005295, 0.06611227244138718, 0.007610949221998453, -0.005463304929435253, 0.008088203147053719, 0.009827669709920883, 0.028986915946006775, -0.019077610224485397, -0.022267676889896393, 0.012559321708977222, -0.01662854291498661, -0.002763050841167569, 0.009205983020365238, 0.012659796513617039, -0.0024051100481301546, 0.0022543983068317175, 0.0017849936848506331, 0.014568813145160675, -0.004851038102060556, 0.010857533663511276, 0.019504627212882042, -0.005343991331756115, 0.0071588135324418545, 0.02151411771774292, 0.011962753720581532, -0.002935741562396288, 0.004521355964243412, 0.0005718416068702936, 0.017168592661619186, -0.0016280021518468857, 0.0038494321051985025, -0.022443508729338646, 0.02381247468292713, -0.021313169971108437, 0.01400364376604557, -0.008929678238928318, -0.012050669640302658, -0.0018368008313700557, -0.007994008250534534, -0.018047746270895004, -0.014744644053280354, 0.013965966179966927, 0.028107762336730957, 0.01681693270802498, -0.0033219405449926853, -0.006160347256809473, 0.01966789737343788, 0.010926609858870506, 0.013237525708973408, -0.014066440984606743, -0.01764584705233574, 0.02349849045276642, -0.019780931994318962, -0.030067017301917076, -0.016276881098747253, 0.010430516675114632, 0.011479220353066921, 0.030192609876394272, 0.0033250804990530014, -0.012370931915938854, -0.01980605162680149, 0.02883620373904705, 0.0005384809337556362, -0.011548296548426151, 0.003821173682808876, 0.027303965762257576, 0.0004238771216478199, 0.014933033846318722, -0.0028038686141371727, 0.0003752097545657307, 0.00652456795796752, 0.010279805399477482, 0.006656440440565348, -0.00809448305517435, 0.033156611025333405, 0.02444044128060341, -0.030142372474074364, 0.006317338906228542, 0.004094338975846767, -0.04071732237935066, 
0.02121269516646862, 0.008471262641251087, -0.032001152634620667, 0.0013022447237744927, 0.016013136133551598, 0.026374576613307, -0.014694406650960445, 0.023159390315413475, 0.0020424597896635532, 0.013664542697370052, 0.0057458896189928055, 0.0041539957746863365, 0.005303173791617155, 0.0003883577883243561, 0.004342385567724705, -0.01690484769642353, 0.001676669460721314, 0.020697763189673424, 0.005133622791618109, 0.00046273251064121723, -0.005651694722473621, 0.009808829985558987, 0.00202676048502326, 0.009896745905280113, -0.017884474247694016, 0.00862825382500887, 0.020308423787355423, -0.007994008250534534, 0.005127343349158764, -0.0029388812836259604, -0.0006303209811449051, 0.011818322353065014, -0.018399406224489212, -0.012025550939142704, 0.008678491227328777, 0.00016974708705674857, -0.013677101582288742, 0.01983116939663887, -0.007918652147054672, -0.03245328739285469, 0.021162457764148712, -0.008044245652854443, -0.0002005566784646362, 0.0036076651886105537, -0.029187863692641258, -0.03569359332323074, -0.028961796313524246, 0.0027096737176179886, 0.016364796087145805, -0.0071022966876626015, -0.005174440797418356, -0.01400364376604557, -0.023787355050444603, -0.024566033855080605, 0.011648771353065968, 0.005579478573054075, -0.029012033715844154, 0.010857533663511276, -0.0049420930445194244, -0.025093525648117065, -0.0024961652234196663, -0.0030016780365258455, 0.010694262571632862, -0.010681703686714172, -0.015724271535873413, -0.003249724628403783, -0.01731930486857891, 0.006876228842884302, -0.009601601399481297, -0.009821389801800251, 0.012176262214779854, 0.02582196518778801, -0.010185610502958298, -0.0005722341011278331, 0.02747979573905468, -0.019793491810560226, -0.0016405613860115409, 0.017005322501063347, 0.0072027710266411304, -0.02220488153398037, 0.0006103045307099819, -0.007912373170256615, -0.004559034015983343, -0.0010808866936713457, 0.0038023346569389105, 0.01679181307554245, 0.01718115247786045, 0.018713390454649925, 
0.004100618418306112, 0.017859356477856636, -0.026022914797067642, -0.016013136133551598, -0.028811084106564522, -0.0023705719504505396, -0.030343322083353996, 0.003858851734548807, 0.013890610076487064, 0.011969033628702164, -0.002329754177480936, -0.0014223431935533881, 0.01764584705233574, 0.016276881098747253, 0.02692718617618084, 0.0021821821574121714, -0.020923830568790436, -0.02855989895761013, -0.007510474417358637, -0.002681415295228362, 0.008326830342411995, -0.01458137296140194, -0.0007727907504886389, 0.01720627024769783, 0.032604001462459564, 0.004417741671204567, 0.03554287925362587, 0.003953046631067991, -0.0322021022439003, -0.02505584806203842, 0.009231101721525192, 0.012659796513617039, 0.009563923813402653, -0.016842050477862358, -0.005513542331755161, 0.0041414364241063595, -0.014405542053282261, 0.02916274592280388, 0.012163703329861164, -0.008502661250531673, 0.010185610502958298, 0.028710609301924706, -0.0177839994430542, 0.00950112659484148, 0.028384067118167877, 0.014744644053280354, -0.009444610215723515, 0.005055127199739218, 0.02803240716457367, -0.0009089809027500451, 0.0004933458403684199, -0.014405542053282261, -0.008433585055172443, 0.0012111896649003029, -0.031498778611421585, 0.0004556679050438106, -0.021388525143265724, 0.019605102017521858, 0.03569359332323074, 0.013689660467207432, 0.01161109283566475, -0.0064680506475269794, -0.005783567670732737, 0.009205983020365238, -0.013413355685770512, -0.001551076304167509, -0.013488711789250374, 0.003160239430144429, -0.005114783998578787, -0.012408610433340073, -0.003158669453114271, 0.0335836261510849, -0.026274101808667183, 0.009319016709923744, 0.0005424057017080486, -0.0011326938401907682, -0.008841762319207191, -0.005205838941037655, 0.0030566249042749405, -0.013036576099693775, -0.022443508729338646, -0.0017865635454654694, 0.011830881237983704, 0.007096016779541969, -0.01605081371963024, -0.024038542062044144, -0.005039427895098925, -0.0011122849537059665, 
-0.0032779830507934093, -0.008571737445890903, 0.0041194576770067215, 0.014292508363723755, -0.02342313528060913, 0.015485644340515137, -0.002646877197548747, 0.020722880959510803, 0.00680087273940444, -0.009865347295999527, -0.00600649556145072, 0.021036865189671516, 0.020371221005916595, -0.0016107329865917563, -0.019291117787361145, 0.013287762179970741, -0.004006423521786928, -0.00762350857257843, -0.015334932133555412, 0.0008681631297804415, -0.008295431733131409, -0.0007543442770838737, 0.011523177847266197, -0.0027818898670375347, 0.0037332584615796804, 0.01304913591593504, -0.005237237084656954, -0.00340043636970222, 0.016691338270902634, 0.02612338960170746, -0.002295216079801321, -0.008333110250532627, -0.009683237411081791, -0.022305356338620186, -0.00935041531920433, 0.06972935795783997, -0.001001605880446732, -0.0031319810077548027, 0.019617659971117973, -0.019014813005924225, -0.012848186306655407, -0.04051637277007103, -0.018223576247692108, -0.01665366068482399, -0.0031979172490537167, -0.004436580464243889, 0.004458559211343527, -0.013802695088088512, 0.01994420401751995, 0.013551508076488972, 0.009080389514565468, 0.00340043636970222, -0.027052778750658035, -0.011341067962348461, 0.0048667374067008495, -0.0151088647544384, 0.010110254399478436, 0.00046077012666501105, 0.009990940801799297, 0.0205470509827137, 0.008307991549372673, 0.00277247023768723, 0.01720627024769783, -0.007460237015038729, -0.0038965295534580946, 0.008483821526169777, -0.00610383041203022, 0.01745745725929737, 0.008778966031968594, 0.013312880881130695, 0.014078999869525433, 0.0029043431859463453, 0.03780356049537659, 0.008464982733130455, -0.005384809337556362, 0.011868558824062347, 0.005287474486976862, -0.006179186515510082, -0.006009635515511036, 0.0142673896625638, 0.0023344638757407665, -0.0057710083201527596, 0.028660371899604797, -0.0014239131705835462, -0.028685491532087326, 0.023121710866689682, 0.007742822170257568, -0.009218541905283928, -0.023071475327014923, 
-0.01187483873218298, 0.018072864040732384, 0.00027355772908777, -0.013928287662565708, -0.010844974778592587, -0.017796559259295464, -0.014744644053280354, -0.01513398252427578, -0.014694406650960445, -0.001071467180736363, -0.031172236427664757, -0.019190644845366478, -0.02369944006204605, 0.002376851625740528, -0.007466516923159361, 0.001561280689202249, -0.0048981355503201485, -0.03863247483968735, -0.0125404829159379, -0.0036516229156404734, 0.01909017004072666, 0.0004517431079875678, -0.001898027490824461, -0.006486889906227589, 0.004044101573526859, 0.013802695088088512, -0.006047313567250967, -0.029012033715844154, 0.011271991766989231, -0.006292220205068588, 0.015611236914992332, 0.0031727987807244062, 0.00666272034868598, 0.009111788123846054, -0.01690484769642353, 0.029589762911200523, 0.008521500043570995, 0.015071186237037182, 0.026299219578504562, 0.0003149642434436828, 0.018650593236088753, 0.004537055268883705, 0.0050582666881382465, 0.001783423707820475, -0.019906524568796158, -1.6802998288767412e-05, -0.008307991549372673, -0.01179320365190506, -0.0020785678643733263, 0.004810220096260309, -0.0034695127978920937, 0.01676669530570507, 0.01690484769642353, 0.01568659394979477, -0.015372609719634056, -0.0016327118501067162, 0.034211594611406326, -0.024088779464364052, 0.016251763328909874, -0.0064806099981069565, 0.0001965337578440085, 0.013438474386930466, 0.021564355120062828, 0.03504050895571709, -0.004694046452641487, -0.03443766012787819, -0.005557499825954437, -0.025281915441155434, 0.008490101434290409, 0.0157996267080307, 0.009130626916885376, 0.0012418029364198446, -0.0018619195325300097, -0.022192321717739105, -0.006907626986503601, -0.0032748430967330933, -0.005548080429434776, 0.019542304798960686, -0.006304779555648565, -0.0023250444792211056, -0.015309813432395458, -0.006499449256807566, -0.018324051052331924, 0.005968817975372076, -0.006851110141724348, -0.017859356477856636, -0.02425205148756504, -0.0063236188143491745, 
-0.006568525452166796, -0.013890610076487064, -0.04551498219370842, -0.044786542654037476, -0.016163846477866173, 0.00394362723454833, -0.017444897443056107, 0.016364796087145805, -0.003949906677007675, -0.000322813808452338, -0.019981881603598595, -0.010625186376273632, -0.006342457607388496, -0.01692996546626091, -0.0006311059114523232, -0.006116389762610197, 0.027002541348338127, 0.027931932359933853, 0.025369830429553986, 0.013978525064885616, 0.0114603815600276, 0.006474330555647612, -0.008559177629649639, -0.01662854291498661, 0.004559034015983343, -0.012094627134501934, -0.014204593375325203, 0.012710033915936947, 0.020999185740947723, 0.028459424152970314, -0.007026940584182739, 0.009595322422683239, 0.0048447586596012115, -0.008188677951693535, -0.020873593166470528, 0.0010173050686717033, -0.013928287662565708, -0.014405542053282261, -0.020974067971110344, -0.0010471334680914879, 0.002153923735022545, 0.010958008468151093, 0.011234313249588013, -0.015548440627753735, 0.011812042444944382, 0.008596856147050858, 0.02079823799431324, 0.003050345228984952, 0.026299219578504562, -0.01969301700592041, 0.02742955833673477, 0.007265567779541016, 0.020597288385033607, -0.013162169605493546, 0.007711423560976982, -0.006794593296945095, 0.016917407512664795, 0.009262500330805779, 0.021451322361826897, 0.02770586498081684, -0.012270457111299038, -0.010593787766993046, 0.005884042475372553, 0.026173627004027367, 0.0031162817031145096, 0.007510474417358637, 0.004194813314825296, -0.007862135767936707, 0.0027096737176179886, -0.020195389166474342, -0.004998610354959965, -0.013501270674169064, 0.004712885711342096, -0.012201380915939808, -0.01872594840824604, 0.018625473603606224, -0.011881118640303612, -0.029087388888001442, -0.015787066891789436, -0.0003167304093949497, 0.048001728951931, 0.005959398113191128, 0.012100907042622566, 0.01939159259200096, 0.0028556757606565952, -0.005940559320151806, 0.02068520337343216, -0.018675711005926132, -0.005522961728274822, 
0.039813049137592316, 0.0015455815009772778, -0.0063236188143491745, -0.02416413463652134, 0.004684627056121826, 0.004078639671206474, -0.015071186237037182, -0.024051101878285408, 0.008050525560975075, 0.028760846704244614, -0.002221429953351617, -0.014619050547480583, 0.006113249808549881, -0.033131491392850876, 0.02471674606204033, 0.003482071915641427, -0.021363407373428345, -0.015058627352118492, 0.012440008111298084, -0.023071475327014923, 0.0088919997215271, -0.0305191520601511, 0.02720349095761776, 0.03413623571395874, -0.005836945027112961, -0.006668999791145325, 0.003796054981648922, 0.00045880774268880486, -0.0013477721950039268, -0.007686304859817028, 0.02833382971584797, -0.007058338727802038, 0.0036641822662204504, 0.01891433820128441, -0.0021272350568324327, -0.02079823799431324, -0.005208978895097971, -0.012320694513618946, 0.022581661120057106, -0.017331864684820175, 0.005482144188135862, -0.011899957433342934, 0.011328508146107197, -0.003271703375503421, 0.012075788341462612, -0.013790135271847248, -0.00652456795796752, -0.006210584659129381, -0.0016672499477863312, 0.02665088139474392, 2.0819775272684637e-06, -0.001816391944885254, -0.01485767774283886, -0.017984949052333832, -0.024503236636519432, -0.04712257534265518, -0.004483677912503481, 0.0015801197150722146, -0.029589762911200523, -0.004659508354961872, -0.005466444883495569, 0.009551363997161388, 0.02565869502723217, 0.016779253259301186, -0.012358373031020164, 0.012553042732179165, 0.023247305303812027, -0.0181733388453722, -0.0057301907800138, 0.018688270822167397, 0.015975456684827805, -0.02584708482027054, 0.016251763328909874, -0.0035699873697012663, -0.019178085029125214, 0.004577872809022665, -0.014104118570685387, -0.004160275217145681, 0.011585974134504795, 0.013388236984610558, 0.00026119465474039316, -0.016113610938191414, 0.006348737049847841, 0.02936369553208351, 0.016867170110344887, 0.009036432020366192, -0.007322084624320269, -0.003296822076663375, 0.037050001323223114, 
-0.014217152260243893, 0.0041539957746863365, -0.007334643974900246, -0.012697474099695683, 0.019655339419841766, -0.007052059285342693, -0.007667466066777706, -0.0007700434071011841, -0.03576894849538803, -0.003962466027587652, -0.0019985020626336336, 0.008138440549373627, 0.014342745766043663, -0.015837304294109344, -0.03210162743926048, -0.0056108771823346615, -0.016703898087143898, 0.00034538135514594615, 0.009984660893678665, -0.01095172856003046, 0.001340707647614181, 0.013840372674167156, 0.016716457903385162, 0.00038973146001808345, 0.005940559320151806, -0.011535737663507462, -0.01093916967511177, -0.016314558684825897, -0.02427716925740242, -0.012634677812457085, -0.0012151143746450543, 0.041244812309741974, 0.006957864388823509, -0.018110541626811028, -0.0088919997215271, -0.005127343349158764, -0.02747979573905468, -0.03227745741605759, -0.002741072094067931, -0.0029655699618160725, -0.0056234365329146385, 0.016741575673222542, -0.008527779951691628, 0.022016491740942, 0.006263962015509605, -0.0022748070769011974, -0.00584008451551199, -0.019630219787359238, 0.011824601329863071, 0.004703465849161148, 0.01513398252427578, -0.000919970334507525, -0.02328498288989067, 0.002163343131542206, 0.0026296081487089396, 0.011912517249584198, -0.0042073726654052734, 0.00936297420412302, -0.009331576526165009, -0.010487033985555172, -0.014355304650962353, -0.0040284027345478535, -0.002763050841167569, 0.007253008428961039, 0.016364796087145805, -0.0157996267080307, 0.001935705542564392, 0.0050582666881382465, -0.003422415116801858, 0.003422415116801858, -0.00468148710206151, 0.008433585055172443, -0.015360050834715366, 0.015837304294109344, -0.019981881603598595, -0.006320478860288858, 0.003088023280724883, -0.02336033806204796, 0.015159101225435734, -0.025420067831873894, 0.00652456795796752, 0.0294390507042408, 0.01068798266351223, 0.00756699126213789, -0.0008603135356679559, -0.0018933177925646305, 0.0024160996545106173, 0.005488423630595207, 
-0.00935041531920433, -0.011315949261188507, -2.2285437808022834e-05, -0.007372322026640177, -0.021149897947907448, -0.02966511808335781, 0.0021617733873426914, -0.025721492245793343, -0.014330185949802399, 0.001420773332938552, 0.03476420417428017, 0.015787066891789436, -0.026550406590104103, -0.01319984719157219, -0.023159390315413475, 0.013011457398533821, 0.009406931698322296, -0.013388236984610558, 0.001562850666232407, -0.00962672010064125, -0.0042324913665652275, 0.021916016936302185, -0.01762072928249836, 0.005290614441037178, 0.018537558615207672, 0.016176406294107437, -0.004251330625265837, 0.039260439574718475, 0.2337038666009903, -0.012791668996214867, 0.0035228899214416742, 0.027354203164577484, 0.02217976190149784, 0.013551508076488972, 0.010210729204118252, 0.0017080678371712565, 0.006982983089983463, 0.0028289873152971268, -0.01159853395074606, 0.0012826207093894482, -0.00470974575728178, 0.006756915245205164, 0.005450745578855276, 0.00941949151456356, -0.02107454277575016, -0.019793491810560226, -0.030921051278710365, -0.024503236636519432, 0.006279660854488611, -0.03295566141605377, -0.025256795808672905, -0.014305067248642445, 0.017495134845376015, -0.0072969659231603146, -0.008446143940091133, 0.0011868559522554278, 0.03363386541604996, 0.009746033698320389, 0.004763122648000717, -0.011573415249586105, 0.011228034272789955, 0.011246873065829277, -0.014367864467203617, -0.019567424431443214, 0.03604525327682495, 0.013363118283450603, 0.03084569424390793, 0.003871411085128784, -0.016553185880184174, -0.025068406015634537, -0.004518216010183096, -0.017633287236094475, 0.0027238030452281237, 0.011096160858869553, 0.007246728986501694, -0.026173627004027367, 0.018123101443052292, 0.033985525369644165, 0.004656368400901556, 0.007014381233602762, 0.02300867810845375, 0.044032983481884, 0.00041406514355912805, -0.015950338914990425, 0.008653372526168823, 0.016666220501065254, -0.005259216297417879, -0.000553002639207989, 0.0005902881384827197, 
0.03185044229030609, -2.5854542400338687e-05, 0.03943626955151558, -0.023598965257406235, 0.007472796365618706, -0.02519400045275688, -0.002959290286526084, 0.003968745935708284, -0.0035166102461516857, -0.01667878031730652, -0.0014144936576485634, 0.00603475421667099, -0.00840846635401249, -0.029514405876398087, -0.009519966319203377, 0.010499592870473862, 0.0036798813380301, 0.03335756063461304, 0.0274546779692173, -0.009388092905282974, -0.0015463664894923568, -0.009859067387878895, -0.02010747417807579, -0.038657594472169876, -0.029037151485681534, 0.0022512583527714014, -0.00729068648070097, -0.012107186019420624, -0.023372897878289223, -0.014531135559082031, -0.010254686698317528, 0.001805402571335435, -0.006069292314350605, 0.023686882108449936, 0.026198744773864746, -0.0034569534473121166, 0.010612627491354942, -0.02091127075254917, 0.012364652007818222, -0.005510402377694845, -0.00246790680103004, 7.820140308467671e-05, 0.0090866694226861, 0.0022292796056717634, 0.009903025813400745, -0.0071148560382425785, 0.0007563066901639104, -0.00300324778072536, -0.01563635654747486, -0.022280236706137657, -0.0016735296230763197, -0.006807152647525072, -0.01108988095074892, -0.00680087273940444, 0.00976487249135971, -0.005080245900899172, -0.00265472661703825, -0.0003416528052184731, -0.008885719813406467, -0.003959326073527336, -0.013915728777647018, -0.014794881455600262, 0.002414529677480459, -0.006169767118990421, -0.014807440340518951, -0.006888788193464279, 0.00915574561804533, 0.014531135559082031, -0.030016779899597168, 0.00854661874473095, -0.0002739502233453095, 0.021237812936306, 0.008113321848213673, -0.006989262532442808, 0.007334643974900246, 0.016992762684822083, -0.0028666651342064142, -0.016879728063941002, 0.0035448686685413122, -0.0021131059620529413, -0.010003499686717987, 0.0088919997215271, -0.0073534827679395676, -0.014920474030077457, -0.01649039052426815, 0.03112199902534485, -0.027555152773857117, -0.010970567353069782, 0.002863525412976742, 
-0.04272681474685669, -0.018185898661613464, -0.013903168961405754, -0.0035385889932513237, 0.027931932359933853, -0.020735440775752068, -0.019680457189679146, -0.02485489845275879, -0.006210584659129381, 0.009896745905280113, -0.012502805329859257, 0.009645558893680573, 0.02996654249727726, -0.007862135767936707, -0.038657594472169876, -0.013149609789252281, -0.16045789420604706, 0.014430660754442215, 0.011523177847266197, -0.012760271318256855, 0.02066008374094963, 0.015171661041676998, 0.024641389027237892, 0.004643809515982866, -0.009067830629646778, 0.018575238063931465, 0.009859067387878895, -0.01288586389273405, -0.014945592731237411, -0.014305067248642445, 0.005290614441037178, -0.01612616889178753, -0.008182398043572903, 0.012352093122899532, 0.029740475118160248, 0.028685491532087326, 0.035115864127874374, -0.007334643974900246, -0.0009168304968625307, -0.00477882195264101, 0.013714779168367386, -0.007811898365616798, 0.0007111715967766941, 0.03393528610467911, -0.010053737089037895, -0.007020661141723394, -0.016000576317310333, -0.02888644114136696, 0.02582196518778801, 0.004559034015983343, 0.021966254338622093, -0.006430373061448336, 0.012446288019418716, -0.042601220309734344, -0.01635223627090454, 0.024930253624916077, 0.032855186611413956, 0.018625473603606224, 0.019228322431445122, -0.013526389375329018, -0.015410288237035275, -0.005224677734076977, 0.020609848201274872, 0.015083746053278446, 0.01621408388018608, -0.0070897373370826244, 0.008119601756334305, -0.003028366481885314, -0.0005110073834657669, -0.00622942391782999, 0.019981881603598595, 0.029489288106560707, 0.006888788193464279, 0.02452835626900196, 0.0024773261975497007, -0.008458703756332397, -0.0002564848982729018, -0.012370931915938854, -0.006254542153328657, -0.004706605803221464, 0.006050453521311283, -0.005224677734076977, 0.0014686556532979012, 0.009689517319202423, -0.00024942029267549515, 0.018349168822169304, -0.008910838514566422, -0.023084033280611038, -0.01039911899715662, 
-0.02554566040635109, 0.006825991440564394, 0.008389626629650593, -0.03378457576036453, 0.03109688125550747, -0.0114603815600276, -0.014832559041678905, -0.008295431733131409, 0.021313169971108437, -0.0024836058728396893, 0.007070898078382015, -0.023121710866689682, 0.00038973146001808345, 0.0032465846743434668, 0.012653516605496407, -0.03134806826710701, -0.027002541348338127, 0.007453957572579384, -0.017984949052333832, -0.014166914857923985, -0.018022626638412476, 0.00543504673987627, 0.015598678030073643, 0.012534203007817268, 0.01623920351266861, -0.01571171171963215, -0.019893966615200043, -0.002260677982121706, 0.017143474891781807, -0.016013136133551598, 0.011190355755388737, 0.03049403429031372, -0.005469584837555885, 0.017532814294099808, 0.007981449365615845, 0.022405831143260002, -0.03169972822070122, -0.03217698261141777, 0.006631322205066681, 0.014694406650960445, 0.026148507371544838, 0.01623920351266861, 0.026449931785464287, -0.013262644410133362, -0.01734442450106144, 0.0048321993090212345, -0.013099372386932373, 0.052045829594135284, 0.006223144009709358, -0.04420881345868111, -0.0010887362295761704, 0.0028792244847863913, 0.0002376459160586819, -0.08409722149372101, -0.03438742458820343, 0.004712885711342096, 0.014405542053282261, -0.001269276486709714, 0.03835617005825043, 0.004216792527586222, -0.0017881334060803056, 0.0019875126890838146, 0.005535521078854799, 0.008923398330807686, -0.03049403429031372, -0.01787191443145275, -0.010568669065833092, 0.04071732237935066, -0.02720349095761776, 0.012779110111296177, 0.0010651875054463744, -0.033960405737161636, 0.011604813858866692, -0.008678491227328777, 0.007868414744734764, 0.0028478263411670923, -0.001828951295465231, -0.02222999930381775, 0.012201380915939808, -0.022971000522375107, 0.03127271309494972, 0.0016358516877517104, -0.004926394205540419, -0.016754135489463806, -0.024063661694526672, -0.014166914857923985, -0.008307991549372673, -0.004449139814823866, 0.013626864179968834, 
-0.04099362716078758, 0.03102152608335018, 0.013413355685770512, -0.04106898233294487, 0.027253728359937668, 0.010593787766993046, -0.006461771205067635, -0.05581362545490265, -0.012659796513617039, 0.017168592661619186, -0.0019137266790494323, 0.03777844086289406, 0.01870083063840866, -0.009048991836607456, -0.03222722187638283, -0.03139830380678177, -0.021916016936302185, 0.010832414962351322, 0.01048075407743454, -0.0017535953084006906, 0.008056805469095707, 0.028911558911204338, -0.009921864606440067, 0.011962753720581532, 0.013375678099691868, 0.006147787906229496, -0.02038377895951271, 0.01704300008714199, -0.012559321708977222, 0.018575238063931465, -0.012697474099695683, 0.0232221856713295, 0.0201577115803957, 0.00969579629600048, -0.01563635654747486, 0.028409186750650406, 0.006245122756808996, 0.010053737089037895, -0.03792915120720863, -0.005469584837555885, -0.015837304294109344, -0.020258186385035515, 0.006606203503906727, -0.005811826325953007, -0.03310637176036835, -0.020735440775752068, 0.009218541905283928, -0.008182398043572903, 0.026022914797067642, 0.01360174547880888, -0.01054983027279377, 0.0057710083201527596, -0.015347491018474102, -0.03544240817427635, -0.03054427169263363, 0.01035516057163477, 0.006989262532442808, -0.011510618962347507, 0.007529313676059246, 0.014757202938199043, -0.01180576253682375, -0.0010895212180912495, 0.01607593148946762, 0.010807296261191368, -0.02485489845275879, -0.011824601329863071, -0.0650070533156395, 0.032352812588214874, -0.016113610938191414, -0.019353915005922318, 0.0009600031771697104, -0.022016491740942, 0.006687839049845934, -0.01512142363935709, 0.002095836913213134, -0.00881036464124918, -0.018675711005926132, -0.0048635974526405334, -0.002117815660312772, 0.012615839019417763, -0.02798216976225376, 0.0032685634214431047, 0.020182831212878227, 0.014192033559083939, 0.005880902521312237, -0.002144504338502884, 0.00037756460369564593, -0.03245328739285469, -0.016502948477864265, 0.001036928966641426, 
-0.023774797096848488, 0.011177796870470047, -0.0314234234392643, 0.0005577123956754804, -0.002411389723420143, -0.00874128844588995, 0.0064366525039076805, -0.042576100677251816, 0.0064021144062280655, 0.022682135924696922, 0.0008108612382784486, -0.010122813284397125, 0.006794593296945095, 0.015083746053278446, 0.007366042118519545, 0.020333541557192802, -0.030895931646227837, -0.033181726932525635, 0.015611236914992332, -0.010832414962351322, 0.0036013855133205652, 0.004634389653801918, -0.005340851843357086, -0.0033941566944122314, 0.030418677255511284, 0.009808829985558987, -0.003227745648473501, 0.029263220727443695, -0.028409186750650406, -0.013036576099693775, -0.012841906398534775, -0.008351949043571949, -0.017143474891781807, -0.0003757984668482095, -0.008333110250532627, -0.04493725299835205, 0.02248118631541729, 0.00018220828496851027, 0.02966511808335781, -0.007868414744734764, 0.005246656946837902, 0.010813576169312, -0.0205470509827137, -0.002692404668778181, -0.014619050547480583, -0.02234303392469883, -0.006920186337083578, -0.00593741936609149, 0.00517130084335804, 0.005312593188136816, 0.019630219787359238, 0.020647525787353516, -0.003180648200213909, 0.0170932374894619, -0.040616847574710846, 0.037728201597929, 0.020258186385035515, 0.021112220361828804, -0.030318202450871468, -0.0004332966054789722, 0.034563254565000534, 0.006088131107389927, -0.006254542153328657, -0.01731930486857891, -0.009205983020365238, 0.00310058263130486, -0.03607037290930748, -0.0266759991645813, 0.006939025595784187, 0.024289729073643684, -0.0038368727546185255, 0.00473800441250205, 0.017520254477858543, 0.00783073715865612, 0.017658406868577003, 0.023372897878289223, 0.0023721419274806976, 0.009438330307602882, -0.016000576317310333, -0.004552754107862711, -0.008785245940089226, 0.005924860015511513, -0.006289080251008272, -0.037878915667533875, -0.007328364532440901, 0.012898423708975315, 0.011812042444944382, 0.029790712520480156, -0.006361296400427818, 
0.015046067535877228, -0.02499305084347725, -0.00584008451551199, 0.02609827183187008, -0.0014945593429729342, -0.028434304520487785, 0.04242539033293724, -0.008973635733127594, 0.0035794067662209272, 0.02770586498081684, -0.015071186237037182, 0.008433585055172443, 0.002970279660075903, -0.007793059106916189, -0.0035228899214416742, 0.01905249059200287, -0.0050017498433589935, 0.011717847548425198, -0.010041178204119205, -0.016452711075544357, -0.015410288237035275, 0.0009136906592175364, 0.01372733898460865, -0.008351949043571949, 0.024339966475963593, -0.0161889661103487, 0.05631599947810173, -0.012722592800855637, -0.0041634151712059975, 0.02287052571773529, 0.014292508363723755, 0.020735440775752068, 0.024427881464362144, 0.018072864040732384, -0.008559177629649639, -0.02024562656879425, -0.0038494321051985025, 0.006756915245205164, -0.0008289152756333351, -0.02259422093629837, -0.011127559468150139, 0.004289008677005768, -0.01591266132891178, 0.004398902412503958, -0.019630219787359238, 0.01292354241013527, 0.018324051052331924, 0.008942237123847008, 0.0243902038782835, 0.013212407007813454, -0.018072864040732384, -0.019931644201278687, 0.00423877127468586, 0.0012331684119999409, -0.018349168822169304, -0.016151288524270058, 0.017243949696421623, 0.0007649412145838141, -0.0012857605470344424, -0.01260955911129713, 0.006656440440565348, 0.002959290286526084, -0.007108576130121946, -0.009513686411082745, -0.0036422032862901688, 0.010562390089035034, 0.02289564348757267, 0.0006240413058549166, -0.018097983673214912, -0.03295566141605377, 0.006813432089984417, 0.002750491490587592, -0.02936369553208351, -0.005384809337556362, -0.02317194826900959 ] } ], "model": "ada", "usage": { "prompt_tokens": 3, "total_tokens": 3 } } ================================================ FILE: libs/kotaemon/tests/resources/embedding_openai_batch.json ================================================ { "object": "list", "data": [ { "object": "embedding", "index": 0, "embedding": [ 
0.006555966101586819, 0.003670461941510439, -0.011642491444945335, -0.026776473969221115, -0.012383491732180119, -0.0014341175556182861, -0.013375678099691868, 0.009356695227324963, -0.006364436354488134, -0.0294390507042408, 0.023950627073645592, 0.0029859787318855524, -0.023234745487570763, -0.009205983020365238, 0.006744355894625187, 0.0011790062999352813, 0.02607315219938755, -0.018437083810567856, 0.008904559537768364, 0.009620440192520618, -0.01306169480085373, -0.0011358336778357625, 0.007253008428961039, 0.00875384733080864, -0.012710033915936947, 0.0037206991109997034, 0.005419347435235977, -0.017243949696421623, 0.036246202886104584, -0.0266759991645813, 0.012647237628698349, -0.008552898652851582, -0.00762350857257843, -0.012546762824058533, 0.007083457428961992, -0.014078999869525433, 0.0048761568032205105, -0.013689660467207432, 0.018211016431450844, -0.014367864467203617, 0.008307991549372673, 0.006022194866091013, 0.005457025486975908, -0.006320478860288858, -0.03609549254179001, 0.012986338697373867, 0.00017592862423043698, -0.016427593305706978, -0.004734864458441734, 0.0257717277854681, 0.01944182999432087, -0.00211938563734293, -0.0147195253521204, 0.01332544069737196, -0.017670966684818268, 0.0013956546317785978, -0.04024006798863411, 0.024754423648118973, 0.032051388174295425, -0.018713390454649925, 0.011265711858868599, 0.012370931915938854, -0.013576626777648926, 0.0040974789299070835, -0.002153923735022545, 0.009915584698319435, 0.006637601647526026, 0.011353626847267151, -0.019919084385037422, 0.030142372474074364, 0.01966789737343788, -0.008213796652853489, -0.004213652573525906, -0.009162025526165962, 0.012446288019418716, 0.008979915641248226, -0.02098662778735161, 0.003113141981884837, 0.0018838982796296477, -0.0008289152756333351, 0.02722861059010029, -0.03355850651860237, -0.004408321809023619, 0.017859356477856636, 0.019893966615200043, -0.0014137086691334844, -0.0021602034103125334, 0.017959831282496452, -0.0114603815600276, 
-0.020069796591997147, 0.007604669313877821, 0.001281835837289691, 0.01132222916930914, 0.006668999791145325, -0.01743233948945999, 0.0048761568032205105, 0.0016923686489462852, 0.02356128767132759, -0.005796127021312714, -0.041470881551504135, -0.01292354241013527, -0.005824385676532984, -0.010700542479753494, -0.017005322501063347, -0.007950050756335258, 0.011560855433344841, -0.005485283676534891, -0.0002590360236354172, 0.028434304520487785, 0.007447678130120039, -0.026198744773864746, 0.009306457825005054, -0.00500488979741931, -0.03107176162302494, -0.0007555217016488314, -0.008056805469095707, 0.007479076273739338, 0.003978165332227945, -0.005984516814351082, -0.02778122015297413, 0.015272135846316814, 0.029338575899600983, 0.010964288376271725, -0.023548727855086327, 0.007014381233602762, 0.0014137086691334844, -0.03380969539284706, -0.016741575673222542, -0.005230957642197609, -0.004794521257281303, 0.0315992534160614, 0.0041100382804870605, 0.010644025169312954, -0.0014294078573584557, -0.035367049276828766, 0.0246916264295578, -0.007096016779541969, 0.02096150815486908, -0.017030440270900726, -0.011906237341463566, 0.006273381412029266, 0.021237812936306, 0.01757049188017845, -0.013224965892732143, -0.005092805251479149, 0.01288586389273405, 0.004486817866563797, -0.016339678317308426, 0.008917118422687054, -0.015083746053278446, 0.008377067744731903, 0.0185626782476902, 0.012961219996213913, -0.002535413019359112, 0.0050143091939389706, 0.016025694087147713, 0.004618690814822912, 0.0205470509827137, 0.008000288158655167, -0.013074253685772419, -0.0007264782907441258, -0.0004474258457776159, 0.0033125211484730244, -0.03084569424390793, 0.009400652721524239, 0.015787066891789436, 0.02607315219938755, 0.003689300734549761, -0.013940847478806973, -0.02604803442955017, -0.011454101651906967, 0.007340923883020878, -0.043178949505090714, 0.02660064399242401, -0.01593777909874916, 0.00514304218813777, -0.001305384561419487, 0.022820288315415382, 
-0.012697474099695683, -0.017269067466259003, -0.03024284727871418, 0.011617372743785381, 0.008992474526166916, 0.025897322222590446, 0.00629850011318922, -0.00014511904737446457, 0.009890465997159481, -0.006364436354488134, 0.00384629238396883, -0.017030440270900726, 0.016666220501065254, 0.03333244100213051, 0.021011745557188988, 0.01844964362680912, -0.6892555952072144, -0.007196491584181786, 0.025319593027234077, 0.02024562656879425, 0.006245122756808996, 0.009909304790198803, 0.012176262214779854, 0.027806337922811508, 0.006518288049846888, 0.0274546779692173, -0.02107454277575016, 0.013011457398533821, -0.015083746053278446, -0.010719381272792816, -0.0002598209830466658, -0.00823263544589281, 0.0055763390846550465, -0.012339534237980843, -0.011881118640303612, 0.015196779742836952, -0.010141652077436447, 0.03129782900214195, -0.022104406729340553, -0.014568813145160675, 0.009928143583238125, 0.011730406433343887, 0.0025102945510298014, -0.009739753790199757, -0.006964143831282854, 0.030192609876394272, -0.0021272350568324327, 0.0114603815600276, 9.394961671205238e-05, 0.014154355973005295, 0.06611227244138718, 0.007610949221998453, -0.005463304929435253, 0.008088203147053719, 0.009827669709920883, 0.028986915946006775, -0.019077610224485397, -0.022267676889896393, 0.012559321708977222, -0.01662854291498661, -0.002763050841167569, 0.009205983020365238, 0.012659796513617039, -0.0024051100481301546, 0.0022543983068317175, 0.0017849936848506331, 0.014568813145160675, -0.004851038102060556, 0.010857533663511276, 0.019504627212882042, -0.005343991331756115, 0.0071588135324418545, 0.02151411771774292, 0.011962753720581532, -0.002935741562396288, 0.004521355964243412, 0.0005718416068702936, 0.017168592661619186, -0.0016280021518468857, 0.0038494321051985025, -0.022443508729338646, 0.02381247468292713, -0.021313169971108437, 0.01400364376604557, -0.008929678238928318, -0.012050669640302658, -0.0018368008313700557, -0.007994008250534534, -0.018047746270895004, 
-0.014744644053280354, 0.013965966179966927, 0.028107762336730957, 0.01681693270802498, -0.0033219405449926853, -0.006160347256809473, 0.01966789737343788, 0.010926609858870506, 0.013237525708973408, -0.014066440984606743, -0.01764584705233574, 0.02349849045276642, -0.019780931994318962, -0.030067017301917076, -0.016276881098747253, 0.010430516675114632, 0.011479220353066921, 0.030192609876394272, 0.0033250804990530014, -0.012370931915938854, -0.01980605162680149, 0.02883620373904705, 0.0005384809337556362, -0.011548296548426151, 0.003821173682808876, 0.027303965762257576, 0.0004238771216478199, 0.014933033846318722, -0.0028038686141371727, 0.0003752097545657307, 0.00652456795796752, 0.010279805399477482, 0.006656440440565348, -0.00809448305517435, 0.033156611025333405, 0.02444044128060341, -0.030142372474074364, 0.006317338906228542, 0.004094338975846767, -0.04071732237935066, 0.02121269516646862, 0.008471262641251087, -0.032001152634620667, 0.0013022447237744927, 0.016013136133551598, 0.026374576613307, -0.014694406650960445, 0.023159390315413475, 0.0020424597896635532, 0.013664542697370052, 0.0057458896189928055, 0.0041539957746863365, 0.005303173791617155, 0.0003883577883243561, 0.004342385567724705, -0.01690484769642353, 0.001676669460721314, 0.020697763189673424, 0.005133622791618109, 0.00046273251064121723, -0.005651694722473621, 0.009808829985558987, 0.00202676048502326, 0.009896745905280113, -0.017884474247694016, 0.00862825382500887, 0.020308423787355423, -0.007994008250534534, 0.005127343349158764, -0.0029388812836259604, -0.0006303209811449051, 0.011818322353065014, -0.018399406224489212, -0.012025550939142704, 0.008678491227328777, 0.00016974708705674857, -0.013677101582288742, 0.01983116939663887, -0.007918652147054672, -0.03245328739285469, 0.021162457764148712, -0.008044245652854443, -0.0002005566784646362, 0.0036076651886105537, -0.029187863692641258, -0.03569359332323074, -0.028961796313524246, 0.0027096737176179886, 0.016364796087145805, 
-0.0071022966876626015, -0.005174440797418356, -0.01400364376604557, -0.023787355050444603, -0.024566033855080605, 0.011648771353065968, 0.005579478573054075, -0.029012033715844154, 0.010857533663511276, -0.0049420930445194244, -0.025093525648117065, -0.0024961652234196663, -0.0030016780365258455, 0.010694262571632862, -0.010681703686714172, -0.015724271535873413, -0.003249724628403783, -0.01731930486857891, 0.006876228842884302, -0.009601601399481297, -0.009821389801800251, 0.012176262214779854, 0.02582196518778801, -0.010185610502958298, -0.0005722341011278331, 0.02747979573905468, -0.019793491810560226, -0.0016405613860115409, 0.017005322501063347, 0.0072027710266411304, -0.02220488153398037, 0.0006103045307099819, -0.007912373170256615, -0.004559034015983343, -0.0010808866936713457, 0.0038023346569389105, 0.01679181307554245, 0.01718115247786045, 0.018713390454649925, 0.004100618418306112, 0.017859356477856636, -0.026022914797067642, -0.016013136133551598, -0.028811084106564522, -0.0023705719504505396, -0.030343322083353996, 0.003858851734548807, 0.013890610076487064, 0.011969033628702164, -0.002329754177480936, -0.0014223431935533881, 0.01764584705233574, 0.016276881098747253, 0.02692718617618084, 0.0021821821574121714, -0.020923830568790436, -0.02855989895761013, -0.007510474417358637, -0.002681415295228362, 0.008326830342411995, -0.01458137296140194, -0.0007727907504886389, 0.01720627024769783, 0.032604001462459564, 0.004417741671204567, 0.03554287925362587, 0.003953046631067991, -0.0322021022439003, -0.02505584806203842, 0.009231101721525192, 0.012659796513617039, 0.009563923813402653, -0.016842050477862358, -0.005513542331755161, 0.0041414364241063595, -0.014405542053282261, 0.02916274592280388, 0.012163703329861164, -0.008502661250531673, 0.010185610502958298, 0.028710609301924706, -0.0177839994430542, 0.00950112659484148, 0.028384067118167877, 0.014744644053280354, -0.009444610215723515, 0.005055127199739218, 0.02803240716457367, -0.0009089809027500451, 
0.0004933458403684199, -0.014405542053282261, -0.008433585055172443, 0.0012111896649003029, -0.031498778611421585, 0.0004556679050438106, -0.021388525143265724, 0.019605102017521858, 0.03569359332323074, 0.013689660467207432, 0.01161109283566475, -0.0064680506475269794, -0.005783567670732737, 0.009205983020365238, -0.013413355685770512, -0.001551076304167509, -0.013488711789250374, 0.003160239430144429, -0.005114783998578787, -0.012408610433340073, -0.003158669453114271, 0.0335836261510849, -0.026274101808667183, 0.009319016709923744, 0.0005424057017080486, -0.0011326938401907682, -0.008841762319207191, -0.005205838941037655, 0.0030566249042749405, -0.013036576099693775, -0.022443508729338646, -0.0017865635454654694, 0.011830881237983704, 0.007096016779541969, -0.01605081371963024, -0.024038542062044144, -0.005039427895098925, -0.0011122849537059665, -0.0032779830507934093, -0.008571737445890903, 0.0041194576770067215, 0.014292508363723755, -0.02342313528060913, 0.015485644340515137, -0.002646877197548747, 0.020722880959510803, 0.00680087273940444, -0.009865347295999527, -0.00600649556145072, 0.021036865189671516, 0.020371221005916595, -0.0016107329865917563, -0.019291117787361145, 0.013287762179970741, -0.004006423521786928, -0.00762350857257843, -0.015334932133555412, 0.0008681631297804415, -0.008295431733131409, -0.0007543442770838737, 0.011523177847266197, -0.0027818898670375347, 0.0037332584615796804, 0.01304913591593504, -0.005237237084656954, -0.00340043636970222, 0.016691338270902634, 0.02612338960170746, -0.002295216079801321, -0.008333110250532627, -0.009683237411081791, -0.022305356338620186, -0.00935041531920433, 0.06972935795783997, -0.001001605880446732, -0.0031319810077548027, 0.019617659971117973, -0.019014813005924225, -0.012848186306655407, -0.04051637277007103, -0.018223576247692108, -0.01665366068482399, -0.0031979172490537167, -0.004436580464243889, 0.004458559211343527, -0.013802695088088512, 0.01994420401751995, 0.013551508076488972, 
0.009080389514565468, 0.00340043636970222, -0.027052778750658035, -0.011341067962348461, 0.0048667374067008495, -0.0151088647544384, 0.010110254399478436, 0.00046077012666501105, 0.009990940801799297, 0.0205470509827137, 0.008307991549372673, 0.00277247023768723, 0.01720627024769783, -0.007460237015038729, -0.0038965295534580946, 0.008483821526169777, -0.00610383041203022, 0.01745745725929737, 0.008778966031968594, 0.013312880881130695, 0.014078999869525433, 0.0029043431859463453, 0.03780356049537659, 0.008464982733130455, -0.005384809337556362, 0.011868558824062347, 0.005287474486976862, -0.006179186515510082, -0.006009635515511036, 0.0142673896625638, 0.0023344638757407665, -0.0057710083201527596, 0.028660371899604797, -0.0014239131705835462, -0.028685491532087326, 0.023121710866689682, 0.007742822170257568, -0.009218541905283928, -0.023071475327014923, -0.01187483873218298, 0.018072864040732384, 0.00027355772908777, -0.013928287662565708, -0.010844974778592587, -0.017796559259295464, -0.014744644053280354, -0.01513398252427578, -0.014694406650960445, -0.001071467180736363, -0.031172236427664757, -0.019190644845366478, -0.02369944006204605, 0.002376851625740528, -0.007466516923159361, 0.001561280689202249, -0.0048981355503201485, -0.03863247483968735, -0.0125404829159379, -0.0036516229156404734, 0.01909017004072666, 0.0004517431079875678, -0.001898027490824461, -0.006486889906227589, 0.004044101573526859, 0.013802695088088512, -0.006047313567250967, -0.029012033715844154, 0.011271991766989231, -0.006292220205068588, 0.015611236914992332, 0.0031727987807244062, 0.00666272034868598, 0.009111788123846054, -0.01690484769642353, 0.029589762911200523, 0.008521500043570995, 0.015071186237037182, 0.026299219578504562, 0.0003149642434436828, 0.018650593236088753, 0.004537055268883705, 0.0050582666881382465, 0.001783423707820475, -0.019906524568796158, -1.6802998288767412e-05, -0.008307991549372673, -0.01179320365190506, -0.0020785678643733263, 0.004810220096260309, 
-0.0034695127978920937, 0.01676669530570507, 0.01690484769642353, 0.01568659394979477, -0.015372609719634056, -0.0016327118501067162, 0.034211594611406326, -0.024088779464364052, 0.016251763328909874, -0.0064806099981069565, 0.0001965337578440085, 0.013438474386930466, 0.021564355120062828, 0.03504050895571709, -0.004694046452641487, -0.03443766012787819, -0.005557499825954437, -0.025281915441155434, 0.008490101434290409, 0.0157996267080307, 0.009130626916885376, 0.0012418029364198446, -0.0018619195325300097, -0.022192321717739105, -0.006907626986503601, -0.0032748430967330933, -0.005548080429434776, 0.019542304798960686, -0.006304779555648565, -0.0023250444792211056, -0.015309813432395458, -0.006499449256807566, -0.018324051052331924, 0.005968817975372076, -0.006851110141724348, -0.017859356477856636, -0.02425205148756504, -0.0063236188143491745, -0.006568525452166796, -0.013890610076487064, -0.04551498219370842, -0.044786542654037476, -0.016163846477866173, 0.00394362723454833, -0.017444897443056107, 0.016364796087145805, -0.003949906677007675, -0.000322813808452338, -0.019981881603598595, -0.010625186376273632, -0.006342457607388496, -0.01692996546626091, -0.0006311059114523232, -0.006116389762610197, 0.027002541348338127, 0.027931932359933853, 0.025369830429553986, 0.013978525064885616, 0.0114603815600276, 0.006474330555647612, -0.008559177629649639, -0.01662854291498661, 0.004559034015983343, -0.012094627134501934, -0.014204593375325203, 0.012710033915936947, 0.020999185740947723, 0.028459424152970314, -0.007026940584182739, 0.009595322422683239, 0.0048447586596012115, -0.008188677951693535, -0.020873593166470528, 0.0010173050686717033, -0.013928287662565708, -0.014405542053282261, -0.020974067971110344, -0.0010471334680914879, 0.002153923735022545, 0.010958008468151093, 0.011234313249588013, -0.015548440627753735, 0.011812042444944382, 0.008596856147050858, 0.02079823799431324, 0.003050345228984952, 0.026299219578504562, -0.01969301700592041, 
0.02742955833673477, 0.007265567779541016, 0.020597288385033607, -0.013162169605493546, 0.007711423560976982, -0.006794593296945095, 0.016917407512664795, 0.009262500330805779, 0.021451322361826897, 0.02770586498081684, -0.012270457111299038, -0.010593787766993046, 0.005884042475372553, 0.026173627004027367, 0.0031162817031145096, 0.007510474417358637, 0.004194813314825296, -0.007862135767936707, 0.0027096737176179886, -0.020195389166474342, -0.004998610354959965, -0.013501270674169064, 0.004712885711342096, -0.012201380915939808, -0.01872594840824604, 0.018625473603606224, -0.011881118640303612, -0.029087388888001442, -0.015787066891789436, -0.0003167304093949497, 0.048001728951931, 0.005959398113191128, 0.012100907042622566, 0.01939159259200096, 0.0028556757606565952, -0.005940559320151806, 0.02068520337343216, -0.018675711005926132, -0.005522961728274822, 0.039813049137592316, 0.0015455815009772778, -0.0063236188143491745, -0.02416413463652134, 0.004684627056121826, 0.004078639671206474, -0.015071186237037182, -0.024051101878285408, 0.008050525560975075, 0.028760846704244614, -0.002221429953351617, -0.014619050547480583, 0.006113249808549881, -0.033131491392850876, 0.02471674606204033, 0.003482071915641427, -0.021363407373428345, -0.015058627352118492, 0.012440008111298084, -0.023071475327014923, 0.0088919997215271, -0.0305191520601511, 0.02720349095761776, 0.03413623571395874, -0.005836945027112961, -0.006668999791145325, 0.003796054981648922, 0.00045880774268880486, -0.0013477721950039268, -0.007686304859817028, 0.02833382971584797, -0.007058338727802038, 0.0036641822662204504, 0.01891433820128441, -0.0021272350568324327, -0.02079823799431324, -0.005208978895097971, -0.012320694513618946, 0.022581661120057106, -0.017331864684820175, 0.005482144188135862, -0.011899957433342934, 0.011328508146107197, -0.003271703375503421, 0.012075788341462612, -0.013790135271847248, -0.00652456795796752, -0.006210584659129381, -0.0016672499477863312, 0.02665088139474392, 
2.0819775272684637e-06, -0.001816391944885254, -0.01485767774283886, -0.017984949052333832, -0.024503236636519432, -0.04712257534265518, -0.004483677912503481, 0.0015801197150722146, -0.029589762911200523, -0.004659508354961872, -0.005466444883495569, 0.009551363997161388, 0.02565869502723217, 0.016779253259301186, -0.012358373031020164, 0.012553042732179165, 0.023247305303812027, -0.0181733388453722, -0.0057301907800138, 0.018688270822167397, 0.015975456684827805, -0.02584708482027054, 0.016251763328909874, -0.0035699873697012663, -0.019178085029125214, 0.004577872809022665, -0.014104118570685387, -0.004160275217145681, 0.011585974134504795, 0.013388236984610558, 0.00026119465474039316, -0.016113610938191414, 0.006348737049847841, 0.02936369553208351, 0.016867170110344887, 0.009036432020366192, -0.007322084624320269, -0.003296822076663375, 0.037050001323223114, -0.014217152260243893, 0.0041539957746863365, -0.007334643974900246, -0.012697474099695683, 0.019655339419841766, -0.007052059285342693, -0.007667466066777706, -0.0007700434071011841, -0.03576894849538803, -0.003962466027587652, -0.0019985020626336336, 0.008138440549373627, 0.014342745766043663, -0.015837304294109344, -0.03210162743926048, -0.0056108771823346615, -0.016703898087143898, 0.00034538135514594615, 0.009984660893678665, -0.01095172856003046, 0.001340707647614181, 0.013840372674167156, 0.016716457903385162, 0.00038973146001808345, 0.005940559320151806, -0.011535737663507462, -0.01093916967511177, -0.016314558684825897, -0.02427716925740242, -0.012634677812457085, -0.0012151143746450543, 0.041244812309741974, 0.006957864388823509, -0.018110541626811028, -0.0088919997215271, -0.005127343349158764, -0.02747979573905468, -0.03227745741605759, -0.002741072094067931, -0.0029655699618160725, -0.0056234365329146385, 0.016741575673222542, -0.008527779951691628, 0.022016491740942, 0.006263962015509605, -0.0022748070769011974, -0.00584008451551199, -0.019630219787359238, 0.011824601329863071, 
0.004703465849161148, 0.01513398252427578, -0.000919970334507525, -0.02328498288989067, 0.002163343131542206, 0.0026296081487089396, 0.011912517249584198, -0.0042073726654052734, 0.00936297420412302, -0.009331576526165009, -0.010487033985555172, -0.014355304650962353, -0.0040284027345478535, -0.002763050841167569, 0.007253008428961039, 0.016364796087145805, -0.0157996267080307, 0.001935705542564392, 0.0050582666881382465, -0.003422415116801858, 0.003422415116801858, -0.00468148710206151, 0.008433585055172443, -0.015360050834715366, 0.015837304294109344, -0.019981881603598595, -0.006320478860288858, 0.003088023280724883, -0.02336033806204796, 0.015159101225435734, -0.025420067831873894, 0.00652456795796752, 0.0294390507042408, 0.01068798266351223, 0.00756699126213789, -0.0008603135356679559, -0.0018933177925646305, 0.0024160996545106173, 0.005488423630595207, -0.00935041531920433, -0.011315949261188507, -2.2285437808022834e-05, -0.007372322026640177, -0.021149897947907448, -0.02966511808335781, 0.0021617733873426914, -0.025721492245793343, -0.014330185949802399, 0.001420773332938552, 0.03476420417428017, 0.015787066891789436, -0.026550406590104103, -0.01319984719157219, -0.023159390315413475, 0.013011457398533821, 0.009406931698322296, -0.013388236984610558, 0.001562850666232407, -0.00962672010064125, -0.0042324913665652275, 0.021916016936302185, -0.01762072928249836, 0.005290614441037178, 0.018537558615207672, 0.016176406294107437, -0.004251330625265837, 0.039260439574718475, 0.2337038666009903, -0.012791668996214867, 0.0035228899214416742, 0.027354203164577484, 0.02217976190149784, 0.013551508076488972, 0.010210729204118252, 0.0017080678371712565, 0.006982983089983463, 0.0028289873152971268, -0.01159853395074606, 0.0012826207093894482, -0.00470974575728178, 0.006756915245205164, 0.005450745578855276, 0.00941949151456356, -0.02107454277575016, -0.019793491810560226, -0.030921051278710365, -0.024503236636519432, 0.006279660854488611, -0.03295566141605377, 
-0.025256795808672905, -0.014305067248642445, 0.017495134845376015, -0.0072969659231603146, -0.008446143940091133, 0.0011868559522554278, 0.03363386541604996, 0.009746033698320389, 0.004763122648000717, -0.011573415249586105, 0.011228034272789955, 0.011246873065829277, -0.014367864467203617, -0.019567424431443214, 0.03604525327682495, 0.013363118283450603, 0.03084569424390793, 0.003871411085128784, -0.016553185880184174, -0.025068406015634537, -0.004518216010183096, -0.017633287236094475, 0.0027238030452281237, 0.011096160858869553, 0.007246728986501694, -0.026173627004027367, 0.018123101443052292, 0.033985525369644165, 0.004656368400901556, 0.007014381233602762, 0.02300867810845375, 0.044032983481884, 0.00041406514355912805, -0.015950338914990425, 0.008653372526168823, 0.016666220501065254, -0.005259216297417879, -0.000553002639207989, 0.0005902881384827197, 0.03185044229030609, -2.5854542400338687e-05, 0.03943626955151558, -0.023598965257406235, 0.007472796365618706, -0.02519400045275688, -0.002959290286526084, 0.003968745935708284, -0.0035166102461516857, -0.01667878031730652, -0.0014144936576485634, 0.00603475421667099, -0.00840846635401249, -0.029514405876398087, -0.009519966319203377, 0.010499592870473862, 0.0036798813380301, 0.03335756063461304, 0.0274546779692173, -0.009388092905282974, -0.0015463664894923568, -0.009859067387878895, -0.02010747417807579, -0.038657594472169876, -0.029037151485681534, 0.0022512583527714014, -0.00729068648070097, -0.012107186019420624, -0.023372897878289223, -0.014531135559082031, -0.010254686698317528, 0.001805402571335435, -0.006069292314350605, 0.023686882108449936, 0.026198744773864746, -0.0034569534473121166, 0.010612627491354942, -0.02091127075254917, 0.012364652007818222, -0.005510402377694845, -0.00246790680103004, 7.820140308467671e-05, 0.0090866694226861, 0.0022292796056717634, 0.009903025813400745, -0.0071148560382425785, 0.0007563066901639104, -0.00300324778072536, -0.01563635654747486, -0.022280236706137657, 
-0.0016735296230763197, -0.006807152647525072, -0.01108988095074892, -0.00680087273940444, 0.00976487249135971, -0.005080245900899172, -0.00265472661703825, -0.0003416528052184731, -0.008885719813406467, -0.003959326073527336, -0.013915728777647018, -0.014794881455600262, 0.002414529677480459, -0.006169767118990421, -0.014807440340518951, -0.006888788193464279, 0.00915574561804533, 0.014531135559082031, -0.030016779899597168, 0.00854661874473095, -0.0002739502233453095, 0.021237812936306, 0.008113321848213673, -0.006989262532442808, 0.007334643974900246, 0.016992762684822083, -0.0028666651342064142, -0.016879728063941002, 0.0035448686685413122, -0.0021131059620529413, -0.010003499686717987, 0.0088919997215271, -0.0073534827679395676, -0.014920474030077457, -0.01649039052426815, 0.03112199902534485, -0.027555152773857117, -0.010970567353069782, 0.002863525412976742, -0.04272681474685669, -0.018185898661613464, -0.013903168961405754, -0.0035385889932513237, 0.027931932359933853, -0.020735440775752068, -0.019680457189679146, -0.02485489845275879, -0.006210584659129381, 0.009896745905280113, -0.012502805329859257, 0.009645558893680573, 0.02996654249727726, -0.007862135767936707, -0.038657594472169876, -0.013149609789252281, -0.16045789420604706, 0.014430660754442215, 0.011523177847266197, -0.012760271318256855, 0.02066008374094963, 0.015171661041676998, 0.024641389027237892, 0.004643809515982866, -0.009067830629646778, 0.018575238063931465, 0.009859067387878895, -0.01288586389273405, -0.014945592731237411, -0.014305067248642445, 0.005290614441037178, -0.01612616889178753, -0.008182398043572903, 0.012352093122899532, 0.029740475118160248, 0.028685491532087326, 0.035115864127874374, -0.007334643974900246, -0.0009168304968625307, -0.00477882195264101, 0.013714779168367386, -0.007811898365616798, 0.0007111715967766941, 0.03393528610467911, -0.010053737089037895, -0.007020661141723394, -0.016000576317310333, -0.02888644114136696, 0.02582196518778801, 0.004559034015983343, 
0.021966254338622093, -0.006430373061448336, 0.012446288019418716, -0.042601220309734344, -0.01635223627090454, 0.024930253624916077, 0.032855186611413956, 0.018625473603606224, 0.019228322431445122, -0.013526389375329018, -0.015410288237035275, -0.005224677734076977, 0.020609848201274872, 0.015083746053278446, 0.01621408388018608, -0.0070897373370826244, 0.008119601756334305, -0.003028366481885314, -0.0005110073834657669, -0.00622942391782999, 0.019981881603598595, 0.029489288106560707, 0.006888788193464279, 0.02452835626900196, 0.0024773261975497007, -0.008458703756332397, -0.0002564848982729018, -0.012370931915938854, -0.006254542153328657, -0.004706605803221464, 0.006050453521311283, -0.005224677734076977, 0.0014686556532979012, 0.009689517319202423, -0.00024942029267549515, 0.018349168822169304, -0.008910838514566422, -0.023084033280611038, -0.01039911899715662, -0.02554566040635109, 0.006825991440564394, 0.008389626629650593, -0.03378457576036453, 0.03109688125550747, -0.0114603815600276, -0.014832559041678905, -0.008295431733131409, 0.021313169971108437, -0.0024836058728396893, 0.007070898078382015, -0.023121710866689682, 0.00038973146001808345, 0.0032465846743434668, 0.012653516605496407, -0.03134806826710701, -0.027002541348338127, 0.007453957572579384, -0.017984949052333832, -0.014166914857923985, -0.018022626638412476, 0.00543504673987627, 0.015598678030073643, 0.012534203007817268, 0.01623920351266861, -0.01571171171963215, -0.019893966615200043, -0.002260677982121706, 0.017143474891781807, -0.016013136133551598, 0.011190355755388737, 0.03049403429031372, -0.005469584837555885, 0.017532814294099808, 0.007981449365615845, 0.022405831143260002, -0.03169972822070122, -0.03217698261141777, 0.006631322205066681, 0.014694406650960445, 0.026148507371544838, 0.01623920351266861, 0.026449931785464287, -0.013262644410133362, -0.01734442450106144, 0.0048321993090212345, -0.013099372386932373, 0.052045829594135284, 0.006223144009709358, -0.04420881345868111, 
-0.0010887362295761704, 0.0028792244847863913, 0.0002376459160586819, -0.08409722149372101, -0.03438742458820343, 0.004712885711342096, 0.014405542053282261, -0.001269276486709714, 0.03835617005825043, 0.004216792527586222, -0.0017881334060803056, 0.0019875126890838146, 0.005535521078854799, 0.008923398330807686, -0.03049403429031372, -0.01787191443145275, -0.010568669065833092, 0.04071732237935066, -0.02720349095761776, 0.012779110111296177, 0.0010651875054463744, -0.033960405737161636, 0.011604813858866692, -0.008678491227328777, 0.007868414744734764, 0.0028478263411670923, -0.001828951295465231, -0.02222999930381775, 0.012201380915939808, -0.022971000522375107, 0.03127271309494972, 0.0016358516877517104, -0.004926394205540419, -0.016754135489463806, -0.024063661694526672, -0.014166914857923985, -0.008307991549372673, -0.004449139814823866, 0.013626864179968834, -0.04099362716078758, 0.03102152608335018, 0.013413355685770512, -0.04106898233294487, 0.027253728359937668, 0.010593787766993046, -0.006461771205067635, -0.05581362545490265, -0.012659796513617039, 0.017168592661619186, -0.0019137266790494323, 0.03777844086289406, 0.01870083063840866, -0.009048991836607456, -0.03222722187638283, -0.03139830380678177, -0.021916016936302185, 0.010832414962351322, 0.01048075407743454, -0.0017535953084006906, 0.008056805469095707, 0.028911558911204338, -0.009921864606440067, 0.011962753720581532, 0.013375678099691868, 0.006147787906229496, -0.02038377895951271, 0.01704300008714199, -0.012559321708977222, 0.018575238063931465, -0.012697474099695683, 0.0232221856713295, 0.0201577115803957, 0.00969579629600048, -0.01563635654747486, 0.028409186750650406, 0.006245122756808996, 0.010053737089037895, -0.03792915120720863, -0.005469584837555885, -0.015837304294109344, -0.020258186385035515, 0.006606203503906727, -0.005811826325953007, -0.03310637176036835, -0.020735440775752068, 0.009218541905283928, -0.008182398043572903, 0.026022914797067642, 0.01360174547880888, 
-0.01054983027279377, 0.0057710083201527596, -0.015347491018474102, -0.03544240817427635, -0.03054427169263363, 0.01035516057163477, 0.006989262532442808, -0.011510618962347507, 0.007529313676059246, 0.014757202938199043, -0.01180576253682375, -0.0010895212180912495, 0.01607593148946762, 0.010807296261191368, -0.02485489845275879, -0.011824601329863071, -0.0650070533156395, 0.032352812588214874, -0.016113610938191414, -0.019353915005922318, 0.0009600031771697104, -0.022016491740942, 0.006687839049845934, -0.01512142363935709, 0.002095836913213134, -0.00881036464124918, -0.018675711005926132, -0.0048635974526405334, -0.002117815660312772, 0.012615839019417763, -0.02798216976225376, 0.0032685634214431047, 0.020182831212878227, 0.014192033559083939, 0.005880902521312237, -0.002144504338502884, 0.00037756460369564593, -0.03245328739285469, -0.016502948477864265, 0.001036928966641426, -0.023774797096848488, 0.011177796870470047, -0.0314234234392643, 0.0005577123956754804, -0.002411389723420143, -0.00874128844588995, 0.0064366525039076805, -0.042576100677251816, 0.0064021144062280655, 0.022682135924696922, 0.0008108612382784486, -0.010122813284397125, 0.006794593296945095, 0.015083746053278446, 0.007366042118519545, 0.020333541557192802, -0.030895931646227837, -0.033181726932525635, 0.015611236914992332, -0.010832414962351322, 0.0036013855133205652, 0.004634389653801918, -0.005340851843357086, -0.0033941566944122314, 0.030418677255511284, 0.009808829985558987, -0.003227745648473501, 0.029263220727443695, -0.028409186750650406, -0.013036576099693775, -0.012841906398534775, -0.008351949043571949, -0.017143474891781807, -0.0003757984668482095, -0.008333110250532627, -0.04493725299835205, 0.02248118631541729, 0.00018220828496851027, 0.02966511808335781, -0.007868414744734764, 0.005246656946837902, 0.010813576169312, -0.0205470509827137, -0.002692404668778181, -0.014619050547480583, -0.02234303392469883, -0.006920186337083578, -0.00593741936609149, 0.00517130084335804, 
0.005312593188136816, 0.019630219787359238, 0.020647525787353516, -0.003180648200213909, 0.0170932374894619, -0.040616847574710846, 0.037728201597929, 0.020258186385035515, 0.021112220361828804, -0.030318202450871468, -0.0004332966054789722, 0.034563254565000534, 0.006088131107389927, -0.006254542153328657, -0.01731930486857891, -0.009205983020365238, 0.00310058263130486, -0.03607037290930748, -0.0266759991645813, 0.006939025595784187, 0.024289729073643684, -0.0038368727546185255, 0.00473800441250205, 0.017520254477858543, 0.00783073715865612, 0.017658406868577003, 0.023372897878289223, 0.0023721419274806976, 0.009438330307602882, -0.016000576317310333, -0.004552754107862711, -0.008785245940089226, 0.005924860015511513, -0.006289080251008272, -0.037878915667533875, -0.007328364532440901, 0.012898423708975315, 0.011812042444944382, 0.029790712520480156, -0.006361296400427818, 0.015046067535877228, -0.02499305084347725, -0.00584008451551199, 0.02609827183187008, -0.0014945593429729342, -0.028434304520487785, 0.04242539033293724, -0.008973635733127594, 0.0035794067662209272, 0.02770586498081684, -0.015071186237037182, 0.008433585055172443, 0.002970279660075903, -0.007793059106916189, -0.0035228899214416742, 0.01905249059200287, -0.0050017498433589935, 0.011717847548425198, -0.010041178204119205, -0.016452711075544357, -0.015410288237035275, 0.0009136906592175364, 0.01372733898460865, -0.008351949043571949, 0.024339966475963593, -0.0161889661103487, 0.05631599947810173, -0.012722592800855637, -0.0041634151712059975, 0.02287052571773529, 0.014292508363723755, 0.020735440775752068, 0.024427881464362144, 0.018072864040732384, -0.008559177629649639, -0.02024562656879425, -0.0038494321051985025, 0.006756915245205164, -0.0008289152756333351, -0.02259422093629837, -0.011127559468150139, 0.004289008677005768, -0.01591266132891178, 0.004398902412503958, -0.019630219787359238, 0.01292354241013527, 0.018324051052331924, 0.008942237123847008, 0.0243902038782835, 
0.013212407007813454, -0.018072864040732384, -0.019931644201278687, 0.00423877127468586, 0.0012331684119999409, -0.018349168822169304, -0.016151288524270058, 0.017243949696421623, 0.0007649412145838141, -0.0012857605470344424, -0.01260955911129713, 0.006656440440565348, 0.002959290286526084, -0.007108576130121946, -0.009513686411082745, -0.0036422032862901688, 0.010562390089035034, 0.02289564348757267, 0.0006240413058549166, -0.018097983673214912, -0.03295566141605377, 0.006813432089984417, 0.002750491490587592, -0.02936369553208351, -0.005384809337556362, -0.02317194826900959 ] }, { "object": "embedding", "index": 1, "embedding": [ 0.01023657713085413, -0.006719755474478006, 0.0031741075217723846, -0.007405183278024197, -0.010435158386826515, 0.005877383518964052, -0.011242298409342766, 0.0028522126376628876, 0.00522398529574275, -0.016347775235772133, 0.03156811743974686, 0.029928214848041534, 0.005617945920675993, 0.005086258985102177, 0.02043471857905388, 0.0074948654510080814, 0.02780146524310112, -0.012901417911052704, 0.013311393558979034, 0.0038499266374856234, -0.0013556417543441057, 0.02084469422698021, 0.011761173605918884, -0.014605378732085228, -0.002282090485095978, -0.002938691759482026, 0.004013276193290949, -0.02040909416973591, 0.01891012117266655, -0.025367235764861107, 0.01896136812865734, -0.010870757512748241, -0.010473594069480896, -0.005979877430945635, -0.004096552263945341, -0.022228362038731575, -0.011773984879255295, -0.0177955012768507, 0.02324048802256584, -0.006566014606505632, 0.006367432419210672, 0.0005869376473128796, -0.0010113263269886374, 0.0065083615481853485, -0.038076478987932205, 0.01811579428613186, 0.023471100255846977, -0.009845818392932415, -0.016091540455818176, 0.02248459681868553, 0.02122904546558857, 0.0023685696069151163, -0.017590513452887535, -0.013811050914227962, -0.030312566086649895, 0.014323520474135876, -0.03871706500649452, 0.029825720936059952, 0.02003755420446396, -0.0033887040335685015, 
0.019935060292482376, -0.012920635752379894, -0.019140733405947685, 0.019896624609827995, -0.012786111794412136, 0.002304510911926627, 0.005758875049650669, -6.095583376009017e-05, -0.03625721111893654, 0.0196532029658556, 0.013977603055536747, 0.011741955764591694, 0.006931148935109377, -0.009788165800273418, 0.012030219659209251, 0.003657750552520156, -0.016104351729154587, 0.0059318337589502335, 0.006143227219581604, -0.0033887040335685015, 0.021126551553606987, -0.046916574239730835, -0.0228177011013031, 0.01429789699614048, 0.03195246681571007, -0.004461687058210373, -0.0117803905159235, 0.007040048483759165, -0.01507941260933876, -0.011613838374614716, 0.014835990034043789, 0.017910806462168694, 0.013759803958237171, 0.021100929006934166, -0.01820547692477703, 0.003545647719874978, -0.00044400669867172837, 0.032977405935525894, -0.003929999656975269, -0.031439997255802155, -0.005480220075696707, -0.02357359416782856, 0.00333745707757771, -0.006777408067137003, 0.000521677837241441, -0.0029210757929831743, 0.00032910145819187164, -0.003926796838641167, 0.026443421840667725, -0.010403129272162914, -0.03277241811156273, 0.02312518283724785, 0.010614522732794285, -0.027596479281783104, -0.01773144118487835, -0.009339755401015282, 0.014067285694181919, 0.004765965510159731, -0.013823863118886948, -0.027724595740437508, 0.0022884963545948267, 0.020972810685634613, 0.020575648173689842, -0.03743589296936989, 0.016129974275827408, -0.004756357055157423, -0.01506660133600235, -0.028365181758999825, -0.010576087981462479, -0.0010553667088970542, 0.018295157700777054, 0.0022580684162676334, 0.024034816771745682, -0.009512714110314846, -0.012882200069725513, 0.015604693442583084, 0.004615427926182747, 0.01621965691447258, -0.004429657477885485, -0.00846855714917183, 0.014054473489522934, 0.03046630695462227, 0.009807383641600609, -0.018666699528694153, -0.005236797034740448, 0.013964791782200336, 0.002373374067246914, -0.0050446209497749805, 0.007213007193058729, 
-0.0037378238048404455, 0.01932009682059288, 0.0057748896069824696, 0.01694992557168007, -0.014361955225467682, 0.0234967228025198, 0.019806943833827972, -0.005627554841339588, 0.011914914473891258, -0.004134987480938435, -0.01775706559419632, 0.0029723227489739656, -0.008686356246471405, 0.005156723782420158, -0.018038922920823097, 0.022868948057293892, 0.014938483946025372, 0.01589936390519142, -0.0007162560941651464, -0.009980342350900173, -0.030389437451958656, -0.008948997594416142, 0.015207529999315739, -0.03758963197469711, 0.020575648173689842, -0.022177115082740784, 0.035334765911102295, 0.005444987677037716, 0.012677212245762348, -0.004330366384238005, -0.022279608994722366, -0.04430298134684563, 0.01099887490272522, 0.004112567286938429, 0.03018444962799549, 0.008673544973134995, -0.0011778789339587092, 0.014618190005421638, -0.005390537902712822, 0.01775706559419632, -0.009474278427660465, 0.006604449823498726, 0.02739149145781994, 0.008058581501245499, 0.004231075756251812, -0.6772795915603638, -0.003121259156614542, 0.025303177535533905, 0.0057332515716552734, 0.010127676650881767, 0.0010986062698066235, 0.02974884957075119, 0.021049682050943375, 0.01465662568807602, 0.015105036087334156, -0.022894570603966713, 0.01137682143598795, -0.030261319130659103, -0.01026860624551773, -0.005838948301970959, -0.006393055897206068, -0.004516136832535267, -0.022676771506667137, 0.0039940583519637585, 0.006681320257484913, -0.006175256334245205, 0.037333399057388306, 0.002186002442613244, -0.0066300733014941216, 0.01861545257270336, -0.004410440102219582, -0.002250061137601733, -0.02284332364797592, -0.00948709063231945, 0.0037602444645017385, -0.001710366690531373, 0.009800978004932404, -0.0129590705037117, 0.006847872864454985, 0.07159198075532913, -0.0021379583049565554, -0.00130119186360389, -0.0110373105853796, -0.0034111246932297945, 0.028032077476382256, -0.013631686568260193, -0.019140733405947685, 0.012433789670467377, -0.0001967802527360618, 
-0.01329858135432005, -0.007482053712010384, 0.011953349225223064, -0.003997261635959148, -0.010377505794167519, -0.018846062943339348, 0.004202249459922314, -0.0018769192975014448, 0.009980342350900173, 0.02435510978102684, -0.014592566527426243, 0.018231099471449852, 0.03131188079714775, -0.0053296820260584354, -0.0018192664720118046, -0.002783349482342601, 0.009397407993674278, 0.009301319718360901, -0.000698239600751549, 0.014361955225467682, -0.014925671741366386, 0.017500830814242363, -0.01782112382352352, 0.032285574823617935, 0.014631002210080624, -0.014823177829384804, -0.006114400923252106, 0.001224321429617703, -0.008840097114443779, -0.004362395964562893, 0.014413202181458473, 0.028775157406926155, 0.00983941275626421, -0.01080669928342104, -0.009262884967029095, 0.00967926625162363, 0.015655940398573875, 0.0043399753049016, 0.004109364002943039, -0.0020130439661443233, 0.03015882521867752, -0.023868262767791748, -0.023868262767791748, -0.009358973242342472, -0.006130415480583906, 0.006937554571777582, 0.002025855705142021, 0.011870073154568672, -0.010403129272162914, -0.00846855714917183, 0.023471100255846977, 0.003901173360645771, -0.023855451494455338, -0.004756357055157423, 0.025636283680796623, -0.005633960478007793, 0.008474962785840034, -0.004141393583267927, 0.011005280539393425, -0.012555500492453575, -0.013798239640891552, 0.006847872864454985, -0.022971441969275475, 0.02202337421476841, 0.029646355658769608, -0.031132517382502556, 0.005563496146351099, 0.0035136183723807335, -0.014272273518145084, 0.005041418131440878, -0.0041157701052725315, -0.022881759330630302, 0.0003765449218917638, 0.0010201344266533852, 0.006389853078871965, -0.01926884986460209, 0.01627090387046337, -0.0010417542653158307, 0.008398092351853848, 0.0011298349127173424, 0.005438581574708223, -0.003050794592127204, 0.014605378732085228, 0.010678581893444061, -0.042304351925849915, 0.005249608773738146, -0.003196527948603034, 0.004705110099166632, 0.002853814046829939, 
-0.010242982767522335, -0.01078748144209385, -0.0075909532606601715, 0.01176757924258709, -0.014182590879499912, -0.0024118092842400074, 0.021510904654860497, 0.005288043990731239, 0.001864107558503747, -0.017564889043569565, -0.011741955764591694, 0.025200683623552322, -0.01003799494355917, 0.002890647854655981, 0.003169303061440587, -0.007981711067259312, 0.004801197908818722, 0.028057700023055077, 0.004791588988155127, 8.422715472988784e-05, 0.016091540455818176, -0.00481400964781642, -0.0185385812073946, -0.00987784843891859, -0.019409779459238052, -0.04791589081287384, -0.007853593677282333, 0.02980009652674198, 0.01744958385825157, -0.0005477016675285995, -0.005957457236945629, -0.011953349225223064, -0.0197172611951828, -0.007450024131685495, 0.020216919481754303, -0.007385965436697006, -0.024239802733063698, -4.3239608203293756e-05, -0.018333593383431435, -0.011581809259951115, 0.00608877744525671, -0.012440195307135582, 0.015655940398573875, -0.0077446941286325455, -0.020614081993699074, -0.021395597606897354, -0.015271589159965515, -0.0005056631634943187, -0.011223080568015575, -0.018769193440675735, 0.011613838374614716, 0.022984253242611885, -0.005845354404300451, 0.002275684615597129, 0.03618033975362778, -0.02429104968905449, 0.004833227023482323, 0.0247394610196352, 0.03738464415073395, -0.0323624424636364, -0.01097965706139803, 0.008744009770452976, -0.018602639436721802, 0.004000464454293251, 0.005938239395618439, 0.02201056107878685, 0.011114181019365788, 0.005358508322387934, -0.013048752211034298, 0.01488723699003458, 0.005803716368973255, 0.004477701615542173, -0.01932009682059288, 0.0016703300643712282, -0.030363813042640686, -0.002752921776846051, 0.017705818638205528, -0.0026488262228667736, 0.015361270867288113, -0.0025447309017181396, 0.027647726237773895, 0.008378875441849232, 0.04028009623289108, 0.007136136759072542, -0.01488723699003458, -0.01627090387046337, -0.013144840486347675, -0.0019249633187428117, 0.016450269147753716, 
-0.012183960527181625, 0.008564645424485207, -0.006649290677160025, 0.028032077476382256, 0.004836430307477713, 0.03659031540155411, 0.01620684564113617, -0.03505290672183037, -0.0251750610768795, -0.01411853265017271, 0.012280048802495003, -0.0006562010967172682, -0.0055795107036828995, -0.0033822981640696526, 0.01121667493134737, -0.008628703653812408, 0.04125379025936127, 0.0010465586092323065, -0.018423276022076607, 0.007616576738655567, 0.037640880793333054, 0.004807603545486927, -0.016885867342352867, 0.02052440121769905, 0.005595525726675987, 0.007027236744761467, -0.003356674686074257, 0.022689582780003548, -0.025034131482243538, 0.009557554498314857, -0.016155598685145378, 0.00549943745136261, 0.006598043721169233, -0.013311393558979034, 0.005717237014323473, -0.014938483946025372, 0.020280977711081505, 0.0401776023209095, -0.000625372864305973, -0.008955403231084347, 0.0024742663372308016, 0.005147114861756563, -0.014092909172177315, -0.011024498380720615, 0.012267236597836018, -0.016091540455818176, 0.008833691477775574, 0.009435843676328659, -0.019601956009864807, -0.0034463568590581417, 0.014682249166071415, -0.009756136685609818, 0.026520293205976486, 0.010768263600766659, 0.024893201887607574, -0.007014425005763769, -0.01701398566365242, -0.0014581356663256884, 0.0028249877505004406, -0.015553447417914867, -0.006124009378254414, 0.02238210290670395, -0.000762698648031801, -0.03464293107390404, -0.01215833704918623, -0.001716772560030222, -0.008692762814462185, 0.008077799342572689, -0.01653994992375374, 0.021754326298832893, -0.003926796838641167, 0.0035264301113784313, -0.0023925916757434607, -0.014618190005421638, 0.01852576993405819, -0.01816704124212265, -0.0038787529338151217, 0.021895255893468857, 0.006037530489265919, 0.00013292176299728453, 0.0070656719617545605, -0.016411833465099335, -0.00541936419904232, -0.0051439120434224606, 0.0014685451751574874, -0.011351197957992554, 0.003955623134970665, -0.02509818971157074, -0.01705241948366165, 
-0.023881075903773308, -0.010697798803448677, -0.014912860468029976, 0.008526209741830826, -0.0033534718677401543, -0.013618875294923782, 0.018448898568749428, 0.03259305655956268, 0.012446600943803787, -0.003862738376483321, -0.0021667848341166973, -0.009916283190250397, -0.027647726237773895, 0.09865036606788635, 0.006809437647461891, -0.007751100230962038, 0.014682249166071415, -0.026827774941921234, -0.004753153771162033, -0.03133750334382057, -0.031004400923848152, -0.01702679693698883, 0.006242518313229084, 0.0016134779434651136, 0.021036868914961815, -0.01583530567586422, 0.011927725747227669, 0.009724107570946217, 0.011709926649928093, 0.01588655263185501, -0.007321906741708517, -0.006399461999535561, -0.003433545120060444, -0.019102297723293304, 0.016475891694426537, -0.017603324726223946, 0.013849485665559769, -0.004676283337175846, 0.0009240464423783123, -0.010825916193425655, 0.007001613266766071, -0.008462151512503624, -0.007687041535973549, 0.002632811665534973, 0.005361711140722036, 0.006835061125457287, 0.010678581893444061, -0.01044156402349472, 0.0174239594489336, 0.011985378339886665, 0.02315080538392067, 0.011549779213964939, -0.016680879518389702, 0.011223080568015575, 0.002693667309358716, -0.004775574430823326, -0.02509818971157074, -0.009595990180969238, -0.005316870287060738, -0.02504694275557995, 0.03051755391061306, -0.013003911823034286, -0.029441367834806442, 0.025713153183460236, -0.0028185818810015917, 0.010691393166780472, -0.02595657669007778, 0.0003090831160079688, 0.01659119687974453, -0.011639461852610111, -0.035001661628484726, -0.010902786627411842, -0.0024406355805695057, -0.010825916193425655, -0.029979461804032326, -0.01656557433307171, -0.009134767577052116, -0.013798239640891552, -0.034591685980558395, -0.03797398507595062, -0.012305672280490398, -0.0393064059317112, -0.013913544826209545, 0.022561466321349144, -0.043534278869628906, -0.020357847213745117, -0.0037602444645017385, 0.009653642773628235, 0.012408166192471981, 
0.007373153697699308, -0.025034131482243538, 0.007161760237067938, 0.003913985099643469, -0.0014597370754927397, -0.023483911529183388, 0.002860219916328788, -0.0022228360176086426, 0.0009752933401614428, 0.0060503422282636166, 0.013798239640891552, 0.004894082900136709, -0.034566063433885574, 0.02857016958296299, 0.020024742931127548, 0.009346161037683487, 0.019140733405947685, -0.0030187652446329594, 0.014246650040149689, -0.015374083071947098, -0.003545647719874978, 0.016142787411808968, -0.015156283043324947, 0.00502540310844779, 0.007988116703927517, -0.005896601360291243, -0.00850058626383543, 0.006473129615187645, 0.005573105067014694, 0.013951979577541351, 0.02664840966463089, 0.022305231541395187, -0.0019217603839933872, -0.011120586656033993, 0.020947188138961792, -0.02203618548810482, -0.009403813630342484, -0.008263569325208664, 0.005002982914447784, 0.02397075667977333, 0.030825035646557808, 0.03218308091163635, 0.008071393705904484, -0.012645183131098747, -0.0004888477851636708, -0.023445475846529007, -0.001793642994016409, 0.009320537559688091, -0.002629608614370227, 0.0033534718677401543, -0.009749731048941612, -0.021446844562888145, -0.004551369231194258, -0.0021443641744554043, -0.014336331747472286, 0.018436087295413017, -0.0060823713429272175, -0.00885931495577097, -0.026955891400575638, -0.00512469420209527, -0.014797554351389408, 0.01046078186482191, -0.005541075486689806, -0.016527138650417328, -0.02511100098490715, -0.0049997796304523945, -0.012023814022541046, 0.017231784760951996, -0.03443794697523117, -0.03218308091163635, -0.010684987530112267, 0.01390073262155056, -0.003420733381062746, 0.01892293430864811, -0.009000244550406933, -0.0045737894251942635, -0.017577700316905975, 0.010729828849434853, -0.004916503559798002, -0.024790707975625992, -0.020767822861671448, -0.019115108996629715, 0.017193349078297615, 0.027160879224538803, 0.029236380010843277, 0.007879217155277729, 0.022599902004003525, 0.018026111647486687, 
-0.005970268975943327, -0.0005601130542345345, -0.005742860492318869, -0.009813789278268814, -0.018666699528694153, 0.012837358750402927, 0.008180293254554272, 0.010922004468739033, 0.007981711067259312, 0.0018689119024202228, 0.004576992709189653, 0.005787701345980167, -0.006642885040491819, -0.013670122250914574, -0.00983941275626421, -0.01294625923037529, -0.013285770080983639, -0.0135163813829422, -0.0028377994894981384, 0.00687349634245038, 0.001427707727998495, -0.014387578703463078, 0.021523715928196907, 0.0060599506832659245, -0.009615207090973854, -0.003865941194817424, 0.01852576993405819, -0.020652517676353455, 0.022497408092021942, -0.009423031471669674, 0.013644498772919178, -0.020934375002980232, 0.007200195454061031, 0.013580439612269402, 0.011658679693937302, 0.005355305504053831, -0.010114865377545357, 0.024508850648999214, -0.01703960821032524, -0.008410904556512833, 0.005451393313705921, 0.01741114817559719, -0.0021139364689588547, -0.014195403084158897, 0.008955403231084347, 0.00019858189625665545, 0.012395353987812996, -0.02900576964020729, -0.023663274943828583, -0.0270327627658844, -0.007962493225932121, -0.009205232374370098, -0.01616840995848179, 0.013606063090264797, -0.013926356099545956, -0.03935765102505684, -0.018666699528694153, -0.005313667468726635, 0.025777211412787437, 0.0016703300643712282, 0.015156283043324947, 0.0024758679792284966, -0.02739149145781994, -0.0009432640508748591, 0.01311921700835228, -0.013670122250914574, -0.00221963319927454, 0.039229534566402435, -0.002186002442613244, -0.004865256603807211, -0.002150770043954253, -0.0204603411257267, 0.021113740280270576, -0.010736234486103058, -0.03448919206857681, 0.011402444913983345, 0.007072078064084053, 0.009768947958946228, -0.009032273665070534, -0.0029226772021502256, -0.013542004860937595, 0.03413046523928642, -0.006847872864454985, -0.027519607916474342, -0.021895255893468857, 0.0026216013357043266, -0.008052175864577293, 0.025572223588824272, 
-0.035795990377664566, 0.036487821489572525, 0.03525789454579353, -0.022715207189321518, 0.003015562193468213, 0.001989021897315979, 0.014579755254089832, -0.004801197908818722, -0.011056527495384216, 0.03623158857226372, 0.0025671515613794327, -0.021100929006934166, 0.004980562254786491, -0.018410464748740196, -0.001675134408287704, 0.017859559506177902, 0.00018567006918601692, 0.018282346427440643, -0.007155354134738445, -0.0038371148984879255, -0.01964038982987404, 0.00597347179427743, 0.01121026836335659, -0.010889975354075432, -0.0077126650139689445, -0.02195931412279606, -0.0038339118473231792, -0.010185330174863338, 0.021818386390805244, 0.0065307822078466415, 0.0014765525702387094, -0.009576772339642048, -0.016040293499827385, -0.025738777592778206, -0.015591882169246674, -0.0012203177902847528, 0.015450953505933285, -0.01466943696141243, -0.011434474028646946, -0.0034047188237309456, -0.002813777420669794, 0.031516868621110916, 0.004307946190237999, -0.00024062041484285146, -0.0018000488635152578, 0.016399022191762924, -0.02008880116045475, 0.0038211001083254814, 0.02317642979323864, 0.027135256677865982, -0.020947188138961792, 0.0015998654998838902, -0.00026924663688987494, -0.016399022191762924, 0.007289877627044916, -0.009128361940383911, 0.004077334888279438, 0.005662787239998579, 0.00022040188196115196, -0.005749266128987074, 0.006681320257484913, -0.004484107252210379, -0.005912615917623043, 0.0028281905688345432, 0.010076429694890976, -0.024662591516971588, -0.006681320257484913, 0.030235696583986282, -0.011050121858716011, 0.008827285841107368, -0.0037025916390120983, -0.0015502199530601501, -0.012965476140379906, 0.00152139354031533, -0.026955891400575638, 0.00638024415820837, -0.027980830520391464, 0.007610171101987362, 0.008692762814462185, 0.014541319571435452, 0.009685671888291836, -0.036077845841646194, -0.0355910025537014, 0.01060811709612608, 0.005262420512735844, 0.01213911920785904, 0.017705818638205528, 0.001870513428002596, 
-0.002735305577516556, 0.02274082973599434, 0.007981711067259312, 0.0035200242418795824, 0.01664244383573532, 0.0029915403574705124, 0.011223080568015575, -0.009743324480950832, -0.026084693148732185, -0.01694992557168007, 0.007552518043667078, 0.03935765102505684, -0.00034671759931370616, -0.0142594613134861, -0.006598043721169233, 0.010793887078762054, -0.014400390908122063, 0.005746063310652971, -0.014041662216186523, 0.014169779606163502, 0.007213007193058729, 0.009147578850388527, -0.002192408312112093, 0.02549535408616066, 0.015463764779269695, 0.007597359362989664, -0.004432860296219587, -0.01140885055065155, 0.005822933744639158, 0.012324889190495014, 0.0270327627658844, -0.0009344559512101114, -0.03131188079714775, 0.0033694864250719547, -0.0012227200204506516, 0.011114181019365788, -0.005787701345980167, 0.004224669653922319, 0.009890659712255001, -0.01046078186482191, -0.0008039363892748952, -0.011447285301983356, 0.001793642994016409, 0.01627090387046337, -0.0035200242418795824, -0.005569902248680592, 0.010166112333536148, 0.009704889729619026, -0.0030636063311249018, 0.016514327377080917, -0.003913985099643469, 0.021741515025496483, -0.02705838531255722, 0.004576992709189653, -0.014797554351389408, -0.0013019925681874156, 0.004862053785473108, -0.026046257466077805, 0.00040076710865832865, -0.006790219806134701, 0.0010249388869851828, 0.031875599175691605, -0.007347530219703913, -0.0044648898765444756, 0.01933290809392929, -0.016770562157034874, -0.008186698891222477, 0.0021379583049565554, -0.005749266128987074, -0.021036868914961815, 0.0017135696252807975, -0.007847188040614128, -0.01194694358855486, -0.02903139218688011, 0.0028169802390038967, -0.024137310683727264, -0.012818141840398312, -0.02092156372964382, 0.01975569687783718, 0.026161564514040947, 0.0018448899500072002, -0.0010946026304736733, -0.023727335035800934, 0.016104351729154587, 0.013452322222292423, -0.019960684701800346, 0.007398777175694704, -0.02933887392282486, 
-0.016001857817173004, 0.013388263992965221, -0.01579686999320984, -0.007635794579982758, 0.00889775063842535, 0.008731197565793991, 0.017590513452887535, 0.038076478987932205, 0.22610150277614594, -0.024124497547745705, 0.0023061123210936785, 0.022279608994722366, 0.013465134426951408, 0.02438073232769966, -0.009045084938406944, -0.0006485941121354699, 0.0080906106159091, 0.008039363659918308, -0.025790024548768997, 0.01277970615774393, -0.001265959581360221, -0.000538092921487987, 0.002045073313638568, -0.017859559506177902, -0.04202249273657799, -0.01431070826947689, -0.02090875245630741, -0.026571540161967278, 0.0007498869090341032, -0.014861613512039185, -0.011652273125946522, -0.006566014606505632, 0.02472664974629879, -0.007501271087676287, -0.008103422820568085, 0.00027305010007694364, 0.021024057641625404, 0.007373153697699308, 0.0004956540069542825, -0.019140733405947685, 0.0003104844072367996, 0.013093593530356884, 0.004218264017254114, -0.020255353301763535, 0.014631002210080624, -0.004054914228618145, 0.019384155049920082, 0.02545691840350628, -0.01894855685532093, -0.0017231784295290709, -0.006325794383883476, -0.014733496122062206, 0.01698836125433445, 0.010345476679503918, -0.0015934596303850412, -0.01577124558389187, -0.006975989788770676, 0.02739149145781994, -0.01701398566365242, 0.00041117664659395814, 0.017193349078297615, 0.04294493794441223, 0.0035264301113784313, 0.013132029213011265, -0.010095647536218166, 0.002938691759482026, -0.014682249166071415, -0.0053713200613856316, -0.014605378732085228, 0.02775021828711033, 0.0006537988665513694, 0.010492810979485512, -0.022535841912031174, 0.023368606343865395, -0.033361759036779404, -0.005384131800383329, 0.03331051394343376, 0.0019073471194133162, -0.004026087932288647, -0.004006870090961456, -0.012312077917158604, 0.018320782110095024, -0.034566063433885574, -0.005374522879719734, -0.00016395017155446112, 0.00687349634245038, 0.03292616084218025, 0.012862982228398323, -0.013721369206905365, 
0.010415940545499325, 0.014195403084158897, -0.011953349225223064, -0.0247394610196352, -0.03605222329497337, 0.022177115082740784, -0.005835745483636856, -0.007040048483759165, -0.006034327670931816, 0.005159926600754261, -0.031183764338493347, -0.017090855166316032, 0.006377041339874268, 0.020345035940408707, 0.040305718779563904, 0.004743545316159725, 0.008993837982416153, -0.014541319571435452, -0.005217579193413258, 0.005903006996959448, -0.03448919206857681, 0.0002622402098495513, -0.0002678453456610441, -0.007219412829726934, -0.0060503422282636166, -0.00964083056896925, 0.01178679708391428, -0.005573105067014694, -0.031158139929175377, -0.010781075805425644, -0.014067285694181919, -0.003763447282835841, -0.007456430234014988, -0.002764131873846054, 0.011882884427905083, -0.0002940693811979145, 0.008705574087798595, 0.005621149204671383, -0.016770562157034874, 0.020947188138961792, -0.013401075266301632, -0.0018721148371696472, -0.0043463814072310925, -0.011223080568015575, -0.011107774451375008, -0.007943276315927505, 0.004211857914924622, 0.01619403436779976, -0.032285574823617935, 0.001259553711861372, -0.008237945847213268, 0.027980830520391464, 0.000949669920373708, -0.016706503927707672, 0.002653630683198571, 0.012638777494430542, -0.01657838560640812, -0.007257848046720028, 0.018730757758021355, -0.002205220051109791, -0.0010481601348146796, 0.009314131923019886, -0.0022596698254346848, -0.015989046543836594, 0.004990171175450087, 0.023048311471939087, -0.010294229723513126, -0.01657838560640812, -0.012081466615200043, -0.03520664945244789, -0.022663960233330727, -0.03374611213803291, 0.0006313783233053982, 0.04133065789937973, -0.014566943049430847, -0.017577700316905975, -0.029569486156105995, -0.011242298409342766, 0.016527138650417328, -0.031106894835829735, 0.00647953525185585, 0.019486648961901665, -0.015040977858006954, -0.03899892419576645, -0.011107774451375008, -0.16378523409366608, 0.02549535408616066, 0.00891056191176176, 
-0.005118288565427065, 0.03548850864171982, 0.013798239640891552, 0.012440195307135582, 0.0008255562279373407, -0.010121271014213562, 0.001718374085612595, 0.031414374709129333, 0.0060055009089410305, -0.008609486743807793, -0.0006441900623030961, 0.017116479575634003, -0.018064547330141068, -0.009019461460411549, 0.010281417518854141, 0.02862141653895378, 0.023778581991791725, 0.0017423960380256176, -0.002471063518896699, 0.006655696779489517, -0.002250061137601733, 0.01470787264406681, -0.006040733307600021, 0.006373838521540165, 0.041894376277923584, -0.016437456011772156, 0.007635794579982758, -0.03400234505534172, -0.03679530322551727, 0.03172185644507408, 0.017680194228887558, 0.019973495975136757, 0.005326479207724333, -0.010531246662139893, -0.02708400972187519, -0.014323520474135876, 0.002940293401479721, 0.028390806168317795, 0.03287491202354431, 0.0009328544838353992, 0.02003755420446396, -0.015348459593951702, -0.0016350977821275592, 0.003641735762357712, 0.012440195307135582, 0.005115085281431675, -0.007366748061031103, 0.016283715143799782, -0.004560977686196566, -0.015912175178527832, -0.0129590705037117, 0.002512701554223895, 0.020614081993699074, 0.01374699268490076, 0.016821809113025665, -0.005512249190360308, 0.003782664891332388, -0.02204899676144123, -0.015732811763882637, -0.0023989975452423096, 0.00907070841640234, 0.009391002357006073, -0.0027048776391893625, 0.007347530219703913, 0.017257407307624817, -0.0012227200204506516, 0.013087187893688679, 0.009211638011038303, -0.019922249019145966, 0.02316361851990223, -0.0023861858062446117, 0.00347518315538764, 0.010121271014213562, -0.018692322075366974, 0.01848733425140381, 0.00313407089561224, -0.023676088079810143, -0.020998435094952583, 0.026776527985930443, -0.01964038982987404, 0.005364914424717426, -0.00030127595528028905, 0.003420733381062746, -0.003888361854478717, 0.006751784589141607, -0.030773788690567017, -0.007936869747936726, 0.02197212725877762, -0.030671294778585434, 
-0.011671490967273712, -0.008609486743807793, 0.003657750552520156, 0.02320205233991146, 0.006905525457113981, 0.005444987677037716, -0.017552077770233154, -0.013862297870218754, -0.0018016502726823092, -0.0028025670908391476, -0.009865036234259605, -0.01310640573501587, 0.04432860389351845, -0.004295134451240301, 0.02782708965241909, 0.001064174808561802, 0.017231784760951996, -0.01896136812865734, -0.020280977711081505, -0.00022720811830367893, 0.024483226239681244, 0.017500830814242363, 0.013951979577541351, 0.005438581574708223, 0.0038339118473231792, -0.014105720445513725, 0.00026384167722426355, 0.0073411245830357075, 0.04986327514052391, -0.011306356638669968, -0.028390806168317795, -0.002086711348965764, -0.005976674612611532, -0.01780831255018711, -0.08840097486972809, -0.026904644444584846, 0.007655011955648661, 0.01738552562892437, -0.007001613266766071, 0.009352566674351692, 0.0024342297110706568, 0.0030043520964682102, -0.00830841064453125, 0.012741271406412125, 0.011844449676573277, -0.02623843401670456, -0.024572908878326416, -0.0005709229735657573, 0.034232959151268005, -0.027929583564400673, 0.01891012117266655, -0.020204106345772743, -0.012734864838421345, 0.026776527985930443, -0.00251430319622159, 0.001099407090805471, -0.008276381529867649, 0.014131343923509121, -0.018231099471449852, -0.005486625712364912, -0.01698836125433445, 0.03215745463967323, 0.002780146664008498, -0.002128349617123604, -0.01506660133600235, -0.019063862040638924, 0.0063546206802129745, 0.0014044864801689982, 0.01137682143598795, 0.008571051061153412, -0.026776527985930443, 0.013951979577541351, 0.021498091518878937, -0.038563322275877, 0.007808752823621035, 0.004125379025936127, -0.02276645414531231, -0.05173378810286522, -0.008744009770452976, -0.0009520720923319459, -0.0035232272930443287, 0.014771930873394012, 0.0011250305688008666, -0.013990415260195732, -0.028698287904262543, -0.04373926669359207, -0.007853593677282333, 0.007879217155277729, 0.024457603693008423, 
0.013849485665559769, 0.02507256716489792, 0.022125868126749992, -0.005592322442680597, 0.02816019393503666, 0.012254425324499607, -0.00028085726080462337, -0.032669924199581146, 0.022907383739948273, -0.00753970630466938, 0.0050222002901136875, -0.015976233407855034, 0.013836674392223358, 0.018372029066085815, 0.004708312917500734, -0.021088115870952606, 0.03295178338885307, 0.005685207433998585, 0.015117848291993141, -0.03436107560992241, -0.009262884967029095, -0.023266112431883812, -0.0015718397917225957, 0.008391686715185642, 0.0009889057837426662, -0.011460097506642342, -0.0154381413012743, 0.008583863265812397, 0.013849485665559769, 0.0004984565894119442, 0.006226503290235996, 0.005361711140722036, -0.0038851588033139706, -0.010512028820812702, -0.014746307395398617, -0.006809437647461891, 0.011882884427905083, 0.0064314911141991615, -0.02550816535949707, 0.003427139250561595, 0.02284332364797592, -0.014797554351389408, 0.0044969189912080765, 0.023663274943828583, 0.0019041441846638918, -0.026020634919404984, 0.005438581574708223, -0.06108635663986206, 0.028723910450935364, -0.011876478791236877, -0.022689582780003548, 0.010025182738900185, -0.02892889827489853, 0.0033406601287424564, 0.0023477505892515182, -0.0189741812646389, -0.0009905073093250394, 0.00869916845113039, 0.00667491415515542, 0.0037089975085109472, 0.004727530293166637, -0.03874268755316734, -0.001560629578307271, 0.01935853250324726, 0.0013035940937697887, 0.01623246818780899, 0.0004672279756050557, -7.51688567106612e-05, -0.01702679693698883, 0.0073603419587016106, -0.004833227023482323, -0.028108946979045868, 0.0002832594618666917, -0.03487354516983032, 0.002471063518896699, 0.007309095002710819, -0.031183764338493347, 0.028108946979045868, -0.0479927621781826, -0.003910782281309366, 0.01580968126654625, 0.0031372737139463425, 0.012209584005177021, -0.03090190701186657, 0.011690708808600903, 0.010108459740877151, 0.013042346574366093, -0.0432780422270298, -0.031004400923848152, 
0.03333613649010658, -0.01580968126654625, 0.01142166182398796, 0.007001613266766071, -0.021164987236261368, 0.004189437720924616, 0.015963422134518623, 0.020755011588335037, -0.0024470414500683546, 0.03866581618785858, -0.022228362038731575, -0.001466943765990436, 0.00762298284098506, -0.0011986979516223073, 0.01666806824505329, -0.0010177321964874864, -0.008250758051872253, -0.023752957582473755, 0.01616840995848179, 0.004394425079226494, 0.014105720445513725, 0.008763226680457592, -0.0016431050607934594, 0.008295598439872265, -0.03169623389840126, -0.017244596034288406, -0.0317474789917469, -0.025982199236750603, -0.005864571779966354, 0.011581809259951115, 0.02741711400449276, 0.016155598685145378, 0.02505975402891636, 0.017897995188832283, -0.01272845920175314, 0.024906013160943985, -0.05765280872583389, 0.02357359416782856, 0.014156967401504517, 0.004826821386814117, -0.043047431856393814, 0.008404498919844627, 0.021498091518878937, 0.001606271369382739, -0.0024998898152261972, -0.009192420169711113, -0.01060811709612608, 0.004897285718470812, -0.04061320051550865, -0.002450244501233101, 0.025264741852879524, 0.013567628338932991, -0.022612713277339935, 0.012241613119840622, 0.014374767430126667, 0.0047723716124892235, 0.004509730730205774, 0.010729828849434853, 0.0045193396508693695, 0.017231784760951996, -0.02195931412279606, -0.0007931265281513333, -0.008417310193181038, 0.03487354516983032, -0.017218973487615585, -0.026904644444584846, -0.008737603202462196, 0.01970444992184639, 0.013990415260195732, 0.03928077965974808, 0.012446600943803787, 0.010960440151393414, -0.016501516103744507, 0.013426698744297028, 0.0066621024161577225, 0.0011090158950537443, -0.02980009652674198, 0.057345326989889145, -0.0031372737139463425, 0.01810298301279545, 0.023355793207883835, 0.007763911969959736, 0.018743569031357765, -0.00771907065063715, -0.02933887392282486, -0.010422347113490105, 0.02276645414531231, -0.014528508298099041, 0.002581564709544182, 
-0.0050446209497749805, -0.02236928977072239, -0.007206601090729237, 0.00023361398780252784, 0.0018256723415106535, 0.0009008251363411546, 0.039178285747766495, -0.010781075805425644, 0.045789141207933426, 0.013260146602988243, -0.015271589159965515, 0.01817985251545906, 0.01529721263796091, 0.01333701703697443, 0.00445528095588088, 0.02203618548810482, -0.017859559506177902, -0.009685671888291836, 0.0341048389673233, -0.008628703653812408, 0.010146894492208958, -0.028032077476382256, -0.006540391128510237, 0.006248923949897289, -0.024790707975625992, 0.005390537902712822, -0.010652958415448666, 0.029876967892050743, 0.03018444962799549, 0.0038307090289890766, 0.032669924199581146, 0.017193349078297615, -0.028903275728225708, -0.006047139409929514, -0.0038243031594902277, 0.0026344130747020245, -0.006245721131563187, -0.0285189226269722, 0.015335647389292717, 0.009608801454305649, -0.0038723470643162727, -0.005493031814694405, 0.026033446192741394, -0.007661418057978153, -0.00520476745441556, -0.01388792134821415, -0.02043471857905388, 0.002728899708017707, 0.01976850815117359, -0.0064314911141991615, -0.025392860174179077, -0.02584127150475979, 0.00866713933646679, 0.0018817236414179206, -0.028493300080299377, 0.001047359430231154, -0.020665328949689865 ] } ], "model": "ada", "usage": { "prompt_tokens": 7, "total_tokens": 7 } } ================================================ FILE: libs/kotaemon/tests/resources/fullocr_sample_output.json ================================================ [{"csv_string": 
",,,\u5358\u4f4d,\u5b9f\u65bd\u4f8b1,\u5b9f\u65bd\u4f8b2,\u5b9f\u65bd\u4f8b3,\u6bd4\u8f03\u4f8b1,\u6bd4\u8f03\u4f8b2,\u6bd4\u8f03\u4f8b3,\u6bd4\u8f03\u4f8b4\n\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269,\u71b1\u786c\u5316\u6027\u6a39\u8102,\u30a8\u30dd\u30ad\u30b7\u6a39\u81021,\u8cea\u91cf%,10.2,12.4,12.4,10.2,12.4,,10.2\n,,\u30a8\u30dd\u30ad\u30b7\u6a39\u81022,,-,-,-,-,-,14.4,-\n,\u786c\u5316\u5264,\u786c\u5316\u52641,,4.8,6.6,6.6,4.8,6.6,7.6,4.8\n,\u7121\u6a5f\u5145\u586b\u6750,\u7121\u6a5f\u5145\u586b\u52641,,74,70,70,74,70,67,74\n,,\u7121\u6a5f\u5145\u586b\u52642,,10,10,10,10,10,10,10\n,\u786c\u5316\u4fc3\u9032\u5264,\u786c\u5316\u4fc3\u9032\u52641,,0.2,0.2,0.2,0.2,0.2,0.2,0.2\n,\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u5264,\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u52641,,0.2,0.2,0.2,0.2,0.2,0.2,0.2\n,\u96e2\u578b\u5264,\u96e2\u578b\u52641,,0.2,0.2,0.2,0.2,0.2,0.2,0.2\n,\u7740\u8272\u5264,\u7740\u8272\u52641,,0.4,0.4,0.4,0.4,0.4,0.4,0.4\n,\u5408\u8a08,,,100,100,100,100,100,100,100\n,,,\u5358\u4f4d,\u5b9f\u65bd\u4f8b1,\u5b9f\u65bd\u4f8b2,\u5b9f\u65bd\u4f8b3,\u6bd4\u8f03\u4f8b1,\u6bd4\u8f03\u4f8b2,\u6bd4\u8f03\u4f8b3,\u6bd4\u8f03\u4f8b4\n\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269,\u71b1\u786c\u5316\u6027\u6a39\u8102,\u30a8\u30dd\u30ad\u30b7\u6a39\u81021,%,,10,10,(\u30bb\u30e9\u30df\u30c3\u30af),(\u30bb\u30e9\u30df\u30c3\u30af),,\n,,\u30a8\u30dd\u30ad\u30b7\u6a39\u81022,,5,-,-,,,5,5\n,,\u30a8\u30dd\u30ad\u30b7\u6a39\u81023,,5,-,-,,,5,5\n,,\u30b7\u30a2\u30cd\u30fc\u30c8\u6a39\u81021,,10,10,10,,,10,10\n,,\u30d5\u30a7\u30ce\u30fc\u30eb\u7cfb\u786c\u5316\u52641,,5,5,5,,,5,5\n,\u786c\u5316\u89e6\u5a92,\u786c\u5316\u89e6\u5a921,,0.2,0.2,0.2,,,0.2,0.2\n,\u7121\u6a5f\u5145\u586b\u6750,\u7121\u6a5f\u5145\u586b\u67501,,74.8,74.8,74.8,,,74.8,74.8\n,\u5408\u8a08,,,100,100,100,,,100,100\nIL1TD-L1TU,,,ppm,510,470,470,,,510,510\nL125D-L125U,,,ppm,390,330,330,,,390,390\n\u30ac\u30e9\u30b9\u8ee2\u79fb\u6e29\u5ea6 (Tg),,,\u00b0C,,224,224,,,235,235\n", "image": 
"7810d908b0ff4ce381dcab873196d133.jpg", "image_shape": [1653, 2339], "json": {"ocr": [{"location": [[237, 290], [352, 290], [352, 309], [237, 309]], "type": "textline", "text": "\u71b1\u786c\u5316\u6027\u6a39\u8102", "confidence_by_character": [0.9686674475669861, 0.9664099812507629, 0.9206223487854004, 0.9281898140907288, 0.962303102016449, 0.9605254530906677], "confidence_by_field": 0.9206223487854004, "original_text": "\u71b1\u786c\u5316\u6027\u6a39\u8102"}, {"location": [[694, 272], [739, 272], [739, 297], [694, 297]], "type": "textline", "text": "10.2", "confidence_by_character": [0.9169260263442993, 0.9255782961845398, 0.924151599407196, 0.9199540019035339], "confidence_by_field": 0.9169260263442993, "original_text": "10.2"}, {"location": [[767, 242], [843, 242], [843, 264], [767, 264]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b2", "confidence_by_character": [0.9473932385444641, 0.9163241982460022, 0.7033942937850952, 0.8979774117469788], "confidence_by_field": 0.7033942937850952, "original_text": "\u5b9f\u65bd\u4f8b2"}, {"location": [[783, 272], [830, 272], [830, 297], [783, 297]], "type": "textline", "text": "12.4", "confidence_by_character": [0.9123356342315674, 0.9198176264762878, 0.9250789880752563, 0.92127925157547], "confidence_by_field": 0.9123356342315674, "original_text": "12.4"}, {"location": [[961, 268], [1011, 268], [1011, 298], [961, 298]], "type": "textline", "text": "10.2", "confidence_by_character": [0.917376697063446, 0.9228717684745789, 0.9257153868675232, 0.9199367761611938], "confidence_by_field": 0.917376697063446, "original_text": "10.2"}, {"location": [[1050, 268], [1100, 268], [1100, 298], [1050, 298]], "type": "textline", "text": "12.4", "confidence_by_character": [0.9123420119285583, 0.9206566214561462, 0.9267528057098389, 0.9224498867988586], "confidence_by_field": 0.9123420119285583, "original_text": "12.4"}, {"location": [[1130, 242], [1203, 242], [1203, 264], [1130, 264]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b3", 
"confidence_by_character": [0.9575085639953613, 0.9481230974197388, 0.756417453289032, 0.9103184342384338], "confidence_by_field": 0.756417453289032, "original_text": "\u6bd4\u8f03\u4f8b3"}, {"location": [[236, 335], [297, 335], [297, 358], [236, 358]], "type": "textline", "text": "\u786c\u5316\u5264", "confidence_by_character": [0.9772048592567444, 0.9326395988464355, 0.9597039222717285], "confidence_by_field": 0.9326395988464355, "original_text": "\u786c\u5316\u5264"}, {"location": [[383, 335], [454, 335], [454, 358], [383, 358]], "type": "textline", "text": "\u786c\u5316\u52641", "confidence_by_character": [0.9785639047622681, 0.9359536170959473, 0.9465298056602478, 0.911139965057373], "confidence_by_field": 0.911139965057373, "original_text": "\u786c\u5316\u52641"}, {"location": [[385, 305], [510, 305], [510, 325], [385, 325]], "type": "textline", "text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81022", "confidence_by_character": [0.9240000247955322, 0.9118557572364807, 0.8709841966629028, 0.898737370967865, 0.9585683345794678, 0.9542934894561768, 0.9125517010688782], "confidence_by_field": 0.8709841966629028, "original_text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81022"}, {"location": [[385, 274], [509, 274], [509, 294], [385, 294]], "type": "textline", "text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81021", "confidence_by_character": [0.9230291247367859, 0.913360595703125, 0.8715535402297974, 0.9017773270606995, 0.9544979333877563, 0.9605312943458557, 0.9045484662055969], "confidence_by_field": 0.8715535402297974, "original_text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81021"}, {"location": [[601, 240], [644, 240], [644, 265], [601, 265]], "type": "textline", "text": "\u5358\u4f4d", "confidence_by_character": [0.9253715872764587, 0.9197276830673218], "confidence_by_field": 0.9197276830673218, "original_text": "\u5358\u4f4d"}, {"location": [[675, 242], [752, 242], [752, 264], [675, 264]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b1", "confidence_by_character": [0.9467881917953491, 
0.9383226037025452, 0.7930335402488708, 0.7423544526100159], "confidence_by_field": 0.7423544526100159, "original_text": "\u5b9f\u65bd\u4f8b1"}, {"location": [[856, 242], [933, 242], [933, 264], [856, 264]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b3", "confidence_by_character": [0.9493170976638794, 0.9393699169158936, 0.7815940976142883, 0.8886772990226746], "confidence_by_field": 0.7815940976142883, "original_text": "\u5b9f\u65bd\u4f8b3"}, {"location": [[1232, 272], [1280, 272], [1280, 297], [1232, 297]], "type": "textline", "text": "10.2", "confidence_by_character": [0.9169086217880249, 0.9247543811798096, 0.9242917895317078, 0.9190678596496582], "confidence_by_field": 0.9169086217880249, "original_text": "10.2"}, {"location": [[709, 308], [722, 308], [722, 324], [709, 324]], "type": "textline", "text": "-", "confidence_by_character": [0.8763231635093689], "confidence_by_field": 0.8763231635093689, "original_text": "-"}, {"location": [[788, 334], [825, 334], [825, 359], [788, 359]], "type": "textline", "text": "6.6", "confidence_by_character": [0.9220069050788879, 0.9281726479530334, 0.9160889387130737], "confidence_by_field": 0.9160889387130737, "original_text": "6.6"}, {"location": [[799, 308], [812, 308], [812, 324], [799, 324]], "type": "textline", "text": "-", "confidence_by_character": [0.888385534286499], "confidence_by_field": 0.888385534286499, "original_text": "-"}, {"location": [[873, 273], [919, 273], [919, 297], [873, 297]], "type": "textline", "text": "12.4", "confidence_by_character": [0.912204921245575, 0.9198381900787354, 0.9228084087371826, 0.9204709529876709], "confidence_by_field": 0.912204921245575, "original_text": "12.4"}, {"location": [[949, 242], [1022, 242], [1022, 264], [949, 264]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b1", "confidence_by_character": [0.9624030590057373, 0.9464587569236755, 0.7610214948654175, 0.9083214998245239], "confidence_by_field": 0.7610214948654175, "original_text": "\u6bd4\u8f03\u4f8b1"}, 
{"location": [[1040, 242], [1114, 242], [1114, 264], [1040, 264]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b2", "confidence_by_character": [0.9574549794197083, 0.95237135887146, 0.7252858281135559, 0.9158275127410889], "confidence_by_field": 0.7252858281135559, "original_text": "\u6bd4\u8f03\u4f8b2"}, {"location": [[1219, 242], [1293, 242], [1293, 264], [1219, 264]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b4", "confidence_by_character": [0.9668106436729431, 0.9518352746963501, 0.792457640171051, 0.91578209400177], "confidence_by_field": 0.792457640171051, "original_text": "\u6bd4\u8f03\u4f8b4"}, {"location": [[886, 307], [904, 307], [904, 324], [886, 324]], "type": "textline", "text": "-", "confidence_by_character": [0.8876243233680725], "confidence_by_field": 0.8876243233680725, "original_text": "-"}, {"location": [[980, 310], [993, 310], [993, 324], [980, 324]], "type": "textline", "text": "-", "confidence_by_character": [0.8970376253128052], "confidence_by_field": 0.8970376253128052, "original_text": "-"}, {"location": [[1065, 306], [1087, 306], [1087, 328], [1065, 328]], "type": "textline", "text": "-", "confidence_by_character": [0.8992383480072021], "confidence_by_field": 0.8992383480072021, "original_text": "-"}, {"location": [[1249, 310], [1262, 310], [1262, 324], [1249, 324]], "type": "textline", "text": "-", "confidence_by_character": [0.8871487379074097], "confidence_by_field": 0.8871487379074097, "original_text": "-"}, {"location": [[237, 382], [333, 382], [333, 401], [237, 401]], "type": "textline", "text": "\u7121\u6a5f\u5145\u586b\u6750", "confidence_by_character": [0.952167809009552, 0.9192871451377869, 0.9678506255149841, 0.9592251181602478, 0.9547764658927917], "confidence_by_field": 0.9192871451377869, "original_text": "\u7121\u6a5f\u5145\u586b\u6750"}, {"location": [[384, 367], [490, 367], [490, 386], [384, 386]], "type": "textline", "text": "\u7121\u6a5f\u5145\u586b\u52641", "confidence_by_character": [0.9493359923362732, 
0.9333418011665344, 0.9685581922531128, 0.9741933941841125, 0.9563856720924377, 0.9097828269004822], "confidence_by_field": 0.9097828269004822, "original_text": "\u7121\u6a5f\u5145\u586b\u52641"}, {"location": [[697, 334], [734, 334], [734, 358], [697, 358]], "type": "textline", "text": "4.8", "confidence_by_character": [0.9166890978813171, 0.9267215132713318, 0.9219436645507812], "confidence_by_field": 0.9166890978813171, "original_text": "4.8"}, {"location": [[880, 337], [914, 337], [914, 359], [880, 359]], "type": "textline", "text": "6.6", "confidence_by_character": [0.9212806820869446, 0.9280492067337036, 0.917919933795929], "confidence_by_field": 0.917919933795929, "original_text": "6.6"}, {"location": [[701, 396], [733, 396], [733, 421], [701, 421]], "type": "textline", "text": "10", "confidence_by_character": [0.9160857200622559, 0.9220792651176453], "confidence_by_field": 0.9160857200622559, "original_text": "10"}, {"location": [[701, 364], [733, 364], [733, 389], [701, 389]], "type": "textline", "text": "74", "confidence_by_character": [0.9179115295410156, 0.9187763333320618], "confidence_by_field": 0.9179115295410156, "original_text": "74"}, {"location": [[793, 365], [820, 365], [820, 388], [793, 388]], "type": "textline", "text": "70", "confidence_by_character": [0.9186102151870728, 0.9202508330345154], "confidence_by_field": 0.9186102151870728, "original_text": "70"}, {"location": [[883, 367], [910, 367], [910, 388], [883, 388]], "type": "textline", "text": "70", "confidence_by_character": [0.9183272123336792, 0.920913577079773], "confidence_by_field": 0.9183272123336792, "original_text": "70"}, {"location": [[969, 334], [1006, 334], [1006, 359], [969, 359]], "type": "textline", "text": "4.8", "confidence_by_character": [0.9184820652008057, 0.9289669990539551, 0.9193996787071228], "confidence_by_field": 0.9184820652008057, "original_text": "4.8"}, {"location": [[970, 364], [1002, 364], [1002, 389], [970, 389]], "type": "textline", "text": "74", 
"confidence_by_character": [0.9170815348625183, 0.9173583984375], "confidence_by_field": 0.9170815348625183, "original_text": "74"}, {"location": [[1059, 334], [1094, 334], [1094, 359], [1059, 359]], "type": "textline", "text": "6.6", "confidence_by_character": [0.9224190711975098, 0.9298495650291443, 0.9167705178260803], "confidence_by_field": 0.9167705178260803, "original_text": "6.6"}, {"location": [[1143, 303], [1190, 303], [1190, 327], [1143, 327]], "type": "textline", "text": "14.4", "confidence_by_character": [0.9120434522628784, 0.9212027788162231, 0.9268625974655151, 0.9202296137809753], "confidence_by_field": 0.9120434522628784, "original_text": "14.4"}, {"location": [[1061, 364], [1093, 364], [1093, 391], [1061, 391]], "type": "textline", "text": "70", "confidence_by_character": [0.9193744659423828, 0.9198716878890991], "confidence_by_field": 0.9193744659423828, "original_text": "70"}, {"location": [[1148, 334], [1185, 334], [1185, 359], [1148, 359]], "type": "textline", "text": "7.6", "confidence_by_character": [0.9237031936645508, 0.922902524471283, 0.9160147905349731], "confidence_by_field": 0.9160147905349731, "original_text": "7.6"}, {"location": [[1149, 364], [1182, 364], [1182, 389], [1149, 389]], "type": "textline", "text": "67", "confidence_by_character": [0.9192152619361877, 0.9226703643798828], "confidence_by_field": 0.9192152619361877, "original_text": "67"}, {"location": [[1238, 334], [1275, 334], [1275, 358], [1238, 358]], "type": "textline", "text": "4.8", "confidence_by_character": [0.9171125292778015, 0.9256289005279541, 0.9216449856758118], "confidence_by_field": 0.9171125292778015, "original_text": "4.8"}, {"location": [[1243, 365], [1270, 365], [1270, 388], [1243, 388]], "type": "textline", "text": "74", "confidence_by_character": [0.9173100590705872, 0.9199501872062683], "confidence_by_field": 0.9173100590705872, "original_text": "74"}, {"location": [[384, 429], [495, 429], [495, 448], [384, 448]], "type": "textline", "text": 
"\u786c\u5316\u4fc3\u9032\u52641", "confidence_by_character": [0.9833092093467712, 0.9183294773101807, 0.8089151382446289, 0.9589380621910095, 0.9683259725570679, 0.9022980332374573], "confidence_by_field": 0.8089151382446289, "original_text": "\u786c\u5316\u4fc3\u9032\u52641"}, {"location": [[236, 520], [295, 520], [295, 542], [236, 542]], "type": "textline", "text": "\u7740\u8272\u5264", "confidence_by_character": [0.9404123425483704, 0.9459457397460938, 0.9636843204498291], "confidence_by_field": 0.9404123425483704, "original_text": "\u7740\u8272\u5264"}, {"location": [[236, 489], [295, 489], [295, 512], [236, 512]], "type": "textline", "text": "\u96e2\u578b\u5264", "confidence_by_character": [0.9581624865531921, 0.945020318031311, 0.9589981436729431], "confidence_by_field": 0.945020318031311, "original_text": "\u96e2\u578b\u5264"}, {"location": [[237, 459], [371, 459], [371, 480], [237, 480]], "type": "textline", "text": "\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u5264", "confidence_by_character": [0.9071572422981262, 0.9223423600196838, 0.9463334083557129, 0.9168652892112732, 0.921983003616333, 0.9373961091041565, 0.9660329818725586], "confidence_by_field": 0.9071572422981262, "original_text": "\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u5264"}, {"location": [[237, 429], [333, 429], [333, 448], [237, 448]], "type": "textline", "text": "\u786c\u5316\u4fc3\u9032\u5264", "confidence_by_character": [0.9770804643630981, 0.9250267744064331, 0.831341564655304, 0.9542734026908875, 0.9714828729629517], "confidence_by_field": 0.831341564655304, "original_text": "\u786c\u5316\u4fc3\u9032\u5264"}, {"location": [[384, 398], [491, 398], [491, 417], [384, 417]], "type": "textline", "text": "\u7121\u6a5f\u5145\u586b\u52642", "confidence_by_character": [0.9506935477256775, 0.931281566619873, 0.9721326231956482, 0.9734131693840027, 0.9598016738891602, 0.915614664554596], "confidence_by_field": 0.915614664554596, "original_text": "\u7121\u6a5f\u5145\u586b\u52642"}, {"location": [[592, 
410], [652, 410], [652, 434], [592, 434]], "type": "textline", "text": "\u8cea\u91cf%", "confidence_by_character": [0.9058261513710022, 0.9396486282348633, 0.9175691604614258], "confidence_by_field": 0.9058261513710022, "original_text": "\u8cea\u91cf%"}, {"location": [[883, 397], [910, 397], [910, 419], [883, 419]], "type": "textline", "text": "10", "confidence_by_character": [0.9157432913780212, 0.9219378232955933], "confidence_by_field": 0.9157432913780212, "original_text": "10"}, {"location": [[970, 396], [1002, 396], [1002, 421], [970, 421]], "type": "textline", "text": "10", "confidence_by_character": [0.9164617657661438, 0.9216349720954895], "confidence_by_field": 0.9164617657661438, "original_text": "10"}, {"location": [[1061, 396], [1093, 396], [1093, 421], [1061, 421]], "type": "textline", "text": "10", "confidence_by_character": [0.9159533977508545, 0.9209455251693726], "confidence_by_field": 0.9159533977508545, "original_text": "10"}, {"location": [[1151, 396], [1183, 396], [1183, 421], [1151, 421]], "type": "textline", "text": "10", "confidence_by_character": [0.9155728816986084, 0.9205871224403381], "confidence_by_field": 0.9155728816986084, "original_text": "10"}, {"location": [[1243, 397], [1270, 397], [1270, 419], [1243, 419]], "type": "textline", "text": "10", "confidence_by_character": [0.9161996841430664, 0.9221776127815247], "confidence_by_field": 0.9161996841430664, "original_text": "10"}, {"location": [[385, 459], [532, 459], [532, 479], [385, 479]], "type": "textline", "text": "\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u52641", "confidence_by_character": [0.906684398651123, 0.9242702126502991, 0.9463043808937073, 0.9182814955711365, 0.9192276000976562, 0.9397355914115906, 0.958014190196991, 0.9101974368095398], "confidence_by_field": 0.906684398651123, "original_text": "\u30ab\u30c3\u30d7\u30ea\u30f3\u30b0\u52641"}, {"location": [[384, 520], [457, 520], [457, 542], [384, 542]], "type": "textline", "text": "\u7740\u8272\u52641", 
"confidence_by_character": [0.9345429539680481, 0.9415675401687622, 0.9616525769233704, 0.9126831889152527], "confidence_by_field": 0.9126831889152527, "original_text": "\u7740\u8272\u52641"}, {"location": [[384, 491], [457, 491], [457, 510], [384, 510]], "type": "textline", "text": "\u96e2\u578b\u52641", "confidence_by_character": [0.938474178314209, 0.9532917737960815, 0.9655312895774841, 0.9134681224822998], "confidence_by_field": 0.9134681224822998, "original_text": "\u96e2\u578b\u52641"}, {"location": [[697, 458], [734, 458], [734, 483], [697, 483]], "type": "textline", "text": "0.2", "confidence_by_character": [0.918220579624176, 0.9233868718147278, 0.9168131947517395], "confidence_by_field": 0.9168131947517395, "original_text": "0.2"}, {"location": [[697, 427], [734, 427], [734, 451], [697, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9187952280044556, 0.9251962304115295, 0.9168661236763], "confidence_by_field": 0.9168661236763, "original_text": "0.2"}, {"location": [[878, 427], [915, 427], [915, 451], [878, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9180716276168823, 0.9220828413963318, 0.9168251156806946], "confidence_by_field": 0.9168251156806946, "original_text": "0.2"}, {"location": [[697, 520], [734, 520], [734, 543], [697, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9189480543136597, 0.9261084794998169, 0.9156019687652588], "confidence_by_field": 0.9156019687652588, "original_text": "0.4"}, {"location": [[699, 488], [734, 488], [734, 513], [699, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9182897210121155, 0.9248169660568237, 0.9167609214782715], "confidence_by_field": 0.9167609214782715, "original_text": "0.2"}, {"location": [[788, 456], [825, 456], [825, 481], [788, 481]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9177680015563965, 0.9243178963661194, 0.9165362119674683], "confidence_by_field": 0.9165362119674683, 
"original_text": "0.2"}, {"location": [[788, 427], [825, 427], [825, 451], [788, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9185940027236938, 0.922125518321991, 0.9162130951881409], "confidence_by_field": 0.9162130951881409, "original_text": "0.2"}, {"location": [[793, 397], [820, 397], [820, 419], [793, 419]], "type": "textline", "text": "10", "confidence_by_character": [0.9177662134170532, 0.9242537021636963], "confidence_by_field": 0.9177662134170532, "original_text": "10"}, {"location": [[789, 488], [825, 488], [825, 513], [789, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9175012111663818, 0.9248098731040955, 0.9154093265533447], "confidence_by_field": 0.9154093265533447, "original_text": "0.2"}, {"location": [[878, 458], [914, 458], [914, 483], [878, 483]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9184542298316956, 0.9236595034599304, 0.9165345430374146], "confidence_by_field": 0.9165345430374146, "original_text": "0.2"}, {"location": [[969, 456], [1006, 456], [1006, 481], [969, 481]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9164131879806519, 0.9254051446914673, 0.9154728651046753], "confidence_by_field": 0.9154728651046753, "original_text": "0.2"}, {"location": [[969, 426], [1006, 426], [1006, 451], [969, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9165401458740234, 0.9244502782821655, 0.9159473776817322], "confidence_by_field": 0.9159473776817322, "original_text": "0.2"}, {"location": [[1057, 456], [1094, 456], [1094, 481], [1057, 481]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9178974628448486, 0.9235621690750122, 0.9160559177398682], "confidence_by_field": 0.9160559177398682, "original_text": "0.2"}, {"location": [[1057, 426], [1094, 426], [1094, 451], [1057, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9180322885513306, 0.921731173992157, 0.9166355133056641], 
"confidence_by_field": 0.9166355133056641, "original_text": "0.2"}, {"location": [[1148, 456], [1185, 456], [1185, 483], [1148, 483]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9164595603942871, 0.9242724180221558, 0.9157667756080627], "confidence_by_field": 0.9157667756080627, "original_text": "0.2"}, {"location": [[1148, 426], [1185, 426], [1185, 451], [1148, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9180106520652771, 0.9206836223602295, 0.9163994789123535], "confidence_by_field": 0.9163994789123535, "original_text": "0.2"}, {"location": [[1238, 456], [1274, 456], [1274, 481], [1238, 481]], "type": "textline", "text": "0.2", "confidence_by_character": [0.917959451675415, 0.9236196279525757, 0.916109561920166], "confidence_by_field": 0.916109561920166, "original_text": "0.2"}, {"location": [[1238, 426], [1275, 426], [1275, 451], [1238, 451]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9176772832870483, 0.9209798574447632, 0.9162566065788269], "confidence_by_field": 0.9162566065788269, "original_text": "0.2"}, {"location": [[791, 521], [825, 521], [825, 543], [791, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9188475012779236, 0.9243553280830383, 0.9138782024383545], "confidence_by_field": 0.9138782024383545, "original_text": "0.4"}, {"location": [[878, 518], [914, 518], [914, 543], [878, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9186568856239319, 0.924356997013092, 0.9140645265579224], "confidence_by_field": 0.9140645265579224, "original_text": "0.4"}, {"location": [[878, 488], [914, 488], [914, 513], [878, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9191107749938965, 0.9236589670181274, 0.9163731336593628], "confidence_by_field": 0.9163731336593628, "original_text": "0.2"}, {"location": [[969, 518], [1006, 518], [1006, 543], [969, 543]], "type": "textline", "text": "0.4", "confidence_by_character": 
[0.9177034497261047, 0.9257544279098511, 0.911573052406311], "confidence_by_field": 0.911573052406311, "original_text": "0.4"}, {"location": [[969, 488], [1006, 488], [1006, 513], [969, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9173431992530823, 0.9257070422172546, 0.9158886075019836], "confidence_by_field": 0.9158886075019836, "original_text": "0.2"}, {"location": [[1057, 488], [1094, 488], [1094, 513], [1057, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9179250001907349, 0.9226782917976379, 0.9163774251937866], "confidence_by_field": 0.9163774251937866, "original_text": "0.2"}, {"location": [[1059, 518], [1094, 518], [1094, 543], [1059, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9183651208877563, 0.9259626865386963, 0.9120912551879883], "confidence_by_field": 0.9120912551879883, "original_text": "0.4"}, {"location": [[1148, 488], [1185, 488], [1185, 513], [1148, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9188930988311768, 0.922532856464386, 0.91632080078125], "confidence_by_field": 0.91632080078125, "original_text": "0.2"}, {"location": [[1149, 518], [1185, 518], [1185, 543], [1149, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9181385040283203, 0.9250556230545044, 0.9124614000320435], "confidence_by_field": 0.9124614000320435, "original_text": "0.4"}, {"location": [[1238, 518], [1274, 518], [1274, 543], [1238, 543]], "type": "textline", "text": "0.4", "confidence_by_character": [0.9189369678497314, 0.924277663230896, 0.9133647680282593], "confidence_by_field": 0.9133647680282593, "original_text": "0.4"}, {"location": [[1238, 488], [1274, 488], [1274, 513], [1238, 513]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9190401434898376, 0.9235551357269287, 0.916479766368866], "confidence_by_field": 0.916479766368866, "original_text": "0.2"}, {"location": [[383, 551], [423, 551], [423, 571], [383, 571]], 
"type": "textline", "text": "\u5408\u8a08", "confidence_by_character": [0.9172796607017517, 0.9246873259544373], "confidence_by_field": 0.9172796607017517, "original_text": "\u5408\u8a08"}, {"location": [[696, 550], [736, 550], [736, 574], [696, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9186745285987854, 0.9217865467071533, 0.9134313464164734], "confidence_by_field": 0.9134313464164734, "original_text": "100"}, {"location": [[786, 548], [826, 548], [826, 574], [786, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9187353849411011, 0.9226377010345459, 0.9132767915725708], "confidence_by_field": 0.9132767915725708, "original_text": "100"}, {"location": [[877, 548], [917, 548], [917, 574], [877, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.918876051902771, 0.9223041534423828, 0.9158591628074646], "confidence_by_field": 0.9158591628074646, "original_text": "100"}, {"location": [[965, 548], [1007, 548], [1007, 574], [965, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9187213182449341, 0.9187850952148438, 0.9130398035049438], "confidence_by_field": 0.9130398035049438, "original_text": "100"}, {"location": [[1056, 548], [1096, 548], [1096, 574], [1056, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9191818237304688, 0.9226840734481812, 0.9121555089950562], "confidence_by_field": 0.9121555089950562, "original_text": "100"}, {"location": [[1146, 548], [1186, 548], [1186, 574], [1146, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9189720749855042, 0.9227191805839539, 0.9106909036636353], "confidence_by_field": 0.9106909036636353, "original_text": "100"}, {"location": [[1237, 550], [1277, 550], [1277, 574], [1237, 574]], "type": "textline", "text": "100", "confidence_by_character": [0.9188445806503296, 0.9203974008560181, 0.9133830666542053], "confidence_by_field": 0.9133830666542053, "original_text": "100"}, 
{"location": [[797, 763], [828, 763], [828, 793], [797, 793]], "type": "textline", "text": "10", "confidence_by_character": [0.9181389808654785, 0.9251900911331177], "confidence_by_field": 0.9181389808654785, "original_text": "10"}, {"location": [[862, 736], [938, 736], [938, 758], [862, 758]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b3", "confidence_by_character": [0.940096378326416, 0.9323399066925049, 0.7783450484275818, 0.8894999623298645], "confidence_by_field": 0.7783450484275818, "original_text": "\u5b9f\u65bd\u4f8b3"}, {"location": [[886, 763], [919, 763], [919, 791], [886, 791]], "type": "textline", "text": "10", "confidence_by_character": [0.915480375289917, 0.92236328125], "confidence_by_field": 0.915480375289917, "original_text": "10"}, {"location": [[1225, 736], [1299, 736], [1299, 758], [1225, 758]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b4", "confidence_by_character": [0.9564768671989441, 0.9437181949615479, 0.8032115697860718, 0.9161063432693481], "confidence_by_field": 0.8032115697860718, "original_text": "\u6bd4\u8f03\u4f8b4"}, {"location": [[243, 831], [358, 831], [358, 851], [243, 851]], "type": "textline", "text": "\u71b1\u786c\u5316\u6027\u6a39\u8102", "confidence_by_character": [0.9673244953155518, 0.9746560454368591, 0.9246954917907715, 0.9313607811927795, 0.9588099122047424, 0.9608355164527893], "confidence_by_field": 0.9246954917907715, "original_text": "\u71b1\u786c\u5316\u6027\u6a39\u8102"}, {"location": [[391, 864], [534, 864], [534, 883], [391, 883]], "type": "textline", "text": "\u30b7\u30a2\u30cd\u30fc\u30c8\u6a39\u81021", "confidence_by_character": [0.9139906764030457, 0.9277936220169067, 0.9299982190132141, 0.9196512699127197, 0.9228595495223999, 0.9653444886207581, 0.9586682319641113, 0.9014871120452881], "confidence_by_field": 0.9014871120452881, "original_text": "\u30b7\u30a2\u30cd\u30fc\u30c8\u6a39\u81021"}, {"location": [[391, 832], [515, 832], [515, 852], [391, 852]], "type": "textline", "text": 
"\u30a8\u30dd\u30ad\u30b7\u6a39\u81023", "confidence_by_character": [0.9210723042488098, 0.9120734930038452, 0.8719823360443115, 0.9054407477378845, 0.9484961628913879, 0.955137312412262, 0.9074426889419556], "confidence_by_field": 0.8719823360443115, "original_text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81023"}, {"location": [[391, 801], [515, 801], [515, 821], [391, 821]], "type": "textline", "text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81022", "confidence_by_character": [0.9195655584335327, 0.8951327800750732, 0.8749004602432251, 0.9026333689689636, 0.9528785347938538, 0.9567875266075134, 0.9132635593414307], "confidence_by_field": 0.8749004602432251, "original_text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81022"}, {"location": [[391, 769], [512, 769], [512, 789], [391, 789]], "type": "textline", "text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81021", "confidence_by_character": [0.9198406338691711, 0.9114395380020142, 0.8742417097091675, 0.9063976407051086, 0.946216344833374, 0.9661325216293335, 0.8822324275970459], "confidence_by_field": 0.8742417097091675, "original_text": "\u30a8\u30dd\u30ad\u30b7\u6a39\u81021"}, {"location": [[605, 734], [649, 734], [649, 760], [605, 760]], "type": "textline", "text": "\u5358\u4f4d", "confidence_by_character": [0.9275341629981995, 0.9182473421096802], "confidence_by_field": 0.9182473421096802, "original_text": "\u5358\u4f4d"}, {"location": [[681, 736], [759, 736], [759, 760], [681, 760]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b1", "confidence_by_character": [0.9462378621101379, 0.9295192360877991, 0.7775577306747437, 0.7519926428794861], "confidence_by_field": 0.7519926428794861, "original_text": "\u5b9f\u65bd\u4f8b1"}, {"location": [[772, 736], [847, 736], [847, 760], [772, 760]], "type": "textline", "text": "\u5b9f\u65bd\u4f8b2", "confidence_by_character": [0.945273756980896, 0.9234545826911926, 0.7456976771354675, 0.8920645117759705], "confidence_by_field": 0.7456976771354675, "original_text": "\u5b9f\u65bd\u4f8b2"}, {"location": 
[[956, 736], [1027, 736], [1027, 758], [956, 758]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b1", "confidence_by_character": [0.9605406522750854, 0.9484114050865173, 0.8022181391716003, 0.9111186861991882], "confidence_by_field": 0.8022181391716003, "original_text": "\u6bd4\u8f03\u4f8b1"}, {"location": [[1046, 736], [1119, 736], [1119, 758], [1046, 758]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b2", "confidence_by_character": [0.939124345779419, 0.9446831941604614, 0.7813613414764404, 0.9183083176612854], "confidence_by_field": 0.7813613414764404, "original_text": "\u6bd4\u8f03\u4f8b2"}, {"location": [[1136, 736], [1209, 736], [1209, 758], [1136, 758]], "type": "textline", "text": "\u6bd4\u8f03\u4f8b3", "confidence_by_character": [0.9410742521286011, 0.9363664984703064, 0.8050448894500732, 0.912356972694397], "confidence_by_field": 0.8050448894500732, "original_text": "\u6bd4\u8f03\u4f8b3"}, {"location": [[710, 799], [731, 799], [731, 823], [710, 823]], "type": "textline", "text": "5", "confidence_by_character": [0.9124702215194702], "confidence_by_field": 0.9124702215194702, "original_text": "5"}, {"location": [[1161, 798], [1183, 798], [1183, 823], [1161, 823]], "type": "textline", "text": "5", "confidence_by_character": [0.9127680659294128], "confidence_by_field": 0.9127680659294128, "original_text": "5"}, {"location": [[612, 874], [641, 874], [641, 901], [612, 901]], "type": "textline", "text": "%", "confidence_by_character": [0.9244439005851746], "confidence_by_field": 0.9244439005851746, "original_text": "%"}, {"location": [[705, 861], [738, 861], [738, 887], [705, 887]], "type": "textline", "text": "10", "confidence_by_character": [0.9159777164459229, 0.9210568070411682], "confidence_by_field": 0.9159777164459229, "original_text": "10"}, {"location": [[710, 829], [731, 829], [731, 853], [710, 853]], "type": "textline", "text": "5", "confidence_by_character": [0.9128246903419495], "confidence_by_field": 0.9128246903419495, "original_text": "5"}, 
{"location": [[710, 895], [731, 895], [731, 917], [710, 917]], "type": "textline", "text": "5", "confidence_by_character": [0.913021445274353], "confidence_by_field": 0.913021445274353, "original_text": "5"}, {"location": [[797, 863], [825, 863], [825, 885], [797, 885]], "type": "textline", "text": "10", "confidence_by_character": [0.9169679284095764, 0.9230868220329285], "confidence_by_field": 0.9169679284095764, "original_text": "10"}, {"location": [[804, 834], [820, 834], [820, 852], [804, 852]], "type": "textline", "text": "-", "confidence_by_character": [0.9069211483001709], "confidence_by_field": 0.9069211483001709, "original_text": "-"}, {"location": [[804, 804], [820, 804], [820, 822], [804, 822]], "type": "textline", "text": "-", "confidence_by_character": [0.9056602120399475], "confidence_by_field": 0.9056602120399475, "original_text": "-"}, {"location": [[894, 834], [909, 834], [909, 850], [894, 850]], "type": "textline", "text": "-", "confidence_by_character": [0.9042128920555115], "confidence_by_field": 0.9042128920555115, "original_text": "-"}, {"location": [[894, 806], [909, 806], [909, 822], [894, 822]], "type": "textline", "text": "-", "confidence_by_character": [0.9011750817298889], "confidence_by_field": 0.9011750817298889, "original_text": "-"}, {"location": [[1251, 798], [1272, 798], [1272, 823], [1251, 823]], "type": "textline", "text": "5", "confidence_by_character": [0.9124239683151245], "confidence_by_field": 0.9124239683151245, "original_text": "5"}, {"location": [[801, 895], [822, 895], [822, 917], [801, 917]], "type": "textline", "text": "5", "confidence_by_character": [0.9138578176498413], "confidence_by_field": 0.9138578176498413, "original_text": "5"}, {"location": [[888, 863], [915, 863], [915, 885], [888, 885]], "type": "textline", "text": "10", "confidence_by_character": [0.916037380695343, 0.9229444265365601], "confidence_by_field": 0.916037380695343, "original_text": "10"}, {"location": [[1161, 829], [1183, 829], [1183, 853], 
[1161, 853]], "type": "textline", "text": "5", "confidence_by_character": [0.913336455821991], "confidence_by_field": 0.913336455821991, "original_text": "5"}, {"location": [[891, 893], [912, 893], [912, 917], [891, 917]], "type": "textline", "text": "5", "confidence_by_character": [0.9128530025482178], "confidence_by_field": 0.9128530025482178, "original_text": "5"}, {"location": [[1156, 860], [1190, 860], [1190, 887], [1156, 887]], "type": "textline", "text": "10", "confidence_by_character": [0.9161958694458008, 0.922942042350769], "confidence_by_field": 0.9161958694458008, "original_text": "10"}, {"location": [[1157, 889], [1185, 889], [1185, 920], [1157, 920]], "type": "textline", "text": "5", "confidence_by_character": [0.9128023386001587], "confidence_by_field": 0.9128023386001587, "original_text": "5"}, {"location": [[1246, 860], [1278, 860], [1278, 887], [1246, 887]], "type": "textline", "text": "10", "confidence_by_character": [0.9154711961746216, 0.9222509860992432], "confidence_by_field": 0.9154711961746216, "original_text": "10"}, {"location": [[1249, 829], [1272, 829], [1272, 853], [1249, 853]], "type": "textline", "text": "5", "confidence_by_character": [0.9134175777435303], "confidence_by_field": 0.9134175777435303, "original_text": "5"}, {"location": [[1251, 893], [1272, 893], [1272, 918], [1251, 918]], "type": "textline", "text": "5", "confidence_by_character": [0.912886381149292], "confidence_by_field": 0.912886381149292, "original_text": "5"}, {"location": [[242, 958], [338, 958], [338, 978], [242, 978]], "type": "textline", "text": "\u7121\u6a5f\u5145\u586b\u6750", "confidence_by_character": [0.9516746997833252, 0.9333645105361938, 0.9753089547157288, 0.9772308468818665, 0.961030125617981], "confidence_by_field": 0.9333645105361938, "original_text": "\u7121\u6a5f\u5145\u586b\u6750"}, {"location": [[242, 927], [320, 927], [320, 946], [242, 946]], "type": "textline", "text": "\u786c\u5316\u89e6\u5a92", "confidence_by_character": 
[0.9817054271697998, 0.9334205985069275, 0.9472013115882874, 0.9651457667350769], "confidence_by_field": 0.9334205985069275, "original_text": "\u786c\u5316\u89e6\u5a92"}, {"location": [[392, 895], [571, 895], [571, 915], [392, 915]], "type": "textline", "text": "\u30d5\u30a7\u30ce\u30fc\u30eb\u7cfb\u786c\u5316\u52641", "confidence_by_character": [0.8958667516708374, 0.8963339328765869, 0.918157160282135, 0.9283046126365662, 0.9119880199432373, 0.9494227766990662, 0.9859360456466675, 0.9373766779899597, 0.948577344417572, 0.9160327911376953], "confidence_by_field": 0.8958667516708374, "original_text": "\u30d5\u30a7\u30ce\u30fc\u30eb\u7cfb\u786c\u5316\u52641"}, {"location": [[390, 958], [496, 958], [496, 978], [390, 978]], "type": "textline", "text": "\u7121\u6a5f\u5145\u586b\u67501", "confidence_by_character": [0.955777108669281, 0.9075387120246887, 0.9792174100875854, 0.9780659079551697, 0.9546536803245544, 0.9078008532524109], "confidence_by_field": 0.9075387120246887, "original_text": "\u7121\u6a5f\u5145\u586b\u67501"}, {"location": [[390, 927], [477, 927], [477, 946], [390, 946]], "type": "textline", "text": "\u786c\u5316\u89e6\u5a921", "confidence_by_character": [0.9808465838432312, 0.9380431771278381, 0.9594202041625977, 0.9618983864784241, 0.9127021431922913], "confidence_by_field": 0.9127021431922913, "original_text": "\u786c\u5316\u89e6\u5a921"}, {"location": [[705, 926], [739, 926], [739, 949], [705, 949]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9183443188667297, 0.9227449893951416, 0.9166637063026428], "confidence_by_field": 0.9166637063026428, "original_text": "0.2"}, {"location": [[959, 939], [1025, 939], [1025, 963], [959, 963]], "type": "textline", "text": "(\u30bb\u30e9\u30df\u30c3\u30af)", "confidence_by_character": [0.916853666305542, 0.9260740876197815, 0.9092920422554016, 0.9257704615592957, 0.9260854721069336, 0.9179700613021851, 0.9244228005409241], "confidence_by_field": 0.9092920422554016, "original_text": 
"(\u30bb\u30e9\u30df\u30c3\u30af)"}, {"location": [[1049, 939], [1115, 939], [1115, 963], [1049, 963]], "type": "textline", "text": "(\u30bb\u30e9\u30df\u30c3\u30af)", "confidence_by_character": [0.9169306755065918, 0.9260130524635315, 0.9072588682174683, 0.9243903160095215, 0.925151526927948, 0.9165183901786804, 0.9235830307006836], "confidence_by_field": 0.9072588682174683, "original_text": "(\u30bb\u30e9\u30df\u30c3\u30af)"}, {"location": [[273, 1052], [489, 1052], [489, 1071], [273, 1071]], "type": "textline", "text": "L125D-L125U", "confidence_by_character": [0.9245724678039551, 0.9132282733917236, 0.906276524066925, 0.9224117398262024, 0.9258933067321777, 0.9184830784797668, 0.9157562255859375, 0.9087009429931641, 0.912157416343689, 0.9245545268058777, 0.8954219222068787], "confidence_by_field": 0.8954219222068787, "original_text": "L125D-L125U"}, {"location": [[277, 1020], [458, 1020], [458, 1039], [277, 1039]], "type": "textline", "text": "IL1TD-L1TU", "confidence_by_character": [0.8654385209083557, 0.9101585149765015, 0.9015868306159973, 0.8901948928833008, 0.9379502534866333, 0.9125338792800903, 0.9176797866821289, 0.8999291658401489, 0.9107672572135925, 0.9229012131690979], "confidence_by_field": 0.8654385209083557, "original_text": "IL1TD-L1TU"}, {"location": [[386, 987], [431, 987], [431, 1011], [386, 1011]], "type": "textline", "text": "\u5408\u8a08", "confidence_by_character": [0.9146950840950012, 0.9339513182640076], "confidence_by_field": 0.9146950840950012, "original_text": "\u5408\u8a08"}, {"location": [[701, 958], [743, 958], [743, 977], [701, 977]], "type": "textline", "text": "74.8", "confidence_by_character": [0.9160793423652649, 0.9184203147888184, 0.9271222949028015, 0.9212100505828857], "confidence_by_field": 0.9160793423652649, "original_text": "74.8"}, {"location": [[792, 958], [831, 958], [831, 977], [792, 977]], "type": "textline", "text": "74.8", "confidence_by_character": [0.9163533449172974, 0.9201879501342773, 0.9259200692176819, 
0.9224289059638977], "confidence_by_field": 0.9163533449172974, "original_text": "74.8"}, {"location": [[796, 926], [826, 926], [826, 945], [796, 945]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9181907176971436, 0.9227388501167297, 0.9164943099021912], "confidence_by_field": 0.9164943099021912, "original_text": "0.2"}, {"location": [[886, 926], [917, 926], [917, 945], [886, 945]], "type": "textline", "text": "0.2", "confidence_by_character": [0.917766809463501, 0.9248180985450745, 0.916694164276123], "confidence_by_field": 0.916694164276123, "original_text": "0.2"}, {"location": [[1153, 923], [1190, 923], [1190, 949], [1153, 949]], "type": "textline", "text": "0.2", "confidence_by_character": [0.9172568917274475, 0.923073947429657, 0.9173568487167358], "confidence_by_field": 0.9172568917274475, "original_text": "0.2"}, {"location": [[1243, 923], [1280, 923], [1280, 949], [1243, 949]], "type": "textline", "text": "0.2", "confidence_by_character": [0.918009340763092, 0.9230136871337891, 0.9172936081886292], "confidence_by_field": 0.9172936081886292, "original_text": "0.2"}, {"location": [[603, 1021], [649, 1021], [649, 1044], [603, 1044]], "type": "textline", "text": "ppm", "confidence_by_character": [0.93487948179245, 0.9136003255844116, 0.9295913577079773], "confidence_by_field": 0.9136003255844116, "original_text": "ppm"}, {"location": [[702, 1019], [741, 1019], [741, 1042], [702, 1042]], "type": "textline", "text": "510", "confidence_by_character": [0.9160666465759277, 0.9195601344108582, 0.9182965755462646], "confidence_by_field": 0.9160666465759277, "original_text": "510"}, {"location": [[704, 990], [739, 990], [739, 1009], [704, 1009]], "type": "textline", "text": "100", "confidence_by_character": [0.9155290126800537, 0.9208177924156189, 0.9127534031867981], "confidence_by_field": 0.9127534031867981, "original_text": "100"}, {"location": [[794, 990], [830, 990], [830, 1009], [794, 1009]], "type": "textline", "text": "100", 
"confidence_by_character": [0.9162766337394714, 0.9207051396369934, 0.9104440212249756], "confidence_by_field": 0.9104440212249756, "original_text": "100"}, {"location": [[881, 958], [922, 958], [922, 977], [881, 977]], "type": "textline", "text": "74.8", "confidence_by_character": [0.9179209470748901, 0.919108510017395, 0.9271717667579651, 0.9218475222587585], "confidence_by_field": 0.9179209470748901, "original_text": "74.8"}, {"location": [[1149, 957], [1195, 957], [1195, 980], [1149, 980]], "type": "textline", "text": "74.8", "confidence_by_character": [0.91691654920578, 0.9166283011436462, 0.9277328848838806, 0.9221088886260986], "confidence_by_field": 0.9166283011436462, "original_text": "74.8"}, {"location": [[881, 1019], [920, 1019], [920, 1042], [881, 1042]], "type": "textline", "text": "470", "confidence_by_character": [0.9185625314712524, 0.9136890769004822, 0.9183825254440308], "confidence_by_field": 0.9136890769004822, "original_text": "470"}, {"location": [[883, 990], [920, 990], [920, 1009], [883, 1009]], "type": "textline", "text": "100", "confidence_by_character": [0.9160255789756775, 0.9194254279136658, 0.9109370708465576], "confidence_by_field": 0.9109370708465576, "original_text": "100"}, {"location": [[1151, 987], [1193, 987], [1193, 1012], [1151, 1012]], "type": "textline", "text": "100", "confidence_by_character": [0.9162431955337524, 0.9209614992141724, 0.9097722768783569], "confidence_by_field": 0.9097722768783569, "original_text": "100"}, {"location": [[1240, 957], [1285, 957], [1285, 980], [1240, 980]], "type": "textline", "text": "74.8", "confidence_by_character": [0.9182584881782532, 0.92033451795578, 0.9307538270950317, 0.9210019111633301], "confidence_by_field": 0.9182584881782532, "original_text": "74.8"}, {"location": [[1241, 1019], [1282, 1019], [1282, 1042], [1241, 1042]], "type": "textline", "text": "510", "confidence_by_character": [0.914992094039917, 0.9192731976509094, 0.9183305501937866], "confidence_by_field": 
0.914992094039917, "original_text": "510"}, {"location": [[1241, 988], [1282, 988], [1282, 1012], [1241, 1012]], "type": "textline", "text": "100", "confidence_by_character": [0.9163695573806763, 0.9216129779815674, 0.9101614952087402], "confidence_by_field": 0.9101614952087402, "original_text": "100"}, {"location": [[278, 1082], [413, 1082], [413, 1103], [278, 1103]], "type": "textline", "text": "\u30ac\u30e9\u30b9\u8ee2\u79fb\u6e29\u5ea6", "confidence_by_character": [0.9343377947807312, 0.920525074005127, 0.9253968000411987, 0.9179226756095886, 0.9002148509025574, 0.9482362270355225, 0.9286996722221375], "confidence_by_field": 0.9002148509025574, "original_text": "\u30ac\u30e9\u30b9\u8ee2\u79fb\u6e29\u5ea6"}, {"location": [[420, 1082], [458, 1082], [458, 1104], [420, 1104]], "type": "textline", "text": "(Tg)", "confidence_by_character": [0.9211322665214539, 0.8978011012077332, 0.9115868210792542, 0.9274181723594666], "confidence_by_field": 0.8978011012077332, "original_text": "(Tg)"}, {"location": [[605, 1051], [650, 1051], [650, 1075], [605, 1075]], "type": "textline", "text": "ppm", "confidence_by_character": [0.9387204647064209, 0.9056726098060608, 0.9325636029243469], "confidence_by_field": 0.9056726098060608, "original_text": "ppm"}, {"location": [[617, 1082], [641, 1082], [641, 1104], [617, 1104]], "type": "textline", "text": "\u00b0C", "confidence_by_character": [0.9452391266822815, 0.9272077679634094], "confidence_by_field": 0.9272077679634094, "original_text": "\u00b0C"}, {"location": [[704, 1052], [739, 1052], [739, 1071], [704, 1071]], "type": "textline", "text": "390", "confidence_by_character": [0.9161151647567749, 0.9135255217552185, 0.9237526655197144], "confidence_by_field": 0.9135255217552185, "original_text": "390"}, {"location": [[793, 1084], [828, 1084], [828, 1103], [793, 1103]], "type": "textline", "text": "224", "confidence_by_character": [0.9199969172477722, 0.9111586213111877, 0.9230489730834961], "confidence_by_field": 
0.9111586213111877, "original_text": "224"}, {"location": [[793, 1052], [830, 1052], [830, 1071], [793, 1071]], "type": "textline", "text": "330", "confidence_by_character": [0.9186529517173767, 0.913848340511322, 0.9239091277122498], "confidence_by_field": 0.913848340511322, "original_text": "330"}, {"location": [[793, 1019], [831, 1019], [831, 1042], [793, 1042]], "type": "textline", "text": "470", "confidence_by_character": [0.9227962493896484, 0.9154515862464905, 0.9186294078826904], "confidence_by_field": 0.9154515862464905, "original_text": "470"}, {"location": [[1151, 1019], [1191, 1019], [1191, 1044], [1151, 1044]], "type": "textline", "text": "510", "confidence_by_character": [0.919104814529419, 0.9196509122848511, 0.9189945459365845], "confidence_by_field": 0.9189945459365845, "original_text": "510"}, {"location": [[881, 1081], [920, 1081], [920, 1104], [881, 1104]], "type": "textline", "text": "224", "confidence_by_character": [0.9203312993049622, 0.9107922315597534, 0.9243770241737366], "confidence_by_field": 0.9107922315597534, "original_text": "224"}, {"location": [[881, 1050], [920, 1050], [920, 1074], [881, 1074]], "type": "textline", "text": "330", "confidence_by_character": [0.9171018600463867, 0.9141280055046082, 0.9222427010536194], "confidence_by_field": 0.9141280055046082, "original_text": "330"}, {"location": [[1151, 1084], [1191, 1084], [1191, 1108], [1151, 1108]], "type": "textline", "text": "235", "confidence_by_character": [0.9159690141677856, 0.9248628616333008, 0.9205185174942017], "confidence_by_field": 0.9159690141677856, "original_text": "235"}, {"location": [[1151, 1050], [1191, 1050], [1191, 1074], [1151, 1074]], "type": "textline", "text": "390", "confidence_by_character": [0.9146880507469177, 0.911052405834198, 0.9227294325828552], "confidence_by_field": 0.911052405834198, "original_text": "390"}, {"location": [[1241, 1084], [1280, 1084], [1280, 1108], [1241, 1108]], "type": "textline", "text": "235", "confidence_by_character": 
[0.9159569144248962, 0.9244524240493774, 0.920172393321991], "confidence_by_field": 0.9159569144248962, "original_text": "235"}, {"location": [[1241, 1050], [1282, 1050], [1282, 1074], [1241, 1074]], "type": "textline", "text": "390", "confidence_by_character": [0.9140211939811707, 0.9115845561027527, 0.9224275946617126], "confidence_by_field": 0.9115845561027527, "original_text": "390"}, {"location": [[184, 338], [205, 338], [205, 508], [184, 508]], "type": "textline", "text": "\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269", "confidence_by_character": [0.9959474205970764, 0.9997753500938416, 0.9999818801879883, 0.9999709129333496, 0.9999679327011108, 0.9999550580978394, 0.9999865293502808, 0.9992927312850952, 0.9999799728393555], "confidence_by_field": 0.9959474205970764, "original_text": "\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269"}, {"location": [[189, 804], [210, 804], [210, 976], [189, 976]], "type": "textline", "text": "\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269", "confidence_by_character": [0.9931052327156067, 0.9999232292175293, 0.9999855756759644, 0.9999867677688599, 0.9999822378158569, 0.9999606609344482, 0.9999895095825195, 0.9993935823440552, 0.9371008276939392], "confidence_by_field": 0.9371008276939392, "original_text": "\u71b1\u786c\u5316\u6027\u6a39\u8102\u7d44\u6210\u7269"}], "table": [{"location": [[1218, 1080], [1303, 1080], [1303, 1107], [1218, 1107]], "bbox": [1218, 1080, 1303, 1107], "points": [[1277, 1081], [1278, 1080], [1279, 1081], [1280, 1080], [1281, 1081], [1282, 1080], [1284, 1081], [1285, 1080], [1286, 1081], [1287, 1080], [1288, 1081], [1289, 1080], [1291, 1081], [1302, 1081], [1303, 1082], [1303, 1106], [1302, 1107], [1219, 1107], [1218, 1106], [1218, 1082], [1219, 1081]], "type": "cell", "rows": [11, 11], "columns": [10, 10], "text_list": []}, {"location": [[1128, 1080], [1213, 1080], [1213, 1107], [1128, 1107]], "bbox": [1128, 1080, 1213, 1107], "points": [[1129, 1081], [1130, 1080], [1132, 1081], 
[1212, 1081], [1213, 1082], [1213, 1106], [1212, 1107], [1129, 1107], [1128, 1106], [1129, 1105]], "type": "cell", "rows": [11, 11], "columns": [9, 9], "text_list": []}, {"location": [[858, 1080], [943, 1080], [943, 1107], [858, 1107]], "bbox": [858, 1080, 943, 1107], "points": [[877, 1081], [878, 1080], [879, 1081], [880, 1080], [881, 1081], [882, 1080], [884, 1081], [885, 1080], [887, 1080], [888, 1081], [889, 1080], [934, 1080], [935, 1081], [942, 1081], [943, 1082], [943, 1106], [942, 1107], [859, 1107], [858, 1106], [858, 1082], [859, 1081]], "type": "cell", "rows": [11, 11], "columns": [6, 6], "text_list": []}, {"location": [[769, 1080], [853, 1080], [853, 1107], [769, 1107]], "bbox": [769, 1080, 853, 1107], "points": [[792, 1081], [794, 1080], [795, 1081], [806, 1081], [808, 1080], [809, 1081], [811, 1081], [812, 1080], [815, 1080], [816, 1081], [817, 1080], [838, 1080], [839, 1081], [840, 1080], [842, 1081], [852, 1081], [853, 1082], [853, 1106], [852, 1107], [770, 1107], [769, 1106], [769, 1082], [770, 1081]], "type": "cell", "rows": [11, 11], "columns": [5, 5], "text_list": []}, {"location": [[678, 1080], [763, 1080], [763, 1107], [678, 1107]], "bbox": [678, 1080, 763, 1107], "points": [[704, 1081], [705, 1080], [706, 1081], [707, 1080], [708, 1081], [709, 1080], [711, 1081], [712, 1080], [713, 1081], [714, 1080], [715, 1081], [716, 1080], [718, 1081], [719, 1080], [720, 1081], [721, 1080], [722, 1081], [723, 1080], [725, 1081], [726, 1080], [727, 1081], [762, 1081], [763, 1082], [763, 1106], [762, 1107], [679, 1107], [678, 1106], [678, 1082], [679, 1081]], "type": "cell", "rows": [11, 11], "columns": [4, 4], "text_list": []}, {"location": [[577, 1080], [674, 1080], [674, 1107], [577, 1107]], "bbox": [577, 1080, 674, 1107], "points": [[622, 1081], [623, 1080], [625, 1080], [626, 1081], [628, 1081], [629, 1080], [635, 1080], [636, 1081], [672, 1081], [673, 1082], [673, 1104], [674, 1105], [674, 1106], [673, 1107], [578, 1107], [577, 1106], [577, 1082], 
[578, 1081]], "type": "cell", "rows": [11, 11], "columns": [3, 3], "text_list": []}, {"location": [[171, 1080], [574, 1080], [574, 1107], [171, 1107]], "bbox": [171, 1080, 574, 1107], "points": [[171, 1081], [173, 1080], [174, 1081], [198, 1081], [199, 1080], [201, 1081], [202, 1080], [221, 1080], [222, 1081], [223, 1080], [224, 1081], [225, 1080], [226, 1081], [285, 1081], [286, 1080], [288, 1080], [290, 1081], [292, 1081], [293, 1080], [297, 1080], [298, 1081], [299, 1081], [300, 1080], [312, 1080], [313, 1081], [318, 1081], [319, 1080], [320, 1081], [321, 1080], [370, 1080], [371, 1081], [374, 1081], [375, 1080], [390, 1080], [391, 1081], [394, 1081], [395, 1080], [408, 1080], [409, 1081], [410, 1080], [411, 1081], [418, 1081], [419, 1080], [421, 1081], [422, 1080], [423, 1081], [425, 1081], [426, 1080], [443, 1080], [444, 1081], [451, 1081], [452, 1080], [456, 1080], [457, 1081], [573, 1081], [574, 1082], [574, 1106], [573, 1107], [173, 1107], [171, 1106]], "type": "cell", "rows": [11, 11], "columns": [0, 2], "text_list": []}, {"location": [[1218, 1050], [1303, 1050], [1303, 1075], [1218, 1075]], "bbox": [1218, 1050, 1303, 1075], "points": [[1218, 1051], [1219, 1050], [1302, 1050], [1303, 1051], [1303, 1074], [1302, 1075], [1219, 1075], [1218, 1074]], "type": "cell", "rows": [10, 10], "columns": [10, 10], "text_list": []}, {"location": [[1129, 1050], [1213, 1050], [1213, 1075], [1129, 1075]], "bbox": [1129, 1050, 1213, 1075], "points": [[1129, 1051], [1130, 1050], [1212, 1050], [1213, 1051], [1213, 1074], [1212, 1075], [1130, 1075], [1129, 1074]], "type": "cell", "rows": [10, 10], "columns": [9, 9], "text_list": []}, {"location": [[858, 1050], [943, 1050], [943, 1075], [858, 1075]], "bbox": [858, 1050, 943, 1075], "points": [[858, 1051], [859, 1050], [942, 1050], [943, 1051], [943, 1074], [942, 1075], [859, 1075], [858, 1074]], "type": "cell", "rows": [10, 10], "columns": [6, 6], "text_list": []}, {"location": [[769, 1050], [853, 1050], [853, 1075], [769, 
1075]], "bbox": [769, 1050, 853, 1075], "points": [[769, 1051], [770, 1050], [852, 1050], [853, 1051], [853, 1074], [852, 1075], [770, 1075], [769, 1074]], "type": "cell", "rows": [10, 10], "columns": [5, 5], "text_list": []}, {"location": [[678, 1050], [763, 1050], [763, 1075], [678, 1075]], "bbox": [678, 1050, 763, 1075], "points": [[678, 1051], [679, 1050], [762, 1050], [763, 1051], [763, 1074], [762, 1075], [679, 1075], [678, 1074]], "type": "cell", "rows": [10, 10], "columns": [4, 4], "text_list": []}, {"location": [[577, 1050], [673, 1050], [673, 1075], [577, 1075]], "bbox": [577, 1050, 673, 1075], "points": [[577, 1051], [578, 1050], [672, 1050], [673, 1051], [673, 1074], [672, 1075], [580, 1075], [578, 1074], [578, 1061], [577, 1060], [577, 1054], [578, 1053], [578, 1052]], "type": "cell", "rows": [10, 10], "columns": [3, 3], "text_list": []}, {"location": [[171, 1050], [574, 1050], [574, 1075], [171, 1075]], "bbox": [171, 1050, 574, 1075], "points": [[171, 1051], [173, 1050], [573, 1050], [574, 1051], [574, 1074], [573, 1075], [173, 1075], [171, 1074]], "type": "cell", "rows": [10, 10], "columns": [0, 2], "text_list": []}, {"location": [[577, 1018], [673, 1018], [673, 1044], [577, 1044]], "bbox": [577, 1018, 673, 1044], "points": [[577, 1019], [578, 1018], [672, 1018], [673, 1019], [673, 1043], [672, 1044], [580, 1044], [578, 1043], [578, 1031], [577, 1030]], "type": "cell", "rows": [9, 9], "columns": [3, 3], "text_list": []}, {"location": [[1218, 1017], [1303, 1017], [1303, 1044], [1218, 1044]], "bbox": [1218, 1017, 1303, 1044], "points": [[1244, 1018], [1245, 1017], [1289, 1017], [1291, 1018], [1292, 1017], [1293, 1018], [1302, 1018], [1303, 1019], [1303, 1043], [1302, 1044], [1219, 1044], [1218, 1043], [1218, 1019], [1219, 1018]], "type": "cell", "rows": [9, 9], "columns": [10, 10], "text_list": []}, {"location": [[1129, 1017], [1213, 1017], [1213, 1044], [1129, 1044]], "bbox": [1129, 1017, 1213, 1044], "points": [[1153, 1018], [1154, 1017], [1155, 
1018], [1156, 1018], [1157, 1017], [1196, 1017], [1197, 1018], [1198, 1017], [1199, 1018], [1212, 1018], [1213, 1019], [1213, 1043], [1212, 1044], [1130, 1044], [1129, 1043], [1129, 1019], [1130, 1018]], "type": "cell", "rows": [9, 9], "columns": [9, 9], "text_list": []}, {"location": [[858, 1017], [943, 1017], [943, 1044], [858, 1044]], "bbox": [858, 1017, 943, 1044], "points": [[884, 1018], [885, 1017], [929, 1017], [930, 1018], [942, 1018], [943, 1019], [943, 1043], [942, 1044], [859, 1044], [858, 1043], [858, 1019], [859, 1018]], "type": "cell", "rows": [9, 9], "columns": [6, 6], "text_list": []}, {"location": [[769, 1017], [853, 1017], [853, 1044], [769, 1044]], "bbox": [769, 1017, 853, 1044], "points": [[795, 1018], [796, 1017], [836, 1017], [837, 1018], [838, 1017], [839, 1018], [852, 1018], [853, 1019], [853, 1043], [852, 1044], [770, 1044], [769, 1043], [769, 1019], [770, 1018]], "type": "cell", "rows": [9, 9], "columns": [5, 5], "text_list": []}, {"location": [[678, 1017], [763, 1017], [763, 1044], [678, 1044]], "bbox": [678, 1017, 763, 1044], "points": [[704, 1018], [705, 1017], [706, 1018], [707, 1017], [747, 1017], [748, 1018], [749, 1017], [750, 1018], [762, 1018], [763, 1019], [763, 1043], [762, 1044], [679, 1044], [678, 1043], [678, 1019], [679, 1018]], "type": "cell", "rows": [9, 9], "columns": [4, 4], "text_list": []}, {"location": [[171, 1017], [574, 1017], [574, 1044], [171, 1044]], "bbox": [171, 1017, 574, 1044], "points": [[273, 1018], [274, 1017], [276, 1018], [277, 1017], [366, 1017], [367, 1018], [383, 1018], [384, 1017], [396, 1017], [397, 1018], [398, 1017], [399, 1018], [401, 1017], [402, 1018], [403, 1017], [464, 1017], [465, 1018], [467, 1018], [468, 1017], [471, 1017], [472, 1018], [573, 1018], [574, 1019], [574, 1043], [573, 1044], [173, 1044], [171, 1043], [171, 1019], [173, 1018]], "type": "cell", "rows": [9, 9], "columns": [0, 2], "text_list": []}, {"location": [[1218, 987], [1303, 987], [1303, 1013], [1218, 1013]], "bbox": [1218, 
987, 1303, 1013], "points": [[1218, 988], [1219, 987], [1302, 987], [1303, 988], [1303, 1012], [1302, 1013], [1219, 1013], [1218, 1012]], "type": "cell", "rows": [8, 8], "columns": [10, 10], "text_list": []}, {"location": [[1129, 987], [1213, 987], [1213, 1013], [1129, 1013]], "bbox": [1129, 987, 1213, 1013], "points": [[1129, 988], [1130, 987], [1212, 987], [1213, 988], [1213, 1012], [1212, 1013], [1130, 1013], [1129, 1012]], "type": "cell", "rows": [8, 8], "columns": [9, 9], "text_list": []}, {"location": [[858, 987], [943, 987], [943, 1013], [858, 1013]], "bbox": [858, 987, 943, 1013], "points": [[858, 988], [859, 987], [942, 987], [943, 988], [943, 1012], [942, 1013], [859, 1013], [858, 1012]], "type": "cell", "rows": [8, 8], "columns": [6, 6], "text_list": []}, {"location": [[769, 987], [853, 987], [853, 1013], [769, 1013]], "bbox": [769, 987, 853, 1013], "points": [[769, 988], [770, 987], [852, 987], [853, 988], [853, 1012], [852, 1013], [770, 1013], [769, 1012]], "type": "cell", "rows": [8, 8], "columns": [5, 5], "text_list": []}, {"location": [[678, 987], [763, 987], [763, 1013], [678, 1013]], "bbox": [678, 987, 763, 1013], "points": [[678, 988], [679, 987], [762, 987], [763, 988], [763, 1012], [762, 1013], [679, 1013], [678, 1012]], "type": "cell", "rows": [8, 8], "columns": [4, 4], "text_list": []}, {"location": [[240, 987], [574, 987], [574, 1013], [240, 1013]], "bbox": [240, 987, 574, 1013], "points": [[240, 988], [242, 987], [573, 987], [574, 988], [574, 1012], [573, 1013], [242, 1013], [240, 1012]], "type": "cell", "rows": [8, 8], "columns": [1, 2], "text_list": []}, {"location": [[240, 955], [383, 955], [383, 983], [240, 983]], "bbox": [240, 955, 383, 983], "points": [[240, 956], [242, 955], [382, 955], [383, 956], [383, 982], [382, 983], [242, 983], [240, 982]], "type": "cell", "rows": [7, 7], "columns": [1, 1], "text_list": []}, {"location": [[1218, 954], [1303, 954], [1303, 983], [1218, 983]], "bbox": [1218, 954, 1303, 983], "points": [[1249, 
955], [1250, 954], [1264, 954], [1265, 955], [1302, 955], [1303, 956], [1303, 982], [1302, 983], [1219, 983], [1218, 982], [1219, 981], [1218, 980], [1218, 956], [1219, 955]], "type": "cell", "rows": [7, 7], "columns": [10, 10], "text_list": []}, {"location": [[1129, 954], [1213, 954], [1213, 983], [1129, 983]], "bbox": [1129, 954, 1213, 983], "points": [[1164, 955], [1165, 954], [1167, 955], [1168, 954], [1169, 955], [1170, 954], [1171, 955], [1173, 954], [1174, 955], [1212, 955], [1213, 956], [1213, 982], [1212, 983], [1130, 983], [1129, 982], [1129, 956], [1130, 955]], "type": "cell", "rows": [7, 7], "columns": [9, 9], "text_list": []}, {"location": [[858, 954], [943, 954], [943, 983], [858, 983]], "bbox": [858, 954, 943, 983], "points": [[888, 955], [889, 954], [904, 954], [905, 955], [942, 955], [943, 956], [943, 982], [942, 983], [859, 983], [858, 982], [858, 956], [859, 955]], "type": "cell", "rows": [7, 7], "columns": [6, 6], "text_list": []}, {"location": [[769, 954], [853, 954], [853, 983], [769, 983]], "bbox": [769, 954, 853, 983], "points": [[806, 955], [808, 954], [809, 955], [810, 954], [811, 955], [852, 955], [853, 956], [853, 982], [852, 983], [770, 983], [769, 982], [769, 956], [770, 955]], "type": "cell", "rows": [7, 7], "columns": [5, 5], "text_list": []}, {"location": [[678, 954], [763, 954], [763, 983], [678, 983]], "bbox": [678, 954, 763, 983], "points": [[713, 955], [714, 954], [715, 955], [762, 955], [763, 956], [763, 982], [762, 983], [679, 983], [678, 982], [678, 956], [679, 955]], "type": "cell", "rows": [7, 7], "columns": [4, 4], "text_list": []}, {"location": [[388, 954], [574, 954], [574, 983], [388, 983]], "bbox": [388, 954, 574, 983], "points": [[388, 955], [389, 954], [391, 954], [392, 955], [573, 955], [574, 956], [574, 982], [573, 983], [389, 983], [388, 982]], "type": "cell", "rows": [7, 7], "columns": [2, 2], "text_list": []}, {"location": [[1218, 923], [1303, 923], [1303, 950], [1218, 950]], "bbox": [1218, 923, 1303, 950], 
"points": [[1244, 925], [1245, 923], [1287, 923], [1288, 925], [1289, 923], [1291, 925], [1292, 923], [1293, 925], [1302, 925], [1303, 926], [1303, 949], [1302, 950], [1219, 950], [1218, 949], [1219, 948], [1218, 947], [1218, 926], [1219, 925]], "type": "cell", "rows": [6, 6], "columns": [10, 10], "text_list": []}, {"location": [[1129, 923], [1213, 923], [1213, 950], [1129, 950]], "bbox": [1129, 923, 1213, 950], "points": [[1129, 925], [1130, 923], [1132, 925], [1141, 925], [1142, 923], [1143, 925], [1144, 923], [1146, 925], [1147, 923], [1148, 925], [1149, 923], [1150, 925], [1151, 923], [1187, 923], [1188, 925], [1189, 923], [1190, 925], [1191, 923], [1192, 925], [1212, 925], [1213, 926], [1213, 949], [1212, 950], [1130, 950], [1129, 949]], "type": "cell", "rows": [6, 6], "columns": [9, 9], "text_list": []}, {"location": [[858, 923], [943, 923], [943, 950], [858, 950]], "bbox": [858, 923, 943, 950], "points": [[884, 925], [885, 923], [929, 923], [930, 925], [932, 923], [933, 925], [942, 925], [943, 926], [943, 949], [942, 950], [859, 950], [858, 949], [858, 926], [859, 925]], "type": "cell", "rows": [6, 6], "columns": [6, 6], "text_list": []}, {"location": [[769, 923], [853, 923], [853, 950], [769, 950]], "bbox": [769, 923, 853, 950], "points": [[790, 925], [791, 923], [792, 925], [794, 923], [824, 923], [825, 925], [826, 923], [828, 925], [829, 923], [830, 925], [831, 923], [832, 925], [833, 923], [835, 925], [836, 923], [837, 925], [852, 925], [853, 926], [853, 949], [852, 950], [770, 950], [769, 949], [769, 926], [770, 925]], "type": "cell", "rows": [6, 6], "columns": [5, 5], "text_list": []}, {"location": [[678, 923], [763, 923], [763, 950], [678, 950]], "bbox": [678, 923, 763, 950], "points": [[678, 925], [679, 923], [680, 925], [694, 925], [695, 923], [697, 925], [699, 925], [700, 923], [701, 925], [702, 923], [737, 923], [739, 925], [740, 923], [741, 925], [742, 923], [743, 925], [744, 923], [746, 925], [747, 923], [748, 925], [749, 923], [750, 925], [751, 
923], [753, 925], [762, 925], [763, 926], [763, 949], [762, 950], [679, 950], [678, 949]], "type": "cell", "rows": [6, 6], "columns": [4, 4], "text_list": []}, {"location": [[388, 923], [574, 923], [574, 950], [388, 950]], "bbox": [388, 923, 574, 950], "points": [[388, 925], [389, 923], [390, 925], [391, 923], [410, 923], [411, 925], [414, 925], [415, 923], [422, 923], [423, 925], [425, 925], [426, 923], [429, 923], [430, 925], [431, 923], [438, 923], [439, 925], [440, 925], [442, 923], [446, 923], [447, 925], [449, 925], [450, 923], [454, 923], [456, 925], [457, 923], [490, 923], [491, 925], [492, 923], [493, 925], [494, 923], [495, 925], [497, 923], [498, 925], [499, 923], [500, 925], [501, 923], [502, 925], [504, 923], [505, 925], [506, 923], [507, 925], [508, 923], [509, 925], [511, 923], [512, 925], [513, 923], [514, 925], [515, 923], [516, 925], [518, 923], [519, 925], [520, 923], [521, 925], [522, 923], [523, 925], [525, 923], [526, 925], [527, 923], [528, 925], [529, 923], [530, 925], [532, 923], [533, 925], [534, 923], [535, 925], [536, 923], [537, 925], [539, 923], [546, 923], [547, 925], [548, 923], [549, 925], [550, 923], [552, 925], [553, 923], [554, 925], [555, 923], [556, 925], [573, 925], [574, 926], [574, 949], [573, 950], [389, 950], [388, 949]], "type": "cell", "rows": [6, 6], "columns": [2, 2], "text_list": []}, {"location": [[240, 923], [383, 923], [383, 950], [240, 950]], "bbox": [240, 923, 383, 950], "points": [[240, 925], [242, 923], [243, 925], [244, 925], [245, 923], [260, 923], [261, 925], [263, 923], [264, 925], [267, 925], [268, 923], [274, 923], [276, 925], [278, 925], [279, 923], [281, 923], [283, 925], [284, 923], [291, 923], [292, 925], [293, 925], [294, 923], [300, 923], [301, 925], [302, 923], [321, 923], [322, 925], [323, 923], [325, 925], [326, 923], [327, 925], [382, 925], [383, 926], [383, 949], [382, 950], [242, 950], [240, 949]], "type": "cell", "rows": [6, 6], "columns": [1, 1], "text_list": []}, {"location": [[1218, 893], 
[1303, 893], [1303, 920], [1218, 920]], "bbox": [1218, 893, 1303, 920], "points": [[1218, 894], [1219, 893], [1302, 893], [1303, 894], [1303, 919], [1302, 920], [1219, 920], [1218, 919]], "type": "cell", "rows": [5, 5], "columns": [10, 10], "text_list": []}, {"location": [[1129, 893], [1213, 893], [1213, 920], [1129, 920]], "bbox": [1129, 893, 1213, 920], "points": [[1129, 894], [1130, 893], [1212, 893], [1213, 894], [1213, 919], [1212, 920], [1130, 920], [1129, 919]], "type": "cell", "rows": [5, 5], "columns": [9, 9], "text_list": []}, {"location": [[858, 893], [943, 893], [943, 920], [858, 920]], "bbox": [858, 893, 943, 920], "points": [[858, 894], [859, 893], [942, 893], [943, 894], [943, 919], [942, 920], [859, 920], [858, 919]], "type": "cell", "rows": [5, 5], "columns": [6, 6], "text_list": []}, {"location": [[769, 893], [853, 893], [853, 920], [769, 920]], "bbox": [769, 893, 853, 920], "points": [[769, 894], [770, 893], [852, 893], [853, 894], [853, 919], [852, 920], [770, 920], [769, 919]], "type": "cell", "rows": [5, 5], "columns": [5, 5], "text_list": []}, {"location": [[678, 893], [763, 893], [763, 920], [678, 920]], "bbox": [678, 893, 763, 920], "points": [[678, 894], [679, 893], [762, 893], [763, 894], [763, 919], [762, 920], [679, 920], [678, 919], [679, 918], [678, 916]], "type": "cell", "rows": [5, 5], "columns": [4, 4], "text_list": []}, {"location": [[388, 893], [574, 893], [574, 920], [388, 920]], "bbox": [388, 893, 574, 920], "points": [[388, 894], [389, 893], [573, 893], [574, 894], [574, 919], [573, 920], [389, 920], [388, 919]], "type": "cell", "rows": [5, 5], "columns": [2, 2], "text_list": []}, {"location": [[1218, 860], [1303, 860], [1303, 887], [1218, 887]], "bbox": [1218, 860, 1303, 887], "points": [[1244, 861], [1245, 860], [1246, 861], [1247, 860], [1287, 860], [1288, 861], [1289, 860], [1291, 861], [1302, 861], [1303, 863], [1303, 886], [1302, 887], [1219, 887], [1218, 886], [1218, 863], [1219, 861]], "type": "cell", "rows": [4, 4], 
"columns": [10, 10], "text_list": []}, {"location": [[1129, 860], [1213, 860], [1213, 887], [1129, 887]], "bbox": [1129, 860, 1213, 887], "points": [[1143, 861], [1144, 860], [1146, 861], [1147, 860], [1148, 861], [1149, 860], [1150, 861], [1151, 860], [1153, 861], [1154, 860], [1155, 861], [1156, 860], [1157, 861], [1158, 860], [1191, 860], [1192, 861], [1194, 860], [1195, 861], [1212, 861], [1213, 863], [1213, 886], [1212, 887], [1130, 887], [1129, 886], [1129, 863], [1130, 861]], "type": "cell", "rows": [4, 4], "columns": [9, 9], "text_list": []}, {"location": [[858, 860], [943, 860], [943, 887], [858, 887]], "bbox": [858, 860, 943, 887], "points": [[874, 861], [875, 860], [877, 861], [878, 860], [879, 861], [884, 861], [885, 860], [886, 861], [887, 860], [929, 860], [930, 861], [932, 860], [933, 861], [942, 861], [943, 863], [943, 886], [942, 887], [859, 887], [858, 886], [858, 863], [859, 861]], "type": "cell", "rows": [4, 4], "columns": [6, 6], "text_list": []}, {"location": [[769, 860], [853, 860], [853, 887], [769, 887]], "bbox": [769, 860, 853, 887], "points": [[795, 861], [796, 860], [797, 861], [798, 860], [831, 860], [832, 861], [833, 860], [835, 861], [852, 861], [853, 863], [853, 886], [852, 887], [770, 887], [769, 886], [769, 863], [770, 861]], "type": "cell", "rows": [4, 4], "columns": [5, 5], "text_list": []}, {"location": [[678, 860], [763, 860], [763, 887], [678, 887]], "bbox": [678, 860, 763, 887], "points": [[678, 861], [679, 860], [680, 861], [704, 861], [705, 860], [706, 861], [707, 860], [747, 860], [748, 861], [749, 860], [750, 861], [751, 860], [753, 861], [762, 861], [763, 863], [763, 886], [762, 887], [679, 887], [678, 886]], "type": "cell", "rows": [4, 4], "columns": [4, 4], "text_list": []}, {"location": [[388, 860], [574, 860], [574, 887], [388, 887]], "bbox": [388, 860, 574, 887], "points": [[388, 861], [389, 860], [392, 860], [394, 861], [411, 861], [412, 860], [414, 861], [415, 860], [416, 861], [417, 860], [548, 860], [549, 861], 
[550, 860], [552, 861], [553, 860], [554, 861], [555, 860], [556, 861], [557, 860], [559, 861], [560, 860], [561, 861], [573, 861], [574, 863], [574, 886], [573, 887], [389, 887], [388, 886]], "type": "cell", "rows": [4, 4], "columns": [2, 2], "text_list": []}, {"location": [[1218, 830], [1303, 830], [1303, 857], [1218, 857]], "bbox": [1218, 830, 1303, 857], "points": [[1218, 831], [1219, 830], [1302, 830], [1303, 831], [1303, 856], [1302, 857], [1219, 857], [1218, 856]], "type": "cell", "rows": [3, 3], "columns": [10, 10], "text_list": []}, {"location": [[1129, 830], [1213, 830], [1213, 857], [1129, 857]], "bbox": [1129, 830, 1213, 857], "points": [[1129, 831], [1130, 830], [1212, 830], [1213, 831], [1213, 856], [1212, 857], [1130, 857], [1129, 856]], "type": "cell", "rows": [3, 3], "columns": [9, 9], "text_list": []}, {"location": [[858, 830], [943, 830], [943, 857], [858, 857]], "bbox": [858, 830, 943, 857], "points": [[858, 831], [859, 830], [942, 830], [943, 831], [943, 856], [942, 857], [859, 857], [858, 856]], "type": "cell", "rows": [3, 3], "columns": [6, 6], "text_list": []}, {"location": [[769, 830], [853, 830], [853, 857], [769, 857]], "bbox": [769, 830, 853, 857], "points": [[769, 831], [770, 830], [852, 830], [853, 831], [853, 856], [852, 857], [770, 857], [769, 856]], "type": "cell", "rows": [3, 3], "columns": [5, 5], "text_list": []}, {"location": [[678, 830], [763, 830], [763, 857], [678, 857]], "bbox": [678, 830, 763, 857], "points": [[678, 831], [679, 830], [762, 830], [763, 831], [763, 856], [762, 857], [679, 857], [678, 856]], "type": "cell", "rows": [3, 3], "columns": [4, 4], "text_list": []}, {"location": [[388, 830], [574, 830], [574, 857], [388, 857]], "bbox": [388, 830, 574, 857], "points": [[388, 831], [389, 830], [573, 830], [574, 831], [574, 856], [573, 857], [389, 857], [388, 856]], "type": "cell", "rows": [3, 3], "columns": [2, 2], "text_list": []}, {"location": [[769, 798], [853, 798], [853, 824], [769, 824]], "bbox": [769, 798, 853, 
824], "points": [[769, 799], [770, 798], [852, 798], [853, 799], [853, 823], [852, 824], [770, 824], [769, 823]], "type": "cell", "rows": [2, 2], "columns": [5, 5], "text_list": []}, {"location": [[1218, 797], [1303, 797], [1303, 824], [1218, 824]], "bbox": [1218, 797, 1303, 824], "points": [[1244, 798], [1245, 797], [1246, 798], [1247, 797], [1249, 798], [1250, 797], [1275, 797], [1277, 798], [1278, 797], [1279, 798], [1302, 798], [1303, 799], [1303, 823], [1302, 824], [1219, 824], [1218, 823], [1218, 799], [1219, 798]], "type": "cell", "rows": [2, 2], "columns": [10, 10], "text_list": []}, {"location": [[1129, 797], [1213, 797], [1213, 824], [1129, 824]], "bbox": [1129, 797, 1213, 824], "points": [[1129, 798], [1130, 797], [1132, 798], [1146, 798], [1147, 797], [1148, 798], [1149, 797], [1150, 798], [1151, 797], [1153, 798], [1154, 797], [1155, 798], [1164, 798], [1165, 797], [1167, 798], [1168, 797], [1169, 798], [1170, 797], [1171, 798], [1173, 797], [1174, 798], [1212, 798], [1213, 799], [1213, 823], [1212, 824], [1130, 824], [1129, 823]], "type": "cell", "rows": [2, 2], "columns": [9, 9], "text_list": []}, {"location": [[858, 797], [943, 797], [943, 824], [858, 824]], "bbox": [858, 797, 943, 824], "points": [[888, 798], [889, 797], [891, 798], [892, 797], [893, 798], [894, 797], [895, 798], [942, 798], [943, 799], [943, 823], [942, 824], [859, 824], [858, 823], [858, 799], [859, 798]], "type": "cell", "rows": [2, 2], "columns": [6, 6], "text_list": []}, {"location": [[678, 797], [763, 797], [763, 824], [678, 824]], "bbox": [678, 797, 763, 824], "points": [[678, 798], [679, 797], [680, 798], [704, 798], [705, 797], [706, 798], [707, 797], [708, 798], [709, 797], [711, 798], [712, 797], [733, 797], [734, 798], [735, 797], [736, 798], [762, 798], [763, 799], [763, 823], [762, 824], [679, 824], [678, 823]], "type": "cell", "rows": [2, 2], "columns": [4, 4], "text_list": []}, {"location": [[388, 797], [574, 797], [574, 824], [388, 824]], "bbox": [388, 797, 574, 
824], "points": [[388, 798], [389, 797], [394, 797], [395, 798], [418, 798], [419, 797], [421, 798], [424, 798], [425, 797], [429, 797], [430, 798], [431, 797], [432, 798], [433, 797], [435, 798], [484, 798], [485, 797], [486, 798], [495, 798], [497, 797], [498, 798], [573, 798], [574, 799], [574, 823], [573, 824], [389, 824], [388, 823]], "type": "cell", "rows": [2, 2], "columns": [2, 2], "text_list": []}, {"location": [[240, 768], [383, 768], [383, 920], [240, 920]], "bbox": [240, 768, 383, 920], "points": [[240, 769], [242, 768], [382, 768], [383, 769], [383, 919], [382, 920], [242, 920], [240, 919]], "type": "cell", "rows": [1, 5], "columns": [1, 1], "text_list": []}, {"location": [[1218, 767], [1303, 767], [1303, 792], [1218, 792]], "bbox": [1218, 767, 1303, 792], "points": [[1218, 768], [1219, 767], [1220, 768], [1223, 768], [1224, 767], [1302, 767], [1303, 768], [1303, 791], [1302, 792], [1219, 792], [1218, 791]], "type": "cell", "rows": [1, 1], "columns": [10, 10], "text_list": []}, {"location": [[1128, 767], [1213, 767], [1213, 792], [1128, 792]], "bbox": [1128, 767, 1213, 792], "points": [[1130, 767], [1132, 768], [1133, 767], [1205, 767], [1206, 768], [1208, 767], [1209, 768], [1212, 768], [1213, 769], [1213, 791], [1212, 792], [1130, 792], [1129, 791], [1129, 773], [1128, 771], [1128, 769]], "type": "cell", "rows": [1, 1], "columns": [9, 9], "text_list": []}, {"location": [[1038, 767], [1123, 767], [1123, 1107], [1038, 1107]], "bbox": [1038, 767, 1123, 1107], "points": [[1038, 768], [1039, 767], [1040, 768], [1045, 768], [1046, 767], [1047, 768], [1049, 767], [1050, 768], [1051, 767], [1052, 768], [1053, 767], [1054, 768], [1068, 768], [1070, 767], [1071, 768], [1072, 767], [1073, 768], [1074, 767], [1075, 768], [1077, 767], [1078, 768], [1079, 767], [1080, 768], [1081, 767], [1082, 768], [1122, 768], [1123, 769], [1123, 1106], [1122, 1107], [1039, 1107], [1038, 1106]], "type": "cell", "rows": [1, 11], "columns": [8, 8], "text_list": []}, {"location": 
[[948, 767], [1033, 767], [1033, 1107], [948, 1107]], "bbox": [948, 767, 1033, 1107], "points": [[951, 768], [953, 767], [954, 768], [955, 767], [956, 768], [957, 767], [958, 768], [960, 767], [961, 768], [962, 767], [963, 768], [975, 768], [976, 767], [977, 768], [978, 767], [980, 768], [981, 767], [982, 768], [983, 767], [984, 768], [985, 767], [987, 768], [988, 767], [989, 768], [1032, 768], [1033, 769], [1033, 1106], [1032, 1107], [949, 1107], [948, 1106], [948, 1082], [949, 1081], [949, 1075], [948, 1074], [948, 1051], [949, 1050], [949, 1044], [948, 1043], [948, 1019], [949, 1018], [949, 1012], [948, 1011], [948, 988], [949, 987], [949, 982], [948, 981], [948, 956], [949, 955], [949, 950], [948, 949], [948, 926], [949, 925], [949, 919], [948, 918], [948, 894], [949, 893], [949, 887], [948, 886], [948, 863], [949, 861], [949, 856], [948, 854], [948, 831], [949, 830], [949, 824], [948, 823], [948, 799], [949, 798], [949, 792], [948, 791], [948, 769], [949, 768]], "type": "cell", "rows": [1, 11], "columns": [7, 7], "text_list": []}, {"location": [[858, 767], [943, 767], [943, 792], [858, 792]], "bbox": [858, 767, 943, 792], "points": [[858, 768], [859, 767], [860, 768], [861, 768], [863, 767], [936, 767], [937, 768], [939, 767], [940, 768], [941, 767], [943, 769], [943, 791], [942, 792], [859, 792], [858, 791]], "type": "cell", "rows": [1, 1], "columns": [6, 6], "text_list": []}, {"location": [[769, 767], [853, 767], [853, 792], [769, 792]], "bbox": [769, 767, 853, 792], "points": [[769, 768], [770, 767], [771, 768], [773, 767], [850, 767], [851, 768], [852, 768], [853, 769], [853, 791], [852, 792], [770, 792], [769, 791]], "type": "cell", "rows": [1, 1], "columns": [5, 5], "text_list": []}, {"location": [[678, 767], [763, 767], [763, 792], [678, 792]], "bbox": [678, 767, 763, 792], "points": [[678, 768], [679, 767], [680, 768], [682, 768], [684, 767], [761, 767], [763, 769], [763, 791], [762, 792], [679, 792], [678, 791]], "type": "cell", "rows": [1, 1], 
"columns": [4, 4], "text_list": []}, {"location": [[577, 767], [674, 767], [674, 1013], [577, 1013]], "bbox": [577, 767, 674, 1013], "points": [[624, 768], [625, 767], [626, 768], [628, 767], [630, 767], [631, 768], [632, 767], [633, 768], [635, 767], [636, 768], [673, 768], [674, 769], [673, 770], [673, 1012], [672, 1013], [578, 1013], [577, 1012], [578, 1011], [578, 988], [580, 987], [580, 983], [578, 982], [578, 955], [580, 954], [580, 950], [578, 949], [578, 925], [580, 923], [580, 920], [578, 919], [578, 893], [580, 892], [580, 888], [578, 887], [578, 861], [580, 860], [580, 857], [578, 856], [578, 830], [580, 829], [580, 824], [578, 823], [578, 798], [580, 797], [580, 792], [578, 791], [578, 769], [580, 768]], "type": "cell", "rows": [1, 8], "columns": [3, 3], "text_list": []}, {"location": [[388, 767], [574, 767], [574, 792], [388, 792]], "bbox": [388, 767, 574, 792], "points": [[388, 768], [389, 767], [390, 768], [391, 767], [562, 767], [563, 768], [564, 767], [566, 768], [573, 768], [574, 769], [574, 791], [573, 792], [389, 792], [388, 791]], "type": "cell", "rows": [1, 1], "columns": [2, 2], "text_list": []}, {"location": [[171, 767], [237, 767], [237, 1013], [171, 1013]], "bbox": [171, 767, 237, 1013], "points": [[171, 768], [173, 767], [174, 768], [236, 768], [237, 769], [237, 770], [236, 771], [236, 833], [237, 835], [236, 836], [237, 837], [236, 838], [237, 839], [236, 840], [237, 842], [236, 843], [237, 844], [236, 845], [236, 929], [237, 930], [237, 937], [236, 939], [236, 962], [237, 963], [237, 966], [236, 967], [237, 968], [236, 969], [237, 970], [236, 971], [236, 1009], [237, 1010], [237, 1012], [236, 1013], [173, 1013], [171, 1012]], "type": "cell", "rows": [1, 8], "columns": [0, 0], "text_list": []}, {"location": [[1218, 734], [1303, 734], [1303, 761], [1218, 761]], "bbox": [1218, 734, 1303, 761], "points": [[1218, 735], [1219, 734], [1220, 735], [1225, 735], [1226, 734], [1257, 734], [1258, 735], [1259, 734], [1302, 734], [1303, 735], [1303, 
760], [1302, 761], [1219, 761], [1218, 760]], "type": "cell", "rows": [0, 0], "columns": [10, 10], "text_list": []}, {"location": [[1128, 734], [1213, 734], [1213, 761], [1128, 761]], "bbox": [1128, 734, 1213, 761], "points": [[1130, 734], [1132, 735], [1134, 735], [1135, 734], [1165, 734], [1167, 735], [1168, 735], [1169, 734], [1175, 734], [1176, 735], [1177, 735], [1178, 734], [1189, 734], [1190, 735], [1198, 735], [1199, 734], [1203, 734], [1204, 735], [1211, 735], [1212, 734], [1213, 735], [1213, 760], [1212, 761], [1129, 761], [1128, 760], [1128, 736]], "type": "cell", "rows": [0, 0], "columns": [9, 9], "text_list": []}, {"location": [[1038, 734], [1123, 734], [1123, 761], [1038, 761]], "bbox": [1038, 734, 1123, 761], "points": [[1038, 735], [1039, 734], [1040, 735], [1045, 735], [1046, 734], [1047, 735], [1050, 735], [1051, 734], [1056, 734], [1057, 735], [1058, 734], [1063, 734], [1064, 735], [1065, 734], [1075, 734], [1077, 735], [1078, 735], [1079, 734], [1084, 734], [1085, 735], [1088, 735], [1089, 734], [1098, 734], [1099, 735], [1104, 735], [1105, 734], [1106, 735], [1111, 735], [1112, 734], [1114, 734], [1115, 735], [1122, 735], [1123, 736], [1123, 760], [1122, 761], [1039, 761], [1038, 760]], "type": "cell", "rows": [0, 0], "columns": [8, 8], "text_list": []}, {"location": [[948, 734], [1033, 734], [1033, 761], [948, 761]], "bbox": [948, 734, 1033, 761], "points": [[954, 735], [955, 734], [957, 734], [958, 735], [960, 734], [964, 734], [966, 735], [968, 735], [969, 734], [971, 734], [973, 735], [974, 734], [975, 735], [976, 734], [978, 734], [980, 735], [981, 734], [985, 734], [987, 735], [989, 735], [990, 734], [995, 734], [996, 735], [997, 735], [998, 734], [1016, 734], [1017, 735], [1018, 734], [1019, 735], [1020, 734], [1023, 734], [1024, 735], [1025, 734], [1026, 735], [1027, 734], [1029, 735], [1030, 734], [1031, 735], [1032, 734], [1033, 735], [1033, 760], [1032, 761], [949, 761], [948, 760], [948, 736], [949, 735]], "type": "cell", "rows": 
[0, 0], "columns": [7, 7], "text_list": []}, {"location": [[858, 734], [943, 734], [943, 761], [858, 761]], "bbox": [858, 734, 943, 761], "points": [[858, 735], [859, 734], [860, 735], [863, 735], [864, 734], [865, 735], [866, 734], [878, 734], [879, 735], [886, 735], [887, 734], [889, 734], [891, 735], [895, 735], [897, 734], [898, 735], [900, 735], [901, 734], [902, 735], [905, 735], [906, 734], [934, 734], [935, 735], [940, 735], [941, 734], [943, 736], [943, 760], [942, 761], [859, 761], [858, 760]], "type": "cell", "rows": [0, 0], "columns": [6, 6], "text_list": []}, {"location": [[769, 734], [853, 734], [853, 761], [769, 761]], "bbox": [769, 734, 853, 761], "points": [[769, 735], [770, 734], [771, 735], [773, 734], [774, 735], [775, 734], [776, 735], [777, 734], [787, 734], [788, 735], [789, 734], [790, 735], [791, 734], [792, 735], [796, 735], [797, 734], [799, 734], [801, 735], [804, 735], [805, 734], [806, 735], [811, 735], [812, 734], [813, 735], [815, 734], [845, 734], [846, 735], [847, 734], [849, 735], [851, 735], [852, 734], [853, 735], [853, 760], [852, 761], [770, 761], [769, 760]], "type": "cell", "rows": [0, 0], "columns": [5, 5], "text_list": []}, {"location": [[678, 734], [763, 734], [763, 761], [678, 761]], "bbox": [678, 734, 763, 761], "points": [[678, 735], [679, 734], [680, 735], [685, 735], [686, 734], [698, 734], [699, 735], [706, 735], [707, 734], [709, 734], [711, 735], [715, 735], [716, 734], [718, 735], [720, 735], [721, 734], [759, 734], [760, 735], [761, 734], [762, 734], [763, 735], [763, 760], [762, 761], [679, 761], [678, 760]], "type": "cell", "rows": [0, 0], "columns": [4, 4], "text_list": []}, {"location": [[577, 734], [674, 734], [674, 761], [577, 761]], "bbox": [577, 734, 674, 761], "points": [[605, 735], [606, 734], [621, 734], [622, 735], [623, 734], [635, 734], [636, 735], [638, 735], [639, 734], [640, 735], [642, 734], [643, 735], [673, 735], [674, 736], [674, 760], [673, 761], [578, 761], [577, 760], [577, 736], [578, 
735]], "type": "cell", "rows": [0, 0], "columns": [3, 3], "text_list": []}, {"location": [[171, 734], [574, 734], [574, 761], [171, 761]], "bbox": [171, 734, 574, 761], "points": [[196, 735], [197, 734], [198, 735], [199, 734], [214, 734], [215, 735], [216, 734], [217, 735], [218, 734], [219, 735], [573, 735], [574, 736], [574, 760], [573, 761], [173, 761], [171, 760], [171, 736], [173, 735]], "type": "cell", "rows": [0, 0], "columns": [0, 2], "text_list": []}, {"location": [[166, 729], [1309, 729], [1309, 1113], [166, 1113]], "bbox": [166, 729, 1309, 1113], "points": [[167, 729], [167, 788], [166, 789], [167, 790], [167, 792], [166, 794], [167, 795], [167, 1082], [166, 1084], [167, 1085], [167, 1113], [1308, 1113], [1309, 1112], [1309, 1107], [1308, 1106], [1308, 1101], [1309, 1100], [1308, 1099], [1309, 1098], [1308, 1096], [1309, 1095], [1309, 1075], [1308, 1074], [1308, 1073], [1309, 1072], [1308, 1071], [1309, 1070], [1309, 1056], [1308, 1054], [1309, 1053], [1308, 1052], [1309, 1051], [1309, 1044], [1308, 1043], [1309, 1042], [1308, 1040], [1309, 1039], [1308, 1038], [1309, 1037], [1309, 1020], [1308, 1019], [1309, 1018], [1309, 1011], [1308, 1010], [1309, 1009], [1309, 950], [1308, 949], [1309, 948], [1308, 947], [1309, 946], [1309, 929], [1308, 928], [1309, 927], [1308, 926], [1309, 925], [1309, 919], [1308, 918], [1308, 916], [1309, 915], [1309, 897], [1308, 895], [1309, 894], [1309, 887], [1308, 886], [1309, 885], [1308, 884], [1309, 882], [1308, 881], [1309, 880], [1309, 864], [1308, 863], [1309, 861], [1309, 856], [1308, 854], [1308, 853], [1309, 852], [1309, 836], [1308, 835], [1309, 833], [1308, 832], [1309, 831], [1309, 824], [1308, 823], [1309, 822], [1308, 820], [1309, 819], [1309, 792], [1308, 791], [1308, 790], [1309, 789], [1309, 773], [1308, 771], [1308, 769], [1309, 768], [1309, 761], [1308, 760], [1308, 736], [1309, 735], [1309, 730], [171, 730], [170, 729]], "type": "table", "contains": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]}, {"location": [[1213, 549], [1298, 549], [1298, 576], [1213, 576]], "bbox": [1213, 549, 1298, 576], "points": [[1213, 550], [1215, 549], [1296, 549], [1298, 550], [1298, 575], [1296, 576], [1215, 576], [1213, 575]], "type": "cell", "rows": [10, 10], "columns": [10, 10], "text_list": []}, {"location": [[1123, 549], [1209, 549], [1209, 576], [1123, 576]], "bbox": [1123, 549, 1209, 576], "points": [[1123, 550], [1125, 549], [1208, 549], [1209, 550], [1209, 575], [1208, 576], [1125, 576], [1123, 575]], "type": "cell", "rows": [10, 10], "columns": [9, 9], "text_list": []}, {"location": [[1032, 549], [1118, 549], [1118, 576], [1032, 576]], "bbox": [1032, 549, 1118, 576], "points": [[1035, 549], [1116, 549], [1118, 550], [1118, 575], [1116, 576], [1035, 576], [1032, 574], [1032, 568], [1033, 567], [1032, 566], [1033, 564], [1032, 563], [1033, 562], [1032, 561], [1033, 560], [1032, 559], [1033, 557], [1033, 553], [1032, 552]], "type": "cell", "rows": [10, 10], "columns": [8, 8], "text_list": []}, {"location": [[942, 549], [1027, 549], [1027, 576], [942, 576]], "bbox": [942, 549, 1027, 576], "points": [[942, 550], [943, 549], [1026, 549], [1027, 550], [1027, 575], [1026, 576], [943, 576], [942, 575]], "type": "cell", "rows": [10, 10], "columns": [7, 7], "text_list": []}, {"location": [[852, 549], [937, 549], [937, 576], [852, 576]], "bbox": [852, 549, 937, 576], "points": [[853, 550], [854, 549], [936, 549], [937, 550], [937, 575], [936, 576], [854, 576], [852, 574], [853, 573]], "type": "cell", "rows": [10, 10], "columns": [6, 6], "text_list": []}, {"location": [[762, 549], [847, 549], [847, 576], [762, 576]], "bbox": [762, 549, 847, 576], "points": [[763, 550], [764, 549], [846, 549], [847, 550], 
[847, 575], [846, 576], [763, 576], [762, 575], [763, 574]], "type": "cell", "rows": [10, 10], "columns": [5, 5], "text_list": []}, {"location": [[673, 549], [759, 549], [759, 576], [673, 576]], "bbox": [673, 549, 759, 576], "points": [[673, 550], [674, 549], [756, 549], [759, 552], [757, 553], [759, 554], [759, 574], [756, 576], [674, 576], [673, 575]], "type": "cell", "rows": [10, 10], "columns": [4, 4], "text_list": []}, {"location": [[233, 549], [569, 549], [569, 576], [233, 576]], "bbox": [233, 549, 569, 576], "points": [[235, 550], [236, 549], [568, 549], [569, 550], [569, 575], [568, 576], [235, 576], [233, 575], [235, 574]], "type": "cell", "rows": [10, 10], "columns": [1, 2], "text_list": []}, {"location": [[1213, 519], [1298, 519], [1298, 544], [1213, 544]], "bbox": [1213, 519, 1298, 544], "points": [[1213, 520], [1215, 519], [1296, 519], [1298, 520], [1298, 543], [1296, 544], [1215, 544], [1213, 543]], "type": "cell", "rows": [9, 9], "columns": [10, 10], "text_list": []}, {"location": [[1123, 519], [1209, 519], [1209, 544], [1123, 544]], "bbox": [1123, 519, 1209, 544], "points": [[1123, 520], [1125, 519], [1208, 519], [1209, 520], [1209, 543], [1208, 544], [1125, 544], [1123, 543]], "type": "cell", "rows": [9, 9], "columns": [9, 9], "text_list": []}, {"location": [[1032, 519], [1118, 519], [1118, 544], [1032, 544]], "bbox": [1032, 519, 1118, 544], "points": [[1035, 519], [1116, 519], [1118, 520], [1118, 543], [1116, 544], [1035, 544], [1032, 542], [1033, 541], [1033, 527], [1032, 526], [1033, 525], [1032, 523], [1033, 522], [1032, 521]], "type": "cell", "rows": [9, 9], "columns": [8, 8], "text_list": []}, {"location": [[942, 519], [1027, 519], [1027, 544], [942, 544]], "bbox": [942, 519, 1027, 544], "points": [[942, 520], [943, 519], [1026, 519], [1027, 520], [1027, 543], [1026, 544], [944, 544], [942, 542], [943, 541], [942, 540], [943, 539], [942, 537], [943, 536], [942, 535], [943, 534], [942, 533]], "type": "cell", "rows": [9, 9], "columns": [7, 7], 
"text_list": []}, {"location": [[853, 519], [937, 519], [937, 544], [853, 544]], "bbox": [853, 519, 937, 544], "points": [[853, 520], [854, 519], [936, 519], [937, 520], [937, 543], [936, 544], [854, 544], [853, 543]], "type": "cell", "rows": [9, 9], "columns": [6, 6], "text_list": []}, {"location": [[763, 519], [847, 519], [847, 544], [763, 544]], "bbox": [763, 519, 847, 544], "points": [[763, 520], [764, 519], [846, 519], [847, 520], [847, 543], [846, 544], [764, 544], [763, 543]], "type": "cell", "rows": [9, 9], "columns": [5, 5], "text_list": []}, {"location": [[673, 519], [759, 519], [759, 544], [673, 544]], "bbox": [673, 519, 759, 544], "points": [[673, 520], [674, 519], [756, 519], [759, 521], [757, 522], [759, 523], [759, 540], [757, 541], [757, 543], [756, 544], [674, 544], [673, 543]], "type": "cell", "rows": [9, 9], "columns": [4, 4], "text_list": []}, {"location": [[383, 519], [569, 519], [569, 544], [383, 544]], "bbox": [383, 519, 569, 544], "points": [[383, 520], [384, 519], [568, 519], [569, 520], [569, 543], [568, 544], [384, 544], [383, 543]], "type": "cell", "rows": [9, 9], "columns": [2, 2], "text_list": []}, {"location": [[233, 519], [377, 519], [377, 544], [233, 544]], "bbox": [233, 519, 377, 544], "points": [[233, 520], [235, 519], [376, 519], [377, 520], [377, 543], [376, 544], [236, 544], [235, 543], [235, 540], [233, 539], [233, 537], [235, 536], [233, 535], [233, 523], [235, 522], [233, 521]], "type": "cell", "rows": [9, 9], "columns": [1, 1], "text_list": []}, {"location": [[1213, 488], [1298, 488], [1298, 515], [1213, 515]], "bbox": [1213, 488, 1298, 515], "points": [[1213, 490], [1215, 488], [1216, 490], [1218, 490], [1219, 488], [1220, 490], [1222, 488], [1296, 488], [1298, 490], [1298, 514], [1296, 515], [1215, 515], [1213, 514]], "type": "cell", "rows": [8, 8], "columns": [10, 10], "text_list": []}, {"location": [[1123, 488], [1209, 488], [1209, 515], [1123, 515]], "bbox": [1123, 488, 1209, 515], "points": [[1123, 490], [1125, 488], 
[1126, 490], [1132, 490], [1133, 488], [1194, 488], [1195, 490], [1196, 488], [1197, 490], [1198, 488], [1199, 490], [1202, 490], [1203, 488], [1204, 490], [1205, 488], [1206, 490], [1208, 490], [1209, 491], [1209, 514], [1208, 515], [1125, 515], [1123, 514]], "type": "cell", "rows": [8, 8], "columns": [9, 9], "text_list": []}, {"location": [[1032, 488], [1118, 488], [1118, 515], [1032, 515]], "bbox": [1032, 488, 1118, 515], "points": [[1035, 488], [1036, 490], [1037, 488], [1038, 490], [1039, 488], [1040, 490], [1042, 488], [1043, 490], [1044, 488], [1107, 488], [1108, 490], [1109, 488], [1111, 490], [1112, 488], [1116, 488], [1118, 490], [1118, 514], [1116, 515], [1035, 515], [1033, 514], [1033, 513], [1032, 512], [1033, 511], [1032, 509], [1033, 508], [1033, 497], [1032, 495], [1033, 494], [1032, 493], [1032, 491]], "type": "cell", "rows": [8, 8], "columns": [8, 8], "text_list": []}, {"location": [[942, 488], [1027, 488], [1027, 515], [942, 515]], "bbox": [942, 488, 1027, 515], "points": [[942, 490], [943, 488], [944, 488], [946, 490], [949, 490], [950, 488], [951, 490], [953, 488], [1016, 488], [1017, 490], [1018, 488], [1019, 490], [1020, 488], [1022, 490], [1026, 490], [1027, 491], [1027, 514], [1026, 515], [943, 515], [942, 514], [943, 513], [942, 512]], "type": "cell", "rows": [8, 8], "columns": [7, 7], "text_list": []}, {"location": [[853, 488], [937, 488], [937, 515], [853, 515]], "bbox": [853, 488, 937, 515], "points": [[853, 490], [854, 488], [856, 490], [860, 490], [861, 488], [863, 490], [864, 488], [929, 488], [930, 490], [932, 488], [933, 490], [934, 488], [935, 490], [936, 490], [937, 491], [937, 514], [936, 515], [854, 515], [853, 514]], "type": "cell", "rows": [8, 8], "columns": [6, 6], "text_list": []}, {"location": [[762, 488], [847, 488], [847, 515], [762, 515]], "bbox": [762, 488, 847, 515], "points": [[763, 490], [764, 488], [766, 490], [771, 490], [773, 488], [774, 490], [775, 488], [838, 488], [839, 490], [840, 488], [842, 490], [843, 
488], [844, 490], [845, 488], [847, 491], [847, 514], [846, 515], [763, 515], [762, 514], [763, 513]], "type": "cell", "rows": [8, 8], "columns": [5, 5], "text_list": []}, {"location": [[673, 488], [759, 488], [759, 515], [673, 515]], "bbox": [673, 488, 759, 515], "points": [[673, 490], [674, 488], [679, 488], [680, 490], [681, 488], [682, 490], [684, 488], [751, 488], [753, 490], [754, 488], [755, 490], [757, 490], [759, 491], [759, 511], [757, 512], [757, 514], [756, 515], [674, 515], [673, 514]], "type": "cell", "rows": [8, 8], "columns": [4, 4], "text_list": []}, {"location": [[383, 488], [569, 488], [569, 515], [383, 515]], "bbox": [383, 488, 569, 515], "points": [[383, 490], [384, 488], [475, 488], [477, 490], [478, 488], [479, 490], [480, 488], [481, 490], [483, 488], [484, 490], [485, 488], [486, 490], [487, 488], [488, 490], [490, 488], [491, 490], [492, 488], [501, 488], [502, 490], [504, 488], [505, 490], [506, 488], [507, 490], [508, 488], [509, 490], [511, 488], [512, 490], [513, 488], [550, 488], [552, 490], [553, 488], [554, 490], [555, 488], [556, 490], [557, 488], [567, 488], [569, 491], [569, 514], [568, 515], [384, 515], [383, 514]], "type": "cell", "rows": [8, 8], "columns": [2, 2], "text_list": []}, {"location": [[233, 488], [377, 488], [377, 515], [233, 515]], "bbox": [233, 488, 377, 515], "points": [[233, 490], [235, 488], [356, 488], [357, 490], [359, 488], [360, 490], [361, 488], [362, 490], [363, 488], [364, 490], [366, 488], [367, 490], [368, 488], [369, 490], [370, 488], [371, 490], [373, 488], [374, 490], [375, 488], [376, 488], [377, 490], [377, 514], [376, 515], [236, 515], [235, 514], [235, 511], [233, 509]], "type": "cell", "rows": [8, 8], "columns": [1, 1], "text_list": []}, {"location": [[1212, 457], [1298, 457], [1298, 484], [1212, 484]], "bbox": [1212, 457, 1298, 484], "points": [[1212, 458], [1213, 457], [1296, 457], [1298, 458], [1298, 483], [1296, 484], [1215, 484], [1213, 483], [1213, 459]], "type": "cell", "rows": [7, 7], 
"columns": [10, 10], "text_list": []}, {"location": [[1123, 457], [1209, 457], [1209, 484], [1123, 484]], "bbox": [1123, 457, 1209, 484], "points": [[1123, 458], [1125, 457], [1208, 457], [1209, 458], [1209, 483], [1208, 484], [1125, 484], [1123, 483]], "type": "cell", "rows": [7, 7], "columns": [9, 9], "text_list": []}, {"location": [[1032, 457], [1118, 457], [1118, 484], [1032, 484]], "bbox": [1032, 457, 1118, 484], "points": [[1032, 458], [1033, 457], [1116, 457], [1118, 458], [1118, 483], [1116, 484], [1033, 484], [1032, 483], [1032, 479], [1033, 478], [1032, 477], [1033, 475], [1032, 474], [1033, 473], [1032, 472], [1033, 471], [1032, 470], [1033, 468], [1032, 467]], "type": "cell", "rows": [7, 7], "columns": [8, 8], "text_list": []}, {"location": [[942, 457], [1027, 457], [1027, 484], [942, 484]], "bbox": [942, 457, 1027, 484], "points": [[942, 458], [943, 457], [1026, 457], [1027, 458], [1027, 483], [1026, 484], [943, 484], [942, 483]], "type": "cell", "rows": [7, 7], "columns": [7, 7], "text_list": []}, {"location": [[852, 457], [937, 457], [937, 484], [852, 484]], "bbox": [852, 457, 937, 484], "points": [[852, 458], [853, 457], [936, 457], [937, 458], [937, 483], [936, 484], [854, 484], [853, 483], [853, 461], [852, 460]], "type": "cell", "rows": [7, 7], "columns": [6, 6], "text_list": []}, {"location": [[762, 457], [847, 457], [847, 484], [762, 484]], "bbox": [762, 457, 847, 484], "points": [[762, 458], [763, 457], [846, 457], [847, 458], [847, 483], [846, 484], [764, 484], [763, 483], [763, 459]], "type": "cell", "rows": [7, 7], "columns": [5, 5], "text_list": []}, {"location": [[673, 457], [759, 457], [759, 484], [673, 484]], "bbox": [673, 457, 759, 484], "points": [[673, 458], [674, 457], [757, 457], [759, 458], [759, 483], [757, 484], [674, 484], [673, 483]], "type": "cell", "rows": [7, 7], "columns": [4, 4], "text_list": []}, {"location": [[383, 457], [569, 457], [569, 484], [383, 484]], "bbox": [383, 457, 569, 484], "points": [[383, 458], [384, 
457], [568, 457], [569, 458], [569, 483], [568, 484], [384, 484], [383, 483]], "type": "cell", "rows": [7, 7], "columns": [2, 2], "text_list": []}, {"location": [[233, 457], [377, 457], [377, 484], [233, 484]], "bbox": [233, 457, 377, 484], "points": [[233, 458], [235, 457], [376, 457], [377, 458], [377, 483], [376, 484], [236, 484], [235, 483], [235, 475], [233, 474], [233, 460], [235, 459]], "type": "cell", "rows": [7, 7], "columns": [1, 1], "text_list": []}, {"location": [[1213, 426], [1298, 426], [1298, 452], [1213, 452]], "bbox": [1213, 426, 1298, 452], "points": [[1213, 428], [1215, 426], [1296, 426], [1298, 428], [1298, 451], [1296, 452], [1215, 452], [1213, 451]], "type": "cell", "rows": [6, 6], "columns": [10, 10], "text_list": []}, {"location": [[1123, 426], [1209, 426], [1209, 452], [1123, 452]], "bbox": [1123, 426, 1209, 452], "points": [[1123, 428], [1125, 426], [1206, 426], [1209, 429], [1209, 451], [1208, 452], [1125, 452], [1123, 451]], "type": "cell", "rows": [6, 6], "columns": [9, 9], "text_list": []}, {"location": [[1032, 426], [1118, 426], [1118, 452], [1032, 452]], "bbox": [1032, 426, 1118, 452], "points": [[1032, 428], [1033, 426], [1116, 426], [1118, 428], [1118, 451], [1116, 452], [1035, 452], [1032, 450]], "type": "cell", "rows": [6, 6], "columns": [8, 8], "text_list": []}, {"location": [[942, 426], [1027, 426], [1027, 452], [942, 452]], "bbox": [942, 426, 1027, 452], "points": [[942, 428], [943, 426], [1026, 426], [1027, 428], [1027, 451], [1026, 452], [943, 452], [942, 451]], "type": "cell", "rows": [6, 6], "columns": [7, 7], "text_list": []}, {"location": [[852, 426], [937, 426], [937, 452], [852, 452]], "bbox": [852, 426, 937, 452], "points": [[852, 428], [853, 426], [936, 426], [937, 428], [937, 451], [936, 452], [854, 452], [853, 451], [853, 431], [852, 430], [853, 429]], "type": "cell", "rows": [6, 6], "columns": [6, 6], "text_list": []}, {"location": [[762, 426], [847, 426], [847, 452], [762, 452]], "bbox": [762, 426, 847, 452], 
"points": [[762, 428], [763, 426], [846, 426], [847, 428], [847, 451], [846, 452], [764, 452], [763, 451], [763, 429]], "type": "cell", "rows": [6, 6], "columns": [5, 5], "text_list": []}, {"location": [[673, 426], [759, 426], [759, 452], [673, 452]], "bbox": [673, 426, 759, 452], "points": [[673, 428], [674, 426], [756, 426], [759, 429], [759, 451], [757, 452], [674, 452], [673, 451]], "type": "cell", "rows": [6, 6], "columns": [4, 4], "text_list": []}, {"location": [[383, 426], [569, 426], [569, 452], [383, 452]], "bbox": [383, 426, 569, 452], "points": [[383, 428], [384, 426], [568, 426], [569, 428], [569, 451], [568, 452], [384, 452], [383, 451]], "type": "cell", "rows": [6, 6], "columns": [2, 2], "text_list": []}, {"location": [[233, 426], [377, 426], [377, 452], [233, 452]], "bbox": [233, 426, 377, 452], "points": [[233, 428], [235, 426], [376, 426], [377, 428], [377, 451], [376, 452], [236, 452], [233, 450]], "type": "cell", "rows": [6, 6], "columns": [1, 1], "text_list": []}, {"location": [[1213, 395], [1298, 395], [1298, 422], [1213, 422]], "bbox": [1213, 395, 1298, 422], "points": [[1244, 396], [1245, 395], [1273, 395], [1274, 396], [1275, 395], [1277, 396], [1296, 396], [1298, 397], [1298, 421], [1296, 422], [1215, 422], [1213, 421], [1213, 397], [1215, 396]], "type": "cell", "rows": [5, 5], "columns": [10, 10], "text_list": []}, {"location": [[1123, 395], [1209, 395], [1209, 422], [1123, 422]], "bbox": [1123, 395, 1209, 422], "points": [[1153, 396], [1154, 395], [1180, 395], [1181, 396], [1208, 396], [1209, 397], [1209, 421], [1208, 422], [1125, 422], [1123, 421], [1123, 397], [1125, 396]], "type": "cell", "rows": [5, 5], "columns": [9, 9], "text_list": []}, {"location": [[1032, 395], [1118, 395], [1118, 422], [1032, 422]], "bbox": [1032, 395, 1118, 422], "points": [[1064, 396], [1065, 395], [1088, 395], [1089, 396], [1091, 395], [1092, 396], [1116, 396], [1118, 397], [1118, 421], [1116, 422], [1035, 422], [1032, 419], [1032, 414], [1033, 412], [1032, 
411], [1033, 410], [1032, 409], [1033, 408], [1032, 406], [1032, 397], [1033, 396]], "type": "cell", "rows": [5, 5], "columns": [8, 8], "text_list": []}, {"location": [[942, 395], [1027, 395], [1027, 422], [942, 422]], "bbox": [942, 395, 1027, 422], "points": [[973, 396], [974, 395], [997, 395], [998, 396], [999, 395], [1001, 396], [1026, 396], [1027, 397], [1027, 421], [1026, 422], [943, 422], [942, 421], [942, 397], [943, 396]], "type": "cell", "rows": [5, 5], "columns": [7, 7], "text_list": []}, {"location": [[852, 395], [937, 395], [937, 422], [852, 422]], "bbox": [852, 395, 937, 422], "points": [[884, 396], [885, 395], [911, 395], [912, 396], [913, 395], [914, 396], [936, 396], [937, 397], [937, 421], [936, 422], [854, 422], [853, 421], [853, 398], [852, 397], [853, 396]], "type": "cell", "rows": [5, 5], "columns": [6, 6], "text_list": []}, {"location": [[762, 395], [847, 395], [847, 422], [762, 422]], "bbox": [762, 395, 847, 422], "points": [[792, 396], [794, 395], [819, 395], [820, 396], [822, 395], [823, 396], [846, 396], [847, 397], [847, 421], [846, 422], [763, 422], [762, 421], [763, 419], [763, 398], [762, 397], [763, 396]], "type": "cell", "rows": [5, 5], "columns": [5, 5], "text_list": []}, {"location": [[673, 395], [759, 395], [759, 422], [673, 422]], "bbox": [673, 395, 759, 422], "points": [[673, 396], [674, 395], [675, 396], [701, 396], [702, 395], [737, 395], [739, 396], [757, 396], [759, 397], [759, 419], [756, 422], [674, 422], [673, 421]], "type": "cell", "rows": [5, 5], "columns": [4, 4], "text_list": []}, {"location": [[383, 395], [569, 395], [569, 422], [383, 422]], "bbox": [383, 395, 569, 422], "points": [[383, 396], [384, 395], [394, 395], [395, 396], [396, 395], [397, 396], [402, 396], [403, 395], [404, 396], [405, 396], [406, 395], [422, 395], [423, 396], [444, 396], [445, 395], [457, 395], [458, 396], [459, 395], [460, 396], [465, 396], [466, 395], [471, 395], [472, 396], [473, 395], [474, 396], [477, 396], [478, 395], [480, 395], [481, 
396], [483, 395], [487, 395], [488, 396], [568, 396], [569, 397], [569, 421], [568, 422], [384, 422], [383, 421]], "type": "cell", "rows": [5, 5], "columns": [2, 2], "text_list": []}, {"location": [[1213, 364], [1298, 364], [1298, 391], [1213, 391]], "bbox": [1213, 364, 1298, 391], "points": [[1213, 366], [1215, 364], [1216, 366], [1217, 364], [1296, 364], [1298, 366], [1298, 390], [1296, 391], [1215, 391], [1213, 390]], "type": "cell", "rows": [4, 4], "columns": [10, 10], "text_list": []}, {"location": [[1123, 364], [1209, 364], [1209, 391], [1123, 391]], "bbox": [1123, 364, 1209, 391], "points": [[1123, 366], [1125, 364], [1129, 364], [1130, 366], [1132, 364], [1206, 364], [1209, 367], [1209, 390], [1208, 391], [1125, 391], [1123, 390]], "type": "cell", "rows": [4, 4], "columns": [9, 9], "text_list": []}, {"location": [[1032, 364], [1118, 364], [1118, 391], [1032, 391]], "bbox": [1032, 364, 1118, 391], "points": [[1035, 364], [1036, 366], [1037, 364], [1116, 364], [1118, 366], [1118, 390], [1116, 391], [1035, 391], [1032, 389], [1032, 385], [1033, 384], [1032, 383], [1033, 382], [1032, 381], [1033, 380], [1033, 375], [1032, 374], [1033, 373], [1032, 371], [1033, 370], [1032, 369], [1032, 367]], "type": "cell", "rows": [4, 4], "columns": [8, 8], "text_list": []}, {"location": [[942, 364], [1027, 364], [1027, 391], [942, 391]], "bbox": [942, 364, 1027, 391], "points": [[942, 366], [943, 364], [944, 364], [946, 366], [947, 364], [1023, 364], [1024, 366], [1025, 364], [1026, 364], [1027, 366], [1027, 390], [1026, 391], [943, 391], [942, 390]], "type": "cell", "rows": [4, 4], "columns": [7, 7], "text_list": []}, {"location": [[852, 364], [937, 364], [937, 391], [852, 391]], "bbox": [852, 364, 937, 391], "points": [[853, 366], [854, 364], [856, 366], [857, 364], [934, 364], [935, 366], [936, 364], [937, 366], [937, 390], [936, 391], [854, 391], [852, 389], [853, 388]], "type": "cell", "rows": [4, 4], "columns": [6, 6], "text_list": []}, {"location": [[762, 364], [847, 
364], [847, 391], [762, 391]], "bbox": [762, 364, 847, 391], "points": [[762, 366], [763, 364], [764, 364], [766, 366], [767, 364], [846, 364], [847, 366], [847, 390], [846, 391], [763, 391], [762, 390], [763, 389], [763, 367]], "type": "cell", "rows": [4, 4], "columns": [5, 5], "text_list": []}, {"location": [[673, 364], [759, 364], [759, 391], [673, 391]], "bbox": [673, 364, 759, 391], "points": [[673, 366], [674, 364], [756, 364], [759, 367], [759, 389], [756, 391], [674, 391], [673, 390]], "type": "cell", "rows": [4, 4], "columns": [4, 4], "text_list": []}, {"location": [[383, 364], [569, 364], [569, 391], [383, 391]], "bbox": [383, 364, 569, 391], "points": [[383, 366], [384, 364], [567, 364], [569, 367], [569, 390], [568, 391], [384, 391], [383, 390]], "type": "cell", "rows": [4, 4], "columns": [2, 2], "text_list": []}, {"location": [[233, 364], [377, 364], [377, 422], [233, 422]], "bbox": [233, 364, 377, 422], "points": [[235, 366], [236, 364], [239, 364], [240, 366], [242, 364], [243, 366], [244, 364], [245, 366], [246, 364], [247, 366], [249, 364], [250, 366], [251, 364], [252, 366], [253, 364], [254, 366], [259, 366], [260, 364], [261, 366], [263, 364], [264, 366], [265, 364], [266, 366], [267, 364], [268, 366], [270, 364], [271, 366], [272, 364], [273, 366], [274, 364], [276, 366], [277, 364], [278, 366], [279, 364], [280, 366], [374, 366], [375, 364], [376, 364], [377, 366], [377, 421], [376, 422], [236, 422], [235, 421], [235, 396], [233, 395], [235, 394], [233, 392], [233, 390], [235, 389], [233, 388], [233, 383], [235, 382]], "type": "cell", "rows": [4, 5], "columns": [1, 1], "text_list": []}, {"location": [[1213, 334], [1298, 334], [1298, 360], [1213, 360]], "bbox": [1213, 334, 1298, 360], "points": [[1213, 335], [1215, 334], [1296, 334], [1298, 335], [1298, 359], [1296, 360], [1215, 360], [1213, 359]], "type": "cell", "rows": [3, 3], "columns": [10, 10], "text_list": []}, {"location": [[1123, 334], [1209, 334], [1209, 360], [1123, 360]], "bbox": 
[1123, 334, 1209, 360], "points": [[1123, 335], [1125, 334], [1208, 334], [1209, 335], [1209, 359], [1208, 360], [1125, 360], [1123, 359]], "type": "cell", "rows": [3, 3], "columns": [9, 9], "text_list": []}, {"location": [[1032, 334], [1118, 334], [1118, 360], [1032, 360]], "bbox": [1032, 334, 1118, 360], "points": [[1032, 335], [1033, 334], [1116, 334], [1118, 335], [1118, 359], [1116, 360], [1033, 360], [1032, 359], [1032, 355], [1033, 354], [1032, 353], [1033, 352], [1032, 350], [1033, 349], [1032, 348], [1033, 347], [1033, 345], [1032, 343], [1033, 342], [1032, 341]], "type": "cell", "rows": [3, 3], "columns": [8, 8], "text_list": []}, {"location": [[942, 334], [1027, 334], [1027, 360], [942, 360]], "bbox": [942, 334, 1027, 360], "points": [[942, 335], [943, 334], [1026, 334], [1027, 335], [1027, 359], [1026, 360], [943, 360], [942, 359]], "type": "cell", "rows": [3, 3], "columns": [7, 7], "text_list": []}, {"location": [[852, 334], [937, 334], [937, 360], [852, 360]], "bbox": [852, 334, 937, 360], "points": [[854, 334], [936, 334], [937, 335], [937, 359], [936, 360], [854, 360], [853, 359], [853, 337], [852, 336]], "type": "cell", "rows": [3, 3], "columns": [6, 6], "text_list": []}, {"location": [[762, 334], [847, 334], [847, 360], [762, 360]], "bbox": [762, 334, 847, 360], "points": [[762, 335], [763, 334], [846, 334], [847, 335], [847, 359], [846, 360], [764, 360], [763, 359], [763, 336]], "type": "cell", "rows": [3, 3], "columns": [5, 5], "text_list": []}, {"location": [[673, 334], [759, 334], [759, 360], [673, 360]], "bbox": [673, 334, 759, 360], "points": [[673, 335], [674, 334], [756, 334], [759, 336], [759, 359], [757, 360], [674, 360], [673, 359]], "type": "cell", "rows": [3, 3], "columns": [4, 4], "text_list": []}, {"location": [[383, 334], [569, 334], [569, 360], [383, 360]], "bbox": [383, 334, 569, 360], "points": [[383, 335], [384, 334], [568, 334], [569, 335], [569, 359], [568, 360], [384, 360], [383, 359]], "type": "cell", "rows": [3, 3], 
"columns": [2, 2], "text_list": []}, {"location": [[233, 334], [377, 334], [377, 360], [233, 360]], "bbox": [233, 334, 377, 360], "points": [[233, 335], [235, 334], [376, 334], [377, 335], [377, 359], [376, 360], [236, 360], [235, 359], [235, 356], [233, 355], [233, 339], [235, 337], [233, 336]], "type": "cell", "rows": [3, 3], "columns": [1, 1], "text_list": []}, {"location": [[1213, 302], [1298, 302], [1298, 329], [1213, 329]], "bbox": [1213, 302, 1298, 329], "points": [[1213, 304], [1215, 302], [1296, 302], [1298, 304], [1298, 328], [1296, 329], [1215, 329], [1213, 328]], "type": "cell", "rows": [2, 2], "columns": [10, 10], "text_list": []}, {"location": [[1123, 302], [1209, 302], [1209, 329], [1123, 329]], "bbox": [1123, 302, 1209, 329], "points": [[1123, 304], [1125, 302], [1206, 302], [1209, 305], [1209, 328], [1208, 329], [1125, 329], [1123, 328]], "type": "cell", "rows": [2, 2], "columns": [9, 9], "text_list": []}, {"location": [[1032, 302], [1118, 302], [1118, 329], [1032, 329]], "bbox": [1032, 302, 1118, 329], "points": [[1032, 304], [1033, 302], [1116, 302], [1118, 304], [1118, 328], [1116, 329], [1035, 329], [1032, 327], [1032, 325], [1033, 323], [1032, 322], [1033, 321], [1033, 314], [1032, 313], [1033, 312], [1032, 311], [1033, 309], [1032, 308]], "type": "cell", "rows": [2, 2], "columns": [8, 8], "text_list": []}, {"location": [[942, 302], [1027, 302], [1027, 329], [942, 329]], "bbox": [942, 302, 1027, 329], "points": [[942, 304], [943, 302], [1026, 302], [1027, 304], [1027, 328], [1026, 329], [944, 329], [942, 327]], "type": "cell", "rows": [2, 2], "columns": [7, 7], "text_list": []}, {"location": [[852, 302], [937, 302], [937, 329], [852, 329]], "bbox": [852, 302, 937, 329], "points": [[852, 304], [853, 302], [936, 302], [937, 304], [937, 328], [936, 329], [854, 329], [853, 328], [853, 305]], "type": "cell", "rows": [2, 2], "columns": [6, 6], "text_list": []}, {"location": [[762, 302], [847, 302], [847, 329], [762, 329]], "bbox": [762, 302, 847, 
329], "points": [[762, 304], [763, 302], [846, 302], [847, 304], [847, 328], [846, 329], [764, 329], [763, 328], [763, 305]], "type": "cell", "rows": [2, 2], "columns": [5, 5], "text_list": []}, {"location": [[673, 302], [759, 302], [759, 329], [673, 329]], "bbox": [673, 302, 759, 329], "points": [[673, 304], [674, 302], [756, 302], [759, 305], [759, 328], [757, 329], [674, 329], [673, 328]], "type": "cell", "rows": [2, 2], "columns": [4, 4], "text_list": []}, {"location": [[383, 302], [569, 302], [569, 329], [383, 329]], "bbox": [383, 302, 569, 329], "points": [[383, 304], [384, 302], [568, 302], [569, 304], [569, 328], [568, 329], [384, 329], [383, 328]], "type": "cell", "rows": [2, 2], "columns": [2, 2], "text_list": []}, {"location": [[1213, 273], [1298, 273], [1298, 298], [1213, 298]], "bbox": [1213, 273, 1298, 298], "points": [[1213, 274], [1215, 273], [1296, 273], [1298, 274], [1298, 297], [1296, 298], [1215, 298], [1213, 297]], "type": "cell", "rows": [1, 1], "columns": [10, 10], "text_list": []}, {"location": [[1123, 273], [1209, 273], [1209, 298], [1123, 298]], "bbox": [1123, 273, 1209, 298], "points": [[1123, 274], [1125, 273], [1208, 273], [1209, 274], [1209, 297], [1208, 298], [1125, 298], [1123, 297]], "type": "cell", "rows": [1, 1], "columns": [9, 9], "text_list": []}, {"location": [[1032, 273], [1118, 273], [1118, 298], [1032, 298]], "bbox": [1032, 273, 1118, 298], "points": [[1032, 274], [1033, 273], [1116, 273], [1118, 274], [1118, 297], [1116, 298], [1035, 298], [1032, 295]], "type": "cell", "rows": [1, 1], "columns": [8, 8], "text_list": []}, {"location": [[942, 273], [1027, 273], [1027, 298], [942, 298]], "bbox": [942, 273, 1027, 298], "points": [[942, 274], [943, 273], [1026, 273], [1027, 274], [1027, 297], [1026, 298], [943, 298], [942, 297]], "type": "cell", "rows": [1, 1], "columns": [7, 7], "text_list": []}, {"location": [[852, 273], [937, 273], [937, 298], [852, 298]], "bbox": [852, 273, 937, 298], "points": [[852, 274], [853, 273], [936, 
273], [937, 274], [937, 297], [936, 298], [854, 298], [853, 297], [853, 281], [852, 280]], "type": "cell", "rows": [1, 1], "columns": [6, 6], "text_list": []}, {"location": [[762, 273], [847, 273], [847, 298], [762, 298]], "bbox": [762, 273, 847, 298], "points": [[762, 274], [763, 273], [846, 273], [847, 274], [847, 297], [846, 298], [763, 298], [762, 297], [763, 295], [763, 277], [762, 276]], "type": "cell", "rows": [1, 1], "columns": [5, 5], "text_list": []}, {"location": [[673, 273], [759, 273], [759, 298], [673, 298]], "bbox": [673, 273, 759, 298], "points": [[673, 274], [674, 273], [757, 273], [759, 274], [759, 295], [756, 298], [674, 298], [673, 297]], "type": "cell", "rows": [1, 1], "columns": [4, 4], "text_list": []}, {"location": [[573, 273], [668, 273], [668, 576], [573, 576]], "bbox": [573, 273, 668, 576], "points": [[574, 274], [575, 273], [667, 273], [668, 274], [668, 575], [667, 576], [574, 576], [573, 575], [574, 574], [574, 457], [575, 456], [575, 453], [574, 452]], "type": "cell", "rows": [1, 10], "columns": [3, 3], "text_list": []}, {"location": [[383, 273], [569, 273], [569, 298], [383, 298]], "bbox": [383, 273, 569, 298], "points": [[383, 274], [384, 273], [568, 273], [569, 274], [569, 297], [568, 298], [384, 298], [383, 297]], "type": "cell", "rows": [1, 1], "columns": [2, 2], "text_list": []}, {"location": [[233, 273], [377, 273], [377, 329], [233, 329]], "bbox": [233, 273, 377, 329], "points": [[233, 274], [235, 273], [376, 273], [377, 274], [377, 328], [376, 329], [236, 329], [235, 328], [235, 305], [233, 304], [235, 302], [233, 301], [233, 294], [235, 293], [233, 292], [235, 291], [233, 290], [235, 288], [235, 276]], "type": "cell", "rows": [1, 2], "columns": [1, 1], "text_list": []}, {"location": [[166, 272], [231, 272], [231, 576], [166, 576]], "bbox": [166, 272, 231, 576], "points": [[166, 273], [167, 272], [168, 273], [230, 273], [231, 274], [231, 278], [230, 279], [230, 329], [231, 330], [230, 332], [230, 339], [231, 340], [231, 353], 
[230, 354], [230, 360], [231, 361], [230, 362], [230, 385], [231, 387], [231, 392], [230, 394], [230, 430], [231, 431], [231, 459], [230, 460], [230, 464], [231, 465], [231, 467], [230, 468], [230, 484], [231, 485], [231, 486], [230, 487], [230, 488], [231, 490], [230, 491], [231, 492], [231, 508], [230, 509], [230, 526], [231, 527], [231, 534], [230, 535], [230, 571], [231, 573], [231, 575], [230, 576], [167, 576], [166, 575]], "type": "cell", "rows": [1, 10], "columns": [0, 0], "text_list": []}, {"location": [[1212, 240], [1298, 240], [1298, 267], [1212, 267]], "bbox": [1212, 240, 1298, 267], "points": [[1215, 240], [1216, 242], [1217, 240], [1296, 240], [1298, 242], [1298, 266], [1296, 267], [1215, 267], [1212, 265], [1213, 264], [1212, 263], [1212, 243]], "type": "cell", "rows": [0, 0], "columns": [10, 10], "text_list": []}, {"location": [[1123, 240], [1209, 240], [1209, 267], [1123, 267]], "bbox": [1123, 240, 1209, 267], "points": [[1123, 242], [1125, 240], [1126, 242], [1127, 242], [1128, 240], [1170, 240], [1171, 242], [1173, 240], [1183, 240], [1184, 242], [1185, 240], [1190, 240], [1191, 242], [1192, 240], [1208, 240], [1209, 242], [1209, 266], [1208, 267], [1125, 267], [1123, 266]], "type": "cell", "rows": [0, 0], "columns": [9, 9], "text_list": []}, {"location": [[1032, 240], [1118, 240], [1118, 267], [1032, 267]], "bbox": [1032, 240, 1118, 267], "points": [[1035, 240], [1036, 242], [1037, 240], [1038, 242], [1039, 240], [1116, 240], [1118, 242], [1118, 266], [1116, 267], [1035, 267], [1032, 265], [1032, 243]], "type": "cell", "rows": [0, 0], "columns": [8, 8], "text_list": []}, {"location": [[942, 240], [1027, 240], [1027, 267], [942, 267]], "bbox": [942, 240, 1027, 267], "points": [[942, 242], [943, 240], [944, 240], [946, 242], [947, 242], [948, 240], [1026, 240], [1027, 242], [1027, 266], [1026, 267], [943, 267], [942, 266]], "type": "cell", "rows": [0, 0], "columns": [7, 7], "text_list": []}, {"location": [[852, 240], [937, 240], [937, 267], [852, 
267]], "bbox": [852, 240, 937, 267], "points": [[854, 240], [856, 242], [857, 240], [858, 242], [859, 240], [860, 242], [861, 240], [873, 240], [874, 242], [875, 240], [877, 242], [880, 242], [881, 240], [885, 240], [886, 242], [887, 240], [892, 240], [893, 242], [894, 240], [897, 240], [898, 242], [899, 240], [936, 240], [937, 242], [937, 266], [936, 267], [854, 267], [852, 265], [852, 243]], "type": "cell", "rows": [0, 0], "columns": [6, 6], "text_list": []}, {"location": [[762, 240], [847, 240], [847, 267], [762, 267]], "bbox": [762, 240, 847, 267], "points": [[762, 242], [763, 240], [764, 240], [766, 242], [767, 242], [768, 240], [769, 242], [770, 242], [771, 240], [781, 240], [782, 242], [785, 242], [787, 240], [788, 242], [790, 242], [791, 240], [795, 240], [796, 242], [797, 240], [803, 240], [804, 242], [805, 240], [846, 240], [847, 242], [847, 266], [846, 267], [763, 267], [762, 266]], "type": "cell", "rows": [0, 0], "columns": [5, 5], "text_list": []}, {"location": [[673, 240], [759, 240], [759, 267], [673, 267]], "bbox": [673, 240, 759, 267], "points": [[673, 242], [674, 240], [675, 242], [678, 242], [679, 240], [680, 242], [681, 240], [693, 240], [694, 242], [695, 240], [697, 242], [700, 242], [701, 240], [705, 240], [706, 242], [707, 240], [712, 240], [713, 242], [714, 240], [756, 240], [759, 243], [759, 265], [756, 267], [674, 267], [673, 266]], "type": "cell", "rows": [0, 0], "columns": [4, 4], "text_list": []}, {"location": [[573, 240], [668, 240], [668, 267], [573, 267]], "bbox": [573, 240, 668, 267], "points": [[601, 242], [602, 240], [630, 240], [631, 242], [632, 242], [633, 240], [637, 240], [638, 242], [667, 242], [668, 243], [668, 266], [667, 267], [574, 267], [573, 266], [573, 243], [574, 242]], "type": "cell", "rows": [0, 0], "columns": [3, 3], "text_list": []}, {"location": [[166, 240], [569, 240], [569, 267], [166, 267]], "bbox": [166, 240, 569, 267], "points": [[166, 242], [167, 240], [168, 242], [568, 242], [569, 243], [569, 266], [568, 
267], [167, 267], [166, 266]], "type": "cell", "rows": [0, 0], "columns": [0, 2], "text_list": []}, {"location": [[161, 236], [1303, 236], [1303, 581], [161, 581]], "bbox": [161, 236, 1303, 581], "points": [[162, 236], [162, 238], [161, 239], [161, 580], [162, 581], [1302, 581], [1303, 580], [1303, 237], [166, 237], [164, 236]], "type": "table", "contains": [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182]}]}}] ================================================ FILE: libs/kotaemon/tests/resources/html/dummy.html ================================================

細則 本社編(情報システム部)              分類番号 157300

2020.2

 

 

 

 

1. スパットくん紛失・盗難時の取扱

スパットくんの紛失・盗難の際は、速やかに停止依頼処理を入力するとともに、報告書を起票します。

NO

項目

内容

対象のスパット

くん確認

紛失・盗難に気づいた時には、対象のスパットくんの端末識別番号を確認します。

※盗難の場合は警察への届出も必要です。

※紛失・盗難の場合は盗難・紛失事故報告も必要です。

あいリクエスト(総務室(大阪))

『盗難・紛失事故兼個人情報等事故報告』

報告書の起票

あいリクエスト(システム業務室)-『モバイル決済端末(スパットくん)紛失・盗難報告書』を起票します。

停止依頼入力

モバイル端末管理ウェブより停止依頼処理を入力します。

※停止依頼入力により該当スパットくんは使用不可となります。

報告書の承認と

担当室への報告

所属長はあいリクエストにて申請・送付された報告書を確認・承認します。承認後、報告書をあいリクエスト(総務室(大阪))-『紛失・盗難事故 兼 個人情報等事故報告』に添付し報告します。(関連細則:本支社編11110/210060「紛失・盗難事故等の被害報告」を参照)

※日計処理の実施が「無」の場合本社担当室(システム業務室、

契約審査室、収納サービス室、損保サービス室)へ連絡する。

【関連マニュアル】 スパットくん・モバイル管理ウェブ操作マニュアル

 

<スパットくん再発見時の対応>

NO

項目

内容

端末設置組織の

確認

スパットくんの設置状況照会を行い、紛失・盗難となったスパットくんであるか確認します。

端末停止解除

入力

モバイル端末管理ウェブより停止解除処理を入力します。

※スパットくんの利用再開は、支社にて停止解除入力から2営業日以上経過してから利用下さい。

        【関連マニュアル】 スパットくん・モバイル管理ウェブ操作マニュアル

 

 

 

2. スパットくん故障時の取扱

スパットくんが故障した場合には、代替機への交換と故障機の返却を行います。

NO

項目

内容

故障内容の確認と報告書の起票

スパットくんが故障した場合は、あいリクエスト(システム業務室)-『モバイル決済端末(スパットくん)故障報告書』を起票します。

修理依頼入力

 

『モバイル端末管理ウェブ』より修理依頼処理を入力します。修理

依頼処理の入力により、代替機が送付されます。
その際、「モバイル決済端末修理依頼書」を印刷し、「モバイル決済端末(スパットくん)故障報告書」を参照して必要事項を記入します。

報告書の承認と担当室への報告

所属長はあいリクエストにて申請・送付された報告書を確認・承認し、システム業務室に報告します。

梱包

以下をセットで梱包します。

・故障したスパットくん本体

・付属品(充電アダプタ、コード、タッチペン、ストラップ)

・モバイル決済端末修理依頼書

※各所属にて保管のスパットくん送付箱に梱包します。

 

代替機の受取と

故障機の送付

 

入力後数日でスパットくんの代替機が到着します。

受け取ると同時に、配送業者(日本通運)に故障したスパットくん、および「モバイル決済端末修理依頼書」を渡します。

※「モバイル決済端末修理依頼書」を忘れずに同梱下さい。

 

代替機の端末受取入力

モバイル端末管理ウェブより代替機の端末受取処理を入力します。

          【関連マニュアル】 スパットくん・モバイル管理ウェブ操作マニュアル

 

- 1 -

================================================ FILE: libs/kotaemon/tests/resources/policy.md ================================================ # 5 年ごと配当付特定状態保障定期保険特約条項 目次 ## 1. この特約の概要 第 1 条 特約保険金の支払 第 2 条 特約保険金の支払に関する補則 第 3 条 特約保険金の免責事由に該当した場合の取扱 第 4 条 特約保険金の請求、支払時期および支払場所 第 5 条 特約の保険料払込の免除 第 6 条 特約の締結 第 7 条 特約の責任開始期 第 8 条 特約の保険期間および保険料払込期間 第 9 条 特約の保険料の払込 第 10 条 猶予期間中の保険事故と保険料の取扱 第 11 条 特約の失効 第 12 条 特約の復活 第 13 条 告知義務 第 14 条 告知義務違反による解除 第 15 条 特約を解除できない場合 第 16 条 重大事由による解除 第 17 条 特約の解約 第 18 条 特約の返還金 第 19 条 特約の消滅とみなす場合 第 20 条 債権者等により特約が解約される場合の取扱 第 21 条 特約保険金額の減額 第 22 条 特約の更新 第 23 条 特約の契約者配当金 第 24 条 主契約の内容変更に伴う特約の取扱 第 25 条 主契約について保険料の自動貸付の規定を適用 する場合の取扱 第 26 条 主契約を払済保険に変更する場合の取扱 第 27 条 法令等の改正等に伴う特約障害保険金および特 約介護保険金の支払事由に関する規定の変更 第 28 条 管轄裁判所 第 29 条 契約内容の登録 第 30 条 主約款の規定の準用 第 31 条 5 年ごと配当付定期保険または 5 年ごと利差配 当付定期保険に付加した場合の特則 第 32 条 5 年ごと配当付生存給付金付定期保険または 5 年ごと利差配当付生存給付金付定期保険に付加 した場合の特則 第 33 条 5 年ごと配当付逓増定期保険または 5 年ごと利 差配当付逓増定期保険に付加した場合の特則 第 34 条 5 年ごと配当付養老保険または 5 年ごと利差配 当付養老保険に付加した場合の特則 第 35 条 5 年ごと配当付終身保険に 5 年ごと配当付年金 支払移行特約等を付加した場合または 5 年ごと 利差配当付終身保険に 5 年ごと利差配当付年金 支払移行特約等を付加した場合の特約の取扱 第 36 条 保険料払込期間が終身の 5 年ごと配当付終身保 険または保険料払込期間が終身の 5 年ごと利差 配当付終身保険に付加した場合の特則 第 37 条 5 年ごと配当付更新型終身移行保険または 5 年 ごと利差配当付更新型終身移行保険に付加した 場合の特則 第 38 条 5 年ごと配当付更新型終身移行保険または 5 年 ごと利差配当付更新型終身移行保険に 5 年ごと 配当付年金支払移行特約等を付加した場合の特 約の取扱 第 39 条 5 年ごと配当付介護年金終身保障保険または 5 年ごと利差配当付介護年金終身保障保険に付加 した場合の特則 第 40 条 5 年ごと配当付終身医療保険または 5 年ごと利 差配当付終身医療保険に付加した場合の特則 第 41 条 5 年ごと配当付介護年金保険(解約返還金なし 型)に付加した場合の特則 第 42 条 転換後契約または変更後契約に付加した場合の 特則 第 43 条 転換特約、部分保障変更特約または家族内保障 承継特約を付加した場合の特則 第 44 条 特別条件を付けた場合の特則 第 45 条 契約日が平成 22 年 3 月 1 日以前の主契約に付加 した場合の特則 第 46 条 契約日が平成 24 年 10 月 1 日以前の主契約に付加 した場合の特約特定疾病保険金、特約障害保険 金および特約介護保険金の代理請求 ## 2. 5 年ごと配当付特定状態保障定期保険特約条項 ## 3. 
(この特約の概要) (2015 年 5 月 21 日改正) この特約は、つぎの給付を行うことを主な内容とするものです。なお、特約死亡保険金額、特約特定疾病保険金額、特 約障害保険金額および特約介護保険金額は同額です。 | | 給付の内容 | | :----------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | 特約死亡保険金 | 被保険者がこの特約の保険期間中に死亡したときに支払います。 | | 特約特定疾病保険金 | $\begin{array}{l}\text { 被保険者がこの特約の保険期間中に特定の疾病(悪性新生物(がん)、急性心筋梗塞または脳 } \\ \text { 卒中)により所定の状態に該当したときに支払います。 }\end{array}$ | | 特約障害保険金 | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-1.jpg?height=93&width=1317&top_left_y=2514&top_left_x=537) | | 特約介護保険金 | 被保険者がこの特約の保険期間中に傷害または疾病により所定の要介護状態に該当したとき | 1。この特約において支払う特約保険金はつぎのとおりです。 | | $\begin{array}{l}\text { 特約保険金を支払う場合(以下「支払事由」 } \\ \text { といいます。) }\end{array}$ | 支払額 | 受取人 | $\begin{array}{l}\text { 支払事由に該当しても特約保険金を支払 } \\ \text { わない場合 (以下「免責事由」といいます。) }\end{array}$ | | :-------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | 
$\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 死 } \\ 亡 \\ \text { 亡 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 }\end{array}$ | $\begin{array}{l}\text { 被保険者がこの特約の保険期間中に死亡し } \\ \text { たとき }\end{array}$ | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 } \\ \text { 額 }\end{array}$ | $\begin{array}{l}\text { 特 } \\ \text { 絢 } \\ \text { 㨐 } \\ \text { 険 } \\ \text { 善 } \\ \text { 聚 }\end{array}$ | $\begin{array}{l}\text { つぎのいずれかにより左記の支払事由が } \\ \text { 生じたとき } \\ \text { (1) この特約の責任開始期(復活の取扱が } \\ \text { 行われた後は、最後の復活の際の責任開 } \\ \text { 始期。以下同じ。)の属するもからその } \\ \text { 日を含めて } 3 \text { 年以内の自殺 } \\ \text { (2) 保険契約者または特約死亡保険金受 } \\ \text { 取人の故意 } \\ \text { (3) 戦争その他の变乱 }\end{array}$ | | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 特 } \\ \text { 定 } \\ \text { 疾 } \\ \text { 病 } \\ \text { 除 } \\ \text { 金 }\end{array}$ | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-2.jpg?height=1118&width=621&top_left_y=724&top_left_x=305) | | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 特 } \\ \text { 定 } \\ \text { 疾 } \\ \text { 病 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 } \\ \text { 受 } \\ \text { 取 } \\ \text { 人 }\end{array}$ | + | | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 障 } \\ \text { 害 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 }\end{array}$ | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-2.jpg?height=900&width=621&top_left_y=1837&top_left_x=305) | | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 障 } \\ \text { 害 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 } \\ \text { 受 } \\ \text { 取 } \\ \text { 人 }\end{array}$ | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-2.jpg?height=904&width=594&top_left_y=1837&top_left_x=1262) | | | 支払事由 | 支払額 | 受取人 | 免責事由 | | :---------------------------------------------------------------------------------------------------------------------------------------: | 
:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 介 } \\ \text { 護 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 }\end{array}$ | $\begin{array}{l}\text { 被保険者がこの特約の責任開始期以後の傷 } \\ \text { 害または疾病を原因として、この特約の保 } \\ \text { 険期間中に要介護状態(表4)に該当した } \\ \text { とき }\end{array}$ | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 } \\ \text { 額 }\end{array}$ | $\begin{array}{l}\text { 特 } \\ \text { 約 } \\ \text { 介 } \\ \text { 護 } \\ \text { 保 } \\ \text { 険 } \\ \text { 金 } \\ \text { 受 } \\ \text { 取 } \\ \text { 人 }\end{array}$ | $\begin{array}{l}\text { つぎのいずれかにより左記の支払事由が生 } \\ \text { じたとき } \\ \text { (1) 保険契約者または被保険者の故意また } \\ \text { は重大な過失 } \\ \text { (2) 被保険者の犯罪行為 } \\ \text { (3) 被保険者の精神障害を原因とする事故 } \\ \text { (4) 被保険者の泥酔の状態を原因とする事 } \\ \text { 故 } \\ \text 
{ (5) 被保険者が法令に定める運転資格を持 } \\ \text { たないで運転している間に生じた事故 } \\ \text { (6) 被保険者が法令に定める酒気帯び運転 } \\ \text { またはこれに相当する運転をしている間 } \\ \text { に生じた事故 } \\ \text { (7) 被保険者の薬物依存 } \\ \text { (8) 地震、噴火または津波 } \\ \text { (9) 戦争その他の变乱 }\end{array}$ | 2. 第 1 項の特約特定疾病保険金の支払事由の(1)に該当した場合でも、この特約の責任開始期の属する日からその日を含 めて 90 日以内に乳房の悪性新生物(表 1 中、基本分類コード C 50 の悪性新生物。以下同じ。)に䍜患し、医師により診断 確定されたときは、当会社は、特約特定疾病保険金を支払いません。ただし、その後(乳房の悪性新生物についてはこ の特約の責任開始期の属する日からその日を含めて 90 日経過後)、この特約の保険期間中に、被保険者がその乳房の悪性 新生物と因果関係のない悪性新生物(表 1)に罹患し、医師により診断確定されたときは、特約特定疾病保険金を支払 います。 ## 4. 表 1 対象となる悪性新生物、急性心筋梗塞、脳卒中 対象となる悪性新生物、急性心筋梗塞、脳卒中とは、次表によって定義づけられる疾病とし、かつ、平成 21 年 3 月 23 日 総務省告示第 176 号にもとづ<厚生労働省大臣官房統計情報部編「疾病、傷害および死因統計分類提要ICD-10(2003 年版)準拠」に記載された分類項目中、次表の基本分類コードに規定される内容によるものをいいます。 | 疾 病 名 | 疾 病 の 定 義 | 分 類 項 目 | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-3.jpg?height=85&width=150&top_left_y=1540&top_left_x=1689) | | :--------: | :-----------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | 悪性新生物 | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-3.jpg?height=732&width=445&top_left_y=1617&top_left_x=442) | ![](https://cdn.mathpix.com/cropped/2023_09_15_60b7a05a11e6ef69c8bbg-3.jpg?height=732&width=750&top_left_y=1617&top_left_x=909) | $\begin{array}{l}\mathrm{C} 00-\mathrm{C} 1 \\ \mathrm{C} 00 \\ \mathrm{C} 01 \\ \mathrm{C} 02 \\ \mathrm{C} 03 \\ \mathrm{C} 04 \\ \mathrm{C} 05 \\ \mathrm{C} 06 \\ \mathrm{C} 07 \\ \mathrm{C} 08 \\ \mathrm{C} 09 \\ \mathrm{C} 10 \\ 
class Pipeline(BaseComponent):
    """Minimal retrieve-then-generate pipeline used by the test suite.

    ``run`` sends the query through ``retrieving_pipeline``, joins whatever
    comes back with newlines, and hands the joined string to ``llm``. Every
    endpoint and key below is a placeholder value.
    """

    # Completion node, declared through ``withx`` with placeholder Azure
    # settings.
    llm: AzureOpenAI = AzureOpenAI.withx(
        azure_endpoint="https://test.openai.azure.com/",
        openai_api_key="some-key",
        openai_api_version="2023-03-15-preview",
        deployment_name="gpt35turbo",
        temperature=0,
        request_timeout=60,
    )

    # Retrieval node: a ``lazy``-wrapped Chroma vector store plus an Azure
    # embedding node. ``tempfile.mkdtemp()`` is evaluated while this class
    # body executes, so the temporary directory is created at import time.
    retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx(
        vector_store=lazy(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
        embedding=LCAzureOpenAIEmbeddings.withx(
            model="text-embedding-ada-002",
            deployment="embedding-deployment",
            azure_endpoint="https://test.openai.azure.com/",
            openai_api_key="some-key",
        ),
    )

    def run(self, text: str) -> LLMInterface:
        """Retrieve entries for *text* and return the LLM output for them.

        Args:
            text: Query forwarded unchanged to ``retrieving_pipeline``.

        Returns:
            Whatever ``self.llm`` returns for the newline-joined retrieval
            results.
        """
        # NOTE(review): the annotation claims the retriever yields plain
        # strings; confirm against VectorRetrieval, because ``str.join``
        # rejects non-str items.
        matched_texts: List[str] = self.retrieving_pipeline(text)
        return self.llm("\n".join(matched_texts))
def generate_chat_completion_obj(text):
    """Build a canned ``ChatCompletion`` whose only choice says *text*.

    Args:
        text: Content placed in the single assistant message.

    Returns:
        The object produced by ``ChatCompletion.parse_obj`` for the payload.
    """
    assistant_message = {
        "role": "assistant",
        "content": text,
        "function_call": None,
        "tool_calls": None,
    }
    only_choice = {
        "index": 0,
        "finish_reason": "stop",
        "message": assistant_message,
        "logprobs": None,
    }
    token_usage = {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}
    payload = {
        "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
        "object": "chat.completion",
        "created": 1692338378,
        "model": "gpt-35-turbo",
        "system_fingerprint": None,
        "choices": [only_choice],
        "usage": token_usage,
    }
    return ChatCompletion.parse_obj(payload)
@pytest.fixture
def llm():
    """Provide an ``AzureChatOpenAI`` client configured with dummy settings."""
    azure_settings = {
        "api_key": "dummy",
        "api_version": "2024-05-01-preview",
        "azure_deployment": "gpt-4o",
        "azure_endpoint": "https://test.openai.azure.com/",
    }
    return AzureChatOpenAI(**azure_settings)


@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_openai_chat_completion_responses_rewoo_error,
)
def test_agent_fail(openai_completion, llm, mock_google_search):
    """The ReWOO agent reports a failed, falsy response for the error replies."""
    tools = [GoogleSearchTool(), WikipediaTool(), LLMTool(llm=llm)]
    rewoo = RewooAgent(planner_llm=llm, solver_llm=llm, plugins=tools)

    outcome = rewoo("Tell me about Cinnamon AI company")

    openai_completion.assert_called()
    assert not outcome
    assert outcome.status == "failed"


@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_openai_chat_completion_responses_rewoo,
)
def test_rewoo_agent(openai_completion, llm, mock_google_search):
    """The ReWOO agent returns the canned final answer for the valid replies."""
    tools = [GoogleSearchTool(), WikipediaTool(), LLMTool(llm=llm)]
    rewoo = RewooAgent(planner_llm=llm, solver_llm=llm, plugins=tools)

    outcome = rewoo("Tell me about Cinnamon AI company")

    openai_completion.assert_called()
    assert outcome.text == FINAL_RESPONSE_TEXT
@skip_openai_lc_wrapper_test
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_openai_chat_completion_responses_react,
)
def test_react_agent_langchain(openai_completion, llm, mock_google_search):
    """Run a native LangChain zero-shot ReAct agent over converted tools."""
    # Local import: inside this test ``AgentType`` is LangChain's enum, not
    # the one imported from kotaemon at module level.
    from langchain.agents import AgentType, initialize_agent

    native_tools = (GoogleSearchTool(), WikipediaTool(), LLMTool(llm=llm))
    lc_tools = [tool.to_langchain_format() for tool in native_tools]
    lc_agent = initialize_agent(
        lc_tools,
        llm.to_langchain_format(),
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
    )

    outcome = lc_agent("Tell me about Cinnamon AI company")

    openai_completion.assert_called()
    assert outcome


@skip_openai_lc_wrapper_test
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_openai_chat_completion_responses_react,
)
def test_wrapper_agent_langchain(openai_completion, llm, mock_google_search):
    """Run the ``LangchainAgent`` wrapper in ReAct mode over the same tools."""
    tools = [GoogleSearchTool(), WikipediaTool(), LLMTool(llm=llm)]
    wrapped = LangchainAgent(llm=llm, plugins=tools, agent_type=AgentType.react)

    outcome = wrapped("Tell me about Cinnamon AI company")

    openai_completion.assert_called()
    assert outcome
@pytest.fixture
def mock_llm():
    """Provide an ``AzureChatOpenAI`` client configured with dummy settings."""
    client = AzureChatOpenAI(
        azure_endpoint="https://test.openai.azure.com/",
        azure_deployment="gpt-4o",
        api_version="2024-05-01-preview",
        api_key="dummy",
    )
    return client


@pytest.fixture
def mock_post_processor():
    """Provide a ``RegexExtractor`` configured with a digit-run pattern."""
    digit_extractor = RegexExtractor(pattern=r"\d+")
    return digit_extractor


@pytest.fixture
def mock_prompt():
    """Provide a prompt component with a single ``{value}`` placeholder."""
    prompt = BasePromptComponent(template="Test prompt {value}")
    return prompt


@pytest.fixture
def mock_simple_linear_pipeline(mock_prompt, mock_llm, mock_post_processor):
    """Assemble a ``SimpleLinearPipeline`` from the three fixtures above."""
    stages = {
        "prompt": mock_prompt,
        "llm": mock_llm,
        "post_processor": mock_post_processor,
    }
    return SimpleLinearPipeline(**stages)
@pytest.fixture
def mock_gated_linear_pipeline_negative(mock_prompt, mock_llm, mock_post_processor):
    """Assemble a gated pipeline whose condition is the pattern "negative"."""
    gate = RegexExtractor(pattern="negative")
    return GatedLinearPipeline(
        prompt=mock_prompt,
        llm=mock_llm,
        post_processor=mock_post_processor,
        condition=gate,
    )


def test_simple_linear_pipeline_run(mocker, mock_simple_linear_pipeline):
    """The linear pipeline yields "123" and makes exactly one completion call."""
    create_mock = mocker.patch(
        "openai.resources.chat.completions.Completions.create",
        return_value=_openai_chat_completion_response,
    )

    outcome = mock_simple_linear_pipeline(value="abc")

    assert outcome.text == "123"
    assert create_mock.call_count == 1


def test_gated_linear_pipeline_run_positive(
    mocker, mock_gated_linear_pipeline_positive
):
    """With matching condition text the "positive" pipeline calls the LLM once."""
    create_mock = mocker.patch(
        "openai.resources.chat.completions.Completions.create",
        return_value=_openai_chat_completion_response,
    )

    outcome = mock_gated_linear_pipeline_positive(
        value="abc", condition_text="positive condition"
    )

    assert outcome.text == "123"
    assert create_mock.call_count == 1


def test_gated_linear_pipeline_run_negative(
    mocker, mock_gated_linear_pipeline_positive
):
    """With non-matching condition text the "positive" pipeline skips the LLM."""
    create_mock = mocker.patch(
        "openai.resources.chat.completions.Completions.create",
        return_value=_openai_chat_completion_response,
    )

    # The "positive" fixture is used on purpose: the text below does not
    # contain its pattern, which is the case under test.
    outcome = mock_gated_linear_pipeline_positive(
        value="abc", condition_text="negative condition"
    )

    assert outcome.content is None
    assert create_mock.call_count == 0
def test_simple_gated_branching_pipeline_run(
    mocker, mock_gated_linear_pipeline_positive, mock_gated_linear_pipeline_negative
):
    """Run one "negative" and two "positive" branches through the gated pipeline.

    The mocked completion contains no digits; the test expects an empty
    result text and exactly two completion calls.
    """
    digitless_reply: ChatCompletion = deepcopy(_openai_chat_completion_response)
    digitless_reply.choices[0].message.content = "a quick brown fox"
    create_mock = mocker.patch(
        "openai.resources.chat.completions.Completions.create",
        return_value=digitless_reply,
    )

    branching = GatedBranchingPipeline()
    for branch in (
        mock_gated_linear_pipeline_negative,
        mock_gated_linear_pipeline_positive,
        mock_gated_linear_pipeline_positive,
    ):
        branching.add_branch(branch)

    outcome = branching.run(value="abc", condition_text="positive condition")

    assert outcome.text == ""
    assert create_mock.call_count == 2
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_openai_chat_completion_response,
)
def test_cot_with_termination_callback(openai_completion):
    """The chain stops once ``terminate`` returns True after the first thought.

    The first mocked completion is "Bonjour", which satisfies the callback,
    so the expected output has no ``"output"`` key from the second thought.
    """
    llm = AzureChatOpenAI(
        api_key="dummy",
        api_version="2024-05-01-preview",
        azure_deployment="gpt-4o",
        azure_endpoint="https://test.openai.azure.com/",
    )
    thought1 = Thought(
        prompt="Word {word} in {language} is ",
        post_process=lambda string: {"translated": string},
    )
    thought2 = Thought(
        prompt="Translate {translated} to Japanese",
        post_process=lambda string: {"output": string},
    )
    thought = ManualSequentialChainOfThought(
        thoughts=[thought1, thought2],
        llm=llm,
        # The comparison already evaluates to a bool; no conditional
        # expression is needed around it.
        terminate=lambda d: d.get("translated", "") == "Bonjour",
    )
    output = thought(word="hallo", language="French")
    assert output.content == {
        "word": "hallo",
        "language": "French",
        "translated": "Bonjour",
    }
( meta_success, [{"epoch": "1700474422", "timestamp": "10:00:22", "count": "3"}], ), # get_all ( meta_success, { "took": 1, "timed_out": False, "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, "hits": { "total": {"value": 3, "relation": "eq"}, "max_score": 1.0, "hits": [ { "_index": "test", "_id": "a3774dab-b8f1-43ba-adb8-842cb7a76eeb", "_score": 1.0, "_source": {"content": "Sample text 0", "metadata": {}}, }, { "_index": "test", "_id": "b44f5593-7587-4f91-afd0-5736e5bd5bfe", "_score": 1.0, "_source": {"content": "Sample text 1", "metadata": {}}, }, { "_index": "test", "_id": "13ae7825-eef9-4214-a164-983c2e6bbeaa", "_score": 1.0, "_source": {"content": "Sample text 2", "metadata": {}}, }, ], }, }, ), # get by-id ( meta_success, { "took": 1, "timed_out": False, "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, "hits": { "total": {"value": 1, "relation": "eq"}, "max_score": 1.0, "hits": [ { "_index": "test", "_id": "a3774dab-b8f1-43ba-adb8-842cb7a76eeb", "_score": 1.0, "_source": {"content": "Sample text 0", "metadata": {}}, } ], }, }, ), # query ( meta_success, { "took": 2, "timed_out": False, "_shards": {"total": 1, "successful": 1, "skipped": 0, "failed": 0}, "hits": { "total": {"value": 3, "relation": "eq"}, "max_score": 0.13353139, "hits": [ { "_index": "test", "_id": "a3774dab-b8f1-43ba-adb8-842cb7a76eeb", "_score": 0.13353139, "_source": {"content": "Sample text 0", "metadata": {}}, }, { "_index": "test", "_id": "b44f5593-7587-4f91-afd0-5736e5bd5bfe", "_score": 0.13353139, "_source": {"content": "Sample text 1", "metadata": {}}, }, { "_index": "test", "_id": "13ae7825-eef9-4214-a164-983c2e6bbeaa", "_score": 0.13353139, "_source": {"content": "Sample text 2", "metadata": {}}, }, ], }, }, ), # delete ( meta_success, { "took": 10, "timed_out": False, "total": 1, "deleted": 1, "batches": 1, "version_conflicts": 0, "noops": 0, "retries": {"bulk": 0, "search": 0}, "throttled_millis": 0, "requests_per_second": -1.0, 
"throttled_until_millis": 0, "failures": [], }, ), # check exists ( meta_success, {"_shards": {"total": 2, "successful": 1, "failed": 0}}, ), # count ( meta_success, [{"epoch": "1700549363", "timestamp": "06:49:23", "count": "2"}], ), ] def test_inmemory_document_store_base_interfaces(tmp_path): """Test all interfaces of a a document store""" store = InMemoryDocumentStore() docs = [ Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"}) for idx in range(10) ] # Test add and get all assert len(store.get_all()) == 0, "Document store should be empty" store.add(docs) assert len(store.get_all()) == 10, "Document store should have 10 documents" # Test add with provided ids store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)]) assert len(store.get_all()) == 20, "Document store should have 20 documents" # Test add without exist_ok with pytest.raises(ValueError): store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)]) # Update ok with add exist_ok store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)], exist_ok=True) assert len(store.get_all()) == 20, "Document store should have 20 documents" # Test get with str id matched = store.get(docs[0].doc_id) assert len(matched) == 1, "Should return 1 document" assert matched[0].text == docs[0].text, "Should return the correct document" # Test get with list of ids matched = store.get([docs[0].doc_id, docs[1].doc_id]) assert len(matched) == 2, "Should return 2 documents" assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]] # Test delete with str id store.delete(docs[0].doc_id) assert len(store.get_all()) == 19, "Document store should have 19 documents" # Test delete with list of ids store.delete([docs[1].doc_id, docs[2].doc_id]) assert len(store.get_all()) == 17, "Document store should have 17 documents" # Test save store.save(tmp_path / "store.json") assert (tmp_path / "store.json").exists(), "File should exist" # Test load store2 = InMemoryDocumentStore() 
def test_simplefile_document_store_base_interfaces(tmp_path):
    """Exercise the full public interface of ``SimpleFileDocumentStore``.

    Covers add (auto ids, explicit ids, duplicate ids with and without
    ``exist_ok``), get (single id and list of ids), delete (single id and
    list of ids) and the implicit persistence to ``default.json`` inside the
    store directory.
    """
    store = SimpleFileDocumentStore(path=tmp_path)
    docs = [
        Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"})
        for idx in range(10)
    ]
    explicit_ids = [f"doc_{idx}" for idx in range(10)]
    persisted_file = tmp_path / "default.json"

    # Test add and get all
    assert len(store.get_all()) == 0, "Document store should be empty"
    store.add(docs)
    assert len(store.get_all()) == 10, "Document store should have 10 documents"

    # Test add with provided ids
    store.add(docs=docs, ids=explicit_ids)
    assert len(store.get_all()) == 20, "Document store should have 20 documents"

    # Test add without exist_ok: re-using ids must be rejected
    with pytest.raises(ValueError):
        store.add(docs=docs, ids=explicit_ids)

    # Update ok with add exist_ok: entries are overwritten, count is unchanged
    store.add(docs=docs, ids=explicit_ids, exist_ok=True)
    assert len(store.get_all()) == 20, "Document store should have 20 documents"

    # Test get with str id
    matched = store.get(docs[0].doc_id)
    assert len(matched) == 1, "Should return 1 document"
    assert matched[0].text == docs[0].text, "Should return the correct document"

    # Test get with list of ids
    matched = store.get([docs[0].doc_id, docs[1].doc_id])
    assert len(matched) == 2, "Should return 2 documents"
    assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]]

    # Test delete with str id
    store.delete(docs[0].doc_id)
    assert len(store.get_all()) == 19, "Document store should have 19 documents"

    # Test delete with list of ids
    store.delete([docs[1].doc_id, docs[2].doc_id])
    assert len(store.get_all()) == 17, "Document store should have 17 documents"

    # Test save: the store is expected to have written itself to disk already
    assert persisted_file.exists(), "File should exist"

    # Test load: a second store on the same directory sees the same documents
    store2 = SimpleFileDocumentStore(path=tmp_path)
    assert len(store2.get_all()) == 17, "Loaded document store should have 17 documents"

    os.remove(persisted_file)
def assert_embedding_result(output):
    """Check that ``output`` looks like an embedding model result.

    The result must be a list whose first element is a ``Document`` carrying
    an ``embedding`` list whose first component is a ``float``.
    """
    assert isinstance(output, list)

    head = output[0]
    assert isinstance(head, Document)

    vector = head.embedding
    assert isinstance(vector, list)
    assert isinstance(vector[0], float)
@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *_args, **_kwargs: openai_embedding,
)
def test_openai_embeddings_raw(openai_embedding_call):
    """Embed a single string through ``OpenAIEmbeddings`` with the client patched."""
    settings = {"api_key": "some-key", "model": "text-embedding-ada-002"}
    embedder = OpenAIEmbeddings(**settings)

    # The patched ``create`` returns the canned single-item response.
    assert_embedding_result(embedder("Hello world"))
    openai_embedding_call.assert_called()
@skip_when_voyageai_not_installed
@patch("voyageai.Client.embed", return_value=voyage_output_mock)
@patch("voyageai.AsyncClient.embed", return_value=voyage_output_mock)
def test_voyageai_embeddings(async_call, sync_call):
    """Embed a single string through ``VoyageAIEmbeddings`` with both clients patched.

    Stacked ``@patch`` decorators are applied bottom-up, so the mock for
    ``voyageai.AsyncClient.embed`` (the decorator closest to the function)
    arrives first and the mock for ``voyageai.Client.embed`` second. The
    parameter names follow that order.
    """
    model = VoyageAIEmbeddings(api_key="test")
    output = model("Hello, world!")

    # ``all()`` is vacuously true for an empty result, so reject that first.
    assert output, "Expected at least one embedded document"
    assert all(isinstance(doc, DocumentWithEmbedding) for doc in output)
@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_indexing(mock_embedding_create, tmp_path):
    """Index one document and check that both stores receive it.

    ``@patch`` (used without ``new``) injects its mock as the first positional
    argument, and pytest drops that many leading parameter names from fixture
    resolution. The mock therefore needs its own parameter ahead of
    ``tmp_path``; otherwise ``tmp_path`` would be bound to the mock and the
    vector store would persist to a directory named after the mock's repr.
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
    # Narrow the declared types so the private attributes below type-check.
    pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
    pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
    assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
    assert len(pipeline.doc_store._store) == 0, "Expected empty doc store"

    pipeline(text=Document(text="Hello world"))
    assert pipeline.vector_store._collection.count() == 1, "Index 1 item"
    assert len(pipeline.doc_store._store) == 1, "Expected 1 document"


@patch(
    "openai.resources.embeddings.Embeddings.create",
    side_effect=lambda *args, **kwargs: openai_embedding,
)
def test_retrieving(mock_embedding_create, tmp_path):
    """Index one document, retrieve it twice and expect identical results.

    The patched ``create`` mock is taken as the first parameter so that
    ``tmp_path`` is the real pytest fixture rather than the injected mock.
    """
    db = ChromaVectorStore(path=str(tmp_path))
    doc_store = InMemoryDocumentStore()
    embedding = AzureOpenAIEmbeddings(
        azure_deployment="text-embedding-ada-002",
        azure_endpoint="https://test.openai.azure.com/",
        api_key="some-key",
        api_version="version",
    )

    index_pipeline = VectorIndexing(
        vector_store=db, embedding=embedding, doc_store=doc_store
    )
    retrieval_pipeline = VectorRetrieval(
        vector_store=db, doc_store=doc_store, embedding=embedding
    )

    index_pipeline(text=Document(text="Hello world"))
    output = retrieval_pipeline(text="Hello world")
    output1 = retrieval_pipeline(text="Hello world")

    assert len(output) == 1, "Expect 1 results"
    assert output == output1, "Expect identical results"
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_azureopenai_model(openai_completion):
    """Call ``AzureChatOpenAI`` with a string and with a message list.

    Both calls must return an ``LLMInterface`` and must each reach the
    patched chat-completions endpoint.
    """
    model = AzureChatOpenAI(
        api_key="dummy",
        api_version="2024-05-01-preview",
        azure_deployment="gpt-4o",
        azure_endpoint="https://test.openai.azure.com/",
    )

    # test for str input
    output = model("hello world")
    assert isinstance(
        output, LLMInterface
    ), "Output for single text is not LLMInterface"
    openai_completion.assert_called()

    # Forget the first call so the next assert_called() is not vacuous.
    # reset_mock() keeps the configured side_effect by default.
    openai_completion.reset_mock()

    # test for list[message] input
    messages = [
        SystemMessage(content="You are a philosohper"),
        HumanMessage(content="What is the meaning of life"),
        AIMessage(content="42"),
        HumanMessage(content="What is the meaning of 42"),
    ]

    output = model(messages)
    assert isinstance(
        output, LLMInterface
    ), "Output for message list is not LLMInterface"
    openai_completion.assert_called()
# Canned text-completion payload returned by the patched
# ``Completions.create`` in the tests of this module.
# ``model_validate`` is the Pydantic v2 replacement for the deprecated
# ``parse_obj`` and matches how the embedding tests build their fixtures.
_openai_completion_response = Completion.model_validate(
    {
        "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0",
        "object": "text_completion",
        "created": 1392751226,
        "model": "gpt-35-turbo",
        "system_fingerprint": None,
        "choices": [
            {
                "text": "completion",
                "index": 0,
                "finish_reason": "length",
                "logprobs": None,
            }
        ],
        "usage": {"completion_tokens": 20, "prompt_tokens": 2, "total_tokens": 22},
    }
)
single text is not LLMInterface" @skip_llama_cpp_not_installed def test_llamacpp_model(): weight_path = Path(__file__).parent / "resources" / "ggml-vocab-llama.gguf" # test initialization model = LlamaCpp(model_path=str(weight_path), vocab_only=True) assert isinstance(model._obj, model._get_lc_class()) ================================================ FILE: libs/kotaemon/tests/test_mcp_manager.py ================================================ """Tests for ktem.mcp.manager module. Uses an in-memory SQLite engine to test MCPManager CRUD operations without depending on the application's database. """ import pytest from sqlalchemy import JSON, Column, String, create_engine from sqlalchemy.orm import DeclarativeBase, Session # --------------------------------------------------------------------------- # In-memory DB setup (mirrors ktem.mcp.db but fully isolated) # --------------------------------------------------------------------------- class _Base(DeclarativeBase): pass class _MCPTable(_Base): __tablename__ = "mcp_table" name = Column(String, primary_key=True, unique=True) config = Column(JSON, default={}) # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture() def manager(): """Fresh manager with a clean in-memory DB for each test.""" engine = create_engine("sqlite:///:memory:") _MCPTable.metadata.create_all(engine) return MCPManagerForTest(engine) # --------------------------------------------------------------------------- # Minimal MCPManager that uses the test engine # --------------------------------------------------------------------------- class MCPManagerForTest: """Same logic as ktem.mcp.manager.MCPManager but uses our test engine.""" def __init__(self, engine): self._engine = engine self._info: dict[str, dict] = {} self.load() def load(self): self._info = {} with Session(self._engine) as session: for item in 
session.query(_MCPTable).all(): self._info[item.name] = { # type: ignore[index] "name": item.name, "config": item.config, } def info(self) -> dict: return self._info def get(self, name: str) -> dict | None: return self._info.get(name) def add(self, name: str, config: dict): name = name.strip() if not name: raise ValueError("Name must not be empty") with Session(self._engine) as session: session.add(_MCPTable(name=name, config=config)) session.commit() self.load() def update(self, name: str, config: dict): if not name: raise ValueError("Name must not be empty") with Session(self._engine) as session: item = session.query(_MCPTable).filter_by(name=name).first() if not item: raise ValueError(f"MCP server '{name}' not found") item.config = config # type: ignore[assignment] session.commit() self.load() def delete(self, name: str): with Session(self._engine) as session: item = session.query(_MCPTable).filter_by(name=name).first() if item: session.delete(item) session.commit() self.load() def get_enabled_tools(self) -> list[str]: return [ f"[MCP] {name}" for name, entry in self._info.items() if entry.get("config", {}).get("enabled_tools") is not None ] # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- class TestMCPManagerAdd: def test_add_and_retrieve(self, manager): """add() persists data; get() and info() reflect it.""" manager.add("server1", {"command": "uvx", "args": ["mcp-server-fetch"]}) assert manager.info()["server1"]["config"]["command"] == "uvx" assert manager.get("server1")["name"] == "server1" def test_add_multiple(self, manager): manager.add("s1", {"command": "cmd1"}) manager.add("s2", {"command": "cmd2"}) assert set(manager.info().keys()) == {"s1", "s2"} @pytest.mark.parametrize("name", ["", " "]) def test_empty_or_whitespace_name_raises(self, manager, name): with pytest.raises(ValueError, match="Name must not be empty"): manager.add(name, {}) def 
class TestMCPManagerUpdateDelete:
    """update() and delete() behaviour of the manager."""

    def test_update_changes_config(self, manager):
        """Updating one server rewrites its config and leaves the other alone."""
        for server, command in (("s1", "cmd1"), ("s2", "cmd2")):
            manager.add(server, {"command": command})

        manager.update("s1", {"command": "updated"})

        updated_command = manager.info()["s1"]["config"]["command"]
        assert updated_command == "updated"
        untouched_command = manager.info()["s2"]["config"]["command"]
        assert untouched_command == "cmd2"

    def test_update_nonexistent_raises(self, manager):
        """Updating an unknown server is an error."""
        with pytest.raises(ValueError, match="not found"):
            manager.update("ghost", {})

    def test_delete_removes_entry(self, manager):
        """Deleting one server keeps the remaining one."""
        for server in ("s1", "s2"):
            manager.add(server, {})

        manager.delete("s1")

        assert "s1" not in manager.info()
        assert "s2" in manager.info()

    def test_delete_nonexistent_is_noop(self, manager):
        """Deleting an unknown server neither raises nor creates entries."""
        manager.delete("ghost")
        remaining = manager.info()
        assert len(remaining) == 0
@pytest.mark.parametrize(
    ("json_type", "expected"),
    [
        ("string", str),
        ("integer", int),
        ("number", float),
        ("boolean", bool),
        ("object", dict),
        ("array", list),
        ("unknown_type", str),  # fallback
    ],
)
def test_json_schema_type_to_python(json_type, expected):
    """Each JSON Schema type name maps to the expected Python type object."""
    resolved = _json_schema_type_to_python(json_type)
    assert resolved is expected
class TestParseMcpConfig:
    """Normalisation performed by ``parse_mcp_config``."""

    def test_full_stdio_config(self):
        """A fully specified stdio config comes back unchanged."""
        config = dict(
            transport="stdio",
            command="uvx",
            args=["mcp-server-fetch"],
            env={"KEY": "value"},
        )
        expected = {
            "transport": "stdio",
            "command": "uvx",
            "args": ["mcp-server-fetch"],
            "env": {"KEY": "value"},
        }
        assert parse_mcp_config(config) == expected

    def test_defaults_for_empty_config(self):
        """Every key gets its default when the config is empty."""
        parsed = parse_mcp_config({})
        defaults = (
            ("transport", "stdio"),
            ("command", ""),
            ("args", []),
            ("env", {}),
        )
        for key, default in defaults:
            assert parsed[key] == default

    def test_auto_split_multi_word_command(self):
        """stdio with no explicit args: space-delimited command is split."""
        command_line = "npx -y mcp-remote https://example.com/sse"
        parsed = parse_mcp_config({"command": command_line})
        assert parsed["command"] == "npx"
        assert parsed["args"] == ["-y", "mcp-remote", "https://example.com/sse"]

    def test_no_split_when_args_already_provided(self):
        """Explicit args suppress the auto-split."""
        command_line = "npx -y mcp-remote https://example.com/sse"
        parsed = parse_mcp_config({"command": command_line, "args": ["--flag"]})
        assert parsed["command"] == "npx -y mcp-remote https://example.com/sse"
        assert parsed["args"] == ["--flag"]

    def test_sse_transport_uses_url_as_command(self):
        """For SSE, the url field becomes the effective command."""
        sse_config = {
            "transport": "sse",
            "url": "http://localhost:8080/sse",
            "command": "ignored",
        }
        parsed = parse_mcp_config(sse_config)
        assert parsed["transport"] == "sse"
        assert parsed["command"] == "http://localhost:8080/sse"
class TestMakeTool:
    """Construction of ``MCPTool`` objects by ``_make_tool``."""

    def test_creates_mcp_tool_with_schema(self):
        """Server settings and tool metadata are copied onto the tool."""
        server = dict(
            transport="stdio", command="uvx", args=["mcp-server-fetch"], env={}
        )
        input_schema = {
            "properties": {"url": {"type": "string", "description": "URL to fetch"}},
            "required": ["url"],
        }
        remote_tool = SimpleNamespace(
            name="fetch", description="Fetch a URL", inputSchema=input_schema
        )

        tool = _make_tool(server, remote_tool)

        assert isinstance(tool, MCPTool)
        expected_attributes = (
            ("name", "fetch"),
            ("description", "Fetch a URL"),
            ("server_transport", "stdio"),
            ("server_command", "uvx"),
            ("server_args", ["mcp-server-fetch"]),
        )
        for attribute, wanted in expected_attributes:
            assert getattr(tool, attribute) == wanted

    def test_missing_schema_and_description_uses_defaults(self):
        """No inputSchema → args_schema is None; None description → auto-generated."""
        server = dict(transport="stdio", command="uvx", args=[], env={})
        remote_tool = SimpleNamespace(name="ping", description=None)

        tool = _make_tool(server, remote_tool)

        assert tool.description == "MCP tool: ping"
        assert tool.args_schema is None
class TestCreateToolsFromConfig:
    """``create_tools_from_config`` with the MCP server round-trip patched out."""

    def _make_mock_tools(self):
        """Build two stdio ``MCPTool`` objects named ``fetch`` and ``search``."""
        return [
            MCPTool(
                name=tool_name,
                description=summary,
                server_transport="stdio",
                server_command="uvx",
                mcp_tool_name=tool_name,
            )
            for tool_name, summary in (("fetch", "Fetch"), ("search", "Search"))
        ]

    @patch("kotaemon.agents.tools.mcp._run_async")
    def test_no_filter_returns_all(self, mock_run_async):
        """Without ``enabled_tools`` every discovered tool is returned."""
        mock_run_async.return_value = self._make_mock_tools()
        server_config = {"command": "uvx"}

        discovered = create_tools_from_config(server_config)

        assert len(discovered) == 2

    @patch("kotaemon.agents.tools.mcp._run_async")
    def test_enabled_tools_filter(self, mock_run_async):
        """Non-empty filter returns only nominated tools; empty list returns all."""
        mock_run_async.return_value = self._make_mock_tools()
        only_fetch = create_tools_from_config(
            {"command": "uvx"}, enabled_tools=["fetch"]
        )
        assert len(only_fetch) == 1
        assert only_fetch[0].mcp_tool_name == "fetch"

        # Empty list == no filter
        mock_run_async.return_value = self._make_mock_tools()
        unfiltered = create_tools_from_config({"command": "uvx"}, enabled_tools=[])
        assert len(unfiltered) == 2
self._make_tool()._format_result( SimpleNamespace( isError=False, content=[SimpleNamespace(text="Hello"), SimpleNamespace(text="World")], ) ) assert result == "Hello\nWorld" def test_error_flag(self): result = self._make_tool()._format_result( SimpleNamespace( isError=True, content="Something went wrong", ) ) assert "MCP Tool Error" in result def test_binary_content(self): result = self._make_tool()._format_result( SimpleNamespace( isError=False, content=[SimpleNamespace(data=b"bytes", mimeType="image/png")], ) ) assert "[Binary data: image/png]" in result ================================================ FILE: libs/kotaemon/tests/test_post_processing.py ================================================ import pytest from kotaemon.base import Document from kotaemon.parsers import RegexExtractor @pytest.fixture def regex_extractor(): return RegexExtractor( pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} ) def test_run_document(regex_extractor): document = Document(text="This is a test. 1 2 3") extracted_document = regex_extractor(document)[0] assert extracted_document.text == "One" assert extracted_document.matches == ["One", "Two", "Three"] def test_run_raw(regex_extractor): output = regex_extractor("This is a test. 123")[0] assert output.text == "123" assert output.matches == ["123"] def test_run_batch_raw(regex_extractor): output = regex_extractor(["This is a test. 
123", "456"]) extracted_text = [each.text for each in output] extracted_matches = [each.matches for each in output] assert extracted_text == ["123", "456"] assert extracted_matches == [["123"], ["456"]] ================================================ FILE: libs/kotaemon/tests/test_prompt.py ================================================ import pytest from kotaemon.base import Document from kotaemon.llms import BasePromptComponent, PromptTemplate from kotaemon.parsers import RegexExtractor def test_set_attributes(): template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") doc = Document(text="Helloo, Alice!") comp = RegexExtractor( pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} ) comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) assert prompt.s == "Alice" assert prompt.i == 30 assert prompt.doc == doc assert prompt.comp == comp def test_check_redundant_kwargs(): template = PromptTemplate("Hello, {name}!") prompt = BasePromptComponent(template=template, name="Alice") with pytest.warns(UserWarning, match="Keys provided but not in template: age"): prompt._BasePromptComponent__check_redundant_kwargs(name="Alice", age=30) def test_check_unset_placeholders(): template = PromptTemplate("Hello, {name}! 
I'm {age} years old.") prompt = BasePromptComponent(template=template, name="Alice") with pytest.raises(ValueError): prompt._BasePromptComponent__check_unset_placeholders() def test_validate_value_type(): template = PromptTemplate("Hello, {name}!") prompt = BasePromptComponent(template=template) with pytest.raises(ValueError): prompt._BasePromptComponent__validate_value_type(name={}) def test_run(): template = PromptTemplate("str = {s}, int = {i}, doc = {doc}, comp = {comp}") doc = Document(text="Helloo, Alice!") comp = RegexExtractor( pattern=r"\d+", output_map={"1": "One", "2": "Two", "3": "Three"} ) comp.set_run(kwargs={"text": "This is a test. 1 2 3"}, temp=True) prompt = BasePromptComponent(template=template, s="Alice", i=30, doc=doc, comp=comp) result = prompt() assert result.text == "str = Alice, int = 30, doc = Helloo, Alice!, comp = ['One']" def test_set_method(): template = PromptTemplate("Hello, {name}!") prompt = BasePromptComponent(template=template) prompt.set_value(name="Alice") assert prompt.name == "Alice" ================================================ FILE: libs/kotaemon/tests/test_promptui.py ================================================ from kotaemon.contribs.promptui.config import export_pipeline_to_config from kotaemon.contribs.promptui.export import export_from_dict from kotaemon.contribs.promptui.ui import build_from_dict from .simple_pipeline import Pipeline class TestPromptConfig: def test_export_prompt_config(self): """Test if the prompt config is exported correctly""" pipeline = Pipeline() config_dict = export_pipeline_to_config(pipeline) config = list(config_dict.values())[0] assert "inputs" in config, "inputs should be in config" assert "text" in config["inputs"], "inputs should have config" assert "params" in config, "params should be in config" assert "llm.deployment_name" in config["params"] assert "llm.azure_endpoint" in config["params"] assert "llm.openai_api_key" in config["params"] assert "llm.openai_api_version" in 
config["params"] assert "llm.request_timeout" in config["params"] assert "llm.temperature" in config["params"] class TestPromptUI: def test_uigeneration(self): """Test if the gradio UI is exposed without any problem""" pipeline = Pipeline() config = export_pipeline_to_config(pipeline) build_from_dict(config) class TestExport: def test_export(self, tmp_path): """Test if the export functionality works without error""" from pathlib import Path import yaml from theflow.storage import storage config_path = tmp_path / "config.yaml" pipeline = Pipeline() Path(storage.url(pipeline.config.store_result)).mkdir( parents=True, exist_ok=True ) config_dict = export_pipeline_to_config(pipeline) pipeline_name = list(config_dict.keys())[0] config_dict[pipeline_name]["logs"] = { "sheet1": { "inputs": [{"name": "text", "step": ".", "variable": "text"}], "outputs": [{"name": "answer", "step": "."}], }, } with open(config_path, "w") as f: yaml.safe_dump(config_dict, f) export_from_dict( config=str(config_path), pipeline=pipeline_name, output_path=str(tmp_path / "exported.xlsx"), ) ================================================ FILE: libs/kotaemon/tests/test_reader.py ================================================ from pathlib import Path from unittest.mock import patch from langchain.schema import Document as LangchainDocument from llama_index.core.node_parser import SimpleNodeParser from kotaemon.base import Document from kotaemon.loaders import ( AutoReader, AzureAIDocumentIntelligenceLoader, DocxReader, HtmlReader, MhtmlReader, UnstructuredReader, ) from .conftest import skip_when_unstructured_pdf_not_installed def test_docx_reader(): reader = DocxReader() documents = reader.load_data(Path(__file__).parent / "resources" / "dummy.docx") assert len(documents) def test_html_reader(): reader = HtmlReader() documents = reader.load_data( Path(__file__).parent / "resources" / "html" / "dummy.html" ) assert len(documents) def test_pdf_reader(): reader = AutoReader("PDFReader") dirpath = 
Path(__file__).parent documents = reader.load_data(dirpath / "resources" / "dummy.pdf") # check document reader output assert len(documents) == 1 first_doc = documents[0] assert isinstance(first_doc, Document) assert first_doc.text.lower().replace(" ", "") == "dummypdffile" langchain_doc = first_doc.to_langchain_format() assert isinstance(langchain_doc, LangchainDocument) # test chunking using NodeParser from llama-index node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20) nodes = node_parser.get_nodes_from_documents(documents) assert len(nodes) > 0 @skip_when_unstructured_pdf_not_installed def test_unstructured_pdf_reader(): reader = UnstructuredReader() dirpath = Path(__file__).parent input_path = dirpath / "resources/dummy.pdf" documents = reader.load_data(input_path) # check document reader output assert len(documents) == 1 first_doc = documents[0] assert isinstance(first_doc, Document) assert first_doc.text.lower().replace(" ", "") == "dummypdffile" # split documents mode documents = reader.load_data(input_path, split_documents=True) # check document reader output assert len(documents) == 1 def test_mhtml_reader(): reader = MhtmlReader() input_path = Path(__file__).parent / "resources" / "dummy.mhtml" docs = reader.load_data(input_path) assert len(docs) == 1 assert docs[0].text.startswith("This is a test") @patch("azure.ai.documentintelligence.DocumentIntelligenceClient") def test_azureai_document_intelligence_reader(mock_client): reader = AzureAIDocumentIntelligenceLoader( endpoint="https://endpoint.com", credential="credential", ) docs = reader(Path(__file__).parent / "resources" / "dummy.pdf") assert len(docs) == 1 mock_client.assert_called_once() ================================================ FILE: libs/kotaemon/tests/test_reranking.py ================================================ from unittest.mock import patch import pytest from openai.types.chat.chat_completion import ChatCompletion from kotaemon.base import Document from 
kotaemon.indices.rankings import LLMReranking from kotaemon.llms import AzureChatOpenAI _openai_chat_completion_responses = [ ChatCompletion.parse_obj( { "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", "object": "chat.completion", "created": 1692338378, "model": "gpt-35-turbo", "system_fingerprint": None, "choices": [ { "index": 0, "finish_reason": "stop", "message": { "role": "assistant", "content": text, "function_call": None, "tool_calls": None, }, "logprobs": None, } ], "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}, } ) for text in [ "YES", "NO", "YES", ] ] @pytest.fixture def llm(): return AzureChatOpenAI( api_key="dummy", api_version="2024-05-01-preview", azure_deployment="gpt-4o", azure_endpoint="https://test.openai.azure.com/", ) @patch( "openai.resources.chat.completions.Completions.create", side_effect=_openai_chat_completion_responses, ) def test_reranking(openai_completion, llm): documents = [Document(text=f"test {idx}") for idx in range(3)] query = "test query" reranker = LLMReranking(llm=llm, concurrent=False) rerank_docs = reranker(documents, query=query) assert len(rerank_docs) == 2 ================================================ FILE: libs/kotaemon/tests/test_splitter.py ================================================ from llama_index.core.schema import NodeRelationship from kotaemon.base import Document from kotaemon.indices.splitters import TokenSplitter source1 = Document( content="The City Hall and Raffles Place MRT stations are paired cross-platform " "interchanges on the North–South line (NSL) and East–West line (EWL) of the " "Singapore Mass Rapid Transit (MRT) system. Both are situated in the Downtown " "Core district: City Hall station is near landmarks such as the former City Hall, " "St Andrew's Cathedral and the Padang, while Raffles Place station serves Merlion " "Park, The Fullerton Hotel and the Asian Civilisations Museum. The stations were " "first announced in 1982. 
Constructing the tunnels between the City Hall and " "Raffles Place stations required the draining of the Singapore River. The " "stations opened on 12 December 1987 as part of the MRT extension to Outram Park " "station. Cross-platform transfers between the NSL and EWL began on 28 October " "1989, ahead of the split of the MRT network into two lines. Both stations are " "designated Civil Defence shelters. City Hall station features a mural by Simon" "Wong which depicts government buildings in the area, while two murals at Raffles " "Place station by Lim Sew Yong and Thang Kiang How depict scenes of Singapore's " "history" ) source2 = Document( content="The pink cockatoo (Cacatua leadbeateri) is a medium-sized cockatoo that " "inhabits arid and semi-arid inland areas across Australia, with the exception of " "the north east. The bird has a soft-textured white and salmon-pink plumage and " "large, bright red and yellow crest. The sexes are quite similar, although males " "are usually bigger while the female has a broader yellow stripe on the crest and " "develops a red eye when mature. The pink cockatoo is usually found in pairs or " "small groups, and feeds both on the ground and in trees. It is listed as an " "endangered species by the Australian government. Formerly known as Major " "Mitchell's cockatoo, after the explorer Thomas Mitchell, the species was " "officially renamed the pink cockatoo in 2023 by BirdLife Australia in light of " "Mitchell's involvement in the massacre of Aboriginal people at Mount Dispersion, " "as well as a general trend to make Australian species names more culturally " "inclusive. This pink cockatoo with a raised crest was photographed near Mount " "Grenfell in New South Wales." 
) def test_split_token(): """Test that it can split tokens successfully""" splitter = TokenSplitter(chunk_size=30, chunk_overlap=10) chunks = splitter([source1, source2]) assert isinstance(chunks, list), "Chunks should be a list" assert isinstance(chunks[0], Document), "Chunks should be a list of Documents" assert chunks[0].relationships[NodeRelationship.SOURCE].node_id == source1.doc_id assert ( chunks[1].relationships[NodeRelationship.PREVIOUS].node_id == chunks[0].doc_id ) assert chunks[1].relationships[NodeRelationship.NEXT].node_id == chunks[2].doc_id assert chunks[-1].relationships[NodeRelationship.SOURCE].node_id == source2.doc_id ================================================ FILE: libs/kotaemon/tests/test_table_reader.py ================================================ import json from pathlib import Path import pytest from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader from .conftest import skip_when_unstructured_pdf_not_installed input_file = Path(__file__).parent / "resources" / "table.pdf" input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx" @pytest.fixture def fullocr_output(): with open( Path(__file__).parent / "resources" / "fullocr_sample_output.json", encoding="utf-8", ) as f: fullocr = json.load(f) return fullocr @pytest.fixture def mathpix_output(): with open(Path(__file__).parent / "resources" / "policy.md", encoding="utf-8") as f: content = f.read() return content @skip_when_unstructured_pdf_not_installed def test_ocr_reader(fullocr_output): reader = OCRReader() documents = reader.load_data(input_file, response_content=fullocr_output) table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] assert len(table_docs) == 2 def test_mathpix_reader(mathpix_output): reader = MathpixPDFReader() documents = reader.load_data(input_file, response_content=mathpix_output) table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] assert len(table_docs) == 4 def 
test_excel_reader(): reader = PandasExcelReader() documents = reader.load_data( input_file_excel, ) assert len(documents) == 1 ================================================ FILE: libs/kotaemon/tests/test_telemetry.py ================================================ import os import sys import pytest from .conftest import skip_when_haystack_not_installed @pytest.fixture def clean_artifacts_for_telemetry(): try: del sys.modules["kotaemon"] except KeyError: pass try: del sys.modules["haystack"] except KeyError: pass try: del sys.modules["haystack.telemetry"] except KeyError: pass if "HAYSTACK_TELEMETRY_ENABLED" in os.environ: del os.environ["HAYSTACK_TELEMETRY_ENABLED"] @pytest.mark.usefixtures("clean_artifacts_for_telemetry") @skip_when_haystack_not_installed def test_disable_telemetry_import_haystack_first(): """Test that telemetry is disabled when kotaemon lib is initiated after""" import os import haystack.telemetry assert haystack.telemetry.telemetry is not None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False" import kotaemon # noqa: F401 assert haystack.telemetry.telemetry is None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" @pytest.mark.usefixtures("clean_artifacts_for_telemetry") @skip_when_haystack_not_installed def test_disable_telemetry_import_haystack_after_kotaemon(): """Test that telemetry is disabled when kotaemon lib is initiated before""" import os import haystack.telemetry import kotaemon # noqa: F401 assert haystack.telemetry.telemetry is None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" ================================================ FILE: libs/kotaemon/tests/test_template.py ================================================ import pytest from kotaemon.llms import PromptTemplate def test_prompt_template_creation(): # Ensure the PromptTemplate object is created correctly template_string = "This is a template" template = PromptTemplate(template_string) assert 
template.template == template_string template_string = "Hello, {name}! Today is {day}." template = PromptTemplate(template_string) assert template.template == template_string assert template.placeholders == {"name", "day"} def test_prompt_template_creation_invalid_placeholder(): # Ensure the PromptTemplate object handle invalid placeholder correctly template_string = "Hello, {name}! Today is {0day}." with pytest.raises(ValueError): PromptTemplate(template_string, ignore_invalid=False) with pytest.warns( UserWarning, match="Ignore invalid placeholder: 0day.", ): PromptTemplate(template_string, ignore_invalid=True) def test_prompt_template_addition(): # Ensure the __add__ method concatenates the templates correctly template1 = PromptTemplate("Hello, ") template2 = PromptTemplate("world!") result = template1 + template2 assert result.template == "Hello, \nworld!" template1 = PromptTemplate("Hello, {name}!") template2 = PromptTemplate("Today is {day}.") result = template1 + template2 assert result.template == "Hello, {name}!\nToday is {day}." def test_prompt_template_extract_placeholders(): # Ensure the PromptTemplate correctly extracts placeholders template_string = "Hello, {name}! Today is {day}." result = PromptTemplate(template_string).placeholders assert result == {"name", "day"} def test_prompt_template_populate(): # Ensure the populate method populates the template correctly template_string = "Hello, {name}! Today is {day}." template = PromptTemplate(template_string) result = template.populate(name="John", day="Monday") assert result == "Hello, John! Today is Monday." def test_prompt_template_check_missing_kwargs(): # Ensure the check_missing_kwargs and populate methods raise an exception for # missing placeholders template_string = "Hello, {name}! Today is {day}." 
template = PromptTemplate(template_string) kwargs = dict(name="John") with pytest.raises(ValueError): template.check_missing_kwargs(**kwargs) with pytest.raises(ValueError): template.populate(**kwargs) def test_prompt_template_check_redundant_kwargs(): # Ensure the check_redundant_kwargs, partial_populate and populate methods warn for # redundant placeholders template_string = "Hello, {name}! Today is {day}." template = PromptTemplate(template_string) kwargs = dict(name="John", day="Monday", age="30") with pytest.warns(UserWarning, match="Keys provided but not in template: age"): template.check_redundant_kwargs(**kwargs) with pytest.warns(UserWarning, match="Keys provided but not in template: age"): template.partial_populate(**kwargs) with pytest.warns(UserWarning, match="Keys provided but not in template: age"): template.populate(**kwargs) def test_prompt_template_populate_complex_template(): # Ensure the populate method produces the same results as the built-in str.format # function template_string = ( "a = {a:.2f}, b = {b}, c = {c:.1%}, d = {d:#.0g}, ascii of {e} = {e!a:>2}" ) template = PromptTemplate(template_string) kwargs = dict(a=1, b="two", c=3, d=4, e="á") populated = template.populate(**kwargs) expected = template_string.format(**kwargs) assert populated == expected def test_prompt_template_partial_populate(): # Ensure the partial_populate method populates correctly template_string = ( "a = {a:.2f}, b = {b}, c = {c:.1%}, d = {d:#.0g}, ascii of {e} = {e!a:>2}" ) template = PromptTemplate(template_string) kwargs = dict(a=1, b="two", d=4, e="á") populated = template.partial_populate(**kwargs) expected = "a = 1.00, b = two, c = {c:.1%}, d = 4., ascii of á = '\\xe1'" assert populated == expected ================================================ FILE: libs/kotaemon/tests/test_tools.py ================================================ import json from pathlib import Path from unittest.mock import patch from openai.types.create_embedding_response import 
CreateEmbeddingResponse from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool from kotaemon.base import Document from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = CreateEmbeddingResponse.model_validate(json.load(f)) def test_google_tool(mock_google_search): tool = GoogleSearchTool() assert tool.name assert tool.description output = tool("What is Cinnamon AI") assert output def test_wikipedia_tool(): tool = WikipediaTool() assert tool.name assert tool.description output = tool("Cinnamon") assert output @patch( "openai.resources.embeddings.Embeddings.create", side_effect=lambda *args, **kwargs: openai_embedding, ) def test_pipeline_tool(tmp_path): db = ChromaVectorStore(path=str(tmp_path)) doc_store = InMemoryDocumentStore() embedding = AzureOpenAIEmbeddings( azure_deployment="embedding-deployment", azure_endpoint="https://test.openai.azure.com/", api_key="some-key", api_version="version", ) index_pipeline = VectorIndexing( vector_store=db, embedding=embedding, doc_store=doc_store ) retrieval_pipeline = VectorRetrieval( vector_store=db, doc_store=doc_store, embedding=embedding ) index_tool = ComponentTool( name="index_document", description="A tool to use to index a document to be searched later", component=index_pipeline, ) output = index_tool({"text": Document(text="Cinnamon AI")}) retrieval_tool = ComponentTool( name="search_document", description="A tool to use to search a document in a vectorstore", component=retrieval_pipeline, ) output = retrieval_tool("Cinnamon AI") assert output ================================================ FILE: libs/kotaemon/tests/test_vectorstore.py ================================================ import json import os import pytest from kotaemon.base import 
DocumentWithEmbedding from kotaemon.storages import ( ChromaVectorStore, InMemoryVectorStore, MilvusVectorStore, QdrantVectorStore, SimpleFileVectorStore, ) class TestChromaVectorStore: def test_add(self, tmp_path): """Test that the DB add correctly""" db = ChromaVectorStore(path=str(tmp_path)) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] ids = ["1", "2"] assert db._collection.count() == 0, "Expected empty collection" output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert output == ids, "Expected output to be the same as ids" assert db._collection.count() == 2, "Expected 2 added entries" def test_add_from_docs(self, tmp_path): db = ChromaVectorStore(path=str(tmp_path)) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] documents = [ DocumentWithEmbedding(embedding=embedding, metadata=metadata) for embedding, metadata in zip(embeddings, metadatas) ] assert db._collection.count() == 0, "Expected empty collection" output = db.add(documents) assert len(output) == 2, "Expected outputting 2 ids" assert db._collection.count() == 2, "Expected 2 added entries" def test_delete(self, tmp_path): db = ChromaVectorStore(path=str(tmp_path)) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["a", "b", "c"] db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert db._collection.count() == 3, "Expected 3 added entries" db.delete(ids=["a", "b"]) assert db._collection.count() == 1, "Expected 1 remaining entry" db.delete(ids=["c"]) assert db._collection.count() == 0, "Expected 0 remaining entry" def test_query(self, tmp_path): db = ChromaVectorStore(path=str(tmp_path)) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["a", "b", "c"] db.add(embeddings=embeddings, metadatas=metadatas, 
ids=ids) _, sim, out_ids = db.query(embedding=[0.1, 0.2, 0.3], top_k=1) assert sim[0] - 1.0 < 1e-6 assert out_ids == ["a"] _, _, out_ids = db.query(embedding=[0.42, 0.52, 0.53], top_k=1) assert out_ids == ["b"] def test_save_load_delete(self, tmp_path): """Test that save/load func behave correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["1", "2", "3"] db = ChromaVectorStore(path=str(tmp_path)) db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) db2 = ChromaVectorStore(path=str(tmp_path)) assert ( db2._collection.count() == 3 ), "load function does not load data completely" # test delete collection function db2.drop() # reinit the chroma with the same collection name db2 = ChromaVectorStore(path=str(tmp_path)) assert ( db2._collection.count() == 0 ), "delete collection function does not work correctly" class TestInMemoryVectorStore: def test_add(self): """Test that add func adds correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] ids = ["1", "2"] db = InMemoryVectorStore() output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert output == ids, "Excepted output to be the same as ids" def test_save_load_delete(self, tmp_path): """Test that delete func deletes correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["1", "2", "3"] db = InMemoryVectorStore() db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) db.delete(["3"]) db.save(save_path=tmp_path / "test_save_load_delete.json") with open(tmp_path / "test_save_load_delete.json") as f: data = json.load(f) assert ( "1" and "2" in data["text_id_to_ref_doc_id"] ), "save function does not save data completely" assert ( "3" not in data["text_id_to_ref_doc_id"] ), "delete function does not delete data completely" db2 = 
InMemoryVectorStore() db2.load(load_path=tmp_path / "test_save_load_delete.json") assert db2.get("2") == [ 0.4, 0.5, 0.6, ], "load function does not load data completely" class TestSimpleFileVectorStore: def test_add_delete(self, tmp_path): """Test that delete func deletes correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["1", "2", "3"] collection_name = "test_save_load_delete" db = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name) db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) db.delete(["3"]) with open(tmp_path / collection_name) as f: data = json.load(f) assert ( "1" and "2" in data["text_id_to_ref_doc_id"] ), "save function does not save data completely" assert ( "3" not in data["text_id_to_ref_doc_id"] ), "delete function does not delete data completely" db2 = SimpleFileVectorStore(path=tmp_path, collection_name=collection_name) assert db2.get("2") == [ 0.4, 0.5, 0.6, ], "load function does not load data completely" os.remove(tmp_path / collection_name) class TestMilvusVectorStore: def test_add(self, tmp_path): """Test that the DB add correctly""" db = MilvusVectorStore( path=str(tmp_path), overwrite=True, ) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] ids = ["1", "2"] assert db.count() == 0, "Expected empty collection" output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert output == ids, "Expected output to be the same as ids" assert db.count() == 2, "Expected 2 added entries" def test_add_from_docs(self, tmp_path): db = MilvusVectorStore( path=str(tmp_path), overwrite=True, ) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] documents = [ DocumentWithEmbedding(embedding=embedding, metadata=metadata) for embedding, metadata in zip(embeddings, metadatas) ] assert db.count() == 0, "Expected empty collection" 
output = db.add(documents) assert len(output) == 2, "Expected outputting 2 ids" assert db.count() == 2, "Expected 2 added entries" def test_delete(self, tmp_path): db = MilvusVectorStore( path=str(tmp_path), overwrite=True, ) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["a", "b", "c"] db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert db.count() == 3, "Expected 3 added entries" db.delete(ids=["a", "b"]) assert db.count() == 1, "Expected 1 remaining entry" db.delete(ids=["c"]) assert db.count() == 0, "Expected 0 remaining entry" def test_query(self, tmp_path): db = MilvusVectorStore(path=str(tmp_path), overwrite=True) import numpy as np embeddings = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) norms = np.linalg.norm(embeddings, axis=1) normalized_embeddings = (embeddings / norms[:, np.newaxis]).tolist() metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["a", "b", "c"] db.add(embeddings=normalized_embeddings, metadatas=metadatas, ids=ids) _, sim, out_ids = db.query(embedding=normalized_embeddings[0], top_k=1) assert sim[0] - 1.0 < 1e-6 assert out_ids == ["a"] query_embedding = [ normalized_embeddings[1][0] + 0.02, normalized_embeddings[1][1] + 0.02, normalized_embeddings[1][2] + 0.02, ] _, _, out_ids = db.query(embedding=query_embedding, top_k=1) assert out_ids == ["b"] def test_save_load_delete(self, tmp_path): """Test that save/load func behave correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = ["1", "2", "3"] db = MilvusVectorStore(path=str(tmp_path), overwrite=True) db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) db2 = MilvusVectorStore(path=str(tmp_path), overrides=False) assert db2.count() == 3, "load function does not load data completely" # test delete collection function db2.drop() # reinit the 
milvus with the same collection name db2 = MilvusVectorStore(path=str(tmp_path), overwrite=False) assert db2.count() == 0, "delete collection function does not work correctly" class TestQdrantVectorStore: def test_add(self): from qdrant_client import QdrantClient db = QdrantVectorStore(collection_name="test", client=QdrantClient(":memory:")) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] ids = [ "0f0611b3-2d9c-4818-ab69-1f1c4cf66693", "90aba5d3-f4f8-47c6-bad9-5ea457442e07", ] output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert output == ids, "Expected output to be the same as ids" assert db.count() == 2, "Expected 2 added entries" def test_add_from_docs(self, tmp_path): from qdrant_client import QdrantClient db = QdrantVectorStore(collection_name="test", client=QdrantClient(":memory:")) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] documents = [ DocumentWithEmbedding(embedding=embedding, metadata=metadata) for embedding, metadata in zip(embeddings, metadatas) ] output = db.add(documents) assert len(output) == 2, "Expected outputting 2 ids" assert db.count() == 2, "Expected 2 added entries" def test_delete(self, tmp_path): from qdrant_client import QdrantClient db = QdrantVectorStore(collection_name="test", client=QdrantClient(":memory:")) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = [ "0f0611b3-2d9c-4818-ab69-1f1c4cf66693", "90aba5d3-f4f8-47c6-bad9-5ea457442e07", "6bed07c3-d284-47a3-a711-c3f9186755b8", ] db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) assert db.count() == 3, "Expected 3 added entries" db.delete( ids=[ "0f0611b3-2d9c-4818-ab69-1f1c4cf66693", "90aba5d3-f4f8-47c6-bad9-5ea457442e07", ] ) assert db.count() == 1, "Expected 1 remaining entry" db.delete(ids=["6bed07c3-d284-47a3-a711-c3f9186755b8"]) assert db.count() == 0, 
"Expected 0 remaining entry" def test_query(self, tmp_path): from qdrant_client import QdrantClient db = QdrantVectorStore(collection_name="test", client=QdrantClient(":memory:")) embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = [ "0f0611b3-2d9c-4818-ab69-1f1c4cf66693", "90aba5d3-f4f8-47c6-bad9-5ea457442e07", "6bed07c3-d284-47a3-a711-c3f9186755b8", ] db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) _, sim, out_ids = db.query(embedding=[0.1, 0.2, 0.3], top_k=1) assert sim[0] - 1.0 < 1e-6 assert out_ids == ["0f0611b3-2d9c-4818-ab69-1f1c4cf66693"] _, _, out_ids = db.query(embedding=[0.4, 0.5, 0.6], top_k=1) assert out_ids == ["90aba5d3-f4f8-47c6-bad9-5ea457442e07"] def test_save_load_delete(self, tmp_path): """Test that save/load func behave correctly.""" embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] ids = [ "0f0611b3-2d9c-4818-ab69-1f1c4cf66693", "90aba5d3-f4f8-47c6-bad9-5ea457442e07", "6bed07c3-d284-47a3-a711-c3f9186755b8", ] from qdrant_client import QdrantClient db = QdrantVectorStore( collection_name="test", client=QdrantClient(path=tmp_path) ) db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) del db db2 = QdrantVectorStore( collection_name="test", client=QdrantClient(path=tmp_path) ) assert db2.count() == 3 db2.drop() del db2 db2 = QdrantVectorStore( collection_name="test", client=QdrantClient(path=tmp_path) ) with pytest.raises(Exception): # Since no docs were added, the collection should not exist yet # and thus the count function should raise an exception db2.count() ================================================ FILE: libs/ktem/.gitignore ================================================ 14-1_抜粋-1.pdf _example_.db ktem/assets/prebuilt/ ================================================ FILE: libs/ktem/MANIFEST.in ================================================ 
include ktem/assets/css/*.css include ktem/assets/img/*.svg include ktem/assets/js/*.js include ktem/assets/md/*.md ================================================ FILE: libs/ktem/alembic.ini ================================================ # A generic, single database configuration. [alembic] # path to migration scripts script_location = migrations # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s # Uncomment the line below if you want the files to be prepended with date and time # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file # for all available tokens # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s # sys.path path, will be prepended to sys.path if present. # defaults to the current working directory. prepend_sys_path = . # timezone to use when rendering the date within the migration file # as well as the filename. # If specified, requires the python>=3.9 or backports.zoneinfo library. # Any required deps can be installed by adding `alembic[tz]` to the pip requirements # string value is passed to ZoneInfo() # leave blank for localtime # timezone = # max length of characters to apply to the # "slug" field # truncate_slug_length = 40 # set to 'true' to run the environment during # the 'revision' command, regardless of autogenerate # revision_environment = false # set to 'true' to allow .pyc and .pyo files without # a source .py file to be detected as revisions in the # versions/ directory # sourceless = false # version location specification; This defaults # to migrations/versions. When using multiple version # directories, initial revisions must be specified with --version-path. # The path separator used here should be the separator specified by "version_path_separator" below. # version_locations = %(here)s/bar:%(here)s/bat:migrations/versions # version path separator; As mentioned above, this is the character used to split # version_locations.
The default within new alembic.ini files is "os", which uses os.pathsep. # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. # Valid values for version_path_separator are: # # version_path_separator = : # version_path_separator = ; # version_path_separator = space version_path_separator = os # Use os.pathsep. Default configuration used for new projects. # set to 'true' to search source files recursively # in each "version_locations" directory # new in Alembic version 1.10 # recursive_version_locations = false # the output encoding used when revision files # are written from script.py.mako # output_encoding = utf-8 sqlalchemy.url = driver://user:pass@localhost/dbname [post_write_hooks] # post_write_hooks defines scripts or Python functions that are run # on newly generated revision scripts. See the documentation for further # detail and examples # format using "black" - use the console_scripts runner, against the "black" entrypoint # hooks = black # black.type = console_scripts # black.entrypoint = black # black.options = -l 79 REVISION_SCRIPT_FILENAME # lint with attempts to fix using "ruff" - use the exec runner, execute a binary # hooks = ruff # ruff.type = exec # ruff.executable = %(here)s/.venv/bin/ruff # ruff.options = --fix REVISION_SCRIPT_FILENAME # Logging configuration [loggers] keys = root,sqlalchemy,alembic [handlers] keys = console [formatters] keys = generic [logger_root] level = WARN handlers = console qualname = [logger_sqlalchemy] level = WARN handlers = qualname = sqlalchemy.engine [logger_alembic] level = INFO handlers = qualname = alembic [handler_console] class = StreamHandler args = (sys.stderr,) level = NOTSET formatter = generic [formatter_generic] format = %(levelname)-5.5s [%(name)s] %(message)s datefmt = %H:%M:%S ================================================ FILE: libs/ktem/ktem/__init__.py ================================================ 
================================================ FILE: libs/ktem/ktem/app.py ================================================ import os from pathlib import Path from typing import Optional import gradio as gr import pluggy from ktem import extension_protocol from ktem.assets import PDFJS_PREBUILT_DIR, KotaemonTheme from ktem.components import reasonings from ktem.exceptions import HookAlreadyDeclared, HookNotDeclared from ktem.index import IndexManager from ktem.settings import BaseSettingGroup, SettingGroup, SettingReasoningGroup from theflow.settings import settings from theflow.utils.modules import import_dotted_string BASE_PATH = os.environ.get("GR_FILE_ROOT_PATH", "") class BaseApp: """The main app of Kotaemon The main application contains app-level information: - setting state - dynamic conversation state - user id Also contains registering methods for: - reasoning pipelines - indexing & retrieval pipelines App life-cycle: - Render - Declare public events - Subscribe public events - Register events """ public_events: list[str] = [] def __init__(self): self.dev_mode = getattr(settings, "KH_MODE", "") == "dev" self.app_name = getattr(settings, "KH_APP_NAME", "Kotaemon") self.app_version = getattr(settings, "KH_APP_VERSION", "") self.f_user_management = getattr(settings, "KH_FEATURE_USER_MANAGEMENT", False) self._theme = KotaemonTheme() dir_assets = Path(__file__).parent / "assets" with (dir_assets / "css" / "main.css").open() as fi: self._css = fi.read() with (dir_assets / "js" / "main.js").open() as fi: self._js = fi.read() self._js = self._js.replace("KH_APP_VERSION", self.app_version) with (dir_assets / "js" / "pdf_viewer.js").open(encoding="utf-8") as fi: self._pdf_view_js = fi.read() # workaround for Windows path pdf_js_dist_dir = str(PDFJS_PREBUILT_DIR).replace("\\", "\\\\") self._pdf_view_js = self._pdf_view_js.replace( "PDFJS_PREBUILT_DIR", pdf_js_dist_dir, ).replace("GR_FILE_ROOT_PATH", BASE_PATH) with (dir_assets / "js" / "svg-pan-zoom.min.js").open() 
as fi: self._svg_js = fi.read() self._favicon = str(dir_assets / "img" / "favicon.svg") self.default_settings = SettingGroup( application=BaseSettingGroup(settings=settings.SETTINGS_APP), reasoning=SettingReasoningGroup(settings=settings.SETTINGS_REASONING), ) self._callbacks: dict[str, list] = {} self._events: dict[str, list] = {} self.register_extensions() self.register_reasonings() self.initialize_indices() self.default_settings.reasoning.finalize() self.default_settings.index.finalize() self.settings_state = gr.State(self.default_settings.flatten()) self.user_id = gr.State("default" if not self.f_user_management else None) def initialize_indices(self): """Create the index manager, start indices, and register to app settings""" self.index_manager = IndexManager(self) self.index_manager.on_application_startup() for index in self.index_manager.indices: options = index.get_user_settings() self.default_settings.index.options[index.id] = BaseSettingGroup( settings=options ) def register_reasonings(self): """Register the reasoning components from app settings""" if getattr(settings, "KH_REASONINGS", None) is None: return for value in settings.KH_REASONINGS: reasoning_cls = import_dotted_string(value, safe=False) rid = reasoning_cls.get_info()["id"] reasonings[rid] = reasoning_cls options = reasoning_cls().get_user_settings() self.default_settings.reasoning.options[rid] = BaseSettingGroup( settings=options ) def register_extensions(self): """Register installed extensions""" self.exman = pluggy.PluginManager("ktem") self.exman.add_hookspecs(extension_protocol) self.exman.load_setuptools_entrypoints("ktem") # retrieve and register extension declarations extension_declarations = self.exman.hook.ktem_declare_extensions() for extension_declaration in extension_declarations: # if already in database, with the same version: skip # otherwise, # remove the old information from the database if it exists # store the information into the database functionality = 
extension_declaration["functionality"] # update the reasoning information if "reasoning" in functionality: for rid, rdec in functionality["reasoning"].items(): unique_rid = f"{extension_declaration['id']}/{rid}" self.default_settings.reasoning.options[ unique_rid ] = BaseSettingGroup( settings=rdec["settings"], ) def declare_event(self, name: str): """Declare a public gradio event for other components to subscribe to Args: name: The name of the event """ if name in self._events: raise HookAlreadyDeclared(f"Hook {name} is already declared") self._events[name] = [] def subscribe_event(self, name: str, definition: dict): """Register a hook for the app Args: name: The name of the hook hook: The hook to be registered """ if name not in self._events: raise HookNotDeclared(f"Hook {name} is not declared") self._events[name].append(definition) def get_event(self, name) -> list[dict]: if name not in self._events: raise HookNotDeclared(f"Hook {name} is not declared") return self._events[name] def ui(self): raise NotImplementedError def on_subscribe_public_events(self): """Subscribe to the declared public event of the app""" def on_register_events(self): """Register all events to the app""" def _on_app_created(self): """Called when the app is created""" def make(self): markmap_js = """ """ external_js = ( "" "" # noqa "" # noqa "" "" # noqa ) with gr.Blocks( theme=self._theme, css=self._css, title=self.app_name, analytics_enabled=False, js=self._js, head=external_js, ) as demo: self.app = demo self.settings_state.render() self.user_id.render() self.ui() self.declare_public_events() self.subscribe_public_events() self.register_events() self.on_app_created() demo.load(None, None, None, js=self._pdf_view_js) return demo def declare_public_events(self): """Declare an event for the app""" for event in self.public_events: self.declare_event(event) for value in self.__dict__.values(): if isinstance(value, BasePage): value.declare_public_events() def subscribe_public_events(self): 
"""Subscribe to an event""" self.on_subscribe_public_events() for value in self.__dict__.values(): if isinstance(value, BasePage): value.subscribe_public_events() def register_events(self): """Register all events""" self.on_register_events() for value in self.__dict__.values(): if isinstance(value, BasePage): value.register_events() def on_app_created(self): """Execute on app created callbacks""" self._on_app_created() for value in self.__dict__.values(): if isinstance(value, BasePage): value.on_app_created() class BasePage: """The logic of the Kotaemon app""" public_events: list[str] = [] def __init__(self, app): self._app = app def on_building_ui(self): """Build the UI of the app""" def on_subscribe_public_events(self): """Subscribe to the declared public event of the app""" def on_register_events(self): """Register all events to the app""" def _on_app_created(self): """Called when the app is created""" def as_gradio_component( self, ) -> Optional[gr.components.Component | list[gr.components.Component]]: """Return the gradio components responsible for events Note: in ideal scenario, this method shouldn't be necessary. 
""" return None def render(self): for value in self.__dict__.values(): if isinstance(value, gr.blocks.Block): value.render() if isinstance(value, BasePage): value.render() def unrender(self): for value in self.__dict__.values(): if isinstance(value, gr.blocks.Block): value.unrender() if isinstance(value, BasePage): value.unrender() def declare_public_events(self): """Declare an event for the app""" for event in self.public_events: self._app.declare_event(event) for value in self.__dict__.values(): if isinstance(value, BasePage): value.declare_public_events() def subscribe_public_events(self): """Subscribe to an event""" self.on_subscribe_public_events() for value in self.__dict__.values(): if isinstance(value, BasePage): value.subscribe_public_events() def register_events(self): """Register all events""" self.on_register_events() for value in self.__dict__.values(): if isinstance(value, BasePage): value.register_events() def on_app_created(self): """Execute on app created callbacks""" self._on_app_created() for value in self.__dict__.values(): if isinstance(value, BasePage): value.on_app_created() ================================================ FILE: libs/ktem/ktem/assets/__init__.py ================================================ from pathlib import Path from decouple import config from .theme import Kotaemon as KotaemonTheme PDFJS_VERSION_DIST: str = config("PDFJS_VERSION_DIST", "pdfjs-4.0.379-dist") PDFJS_PREBUILT_DIR: Path = config( "PDFJS_PREBUILT_DIR", Path(__file__).parent / "prebuilt" / PDFJS_VERSION_DIST ) __all__ = ["KotaemonTheme", "PDFJS_VERSION_DIST", "PDFJS_PREBUILT_DIR"] ================================================ FILE: libs/ktem/ktem/assets/css/main.css ================================================ :root { --main-area-height: calc(100vh - 110px); } /* no footer */ footer { display: none !important; } /* customize scrollbar */ ::-webkit-scrollbar { background: var(--background-fill-primary); } ::-webkit-scrollbar-thumb { background-color: 
var(--border-color-primary); border: 4px solid transparent; border-radius: 100px; background-clip: content-box; } ::-webkit-scrollbar-corner { background: var(--background-fill-primary); } .gradio-container { max-width: 100% !important; /* overflow: scroll !important; height: 100% !important; */ } /* styling for header bar */ .header-bar { background-color: transparent; margin: 0px 0px 20px; overflow-x: scroll; display: block !important; text-wrap: nowrap; border: none; } .header-bar button.selected { border: none; /* an alternative header bar style with rounded background */ /* background-color: var(--background-fill-primary); border: 4px solid transparent; border-radius: var(--radius-lg); background-clip: padding-box; */ } /* selected buttons have highlighted text */ button.selected { color: var(--block-label-text-color); font-weight: bold; } .message-row.bubble.bot-row{ overflow-x: auto; } .flex-wrap.bot { overflow-x: inherit; } #chat-tab, #indices-tab, #settings-tab, #help-tab, #resources-tab, #login-tab { border: none !important; } #help-tab, #settings-tab { /* text-dense view should not be wide for readability */ max-width: max(56vw, 900px) !important; margin: 0 auto !important; } .indices-tab, #resources-tab { /* Other view should not be too wide */ border: none !important; max-width: max(70vw, 1200px) !important; margin: 0 auto !important; } #main-chat-bot { background: var(--background-fill-primary); flex: auto; } #chat-area { height: var(--main-area-height) !important; column-gap: 2px !important; } #chat-info-panel { max-height: var(--main-area-height) !important; overflow: auto !important; transition: all 0.4s; } body.dark #chat-info-panel figure>img{ filter: invert(100%); } #conv-settings-panel { max-height: var(--main-area-height) !important; flex-wrap: unset; overflow-y: scroll !important; position: sticky; column-gap: 2px !important; scrollbar-width: none; /* Firefox */ -ms-overflow-style: none; /* Internet Explorer 10+ */ transition: all 0.3s; } 
#conv-settings-panel::-webkit-scrollbar { /* WebKit */ width: 0; height: 0; } td { /* Fix for Firefox Gradio table overflow display */ overflow: hidden; } .setting-answer-mode-description { margin: 5px 5px 2px !important; } .message-buttons-right { display: none !important; } mark { background-color: #10b981; } /* clpse */ .clpse { background-color: var(--background-fill-secondary); font-weight: bold; cursor: pointer; padding: 3px; width: 100%; border: none; text-align: left; outline: none; } /* for setting transparent background for elements */ .no-background { background-color: transparent; border: none; } /* for setting bold text for elements */ .bold-text { font-weight: bold; } /* for setting highlighted text for elements */ .body-text-color { color: var(--body-text-color); } /* for setting right-aligned buttons */ .right-button { min-width: 200px !important; width: fit-content; padding-left: 20px; padding-right: 20px; margin: 0px 0px 0px auto; } /* for setting height limit for buttons */ .cap-button-height { max-height: 42px; } /* Hide sort buttons at gr.DataFrame */ .sort-button { display: none !important; } /* Show sort button only in File list*/ #file_list_view .sort-button { display: block !important; } #toggle-dark-button { position: fixed; top: 6px; right: 30px; } #info-expand-button { position: absolute; top: 6px; right: 15px; } /* prevent overflow of html info panel */ #html-info-panel { overflow-x: auto !important; } #chat-expand-button { position: absolute; top: 6px; right: -10px; z-index: 1; } #save-setting-btn { width: 150px; height: 30px; min-width: 100px !important; } #quick-setting-labels { margin-top: 5px; margin-bottom: -10px; } #use-mindmap-checkbox { position: absolute; width: 110px; top: 10px; right: 25px; } #citation-dropdown { width: min(25%, 100px); position: absolute; top: 2px; left: 120px; height: 35px; } #quick-url textarea { resize: none; background: transparent; margin-top: 0px; } #quick-url textarea::placeholder { text-align: 
center; } #quick-file { height: 110px; } span.icon { color: #cecece; } .upload-button { display: none; } .scrollable { overflow-y: auto; } .fill-main-area-height { max-height: var(--main-area-height); } .unset-overflow { overflow: unset !important; } pdfjs-viewer-element { height: 100dvh; } /* Modal styles */ .modal { display: none; position: relative; z-index: 2; left: 0; top: 0; width: 100%; height: 85dvh; overflow: hidden; background-color: rgba(0, 0, 0, 0.4); } .modal-header { padding: 0px 10px } .modal-content { background-color: #fefefe; height: 100%; display: flex; flex-direction: column; } .close { color: #aaa; align-self: flex-end; font-size: 28px; font-weight: bold; } .close:hover, .close:focus { color: black; text-decoration: none; cursor: pointer; } .modal-body { flex: 1; overflow: hidden; } /* Switch checkbox styles */ /* #is-public-checkbox { position: relative; top: 4px; } */ #suggest-chat-checkbox { position: relative; top: 4px; } .switch input { position: absolute; opacity: 0; } .switch { display: inline-block; /* 1 */ height: 1em; width: 2em; background: #8f8f8f; border-radius: 1em; position: relative; top: 2px; margin-right: 1em; } .switch div { height: 1em; width: 1em; border-radius: 1em; background: #FFF; box-shadow: 0 0.1em 0.3em rgba(0, 0, 0, 0.3); -webkit-transition: all 300ms; -moz-transition: all 300ms; transition: all 300ms; } .switch input:checked+div { -webkit-transform: translate3d(100%, 0, 0); -moz-transform: translate3d(100%, 0, 0); transform: translate3d(100%, 0, 0); background: #12df9a; } .switch:has(> input:checked) { background: #0c895f; } /* Bot animation */ .message.bot { animation: fadein 1.0s ease-in-out forwards; } details.evidence { animation: fadein 0.3s ease-in-out forwards; } @keyframes fadein { 0% { opacity: 0; } 100% { opacity: 100%; } } .message a.citation { color: #10b981; text-decoration: none; } /* pop-up for file tag in chat input*/ .tribute-container ul { background-color: var(--background-fill-primary) 
!important; color: var(--body-text-color) !important; font-family: var(--font); font-size: var(--text-md); } .tribute-container li.highlight { background-color: var(--border-color-primary) !important; } /* a fix for flickering background in Gradio DataFrame */ tbody:not(.row_odd) { background: var(--table-even-background-fill); } #chat-suggestion { max-height: 350px; } #chat-suggestion table { overflow: hidden; } #chat-suggestion table thead { display: none; } #paper-suggestion table { overflow: hidden; } svg.markmap { width: 100%; height: 100%; font-family: Quicksand, sans-serif; font-size: 15px; } div.markmap { height: 400px; } #google-login { max-width: 450px; } #user-api-key-wrapper { max-width: 450px; } #login-row { display: grid; place-items: center; } ================================================ FILE: libs/ktem/ktem/assets/js/main.js ================================================ function run() { let main_parent = document.getElementById("chat-tab").parentNode; main_parent.childNodes[0].classList.add("header-bar"); main_parent.style = "padding: 0; margin: 0"; main_parent.parentNode.style = "gap: 0"; main_parent.parentNode.parentNode.style = "padding: 0"; const version_node = document.createElement("p"); version_node.innerHTML = "version: KH_APP_VERSION"; version_node.style = "position: fixed; top: 10px; right: 10px;"; main_parent.appendChild(version_node); // add favicon const favicon = document.createElement("link"); // set favicon attributes favicon.rel = "icon"; favicon.type = "image/svg+xml"; favicon.href = "/favicon.ico"; document.head.appendChild(favicon); // setup conversation dropdown placeholder let conv_dropdown = document.querySelector("#conversation-dropdown input"); conv_dropdown.placeholder = "Browse conversation"; // move info-expand-button let info_expand_button = document.getElementById("info-expand-button"); let chat_info_panel = document.getElementById("info-expand"); chat_info_panel.insertBefore( info_expand_button, 
chat_info_panel.childNodes[2] ); // move toggle-side-bar button let chat_expand_button = document.getElementById("chat-expand-button"); let chat_column = document.getElementById("main-chat-bot"); let conv_column = document.getElementById("conv-settings-panel"); // move setting close button let setting_tab_nav_bar = document.querySelector("#settings-tab .tab-nav"); let setting_close_button = document.getElementById("save-setting-btn"); if (setting_close_button) { setting_tab_nav_bar.appendChild(setting_close_button); } let default_conv_column_min_width = "min(300px, 100%)"; conv_column.style.minWidth = default_conv_column_min_width; globalThis.toggleChatColumn = () => { /* get flex-grow value of chat_column */ let flex_grow = conv_column.style.flexGrow; if (flex_grow == "0") { conv_column.style.flexGrow = "1"; conv_column.style.minWidth = default_conv_column_min_width; } else { conv_column.style.flexGrow = "0"; conv_column.style.minWidth = "0px"; } }; chat_column.insertBefore(chat_expand_button, chat_column.firstChild); // move use mind-map checkbox let mindmap_checkbox = document.getElementById("use-mindmap-checkbox"); let citation_dropdown = document.getElementById("citation-dropdown"); let chat_setting_panel = document.getElementById("chat-settings-expand"); chat_setting_panel.insertBefore( mindmap_checkbox, chat_setting_panel.childNodes[2] ); chat_setting_panel.insertBefore(citation_dropdown, mindmap_checkbox); // move share conv checkbox let report_div = document.querySelector( "#report-accordion > div:nth-child(3) > div:nth-child(1)" ); let share_conv_checkbox = document.getElementById("is-public-checkbox"); if (share_conv_checkbox) { report_div.insertBefore(share_conv_checkbox, report_div.querySelector("button")); } // create slider toggle const is_public_checkbox = document.getElementById("suggest-chat-checkbox"); const label_element = is_public_checkbox.getElementsByTagName("label")[0]; const checkbox_span = 
is_public_checkbox.getElementsByTagName("span")[0]; new_div = document.createElement("div"); label_element.classList.add("switch"); is_public_checkbox.appendChild(checkbox_span); label_element.appendChild(new_div); // clpse globalThis.clpseFn = (id) => { var obj = document.getElementById("clpse-btn-" + id); obj.classList.toggle("clpse-active"); var content = obj.nextElementSibling; if (content.style.display === "none") { content.style.display = "block"; } else { content.style.display = "none"; } }; // store info in local storage globalThis.setStorage = (key, value) => { localStorage.setItem(key, value); }; globalThis.getStorage = (key, value) => { item = localStorage.getItem(key); return item ? item : value; }; globalThis.removeFromStorage = (key) => { localStorage.removeItem(key); }; // Function to scroll to given citation with ID // Sleep function using Promise and setTimeout function sleep(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } globalThis.scrollToCitation = async (event) => { event.preventDefault(); // Prevent the default link behavior var citationId = event.target.getAttribute("id"); await sleep(100); // Sleep for 100 milliseconds // check if modal is open var modal = document.getElementById("pdf-modal"); var citation = document.querySelector('mark[id="' + citationId + '"]'); if (modal.style.display == "block") { // trigger on click event of PDF Preview link var detail_elem = citation; // traverse up the DOM tree to find the parent element with tag detail while (detail_elem.tagName.toLowerCase() != "details") { detail_elem = detail_elem.parentElement; } detail_elem.getElementsByClassName("pdf-link").item(0).click(); } else { if (citation) { citation.scrollIntoView({ behavior: "smooth" }); } } }; globalThis.fullTextSearch = () => { // Assign text selection event to last bot message var bot_messages = document.querySelectorAll( "div#main-chat-bot div.message-row.bot-row" ); var last_bot_message = bot_messages[bot_messages.length - 1]; 
// check if the last bot message has class "text_selection" if (last_bot_message.classList.contains("text_selection")) { return; } // assign new class to last message last_bot_message.classList.add("text_selection"); // Get sentences from evidence div var evidences = document.querySelectorAll( "#html-info-panel > div:last-child > div > details.evidence div.evidence-content" ); console.log("Indexing evidences", evidences); const segmenterEn = new Intl.Segmenter("en", { granularity: "sentence" }); // Split sentences and save to all_segments list var all_segments = []; for (var evidence of evidences) { // check if
tag is open if (!evidence.parentElement.open) { continue; } var markmap_div = evidence.querySelector("div.markmap"); if (markmap_div) { continue; } var evidence_content = evidence.textContent.replace(/[\r\n]+/g, " "); sentence_it = segmenterEn.segment(evidence_content)[Symbol.iterator](); while ((sentence = sentence_it.next().value)) { segment = sentence.segment.trim(); if (segment) { all_segments.push({ id: all_segments.length, text: segment, }); } } } let miniSearch = new MiniSearch({ fields: ["text"], // fields to index for full-text search storeFields: ["text"], }); // Index all documents miniSearch.addAll(all_segments); last_bot_message.addEventListener("mouseup", () => { let selection = window.getSelection().toString(); let results = miniSearch.search(selection); if (results.length == 0) { return; } let matched_text = results[0].text; console.log("query\n", selection, "\nmatched text\n", matched_text); var evidences = document.querySelectorAll( "#html-info-panel > div:last-child > div > details.evidence div.evidence-content" ); // check if modal is open var modal = document.getElementById("pdf-modal"); // convert all in evidences to normal text evidences.forEach((evidence) => { evidence.querySelectorAll("mark").forEach((mark) => { mark.outerHTML = mark.innerText; }); }); // highlight matched_text in evidences for (var evidence of evidences) { var evidence_content = evidence.textContent.replace(/[\r\n]+/g, " "); if (evidence_content.includes(matched_text)) { // select all p and li elements paragraphs = evidence.querySelectorAll("p, li"); for (var p of paragraphs) { var p_content = p.textContent.replace(/[\r\n]+/g, " "); if (p_content.includes(matched_text)) { p.innerHTML = p_content.replace( matched_text, "" + matched_text + "" ); console.log("highlighted", matched_text, "in", p); if (modal.style.display == "block") { // trigger on click event of PDF Preview link var detail_elem = p; // traverse up the DOM tree to find the parent element with tag detail while 
// Open a child window and write `content` into its document.
//
// content: any value whose toString() yields a non-empty string.
// options: optional overrides for
//   - window:     feature string passed as the third argument of window.open
//   - closeChild: whether to close the document stream after writing (default true)
//   - childId:    target name passed as the second argument (default "_blank")
//
// Returns the child window, `null` when the browser refused to open one, or
// `undefined` when `content` is empty/unusable (unchanged from before).
globalThis.spawnDocument = (content, options) => {
  const opt = {
    window: "",
    closeChild: true,
    childId: "_blank",
  };
  Object.assign(opt, options);

  // minimal error checking
  if (!content || typeof content.toString != "function") {
    return;
  }
  const html = content.toString(); // stringify once instead of three times
  if (!html.length) {
    return;
  }

  const child = window.open("", opt.childId, opt.window);
  if (!child) {
    // window.open returns null when the popup is blocked; bail out instead of
    // throwing on child.document.
    return child;
  }
  child.document.write(html);
  if (opt.closeChild) child.document.close();
  return child;
};
// Similarity score between two strings: the length of their longest common
// *substring* (contiguous run) divided by the length of the shorter string.
//
// Returns a number in [0, 1]. Returns 0 when either string is empty; the
// previous implementation evaluated 0 / 0 and returned NaN in that case.
function matchRatio(str1, str2) {
  const n = str1.length;
  const m = str2.length;
  if (n === 0 || m === 0) {
    return 0;
  }

  // Dynamic programme over common-suffix lengths. Only the previous row is
  // needed, so keep two rows instead of the full (n + 1) x (m + 1) table.
  let prevRow = new Array(m + 1).fill(0);
  let longest = 0;

  for (let i = 0; i < n; i++) {
    const currRow = new Array(m + 1).fill(0);
    for (let j = 0; j < m; j++) {
      if (str1[i] === str2[j]) {
        // Extend the common suffix that ended at (i - 1, j - 1).
        currRow[j + 1] = prevRow[j] + 1;
        if (currRow[j + 1] > longest) {
          longest = currRow[j + 1];
        }
      }
    }
    prevRow = currRow;
  }

  return longest / Math.min(n, m);
}
// Make every element carrying the "pdf-link" class open the PDF modal when
// clicked, by assigning `openModal` as its onclick handler.
globalThis.assignPdfOnclickEvent = () => {
  const pdfLinks = document.getElementsByClassName("pdf-link");
  Array.from(pdfLinks).forEach((link) => {
    link.onclick = openModal;
  });
};
[Source Code](https://github.com/Cinnamon/kotaemon) | [Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo) [User Guide](https://cinnamon.github.io/kotaemon/) | [Developer Guide](https://cinnamon.github.io/kotaemon/development/) | [Feedback](https://github.com/Cinnamon/kotaemon/issues) ================================================ FILE: libs/ktem/ktem/assets/md/changelogs.md ================================================ # Changelogs ## v0.0.1 - Chat: interact with chatbot with simple pipeline, rewoo and react agents - Chat: conversation management: create, delete, rename conversations - Files: upload files - Files: select files as context for chatbot - User management: create, sign-in, sign-out, change password - Setting: common settings and pipeline-based settings - Info panel: show Cinnamon AI and Kotaemon information ================================================ FILE: libs/ktem/ktem/assets/md/usage.md ================================================ # Basic Usage ## 1. Add your AI models ![resources tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/resources-tab.png) - The tool uses Large Language Models (LLMs) to perform various tasks in a QA pipeline. So, you need to provide the application with access to the LLMs you want to use. - You only need to provide at least one. However, it is recommended that you include all the LLMs that you have access to, so that you can switch between them while using the application. To add a model: 1. Navigate to the `Resources` tab. 2. Select the `LLMs` sub-tab. 3. Select the `Add` sub-tab. 4. Configure the model to add: - Give it a name. - Pick a vendor/provider (e.g. `ChatOpenAI`). - Provide the specifications. - (Optional) Set the model as default. 5. Click `Add` to add the model. 6. Select the `Embedding Models` sub-tab and repeat steps 3 to 5 to add an embedding model.
(Optional) Configure model via the .env file Alternatively, you can configure the models via the `.env` file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one. Currently, the following providers are supported: ### OpenAI In the `.env` file, set the `OPENAI_API_KEY` variable with your OpenAI API key in order to enable access to OpenAI's models. There are other variables that can be modified, please feel free to edit them to fit your case. Otherwise, the default parameters should work for most people. ```shell OPENAI_API_BASE=https://api.openai.com/v1 OPENAI_API_KEY= OPENAI_CHAT_MODEL=gpt-3.5-turbo OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002 ``` ### Azure OpenAI For OpenAI models via Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide your deployments' names for the chat model and the embedding model, depending on how you set up your Azure deployment. ```shell AZURE_OPENAI_ENDPOINT= AZURE_OPENAI_API_KEY= OPENAI_API_VERSION=2024-02-15-preview AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 ``` ### Local models - Pros: - Privacy. Your documents will be stored and processed locally. - Choices. There is a wide range of LLMs in terms of size, domain, language to choose from. - Cost. It's free. - Cons: - Quality. Local models are much smaller and thus have lower generative quality than paid APIs. - Speed. Local models are deployed using your machine so the processing speed is limited by your hardware. #### Find and download an LLM You can search and download an LLM to be run locally from the [Hugging Face Hub](https://huggingface.co/models). Currently, these model formats are supported: - GGUF You should choose a model whose size is less than your device's available memory, leaving about 2 GB free.
For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to give better generation but also take more processing time. Here are some recommendations and their size in memory: - [Qwen1.5-1.8B-Chat-GGUF](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true): around 2 GB #### Enable local models To add a local model to the model pool, set the `LOCAL_MODEL` variable in the `.env` file to the path of the model file. ```shell LOCAL_MODEL= ``` Here is how to get the full path of your model file: - On Windows 11: right click the file and select `Copy as Path`.
## 2. Upload your documents ![file index tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/file-index-tab.png) In order to do QA on your documents, you need to upload them to the application first. Navigate to the `File Index` tab and you will see 2 sections: 1. File upload: - Drag and drop your file to the UI or select it from your file system. Then click `Upload and Index`. - The application will take some time to process the file and show a message once it is done. 2. File list: - This section shows the list of files that have been uploaded to the application and allows users to delete them. ## 3. Chat with your documents ![chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png) Now navigate back to the `Chat` tab. The chat tab is divided into 3 regions: 1. Conversation Settings Panel - Here you can select, create, rename, and delete conversations. - By default, a new conversation is created automatically if no conversation is selected. - Below that you have the file index, where you can choose whether to disable, select all files, or select which files to retrieve references from. - If you choose "Disabled", no files will be considered as context during chat. - If you choose "Search All", all files will be considered during chat. - If you choose "Select", a dropdown will appear for you to select the files to be considered during chat. If no files are selected, then no files will be considered during chat. 2. Chat Panel - This is where you can chat with the chatbot. 3. Information Panel - Supporting information such as the retrieved evidence and reference will be displayed here. 
# Theme variables that do not differ between the light and dark palettes.
# They are handed to the theme's ``set()`` together with ``dark_mode`` and
# ``light_mode``.
common = {
    # element colours
    "color_accent": "*primary_400",
    # shadows
    "shadow_drop": "0 0px 5px 1px rgb(0 0 0 / 0.05)",
    "shadow_drop_lg": "0 0 10px 3px rgba(0 0 0 / 0.06)",
    # layout atoms
    "block_label_margin": "*spacing_xl",
    "block_label_padding": "*spacing_xl",
    "block_label_shadow": "none",
    "layout_gap": "*spacing_xxl",
    "section_header_text_size": "*text_lg",
    # buttons
    "button_shadow": "none",
    "button_shadow_active": "*shadow_drop",
    "button_shadow_hover": "none",
    # button_large_radius="*radius_xxl",
    # button_small_radius="*radius_xxl",
    # -----
}
checkbox_background_color_selected_dark="*primary_400", checkbox_border_color_focus_dark="*primary_400", checkbox_border_color_selected_dark="*primary_500", checkbox_label_background_fill_selected_dark="*primary_200", checkbox_label_text_color_selected_dark="*neutral_700", error_border_color_dark=err_dark, error_text_color_dark="*neutral_100", error_icon_color_dark=err_dark, input_background_fill_dark="*neutral_600", input_border_color_dark="*input_background_fill", input_border_color_focus_dark="*input_background_fill", input_placeholder_color_dark="*neutral_500", loader_color_dark="*primary_200", slider_color_dark="*primary_300", stat_background_fill_dark="*secondary_100", table_border_color_dark="*neutral_800", table_even_background_fill_dark="*neutral_900", table_odd_background_fill_dark="*neutral_800", table_row_focus_dark="*neutral_600", # buttons button_primary_background_fill_dark=gradient, button_primary_background_fill_hover_dark=gradient_muted, button_secondary_background_fill_hover_dark="*neutral_700", button_cancel_background_fill_dark=err_dark, button_cancel_background_fill_hover_dark=err_dark_muted, ) light_mode = dict( background_fill_primary="*neutral_50", background_fill_secondary="*neutral_50", # body attributes body_background_fill="*background_fill_primary", body_text_color_subdued="*neutral_600", border_color_accent="rgba(255,255,255,0)", border_color_primary="*neutral_300", color_accent_soft="*secondary_100", # text link_text_color="*secondary_400", link_text_color_visited="*secondary_700", # layout atoms block_label_border_width="0px", block_label_background_fill="white", block_label_text_color="*primary_600", block_shadow="none", block_title_text_color="*primary_600", panel_border_width="0px", # component atoms checkbox_background_color_selected="*primary_400", checkbox_border_color_focus="*primary_400", checkbox_border_color_selected="*primary_400", checkbox_label_border_color="*primary_200", 
class Kotaemon(Soft):
    """
    Official theme of Kotaemon, built on top of gradio's ``Soft`` theme.

    Public version: https://huggingface.co/spaces/lone17/kotaemon
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.emerald,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_md,
        font: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        """Build the theme; every argument is forwarded unchanged to ``Soft``."""
        base_theme_options = dict(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        super().__init__(**base_theme_options)

        self.name = "kotaemon"

        # Apply the shared variables together with the dark and light palettes
        # in a single ``set()`` call.
        super().set(
            **common,
            **dark_mode,
            **light_mode,
        )
class ModelPool:
    """Represent a pool of models.

    The pool is seeded from ``conf``, a mapping of model name to a dict that
    holds a ``spec`` (deserialized into the model object) and the optional keys
    ``default``, ``accuracy`` and ``cost``. Models can also be added, replaced
    or removed afterwards with item assignment / deletion. Every lookup helper
    works on the *current* content of the pool, so a pool created empty and
    filled later behaves the same as one filled from ``conf``.

    Models without an ``accuracy`` entry rank as the least accurate and models
    without a ``cost`` entry rank as the most expensive.
    """

    def __init__(self, category: str, conf: dict):
        self._category = category
        self._conf = conf

        self._models: dict[str, BaseComponent] = {}
        self._accuracy: list[str] = []
        self._cost: list[str] = []
        self._default: list[str] = []

        for name, model in conf.items():
            self._models[name] = deserialize(model["spec"], safe=False)
            if model.get("default", False):
                self._default.append(name)

        self._refresh_rankings()

    def _refresh_rankings(self) -> None:
        """Recompute the accuracy/cost orderings and prune stale defaults."""
        conf = self._conf
        names = list(self._models)
        # ascending accuracy: the most accurate model is the last element
        self._accuracy = sorted(
            names, key=lambda x: conf.get(x, {}).get("accuracy", float("-inf"))
        )
        # ascending cost: the cheapest model is the first element
        self._cost = sorted(
            names, key=lambda x: conf.get(x, {}).get("cost", float("inf"))
        )
        self._default = [name for name in self._default if name in self._models]

    def _ensure_not_empty(self) -> None:
        """Raise ValueError when the pool holds no model."""
        if not self._models:
            raise ValueError("No models in pool")

    def __getitem__(self, key: str) -> BaseComponent:
        """Get model by name"""
        return self._models[key]

    def __setitem__(self, key: str, value: BaseComponent):
        """Set model by name"""
        is_new = key not in self._models
        self._models[key] = value
        if is_new:
            # keep the rankings in sync with the models actually in the pool
            self._refresh_rankings()

    def __delitem__(self, key: str):
        """Delete model by name"""
        del self._models[key]
        # drop the removed name from the rankings and the default list
        self._refresh_rankings()

    def __contains__(self, key: str) -> bool:
        """Check if model exists"""
        return key in self._models

    def get(
        self, key: str, default: Optional[BaseComponent] = None
    ) -> Optional[BaseComponent]:
        """Get model by name with default value"""
        return self._models.get(key, default)

    def settings(self) -> dict:
        """Present model pools option for gradio

        Raises:
            ValueError: if the pool is empty
        """
        return {
            "label": self._category,
            "choices": list(self._models.keys()),
            "value": self.get_default_name(),
        }

    def options(self) -> dict:
        """Present a dict of models"""
        return self._models

    def get_random_name(self) -> str:
        """Get the name of random model

        Returns:
            str: random model name in the pool

        Raises:
            ValueError: if the pool is empty
        """
        import random

        self._ensure_not_empty()
        return random.choice(list(self._models))

    def get_default_name(self) -> str:
        """Get the name of default model

        In case there is no default model, choose random model from pool. In
        case there are multiple default models, choose random from them.

        Returns:
            str: model name

        Raises:
            ValueError: if the pool is empty
        """
        self._ensure_not_empty()

        if self._default:
            import random

            return random.choice(self._default)

        return self.get_random_name()

    def get_random(self) -> BaseComponent:
        """Get random model"""
        return self._models[self.get_random_name()]

    def get_default(self) -> BaseComponent:
        """Get default model

        In case there is no default model, choose random model from pool. In
        case there are multiple default models, choose random from them.

        Returns:
            BaseComponent: model
        """
        return self._models[self.get_default_name()]

    def get_highest_accuracy_name(self) -> str:
        """Get the name of model with highest accuracy

        Returns:
            str: model name

        Raises:
            ValueError: if the pool is empty
        """
        self._ensure_not_empty()
        return self._accuracy[-1]

    def get_highest_accuracy(self) -> BaseComponent:
        """Get model with highest accuracy

        Returns:
            BaseComponent: model

        Raises:
            ValueError: if the pool is empty
        """
        return self._models[self.get_highest_accuracy_name()]

    def get_lowest_cost_name(self) -> str:
        """Get the name of model with lowest cost

        Returns:
            str: model name

        Raises:
            ValueError: if the pool is empty
        """
        self._ensure_not_empty()
        return self._cost[0]

    def get_lowest_cost(self) -> BaseComponent:
        """Get model with lowest cost

        Returns:
            BaseComponent: model

        Raises:
            ValueError: if the pool is empty
        """
        return self._models[self.get_lowest_cost_name()]
class BaseUser(SQLModel):
    """Store the user information

    Attributes:
        id: canonical id to identify the user; defaults to a random UUID4 hex
            string
        username: the username of the user (unique)
        username_lower: unique companion of ``username``; presumably its
            lower-cased form, used to keep usernames unique regardless of
            case -- TODO confirm against the code that writes it
        password: the hashed password of the user
        admin: presumably marks the user as an administrator (confirm with
            the user-management code); defaults to False
    """

    # allow this table definition to extend one already registered under the
    # same name in the metadata
    __table_args__ = {"extend_existing": True}

    id: str = Field(
        default_factory=lambda: uuid.uuid4().hex, primary_key=True, index=True
    )
    username: str = Field(unique=True)
    username_lower: str = Field(unique=True)
    password: str
    admin: bool = Field(default=False)
Field(default=None) ================================================ FILE: libs/ktem/ktem/db/engine.py ================================================ from sqlmodel import create_engine from theflow.settings import settings engine = create_engine(settings.KH_DATABASE) ================================================ FILE: libs/ktem/ktem/db/models.py ================================================ import ktem.db.base_models as base_models from ktem.db.engine import engine from sqlmodel import SQLModel from theflow.settings import settings from theflow.utils.modules import import_dotted_string _base_conv = ( import_dotted_string(settings.KH_TABLE_CONV, safe=False) if hasattr(settings, "KH_TABLE_CONV") else base_models.BaseConversation ) _base_user = ( import_dotted_string(settings.KH_TABLE_USER, safe=False) if hasattr(settings, "KH_TABLE_USER") else base_models.BaseUser ) _base_settings = ( import_dotted_string(settings.KH_TABLE_SETTINGS, safe=False) if hasattr(settings, "KH_TABLE_SETTINGS") else base_models.BaseSettings ) _base_issue_report = ( import_dotted_string(settings.KH_TABLE_ISSUE_REPORT, safe=False) if hasattr(settings, "KH_TABLE_ISSUE_REPORT") else base_models.BaseIssueReport ) class Conversation(_base_conv, table=True): # type: ignore """Conversation record""" class User(_base_user, table=True): # type: ignore """User table""" class Settings(_base_settings, table=True): # type: ignore """Record of settings""" class IssueReport(_base_issue_report, table=True): # type: ignore """Record of issues""" if not getattr(settings, "KH_ENABLE_ALEMBIC", False): SQLModel.metadata.create_all(engine) ================================================ FILE: libs/ktem/ktem/embeddings/__init__.py ================================================ ================================================ FILE: libs/ktem/ktem/embeddings/db.py ================================================ from typing import Type from ktem.db.engine import engine from sqlalchemy import JSON, Boolean, 
class EmbeddingManager:
    """Represent a pool of embedding models backed by the ``EmbeddingTable``.

    The database table is the source of truth. ``add``, ``delete`` and
    ``update`` write to the table and then call ``load`` to rebuild the
    in-memory view (``_models``, ``_info``, ``_default``). Each write happens
    in a single transaction, so a failed write leaves the table unchanged.
    """

    def __init__(self):
        self._models: dict[str, BaseEmbeddings] = {}
        self._info: dict[str, dict] = {}
        self._default: str = ""
        self._vendors: list[Type] = []

        # populate the pool if empty
        if hasattr(flowsettings, "KH_EMBEDDINGS"):
            with Session(engine) as sess:
                count = sess.query(EmbeddingTable).count()

            if not count:
                for name, model in flowsettings.KH_EMBEDDINGS.items():
                    self.add(
                        name=name,
                        spec=model["spec"],
                        default=model.get("default", False),
                    )

        self.load()
        self.load_vendors()

    def load(self):
        """Load the model pool from database

        The new state is built in local variables and swapped in at the end,
        so a row that fails to deserialize does not leave a half-filled pool.
        """
        models: dict[str, BaseEmbeddings] = {}
        info: dict[str, dict] = {}
        default_name = ""

        with Session(engine) as sess:
            stmt = select(EmbeddingTable)
            items = sess.execute(stmt)

            for (item,) in items:
                models[item.name] = deserialize(item.spec, safe=False)
                info[item.name] = {
                    "name": item.name,
                    "spec": item.spec,
                    "default": item.default,
                }
                if item.default:
                    default_name = item.name
                    # the default model is also reachable under the "default" key
                    models["default"] = models[item.name]

        self._models, self._info, self._default = models, info, default_name

    def load_vendors(self):
        """Collect the embedding classes that can be offered as vendors"""
        from kotaemon.embeddings import (
            AzureOpenAIEmbeddings,
            FastEmbedEmbeddings,
            LCCohereEmbeddings,
            LCGoogleEmbeddings,
            LCHuggingFaceEmbeddings,
            LCMistralEmbeddings,
            OpenAIEmbeddings,
            TeiEndpointEmbeddings,
            VoyageAIEmbeddings,
        )

        self._vendors = [
            AzureOpenAIEmbeddings,
            OpenAIEmbeddings,
            FastEmbedEmbeddings,
            LCCohereEmbeddings,
            LCHuggingFaceEmbeddings,
            LCGoogleEmbeddings,
            LCMistralEmbeddings,
            TeiEndpointEmbeddings,
            VoyageAIEmbeddings,
        ]

    def __getitem__(self, key: str) -> BaseEmbeddings:
        """Get model by name"""
        return self._models[key]

    def __contains__(self, key: str) -> bool:
        """Check if model exists"""
        return key in self._models

    def get(
        self, key: str, default: Optional[BaseEmbeddings] = None
    ) -> Optional[BaseEmbeddings]:
        """Get model by name with default value"""
        return self._models.get(key, default)

    def settings(self) -> dict:
        """Present model pools option for gradio"""
        return {
            "label": "Embedding",
            "choices": list(self._models.keys()),
            "value": self.get_default_name(),
        }

    def options(self) -> dict:
        """Present a dict of models"""
        return self._models

    def get_random_name(self) -> str:
        """Get the name of random model

        Returns:
            str: random model name in the pool

        Raises:
            ValueError: if the pool is empty
        """
        import random

        if not self._models:
            raise ValueError("No models in pool")

        return random.choice(list(self._models.keys()))

    def get_default_name(self) -> str:
        """Get the name of default model

        In case there is no default model, choose random model from pool.
        Only one default name is kept: when several rows are flagged as
        default, the last one seen by ``load`` is used.

        Returns:
            str: model name

        Raises:
            ValueError: if the pool is empty
        """
        if not self._models:
            raise ValueError("No models in pool")

        if not self._default:
            return self.get_random_name()

        return self._default

    def get_random(self) -> BaseEmbeddings:
        """Get random model"""
        return self._models[self.get_random_name()]

    def get_default(self) -> BaseEmbeddings:
        """Get default model

        In case there is no default model, choose random model from pool.

        Returns:
            BaseEmbeddings: model
        """
        return self._models[self.get_default_name()]

    def info(self) -> dict:
        """List all models"""
        return self._info

    def add(self, name: str, spec: dict, default: bool):
        """Add a new model to the pool

        Raises:
            ValueError: if ``name`` is empty or the database write fails
        """
        if not name:
            raise ValueError("Name must not be empty")

        try:
            with Session(engine) as sess:
                if default:
                    # turn all models to non-default
                    sess.query(EmbeddingTable).update({"default": False})

                item = EmbeddingTable(name=name, spec=spec, default=default)
                sess.add(item)
                # single commit: if the insert fails, the default reset above
                # is rolled back with it
                sess.commit()
        except Exception as e:
            raise ValueError(f"Failed to add model {name}: {e}") from e

        self.load()

    def delete(self, name: str):
        """Delete a model from the pool

        Raises:
            ValueError: if the model does not exist or the delete fails
        """
        try:
            with Session(engine) as sess:
                item = sess.query(EmbeddingTable).filter_by(name=name).first()
                if item is None:
                    raise ValueError(f"Model {name} not found")
                sess.delete(item)
                sess.commit()
        except Exception as e:
            raise ValueError(f"Failed to delete model {name}: {e}") from e

        self.load()

    def update(self, name: str, spec: dict, default: bool, new_name: str = ""):
        """Update a model in the pool, optionally renaming it.

        The rename, the spec change and the default switch are committed
        together, so a failure cannot lose the original entry.

        Raises:
            ValueError: if ``name`` is empty, ``new_name`` is already used,
                the model does not exist, or the database write fails
        """
        if not name:
            raise ValueError("Name must not be empty")

        renaming = bool(new_name) and new_name != name
        if renaming and new_name in self._info:
            raise ValueError(
                f"Model '{new_name}' already exists. Use a unique name."
            )

        try:
            with Session(engine) as sess:
                item = sess.query(EmbeddingTable).filter_by(name=name).first()
                if not item:
                    raise ValueError(f"Model {name} not found")

                if default:
                    # turn all models to non-default
                    sess.query(EmbeddingTable).update({"default": False})

                if renaming:
                    item.name = new_name
                item.spec = spec
                item.default = default
                sess.commit()
        except Exception as e:
            raise ValueError(f"Failed to update model {name}: {e}") from e

        self.load()

    def vendors(self) -> dict:
        """Return the vendor classes keyed by their qualified class name"""
        return {vendor.__qualname__: vendor for vendor in self._vendors}
) self.on_building_ui() def on_building_ui(self): with gr.Tab(label="View"): self.emb_list = gr.DataFrame( headers=["name", "vendor", "default"], interactive=False, column_widths=[30, 40, 30], ) with gr.Column(visible=False) as self._selected_panel: self.selected_emb_name = gr.Textbox(value="", visible=False) with gr.Row(): with gr.Column(): self.edit_default = gr.Checkbox( label="Set default", info=( "Set this Embedding model as default. This default " "Embedding will be used by other components by default " "if no Embedding is specified for such components." ), ) self.edit_name = gr.Textbox( label="Name", info="Edit to rename this Embedding model.", ) self.edit_spec = gr.Textbox( label="Specification", info="Specification of the Embedding model in YAML format", lines=10, ) with gr.Accordion( label="Test connection", visible=False, open=False ) as self._check_connection_panel: with gr.Row(): with gr.Column(scale=4): self.connection_logs = gr.HTML( "Logs", ) with gr.Column(scale=1): self.btn_test_connection = gr.Button("Test") with gr.Row(visible=False) as self._selected_panel_btn: with gr.Column(): self.btn_edit_save = gr.Button( "Save", min_width=10, variant="primary" ) with gr.Column(): self.btn_delete = gr.Button( "Delete", min_width=10, variant="stop" ) with gr.Row(): self.btn_delete_yes = gr.Button( "Confirm Delete", variant="stop", visible=False, min_width=10, ) self.btn_delete_no = gr.Button( "Cancel", visible=False, min_width=10 ) with gr.Column(): self.btn_close = gr.Button("Close", min_width=10) with gr.Column(): self.edit_spec_desc = gr.Markdown("# Spec description") with gr.Tab(label="Add"): with gr.Row(): with gr.Column(scale=2): self.name = gr.Textbox( label="Name", info=( "Must be unique and non-empty. " "The name will be used to identify the embedding model." ), ) self.emb_choices = gr.Dropdown( label="Vendors", info=( "Choose the vendor of the Embedding model. Each vendor " "has different specification." 
def on_emb_vendor_change(self, vendor):
    """Build the spec template and description for the chosen vendor.

    Args:
        vendor: key of the vendor in ``embedding_models_manager.vendors()``.

    Returns:
        A tuple of (YAML dump of the vendor's required parameters mapped to
        their defaults, formatted description of the vendor class).
    """
    vendor_cls = embedding_models_manager.vendors()[vendor]
    params = vendor_cls.describe()["params"]

    # Only parameters flagged as required end up in the template.
    required_defaults: dict = {
        key: meta.get("default", None)
        for key, meta in params.items()
        if meta.get("required", False)
    }

    return yaml.dump(required_defaults), format_description(vendor_cls)
def create_emb(self, name, choices, spec, default):
    """Create a new Embedding model from the "Add" form.

    Args:
        name: name typed by the user; surrounding whitespace is stripped.
        choices: selected vendor, a key of
            ``embedding_models_manager.vendors()``.
        spec: model specification as YAML text.
        default: whether the new model becomes the default one.

    Raises:
        gr.Error: if no known vendor is selected, the specification is not a
            YAML mapping, or the model cannot be added.
    """
    try:
        name = name.strip()

        vendors = embedding_models_manager.vendors()
        if choices not in vendors:
            raise ValueError("Please select a vendor for the Embedding model")

        spec = yaml.load(spec, Loader=YAMLNoDateSafeLoader)
        # Empty text parses to None and a bare scalar to str/int: neither can
        # carry the "__type__" key, so reject them with a readable message.
        if not isinstance(spec, dict):
            raise ValueError(
                "Specification must be a YAML mapping (use {} for no parameters)"
            )

        vendor = vendors[choices]
        spec["__type__"] = vendor.__module__ + "." + vendor.__qualname__

        embedding_models_manager.add(name, spec=spec, default=default)
        gr.Info(f'Embedding model "{name}" created successfully')
    except ValueError as e:
        raise gr.Error(str(e)) from e
    except Exception as e:
        raise gr.Error(f"Failed to create Embedding model '{name}': {e}") from e
def on_btn_delete_click(self):
    """Swap the "Delete" button for the confirm / cancel pair.

    Returns:
        Three gradio updates, in order: hide ``btn_delete``, show
        ``btn_delete_yes``, show ``btn_delete_no``.
    """
    return tuple(gr.update(visible=shown) for shown in (False, True, True))
" yield log_content # Parse content & init model info = deepcopy(embedding_models_manager.info()[selected_emb_name]) # Parse content & create dummy embedding spec = yaml.load(selected_spec, Loader=YAMLNoDateSafeLoader) info["spec"].update(spec) emb = deserialize(info["spec"], safe=False) if emb is None: raise Exception(f"Can not found model: {selected_emb_name}") log_content += "- Sending a message `Hi`
" yield log_content _ = emb("Hi") log_content += ( "- Connection success. " "
def delete_emb(self, selected_emb_name):
    """Delete the selected Embedding model.

    Args:
        selected_emb_name: name of the model to delete.

    Returns:
        An empty string on success, to be written back to the selected-name
        component.

    Raises:
        gr.Error: if the model cannot be deleted. The selected-name component
            is then left unchanged.
    """
    try:
        embedding_models_manager.delete(selected_emb_name)
    except Exception as e:
        # gr.Error is an exception class: it is only reported to the user when
        # raised, merely constructing it has no effect.
        raise gr.Error(
            f'Failed to delete Embedding model "{selected_emb_name}": {e}'
        ) from e

    return ""
class BaseIndex(abc.ABC):
    """The base class for the index

    The index is responsible for storing information in a searchable manner, and
    retrieving that information.

    An application can have multiple indices. For example:
        - An index of files locally in the computer
        - An index of chat messages on Discord, Slack, etc.
        - An index of files stored on Google Drive, Dropbox, etc.
        - ...

    Users can create, delete, and manage the indices in this application. They
    can create an index, set it to track a local folder in their computer, and
    then the chatbot can search for files in that folder. The user can create
    another index to track their chat messages on Discord. And so on.

    This class defines the interface for the index. It is concerned with:
        - Setting up the necessary software infrastructure for the index to work
          (e.g. database table, vector store collection, etc.).
        - Providing the UI for user interaction with the index, including
          settings.

    Methods:
        __init__: initiate any resource definition required for the index to
            work (e.g. database table, vector store collection, etc.).
        on_create: called only once, when the user creates the index.
        on_delete: called only once, when the user deletes the index.
        on_start: called when the index starts.
        get_selector_component_ui: return the UI component to select the
            entities in the Chat page. Called in the ChatUI page.
        get_index_page_ui: return the index page UI to manage the entities.
            Called in the main application UI page.
        get_user_settings: return default user settings. Called only when the
            app starts.
        get_admin_settings: return the admin settings. Called only when the
            user creates the index (for the admin to customize it). The output
            will be stored in the Index's config.
        get_indexing_pipeline: return the indexing pipeline when the entities
            are populated into the index.
        get_retriever_pipelines: return the retriever pipelines when the user
            chats.
    """

    def __init__(self, app, id, name, config):
        self._app = app
        self.id = id
        self.name = name
        self.config = config  # admin settings

    def on_create(self):
        """Create the index for the first time (no-op in the base class)"""

    def on_delete(self):
        """Trigger when the user deletes the index (no-op in the base class)"""

    def on_start(self):
        """Trigger when the index starts (no-op in the base class)

        Takes no arguments: the id, name and config of the index are available
        as the attributes set in `__init__`.
        """

    def get_selector_component_ui(self) -> Optional["BasePage"]:
        """The UI component to select the entities in the Chat page

        The base implementation returns None.
        """
        return None

    def get_index_page_ui(self) -> Optional["BasePage"]:
        """The index page UI to manage the entities

        The base implementation returns None.
        """
        return None

    @classmethod
    def get_user_settings(cls) -> dict:
        """Return default user settings. These are the runtime settings.

        The settings will be populated in the user settings page. And will be
        used when initiating the indexing & retriever pipelines.

        Returns:
            dict: user settings in the dictionary format of
                `ktem.settings.SettingItem` (empty in the base class)
        """
        return {}

    @classmethod
    def get_admin_settings(cls) -> dict:
        """Return the default admin settings. These are the build-time settings.

        The settings will be populated in the admin settings page. And will be
        used when initiating the indexing & retriever pipelines.

        Returns:
            dict: admin settings in the dictionary format of
                `ktem.settings.SettingItem` (empty in the base class)
        """
        return {}

    @abc.abstractmethod
    def get_indexing_pipeline(
        self, settings: dict, user_id: Optional[int]
    ) -> "BaseComponent":
        """Return the indexing pipeline that populates the entities into the index

        Must be implemented by subclasses.

        Args:
            settings: the user settings of the index
            user_id: the user id who is accessing the index
                TODO: instead of having a user_id, should have an app_state
                which might also contain the settings.

        Returns:
            BaseComponent: the indexing pipeline
        """
        ...

    def get_retriever_pipelines(
        self, settings: dict, user_id: int, selected: Any = None
    ) -> list["BaseComponent"]:
        """Return the retriever pipelines to retrieve the entity from the index

        The base implementation returns an empty list.
        """
        return []
def run(
    self, file_paths: str | Path | list[str | Path], *args, **kwargs
) -> tuple[list[str | None], list[str | None]]:
    """Run the indexing pipeline

    This base implementation is a placeholder that always raises.

    Args:
        file_paths (str | Path | list[str | Path]): the file paths to index

    Returns:
        - the indexed file ids (each file id corresponds to an input file path, or
            None if the indexing failed for that file path)
        - the error messages (each error message corresponds to an input file path,
            or None if the indexing was successful for that file path)

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError
None if the indexing was successful for that file path) - the indexed documents in form of list[Documents] """ raise NotImplementedError @classmethod def get_pipeline( cls, user_settings: dict, index_settings: dict ) -> "BaseFileIndexIndexing": raise NotImplementedError @classmethod def get_user_settings(cls) -> dict: """Get the user settings for indexing Returns: dict: user settings in the dictionary format of `ktem.settings.SettingItem` """ return {} def copy_to_filestorage( self, file_paths: str | Path | list[str | Path] ) -> list[str]: """Copy to file storage and return the new path, relative to the file storage Args: file_path: the file path to copy Returns: the new file paths, relative to the file storage """ import shutil from hashlib import sha256 if not isinstance(file_paths, list): file_paths = [file_paths] paths = [] for file_path in file_paths: with open(file_path, "rb") as f: paths.append(sha256(f.read()).hexdigest()) shutil.copy(file_path, self.FSPath / paths[-1]) return paths def get_filestorage_path(self, rel_paths: str | list[str]) -> list[str]: """Get the file storage path for the relative path Args: rel_paths: the relative path to the file storage Returns: the absolute file storage path to the file """ raise NotImplementedError def warning(self, msg): """Log a warning message Args: msg: the message to log """ print(msg) def rebuild_index(self): """Rebuild the index""" raise NotImplementedError ================================================ FILE: libs/ktem/ktem/index/file/exceptions.py ================================================ from ktem.exceptions import KHException class FileExistsError(KHException): pass ================================================ FILE: libs/ktem/ktem/index/file/graph/__init__.py ================================================ from .graph_index import GraphRAGIndex from .light_graph_index import LightRAGIndex from .nano_graph_index import NanoGraphRAGIndex __all__ = ["GraphRAGIndex", "NanoGraphRAGIndex", 
"LightRAGIndex"] ================================================ FILE: libs/ktem/ktem/index/file/graph/graph_index.py ================================================ from typing import Any from ktem.index.file import FileIndex from ..base import BaseFileIndexIndexing, BaseFileIndexRetriever from .pipelines import GraphRAGIndexingPipeline, GraphRAGRetrieverPipeline class GraphRAGIndex(FileIndex): def _setup_indexing_cls(self): self._indexing_pipeline_cls = GraphRAGIndexingPipeline def _setup_retriever_cls(self): self._retriever_pipeline_cls = [GraphRAGRetrieverPipeline] def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing: """Define the interface of the indexing pipeline""" obj = super().get_indexing_pipeline(settings, user_id) # disable vectorstore for this kind of Index obj.VS = None return obj def get_retriever_pipelines( self, settings: dict, user_id: int, selected: Any = None ) -> list["BaseFileIndexRetriever"]: file_ids = self._selector_ui.get_selected_ids(selected) retrievers = [ GraphRAGRetrieverPipeline( file_ids=file_ids, Index=self._resources["Index"], ) ] return retrievers ================================================ FILE: libs/ktem/ktem/index/file/graph/light_graph_index.py ================================================ from typing import Any, Optional from uuid import uuid4 from ktem.db.engine import engine from sqlalchemy.orm import Session from ..base import BaseFileIndexIndexing, BaseFileIndexRetriever from .graph_index import GraphRAGIndex from .lightrag_pipelines import LightRAGIndexingPipeline, LightRAGRetrieverPipeline class LightRAGIndex(GraphRAGIndex): def __init__(self, app, id: int, name: str, config: dict): super().__init__(app, id, name, config) self._collection_graph_id: Optional[str] = None def _setup_indexing_cls(self): self._indexing_pipeline_cls = LightRAGIndexingPipeline def _setup_retriever_cls(self): self._retriever_pipeline_cls = [LightRAGRetrieverPipeline] def 
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
    """Build the parent's indexing pipeline and configure it for this index.

    Settings whose key starts with ``index.options.<index id>.`` are collected
    with that prefix removed. The resulting mapping is assigned to the
    pipeline's ``prompts``, and its ``batch_size`` entry (when present)
    overrides the pipeline's ``index_batch_size``. The pipeline also receives
    the collection-wide graph id.
    """
    indexer = super().get_indexing_pipeline(settings, user_id)

    # Keep only this index's options, with the namespace prefix removed.
    namespace = f"index.options.{self.id}."
    own_options = {}
    for full_key, option in settings.items():
        if full_key.startswith(namespace):
            own_options[full_key[len(namespace) :]] = option

    indexer.prompts = own_options
    indexer.collection_graph_id = self._get_or_create_collection_graph_id()

    # Fall back to the pipeline's current batch size when none is configured.
    current_batch_size = indexer.index_batch_size
    indexer.index_batch_size = own_options.get("batch_size", current_batch_size)

    return indexer
search_type = settings.get(prefix + "search_type", "local") retrievers = [ LightRAGRetrieverPipeline( file_ids=file_ids, Index=self._resources["Index"], search_type=search_type, ) ] return retrievers ================================================ FILE: libs/ktem/ktem/index/file/graph/lightrag_pipelines.py ================================================ import asyncio import glob import logging import os import re from pathlib import Path from typing import Generator import numpy as np import pandas as pd from ktem.db.models import engine from ktem.embeddings.manager import embedding_models_manager as embeddings from ktem.llms.manager import llms from sqlalchemy.orm import Session from tenacity import ( retry, retry_if_exception_type, stop_after_attempt, wait_exponential, ) from theflow.settings import settings from kotaemon.base import Document, Param, RetrievedDocument from kotaemon.base.schema import AIMessage, HumanMessage, SystemMessage from ..pipelines import BaseFileIndexRetriever from .pipelines import GraphRAGIndexingPipeline from .visualize import create_knowledge_graph, visualize_graph try: from lightrag import LightRAG, QueryParam # newer versions of LightRAG needs to be initialized before using from lightrag.kg.shared_storage import initialize_pipeline_status from lightrag.operate import ( _find_most_related_edges_from_entities, _find_most_related_text_unit_from_entities, ) from lightrag.utils import EmbeddingFunc, compute_args_hash except ImportError: print( ( "LightRAG dependencies not installed. " "Try `pip install git+https://github.com/HKUDS/LightRAG.git` to install. " "LighthRAG retriever pipeline will not work properly." 
def get_llm_func(model):
    """Wrap ``model`` into the async completion function LightRAG is given.

    Args:
        model: chat model exposing an async ``ainvoke(messages)`` whose result
            has a ``text`` attribute.

    Returns:
        An async ``llm_func(prompt, system_prompt=None, history_messages=None,
        **kwargs) -> str``. When ``hashing_kv`` is passed in ``kwargs`` it is
        used as a response cache, keyed by a hash of the input messages.
    """

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type((Exception,)),
        after=lambda retry_state: logging.warning(
            f"LLM API call attempt {retry_state.attempt_number} failed. Retrying..."
        ),
    )
    async def _call_model(model, input_messages):
        return (await model.ainvoke(input_messages)).text

    async def llm_func(
        prompt, system_prompt=None, history_messages=None, **kwargs
    ) -> str:
        # ``history_messages`` defaults to None rather than a shared mutable
        # list; None and an empty list both mean "no history".
        input_messages = [SystemMessage(text=system_prompt)] if system_prompt else []

        hashing_kv = kwargs.pop("hashing_kv", None)
        for msg in history_messages or []:
            # Anything that is not a user turn is replayed as an AI message.
            if msg.get("role") == "user":
                input_messages.append(HumanMessage(text=msg["content"]))
            else:
                input_messages.append(AIMessage(text=msg["content"]))

        input_messages.append(HumanMessage(text=prompt))

        # Only computed (and later used) when a cache is supplied.
        args_hash = None
        if hashing_kv is not None:
            args_hash = compute_args_hash("model", input_messages)
            if_cache_return = await hashing_kv.get_by_id(args_hash)
            if if_cache_return is not None:
                return if_cache_return["return"]

        try:
            output = await _call_model(model, input_messages)
        except Exception as e:
            logging.error(f"Failed to call LLM API after 3 retries: {str(e)}")
            raise

        print("-" * 50)
        print(output, "\n", "-" * 50)

        if hashing_kv is not None:
            await hashing_kv.upsert({args_hash: {"return": output, "model": "model"}})

        return output

    return llm_func
def clean_quote(input: str) -> str:
    """Return ``input`` with every single and double quote character removed."""
    quote_chars = re.compile(r"[\"']")
    return quote_chars.sub("", input)
clean_quote(n.get("description", "UNKNOWN")), n["rank"], ] ) entities_df = list_of_list_to_df(entites_section_list) relations_section_list = [ ["id", "source", "target", "description", "keywords", "weight", "rank"] ] for i, e in enumerate(use_relations): relations_section_list.append( [ str(i), clean_quote(e["src_tgt"][0]), clean_quote(e["src_tgt"][1]), clean_quote(e["description"]), e["keywords"], e["weight"], e["rank"], ] ) relations_df = list_of_list_to_df(relations_section_list) text_units_section_list = [["id", "content"]] for i, t in enumerate(use_text_units): text_units_section_list.append([str(i), t["content"]]) sources_df = list_of_list_to_df(text_units_section_list) return entities_df, relations_df, sources_df def build_graphrag(working_dir, llm_func, embedding_func): graphrag_func = LightRAG( working_dir=working_dir, llm_model_func=llm_func, embedding_func=embedding_func, ) # newer versions of LightRAG needs to be initialized before using asyncio.run(graphrag_func.initialize_storages()) asyncio.run(initialize_pipeline_status()) return graphrag_func class LightRAGIndexingPipeline(GraphRAGIndexingPipeline): """GraphRAG specific indexing pipeline""" prompts: dict[str, str] = {} collection_graph_id: str index_batch_size: int = INDEX_BATCHSIZE def store_file_id_with_graph_id(self, file_ids: list[str | None]): if not settings.USE_GLOBAL_GRAPHRAG: return super().store_file_id_with_graph_id(file_ids) # Use the collection-wide graph ID for LightRAG graph_id = self.collection_graph_id # Record all files under this graph_id with Session(engine) as session: for file_id in file_ids: if not file_id: continue # Check if mapping already exists existing = ( session.query(self.Index) .filter( self.Index.source_id == file_id, self.Index.target_id == graph_id, self.Index.relation_type == "graph", ) .first() ) if not existing: node = self.Index( source_id=file_id, target_id=graph_id, relation_type="graph", ) session.add(node) session.commit() return graph_id @classmethod def 
def call_graphrag_index(self, graph_id: str, docs: list[Document]):
    """Insert ``docs`` into the graph stored for ``graph_id``.

    This is a generator: it yields ``Document`` objects on the ``debug``
    channel describing progress while the text documents are inserted in
    batches of ``self.index_batch_size``.
    """
    from lightrag.prompt import PROMPTS

    # Prompts customised in the settings overwrite the library's entries.
    for name, custom_prompt in self.prompts.items():
        if name in PROMPTS:
            PROMPTS[name] = custom_prompt

    input_path = prepare_graph_index_path(graph_id)[1]
    input_path.mkdir(parents=True, exist_ok=True)

    models = get_default_models_wrapper()
    llm_func, embedding_func, default_llm, default_embedding = models
    print(
        f"Indexing GraphRAG with LLM {default_llm} "
        f"and Embedding {default_embedding}..."
    )

    # Keep only non-blank documents whose type is "text".
    texts = []
    for doc in docs:
        if doc.metadata.get("type", "text") != "text":
            continue
        if len(doc.text.strip()) > 0:
            texts.append(doc.text)

    yield Document(
        channel="debug",
        text="[GraphRAG] Creating/Updating index... This can take a long time.",
    )

    # An existing graph file means this run updates the graph.
    is_incremental = (input_path / "graph_chunk_entity_relation.graphml").exists()

    if not is_incremental:
        # New graph: remove the JSON files left in the working directory.
        for stale_json in glob.glob(f"{input_path}/*.json"):
            os.remove(stale_json)

    graphrag_func = build_graphrag(
        input_path,
        llm_func=llm_func,
        embedding_func=embedding_func,
    )

    total = len(texts)
    done = 0
    start_label = "Updating" if is_incremental else "Creating"
    step_label = "Updated" if is_incremental else "Indexed"
    end_label = "Update" if is_incremental else "Indexing"

    yield Document(
        channel="debug",
        text=f"[GraphRAG] {start_label} index: {done} / {total} documents.",
    )

    for offset in range(0, total, self.index_batch_size):
        batch = texts[offset : offset + self.index_batch_size]
        # Each batch is inserted as one newline-joined document.
        graphrag_func.insert("\n".join(batch))
        done += len(batch)
        yield Document(
            channel="debug",
            text=f"[GraphRAG] {step_label} {done} / {total} documents.",
        )

    yield Document(
        channel="debug",
        text=f"[GraphRAG] {end_label} finished.",
    )
context: str = "" # entities current parsing error header = "Entities\n" context = entities[["entity", "description"]].to_markdown(index=False) docs.append(self._to_document(header, context)) header = "\nRelationships\n" context = relationships[["source", "target", "description"]].to_markdown( index=False ) docs.append(self._to_document(header, context)) header = "\nSources\n" context = "" for _, row in sources.iterrows(): title, content = row["id"], row["content"] context += f"\n\n
Source #{title}
class NanoGraphRAGIndex(GraphRAGIndex):
    """File index that wires the Nano-GraphRAG indexing/retrieval pipelines."""

    def __init__(self, app, id: int, name: str, config: dict):
        super().__init__(app, id, name, config)
        # Cached graph id shared by the collection; resolved on first use.
        self._collection_graph_id: Optional[str] = None

    def _setup_indexing_cls(self):
        self._indexing_pipeline_cls = NanoGraphRAGIndexingPipeline

    def _setup_retriever_cls(self):
        self._retriever_pipeline_cls = [NanoGraphRAGRetrieverPipeline]

    @staticmethod
    def _coerce_batch_size(value, default: int) -> int:
        """Return ``value`` as a positive int, or ``default`` if it is not one."""
        try:
            size = int(value)
        except (TypeError, ValueError):
            return default
        return size if size > 0 else default

    def _get_or_create_collection_graph_id(self):
        """Return the cached graph id, an existing one from the Index table,
        or a freshly generated UUID string (in that order of preference)."""
        if self._collection_graph_id:
            return self._collection_graph_id

        # Try to find existing graph ID for this collection
        with Session(engine) as session:
            result = (
                session.query(self._resources["Index"].target_id)  # type: ignore
                .filter(
                    self._resources["Index"].relation_type == "graph"  # type: ignore
                )
                .first()
            )
            if result:
                self._collection_graph_id = result[0]
            else:
                self._collection_graph_id = str(uuid4())
        return self._collection_graph_id

    def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
        """Build the indexing pipeline and apply this index's user settings.

        Settings whose key starts with ``index.options.<id>.`` are stripped
        of that prefix and assigned as the pipeline's prompts; the
        collection graph id and the batch size are set as well.
        """
        pipeline = super().get_indexing_pipeline(settings, user_id)
        # indexing settings
        prefix = f"index.options.{self.id}."
        striped_settings = {
            key[len(prefix) :]: value
            for key, value in settings.items()
            if key.startswith(prefix)
        }
        # set the prompts
        pipeline.prompts = striped_settings
        # set collection graph id
        pipeline.collection_graph_id = self._get_or_create_collection_graph_id()
        # set index batch size; the value comes from user settings, so it is
        # normalised to a positive int before the pipeline uses it as a step
        pipeline.index_batch_size = self._coerce_batch_size(
            striped_settings.get("batch_size", pipeline.index_batch_size),
            pipeline.index_batch_size,
        )
        return pipeline

    def get_retriever_pipelines(
        self, settings: dict, user_id: int, selected: Any = None
    ) -> list["BaseFileIndexRetriever"]:
        """Return a single Nano-GraphRAG retriever for the selected files."""
        file_ids = self._selector_ui.get_selected_ids(selected)
        # retrieval settings
        prefix = f"index.options.{self.id}."
        search_type = settings.get(prefix + "search_type", "local")

        retrievers = [
            NanoGraphRAGRetrieverPipeline(
                file_ids=file_ids,
                Index=self._resources["Index"],
                search_type=search_type,
            )
        ]

        return retrievers
def get_llm_func(model):
    """Wrap ``model`` into the async LLM callable passed to the graph library.

    The returned coroutine function builds a message list from the optional
    system prompt, the history messages and the prompt, calls
    ``model.ainvoke`` (retried by the ``retry`` decorator below) and returns
    the ``text`` of the result. When a ``hashing_kv`` keyword argument is
    supplied, it is used as a cache: a stored answer is returned if present,
    and a new answer is stored after the call.
    """

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type((Exception,)),
        after=lambda retry_state: logging.warning(
            f"LLM API call attempt {retry_state.attempt_number} failed. Retrying..."
        ),
    )
    async def _call_model(model, input_messages):
        return (await model.ainvoke(input_messages)).text

    async def llm_func(
        prompt, system_prompt=None, history_messages=None, **kwargs
    ) -> str:
        # ``None`` replaces the former mutable ``[]`` default, which was a
        # single list object shared by every call.
        input_messages = [SystemMessage(text=system_prompt)] if system_prompt else []

        hashing_kv = kwargs.pop("hashing_kv", None)
        for msg in history_messages or []:
            # Any role other than "user" is mapped to an AI message.
            if msg.get("role") == "user":
                input_messages.append(HumanMessage(text=msg["content"]))
            else:
                input_messages.append(AIMessage(text=msg["content"]))

        input_messages.append(HumanMessage(text=prompt))

        args_hash = None
        if hashing_kv is not None:
            args_hash = compute_args_hash("model", input_messages)
            if_cache_return = await hashing_kv.get_by_id(args_hash)
            if if_cache_return is not None:
                return if_cache_return["return"]

        try:
            output = await _call_model(model, input_messages)
        except Exception as e:
            logging.error(f"Failed to call LLM API after 3 retries: {str(e)}")
            raise

        print("-" * 50)
        print(output, "\n", "-" * 50)

        if hashing_kv is not None:
            await hashing_kv.upsert({args_hash: {"return": output, "model": "model"}})

        return output

    return llm_func
def clean_quote(input: str) -> str:
    """Return ``input`` with all single and double quote characters removed."""
    quote_chars = re.compile(r"[\"']")
    stripped = quote_chars.sub("", input)
    return stripped
def build_graphrag(working_dir, llm_func, embedding_func):
    """Create a ``GraphRAG`` instance rooted at ``working_dir``.

    The same ``llm_func`` is used for both the best and the cheap model.
    """
    graphrag_options = {
        "working_dir": working_dir,
        "best_model_func": llm_func,
        "cheap_model_func": llm_func,
        "embedding_func": embedding_func,
    }
    return GraphRAG(**graphrag_options)
@classmethod
def get_user_settings(cls) -> dict:
    """Describe the user-editable settings of the indexing pipeline.

    Returns the batch-size setting plus one text setting per string prompt
    in ``nano_graphrag.prompt.PROMPTS`` whose name contains none of the
    blacklisted keywords. If the import fails, the error is printed and an
    empty dict is returned.
    """
    try:
        from nano_graphrag.prompt import PROMPTS

        excluded_keywords = ["default", "response", "process"]

        user_settings = {
            "batch_size": {
                "name": "Index batch size (reduce if you have rate limit issues)",
                "value": INDEX_BATCHSIZE,
                "component": "number",
            }
        }

        for prompt_name, content in PROMPTS.items():
            lowered_name = prompt_name.lower()
            if any(keyword in lowered_name for keyword in excluded_keywords):
                continue
            if not isinstance(content, str):
                continue
            user_settings[prompt_name] = {
                "name": f"Prompt for '{prompt_name}'",
                "value": content,
                "component": "text",
            }

        return user_settings
    except ImportError as e:
        print(e)
        return {}
This can take a long time.", ) # Check if graph already exists graph_file = input_path / "graph_chunk_entity_relation.graphml" is_incremental = graph_file.exists() # Only clear cache if it's a new graph if not is_incremental: json_files = glob.glob(f"{input_path}/*.json") for json_file in json_files: os.remove(json_file) # Initialize or load existing GraphRAG graphrag_func = build_graphrag( input_path, llm_func=llm_func, embedding_func=embedding_func, ) total_docs = len(all_docs) process_doc_count = 0 yield Document( channel="debug", text=( f"[GraphRAG] {'Updating' if is_incremental else 'Creating'} index: " f"{process_doc_count} / {total_docs} documents." ), ) for doc_id in range(0, len(all_docs), self.index_batch_size): cur_docs = all_docs[doc_id : doc_id + self.index_batch_size] combined_doc = "\n".join(cur_docs) # Use insert for incremental updates graphrag_func.insert(combined_doc) process_doc_count += len(cur_docs) yield Document( channel="debug", text=( f"[GraphRAG] {'Updated' if is_incremental else 'Indexed'} " f"{process_doc_count} / {total_docs} documents." 
def _build_graph_search(self):
    """Load the graph for the first selected file and build query params.

    The graph id is looked up in ``self.Index`` using ``self.file_ids[0]``
    and the "graph" relation type.

    Returns:
        A tuple ``(graphrag_func, query_params)``.

    Raises:
        AssertionError: if no graph id is recorded for the file id.
    """
    file_id = self.file_ids[0]

    # retrieve the graph_id from the index
    with Session(engine) as session:
        row = (
            session.query(self.Index.target_id)
            .filter(self.Index.source_id == file_id)
            .filter(self.Index.relation_type == "graph")
            .first()
        )
        graph_id = row[0] if row else None
        if not graph_id:
            # Raised explicitly (instead of via ``assert``) so the check is
            # not stripped when Python runs with -O; the exception type is
            # kept as AssertionError for existing callers.
            raise AssertionError(
                f"GraphRAG index not found for file_id: {file_id}"
            )

    _, input_path = prepare_graph_index_path(graph_id)
    input_path.mkdir(parents=True, exist_ok=True)

    llm_func, embedding_func, _, _ = get_default_models_wrapper()
    graphrag_func = build_graphrag(
        input_path,
        llm_func=llm_func,
        embedding_func=embedding_func,
    )
    print("search_type", self.search_type)
    # Only the retrieved context is requested, not a generated answer.
    query_params = QueryParam(mode=self.search_type, only_need_context=True)

    return graphrag_func, query_params
context: str = "" # entities current parsing error header = "Entities\n" context = entities[["entity", "description"]].to_markdown(index=False) docs.append(self._to_document(header, context)) header = "\nRelationships\n" context = relationships[["source", "target", "description"]].to_markdown( index=False ) docs.append(self._to_document(header, context)) header = "\nReports\n" context = "" for _, row in reports.iterrows(): title, content = row["id"], row["content"] # not contain title context += f"\n\n
Report {title}
\n" context += content docs.append(self._to_document(header, context)) header = "\nSources\n" context = "" for _, row in sources.iterrows(): title, content = row["id"], row["content"] context += f"\n\n
Source #{title}
def run(
    self,
    text: str,
) -> list[RetrievedDocument]:
    """Retrieve graph context for ``text``.

    Returns an empty list when no file is selected. In "local" mode the
    context tables are formatted into documents and a plot document is
    appended; in any other mode the graph's own query result is returned
    as a single table document.
    """
    if not self.file_ids:
        return []

    graphrag_func, query_params = self._build_graph_search()

    if query_params.mode != "local":
        raw_context = graphrag_func.query(text, query_params)
        file_name = "GraphRAG {} Search".format(query_params.mode.capitalize())
        return [
            RetrievedDocument(
                text=raw_context,
                metadata={
                    "file_name": file_name,
                    "type": "table",
                },
            )
        ]

    # only local mode support graph visualization
    local_context = asyncio.run(
        nano_graph_rag_build_local_query_context(graphrag_func, text, query_params)
    )
    entities, relationships, reports, sources = local_context
    documents = self.format_context_records(
        entities, relationships, reports, sources
    )
    plot = self.plot_graph(relationships)
    documents.append(
        RetrievedDocument(
            text="",
            metadata={
                "file_name": "GraphRAG",
                "type": "plot",
                "data": plot,
            },
        )
    )
    return documents
def write_docs_to_files(self, graph_id: str, docs: list[Document]):
    """Write each text document to ``<input_path>/<doc_id>.txt``.

    Documents whose metadata ``type`` is not "text" (the default when the
    key is absent) are skipped. Files are written as UTF-8 rather than the
    platform default encoding, so text outside the locale's code page does
    not fail to encode.

    Returns:
        The root path of the graph index for ``graph_id``.
    """
    root_path, input_path = prepare_graph_index_path(graph_id)
    input_path.mkdir(parents=True, exist_ok=True)

    for doc in docs:
        if doc.metadata.get("type", "text") == "text":
            with open(input_path / f"{doc.doc_id}.txt", "w", encoding="utf-8") as f:
                f.write(doc.text)

    return root_path
def call_graphrag_index(self, graph_id: str, all_docs: list[Document]):
    """Run the GraphRAG indexer on the given documents and stream its output.

    The documents are first written to the graph's folder. The indexer is
    then invoked twice: once with ``--init`` to initialise the project, and
    once without it to build the index.

    Args:
        graph_id: identifier of the graph, selects the working folder.
        all_docs: documents to index.

    Yields:
        ``Document`` objects on the "debug" channel: a start notice, then one
        document per line printed by the indexing process.

    Raises:
        ValueError: if ``GRAPHRAG_API_KEY`` is not set.
    """
    import sys

    if not check_graphrag_api_key():
        raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

    # call GraphRAG index with docs and graph_id
    input_path = self.write_docs_to_files(graph_id, all_docs)
    input_path = str(input_path.absolute())

    # Use the interpreter that runs this application so that the subprocess
    # sees the same installed packages; fall back to "python" from PATH.
    index_command = [
        sys.executable or "python",
        "-m",
        "graphrag.index",
        "--root",
        input_path,
        "--reporter",
        "rich",
    ]

    yield Document(
        channel="debug",
        text="[GraphRAG] Creating index... This can take a long time.",
    )

    # First pass: initialise the GraphRAG project in the graph folder.
    result = subprocess.run(
        index_command + ["--init"], capture_output=True, text=True
    )
    print(result.stdout)

    # copy customized GraphRAG config file if it exists
    if config("USE_CUSTOMIZED_GRAPHRAG_SETTING", default="value").lower() == "true":
        setting_file_path = os.path.join(os.getcwd(), "settings.yaml.example")
        destination_file_path = os.path.join(input_path, "settings.yaml")
        try:
            shutil.copy(setting_file_path, destination_file_path)
        except OSError:
            # OSError covers a missing or unreadable source file as well as
            # shutil.Error, so a failed copy never aborts the indexing.
            print("failed to copy customized GraphRAG config file. ")

    # Second pass: run the indexing itself and stream stdout
    with subprocess.Popen(index_command, stdout=subprocess.PIPE, text=True) as process:
        if process.stdout:
            for line in process.stdout:
                yield Document(channel="debug", text=line)
2 # read nodes table to get community and degree data entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") entity_embedding_df = pd.read_parquet( f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet" ) entities = read_indexer_entities( entity_df, entity_embedding_df, COMMUNITY_LEVEL ) # load description embeddings to an in-memory lancedb vectorstore # to connect to a remote db, specify url and port values. description_embedding_store = LanceDBVectorStore( collection_name="entity_description_embeddings", ) description_embedding_store.connect(db_uri=LANCEDB_URI) if Path(LANCEDB_URI).is_dir(): rmtree(LANCEDB_URI) _ = store_entity_semantic_embeddings( entities=entities, vectorstore=description_embedding_store ) print(f"Entity count: {len(entity_df)}") # Read relationships relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet") relationships = read_indexer_relationships(relationship_df) # Read community reports report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL) # Read text units text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet") text_units = read_indexer_text_units(text_unit_df) # initialize default settings embedding_model = os.getenv( "GRAPHRAG_EMBEDDING_MODEL", "text-embedding-3-small" ) embedding_api_key = os.getenv("GRAPHRAG_API_KEY") embedding_api_base = None # use customized GraphRAG settings if the flag is set if config("USE_CUSTOMIZED_GRAPHRAG_SETTING", default="value").lower() == "true": settings_yaml_path = Path(root_path) / "settings.yaml" with open(settings_yaml_path, "r") as f: settings = yaml.safe_load(f) if settings["embeddings"]["llm"]["model"]: embedding_model = settings["embeddings"]["llm"]["model"] if settings["embeddings"]["llm"]["api_key"]: embedding_api_key = settings["embeddings"]["llm"]["api_key"] if settings["embeddings"]["llm"]["api_base"]: embedding_api_base = 
def _to_document(self, header: str, context_text: str) -> RetrievedDocument:
    """Wrap one section of the retrieved context into a RetrievedDocument.

    Args:
        header: section title, stored as the ``file_name`` metadata.
        context_text: content of the section.

    Returns:
        A document of type "table" with score and ``llm_trulens_score`` of 1.0.
    """
    section_metadata = {
        "file_name": header,
        "type": "table",
        "llm_trulens_score": 1.0,
    }
    return RetrievedDocument(text=context_text, metadata=section_metadata, score=1.0)
Report {title}
\n" context += content docs.append(self._to_document(header, context)) header = "\nSources\n" context = "" for idx, row in sources.iterrows(): title, content = row["id"], row["text"] context += f"\n\n
Source #{title}
def run(
    self,
    text: str,
) -> list[RetrievedDocument]:
    """Retrieve GraphRAG context for a query.

    Args:
        text: the query.

    Returns:
        The documents produced by ``format_context_records`` followed by one
        document of type "plot" that carries the graph visualisation. An
        empty list when no file is selected.

    Raises:
        ValueError: if ``GRAPHRAG_API_KEY`` is not set.
    """
    if not self.file_ids:
        return []
    if not check_graphrag_api_key():
        raise ValueError(GRAPHRAG_KEY_MISSING_MESSAGE)

    search_context = self._build_graph_search()

    build_options = dict(
        text_unit_prop=0.5,
        community_prop=0.1,
        conversation_history_max_turns=5,
        conversation_history_user_turns_only=True,
        top_k_mapped_entities=10,
        top_k_relationships=10,
        include_entity_rank=False,
        include_relationship_weight=False,
        include_community_rank=False,
        return_candidate_context=False,
        # set this to EntityVectorStoreKey.TITLE
        # if the vectorstore uses entity title as ids
        embedding_vectorstore_key=EntityVectorStoreKey.ID,
        # change this based on the token limit you have on your model
        # (if you are using a model with 8k limit, a good setting could be 5000)
        max_tokens=12_000,
    )

    # only the structured records are used; the flattened text is ignored
    _, context_records = search_context.build_context(
        query=text,
        conversation_history=None,
        **build_options,
    )

    documents = self.format_context_records(context_records)
    graph_plot = self.plot_graph(context_records)
    plot_document = RetrievedDocument(
        text="",
        metadata={
            "file_name": "GraphRAG",
            "type": "plot",
            "data": graph_plot,
        },
    )
    return documents + [plot_document]
def visualize_graph(G):
    """Render a graph as a Plotly figure and return it serialised to JSON.

    Node positions come from a 2D spring layout. Each node is labelled with
    its identifier; its marker colour follows the number of neighbours and
    its size is 15, 30 or 60 for fewer than 5, fewer than 10, or more
    neighbours. Edges show their "description" attribute on hover.

    Args:
        G: the graph to draw, e.g. the result of ``create_knowledge_graph``.

    Returns:
        The figure as produced by ``plotly.io.to_json``.
    """
    pos = nx.spring_layout(G, dim=2)
    edge_descriptions = nx.get_edge_attributes(G, "description")

    edge_x = []
    edge_y = []
    edge_hover_texts = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        # Every edge is its own segment: two end points followed by a None
        # entry that breaks the line before the next edge starts.
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))
        # ``text`` is matched to the coordinates position by position, so it
        # needs the same three slots per edge. Edges without a description
        # get an empty hover text instead of raising KeyError.
        description = edge_descriptions.get(edge, "")
        edge_hover_texts.extend((description, description, None))

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        text=edge_hover_texts,
        line=dict(width=0.5, color="#888"),
        hoverinfo="text",
        mode="lines",
    )

    node_x = []
    node_y = []
    node_degrees = []
    node_labels = []
    node_sizes = []
    for node, neighbours in G.adjacency():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        degree = len(neighbours)
        node_degrees.append(degree)
        node_labels.append(node)
        node_sizes.append(15 if degree < 5 else (30 if degree < 10 else 60))

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        textfont=dict(
            family="Courier New, monospace",
            size=10,  # Set the font size here
        ),
        textposition="top center",
        mode="markers+text",
        hoverinfo="text",
        text=node_labels,
        marker=dict(
            showscale=True,
            # colorscale options
            size=node_sizes,
            colorscale="YlGnBu",
            reversescale=True,
            color=node_degrees,
            colorbar=dict(
                thickness=5,
                xanchor="left",
                # nested form of the older flat ``titleside`` attribute,
                # which newer Plotly releases no longer accept
                title=dict(side="right"),
            ),
            line_width=2,
        ),
    )

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            showlegend=False,
            hovermode="closest",
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        ),
    )
    fig.update_layout(autosize=True)
    return to_json(fig)
def generate_uuid():
    """Return a new random (version 4) UUID as a string."""
    fresh_id = uuid.uuid4()
    return f"{fresh_id}"
def _setup_resources(self):
    """Setup resources for the file index

    The resources include:
        - Database table
        - Vector store
        - Document store
        - File storage path

    Three SQLAlchemy models are declared with table names derived from the
    index id (``Source``, ``Index`` and ``FileGroup``). Together with the
    vector store, the document store and the storage folder they are made
    available through ``self._resources``.

    In a private index a file name must be unique per user; otherwise it
    must be unique across the whole index.
    """
    Base = declarative_base()

    def new_uuid():
        return str(uuid.uuid4())

    def local_now():
        # Handed to SQLAlchemy as a callable so the timestamp is taken when a
        # row is inserted. A pre-computed datetime would stamp every row with
        # the moment this method ran.
        return datetime.now(get_localzone())

    is_private = self.config.get("private", False)

    source_attrs = {
        "__tablename__": f"index__{self.id}__source",
        "id": Column(
            String,
            primary_key=True,
            default=new_uuid,
            unique=True,
        ),
        # private index: uniqueness is enforced on (name, user) below
        "name": Column(String) if is_private else Column(String, unique=True),
        "path": Column(String),
        "size": Column(Integer, default=0),
        "date_created": Column(DateTime(timezone=True), default=local_now),
        "user": Column(String, default=""),
        "note": Column(
            MutableDict.as_mutable(JSON),  # type: ignore
            default={},
        ),
    }
    if is_private:
        source_attrs["__table_args__"] = (
            UniqueConstraint("name", "user", name="_name_user_uc"),
        )
    Source = type("Source", (Base,), source_attrs)

    Index = type(
        "IndexTable",
        (Base,),
        {
            "__tablename__": f"index__{self.id}__index",
            "id": Column(Integer, primary_key=True, autoincrement=True),
            "source_id": Column(String),
            "target_id": Column(String),
            "relation_type": Column(String),
            "user": Column(String, default=""),
        },
    )

    FileGroup = type(
        "FileGroupTable",
        (Base,),
        {
            "__tablename__": f"index__{self.id}__group",
            "__table_args__": (
                UniqueConstraint("name", "user", name="_name_user_uc"),
            ),
            "id": Column(
                String,
                primary_key=True,
                default=new_uuid,
                unique=True,
            ),
            "date_created": Column(DateTime(timezone=True), default=local_now),
            "name": Column(String),
            "user": Column(String, default=""),
            "data": Column(
                MutableDict.as_mutable(JSON),  # type: ignore
                default={"files": []},
            ),
        },
    )

    self._vs: BaseVectorStore = get_vectorstore(f"index_{self.id}")
    self._docstore: BaseDocumentStore = get_docstore(f"index_{self.id}")
    self._fs_path = filestorage_path / f"index_{self.id}"
    self._resources = {
        "Source": Source,
        "Index": Index,
        "FileGroup": FileGroup,
        "VectorStore": self._vs,
        "DocStore": self._docstore,
        "FileStoragePath": self._fs_path,
    }
def _setup_file_selector_ui_cls(self):
    """Resolve the file selector UI class of the file index

    There is only one file selector UI class. It is looked up in the
    following order, stopping at the first match:
        - `FILE_INDEX_SELECTOR_UI` in self.config
        - `FILE_INDEX_{id}_SELECTOR_UI` in the flowsettings
        - `FILE_INDEX_SELECTOR_UI` in the flowsettings
        - The default .ui.FileSelector
    """
    generic_name = "FILE_INDEX_SELECTOR_UI"

    if generic_name in self.config:
        self._selector_ui_cls = import_dotted_string(
            self.config[generic_name], safe=False
        )
        return

    # flowsettings: the index-specific entry wins over the generic one
    for setting_name in (f"FILE_INDEX_{self.id}_SELECTOR_UI", generic_name):
        if hasattr(flowsettings, setting_name):
            self._selector_ui_cls = import_dotted_string(
                getattr(flowsettings, setting_name), safe=False
            )
            break
    else:
        # nothing configured anywhere: use the built-in selector
        from .ui import FileSelector

        self._selector_ui_cls = FileSelector
def on_delete(self):
    """Clean up the index when the user delete it

    Drops the Source, Index and FileGroup tables, the vector store and the
    document store, then removes the file storage folder of the index.

    A table that no longer exists or a storage folder that is already gone
    is skipped, so a clean-up that was interrupted can be run again.
    """
    import shutil

    self._setup_resources()
    for resource_name in ("Source", "Index", "FileGroup"):
        # checkfirst=True only issues DROP when the table is present
        self._resources[resource_name].__table__.drop(  # type: ignore
            engine, checkfirst=True
        )
    self._vs.drop()
    self._docstore.drop()
    try:
        shutil.rmtree(self._fs_path)
    except FileNotFoundError:
        # nothing was ever stored for this index, or it was removed already
        pass
"info": "The name of embedding model to use.", }, "supported_file_types": { "name": "Supported file types", "value": ".pdf, .txt", "component": "text", "info": "The file types that can be indexed, separated by comma.", }, "max_file_size": { "name": "Max file size (MB)", "value": 1000, "component": "number", "info": "The maximum size of file. Set 0 to disable.", }, "max_number_of_files": { "name": "Max number of files that can be indexed", "value": 0, "component": "number", "info": ( "The total number of files that can be indexed on the system. " "Set 0 to disable." ), }, "private": { "name": "Make private", "value": False, "component": "radio", "choices": [("Yes", True), ("No", False)], "info": "If private, files will not be accessible across users.", }, "chunk_size": { "name": "Size of chunk (number of tokens)", "value": 0, "component": "number", "info": ( "Number of tokens of each text segment. " "Set 0 to use developer setting." ), }, "chunk_overlap": { "name": "Number of overlapping tokens between chunks", "value": 0, "component": "number", "info": ( "Number of tokens that consecutive text segments " "should overlap with each other. " "Set 0 to use developer setting." ), }, } def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing: """Define the interface of the indexing pipeline""" prefix = f"index.options.{self.id}." 
def get_retriever_pipelines(
    self, settings: dict, user_id: int, selected: Any = None
) -> list["BaseFileIndexRetriever"]:
    """Build the retriever pipelines of this index.

    Args:
        settings: application settings; only the keys that start with
            ``index.options.{id}.`` are passed on, with that prefix removed.
        user_id: id assigned to every returned retriever.
        selected: selection value, converted into ids by the selector UI.

    Returns:
        One configured retriever per retriever class, omitting the classes
        whose ``get_pipeline`` returned None.
    """
    prefix = f"index.options.{self.id}."
    index_settings = {
        name[len(prefix) :]: value
        for name, value in settings.items()
        if name.startswith(prefix)
    }

    # transform selected id
    selected_ids: Optional[list[str]] = self._selector_ui.get_selected_ids(selected)

    pipelines = []
    for retriever_cls in self._retriever_pipeline_cls:
        retriever = retriever_cls.get_pipeline(
            index_settings, self.config, selected_ids
        )
        if retriever is not None:
            retriever.Source = self._resources["Source"]
            retriever.Index = self._resources["Index"]
            retriever.VS = self._vs
            retriever.DS = self._docstore
            retriever.FSPath = self._fs_path
            retriever.user_id = user_id
            pipelines.append(retriever)

    return pipelines
= super().get_admin_settings() # remove embedding from admin settings # as we don't need it admin_settings.pop("embedding") return admin_settings def _setup_indexing_cls(self): self._indexing_pipeline_cls = KnetIndexingPipeline def _setup_retriever_cls(self): self._retriever_pipeline_cls = [KnetRetrievalPipeline] def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing: """Define the interface of the indexing pipeline""" obj = super().get_indexing_pipeline(settings, user_id) # disable vectorstore for this kind of Index # also set the collection_name for API call obj.VS = None obj.collection_name = f"kh_index_{self.id}" return obj def get_retriever_pipelines( self, settings: dict, user_id: int, selected: Any = None ) -> list["BaseFileIndexRetriever"]: retrievers = super().get_retriever_pipelines(settings, user_id, selected) for obj in retrievers: # disable vectorstore for this kind of Index # also set the collection_name for API call obj.VS = None obj.collection_name = f"kh_index_{self.id}" return retrievers ================================================ FILE: libs/ktem/ktem/index/file/knet/pipelines.py ================================================ import base64 import json import os from pathlib import Path from typing import Optional, Sequence import requests import yaml from kotaemon.base import RetrievedDocument from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring from ..pipelines import BaseFileIndexRetriever, IndexDocumentPipeline, IndexPipeline class KnetIndexingPipeline(IndexDocumentPipeline): """Knowledge Network specific indexing pipeline""" # collection name for external indexing call collection_name: str = "default" @classmethod def get_user_settings(cls): return { "reader_mode": { "name": "Index parser", "value": "knowledge_network", "choices": [ ("Default (KN)", "knowledge_network"), ], "component": "dropdown", }, } def route(self, file_path: str | Path) -> IndexPipeline: """Simply disable the 
splitter (chunking) for this pipeline""" pipeline = super().route(file_path) pipeline.splitter = None # assign IndexPipeline collection name to parse to loader pipeline.collection_name = self.collection_name return pipeline class KnetRetrievalPipeline(BaseFileIndexRetriever): DEFAULT_KNET_ENDPOINT: str = "http://127.0.0.1:8081/retrieve" collection_name: str = "default" rerankers: Sequence[BaseReranking] = [LLMReranking.withx()] def encode_image_base64(self, image_path: str | Path) -> bytes | str: """Convert image to base64""" img_base64 = "data:image/png;base64,{}" with open(image_path, "rb") as image_file: return img_base64.format( base64.b64encode(image_file.read()).decode("utf-8") ) def run( self, text: str, doc_ids: Optional[list[str]] = None, *args, **kwargs, ) -> list[RetrievedDocument]: """Retrieve document excerpts similar to the text Args: text: the text to retrieve similar documents doc_ids: list of document ids to constraint the retrieval """ print("searching in doc_ids", doc_ids) if not doc_ids: return [] docs: list[RetrievedDocument] = [] params = { "query": text, "collection": self.collection_name, "meta_filters": {"doc_name": doc_ids}, } params["meta_filters"] = json.dumps(params["meta_filters"]) response = requests.get(self.DEFAULT_KNET_ENDPOINT, params=params) metadata_translation = { "TABLE": "table", "FIGURE": "image", } if response.status_code == 200: # Load YAML content from the response content chunks = yaml.safe_load(response.content) for chunk in chunks: metadata = chunk["node"]["metadata"] metadata["type"] = metadata_translation.get( metadata.pop("content_type", ""), "" ) metadata["file_name"] = metadata.pop("company_name", "") # load image from returned path image_path = metadata.get("image_path", "") if image_path and os.path.isfile(image_path): base64_im = self.encode_image_base64(image_path) # explicitly set document type metadata["type"] = "image" metadata["image_origin"] = base64_im docs.append( 
@classmethod
def get_pipeline(cls, user_settings, index_settings, selected):
    """Get retriever objects associated with the index

    Args:
        user_settings: the user settings; "reranking_llm" names the LLM
            given to the LLM-based rerankers
        index_settings: the settings of the index (not used here)
        selected: the selected document ids, bound to the ``doc_ids``
            argument of ``run``

    Returns:
        The configured retriever.
    """
    from ktem.llms.manager import llms

    pipeline = cls(
        rerankers=[LLMTrulensScoring()],
    )

    # hacky way to input doc_ids to retriever.run() call (through theflow)
    pipeline.set_run({".doc_ids": selected}, temp=False)

    llm_rerankers = (
        each for each in pipeline.rerankers if isinstance(each, LLMReranking)
    )
    for llm_reranker in llm_rerankers:
        llm_reranker.llm = llms.get(
            user_settings["reranking_llm"], llms.get_default()
        )

    return pipeline
@lru_cache
def dev_settings():
    """Retrieve the developer settings from flowsettings.py

    Returns:
        A tuple ``(file_extractors, chunk_size, chunk_overlap)``.
        ``file_extractors`` has one entry per key of
        ``FILE_INDEX_PIPELINE_FILE_EXTRACTORS``: the object obtained by
        importing the dotted path stored under that key and calling it. It is
        empty when the setting is absent. ``chunk_size`` and
        ``chunk_overlap`` are None when their setting is absent.
    """
    extractor_paths = getattr(settings, "FILE_INDEX_PIPELINE_FILE_EXTRACTORS", {})
    file_extractors = {}
    for key, dotted_path in extractor_paths.items():
        extractor_factory = import_dotted_string(dotted_path, safe=False)
        file_extractors[key] = extractor_factory()

    chunk_size = getattr(settings, "FILE_INDEX_PIPELINE_SPLITTER_CHUNK_SIZE", None)
    chunk_overlap = getattr(
        settings, "FILE_INDEX_PIPELINE_SPLITTER_CHUNK_OVERLAP", None
    )

    return file_extractors, chunk_size, chunk_overlap
def run(
    self,
    text: str,
    doc_ids: Optional[list[str]] = None,
    *args,
    **kwargs,
) -> list[RetrievedDocument]:
    """Retrieve document excerpts similar to the text

    Args:
        text: the text to retrieve similar documents
        doc_ids: list of document ids to constraint the retrieval. An entry
            starting with ``[`` is treated as a JSON-encoded list of ids (a
            group) and is expanded in place.

    Returns:
        The retrieved documents; an empty list when no document is selected.
        When ``get_extra_table`` is set, documents found on the same pages as
        the hits are appended.

    Raises:
        ValueError: if ``doc_ids`` contains ``None``.
    """
    # flatten doc_ids in case of group of doc_ids are passed
    if doc_ids:
        flatten_doc_ids = []
        for doc_id in doc_ids:
            if doc_id is None:
                raise ValueError("No document is selected")

            if doc_id.startswith("["):
                flatten_doc_ids.extend(json.loads(doc_id))
            else:
                flatten_doc_ids.append(doc_id)
        doc_ids = flatten_doc_ids

    print("searching in doc_ids", doc_ids)
    if not doc_ids:
        logger.info(f"Skip retrieval because of no selected files: {self}")
        return []

    retrieval_kwargs: dict = {}
    with Session(engine) as session:
        stmt = select(self.Index).where(
            self.Index.relation_type == "document",
            self.Index.source_id.in_(doc_ids),
        )
        results = session.execute(stmt)
        chunk_ids = [r[0].target_id for r in results.all()]

    # do first round top_k extension
    retrieval_kwargs["do_extend"] = True
    retrieval_kwargs["scope"] = chunk_ids
    retrieval_kwargs["filters"] = MetadataFilters(
        filters=[
            MetadataFilter(
                key="file_id",
                value=doc_ids,
                operator=FilterOperator.IN,
            )
        ],
        condition=FilterCondition.OR,
    )

    if self.mmr:
        # TODO: double check that llama-index MMR works correctly
        retrieval_kwargs["mode"] = VectorStoreQueryMode.MMR
        retrieval_kwargs["mmr_threshold"] = 0.5

    # rerank
    s_time = time.time()
    print(f"retrieval_kwargs: {retrieval_kwargs.keys()}")
    docs = self.vector_retrieval(text=text, top_k=self.top_k, **retrieval_kwargs)
    print("retrieval step took", time.time() - s_time)

    if not self.get_extra_table:
        return docs

    # retrieve extra nodes relate to table
    table_pages = defaultdict(list)
    retrieved_id = {doc.doc_id for doc in docs}
    for doc in docs:
        if "page_label" not in doc.metadata:
            continue
        if "file_name" not in doc.metadata:
            warnings.warn(
                "file_name not in metadata while page_label is in metadata: "
                f"{doc.metadata}"
            )
            # without a file name the page cannot be addressed; skipping avoids
            # the KeyError the lookup below would otherwise raise
            continue
        table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"])

    queries: list[dict] = [
        {"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]}
        for fn, pls in table_pages.items()
    ]
    if queries:
        try:
            extra_docs = self.vector_retrieval(
                text="",
                top_k=50,
                where=queries[0] if len(queries) == 1 else {"$or": queries},
            )
            for doc in extra_docs:
                if doc.doc_id not in retrieved_id:
                    docs.append(doc)
        except Exception:
            # best effort: the primary results are still returned, but keep
            # the traceback so the failure can be diagnosed
            logger.exception("Error retrieving additional tables")

    return docs
@classmethod
def get_pipeline(cls, user_settings, index_settings, selected):
    """Build a retrieval pipeline from the user and index settings.

    Args:
        user_settings: user-level settings (retrieval mode, top-k, MMR,
            reranking switches and the LLM used for scoring).
        index_settings: index-level settings; optional ``"embedding"`` and
            ``"reranking"`` entries name the models to use, otherwise the
            managers' defaults apply.
        selected: value injected as the ``doc_ids`` argument of ``run``.

    Returns:
        The configured retriever instance.
    """
    want_llm_scores = user_settings.get("use_llm_reranking", False)

    # values are resolved in the same order the constructor arguments were
    # originally evaluated, so a bad setting surfaces the same error
    prioritize_table = user_settings["prioritize_table"]
    num_retrieval = user_settings["num_retrieval"]
    use_mmr = user_settings["mmr"]
    embedding_name = index_settings.get(
        "embedding", embedding_models_manager.get_default_name()
    )
    embedding_model = embedding_models_manager[embedding_name]
    mode = user_settings["retrieval_mode"]
    scorer = LLMTrulensScoring() if want_llm_scores else None
    reranking_name = index_settings.get(
        "reranking", reranking_models_manager.get_default_name()
    )
    reranking_model = reranking_models_manager[reranking_name]

    pipeline = cls(
        get_extra_table=prioritize_table,
        top_k=num_retrieval,
        mmr=use_mmr,
        embedding=embedding_model,
        retrieval_mode=mode,
        llm_scorer=scorer,
        rerankers=[reranking_model],
    )
    if not user_settings["use_reranking"]:
        pipeline.rerankers = []  # type: ignore

    for each in pipeline.rerankers:
        if not isinstance(each, LLMReranking):
            continue
        each.llm = llms.get(user_settings["reranking_llm"], llms.get_default())

    if pipeline.llm_scorer:
        pipeline.llm_scorer.llm = llms.get(
            user_settings["reranking_llm"], llms.get_default()
        )

    # doc_ids are passed to run() through theflow's run-parameter mechanism
    pipeline.set_run({".doc_ids": selected}, temp=False)
    return pipeline
def handle_docs(self, docs, file_id, file_name) -> Generator[Document, None, int]:
    """Split the loaded documents and index them, reporting progress.

    Documents are grouped by ``metadata["type"]`` (default ``"text"``): text
    documents go through the splitter when one is set, thumbnails are linked
    to the text chunks sharing their ``page_label``, and everything is then
    written to the doc store and to the vector store in batches.

    Args:
        docs: the documents produced by the loader.
        file_id: id of the source record the chunks belong to.
        file_name: name used in the progress messages.

    Yields:
        Debug-channel ``Document`` progress messages.

    Returns:
        The number of chunks written to the doc store.
    """
    started_at = time.time()

    by_type: dict = {"text": [], "thumbnail": []}
    others: list = []
    for item in docs:
        by_type.get(item.metadata.get("type", "text"), others).append(item)
    texts, thumbnails = by_type["text"], by_type["thumbnail"]

    print(f"Got {len(thumbnails)} page thumbnails")
    thumbnail_by_page = {}
    for thumb in thumbnails:
        thumbnail_by_page[thumb.metadata["page_label"]] = thumb.doc_id

    split_chunks = self.splitter(texts) if self.splitter else texts

    # add the thumbnails doc_id to the chunks
    for piece in split_chunks:
        label = piece.metadata.get("page_label", None)
        if label and label in thumbnail_by_page:
            piece.metadata["thumbnail_doc_id"] = thumbnail_by_page[label]

    pending = split_chunks + others + thumbnails
    total = len(pending)

    # add to doc store, in batches four times larger than the embedding batches
    n_chunks = 0
    docstore_step = self.chunk_batch_size * 4
    for offset in range(0, total, docstore_step):
        batch = pending[offset : offset + docstore_step]
        self.handle_chunks_docstore(batch, file_id)
        n_chunks += len(batch)
        yield Document(
            f" => [{file_name}] Processed {n_chunks} chunks",
            channel="debug",
        )

    def insert_chunks_to_vectorstore():
        embedded = 0
        step = self.chunk_batch_size
        for offset in range(0, total, step):
            batch = pending[offset : offset + step]
            self.handle_chunks_vectorstore(batch, file_id)
            embedded += len(batch)
            if self.VS:
                yield Document(
                    f" => [{file_name}] Created embedding for {embedded} chunks",
                    channel="debug",
                )

    # run vector indexing in thread if specified
    if not self.run_embedding_in_thread:
        yield from insert_chunks_to_vectorstore()
    else:
        print("Running embedding in thread")
        # the generator is drained in the worker; its messages are discarded
        worker = threading.Thread(
            target=lambda: list(insert_chunks_to_vectorstore())
        )
        worker.start()

    print("indexing step took", time.time() - started_at)
    return n_chunks
def store_file(self, file_path: Path) -> str:
    """Store file into the database and storage, return the file id

    The file is copied into ``self.FSPath`` under the name of its SHA-256
    content hash and a ``Source`` row describing it is committed.

    Args:
        file_path: the path to the file

    Returns:
        the file id
    """
    digest = sha256()
    with file_path.open("rb") as fi:
        # hash in 1 MiB pieces so a large upload is never held in memory whole;
        # the resulting digest is identical to hashing the full content at once
        for piece in iter(lambda: fi.read(1024 * 1024), b""):
            digest.update(piece)
    file_hash = digest.hexdigest()

    shutil.copy(file_path, self.FSPath / file_hash)
    source = self.Source(
        name=file_path.name,
        path=file_hash,
        size=file_path.stat().st_size,
        user=self.user_id,  # type: ignore
    )
    with Session(engine) as session:
        session.add(source)
        session.commit()
        # read the id while the session is still open
        file_id = source.id

    return file_id
def stream(
    self, file_path: str | Path, reindex: bool, **kwargs
) -> Generator[Document, None, tuple[str, list[Document]]]:
    """Index one file or URL, yielding progress messages along the way.

    Args:
        file_path: a ``Path`` for a local file, or a string for a URL.
        reindex: for local files, replace an existing record instead of
            failing. URLs that are already indexed always fail.

    Yields:
        Debug-channel ``Document`` progress messages.

    Returns:
        A ``(file_id, docs)`` tuple with the stored id and the loaded docs.

    Raises:
        ValueError: if the file is already indexed and ``reindex`` is false,
            or if the URL is already indexed.
    """
    is_local_file = isinstance(file_path, Path)
    if is_local_file:
        file_path = file_path.resolve()

    # check if the file is already indexed
    existing_id = self.get_id_if_exists(file_path)

    if is_local_file:
        if existing_id is not None and not reindex:
            raise ValueError(
                f"File {file_path.name} already indexed. Please rerun with "
                "reindex=True to force reindexing."
            )
        if existing_id is not None:
            # remove the existing records
            yield Document(f" => Removing old {file_path.name}", channel="debug")
            self.delete_file(existing_id)
        # add record to db
        file_id = self.store_file(file_path)
        extra_info = default_file_metadata_func(str(file_path))
        file_name = file_path.name
    else:
        if existing_id is not None:
            raise ValueError(f"URL {file_path} already indexed.")
        # add record to db
        file_id = self.store_url(file_path)
        extra_info = {"file_name": file_path}
        file_name = file_path

    extra_info["file_id"] = file_id
    extra_info["collection_name"] = self.collection_name

    # extract the file
    yield Document(f" => Converting {file_name} to text", channel="debug")
    docs = self.loader.load_data(file_path, extra_info=extra_info)
    yield Document(f" => Converted {file_name} to text", channel="debug")

    yield from self.handle_docs(docs, file_id, file_name)
    self.finish(file_id, file_path)

    yield Document(f" => Finished indexing {file_name}", channel="debug")
    return file_id, docs
def is_url(self, file_path: str | Path) -> bool:
    """Tell whether ``file_path`` is a string starting with http:// or https://.

    ``Path`` objects (and any other non-string value) are never URLs.
    """
    if not isinstance(file_path, str):
        return False
    return file_path.startswith(("http://", "https://"))
def stream(
    self, file_paths: str | Path | list[str | Path], reindex: bool = False, **kwargs
) -> Generator[
    Document, None, tuple[list[str | None], list[str | None], list[Document]]
]:
    """Index every given file or URL, one after the other.

    A failure on one item is logged and recorded; the remaining items are
    still processed.

    Args:
        file_paths: a single path/URL or a list of them.
        reindex: forwarded to the per-file pipeline.
        **kwargs: forwarded to the per-file pipeline.

    Yields:
        Progress ``Document`` objects on the "debug" channel and one status
        ``Document`` per item on the "index" channel.

    Returns:
        ``(file_ids, errors, all_docs)``. ``file_ids`` and ``errors`` have one
        entry per input, with ``None`` standing for "failed" and "no error"
        respectively.
    """
    targets = file_paths if isinstance(file_paths, list) else [file_paths]

    file_ids: list[str | None] = []
    errors: list[str | None] = []
    all_docs = []

    n_files = len(targets)
    for position, file_path in enumerate(targets, start=1):
        if self.is_url(file_path):
            file_name = file_path
        else:
            file_path = Path(file_path)
            file_name = file_path.name

        yield Document(
            content=f"Indexing [{position}/{n_files}]: {file_name}",
            channel="debug",
        )

        try:
            pipeline = self.route(file_path)
            file_id, docs = yield from pipeline.stream(
                file_path, reindex=reindex, **kwargs
            )
            all_docs.extend(docs)
            file_ids.append(file_id)
            errors.append(None)
            report = {
                "file_path": file_path,
                "file_name": file_name,
                "status": "success",
            }
            yield Document(content=report, channel="index")
        except Exception as e:
            logger.exception(e)
            file_ids.append(None)
            message = str(e)
            errors.append(message)
            yield Document(
                content={
                    "file_path": file_path,
                    "file_name": file_name,
                    "status": "failed",
                    "message": message,
                },
                channel="index",
            )

    return file_ids, errors, all_docs
class File(gr.File):
    """Subclass from gr.File to maintain the original filename

    The issue happens when user uploads file with name like: !@#$%%^&*().pdf
    """

    def _process_single_file(self, f: FileData) -> NamedString | bytes:
        """Return the uploaded file as a path (keeping its name) or as bytes.

        Args:
            f: the uploaded file; ``f.path`` is where it was stored and
                ``f.orig_name`` is the name it was uploaded with.

        Returns:
            For ``type == "filepath"``, a ``NamedString`` with the file's path
            after it has been renamed back to ``f.orig_name``; for
            ``type == "binary"``, the file content.

        Raises:
            ValueError: if ``self.type`` is neither "filepath" nor "binary".
        """
        file_name = f.path
        if self.type == "filepath":
            if f.orig_name and Path(file_name).name != f.orig_name:
                # restore the uploaded name next to the stored copy
                file_name = str(Path(file_name).parent / f.orig_name)
                os.rename(f.path, file_name)
            # NOTE(review): this placeholder is never read here; it is kept to
            # preserve the existing side effect (presumably mirroring gr.File;
            # confirm before removing). Close it right away so the handle is
            # not left open.
            placeholder = tempfile.NamedTemporaryFile(
                delete=False, dir=self.GRADIO_CACHE
            )
            placeholder.close()
            return NamedString(file_name)
        elif self.type == "binary":
            with open(file_name, "rb") as file_data:
                return file_data.read()
        else:
            # report the configured value, not the builtin ``type``
            raise ValueError(
                "Unknown type: "
                + str(self.type)
                + ". Please choose from: 'filepath', 'binary'."
            )
" "(2) Search with empty string to show all files." ), ) self.file_list_state = gr.State(value=None) self.file_list = gr.DataFrame( headers=[ "id", "name", "size", "tokens", "loader", "date_created", ], column_widths=[0, 50, 8, 7, 15, 20], interactive=False, wrap=False, elem_id="file_list_view", ) with gr.Row(): self.chat_button = gr.Button( "Go to Chat", visible=False, ) self.is_zipped_state = gr.State(value=False) self.download_single_button = gr.DownloadButton( "Download", visible=False, ) self.delete_button = gr.Button( "Delete", variant="stop", visible=False, ) self.deselect_button = gr.Button( "Close", visible=False, ) with gr.Row() as self.selection_info: self.selected_file_id = gr.State(value=None) with gr.Column(scale=2): self.selected_panel = gr.Markdown(self.selected_panel_false) self.chunks = gr.HTML(visible=False) with gr.Accordion("Advance options", open=False): with gr.Row(): if not KH_SSO_ENABLED: self.download_all_button = gr.DownloadButton( "Download all files", ) self.delete_all_button = gr.Button( "Delete all files", variant="stop", visible=True, ) self.delete_all_button_confirm = gr.Button( "Confirm delete", variant="stop", visible=False ) self.delete_all_button_cancel = gr.Button("Cancel", visible=False) def render_group_list(self): self.group_list_state = gr.State(value=None) self.group_list = gr.DataFrame( headers=[ "id", "name", "files", "date_created", ], column_widths=[0, 25, 55, 20], interactive=False, wrap=False, ) with gr.Row(): self.group_add_button = gr.Button( "Add", variant="primary", ) self.group_chat_button = gr.Button( "Go to Chat", visible=False, ) self.group_delete_button = gr.Button( "Delete", variant="stop", visible=False, ) self.group_close_button = gr.Button( "Close", visible=False, ) with gr.Column(visible=False) as self._group_info_panel: self.selected_group_id = gr.State(value=None) self.group_label = gr.Markdown() self.group_name = gr.Textbox( label="Group name", placeholder="Group name", lines=1, max_lines=1, ) 
self.group_files = gr.Dropdown( label="Attached files", multiselect=True, ) self.group_save_button = gr.Button( "Save", variant="primary", ) def on_building_ui(self): """Build the UI of the app""" with gr.Row(): with gr.Column(scale=1): with gr.Column() as self.upload: with gr.Tab("Upload Files"): self.files = File( file_types=self._supported_file_types, file_count="multiple", container=True, show_label=False, ) msg = self.upload_instruction() if msg: gr.Markdown(msg) with gr.Tab("Use Web Links"): self.urls = gr.Textbox( label="Input web URLs", lines=8, ) gr.Markdown("(separated by new line)") with gr.Accordion("Advanced indexing options", open=False): with gr.Row(): self.reindex = gr.Checkbox( value=False, label="Force reindex file", container=False ) self.upload_button = gr.Button( "Upload and Index", variant="primary" ) with gr.Column(scale=4): with gr.Column(visible=False) as self.upload_progress_panel: gr.Markdown("## Upload Progress") with gr.Row(): self.upload_result = gr.Textbox( lines=1, max_lines=20, label="Upload result" ) self.upload_info = gr.Textbox( lines=1, max_lines=20, label="Upload info" ) self.btn_close_upload_progress_panel = gr.Button( "Clear Upload Info and Close", variant="secondary", elem_classes=["right-button"], ) with gr.Tab("Files"): self.render_file_list() with gr.Tab("Groups"): self.render_group_list() def on_subscribe_public_events(self): """Subscribe to the declared public event of the app""" if KH_DEMO_MODE: return self._app.subscribe_event( name=f"onFileIndex{self._index.id}Changed", definition={ "fn": self.list_file_names, "inputs": [self.file_list_state], "outputs": [self.group_files], "show_progress": "hidden", }, ) if self._app.f_user_management: self._app.subscribe_event( name="onSignIn", definition={ "fn": self.list_file, "inputs": [self._app.user_id], "outputs": [self.file_list_state, self.file_list], "show_progress": "hidden", }, ) self._app.subscribe_event( name="onSignIn", definition={ "fn": self.list_group, "inputs": 
def file_selected(self, file_id):
    """Render the chunks of the selected file and toggle the file actions.

    Args:
        file_id: id of the selected file, or ``None`` when nothing is
            selected.

    Returns:
        Five ``gr.update`` objects: the first carries the rendered chunk HTML,
        and all five are visible only when a file is selected.
    """

    def page_order(doc):
        """Sort key: numeric page labels, then other labels, then no label."""
        label = doc.metadata.get("page_label")
        if label is None:
            return (2, 0, "")
        try:
            # labels may be ints or digit strings; compare them as numbers so
            # "10" comes after "2" and mixed types never get compared directly
            return (0, int(label), "")
        except (TypeError, ValueError):
            return (1, 0, str(label))

    chunks = []
    if file_id is not None:
        # get the chunks
        Index = self._index._resources["Index"]
        with Session(engine) as session:
            matches = session.execute(
                select(Index).where(
                    Index.source_id == file_id,
                    Index.relation_type == "document",
                )
            )
            doc_ids = [doc.target_id for (doc,) in matches]

        docs = sorted(self._index._docstore.get(doc_ids), key=page_order)

        for idx, doc in enumerate(docs):
            title = html.escape(
                f"{doc.text[:50]}..." if len(doc.text) > 50 else doc.text
            )
            doc_type = doc.metadata.get("type", "text")
            # unknown types are listed with an empty body
            content = ""
            if doc_type == "text":
                content = html.escape(doc.text)
            elif doc_type == "table":
                content = Render.table(doc.text)
            elif doc_type == "image":
                content = Render.image(
                    url=doc.metadata.get("image_origin", ""), text=doc.text
                )

            header_prefix = f"[{idx+1}/{len(docs)}]"
            if doc.metadata.get("page_label"):
                header_prefix += f" [Page {doc.metadata['page_label']}]"

            chunks.append(
                Render.collapsible(
                    header=f"{header_prefix} {title}",
                    content=content,
                )
            )

    has_selection = file_id is not None
    return (
        gr.update(value="".join(chunks), visible=has_selection),
        gr.update(visible=has_selection),
        gr.update(visible=has_selection),
        gr.update(visible=has_selection),
        gr.update(visible=has_selection),
    )
def download_single_file_simple(self, is_zipped_state, file_html, file_id):
    """Export the rendered chunks of a file as an HTML download.

    Args:
        is_zipped_state: whether the download was already prepared; the
            returned state is its negation.
        file_html: the HTML content to write.
        file_id: id of the file whose name is used for the export.

    Returns:
        ``(new_state, button)``: the toggled state and a download button that
        either points at the exported file or is reset to "Download".

    Raises:
        gr.Error: if no source record exists for ``file_id``.
    """
    Source = self._index._resources["Source"]
    with Session(engine) as session:
        source = session.execute(select(Source).where(Source.id == file_id)).first()
        source_name = source[0].name if source else None

    if source_name is None:
        # previously this fell through and failed with an unbound local
        raise gr.Error("The selected file no longer exists.")

    target_file_name = Path(source_name)

    # create a temporary file with a path to export
    output_file_path = os.path.join(
        flowsettings.KH_ZIP_OUTPUT_DIR, target_file_name.stem + ".html"
    )
    # explicit encoding: the platform default may not be able to represent
    # the document text
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(file_html)

    if is_zipped_state:
        new_button = gr.DownloadButton(label="Download", value=None)
    else:
        # export the file path
        new_button = gr.DownloadButton(
            label=DOWNLOAD_MESSAGE,
            value=output_file_path,
        )

    return not is_zipped_state, new_button
def show_delete_all_confirm(self, file_list):
    """Switch the "delete all" controls into confirmation mode.

    Args:
        file_list: the table of listed files (rows carry an ``id`` column).

    Returns:
        Three component updates, in order: the delete-all button, its
        confirm button and its cancel button. When there is nothing to
        delete the delete-all button stays visible and the other two stay
        hidden; otherwise the visibility is swapped.
    """
    # an empty listing is rendered as one placeholder row whose id is "-"
    only_placeholder = len(file_list) == 1 and file_list.id.values[0] == "-"

    if len(file_list) != 0 and not only_placeholder:
        return [gr.update(visible=shown) for shown in (False, True, True)]

    gr.Info("No file to delete")
    return [gr.update(visible=shown) for shown in (True, False, False)]
), outputs=self._app.chat_page.quick_file_upload_status, ) .then( fn=self.index_fn_file_with_default_loaders, inputs=[ self._app.chat_page.quick_file_upload, gr.State(value=False), self._app.settings_state, self._app.user_id, ], outputs=self.quick_upload_state, concurrency_limit=10, ) .success( fn=lambda: [ gr.update(value=None), gr.update(value="select"), ], outputs=[ self._app.chat_page.quick_file_upload, self._app.chat_page._indices_input[0], ], ) ) for event in self._app.get_event( f"onFileIndex{self._index.id}Changed" ): quickUploadedEvent = quickUploadedEvent.then(**event) quickUploadedEvent = ( quickUploadedEvent.success( fn=lambda x: x, inputs=self.quick_upload_state, outputs=self._app.chat_page._indices_input[1], ) .then( fn=lambda: gr.update(value="Indexing completed."), outputs=self._app.chat_page.quick_file_upload_status, ) .then( fn=self.list_file, inputs=[self._app.user_id, self.filter], outputs=[self.file_list_state, self.file_list], concurrency_limit=20, ) .then( fn=lambda: True, inputs=None, outputs=None, js=chat_input_focus_js_with_submit, ) ) quickURLUploadedEvent = ( self._app.chat_page.quick_urls.submit( fn=lambda: gr.update( value="Please wait for the indexing process " "to complete before adding your question." 
def on_register_events(self):
    """Register all events to the app

    Wires the page's components (buttons, file table, group table, filter
    box) to the handler methods of this page. The quick-upload events are
    registered first; in demo mode nothing else is registered.
    """
    self.on_register_quick_uploads()

    if KH_DEMO_MODE:
        return

    # Delete button: delete the selected file, clear the selection, reload
    # the file table, then refresh the detail panel for the cleared id.
    onDeleted = (
        self.delete_button.click(
            fn=self.delete_event,
            inputs=[self.selected_file_id],
            outputs=None,
        )
        .then(
            fn=lambda: (None, self.selected_panel_false),
            inputs=[],
            outputs=[self.selected_file_id, self.selected_panel],
            show_progress="hidden",
        )
        .then(
            fn=self.list_file,
            inputs=[self._app.user_id, self.filter],
            outputs=[self.file_list_state, self.file_list],
        )
        .then(
            fn=self.file_selected,
            inputs=[self.selected_file_id],
            outputs=[
                self.chunks,
                self.deselect_button,
                self.delete_button,
                self.download_single_button,
                self.chat_button,
            ],
            show_progress="hidden",
        )
    )
    # chain every subscriber of this index's "changed" event after a delete
    for event in self._app.get_event(f"onFileIndex{self._index.id}Changed"):
        onDeleted = onDeleted.then(**event)

    # Deselect button: clear the selection and refresh the detail panel.
    self.deselect_button.click(
        fn=lambda: (None, self.selected_panel_false),
        inputs=[],
        outputs=[self.selected_file_id, self.selected_panel],
        show_progress="hidden",
    ).then(
        fn=self.file_selected,
        inputs=[self.selected_file_id],
        outputs=[
            self.chunks,
            self.deselect_button,
            self.delete_button,
            self.download_single_button,
            self.chat_button,
        ],
        show_progress="hidden",
    )

    # Chat button: push the selected file id into this index's selector
    # component and update the app tabs via set_file_id_selector.
    self.chat_button.click(
        fn=self.set_file_id_selector,
        inputs=[self.selected_file_id],
        outputs=[
            self._index.get_selector_component_ui().selector,
            self._index.get_selector_component_ui().mode,
            self._app.tabs,
        ],
    )

    # "Download all" is only wired when SSO is not enabled.
    if not KH_SSO_ENABLED:
        self.download_all_button.click(
            fn=self.download_all_files,
            inputs=[],
            outputs=self.download_all_button,
            show_progress="hidden",
        )

    # "Delete all": first click decides whether to show confirm / cancel.
    self.delete_all_button.click(
        self.show_delete_all_confirm,
        [self.file_list],
        [
            self.delete_all_button,
            self.delete_all_button_confirm,
            self.delete_all_button_cancel,
        ],
    )
    # Cancel: restore the delete-all button, hide confirm / cancel.
    self.delete_all_button_cancel.click(
        lambda: [
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
        ],
        None,
        [
            self.delete_all_button,
            self.delete_all_button_confirm,
            self.delete_all_button_cancel,
        ],
    )

    # Confirm: delete every listed file, reload the table, then restore the
    # three buttons to their initial visibility.
    self.delete_all_button_confirm.click(
        fn=self.delete_all_files,
        inputs=[self.file_list],
        outputs=[],
        show_progress="hidden",
    ).then(
        fn=self.list_file,
        inputs=[self._app.user_id, self.filter],
        outputs=[self.file_list_state, self.file_list],
    ).then(
        lambda: [
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
        ],
        None,
        [
            self.delete_all_button,
            self.delete_all_button_confirm,
            self.delete_all_button_cancel,
        ],
    )

    # Single-file download: the zip-based handler without SSO, the
    # HTML-export handler (which also receives the chunks HTML) with SSO.
    if not KH_SSO_ENABLED:
        self.download_single_button.click(
            fn=self.download_single_file,
            inputs=[self.is_zipped_state, self.selected_file_id],
            outputs=[self.is_zipped_state, self.download_single_button],
            show_progress="hidden",
        )
    else:
        self.download_single_button.click(
            fn=self.download_single_file_simple,
            inputs=[self.is_zipped_state, self.chunks, self.selected_file_id],
            outputs=[self.is_zipped_state, self.download_single_button],
            show_progress="hidden",
        )

    # Upload button: show the progress panel, run the indexing generator,
    # then clear the URL box.
    onUploaded = (
        self.upload_button.click(
            fn=lambda: gr.update(visible=True),
            outputs=[self.upload_progress_panel],
        )
        .then(
            fn=self.index_fn,
            inputs=[
                self.files,
                self.urls,
                self.reindex,
                self._app.settings_state,
                self._app.user_id,
            ],
            outputs=[self.upload_result, self.upload_info],
            concurrency_limit=20,
        )
        .then(
            fn=lambda: gr.update(value=""),
            outputs=[self.urls],
        )
    )

    # after an upload: reload the file table, then notify subscribers
    uploadedEvent = onUploaded.then(
        fn=self.list_file,
        inputs=[self._app.user_id, self.filter],
        outputs=[self.file_list_state, self.file_list],
        concurrency_limit=20,
    )
    for event in self._app.get_event(f"onFileIndex{self._index.id}Changed"):
        uploadedEvent = uploadedEvent.then(**event)

    # the upload widget is cleared through the .success() branch only
    _ = onUploaded.success(
        fn=lambda: None,
        outputs=[self.files],
    )

    # Close button of the progress panel: hide it and blank both text boxes.
    self.btn_close_upload_progress_panel.click(
        fn=lambda: (gr.update(visible=False), "", ""),
        outputs=[self.upload_progress_panel, self.upload_result, self.upload_info],
    )

    # Selecting a row in the file table: record the id, then refresh the
    # detail panel for it.
    self.file_list.select(
        fn=self.interact_file_list,
        inputs=[self.file_list],
        outputs=[self.selected_file_id, self.selected_panel],
        show_progress="hidden",
    ).then(
        fn=self.file_selected,
        inputs=[self.selected_file_id],
        outputs=[
            self.chunks,
            self.deselect_button,
            self.delete_button,
            self.download_single_button,
            self.chat_button,
        ],
        show_progress="hidden",
    )

    # Selecting a row in the group table: load the group into the editor,
    # then show the info panel and its buttons while hiding "add".
    self.group_list.select(
        fn=self.interact_group_list,
        inputs=[self.group_list_state],
        outputs=[
            self.group_label,
            self.selected_group_id,
            self.group_name,
            self.group_files,
        ],
        show_progress="hidden",
    ).then(
        fn=lambda: (
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
        ),
        outputs=[
            self._group_info_panel,
            self.group_add_button,
            self.group_close_button,
            self.group_delete_button,
            self.group_chat_button,
        ],
    )

    # Submitting the filter box reloads the file table with the pattern.
    self.filter.submit(
        fn=self.list_file,
        inputs=[self._app.user_id, self.filter],
        outputs=[self.file_list_state, self.file_list],
        show_progress="hidden",
    )

    # "Add group": open an empty editor and reset the selected group id.
    self.group_add_button.click(
        fn=lambda: [
            gr.update(visible=False),
            gr.update(value="### Add new group"),
            gr.update(visible=True),
            gr.update(value=""),
            gr.update(value=[]),
            None,
        ],
        outputs=[
            self.group_add_button,
            self.group_label,
            self._group_info_panel,
            self.group_name,
            self.group_files,
            self.selected_group_id,
        ],
    )

    # Group chat button: push the group's files into the index selector.
    self.group_chat_button.click(
        fn=self.set_group_id_selector,
        inputs=[self.selected_group_id],
        outputs=[
            self._index.get_selector_component_ui().selector,
            self._index.get_selector_component_ui().mode,
            self._app.tabs,
        ],
    )

    # Shared "close the group editor" step, reused after save and delete:
    # show "add" again, hide the panel and its buttons, reset the group id.
    onGroupClosedEvent = {
        "fn": lambda: [
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            None,
        ],
        "outputs": [
            self.group_add_button,
            self._group_info_panel,
            self.group_close_button,
            self.group_delete_button,
            self.group_chat_button,
            self.selected_group_id,
        ],
    }
    self.group_close_button.click(**onGroupClosedEvent)
    # Save: persist the group, reload the group table, close the editor.
    onGroupSaved = (
        self.group_save_button.click(
            fn=self.save_group,
            inputs=[
                self.selected_group_id,
                self.group_name,
                self.group_files,
                self._app.user_id,
            ],
        )
        .then(
            self.list_group,
            inputs=[self._app.user_id, self.file_list_state],
            outputs=[self.group_list_state, self.group_list],
        )
        .then(**onGroupClosedEvent)
    )
    # Delete: remove the group, reload the group table, close the editor.
    onGroupDeleted = (
        self.group_delete_button.click(
            fn=self.delete_group,
            inputs=[self.selected_group_id],
        )
        .then(
            self.list_group,
            inputs=[self._app.user_id, self.file_list_state],
            outputs=[self.group_list_state, self.group_list],
        )
        .then(**onGroupClosedEvent)
    )

    # group changes also notify the subscribers of this index's "changed" event
    for event in self._app.get_event(f"onFileIndex{self._index.id}Changed"):
        onGroupDeleted = onGroupDeleted.then(**event)
        onGroupSaved = onGroupSaved.then(**event)
def _may_extract_zip(self, files, zip_dir: str):
    """Expand uploaded zip archives and collect the files to index.

    ``zip_dir`` is wiped, then every path in ``files`` ending with ``.zip``
    is extracted into its own sub-directory of ``zip_dir`` (named after the
    archive). Extracted files whose extension is listed in
    ``self._supported_file_types`` are appended to the non-zip inputs;
    nested ``.zip`` files are never added.

    Args:
        files: paths of the uploaded files.
        zip_dir: directory used as extraction root.

    Returns:
        A ``(remaining_files, errors)`` tuple: the non-zip inputs followed
        by the supported extracted files, and one message per archive that
        could not be read.
    """
    zip_files = [file for file in files if file.endswith(".zip")]
    # same ".zip" suffix test as above, so that a name merely ending in
    # "zip" (without the dot) is kept instead of being silently dropped
    remaining_files = [file for file in files if not file.endswith(".zip")]
    errors: list[str] = []

    # Clean-up before unzip to remove old files
    shutil.rmtree(zip_dir, ignore_errors=True)

    # Unzip
    for zip_file in zip_files:
        # Prepare new zip output dir, separated for each files
        basename = os.path.splitext(os.path.basename(zip_file))[0]
        zip_out_dir = os.path.join(zip_dir, basename)
        os.makedirs(zip_out_dir, exist_ok=True)
        try:
            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                zip_ref.extractall(zip_out_dir)
        except zipfile.BadZipFile:
            # report a corrupt archive to the caller instead of crashing
            errors.append(f"Invalid zip file `{os.path.basename(zip_file)}`")

    n_zip_file = 0
    # distinct loop names: the original shadowed the ``files`` parameter here
    for root, _dirs, extracted_names in os.walk(zip_dir):
        for extracted_name in extracted_names:
            ext = os.path.splitext(extracted_name)[1]

            # only allow supported file-types ( not zip )
            if ext != ".zip" and ext in self._supported_file_types:
                remaining_files.append(os.path.join(root, extracted_name))
                n_zip_file += 1

    if n_zip_file > 0:
        print(f"Update zip files: {n_zip_file}")

    return remaining_files, errors
def index_fn_file_with_default_loaders(
    self, files, reindex: bool, settings, user_id
) -> list[str]:
    """Function for quick upload with default loaders

    Files that the indexing pipeline already knows are not indexed again;
    the remaining ones are indexed through ``self.index_fn`` with the
    reader mode forced to ``"default"`` and quick-index mode enabled on a
    copy of ``settings``.

    Args:
        files: the list of files to be uploaded
        reindex: whether to reindex the files
        settings: the settings of the app
        user_id: id of the user who uploads the files

    Returns:
        The ids of the already-indexed files followed by whatever
        ``self.index_fn`` returned for the newly processed ones.
    """
    print("Overriding with default loaders")
    exist_ids = []
    to_process_files = []
    for str_file_path in files or []:
        file_path = Path(str(str_file_path))
        exist_id = (
            self._index.get_indexing_pipeline(settings, user_id)
            .route(file_path)
            .get_id_if_exists(file_path)
        )
        if exist_id:
            exist_ids.append(exist_id)
        else:
            to_process_files.append(str_file_path)

    returned_ids: list[str] = []
    # work on a copy so the caller's settings are left untouched
    settings = deepcopy(settings)
    settings[f"index.options.{self._index.id}.reader_mode"] = "default"
    settings[f"index.options.{self._index.id}.quick_index_mode"] = True

    if to_process_files:
        _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
        try:
            # drain the generator; only its return value matters here
            while True:
                next(_iter)
        except StopIteration as e:
            # index_fn may finish with a bare ``return`` (value None)
            returned_ids = e.value or []

    return exist_ids + returned_ids
Each declared function is: - Condition (the source that will go through this indexing function) - Function (the pipeline that run this) - Output: artifacts that can be used to -> this is the artifacts that we wish - Build the UI - Upload page: fixed standard, based on the type - Read page: fixed standard, based on the type - Delete page: fixed standard, based on the type - Build the index function - Build the chat function Step: 1. Decide on the artifacts 2. Implement the transformation from artifacts to UI """ if not folder_path: yield "", "" return import fnmatch from pathlib import Path include_patterns: list[str] = [] exclude_patterns: list[str] = ["*.png", "*.gif", "*/.*"] if include_patterns and exclude_patterns: raise ValueError("Cannot have both include and exclude patterns") # clean up the include patterns for idx in range(len(include_patterns)): if include_patterns[idx].startswith("*"): include_patterns[idx] = str(Path.cwd() / "**" / include_patterns[idx]) else: include_patterns[idx] = str( Path.cwd() / include_patterns[idx].strip("/") ) # clean up the exclude patterns for idx in range(len(exclude_patterns)): if exclude_patterns[idx].startswith("*"): exclude_patterns[idx] = str(Path.cwd() / "**" / exclude_patterns[idx]) else: exclude_patterns[idx] = str( Path.cwd() / exclude_patterns[idx].strip("/") ) # get the files files: list[str] = [str(p) for p in Path(folder_path).glob("**/*.*")] if include_patterns: for p in include_patterns: files = fnmatch.filter(names=files, pat=p) if exclude_patterns: for p in exclude_patterns: files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)] yield from self.index_fn(files, [], reindex, settings, user_id) def format_size_human_readable(self, num: float | str, suffix="B"): try: num = float(num) except ValueError: return num for unit in ("", "K", "M", "G", "T", "P", "E", "Z"): if abs(num) < 1024.0: return f"{num:3.0f}{unit}{suffix}" num /= 1024.0 return f"{num:.0f}Yi{suffix}" def list_file(self, user_id, 
def list_file_names(self, file_list_state):
    """Turn the listed files into ``(name, id)`` choices.

    Args:
        file_list_state: records with ``"name"`` and ``"id"`` keys; a falsy
            value yields no choices.

    Returns:
        A component update carrying the new ``choices``.
    """
    file_names = [
        (record["name"], record["id"]) for record in (file_list_state or [])
    ]
    return gr.update(choices=file_names)
def save_group(self, group_id, group_name, group_files, user_id):
    """Create a file group or update an existing one.

    Args:
        group_id: id of the group to update; a falsy value creates a new
            group instead.
        group_name: name to store on the group.
        group_files: the files to store under the group's ``"files"`` key.
        user_id: owner recorded on a newly created group.

    Returns:
        The id of the saved group.

    Raises:
        gr.Error: if ``group_id`` does not match any group, or if a new
            group would reuse a name the user already has.
    """
    FileGroup = self._index._resources["FileGroup"]

    with Session(engine) as session:
        if group_id:
            current_group = session.query(FileGroup).filter_by(id=group_id).first()
            if current_group is None:
                # e.g. the group was deleted meanwhile; previously this
                # surfaced as an AttributeError on None
                raise gr.Error("No group found")
            # update current group with new info
            current_group.name = group_name
            # NOTE(review): in-place mutation of the stored dict; presumably
            # the column tracks mutations -- confirm against the model.
            current_group.data["files"] = group_files  # Update the files
            session.commit()
        else:
            # check if group_name exist
            existing_group = (
                session.query(FileGroup)
                .filter_by(
                    name=group_name,
                    user=user_id,
                )
                .first()
            )
            if existing_group:
                raise gr.Error(f"Group {group_name} already exists")

            current_group = FileGroup(
                name=group_name,
                data={"files": group_files},  # type: ignore
                user=user_id,
            )
            session.add(current_group)
            session.commit()

            group_id = current_group.id

    gr.Info(f"Group {group_name} has been saved")
    return group_id
def interact_file_list(self, list_files, ev: gr.SelectData):
    """Resolve a click in the file table to a selected file.

    Args:
        list_files: the file table, indexable as ``list_files[column][row]``.
        ev: the selection event (``value``, ``index`` and ``selected`` are
            read).

    Returns:
        ``(file_id, panel_text)`` for a real selection, otherwise
        ``(None, self.selected_panel_false)``.
    """
    # the placeholder row of an empty table has "-" in its first row
    clicked_placeholder = ev.value == "-" and ev.index[0] == 0
    if clicked_placeholder:
        gr.Info("No file is uploaded")

    if clicked_placeholder or not ev.selected:
        return None, self.selected_panel_false

    row = ev.index[0]
    file_id = list_files["id"][row]
    panel_text = self.selected_panel_true.format(name=list_files["name"][row])
    return file_id, panel_text
def validate_urls(self, urls: list[str]):
    """Validate if the urls are valid

    A url is accepted only when it starts with ``http://`` or ``https://``.

    Args:
        urls: the urls to check.

    Returns:
        One error message per rejected url (empty when all are accepted).
    """
    # a bare "http" prefix test would also accept strings such as "httpfoo"
    allowed_schemes = ("http://", "https://")
    return [
        f"Invalid url `{url}`" for url in urls if not url.startswith(allowed_schemes)
    ]
def load_files(self, selected_files, user_id):
    """Build the selector choices from the stored files and file groups.

    Each file contributes a ``(name, id)`` choice and each group a
    ``("group: '<name>'", <json list of its files>)`` choice. Selected
    values that are not ids of a listed file are dropped.

    Args:
        selected_files: the values currently selected in the selector.
        user_id: the signed-in user, or ``None`` when nobody is signed in.

    Returns:
        A ``(selector_update, options)`` tuple.
    """
    options: list = []
    if user_id is None:
        # not signed in
        return gr.update(value=selected_files, choices=options), options

    Source = self._index._resources["Source"]
    FileGroup = self._index._resources["FileGroup"]
    is_private = self._index.config.get("private", False)
    available_ids = []

    with Session(engine) as session:
        # files come from the Source table
        file_query = select(Source)
        if is_private:
            file_query = file_query.where(Source.user == user_id)
        if KH_DEMO_MODE:
            # limit query by MAX_FILE_COUNT
            file_query = file_query.limit(MAX_FILE_COUNT)

        for row in session.execute(file_query).all():
            source = row[0]
            available_ids.append(source.id)
            options.append((source.name, source.id))

        # groups come from the FileGroup table
        group_query = select(FileGroup)
        if is_private:
            group_query = group_query.where(FileGroup.user == user_id)

        for row in session.execute(group_query).all():
            group = row[0]
            label = f"group: '{group.name}'"
            options.append((label, json.dumps(group.data.get("files", []))))

    if selected_files:
        known_ids = set(available_ids)
        selected_files = [value for value in selected_files if value in known_ids]

    return gr.update(value=selected_files, choices=options), options
# download PDF from Arxiv URL
def download_arxiv_pdf(url, output_path):
    """Download an arXiv paper as ``<output_path>/<cleaned title>.pdf``.

    The paper title is read from the ``h1.title`` element of the abstract
    page and used (after ``clean_name``) as the file name. Nothing is
    downloaded when that file already exists.

    Args:
        url: an arXiv abstract (``/abs/``) or PDF (``/pdf/``) url.
        output_path: directory in which the PDF is stored.

    Returns:
        The path of the PDF file.

    Raises:
        ValueError: if ``url`` is not an arXiv url or the title cannot be
            found on the abstract page.
        requests.HTTPError: if arXiv answers with an error status.
    """
    if not is_arxiv_url(url):
        raise ValueError("Invalid Arxiv URL")

    abs_prefix = "https://arxiv.org/abs/"
    pdf_prefix = "https://arxiv.org/pdf/"
    request_timeout = 30  # seconds; avoids hanging forever on a stalled socket

    # Swap only the path prefix. A blanket str.replace("pdf", "abs") also
    # rewrote a trailing ".pdf" and produced a broken abstract url.
    if url.startswith(abs_prefix):
        abstract_url = url
        pdf_url = pdf_prefix + url[len(abs_prefix) :]
    else:
        pdf_url = url
        paper_id = url[len(pdf_prefix) :]
        if paper_id.endswith(".pdf"):
            paper_id = paper_id[: -len(".pdf")]
        abstract_url = abs_prefix + paper_id

    # get paper name from abstract url
    response = requests.get(abstract_url, timeout=request_timeout)
    response.raise_for_status()

    # parse HTML response and get h1.title
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(response.content, "html.parser")
    title_tag = soup.find("h1", class_="title")
    if title_tag is None:
        raise ValueError("Failed to get paper name")

    name = clean_name(title_tag.text.strip().replace("Title:", ""))
    if not name:
        raise ValueError("Failed to get paper name")

    output_file_path = os.path.join(output_path, name + ".pdf")

    # prevent downloading if file already exists
    if not os.path.exists(output_file_path):
        response = requests.get(pdf_url, timeout=request_timeout)
        # never cache an error page under the .pdf name
        response.raise_for_status()

        with open(output_file_path, "wb") as f:
            f.write(response.content)

    return output_file_path
class IndexManager:
    """Manage the application indices

    The index manager is responsible for:
        - Managing the range of possible indices and their extensions
        - Each actual index built by user

    Attributes:
        - indices: list of indices built by user
    """

    def __init__(self, app):
        self._app = app
        self._indices = []
        self._index_types: dict[str, Type[BaseIndex]] = {}

    @property
    def index_types(self) -> dict:
        """Mapping from dotted class path to the index class"""
        return self._index_types

    def build_index(self, name: str, config: dict, index_type: str):
        """Build the index

        Building the index simply means recording the index information into the
        database and returning the index object.

        Args:
            name (str): the name of the index
            config (dict): the config of the index
            index_type (str): the dotted path of the index class

        Returns:
            BaseIndex: the index object

        Raises:
            ValueError: if the index class cannot be imported, instantiated or
                created; the database entry is removed again in that case.
        """
        with Session(engine) as sess:
            entry = Index(name=name, config=config, index_type=index_type)
            sess.add(entry)
            sess.commit()
            sess.refresh(entry)

            try:
                # build the index
                index_cls = import_dotted_string(index_type, safe=False)
                index = index_cls(app=self._app, id=entry.id, name=name, config=config)
                index.on_create()

                # update the entry
                entry.config = index.config
                sess.commit()
            except Exception as e:
                # discard any half-applied change so the cleanup below runs on
                # a usable transaction
                sess.rollback()
                sess.delete(entry)
                sess.commit()
                raise ValueError(f'Cannot create index "{name}": {e}') from e

        return index

    def update_index(self, id: int, name: str, config: dict):
        """Update the index information

        Args:
            id: the id of the index
            name: the new name of the index
            config: the new config of the index

        Raises:
            ValueError: if no index with this id is stored in the database
        """
        with Session(engine) as sess:
            entry = sess.get(Index, id)
            if entry is None:
                raise ValueError(f"Index with id {id} does not exist")

            entry.name = name
            entry.config = config
            sess.commit()

        # keep the already started index object in sync with the database
        for index in self._indices:
            if index.id == id:
                index.name = name
                index.config = config
                break

    def start_index(self, id: int, name: str, config: dict, index_type: str):
        """Start the index

        Args:
            id (int): the id of the index
            name (str): the name of the index
            config (dict): the config of the index
            index_type (str): the dotted path of the index class

        Returns:
            BaseIndex: the started index object
        """
        index_cls = import_dotted_string(index_type, safe=False)
        index = index_cls(app=self._app, id=id, name=name, config=config)
        index.on_start()

        self._indices.append(index)
        return index

    def delete_index(self, id: int):
        """Delete the index from the database

        Args:
            id: the id of the index

        Raises:
            ValueError: if the index is not among the started indices, or if
                removing it fails
        """
        index = next((each for each in self._indices if each.id == id), None)
        if index is None:
            raise ValueError(
                "Index does not exist. If you have already removed the index, "
                "please restart to reflect the changes."
            )

        try:
            try:
                # clean up; a failing cleanup must not block the removal
                index.on_delete()
            except Exception as e:
                print(f"Error while deleting index {index.name}: {e}")

            # remove from database
            with Session(engine) as sess:
                item = sess.query(Index).filter_by(id=id).first()
                # the row may already be gone; the in-memory index must still
                # be dropped below instead of staying stuck in the list
                if item is not None:
                    sess.delete(item)
                    sess.commit()

            self._indices = [each for each in self._indices if each.id != id]
        except Exception as e:
            raise ValueError(f"Cannot delete index {index.name}: {e}") from e

    def load_index_types(self):
        """Load the supported index types"""
        self._index_types = {}

        # built-in index types
        from .file.index import FileIndex

        for index in [FileIndex]:
            self._index_types[f"{index.__module__}.{index.__qualname__}"] = index

        # developer-defined custom index types
        for index_str in settings.KH_INDEX_TYPES:
            cls: Type[BaseIndex] = import_dotted_string(index_str, safe=False)
            self._index_types[f"{cls.__module__}.{cls.__qualname__}"] = cls

    def exists(self, id: Optional[int] = None, name: Optional[str] = None) -> bool:
        """Check if the index exists

        ``id`` takes precedence over ``name`` when both are given.

        Args:
            id (int): the id of the index
            name (str): the name of the index

        Returns:
            bool: True if the index exists, False otherwise (also when neither
                ``id`` nor ``name`` is given)
        """
        # compare against None so that falsy values are still looked up
        if id is not None:
            with Session(engine) as sess:
                return sess.get(Index, id) is not None

        if name is not None:
            with Session(engine) as sess:
                index = sess.exec(select(Index).where(Index.name == name)).one_or_none()
                return index is not None

        return False

    def on_application_startup(self):
        """This method is called by the base application when the application starts

        Load the index from database
        """
        self.load_index_types()

        # create the indices declared in the settings that are not stored yet
        for index in settings.KH_INDICES:
            if not self.exists(name=index["name"]):
                self.build_index(**index)

        with Session(engine) as sess:
            index_defs = sess.exec(select(Index))
            for index_def in index_defs:
                self.start_index(**index_def.model_dump())

    @property
    def indices(self):
        """List of the started index objects"""
        return self._indices

    def info(self):
        """Return the started indices keyed by their id"""
        return {index.id: index for index in self._indices}
def format_description(cls):
    """Render an index class's docstring and admin settings as Markdown.

    Args:
        cls: a class exposing ``get_admin_settings()``, which returns a mapping
            from setting name to a dict that may carry ``value`` and ``info``.

    Returns:
        The class docstring followed by a Markdown table with one row per
        admin setting (name, default value, description).
    """
    table = ["| Name | Default | Description |", "| --- | --- | --- |"]
    table.extend(
        f"| {key} | {value.get('value', '')} | {value.get('info', '')} |"
        for key, value in cls.get_admin_settings().items()
    )
    # a class without a docstring has ``__doc__`` set to None, which would
    # otherwise be rendered as the literal text "None"
    doc = cls.__doc__ or ""
    return f"{doc}\n\n" + "\n".join(table)
def on_register_events(self):
    """Wire the index-management widgets to their handlers."""
    # "Add" tab: choosing an index type fills in the spec and its description
    self.index_type.select(
        self.on_index_type_change,
        inputs=[self.index_type],
        outputs=[self.spec, self.spec_desc],
    )
    # "Add" tab: create the index, then refresh the list, reset the form and
    # call update_current_module_atime (each step chained with .success)
    self.btn_new.click(
        self.create_index,
        inputs=[self.name, self.index_type, self.spec],
        outputs=None,
    ).success(self.list_indices, inputs=[], outputs=[self.index_list]).success(
        lambda: ("", None, "", self.spec_desc_default),
        outputs=[
            self.name,
            self.index_type,
            self.spec,
            self.spec_desc,
        ],
    ).success(
        update_current_module_atime
    )
    # "View" tab: clicking a row stores its index id in the hidden number ...
    self.index_list.select(
        self.select_index,
        inputs=self.index_list,
        outputs=[self.selected_index_id],
        show_progress="hidden",
    )
    # ... and a change of that id updates the edit panel and its fields
    self.selected_index_id.change(
        self.on_selected_index_change,
        inputs=[self.selected_index_id],
        outputs=[
            self._selected_panel,
            # edit section
            self.edit_spec,
            self.edit_spec_desc,
            self.edit_name,
        ],
        show_progress="hidden",
    )
    # "Delete": hide Save/Delete/Close and show the confirmation row
    self.btn_delete.click(
        lambda: (
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=True),
        ),
        inputs=[],
        outputs=[
            self.btn_edit_save,
            self.btn_delete,
            self.btn_close,
            self._delete_confirm,
        ],
        show_progress="hidden",
    )
    # "Confirm Delete": delete, refresh the list, then call
    # update_current_module_atime.
    # NOTE(review): delete_index reports a failure through its return value
    # instead of raising, so the .success step presumably also runs after a
    # failed delete - confirm this is intended.
    self.btn_delete_yes.click(
        self.delete_index,
        inputs=[self.selected_index_id],
        outputs=[self.selected_index_id],
        show_progress="hidden",
    ).then(self.list_indices, inputs=[], outputs=[self.index_list],).success(
        update_current_module_atime
    )
    # "Cancel": the exact inverse of the "Delete" handler above
    self.btn_delete_no.click(
        lambda: (
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=False),
        ),
        inputs=[],
        outputs=[
            self.btn_edit_save,
            self.btn_delete,
            self.btn_close,
            self._delete_confirm,
        ],
        show_progress="hidden",
    )
    # "Save": store the edited name/config, then refresh the list
    self.btn_edit_save.click(
        self.update_index,
        inputs=[
            self.selected_index_id,
            self.edit_name,
            self.edit_spec,
        ],
        show_progress="hidden",
    ).then(
        self.list_indices,
        inputs=[],
        outputs=[self.index_list],
    )
    # "Close": -1 is the "nothing selected" id handled by
    # on_selected_index_change
    self.btn_close.click(
        lambda: -1,
        outputs=[self.selected_index_id],
    )
def list_indices(self):
    """Return the started indices as a DataFrame for the index table.

    Columns are ``id``, ``name`` and ``index type`` (the class name of the
    index). When there is no index, a single placeholder row of "-" is
    returned instead of an empty frame.
    """
    rows = [
        {
            "id": index.id,
            "name": index.name,
            "index type": index.__class__.__name__,
        }
        for index in self.manager.indices
    ]
    if not rows:
        rows = [{"id": "-", "name": "-", "index type": "-"}]
    return pd.DataFrame.from_records(rows)
def update_index(self, selected_index_id: int, name: str, config: str):
    """Save the edited name and YAML config of the selected index.

    Args:
        selected_index_id: id of the index being edited
        name: new name; surrounding whitespace is stripped
        config: new admin configuration as YAML text

    Raises:
        gr.Error: if the name is empty, is already used by another index,
            or if parsing/saving the config fails
    """
    name = name.strip()
    if not name:
        raise gr.Error("Name must not be empty")

    # Check uniqueness (excluding current index)
    name_taken = any(
        other.name == name and other.id != selected_index_id
        for other in self.manager.indices
    )
    if name_taken:
        raise gr.Error(
            f"Index '{name}' already exists. Please use a unique name."
        )

    try:
        parsed_config = yaml.load(config, Loader=YAMLNoDateSafeLoader)
        self.manager.update_index(selected_index_id, name, parsed_config)
        gr.Info(f'Index "{name}" updated successfully. Please restart the app!')
    except gr.Error:
        # already a user-facing error: let it through unchanged
        raise
    except Exception as e:
        raise gr.Error(f'Failed to save index "{name}": {e}')
class LLMManager:
    """Represent a pool of models

    The pool is stored in ``LLMTable``. ``load`` mirrors the table into memory
    and every mutating method (``add``, ``delete``, ``update``) reloads the
    pool after committing.
    """

    def __init__(self):
        self._models: dict[str, ChatLLM] = {}
        self._info: dict[str, dict] = {}
        self._default: str = ""
        self._vendors: list[Type] = []

        # seed the table with the models declared in the settings, without
        # touching models that are already stored under the same name
        if hasattr(flowsettings, "KH_LLMS"):
            for name, model in flowsettings.KH_LLMS.items():
                with Session(engine) as session:
                    stmt = select(LLMTable).where(LLMTable.name == name)
                    result = session.execute(stmt)
                    if not result.first():
                        item = LLMTable(
                            name=name,
                            spec=model["spec"],
                            default=model.get("default", False),
                        )
                        session.add(item)
                        session.commit()

        self.load()
        self.load_vendors()

    def load(self):
        """Load the model pool from database"""
        self._models, self._info, self._default = {}, {}, ""
        with Session(engine) as session:
            stmt = select(LLMTable)
            items = session.execute(stmt)

            for (item,) in items:
                self._models[item.name] = deserialize(item.spec, safe=False)
                self._info[item.name] = {
                    "name": item.name,
                    "spec": item.spec,
                    "default": item.default,
                }
                if item.default:
                    # with several default rows, the last one loaded wins
                    self._default = item.name

    def load_vendors(self):
        """Collect the built-in vendor classes plus KH_LLM_EXTRA_VENDORS"""
        from kotaemon.llms import (
            AzureChatOpenAI,
            ChatOpenAI,
            LCAnthropicChat,
            LCCohereChat,
            LCGeminiChat,
            LCOllamaChat,
            LlamaCppChat,
        )

        self._vendors = [
            ChatOpenAI,
            AzureChatOpenAI,
            LCAnthropicChat,
            LCGeminiChat,
            LCCohereChat,
            LCOllamaChat,
            LlamaCppChat,
        ]

        for extra_vendor in getattr(flowsettings, "KH_LLM_EXTRA_VENDORS", []):
            self._vendors.append(import_dotted_string(extra_vendor, safe=False))

    def __getitem__(self, key: str) -> ChatLLM:
        """Get model by name"""
        return self._models[key]

    def __contains__(self, key: str) -> bool:
        """Check if model exists"""
        return key in self._models

    @overload
    def get(self, key: str, default: None) -> Optional[ChatLLM]:
        ...

    @overload
    def get(self, key: str, default: ChatLLM) -> ChatLLM:
        ...

    def get(self, key: str, default: Optional[ChatLLM] = None) -> Optional[ChatLLM]:
        """Get model by name with default value"""
        return self._models.get(key, default)

    def settings(self) -> dict:
        """Present model pools option for gradio"""
        return {
            "label": "LLM",
            "choices": list(self._models.keys()),
            "value": self.get_default_name(),
        }

    def options(self) -> dict:
        """Present a dict of models"""
        return self._models

    def get_random_name(self) -> str:
        """Get the name of random model

        Returns:
            str: random model name in the pool

        Raises:
            ValueError: if the pool is empty
        """
        import random

        if not self._models:
            raise ValueError("No models in pool")

        return random.choice(list(self._models.keys()))

    def get_default_name(self) -> str:
        """Get the name of default model

        In case there is no default model, choose random model from pool. In
        case several models are flagged as default, the one loaded last is used.

        Returns:
            str: model name

        Raises:
            ValueError: if the pool is empty
        """
        if not self._models:
            raise ValueError("No models in pool")

        if not self._default:
            return self.get_random_name()

        return self._default

    def get_random(self) -> ChatLLM:
        """Get random model"""
        return self._models[self.get_random_name()]

    def get_default(self) -> ChatLLM:
        """Get default model

        In case there is no default model, choose random model from pool. In
        case several models are flagged as default, the one loaded last is used.

        Returns:
            ChatLLM: model
        """
        return self._models[self.get_default_name()]

    def info(self) -> dict:
        """List all models"""
        return self._info

    def add(self, name: str, spec: dict, default: bool):
        """Add a new model to the pool

        Raises:
            ValueError: if the name is empty or the model cannot be stored
        """
        if not name:
            raise ValueError("Name must not be empty")

        try:
            with Session(engine) as session:
                if default:
                    # turn all models to non-default; committed together with
                    # the insert so a failed insert keeps the current default
                    session.query(LLMTable).update({"default": False})

                item = LLMTable(name=name, spec=spec, default=default)
                session.add(item)
                session.commit()
        except Exception as e:
            raise ValueError(f"Failed to add model {name}: {e}") from e

        self.load()

    def delete(self, name: str):
        """Delete a model from the pool

        Raises:
            ValueError: if the model does not exist or cannot be deleted
        """
        try:
            with Session(engine) as session:
                item = session.query(LLMTable).filter_by(name=name).first()
                if item is None:
                    raise ValueError(f"Model {name} not found")
                session.delete(item)
                session.commit()
        except Exception as e:
            raise ValueError(f"Failed to delete model {name}: {e}") from e

        self.load()

    def update(self, name: str, spec: dict, default: bool, new_name: str = ""):
        """Update a model in the pool, optionally renaming it.

        The rename, the spec change and the default flag are written in one
        transaction, so a failure leaves the stored model untouched.

        Args:
            name: current name of the model
            spec: new specification
            default: whether this model becomes the default one
            new_name: new name; empty or equal to ``name`` means no rename

        Raises:
            ValueError: if ``name`` is empty, ``new_name`` is already taken, the
                model does not exist, or the update cannot be stored
        """
        if not name:
            raise ValueError("Name must not be empty")

        renaming = bool(new_name) and new_name != name
        if renaming and new_name in self._info:
            raise ValueError(
                f"Model '{new_name}' already exists. Use a unique name."
            )

        try:
            with Session(engine) as session:
                if default:
                    # turn all models to non-default; not committed on its own,
                    # so it is rolled back when the model is not found
                    session.query(LLMTable).update({"default": False})

                item = session.query(LLMTable).filter_by(name=name).first()
                if not item:
                    raise ValueError(f"Model {name} not found")

                if renaming:
                    # rename in place instead of delete + add, which could lose
                    # the model when the add step failed
                    item.name = new_name
                item.spec = spec
                item.default = default
                session.commit()
        except Exception as e:
            raise ValueError(f"Failed to update model {name}: {e}") from e

        self.load()

    def vendors(self) -> dict:
        """Return the vendor classes keyed by their qualified class name"""
        return {vendor.__qualname__: vendor for vendor in self._vendors}
" "This default LLM will be used by other components " "by default if no LLM is specified for such components." ), ) self.edit_name = gr.Textbox( label="Name", info="Edit to rename this LLM.", ) self.edit_spec = gr.Textbox( label="Specification", info="Specification of the LLM in YAML format", lines=10, ) with gr.Accordion( label="Test connection", visible=False, open=False ) as self._check_connection_panel: with gr.Row(): with gr.Column(scale=4): self.connection_logs = gr.HTML("Logs") with gr.Column(scale=1): self.btn_test_connection = gr.Button( "Test", ) with gr.Row(visible=False) as self._selected_panel_btn: with gr.Column(): self.btn_edit_save = gr.Button( "Save", min_width=10, variant="primary" ) with gr.Column(): self.btn_delete = gr.Button( "Delete", min_width=10, variant="stop" ) with gr.Row(): self.btn_delete_yes = gr.Button( "Confirm Delete", variant="stop", visible=False, min_width=10, ) self.btn_delete_no = gr.Button( "Cancel", visible=False, min_width=10 ) with gr.Column(): self.btn_close = gr.Button("Close", min_width=10) with gr.Column(): self.edit_spec_desc = gr.Markdown("# Spec description") with gr.Tab(label="Add"): with gr.Row(): with gr.Column(scale=2): self.name = gr.Textbox( label="LLM name", info=( "Must be unique. The name will be used to identify the LLM." ), ) self.llm_choices = gr.Dropdown( label="LLM vendors", info=( "Choose the vendor for the LLM. Each vendor has different " "specification." ), ) self.spec = gr.Textbox( label="Specification", info="Specification of the LLM in YAML format", ) self.default = gr.Checkbox( label="Set default", info=( "Set this LLM as default. This default LLM will be used " "by default across the application." 
def on_llm_vendor_change(self, vendor):
    """Pre-fill the spec box for the chosen vendor.

    Args:
        vendor: key of the vendor class in ``llms.vendors()``

    Returns:
        A tuple of the YAML skeleton (every required parameter mapped to
        null) and the formatted description of the vendor class.
    """
    vendor_cls = llms.vendors()[vendor]
    params = vendor_cls.describe()["params"]
    required: dict = {
        key: None for key, value in params.items() if value.get("required", False)
    }
    return yaml.dump(required), format_description(vendor_cls)
def list_llms(self):
    """Return the stored LLMs as a DataFrame for the LLM table.

    Columns are ``name``, ``vendor`` (last dotted component of the spec's
    ``__type__``, or "-" when absent) and ``default``. When there is no
    LLM, a single placeholder row of "-" is returned.
    """
    rows = [
        {
            "name": entry["name"],
            "vendor": entry["spec"].get("__type__", "-").split(".")[-1],
            "default": entry["default"],
        }
        for entry in llms.info().values()
    ]
    if not rows:
        rows = [{"name": "-", "vendor": "-", "default": "-"}]
    return pd.DataFrame.from_records(rows)
def on_selected_llm_change(self, selected_llm_name):
    """Show or hide the edit panel for the selected LLM.

    An empty name means nothing is selected: both panels are hidden and
    the edit fields are cleared. Otherwise the panels are shown and the
    edit fields are filled from the stored info of that LLM. In both cases
    the delete section is reset to its initial state.

    Returns:
        Updates for, in order: the selected panel, its button row, the
        Delete / Confirm Delete / Cancel buttons, and the edit name, spec,
        spec description and default fields.
    """
    has_selection = selected_llm_name != ""

    panel = gr.update(visible=has_selection)
    panel_buttons = gr.update(visible=has_selection)

    # delete section always returns to "Delete shown, confirmation hidden"
    delete_button = gr.update(visible=True)
    confirm_button = gr.update(visible=False)
    cancel_button = gr.update(visible=False)

    if has_selection:
        # work on a copy: popping "__type__" must not alter the pool's info
        info = deepcopy(llms.info()[selected_llm_name])
        vendor_name = info["spec"].pop("__type__", "-").split(".")[-1]
        vendor = llms.vendors()[vendor_name]

        name_field = selected_llm_name
        spec_field = yaml.dump(info["spec"])
        spec_desc_field = format_description(vendor)
        default_field = info["default"]
    else:
        name_field = gr.update(value="")
        spec_field = gr.update(value="")
        spec_desc_field = gr.update(value="")
        default_field = gr.update(value=False)

    return (
        panel,
        panel_buttons,
        delete_button,
        confirm_button,
        cancel_button,
        name_field,
        spec_field,
        spec_desc_field,
        default_field,
    )
" yield log_content # Parse content & init model info = deepcopy(llms.info()[selected_llm_name]) # Parse content & create dummy embedding spec = yaml.load(selected_spec, Loader=YAMLNoDateSafeLoader) info["spec"].update(spec) llm = deserialize(info["spec"], safe=False) if llm is None: raise Exception(f"Can not found model: {selected_llm_name}") log_content += "- Sending a message `Hi`
" yield log_content respond = llm("Hi") log_content += ( f"- Connection success. " f"Got response:\n {respond}
def delete_llm(self, selected_llm_name):
    """Delete the selected LLM from the pool.

    Args:
        selected_llm_name: name of the LLM to delete

    Returns:
        An empty string (clearing the selection) when the LLM was deleted,
        or ``selected_llm_name`` unchanged when the deletion failed.
    """
    try:
        llms.delete(selected_llm_name)
    except Exception as e:
        # gr.Error is an exception and shows nothing unless raised; use a
        # warning so the user is told while the selection is kept
        gr.Warning(f"Failed to delete LLM {selected_llm_name}: {e}")
        return selected_llm_name

    return ""
gr.update(visible=is_first_setup), gr.update(visible=not is_first_setup) class App(BaseApp): """The main app of Kotaemon The main application contains app-level information: - setting state - user id App life-cycle: - Render - Declare public events - Subscribe public events - Register events """ def ui(self): """Render the UI""" self._tabs = {} with gr.Tabs() as self.tabs: if self.f_user_management: from ktem.pages.login import LoginPage with gr.Tab( "Welcome", elem_id="login-tab", id="login-tab" ) as self._tabs["login-tab"]: self.login_page = LoginPage(self) with gr.Tab( "Chat", elem_id="chat-tab", id="chat-tab", visible=not self.f_user_management, ) as self._tabs["chat-tab"]: self.chat_page = ChatPage(self) if len(self.index_manager.indices) == 1: for index in self.index_manager.indices: with gr.Tab( f"{index.name}", elem_id="indices-tab", elem_classes=[ "fill-main-area-height", "scrollable", "indices-tab", ], id="indices-tab", visible=not self.f_user_management and not KH_DEMO_MODE, ) as self._tabs[f"{index.id}-tab"]: page = index.get_index_page_ui() setattr(self, f"_index_{index.id}", page) elif len(self.index_manager.indices) > 1: with gr.Tab( "Files", elem_id="indices-tab", elem_classes=["fill-main-area-height", "scrollable", "indices-tab"], id="indices-tab", visible=not self.f_user_management and not KH_DEMO_MODE, ) as self._tabs["indices-tab"]: for index in self.index_manager.indices: with gr.Tab( index.name, elem_id=f"{index.id}-tab", ) as self._tabs[f"{index.id}-tab"]: page = index.get_index_page_ui() setattr(self, f"_index_{index.id}", page) if not KH_DEMO_MODE: if not KH_SSO_ENABLED: with gr.Tab( "Resources", elem_id="resources-tab", id="resources-tab", visible=not self.f_user_management, elem_classes=["fill-main-area-height", "scrollable"], ) as self._tabs["resources-tab"]: self.resources_page = ResourcesTab(self) with gr.Tab( "Settings", elem_id="settings-tab", id="settings-tab", visible=not self.f_user_management, 
elem_classes=["fill-main-area-height", "scrollable"], ) as self._tabs["settings-tab"]: self.settings_page = SettingsPage(self) with gr.Tab( "Help", elem_id="help-tab", id="help-tab", visible=not self.f_user_management, elem_classes=["fill-main-area-height", "scrollable"], ) as self._tabs["help-tab"]: self.help_page = HelpPage(self) if KH_ENABLE_FIRST_SETUP: with gr.Column(visible=False) as self.setup_page_wrapper: self.setup_page = SetupPage(self) def on_subscribe_public_events(self): if self.f_user_management: from ktem.db.engine import engine from ktem.db.models import User from sqlmodel import Session, select def toggle_login_visibility(user_id): if not user_id: return list( ( gr.update(visible=True) if k == "login-tab" else gr.update(visible=False) ) for k in self._tabs.keys() ) + [gr.update(selected="login-tab")] with Session(engine) as session: user = session.exec(select(User).where(User.id == user_id)).first() if user is None: return list( ( gr.update(visible=True) if k == "login-tab" else gr.update(visible=False) ) for k in self._tabs.keys() ) is_admin = user.admin tabs_update = [] for k in self._tabs.keys(): if k == "login-tab": tabs_update.append(gr.update(visible=False)) elif k == "resources-tab": tabs_update.append(gr.update(visible=is_admin)) else: tabs_update.append(gr.update(visible=True)) tabs_update.append(gr.update(selected="chat-tab")) return tabs_update self.subscribe_event( name="onSignIn", definition={ "fn": toggle_login_visibility, "inputs": [self.user_id], "outputs": list(self._tabs.values()) + [self.tabs], "show_progress": "hidden", }, ) self.subscribe_event( name="onSignOut", definition={ "fn": toggle_login_visibility, "inputs": [self.user_id], "outputs": list(self._tabs.values()) + [self.tabs], "show_progress": "hidden", }, ) if KH_ENABLE_FIRST_SETUP: self.subscribe_event( name="onFirstSetupComplete", definition={ "fn": toggle_first_setup_visibility, "inputs": [], "outputs": [self.setup_page_wrapper, self.tabs], "show_progress": "hidden", 
}, ) def _on_app_created(self): """Called when the app is created""" if KH_ENABLE_FIRST_SETUP: self.app.load( toggle_first_setup_visibility, inputs=[], outputs=[self.setup_page_wrapper, self.tabs], ) ================================================ FILE: libs/ktem/ktem/mcp/__init__.py ================================================ # MCP (Model Context Protocol) integration for kotaemon ================================================ FILE: libs/ktem/ktem/mcp/db.py ================================================ from ktem.db.engine import engine from sqlalchemy import JSON, Column, String from sqlalchemy import inspect as sa_inspect from sqlalchemy.orm import DeclarativeBase class Base(DeclarativeBase): pass class BaseMCPTable(Base): """Base table to store MCP server configurations""" __abstract__ = True name = Column(String, primary_key=True, unique=True) config = Column(JSON, default={}) # Full JSON config for the MCP server class MCPTable(BaseMCPTable): __tablename__ = "mcp_table" # Drop and recreate to handle schema changes from old multi-column layout. _inspector = sa_inspect(engine) if _inspector.has_table("mcp_table"): _columns = {col["name"] for col in _inspector.get_columns("mcp_table")} if "config" not in _columns: MCPTable.__table__.drop(engine) # type: ignore[attr-defined] MCPTable.metadata.create_all(engine) ================================================ FILE: libs/ktem/ktem/mcp/manager.py ================================================ """Manager for MCP server configurations. Provides CRUD operations on the MCPTable. All tool building/discovery logic lives in kotaemon.agents.tools.mcp. 
""" import logging from sqlalchemy import select from sqlalchemy.orm import Session from .db import MCPTable, engine logger = logging.getLogger(__name__) class MCPManager: """Manages MCP server configurations stored in the database.""" def __init__(self): self._configs: dict[str, dict] = {} self.load() def load(self): """Reload configurations from the database.""" self._info = {} with Session(engine) as session: stmt = select(MCPTable) items = session.execute(stmt) for (item,) in items: self._info[item.name] = { "name": item.name, "config": item.config, } def info(self) -> dict: """Return all MCP server configurations.""" return self._info def get(self, name: str) -> dict | None: """Get a single configuration by name.""" return self._info.get(name) def add(self, name: str, config: dict): """Add a new MCP server configuration.""" name = name.strip() if not name: raise ValueError("Name must not be empty") with Session(engine) as session: item = MCPTable(name=name, config=config) session.add(item) session.commit() self.load() def update(self, name: str, config: dict): """Update an existing MCP server configuration.""" if not name: raise ValueError("Name must not be empty") with Session(engine) as session: item = session.query(MCPTable).filter_by(name=name).first() if not item: raise ValueError(f"MCP server '{name}' not found") item.config = config # type: ignore[assignment] session.commit() self.load() def delete(self, name: str): """Delete an MCP server configuration.""" with Session(engine) as session: item = session.query(MCPTable).filter_by(name=name).first() if item: session.delete(item) session.commit() self.load() def get_enabled_tools(self) -> list[str]: """Return tool choice names for all MCP servers.""" choices = [] for name, entry in self._info.items(): config = entry.get("config", {}) enabled_tools = config.get("enabled_tools", None) if enabled_tools is not None: choices.append(f"[MCP] {name}") return choices mcp_manager = MCPManager() 
================================================ FILE: libs/ktem/ktem/mcp/ui.py ================================================ import json import logging import gradio as gr import pandas as pd from ktem.app import BasePage from kotaemon.agents.tools.mcp import discover_tools_info, format_tool_list from .manager import mcp_manager logger = logging.getLogger(__name__) TOOLS_DEFAULT = "# Available Tools\n\nSelect or add an MCP server to view its tools." MCP_SERVERS_KEY = "mcpServers" EXAMPLE_CONFIG = """{ "mcpServers": { } }""" class MCPManagement(BasePage): def __init__(self, app): self._app = app self.on_building_ui() def on_building_ui(self): with gr.Tab(label="View"): self.mcp_list = gr.DataFrame( headers=["name", "config"], interactive=False, column_widths=[30, 70], ) with gr.Column(visible=False) as self._selected_panel: self.selected_mcp_name = gr.Textbox(value="", visible=False) with gr.Row(): with gr.Column(): self.edit_config = gr.Code( label="Configuration (JSON)", language="json", lines=10, ) with gr.Row(visible=False) as self._selected_panel_btn: with gr.Column(): self.btn_edit_save = gr.Button( "Save", min_width=10, variant="primary" ) with gr.Column(): self.btn_delete = gr.Button( "Delete", min_width=10, variant="stop" ) with gr.Row(): self.btn_delete_yes = gr.Button( "Confirm Delete", variant="stop", visible=False, min_width=10, ) self.btn_delete_no = gr.Button( "Cancel", visible=False, min_width=10 ) with gr.Column(): self.btn_close = gr.Button("Close", min_width=10) with gr.Column(): self.edit_tools_display = gr.Markdown(TOOLS_DEFAULT) with gr.Tab(label="Add"): with gr.Row(): with gr.Column(scale=2): self.config = gr.Code( label="Configuration (JSON)", language="json", lines=10, value=EXAMPLE_CONFIG, ) gr.HTML( "
" ) # Fix: Prevent the overflow of the gr.Code affect click button with gr.Row(): self.btn_new = gr.Button("Add MCP Servers", variant="primary") with gr.Column(scale=3): self.add_tools_display = gr.Markdown(TOOLS_DEFAULT) def _on_app_created(self): """Called when the app is created.""" self._app.app.load( self.list_servers, inputs=[], outputs=[self.mcp_list], ) def on_register_events(self): # Add new server — save first, then fetch tools async self.btn_new.click( self.create_server, inputs=[self.config], outputs=[self.add_tools_display], ).success(self.list_servers, inputs=[], outputs=[self.mcp_list]).then( self.fetch_tools_for_add, inputs=[self.config], outputs=[self.add_tools_display], ).then( lambda: EXAMPLE_CONFIG, outputs=[self.config], ) # Select a server from list self.mcp_list.select( self.select_server, inputs=self.mcp_list, outputs=[self.selected_mcp_name], show_progress="hidden", ) self.selected_mcp_name.change( self.on_selected_server_change, inputs=[self.selected_mcp_name], outputs=[ self._selected_panel, self._selected_panel_btn, self.btn_delete, self.btn_delete_yes, self.btn_delete_no, self.edit_config, self.edit_tools_display, ], show_progress="hidden", ).then( self.fetch_tools_for_view, inputs=[self.selected_mcp_name], outputs=[self.edit_tools_display], ) # Delete flow self.btn_delete.click( self.on_btn_delete_click, inputs=[], outputs=[self.btn_delete, self.btn_delete_yes, self.btn_delete_no], show_progress="hidden", ) self.btn_delete_yes.click( self.delete_server, inputs=[self.selected_mcp_name], outputs=[self.selected_mcp_name], show_progress="hidden", ).then(self.list_servers, inputs=[], outputs=[self.mcp_list]) self.btn_delete_no.click( lambda: ( gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), ), inputs=[], outputs=[self.btn_delete, self.btn_delete_yes, self.btn_delete_no], show_progress="hidden", ) # Save edits — save first, then refresh tools self.btn_edit_save.click( self.save_server, 
inputs=[self.selected_mcp_name, self.edit_config], outputs=[self.edit_tools_display], show_progress="hidden", ).then(self.list_servers, inputs=[], outputs=[self.mcp_list]).then( self.fetch_tools_for_view, inputs=[self.selected_mcp_name], outputs=[self.edit_tools_display], ) # Close panel self.btn_close.click(lambda: "", outputs=[self.selected_mcp_name]) # --- Handlers --- def _fetch_tools_markdown(self, config: dict) -> str: """Fetch tools from MCP server and return as formatted HTML.""" try: tool_infos = discover_tools_info(config) enabled_tools = config.get("enabled_tools", None) return format_tool_list(tool_infos, enabled_tools) except Exception as e: return f"❌ Failed to fetch tools: {e}" def create_server(self, config_str): """Create server(s), show loading placeholder.""" try: configs = json.loads(config_str) except json.JSONDecodeError as e: raise gr.Error(f"Invalid JSON: {e}") if not isinstance(configs, dict) or MCP_SERVERS_KEY not in configs: raise gr.Error( f"Config must be a dictionary with '{MCP_SERVERS_KEY}' root key." ) mcp_servers = configs[MCP_SERVERS_KEY] if not isinstance(mcp_servers, dict): raise gr.Error( f"'{MCP_SERVERS_KEY}' must be a mapping of server names to configs." ) # Validate that no names are empty before processing for name in mcp_servers: name = name.strip() if not name: raise gr.Error("Server names cannot be empty.") success_count = 0 failed_count = 0 msgs = [] for name, config in mcp_servers.items(): name = name.strip() if name in mcp_manager.info(): gr.Warning(f"MCP server '{name}' already exists. 
Skipping.") failed_count += 1 continue try: mcp_manager.add(name, config) success_count += 1 msgs.append(f"# Tools for '{name}'\n\n⏳ Fetching tools...") except Exception as e: gr.Warning(f"Failed to create MCP server '{name}': {e}") failed_count += 1 if success_count > 0: gr.Info(f"{success_count} MCP server(s) created successfully") if not msgs: return TOOLS_DEFAULT return "\n\n".join(msgs) def fetch_tools_for_add(self, config_str): """Fetch tools after server was added (chained call).""" if not config_str: return TOOLS_DEFAULT try: configs = json.loads(config_str) except json.JSONDecodeError: return "❌ Invalid JSON config" if not isinstance(configs, dict) or MCP_SERVERS_KEY not in configs: return f"❌ Config must be a dictionary with '{MCP_SERVERS_KEY}' root key" mcp_servers = configs[MCP_SERVERS_KEY] if not isinstance(mcp_servers, dict): return f"❌ '{MCP_SERVERS_KEY}' must be a dictionary" msgs = [] for name, config in mcp_servers.items(): msgs.append( f"# Tools for '{name.strip()}'\n\n{self._fetch_tools_markdown(config)}" ) return "\n\n".join(msgs) def fetch_tools_for_view(self, selected_name): """Fetch tools for the View panel (chained call).""" if not selected_name: return TOOLS_DEFAULT entry = mcp_manager.info().get(selected_name) if not entry: return TOOLS_DEFAULT config = entry.get("config", {}) return f"# Tools for '{selected_name}'\n\n{self._fetch_tools_markdown(config)}" def list_servers(self): items = [] for entry in mcp_manager.info().values(): items.append( { "name": entry["name"], "config": json.dumps(entry.get("config", {})), } ) if items: return pd.DataFrame.from_records(items) return pd.DataFrame.from_records([{"name": "-", "config": "-"}]) def select_server(self, mcp_list, ev: gr.SelectData): if ev.value == "-" and ev.index[0] == 0: gr.Info("No MCP server configured. 
Please add one first.") return "" if not ev.selected: return "" return mcp_list["name"][ev.index[0]] def on_selected_server_change(self, selected_name): if selected_name == "": return ( gr.update(visible=False), # panel gr.update(visible=False), # buttons gr.update(visible=True), # delete gr.update(visible=False), # delete_yes gr.update(visible=False), # delete_no gr.update(value="{}"), # config gr.update(value=TOOLS_DEFAULT), # tools display ) entry = mcp_manager.info()[selected_name] config = entry.get("config", {}) config_str = json.dumps(config, indent=2) return ( gr.update(visible=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value=config_str), gr.update(value=f"# Tools for '{selected_name}'\n\n⏳ Fetching tools..."), ) def on_btn_delete_click(self): return ( gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), ) def delete_server(self, selected_name): try: mcp_manager.delete(selected_name) gr.Info(f"MCP server '{selected_name}' deleted successfully") except Exception as e: gr.Error(f"Failed to delete MCP server '{selected_name}': {e}") return selected_name return "" def save_server(self, selected_name, config_str): try: config = json.loads(config_str) except json.JSONDecodeError as e: raise gr.Error(f"Invalid JSON: {e}") try: mcp_manager.update(selected_name, config) gr.Info(f"MCP server '{selected_name}' saved successfully") except Exception as e: raise gr.Error(f"Failed to save MCP server '{selected_name}': {e}") # Show loading placeholder; tools fetched in chained .then() return f"# Tools for '{selected_name}'\n\n⏳ Refreshing tools..." 
================================================ FILE: libs/ktem/ktem/pages/__init__.py ================================================ ================================================ FILE: libs/ktem/ktem/pages/chat/__init__.py ================================================ import asyncio import json import re from copy import deepcopy from typing import Optional import gradio as gr from decouple import config from ktem.app import BasePage from ktem.components import reasonings from ktem.db.models import Conversation, engine from ktem.index.file.ui import File from ktem.reasoning.prompt_optimization.mindmap import MINDMAP_HTML_EXPORT_TEMPLATE from ktem.reasoning.prompt_optimization.suggest_conversation_name import ( SuggestConvNamePipeline, ) from ktem.reasoning.prompt_optimization.suggest_followup_chat import ( SuggestFollowupQuesPipeline, ) from plotly.io import from_json from sqlmodel import Session, select from theflow.settings import settings as flowsettings from theflow.utils.modules import import_dotted_string from kotaemon.base import Document from kotaemon.indices.ingests.files import KH_DEFAULT_FILE_EXTRACTORS from kotaemon.indices.qa.utils import strip_think_tag from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex, get_urls from ...utils.commands import WEB_SEARCH_COMMAND from ...utils.hf_papers import get_recommended_papers from ...utils.rate_limit import check_rate_limit from .chat_panel import ChatPanel from .chat_suggestion import ChatSuggestion from .common import STATE from .control import ConversationControl from .demo_hint import HintPage from .paper_list import PaperListPage from .report import ReportIssue KH_DEMO_MODE = getattr(flowsettings, "KH_DEMO_MODE", False) KH_SSO_ENABLED = getattr(flowsettings, "KH_SSO_ENABLED", False) KH_WEB_SEARCH_BACKEND = getattr(flowsettings, "KH_WEB_SEARCH_BACKEND", None) WebSearch = None if KH_WEB_SEARCH_BACKEND: try: WebSearch = import_dotted_string(KH_WEB_SEARCH_BACKEND, safe=False) except 
(ImportError, AttributeError) as e: print(f"Error importing {KH_WEB_SEARCH_BACKEND}: {e}") REASONING_LIMITS = 2 if KH_DEMO_MODE else 10 DEFAULT_SETTING = "(default)" INFO_PANEL_SCALES = {True: 8, False: 4} DEFAULT_QUESTION = ( "What is the summary of this document?" if not KH_DEMO_MODE else "What is the summary of this paper?" ) chat_input_focus_js = """ function() { let chatInput = document.querySelector("#chat-input textarea"); chatInput.focus(); } """ quick_urls_submit_js = """ function() { let urlInput = document.querySelector("#quick-url-demo textarea"); console.log("URL input:", urlInput); urlInput.dispatchEvent(new KeyboardEvent('keypress', {'key': 'Enter'})); } """ recommended_papers_js = """ function() { // Get all links and attach click event var links = document.querySelectorAll("#related-papers a"); function submitPaper(event) { event.preventDefault(); var target = event.currentTarget; var url = target.getAttribute("href"); console.log("URL:", url); let newChatButton = document.querySelector("#new-conv-button"); newChatButton.click(); setTimeout(() => { let urlInput = document.querySelector("#quick-url-demo textarea"); // Fill the URL input urlInput.value = url; urlInput.dispatchEvent(new Event("input", { bubbles: true })); urlInput.dispatchEvent(new KeyboardEvent('keypress', {'key': 'Enter'})); }, 500 ); } for (var i = 0; i < links.length; i++) { links[i].onclick = submitPaper; } } """ clear_bot_message_selection_js = """ function() { var bot_messages = document.querySelectorAll( "div#main-chat-bot div.message-row.bot-row" ); bot_messages.forEach(message => { message.classList.remove("text_selection"); }); } """ pdfview_js = """ function() { setTimeout(fullTextSearch(), 100); // Get all links and attach click event var links = document.getElementsByClassName("pdf-link"); for (var i = 0; i < links.length; i++) { links[i].onclick = openModal; } // Get all citation links and attach click event var links = document.querySelectorAll("a.citation"); for (var 
i = 0; i < links.length; i++) { links[i].onclick = scrollToCitation; } var markmap_div = document.querySelector("div.markmap"); var mindmap_el_script = document.querySelector('div.markmap script'); if (mindmap_el_script) { markmap_div_html = markmap_div.outerHTML; } // render the mindmap if the script tag is present if (mindmap_el_script) { markmap.autoLoader.renderAll(); } setTimeout(() => { var mindmap_el = document.querySelector('svg.markmap'); var text_nodes = document.querySelectorAll("svg.markmap div"); for (var i = 0; i < text_nodes.length; i++) { text_nodes[i].onclick = fillChatInput; } if (mindmap_el) { function on_svg_export(event) { html = "{html_template}"; html = html.replace("{markmap_div}", markmap_div_html); spawnDocument(html, {window: "width=1000,height=1000"}); } var link = document.getElementById("mindmap-toggle"); if (link) { link.onclick = function(event) { event.preventDefault(); // Prevent the default link behavior var div = document.querySelector("div.markmap"); if (div) { var currentHeight = div.style.height; if (currentHeight === '400px' || (currentHeight === '')) { div.style.height = '650px'; } else { div.style.height = '400px' } } }; } if (markmap_div_html) { var link = document.getElementById("mindmap-export"); if (link) { link.addEventListener('click', on_svg_export); } } } }, 250); return [links.length] } """.replace( "{html_template}", MINDMAP_HTML_EXPORT_TEMPLATE.replace("\n", "").replace('"', '\\"'), ) fetch_api_key_js = """ function(_, __) { api_key = getStorage('google_api_key', ''); console.log('session API key:', api_key); return [api_key, _]; } """ class ChatPage(BasePage): def __init__(self, app): self._app = app self._indices_input = [] self.on_building_ui() self._preview_links = gr.State(value=None) self._reasoning_type = gr.State(value=None) self._conversation_renamed = gr.State(value=False) self._use_suggestion = gr.State( value=getattr(flowsettings, "KH_FEATURE_CHAT_SUGGESTION", False) ) self._info_panel_expanded = 
gr.State(value=True) self._command_state = gr.State(value=None) self._user_api_key = gr.Text(value="", visible=False) def on_building_ui(self): with gr.Row(): self.state_chat = gr.State(STATE) self.state_retrieval_history = gr.State([]) self.state_plot_history = gr.State([]) self.state_plot_panel = gr.State(None) self.first_selector_choices = gr.State(None) with gr.Column(scale=1, elem_id="conv-settings-panel") as self.conv_column: self.chat_control = ConversationControl(self._app) for index_id, index in enumerate(self._app.index_manager.indices): index.selector = None index_ui = index.get_selector_component_ui() if not index_ui: # the index doesn't have a selector UI component continue index_ui.unrender() # need to rerender later within Accordion is_first_index = index_id == 0 index_name = index.name if KH_DEMO_MODE and is_first_index: index_name = "Select from Paper Collection" with gr.Accordion( label=index_name, open=is_first_index, elem_id=f"index-{index_id}", ): index_ui.render() gr_index = index_ui.as_gradio_component() # get the file selector choices for the first index if index_id == 0: self.first_selector_choices = index_ui.selector_choices self.first_indexing_url_fn = None if gr_index: if isinstance(gr_index, list): index.selector = tuple( range( len(self._indices_input), len(self._indices_input) + len(gr_index), ) ) index.default_selector = index_ui.default() self._indices_input.extend(gr_index) else: index.selector = len(self._indices_input) index.default_selector = index_ui.default() self._indices_input.append(gr_index) setattr(self, f"_index_{index.id}", index_ui) self.chat_suggestion = ChatSuggestion(self._app) if len(self._app.index_manager.indices) > 0: quick_upload_label = ( "Quick Upload" if not KH_DEMO_MODE else "Or input new paper URL" ) with gr.Accordion(label=quick_upload_label) as _: self.quick_file_upload_status = gr.Markdown() if not KH_DEMO_MODE: self.quick_file_upload = File( file_types=list(KH_DEFAULT_FILE_EXTRACTORS.keys()), 
file_count="multiple", container=True, show_label=False, elem_id="quick-file", ) self.quick_urls = gr.Textbox( placeholder=( "Or paste URLs" if not KH_DEMO_MODE else "Paste Arxiv URLs\n(https://arxiv.org/abs/xxx)" ), lines=1, container=False, show_label=False, elem_id=( "quick-url" if not KH_DEMO_MODE else "quick-url-demo" ), ) if not KH_DEMO_MODE: self.report_issue = ReportIssue(self._app) else: with gr.Accordion(label="Related papers", open=False): self.related_papers = gr.Markdown(elem_id="related-papers") self.hint_page = HintPage(self._app) with gr.Column(scale=6, elem_id="chat-area"): if KH_DEMO_MODE: self.paper_list = PaperListPage(self._app) self.chat_panel = ChatPanel(self._app) with gr.Accordion( label="Chat settings", elem_id="chat-settings-expand", open=False, visible=not KH_DEMO_MODE, ) as self.chat_settings: with gr.Row(elem_id="quick-setting-labels"): gr.HTML("Reasoning method") gr.HTML( "Model", visible=not KH_DEMO_MODE and not KH_SSO_ENABLED ) gr.HTML("Language") with gr.Row(): reasoning_setting = ( self._app.default_settings.reasoning.settings["use"] ) model_setting = self._app.default_settings.reasoning.options[ "simple" ].settings["llm"] language_setting = ( self._app.default_settings.reasoning.settings["lang"] ) citation_setting = self._app.default_settings.reasoning.options[ "simple" ].settings["highlight_citation"] self.reasoning_type = gr.Dropdown( choices=reasoning_setting.choices[:REASONING_LIMITS], value=reasoning_setting.value, container=False, show_label=False, ) self.model_type = gr.Dropdown( choices=model_setting.choices, value=model_setting.value, container=False, show_label=False, visible=not KH_DEMO_MODE and not KH_SSO_ENABLED, ) self.language = gr.Dropdown( choices=language_setting.choices, value=language_setting.value, container=False, show_label=False, ) self.citation = gr.Dropdown( choices=citation_setting.choices, value=citation_setting.value, container=False, show_label=False, interactive=True, elem_id="citation-dropdown", ) 
if not config("USE_LOW_LLM_REQUESTS", default=False, cast=bool): self.use_mindmap = gr.State(value=True) self.use_mindmap_check = gr.Checkbox( label="Mindmap (on)", container=False, elem_id="use-mindmap-checkbox", value=True, ) else: self.use_mindmap = gr.State(value=False) self.use_mindmap_check = gr.Checkbox( label="Mindmap (off)", container=False, elem_id="use-mindmap-checkbox", value=False, ) with gr.Column( scale=INFO_PANEL_SCALES[False], elem_id="chat-info-panel" ) as self.info_column: with gr.Accordion( label="Information panel", open=True, elem_id="info-expand" ): self.modal = gr.HTML("
") self.plot_panel = gr.Plot(visible=False) self.info_panel = gr.HTML(elem_id="html-info-panel") self.followup_questions = self.chat_suggestion.examples self.followup_questions_ui = self.chat_suggestion.accordion def _json_to_plot(self, json_dict: dict | None): if json_dict: plot = from_json(json_dict) plot = gr.update(visible=True, value=plot) else: plot = gr.update(visible=False) return plot def on_register_events(self): # first index paper recommendation if KH_DEMO_MODE and len(self._indices_input) > 0: self._indices_input[1].change( self.get_recommendations, inputs=[self.first_selector_choices, self._indices_input[1]], outputs=[self.related_papers], ).then( fn=None, inputs=None, outputs=None, js=recommended_papers_js, ) chat_event = ( gr.on( triggers=[ self.chat_panel.text_input.submit, ], fn=self.submit_msg, inputs=[ self.chat_panel.text_input, self.chat_panel.chatbot, self._app.user_id, self._app.settings_state, self.chat_control.conversation_id, self.chat_control.conversation_rn, self.first_selector_choices, ], outputs=[ self.chat_panel.text_input, self.chat_panel.chatbot, self.chat_control.conversation_id, self.chat_control.conversation, self.chat_control.conversation_rn, # file selector from the first index self._indices_input[0], self._indices_input[1], self._command_state, ], concurrency_limit=20, show_progress="hidden", ) .success( fn=self.chat_fn, inputs=[ self.chat_control.conversation_id, self.chat_panel.chatbot, self._app.settings_state, self._reasoning_type, self.model_type, self.use_mindmap, self.citation, self.language, self.state_chat, self._command_state, self._app.user_id, ] + self._indices_input, outputs=[ self.chat_panel.chatbot, self.info_panel, self.plot_panel, self.state_plot_panel, self.state_chat, ], concurrency_limit=20, show_progress="minimal", ) .then( fn=lambda: True, inputs=None, outputs=[self._preview_links], js=pdfview_js, ) .success( fn=self.check_and_suggest_name_conv, inputs=self.chat_panel.chatbot, outputs=[ 
self.chat_control.conversation_rn, self._conversation_renamed, ], ) .success( self.chat_control.rename_conv, inputs=[ self.chat_control.conversation_id, self.chat_control.conversation_rn, self._conversation_renamed, self._app.user_id, ], outputs=[ self.chat_control.conversation, self.chat_control.conversation, self.chat_control.conversation_rn, ], show_progress="hidden", ) ) onSuggestChatEvent = { "fn": self.suggest_chat_conv, "inputs": [ self._app.settings_state, self.language, self.chat_panel.chatbot, self._use_suggestion, ], "outputs": [ self.followup_questions_ui, self.followup_questions, ], "show_progress": "hidden", } # chat suggestion toggle chat_event = chat_event.success(**onSuggestChatEvent) # final data persist if not KH_DEMO_MODE: chat_event = chat_event.then( fn=self.persist_data_source, inputs=[ self.chat_control.conversation_id, self._app.user_id, self.info_panel, self.state_plot_panel, self.state_retrieval_history, self.state_plot_history, self.chat_panel.chatbot, self.state_chat, ] + self._indices_input, outputs=[ self.state_retrieval_history, self.state_plot_history, ], concurrency_limit=20, ) self.chat_control.btn_info_expand.click( fn=lambda is_expanded: ( gr.update(scale=INFO_PANEL_SCALES[is_expanded]), not is_expanded, ), inputs=self._info_panel_expanded, outputs=[self.info_column, self._info_panel_expanded], ) self.chat_control.btn_chat_expand.click( fn=None, inputs=None, js="function() {toggleChatColumn();}" ) if KH_DEMO_MODE: self.chat_control.btn_demo_logout.click( fn=None, js=self.chat_control.logout_js, ) self.chat_control.btn_new.click( fn=lambda: self.chat_control.select_conv("", None), outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, self.chat_control.conversation_rn, self.chat_panel.chatbot, self.followup_questions, self.info_panel, self.state_plot_panel, self.state_retrieval_history, self.state_plot_history, self.chat_control.cb_is_public, self.state_chat, ] + self._indices_input, ).then( lambda: 
(gr.update(visible=False), gr.update(visible=True)), outputs=[self.paper_list.accordion, self.chat_settings], ).then( fn=None, inputs=None, js=chat_input_focus_js, ) if not KH_DEMO_MODE: self.chat_control.btn_new.click( self.chat_control.new_conv, inputs=self._app.user_id, outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, ], show_progress="hidden", ).then( self.chat_control.select_conv, inputs=[self.chat_control.conversation, self._app.user_id], outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, self.chat_control.conversation_rn, self.chat_panel.chatbot, self.followup_questions, self.info_panel, self.state_plot_panel, self.state_retrieval_history, self.state_plot_history, self.chat_control.cb_is_public, self.state_chat, ] + self._indices_input, show_progress="hidden", ).then( fn=self._json_to_plot, inputs=self.state_plot_panel, outputs=self.plot_panel, ).then( fn=None, inputs=None, js=chat_input_focus_js, ) self.chat_control.btn_del.click( lambda id: self.toggle_delete(id), inputs=[self.chat_control.conversation_id], outputs=[ self.chat_control._new_delete, self.chat_control._delete_confirm, ], ) self.chat_control.btn_del_conf.click( self.chat_control.delete_conv, inputs=[self.chat_control.conversation_id, self._app.user_id], outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, ], show_progress="hidden", ).then( self.chat_control.select_conv, inputs=[self.chat_control.conversation, self._app.user_id], outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, self.chat_control.conversation_rn, self.chat_panel.chatbot, self.followup_questions, self.info_panel, self.state_plot_panel, self.state_retrieval_history, self.state_plot_history, self.chat_control.cb_is_public, self.state_chat, ] + self._indices_input, show_progress="hidden", ).then( fn=self._json_to_plot, inputs=self.state_plot_panel, outputs=self.plot_panel, ).then( lambda: self.toggle_delete(""), outputs=[ 
self.chat_control._new_delete, self.chat_control._delete_confirm, ], ) self.chat_control.btn_del_cnl.click( lambda: self.toggle_delete(""), outputs=[ self.chat_control._new_delete, self.chat_control._delete_confirm, ], ) self.chat_control.btn_conversation_rn.click( lambda: gr.update(visible=True), outputs=[ self.chat_control.conversation_rn, ], ) self.chat_control.conversation_rn.submit( self.chat_control.rename_conv, inputs=[ self.chat_control.conversation_id, self.chat_control.conversation_rn, gr.State(value=True), self._app.user_id, ], outputs=[ self.chat_control.conversation, self.chat_control.conversation, self.chat_control.conversation_rn, ], show_progress="hidden", ) onConvSelect = ( self.chat_control.conversation.select( self.chat_control.select_conv, inputs=[self.chat_control.conversation, self._app.user_id], outputs=[ self.chat_control.conversation_id, self.chat_control.conversation, self.chat_control.conversation_rn, self.chat_panel.chatbot, self.followup_questions, self.info_panel, self.state_plot_panel, self.state_retrieval_history, self.state_plot_history, self.chat_control.cb_is_public, self.state_chat, ] + self._indices_input, show_progress="hidden", ) .then( fn=self._json_to_plot, inputs=self.state_plot_panel, outputs=self.plot_panel, ) .then( lambda: self.toggle_delete(""), outputs=[ self.chat_control._new_delete, self.chat_control._delete_confirm, ], ) ) if KH_DEMO_MODE: onConvSelect = onConvSelect.then( lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[self.paper_list.accordion, self.chat_settings], ) onConvSelect = ( onConvSelect.then( fn=lambda: True, js=clear_bot_message_selection_js, ) .then( fn=lambda: True, inputs=None, outputs=[self._preview_links], js=pdfview_js, ) .then(fn=None, inputs=None, outputs=None, js=chat_input_focus_js) ) if not KH_DEMO_MODE: # evidence display on message selection self.chat_panel.chatbot.select( self.message_selected, inputs=[ self.state_retrieval_history, self.state_plot_history, ], 
outputs=[ self.info_panel, self.state_plot_panel, ], ).then( fn=self._json_to_plot, inputs=self.state_plot_panel, outputs=self.plot_panel, ).then( fn=lambda: True, inputs=None, outputs=[self._preview_links], js=pdfview_js, ) self.chat_control.cb_is_public.change( self.on_set_public_conversation, inputs=[self.chat_control.cb_is_public, self.chat_control.conversation], outputs=None, show_progress="hidden", ) if not KH_DEMO_MODE: # user feedback events self.chat_panel.chatbot.like( fn=self.is_liked, inputs=[self.chat_control.conversation_id], outputs=None, ) self.report_issue.report_btn.click( self.report_issue.report, inputs=[ self.report_issue.correctness, self.report_issue.issues, self.report_issue.more_detail, self.chat_control.conversation_id, self.chat_panel.chatbot, self._app.settings_state, self._app.user_id, self.info_panel, self.state_chat, ] + self._indices_input, outputs=None, ) self.reasoning_type.change( self.reasoning_changed, inputs=[self.reasoning_type], outputs=[self._reasoning_type], ) self.use_mindmap_check.change( lambda x: (x, gr.update(label="Mindmap " + ("(on)" if x else "(off)"))), inputs=[self.use_mindmap_check], outputs=[self.use_mindmap, self.use_mindmap_check], show_progress="hidden", ) def toggle_chat_suggestion(current_state): return current_state, gr.update(visible=current_state) def raise_error_on_state(state): if not state: raise ValueError("Chat suggestion disabled") self.chat_control.cb_suggest_chat.change( fn=toggle_chat_suggestion, inputs=[self.chat_control.cb_suggest_chat], outputs=[self._use_suggestion, self.followup_questions_ui], show_progress="hidden", ).then( fn=raise_error_on_state, inputs=[self._use_suggestion], show_progress="hidden", ).success( **onSuggestChatEvent ) self.chat_control.conversation_id.change( lambda: gr.update(visible=False), outputs=self.plot_panel, ) self.followup_questions.select( self.chat_suggestion.select_example, outputs=[self.chat_panel.text_input], show_progress="hidden", ).then( fn=None, 
inputs=None, outputs=None, js=chat_input_focus_js, ) if KH_DEMO_MODE: self.paper_list.examples.select( self.paper_list.select_example, inputs=[self.paper_list.papers_state], outputs=[self.quick_urls], show_progress="hidden", ).then( lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[self.paper_list.accordion, self.chat_settings], ).then( fn=None, inputs=None, outputs=None, js=quick_urls_submit_js, ) def submit_msg( self, chat_input, chat_history, user_id, settings, conv_id, conv_name, first_selector_choices, request: gr.Request, ): """Submit a message to the chatbot""" if KH_DEMO_MODE: sso_user_id = check_rate_limit("chat", request) print("User ID:", sso_user_id) if not chat_input: raise ValueError("Input is empty") chat_input_text = chat_input.get("text", "") file_ids = [] used_command = None first_selector_choices_map = { item[0]: item[1] for item in first_selector_choices } # get all file names with pattern @"filename" in input_str file_names, chat_input_text = get_file_names_regex(chat_input_text) # check if web search command is in file_names if WEB_SEARCH_COMMAND in file_names: used_command = WEB_SEARCH_COMMAND # get all urls in input_str urls, chat_input_text = get_urls(chat_input_text) if urls and self.first_indexing_url_fn: print("Detected URLs", urls) file_ids = self.first_indexing_url_fn( "\n".join(urls), True, settings, user_id, request=None, ) elif file_names: for file_name in file_names: file_id = first_selector_choices_map.get(file_name) if file_id: file_ids.append(file_id) # add new file ids to the first selector choices first_selector_choices.extend(zip(urls, file_ids)) # if file_ids is not empty and chat_input_text is empty # set the input to summary if not chat_input_text and file_ids: chat_input_text = DEFAULT_QUESTION # if start of conversation and no query is specified if not chat_input_text and not chat_history: chat_input_text = DEFAULT_QUESTION if file_ids: selector_output = [ "select", gr.update(value=file_ids, 
def get_recommendations(self, first_selector_choices, file_ids):
    """Return paper recommendations derived from the first selected file.

    Args:
        first_selector_choices: iterable of pairs whose first item is the
            file name and whose second item is the file id.
        file_ids: ids of the currently selected files.

    Returns:
        The result of ``get_recommended_papers`` for the first selected file
        whose id is known, or ``""`` when no selected id can be resolved to
        a file name.
    """
    id_to_name = {item[1]: item[0] for item in first_selector_choices}

    # The selection and the list of choices are tracked separately, so an id
    # may be missing from the choices; skip it instead of raising KeyError.
    file_names = [
        id_to_name[file_id] for file_id in file_ids if file_id in id_to_name
    ]
    if not file_names:
        return ""

    # Turn "some_paper_title.pdf" into the search query "some paper title".
    first_file_name = file_names[0].split(".")[0].replace("_", " ")
    return get_recommended_papers(first_file_name)
def on_subscribe_public_events(self):
    """Hook the chat page into the app-wide sign-in / sign-out events.

    Nothing is registered unless the app has user management enabled.
    On sign-in the conversation list is reloaded for the user; on sign-out
    the page is reset by selecting an empty conversation.
    """
    if not self._app.f_user_management:
        return

    sign_in_handler = {
        "fn": self.chat_control.reload_conv,
        "inputs": [self._app.user_id],
        "outputs": [self.chat_control.conversation],
        "show_progress": "hidden",
    }
    self._app.subscribe_event(name="onSignIn", definition=sign_in_handler)

    # Components refreshed when the (empty) conversation gets selected.
    reset_targets = [
        self.chat_control.conversation_id,
        self.chat_control.conversation,
        self.chat_control.conversation_rn,
        self.chat_panel.chatbot,
        self.followup_questions,
        self.info_panel,
        self.state_plot_panel,
        self.state_retrieval_history,
        self.state_plot_history,
        self.chat_control.cb_is_public,
        self.state_chat,
    ]
    sign_out_handler = {
        "fn": lambda: self.chat_control.select_conv("", None),
        "outputs": reset_targets + self._indices_input,
        "show_progress": "hidden",
    }
    self._app.subscribe_event(name="onSignOut", definition=sign_out_handler)
def message_selected(self, retrieval_history, plot_history, msg: gr.SelectData):
    """Return the evidence and plot data stored for the selected message.

    Args:
        retrieval_history: per-turn retrieval (evidence) content.
        plot_history: per-turn plot data.
        msg: selection event; ``msg.index[0]`` is used as the position to
            look up in both histories.

    Returns:
        A ``(retrieval_content, plot_content)`` tuple. A history that has no
        entry at the selected position yields ``gr.update()`` (leave the
        info panel untouched) for the retrieval part and ``None`` for the
        plot part.
    """
    index = msg.index[0]

    # The two histories may differ in length (e.g. a conversation saved
    # without any plot history), so look them up independently: a missing
    # plot entry must not discard evidence that does exist.
    try:
        retrieval_content = retrieval_history[index]
    except IndexError:
        retrieval_content = gr.update()

    try:
        plot_content = plot_history[index]
    except IndexError:
        plot_content = None

    return retrieval_content, plot_content
def chat_fn(
    self,
    conversation_id,
    chat_history,
    settings,
    reasoning_type,
    llm_type,
    use_mind_map,
    use_citation,
    language,
    chat_state,
    command_state,
    user_id,
    *selecteds,
):
    """Answer the last turn of ``chat_history`` and stream the result.

    The last history entry is split off as the turn being answered; when it
    already carries an answer the call is treated as a regeneration and
    ``chat_state["app"]["regen"]`` is set. A reasoning pipeline is built via
    ``create_pipeline`` and its ``stream`` output is folded into the running
    answer text ("chat" channel), the references ("info" channel) and the
    plot ("plot" channel).

    Yields:
        ``(history, refs, plot_component, plot_data, chat_state)`` tuples:
        once with a placeholder before streaming, once per accepted pipeline
        response, and once more with an "empty answer" placeholder when no
        text was produced.
    """
    last_turn = chat_history[-1]
    chat_input, chat_output = last_turn
    earlier_turns = chat_history[:-1]

    # an already answered last turn means the user asked to regenerate it
    if chat_output:
        chat_state["app"]["regen"] = True

    queue: asyncio.Queue[Optional[dict]] = asyncio.Queue()

    # build the reasoning pipeline for this request
    pipeline, reasoning_state = self.create_pipeline(
        settings,
        reasoning_type,
        llm_type,
        use_mind_map,
        use_citation,
        language,
        chat_state,
        command_state,
        user_id,
        *selecteds,
    )
    print("Reasoning state", reasoning_state)
    pipeline.set_output_queue(queue)

    text = ""
    refs = ""
    plot = None
    plot_gr = gr.update(visible=False)

    def snapshot(fallback):
        # current UI state; `fallback` is shown while there is no answer text
        return (
            earlier_turns + [(chat_input, text or fallback)],
            refs,
            plot_gr,
            plot,
            chat_state,
        )

    msg_placeholder = getattr(
        flowsettings, "KH_CHAT_MSG_PLACEHOLDER", "Thinking ..."
    )
    print(msg_placeholder)
    yield snapshot(msg_placeholder)

    try:
        for response in pipeline.stream(chat_input, conversation_id, earlier_turns):
            # only channel-tagged Document objects update the UI
            if not isinstance(response, Document) or response.channel is None:
                continue

            channel = response.channel
            if channel == "chat":
                # a None content resets the accumulated answer
                text = "" if response.content is None else text + response.content
            elif channel == "info":
                # a None content resets the accumulated references
                refs = "" if response.content is None else refs + response.content
            elif channel == "plot":
                plot = response.content
                plot_gr = self._json_to_plot(plot)

            chat_state[pipeline.get_info()["id"]] = reasoning_state["pipeline"]
            yield snapshot(msg_placeholder)
    except ValueError as e:
        print(e)

    if not text:
        empty_msg = getattr(
            flowsettings, "KH_CHAT_EMPTY_MSG_PLACEHOLDER", "(Sorry, I don't know)"
        )
        print(f"Generate nothing: {empty_msg}")
        yield snapshot(empty_msg)
def suggest_chat_conv(
    self,
    settings,
    session_language,
    chat_history,
    use_suggestion,
):
    """Produce follow-up question suggestions for the current chat.

    The language is the session language unless that is unset
    (``DEFAULT_SETTING`` or ``None``), in which case
    ``settings["reasoning.lang"]`` is used.

    Returns:
        ``(visibility_update, questions)``. With suggestions disabled the
        panel is hidden and the questions are left untouched. Otherwise the
        panel is shown with either the questions parsed from the pipeline
        answer or the default ``ChatSuggestion.CHAT_SAMPLES``.
    """
    if session_language in (DEFAULT_SETTING, None):
        target_language = settings["reasoning.lang"]
    else:
        target_language = session_language

    if not use_suggestion:
        return gr.update(visible=False), gr.update()

    suggest_pipeline = SuggestFollowupQuesPipeline()
    suggest_pipeline.lang = SUPPORTED_LANGUAGE_MAP.get(target_language, "English")

    # defaults, replaced only when the model answer contains a parsable list
    suggested_questions = [[sample] for sample in ChatSuggestion.CHAT_SAMPLES]

    if len(chat_history) > 0:
        suggested_resp = suggest_pipeline(chat_history).text
        flattened = re.sub("\n", "", suggested_resp)
        # first "[...]" group in the answer is expected to be a JSON list
        list_match = re.search(r"\[(.*?)\]", flattened)
        if list_match:
            try:
                parsed = json.loads(list_match.group())
                suggested_questions = [[question] for question in parsed]
            except Exception:
                # best effort: keep the default samples on any parse failure
                pass

    return gr.update(visible=True), suggested_questions
class ChatSuggestion(BasePage):
    """Accordion listing suggested next questions for the chat."""

    # Default suggestions; overridable via KH_FEATURE_CHAT_SUGGESTION_SAMPLES.
    CHAT_SAMPLES = getattr(
        flowsettings,
        "KH_FEATURE_CHAT_SUGGESTION_SAMPLES",
        [
            "Summary this document",
            "Generate a FAQ for this document",
            "Identify the main highlights in bullet points",
        ],
    )

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Create the accordion, the default-samples state and the table."""
        # one single-column row per sample question
        rows = [[sample] for sample in self.CHAT_SAMPLES]
        self.chat_samples = rows

        show_accordion = getattr(flowsettings, "KH_FEATURE_CHAT_SUGGESTION", False)
        with gr.Accordion(
            label="Chat Suggestion",
            visible=show_accordion,
        ) as self.accordion:
            self.default_example = gr.State(value=self.chat_samples)
            self.examples = gr.DataFrame(
                value=self.chat_samples,
                headers=["Next Question"],
                interactive=False,
                elem_id="chat-suggestion",
                wrap=True,
            )

    def as_gradio_component(self):
        """Return the table holding the suggested questions."""
        return self.examples

    def select_example(self, ev: gr.SelectData):
        """Wrap the selected cell value as ``{"text": value}``."""
        selected_text = ev.value
        return {"text": selected_text}
def is_conv_name_valid(name):
    """Validate a conversation name.

    Returns:
        An empty string when the name is acceptable, otherwise a message
        describing the problem (empty name, or more than 40 characters).
    """
    name_length = len(name)
    if name_length == 0:
        return "Name cannot be empty"
    if name_length > 40:
        return "Name cannot be longer than 40 characters"
    return ""
self.btn_toggle_dark_mode.click( None, js=""" () => { document.body.classList.toggle('dark'); } """, ) self.conversation_id = gr.State(value="") self.conversation = gr.Dropdown( label="Chat sessions", choices=[], container=False, filterable=True, interactive=True, elem_classes=["unset-overflow"], elem_id="conversation-dropdown", ) with gr.Row() as self._new_delete: self.cb_suggest_chat = gr.Checkbox( value=False, label="Suggest chat", min_width=10, scale=6, elem_id="suggest-chat-checkbox", container=False, visible=not KH_DEMO_MODE, ) self.cb_is_public = gr.Checkbox( value=False, label="Share this conversation", elem_id="is-public-checkbox", container=False, visible=not KH_DEMO_MODE and not KH_SSO_ENABLED, ) if not KH_DEMO_MODE: self.btn_conversation_rn = gr.Button( value="", icon=f"{ASSETS_DIR}/rename.svg", min_width=2, scale=1, size="sm", elem_classes=["no-background", "body-text-color"], ) self.btn_del = gr.Button( value="", icon=f"{ASSETS_DIR}/delete.svg", min_width=2, scale=1, size="sm", elem_classes=["no-background", "body-text-color"], ) self.btn_new = gr.Button( value="", icon=f"{ASSETS_DIR}/new.svg", min_width=2, scale=1, size="sm", elem_classes=["no-background", "body-text-color"], elem_id="new-conv-button", ) else: self.btn_new = gr.Button( value="New chat", min_width=120, size="sm", scale=1, variant="primary", elem_id="new-conv-button", visible=False, ) if KH_DEMO_MODE: with gr.Row(): self.btn_demo_login = gr.Button( "Sign-in to create new chat", min_width=120, size="sm", scale=1, variant="primary", ) _js_redirect = """ () => { url = '/login' + window.location.search; window.open(url, '_blank'); } """ self.btn_demo_login.click(None, js=_js_redirect) self.btn_demo_logout = gr.Button( "Sign-out", min_width=120, size="sm", scale=1, visible=False, ) with gr.Row(visible=False) as self._delete_confirm: self.btn_del_conf = gr.Button( value="Delete", variant="stop", min_width=10, ) self.btn_del_cnl = gr.Button(value="Cancel", min_width=10) with gr.Row(): 
def load_chat_history(self, user_id):
    """Return the conversations the user may open as ``(name, id)`` pairs.

    A known user sees public conversations in addition to their own when
    ``flowsettings.KH_USER_CAN_SEE_PUBLIC`` is falsy, or when it equals the
    user's username. Unknown users only get conversations whose ``user``
    matches ``user_id``. Results are newest first; when public conversations
    are included they are ordered by ``is_public`` descending first.
    """
    can_see_public: bool = False

    with Session(engine) as session:
        user = session.exec(select(User).where(User.id == user_id)).one_or_none()
        if user is not None:
            allowed_username = flowsettings.KH_USER_CAN_SEE_PUBLIC
            # no restriction configured -> every known user sees public ones
            can_see_public = (
                user.username == allowed_username if allowed_username else True
            )

    print(f"User-id: {user_id}, can see public conversations: {can_see_public}")

    if can_see_public:
        # own conversations plus the public ones
        statement = (
            select(Conversation)
            .where(
                or_(
                    Conversation.user == user_id,
                    Conversation.is_public,
                )
            )
            .order_by(
                Conversation.is_public.desc(), Conversation.date_created.desc()
            )  # type: ignore
        )
    else:
        # own conversations only
        statement = (
            select(Conversation)
            .where(Conversation.user == user_id)
            .order_by(Conversation.date_created.desc())  # type: ignore
        )

    with Session(engine) as session:
        conversations = session.exec(statement).all()
        return [(conv.name, conv.id) for conv in conversations]
selected conversation""" if not conversation_id: gr.Warning("No conversation selected.") return None, gr.update() if user_id is None: gr.Warning("Please sign in first (Settings → User Settings)") return None, gr.update() with Session(engine) as session: statement = select(Conversation).where(Conversation.id == conversation_id) result = session.exec(statement).one() session.delete(result) session.commit() history = self.load_chat_history(user_id) if history: id_ = history[0][1] return id_, gr.update(value=id_, choices=history) else: return None, gr.update(value=None, choices=[]) def select_conv(self, conversation_id, user_id): """Select the conversation""" default_chat_suggestions = [[each] for each in ChatSuggestion.CHAT_SAMPLES] with Session(engine) as session: statement = select(Conversation).where(Conversation.id == conversation_id) try: result = session.exec(statement).one() id_ = result.id name = result.name is_conv_public = result.is_public # disable file selection ids state if # not the owner of the conversation if user_id == result.user: selected = result.data_source.get("selected", {}) else: selected = {} chats = result.data_source.get("messages", []) chat_suggestions = result.data_source.get( "chat_suggestions", default_chat_suggestions ) retrieval_history: list[str] = result.data_source.get( "retrieval_messages", [] ) plot_history: list[dict] = result.data_source.get("plot_history", []) # On initialization # Ensure len of retrieval and messages are equal retrieval_history = sync_retrieval_n_message(chats, retrieval_history) info_panel = ( retrieval_history[-1] if retrieval_history else "
No evidence found.
def rename_conv(self, conversation_id, new_name, is_renamed, user_id):
    """Store a new name for the conversation and refresh the dropdown.

    Returns:
        ``(dropdown_update, conversation_id, rename_box_update)``. The rename
        box is always hidden again. The dropdown choices are only refreshed
        after a successful rename; the call is a no-op when ``is_renamed`` is
        falsy, in demo mode, without a user or conversation id, or when the
        new name fails ``is_conv_name_valid`` (a warning is shown then).
    """

    def unchanged():
        # leave the dropdown alone, just hide the rename box
        return (
            gr.update(),
            conversation_id,
            gr.update(visible=False),
        )

    may_rename = (
        is_renamed
        and not KH_DEMO_MODE
        and user_id is not None
        and conversation_id
    )
    if not may_rename:
        return unchanged()

    problem = is_conv_name_valid(new_name)
    if problem:
        gr.Warning(problem)
        return unchanged()

    with Session(engine) as session:
        query = select(Conversation).where(Conversation.id == conversation_id)
        conversation = session.exec(query).one()
        conversation.name = new_name
        session.add(conversation)
        session.commit()

    refreshed_choices = self.load_chat_history(user_id)
    gr.Info("Conversation renamed.")
    return (
        gr.update(choices=refreshed_choices),
        conversation_id,
        gr.update(visible=False),
    )
def toggle_demo_login_visibility(self, user_api_key, request: gr.Request):
    """Return four visibility updates depending on the login state.

    The user is looked up through ``gradiologin``; a missing package or a
    failed assertion inside it counts as "not signed in". The first three
    updates are visible only for a signed-in user, the fourth only
    otherwise. ``user_api_key`` is accepted but not used for the decision.
    """
    try:
        import gradiologin as grlogin

        user = grlogin.get_user(request)
    except (ImportError, AssertionError):
        user = None

    signed_in = bool(user)
    flags = (signed_in, signed_in, signed_in, not signed_in)
    return [gr.update(visible=flag) for flag in flags]
class PaperListPage(BasePage):
    """Accordion with a table of papers the user can pick from."""

    def __init__(self, app):
        self._app = app
        self.on_building_ui()

    def on_building_ui(self):
        """Build the papers state and the (initially empty) table.

        Returns:
            The table component.
        """
        # raw paper records; filled by `load`
        self.papers_state = gr.State(None)

        columns = ["title", "url", "upvotes"]
        widths = [60, 30, 10]
        with gr.Accordion(
            label="Browse popular daily papers",
            open=True,
        ) as self.accordion:
            self.examples = gr.DataFrame(
                value=[],
                headers=columns,
                column_widths=widths,
                interactive=False,
                elem_id="paper-suggestion",
                wrap=True,
            )
        return self.examples

    def load(self):
        """Fetch the top 5 papers; return them as a DataFrame and as-is."""
        papers = fetch_papers(top_n=5)
        return (DataFrame(papers), papers)

    def _on_app_created(self):
        """Fill the table and the papers state when the app loads."""
        targets = [self.examples, self.papers_state]
        self._app.app.load(self.load, outputs=targets)

    def select_example(self, state, ev: gr.SelectData):
        """Return the ``"url"`` of the paper in the selected row."""
        row = ev.index[0]
        return state[row]["url"]
how wrong is it, what is the " "correct answer, etc...)" ), container=False, lines=3, ) gr.Markdown( "This will send the current chat and the user settings to " "help with investigation" ) self.report_btn = gr.Button("Report") def report( self, correctness: str, issues: list[str], more_detail: str, conv_id: str, chat_history: list, settings: dict, user_id: Optional[int], info_panel: str, chat_state: dict, *selecteds, ): selecteds_ = {} for index in self._app.index_manager.indices: if index.selector is not None: if isinstance(index.selector, int): selecteds_[str(index.id)] = selecteds[index.selector] elif isinstance(index.selector, tuple): selecteds_[str(index.id)] = [selecteds[_] for _ in index.selector] else: print(f"Unknown selector type: {index.selector}") with Session(engine) as session: issue = IssueReport( issues={ "correctness": correctness, "issues": issues, "more_detail": more_detail, }, chat={ "conv_id": conv_id, "chat_history": chat_history, "info_panel": info_panel, "chat_state": chat_state, "selecteds": selecteds_, }, settings=settings, user=user_id, ) session.add(issue) session.commit() gr.Info("Thank you for your feedback") ================================================ FILE: libs/ktem/ktem/pages/help.py ================================================ from importlib.metadata import version from pathlib import Path import gradio as gr import requests from decouple import config from theflow.settings import settings KH_DEMO_MODE = getattr(settings, "KH_DEMO_MODE", False) HF_SPACE_URL = config("HF_SPACE_URL", default="") def get_remote_doc(url: str) -> str: try: res = requests.get(url) res.raise_for_status() return res.text except Exception as e: print(f"Failed to fetch document from {url}: {e}") return "" def download_changelogs(release_url: str) -> str: try: res = requests.get(release_url).json() changelogs = res.get("body", "") return changelogs except Exception as e: print(f"Failed to fetch changelogs from {release_url}: {e}") return "" class 
class HelpPage:
    """Help tab: about text, demo notice, user guide and release changelogs."""

    def __init__(
        self,
        app,
        doc_dir: str = settings.KH_DOC_DIR,
        remote_content_url: str = "https://raw.githubusercontent.com/Cinnamon/kotaemon",
        app_version: str | None = settings.KH_APP_VERSION,
        changelogs_cache_dir: str
        | Path = (Path(settings.KH_APP_DATA_DIR) / "changelogs"),
    ):
        """Render the help page.

        Args:
            app: the application object owning this page.
            doc_dir: local directory searched first for `about.md` and
                `usage.md`.
            remote_content_url: base URL used when a document is not found
                locally.
            app_version: version used for the remote tag and the changelog
                lookup; when falsy the changelog section is skipped.
            changelogs_cache_dir: directory where downloaded changelogs are
                cached, one `<version>.md` file per version.
        """
        self._app = app
        self.doc_dir = Path(doc_dir)
        self.remote_content_url = remote_content_url
        self.app_version = app_version
        self.changelogs_cache_dir = Path(changelogs_cache_dir)
        self.changelogs_cache_dir.mkdir(parents=True, exist_ok=True)

        about_md = self._load_doc("about.md")
        if about_md:
            with gr.Accordion("About"):
                if self.app_version:
                    about_md = f"Version: {self.app_version}\n\n{about_md}"
                gr.Markdown(about_md)

        if KH_DEMO_MODE:
            with gr.Accordion("Create Your Own Space"):
                gr.Markdown(
                    "This is a demo with limited functionality. "
                    "Use **Create space** button to install Kotaemon "
                    "in your own space with all features "
                    "(including upload and manage your private "
                    "documents securely)."
                )
                gr.Button(
                    value="Create Your Own Space",
                    link=HF_SPACE_URL,
                    variant="primary",
                    size="lg",
                )

        user_guide_md = self._load_doc("usage.md")
        if user_guide_md:
            with gr.Accordion("User Guide", open=not KH_DEMO_MODE):
                gr.Markdown(user_guide_md)

        if self.app_version:
            changelogs = self._load_changelogs()
            if changelogs:
                with gr.Accordion(f"Changelogs (v{self.app_version})"):
                    gr.Markdown(changelogs)

    def _load_doc(self, filename: str) -> str:
        """Read `filename` from the local doc dir, else fetch it remotely."""
        local_path = self.doc_dir / filename
        if local_path.exists():
            return local_path.read_text(encoding="utf-8")

        return get_remote_doc(
            f"{self.remote_content_url}/v{self.app_version}/docs/{filename}"
        )

    def _load_changelogs(self) -> str:
        """Return the changelogs of the current version, using the disk cache."""
        # The cache file is keyed by the application version. The previous
        # code formatted the imported `version` *function* into the file name,
        # so the lookup never matched and every start hit the network.
        cache_file = self.changelogs_cache_dir / f"{self.app_version}.md"
        if cache_file.exists():
            # errors="replace": older cache files were written with the locale
            # encoding and may not be valid UTF-8.
            cached = cache_file.read_text(encoding="utf-8", errors="replace")
            if cached:
                return cached
            # An empty file is a leftover of a failed download; try again.

        release_url_base = "https://api.github.com/repos/Cinnamon/kotaemon/releases"
        changelogs = download_changelogs(
            release_url=f"{release_url_base}/tags/v{self.app_version}"
        )

        # Only cache real content, otherwise one failed download would hide
        # the changelogs for good.
        if changelogs:
            cache_file.write_text(changelogs, encoding="utf-8")

        return changelogs or ""
{self._app.app_name}!") self.usn = gr.Textbox(label="Username", visible=False) self.pwd = gr.Textbox(label="Password", type="password", visible=False) self.btn_login = gr.Button("Login", visible=False) def on_register_events(self): onSignIn = gr.on( triggers=[self.btn_login.click, self.pwd.submit], fn=self.login, inputs=[self.usn, self.pwd], outputs=[self._app.user_id, self.usn, self.pwd], show_progress="hidden", js=signin_js, ).then( self.toggle_login_visibility, inputs=[self._app.user_id], outputs=[self.usn, self.pwd, self.btn_login], ) for event in self._app.get_event("onSignIn"): onSignIn = onSignIn.success(**event) def toggle_login_visibility(self, user_id): return ( gr.update(visible=user_id is None), gr.update(visible=user_id is None), gr.update(visible=user_id is None), ) def _on_app_created(self): onSignIn = self._app.app.load( self.login, inputs=[self.usn, self.pwd], outputs=[self._app.user_id, self.usn, self.pwd], show_progress="hidden", js=fetch_creds, ).then( self.toggle_login_visibility, inputs=[self._app.user_id], outputs=[self.usn, self.pwd, self.btn_login], ) for event in self._app.get_event("onSignIn"): onSignIn = onSignIn.success(**event) def on_subscribe_public_events(self): self._app.subscribe_event( name="onSignOut", definition={ "fn": self.toggle_login_visibility, "inputs": [self._app.user_id], "outputs": [self.usn, self.pwd, self.btn_login], "show_progress": "hidden", }, ) def login(self, usn, pwd, request: gr.Request): try: import gradiologin as grlogin user = grlogin.get_user(request) except (ImportError, AssertionError): user = None if user: user_id = user["sub"] with Session(engine) as session: stmt = select(User).where( User.id == user_id, ) result = session.exec(stmt).all() if result: print("Existing user:", user) return user_id, "", "" else: print("Creating new user:", user) create_user( usn=user["email"], pwd="", user_id=user_id, is_admin=False, ) return user_id, "", "" else: if not usn or not pwd: return None, usn, pwd 
hashed_password = hashlib.sha256(pwd.encode()).hexdigest() with Session(engine) as session: stmt = select(User).where( User.username_lower == usn.lower().strip(), User.password == hashed_password, ) result = session.exec(stmt).all() if result: return result[0].id, "", "" gr.Warning("Invalid username or password") return None, usn, pwd ================================================ FILE: libs/ktem/ktem/pages/resources/__init__.py ================================================ import gradio as gr from ktem.app import BasePage from ktem.db.models import User, engine from ktem.embeddings.ui import EmbeddingManagement from ktem.index.ui import IndexManagement from ktem.llms.ui import LLMManagement from ktem.mcp.ui import MCPManagement from ktem.rerankings.ui import RerankingManagement from sqlmodel import Session, select from .user import UserManagement class ResourcesTab(BasePage): def __init__(self, app): self._app = app self.on_building_ui() def on_building_ui(self): with gr.Tab("Index Collections") as self.index_management_tab: self.index_management = IndexManagement(self._app) with gr.Tab("LLMs") as self.llm_management_tab: self.llm_management = LLMManagement(self._app) with gr.Tab("Embeddings") as self.emb_management_tab: self.emb_management = EmbeddingManagement(self._app) with gr.Tab("Rerankings") as self.rerank_management_tab: self.rerank_management = RerankingManagement(self._app) with gr.Tab("MCP Servers") as self.mcp_management_tab: self.mcp_management = MCPManagement(self._app) if self._app.f_user_management: with gr.Tab("Users", visible=False) as self.user_management_tab: self.user_management = UserManagement(self._app) def on_subscribe_public_events(self): if self._app.f_user_management: self._app.subscribe_event( name="onSignIn", definition={ "fn": self.toggle_user_management, "inputs": [self._app.user_id], "outputs": [self.user_management_tab], "show_progress": "hidden", }, ) self._app.subscribe_event( name="onSignOut", definition={ "fn": 
self.toggle_user_management, "inputs": [self._app.user_id], "outputs": [self.user_management_tab], "show_progress": "hidden", }, ) def toggle_user_management(self, user_id): """Show/hide the user management, depending on the user's role""" with Session(engine) as session: user = session.exec(select(User).where(User.id == user_id)).first() if user and user.admin: return gr.update(visible=True) return gr.update(visible=False) ================================================ FILE: libs/ktem/ktem/pages/resources/user.py ================================================ import hashlib import gradio as gr import pandas as pd from ktem.app import BasePage from ktem.db.models import User, engine from sqlmodel import Session, select from theflow.settings import settings as flowsettings USERNAME_RULE = """**Username rule:** - Username is case-insensitive - Username must be at least 3 characters long - Username must be at most 32 characters long - Username must contain only alphanumeric characters and underscores """ PASSWORD_RULE = """**Password rule:** - Password must be at least 8 characters long - Password must contain at least one uppercase letter - Password must contain at least one lowercase letter - Password must contain at least one digit - Password must contain at least one special character from the following: ^ $ * . [ ] { } ( ) ? - " ! 
def validate_username(usn):
    """Check a username against the username rules.

    Rules: 3 to 32 characters, made only of alphanumeric characters and
    underscores.

    Args:
        usn (str): Username

    Returns:
        str: The violated rules joined by "; ", or "" when the name is valid.
    """
    errors = []
    if len(usn) < 3:
        errors.append("Username must be at least 3 characters long")

    if len(usn) > 32:
        errors.append("Username must be at most 32 characters long")

    # Test each character. The former `usn.replace("_", "").isalnum()` was
    # False for "" and for underscore-only names, which reported an
    # illegal-character error although no illegal character was present.
    if not all(char.isalnum() or char == "_" for char in usn):
        errors.append(
            "Username must contain only alphanumeric characters and underscores"
        )

    return "; ".join(errors)


def validate_password(pwd, pwd_cnf):
    """Check a password and its confirmation against the password rules.

    - Password must be at least 8 characters long
    - Password must contain at least one uppercase letter
    - Password must contain at least one lowercase letter
    - Password must contain at least one digit
    - Password must contain at least one special character from the following:
        ^ $ * . [ ] { } ( ) ? - " ! @ # % & / \\ , > < ' : ; | _ ~ + =

    Args:
        pwd (str): Password
        pwd_cnf (str): Confirm password

    Returns:
        str: The violated rules joined by "; ", or "" when the password is
            valid.
    """
    special_chars = "^$*.[]{}()?-\"!@#%&/\\,><':;|_~+="

    # (rule satisfied?, message when it is not), in reporting order.
    checks = [
        (pwd == pwd_cnf, "Password does not match"),
        (len(pwd) >= 8, "Password must be at least 8 characters long"),
        (
            any(c.isupper() for c in pwd),
            "Password must contain at least one uppercase letter",
        ),
        (
            any(c.islower() for c in pwd),
            "Password must contain at least one lowercase letter",
        ),
        (
            any(c.isdigit() for c in pwd),
            "Password must contain at least one digit",
        ),
        (
            any(c in special_chars for c in pwd),
            "Password must contain at least one special character from the "
            f"following: {special_chars}",
        ),
    ]

    return "; ".join(message for passed, message in checks if not passed)
class UserManagement(BasePage):
    """Admin page to list, create, edit and delete users."""

    def __init__(self, app):
        """Build the UI, then create the account named in the flow settings.

        The account is created through the module-level `create_user` when
        both `KH_FEATURE_USER_MANAGEMENT_ADMIN` and
        `KH_FEATURE_USER_MANAGEMENT_PASSWORD` are defined.
        """
        self._app = app

        self.on_building_ui()
        if hasattr(flowsettings, "KH_FEATURE_USER_MANAGEMENT_ADMIN") and hasattr(
            flowsettings, "KH_FEATURE_USER_MANAGEMENT_PASSWORD"
        ):
            usn = flowsettings.KH_FEATURE_USER_MANAGEMENT_ADMIN
            pwd = flowsettings.KH_FEATURE_USER_MANAGEMENT_PASSWORD

            is_created = create_user(usn, pwd)
            if is_created:
                gr.Info(f'User "{usn}" created successfully')

    @staticmethod
    def _placeholder_user_frame():
        """Return the one-row table shown when no user can be listed."""
        return pd.DataFrame.from_records(
            [{"id": "-", "username": "-", "admin": "-"}]
        )

    def on_building_ui(self):
        """Create the "User list" tab (with its edit panel) and "Create user"."""
        with gr.Tab(label="User list"):
            self.state_user_list = gr.State(value=None)
            self.user_list = gr.DataFrame(
                headers=["id", "name", "admin"],
                column_widths=[0, 50, 50],
                interactive=False,
            )

            with gr.Group(visible=False) as self._selected_panel:
                # -1 means "no user selected"
                self.selected_user_id = gr.State(value=-1)
                self.usn_edit = gr.Textbox(label="Username")
                with gr.Row():
                    self.pwd_edit = gr.Textbox(label="Change password", type="password")
                    self.pwd_cnf_edit = gr.Textbox(
                        label="Confirm change password",
                        type="password",
                    )
                self.admin_edit = gr.Checkbox(label="Admin")

            with gr.Row(visible=False) as self._selected_panel_btn:
                with gr.Column():
                    self.btn_edit_save = gr.Button("Save")
                with gr.Column():
                    self.btn_delete = gr.Button("Delete")
                    with gr.Row():
                        self.btn_delete_yes = gr.Button(
                            "Confirm delete", variant="primary", visible=False
                        )
                        self.btn_delete_no = gr.Button("Cancel", visible=False)
                with gr.Column():
                    self.btn_close = gr.Button("Close")

        with gr.Tab(label="Create user"):
            self.usn_new = gr.Textbox(label="Username", interactive=True)
            self.pwd_new = gr.Textbox(
                label="Password", type="password", interactive=True
            )
            self.pwd_cnf_new = gr.Textbox(
                label="Confirm password", type="password", interactive=True
            )
            with gr.Row():
                gr.Markdown(USERNAME_RULE)
                gr.Markdown(PASSWORD_RULE)
            self.btn_new = gr.Button("Create user")

    def on_register_events(self):
        """Wire the buttons and the table selection to their handlers."""
        self.btn_new.click(
            self.create_user,
            inputs=[self.usn_new, self.pwd_new, self.pwd_cnf_new],
            outputs=[self.usn_new, self.pwd_new, self.pwd_cnf_new],
        ).then(
            self.list_users,
            inputs=self._app.user_id,
            outputs=[self.state_user_list, self.user_list],
        )
        self.user_list.select(
            self.select_user,
            inputs=self.user_list,
            outputs=[self.selected_user_id],
            show_progress="hidden",
        )
        self.selected_user_id.change(
            self.on_selected_user_change,
            inputs=[self.selected_user_id],
            outputs=[
                self._selected_panel,
                self._selected_panel_btn,
                # delete section
                self.btn_delete,
                self.btn_delete_yes,
                self.btn_delete_no,
                # edit section
                self.usn_edit,
                self.pwd_edit,
                self.pwd_cnf_edit,
                self.admin_edit,
            ],
            show_progress="hidden",
        )
        self.btn_delete.click(
            self.on_btn_delete_click,
            inputs=[self.selected_user_id],
            outputs=[self.btn_delete, self.btn_delete_yes, self.btn_delete_no],
            show_progress="hidden",
        )
        self.btn_delete_yes.click(
            self.delete_user,
            inputs=[self._app.user_id, self.selected_user_id],
            outputs=[self.selected_user_id],
            show_progress="hidden",
        ).then(
            self.list_users,
            inputs=self._app.user_id,
            outputs=[self.state_user_list, self.user_list],
        )
        self.btn_delete_no.click(
            lambda: (
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
            ),
            inputs=[],
            outputs=[self.btn_delete, self.btn_delete_yes, self.btn_delete_no],
            show_progress="hidden",
        )
        self.btn_edit_save.click(
            self.save_user,
            inputs=[
                self.selected_user_id,
                self.usn_edit,
                self.pwd_edit,
                self.pwd_cnf_edit,
                self.admin_edit,
            ],
            outputs=[self.pwd_edit, self.pwd_cnf_edit],
            show_progress="hidden",
        ).then(
            self.list_users,
            inputs=self._app.user_id,
            outputs=[self.state_user_list, self.user_list],
        )
        self.btn_close.click(
            lambda: -1,
            outputs=[self.selected_user_id],
        )

    def on_subscribe_public_events(self):
        """Refresh the list on sign-in and reset the page on sign-out."""
        self._app.subscribe_event(
            name="onSignIn",
            definition={
                "fn": self.list_users,
                "inputs": [self._app.user_id],
                "outputs": [self.state_user_list, self.user_list],
            },
        )
        self._app.subscribe_event(
            name="onSignOut",
            definition={
                "fn": lambda: ("", "", "", None, None, -1),
                "outputs": [
                    self.usn_new,
                    self.pwd_new,
                    self.pwd_cnf_new,
                    self.state_user_list,
                    self.user_list,
                    self.selected_user_id,
                ],
            },
        )

    def create_user(self, usn, pwd, pwd_cnf):
        """Create a user from the "Create user" form.

        Returns:
            The three form values: cleared on success, echoed back unchanged
            when validation fails or the username is taken.
        """
        errors = validate_username(usn)
        if errors:
            gr.Warning(errors)
            return usn, pwd, pwd_cnf

        errors = validate_password(pwd, pwd_cnf)
        if errors:
            gr.Warning(errors)
            return usn, pwd, pwd_cnf

        with Session(engine) as session:
            statement = select(User).where(User.username_lower == usn.lower())
            result = session.exec(statement).all()
            if result:
                gr.Warning(f'Username "{usn}" already exists')
                # The click event has three outputs; a bare `return` made
                # Gradio fail on the missing values.
                return usn, pwd, pwd_cnf

            hashed_password = hashlib.sha256(pwd.encode()).hexdigest()
            user = User(
                username=usn, username_lower=usn.lower(), password=hashed_password
            )
            session.add(user)
            session.commit()
            gr.Info(f'User "{usn}" created successfully')

        return "", "", ""

    def list_users(self, user_id):
        """List every user, but only when `user_id` belongs to an admin.

        Returns:
            `(records, dataframe)`. Anonymous, unknown and non-admin callers
            get an empty record list and a placeholder table.
        """
        if user_id is None:
            return [], self._placeholder_user_frame()

        with Session(engine) as session:
            statement = select(User).where(User.id == user_id)
            # `.first()` instead of `.one()`: a stale id (e.g. the account was
            # deleted meanwhile) is treated as "not an admin", not a crash.
            current = session.exec(statement).first()
            if current is None or not current.admin:
                return [], self._placeholder_user_frame()

            results = [
                {"id": user.id, "username": user.username, "admin": user.admin}
                for user in session.exec(select(User)).all()
            ]
            if results:
                user_list = pd.DataFrame.from_records(results)
            else:
                user_list = self._placeholder_user_frame()

        return results, user_list

    def select_user(self, user_list, ev: gr.SelectData):
        """Return the id on the clicked row, or -1 when nothing is selectable."""
        if ev.value == "-" and ev.index[0] == 0:
            gr.Info("No user is loaded. Please refresh the user list")
            return -1

        if not ev.selected:
            return -1

        return user_list["id"][ev.index[0]]

    def on_selected_user_change(self, selected_user_id):
        """Show or hide the edit panel and load the selected user's values."""
        has_selection = selected_user_id != -1

        usn_value = ""
        admin_value = False
        if has_selection:
            with Session(engine) as session:
                statement = select(User).where(User.id == selected_user_id)
                user = session.exec(statement).one()
                usn_value = user.username
                admin_value = user.admin

        return (
            gr.update(visible=has_selection),  # _selected_panel
            gr.update(visible=has_selection),  # _selected_panel_btn
            # the delete confirmation always starts collapsed
            gr.update(visible=True),  # btn_delete
            gr.update(visible=False),  # btn_delete_yes
            gr.update(visible=False),  # btn_delete_no
            gr.update(value=usn_value),  # usn_edit
            gr.update(value=""),  # pwd_edit
            gr.update(value=""),  # pwd_cnf_edit
            gr.update(value=admin_value),  # admin_edit
        )

    def on_btn_delete_click(self, selected_user_id):
        """Swap the "Delete" button for the confirm / cancel pair.

        Returns:
            Updates for `btn_delete`, `btn_delete_yes`, `btn_delete_no`.
        """
        # -1 is the "no selection" sentinel used by this page; None is kept
        # for safety. The three updates must always be returned, a bare
        # `return` breaks the three-output event.
        if selected_user_id is None or selected_user_id == -1:
            gr.Warning("No user is selected")
            return (
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
            )

        return (
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=True),
        )

    def save_user(self, selected_user_id, usn, pwd, pwd_cnf, admin):
        """Persist the edits made to the selected user.

        The password is changed only when a new one is typed in.

        Returns:
            The two password field values: cleared on success, echoed back
            when validation fails.
        """
        errors = validate_username(usn)
        if errors:
            gr.Warning(errors)
            return pwd, pwd_cnf

        if pwd:
            errors = validate_password(pwd, pwd_cnf)
            if errors:
                gr.Warning(errors)
                return pwd, pwd_cnf

        with Session(engine) as session:
            # Check username uniqueness (excluding current user)
            statement = select(User).where(
                User.username_lower == usn.lower(),
                User.id != selected_user_id,
            )
            existing = session.exec(statement).first()
            if existing:
                gr.Warning(
                    f'Username "{usn}" already exists. Please use a unique name.'
                )
                return pwd, pwd_cnf

            statement = select(User).where(User.id == selected_user_id)
            user = session.exec(statement).one()
            user.username = usn
            user.username_lower = usn.lower()
            user.admin = admin
            if pwd:
                user.password = hashlib.sha256(pwd.encode()).hexdigest()
            session.commit()
            gr.Info(f'User "{usn}" updated successfully')

        return "", ""

    def delete_user(self, current_user, selected_user_id):
        """Delete the selected user unless it is the signed-in user.

        Returns:
            -1 (selection cleared) after a deletion, otherwise the unchanged
            selection.
        """
        if current_user == selected_user_id:
            gr.Warning("You cannot delete yourself")
            return selected_user_id

        with Session(engine) as session:
            statement = select(User).where(User.id == selected_user_id)
            user = session.exec(statement).one()
            # Read the name before the row is gone.
            username = user.username
            session.delete(user)
            session.commit()
            gr.Info(f'User "{username}" deleted successfully')
        return -1
class SettingsPage(BasePage):
    """Responsible for allowing the users to customize the application

    **IMPORTANT**: the name and id of the UI setting components should match the
    name of the setting in the `app.default_settings`
    """

    public_events = ["onSignOut"]

    def __init__(self, app):
        """Initiate the page and render the UI"""
        self._app = app

        self._settings_state = app.settings_state
        self._user_id = app.user_id
        self._default_settings = app.default_settings
        self._settings_dict = self._default_settings.flatten()
        self._settings_keys = list(self._settings_dict.keys())

        self._components = {}
        self._reasoning_mode = {}

        # store llms and embeddings components
        self._llms = []
        self._embeddings = []

        defaults = self._default_settings

        # render application page if there are application settings
        self._render_app_tab = bool(
            not KH_SSO_ENABLED and defaults.application.settings
        )

        # render index page if there are index settings (general and/or specific)
        self._render_index_tab = not KH_SSO_ENABLED and (
            bool(defaults.index.settings)
            or any(sig.settings for sig in defaults.index.options.values())
        )

        # render reasoning page if there are reasoning settings; the general
        # group always holds "use", hence the "> 1"
        self._render_reasoning_tab = not KH_SSO_ENABLED and (
            len(defaults.reasoning.settings) > 1
            or any(sig.settings for sig in defaults.reasoning.options.values())
        )

        self.on_building_ui()

    def _render_and_register(self, key, setting_item):
        """Render one setting, store it under `key`, track llm/embedding ones."""
        component = render_setting_item(setting_item, setting_item.value)
        self._components[key] = component
        if setting_item.special_type == "llm":
            self._llms.append(component)
        if setting_item.special_type == "embedding":
            self._embeddings.append(component)
        return component

    def on_building_ui(self):
        """Create the save button (non-SSO only) and the setting tabs."""
        if not KH_SSO_ENABLED:
            self.setting_save_btn = gr.Button(
                "Save & Close",
                variant="primary",
                elem_classes=["right-button"],
                elem_id="save-setting-btn",
            )

        if self._app.f_user_management:
            with gr.Tab("User settings"):
                self.user_tab()

        self.app_tab()
        self.index_tab()
        self.reasoning_tab()

    def on_subscribe_public_events(self):
        """Subscribe two "onSignIn" handlers when user management is enabled.

        The first reloads the signed-in user's settings into the settings
        state and every setting component; the second shows the user's name
        ("___" when it cannot be resolved).
        """
        if self._app.f_user_management:
            self._app.subscribe_event(
                name="onSignIn",
                definition={
                    "fn": self.load_setting,
                    "inputs": self._user_id,
                    "outputs": [self._settings_state] + self.components(),
                    "show_progress": "hidden",
                },
            )

            def get_name(user_id):
                name = "Current user: "
                if user_id:
                    with Session(engine) as session:
                        statement = select(User).where(User.id == user_id)
                        result = session.exec(statement).all()
                        if result:
                            return name + result[0].username
                return name + "___"

            self._app.subscribe_event(
                name="onSignIn",
                definition={
                    "fn": get_name,
                    "inputs": self._user_id,
                    "outputs": [self.current_name],
                    "show_progress": "hidden",
                },
            )

    def on_register_events(self):
        """Wire saving, reasoning-mode switching, password change and logout."""
        if not KH_SSO_ENABLED:
            self.setting_save_btn.click(
                self.save_setting,
                inputs=[self._user_id] + self.components(),
                outputs=self._settings_state,
            ).then(
                lambda: gr.Tabs(selected="chat-tab"),
                outputs=self._app.tabs,
            )
        self._components["reasoning.use"].change(
            self.change_reasoning_mode,
            inputs=[self._components["reasoning.use"]],
            outputs=list(self._reasoning_mode.values()),
            show_progress="hidden",
        )
        if self._app.f_user_management and not KH_SSO_ENABLED:
            self.password_change_btn.click(
                self.change_password,
                inputs=[
                    self._user_id,
                    self.password_change,
                    self.password_change_confirm,
                ],
                outputs=[self.password_change, self.password_change_confirm],
                show_progress="hidden",
            )
            onSignOutClick = self.signout.click(
                lambda: (None, "Current user: ___", "", ""),
                inputs=[],
                outputs=[
                    self._user_id,
                    self.current_name,
                    self.password_change,
                    self.password_change_confirm,
                ],
                show_progress="hidden",
                js=signout_js,
            ).then(
                self.load_setting,
                inputs=self._user_id,
                outputs=[self._settings_state] + self.components(),
                show_progress="hidden",
            )
            for event in self._app.get_event("onSignOut"):
                onSignOutClick = onSignOutClick.then(**event)

    def user_tab(self):
        """Render the current-user label, logout and password-change widgets."""
        # user management
        self.current_name = gr.Markdown("Current user: ___")

        if KH_SSO_ENABLED:
            import gradiologin as grlogin

            self.sso_signout = grlogin.LogoutButton("Logout")
        else:
            self.signout = gr.Button("Logout")

            self.password_change = gr.Textbox(
                label="New password", interactive=True, type="password"
            )
            self.password_change_confirm = gr.Textbox(
                label="Confirm password", interactive=True, type="password"
            )
            self.password_change_btn = gr.Button("Change password", interactive=True)

    def change_password(self, user_id, password, password_confirm):
        """Store a new password for `user_id` after validating it.

        Returns:
            The two password field values: echoed back when validation
            fails, cleared otherwise.
        """
        from ktem.pages.resources.user import validate_password

        errors = validate_password(password, password_confirm)
        if errors:
            gr.Warning(errors)
            return password, password_confirm

        with Session(engine) as session:
            statement = select(User).where(User.id == user_id)
            result = session.exec(statement).all()
            if result:
                user = result[0]
                hashed_password = hashlib.sha256(password.encode()).hexdigest()
                user.password = hashed_password
                session.add(user)
                session.commit()
                gr.Info("Password changed")
            else:
                gr.Warning("User not found")

        return "", ""

    def app_tab(self):
        """Render the application-level settings."""
        with gr.Tab("General", visible=self._render_app_tab):
            for n, si in self._default_settings.application.settings.items():
                self._render_and_register(f"application.{n}", si)

    def index_tab(self):
        """Render one sub-tab of retrieval settings per index."""
        # TODO: double check if we need general
        # with gr.Tab("General"):
        #     for n, si in self._default_settings.index.settings.items():
        #         obj = render_setting_item(si, si.value)
        #         self._components[f"index.{n}"] = obj

        id2name = {k: v.name for k, v in self._app.index_manager.info().items()}
        with gr.Tab("Retrieval settings", visible=self._render_index_tab):
            for pn, sig in self._default_settings.index.options.items():
                name = id2name.get(pn, "")
                with gr.Tab(name):
                    for n, si in sig.settings.items():
                        self._render_and_register(f"index.options.{pn}.{n}", si)

    def reasoning_tab(self):
        """Render general reasoning settings and one group per reasoning mode."""
        with gr.Tab("Reasoning settings", visible=self._render_reasoning_tab):
            with gr.Group():
                for n, si in self._default_settings.reasoning.settings.items():
                    if n == "use":
                        # rendered below, right above the per-mode groups
                        continue
                    self._render_and_register(f"reasoning.{n}", si)

            gr.Markdown("### Reasoning-specific settings")
            self._components["reasoning.use"] = render_setting_item(
                self._default_settings.reasoning.settings["use"],
                self._default_settings.reasoning.settings["use"].value,
            )

            for idx, (pn, sig) in enumerate(
                self._default_settings.reasoning.options.items()
            ):
                # only the first mode's group starts visible
                with gr.Group(
                    visible=idx == 0,
                    elem_id=pn,
                ) as self._reasoning_mode[pn]:
                    reasoning = reasonings.get(pn, None)
                    if reasoning is None:
                        gr.Markdown("**Name**: Description")
                    else:
                        info = reasoning.get_info()
                        gr.Markdown(f"**{info['name']}**: {info['description']}")
                    for n, si in sig.settings.items():
                        self._render_and_register(
                            f"reasoning.options.{pn}.{n}", si
                        )

    def change_reasoning_mode(self, value):
        """Show the group whose `elem_id` equals `value`, hide the others."""
        return [
            gr.update(visible=(value == group.elem_id))
            for group in self._reasoning_mode.values()
        ]

    def load_setting(self, user_id=None):
        """Load the settings of `user_id`, falling back to the defaults.

        Returns:
            A list: the settings dict followed by one value per setting
            component, in `component_names()` order.
        """
        settings = self._settings_dict
        with Session(engine) as session:
            statement = select(Settings).where(Settings.user == user_id)
            result = session.exec(statement).all()
            if result:
                # Overlay the stored values on the defaults so that a setting
                # missing from the stored dict resolves to its default instead
                # of raising KeyError below.
                settings = {**self._settings_dict, **(result[0].setting or {})}

        output = [settings]
        output += tuple(settings[name] for name in self.component_names())
        return output

    def save_setting(self, user_id: int, *args):
        """Save the setting to disk and persist the setting to session state

        Args:
            user_id: the user id
            args: all the values from the settings

        Returns:
            The settings dict built from `args`.
        """
        setting = dict(zip(self.component_names(), args))
        if user_id is None:
            gr.Warning("Need to login before saving settings")
            return setting

        with Session(engine) as session:
            statement = select(Settings).where(Settings.user == user_id)
            # `.first()` rather than `.one()` inside a catch-all `except`:
            # database errors are no longer swallowed, and the row updated
            # here is the one `load_setting` reads (its first match), instead
            # of a new duplicate being inserted when several rows exist.
            user_setting = session.exec(statement).first()
            if user_setting is None:
                user_setting = Settings()
                user_setting.user = user_id
            user_setting.setting = setting
            session.add(user_setting)
            session.commit()

        gr.Info("Setting saved")
        return setting

    def components(self) -> list:
        """Get the setting components"""
        return [self._components[name] for name in self._settings_keys]

    def component_names(self):
        """Get the names of the setting components"""
        return self._settings_keys

    def _on_app_created(self):
        """Load settings on page load and refresh llm/embedding choices."""
        if not self._app.f_user_management:
            self._app.app.load(
                self.load_setting,
                inputs=self._user_id,
                outputs=[self._settings_state] + self.components(),
                show_progress="hidden",
            )

        def update_llms():
            from ktem.llms.manager import llms

            if llms._default:
                llm_choices = [(f"{llms._default} (default)", "")]
            else:
                llm_choices = [("(random)", "")]
            llm_choices += [(name, name) for name in llms.options().keys()]
            return gr.update(choices=llm_choices)

        def update_embeddings():
            from ktem.embeddings.manager import embedding_models_manager

            if embedding_models_manager._default:
                emb_choices = [(f"{embedding_models_manager._default} (default)", "")]
            else:
                emb_choices = [("(random)", "")]
            emb_choices += [
                (name, name) for name in embedding_models_manager.options().keys()
            ]
            return gr.update(choices=emb_choices)

        for llm in self._llms:
            self._app.app.load(
                update_llms,
                inputs=[],
                outputs=[llm],
                show_progress="hidden",
            )
        for emb in self._embeddings:
            self._app.app.load(
                update_embeddings,
                inputs=[],
                outputs=[emb],
                show_progress="hidden",
            )
def pull_model(name: str, stream: bool = True):
    """Ask the Ollama server to pull a model and yield its status objects.

    This is a generator: nothing is sent until it is iterated.

    Args:
        name: name of the model to pull.
        stream: when True, yield each status object as the server emits it
            and stop after the one whose `status` is "success". When False,
            request a single response object and yield it once.

    Yields:
        dict: decoded JSON status objects.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    # Forward `stream` to the server as well. Previously it was only passed
    # to `requests`, so with stream=False the server still used its default
    # response mode while `response.json()` expected a single object.
    payload = {"name": name, "stream": stream}
    headers = {"Content-Type": "application/json"}

    # `with` releases the connection even when the consumer stops early.
    with requests.post(
        DEFAULT_OLLAMA_URL + "/pull", json=payload, headers=headers, stream=stream
    ) as response:
        # Check if the request was successful
        response.raise_for_status()

        if stream:
            for line in response.iter_lines():
                if line:
                    data = json.loads(line.decode("utf-8"))
                    yield data
                    if data.get("status") == "success":
                        break
        else:
            data = response.json()
            # Inside a generator a plain `return data` is invisible to a
            # `for` loop, so yield the result too; the return value is kept
            # for callers reading `StopIteration.value`.
            yield data
            return data
), interactive=True, ) with gr.Column(visible=False) as self.openai_option: gr.Markdown( ( "#### OpenAI API Key\n\n" "(create at https://platform.openai.com/api-keys)" ) ) self.openai_api_key = gr.Textbox( show_label=False, placeholder="OpenAI API Key" ) with gr.Column(visible=True) as self.cohere_option: gr.Markdown( ( "#### Cohere API Key\n\n" "(register your free API key " "at https://dashboard.cohere.com/api-keys)" ) ) self.cohere_api_key = gr.Textbox( show_label=False, placeholder="Cohere API Key" ) with gr.Column(visible=False) as self.google_option: gr.Markdown( ( "#### Google API Key\n\n" "(register your free API key " "at https://aistudio.google.com/app/apikey)" ) ) self.google_api_key = gr.Textbox( show_label=False, placeholder="Google API Key" ) with gr.Column(visible=False) as self.ollama_option: gr.Markdown( ( "#### Setup Ollama\n\n" "Download and install Ollama from " "https://ollama.com/. Check out latest models at " "https://ollama.com/library. " ) ) self.ollama_model_name = gr.Textbox( label="LLM model name", value=config("LOCAL_MODEL", default="qwen2.5:7b"), ) self.ollama_emb_model_name = gr.Textbox( label="Embedding model name", value=config("LOCAL_MODEL_EMBEDDINGS", default="nomic-embed-text"), ) self.setup_log = gr.HTML( show_label=False, ) with gr.Row(): self.btn_finish = gr.Button("Proceed", variant="primary") self.btn_skip = gr.Button( "I am an advance user. 
Skip this.", variant="stop" ) def on_register_events(self): onFirstSetupComplete = gr.on( triggers=[ self.btn_finish.click, self.cohere_api_key.submit, self.openai_api_key.submit, ], fn=self.update_model, inputs=[ self.cohere_api_key, self.openai_api_key, self.google_api_key, self.ollama_model_name, self.ollama_emb_model_name, self.radio_model, ], outputs=[self.setup_log], show_progress="hidden", ) onSkipSetup = gr.on( triggers=[self.btn_skip.click], fn=lambda: None, inputs=[], show_progress="hidden", outputs=[self.radio_model], ) for event in self._app.get_event("onFirstSetupComplete"): onSkipSetup = onSkipSetup.success(**event) onFirstSetupComplete = onFirstSetupComplete.success( fn=self.update_default_settings, inputs=[self.radio_model, self._app.settings_state], outputs=self._app.settings_state, ) for event in self._app.get_event("onFirstSetupComplete"): onFirstSetupComplete = onFirstSetupComplete.success(**event) self.radio_model.change( fn=self.switch_options_view, inputs=[self.radio_model], show_progress="hidden", outputs=[ self.cohere_option, self.openai_option, self.ollama_option, self.google_option, ], ) def update_model( self, cohere_api_key, openai_api_key, google_api_key, ollama_model_name, ollama_emb_model_name, radio_model_value, ): log_content = "" if not radio_model_value: gr.Info("Skip setup models.") yield gr.value(visible=False) return if radio_model_value == "cohere": if cohere_api_key: llms.update( name="cohere", spec={ "__type__": "kotaemon.llms.chats.LCCohereChat", "model_name": "command-r-plus-08-2024", "api_key": cohere_api_key, }, default=True, ) embeddings.update( name="cohere", spec={ "__type__": "kotaemon.embeddings.LCCohereEmbeddings", "model": "embed-multilingual-v3.0", "cohere_api_key": cohere_api_key, "user_agent": "default", }, default=True, ) rerankers.update( name="cohere", spec={ "__type__": "kotaemon.rerankings.CohereReranking", "model_name": "rerank-multilingual-v2.0", "cohere_api_key": cohere_api_key, }, default=True, ) elif 
radio_model_value == "openai": if openai_api_key: llms.update( name="openai", spec={ "__type__": "kotaemon.llms.ChatOpenAI", "base_url": "https://api.openai.com/v1", "model": "gpt-4o", "api_key": openai_api_key, "timeout": 20, }, default=True, ) embeddings.update( name="openai", spec={ "__type__": "kotaemon.embeddings.OpenAIEmbeddings", "base_url": "https://api.openai.com/v1", "model": "text-embedding-3-large", "api_key": openai_api_key, "timeout": 10, "context_length": 8191, }, default=True, ) elif radio_model_value == "google": if google_api_key: llms.update( name="google", spec={ "__type__": "kotaemon.llms.chats.LCGeminiChat", "model_name": "gemini-1.5-flash", "api_key": google_api_key, }, default=True, ) embeddings.update( name="google", spec={ "__type__": "kotaemon.embeddings.LCGoogleEmbeddings", "model": "models/text-embedding-004", "google_api_key": google_api_key, }, default=True, ) elif radio_model_value == "ollama": llms.update( name="ollama", spec={ "__type__": "kotaemon.llms.ChatOpenAI", "base_url": KH_OLLAMA_URL, "model": ollama_model_name, "api_key": "ollama", }, default=True, ) embeddings.update( name="ollama", spec={ "__type__": "kotaemon.embeddings.OpenAIEmbeddings", "base_url": KH_OLLAMA_URL, "model": ollama_emb_model_name, "api_key": "ollama", }, default=True, ) # download required models through ollama llm_model_name = llms.get("ollama").model # type: ignore emb_model_name = embeddings.get("ollama").model # type: ignore try: for model_name in [emb_model_name, llm_model_name]: log_content += f"- Downloading model `{model_name}` from Ollama
" yield log_content pre_download_log = log_content for response in pull_model(model_name): complete = response.get("completed", 0) total = response.get("total", 0) if complete > 0 and total > 0: ratio = int(complete / total * 100) log_content = ( pre_download_log + f"- {response.get('status')}: {ratio}%
" ) else: if "pulling" not in response.get("status", ""): log_content += f"- {response.get('status')}
" yield log_content except Exception as e: log_content += ( "Make sure you have download and installed Ollama correctly. " f"Got error: {str(e)}" ) yield log_content raise gr.Error("Failed to download model from Ollama.") # test models connection llm_output = emb_output = None # LLM model log_content += f"- Testing LLM model: {radio_model_value}
" yield log_content llm = llms.get(radio_model_value) # type: ignore log_content += "- Sending a message `Hi`
" yield log_content try: llm_output = llm("Hi") except Exception as e: log_content += ( f"- Connection failed. " f"Got error:\n {str(e)}" ) if llm_output: log_content += ( "- Connection success. " "
" ) yield log_content if llm_output: # embedding model log_content += f"- Testing Embedding model: {radio_model_value}
" yield log_content emb = embeddings.get(radio_model_value) assert emb, f"Embedding model {radio_model_value} not found." log_content += "- Sending a message `Hi`
" yield log_content try: emb_output = emb("Hi") except Exception as e: log_content += ( f"" "- Connection failed. " f"Got error:\n {str(e)}" ) if emb_output: log_content += ( "" "- Connection success. " "
" ) yield log_content if llm_output and emb_output: gr.Info("Setup models completed successfully!") else: raise gr.Error( "Setup models failed. Please verify your connection and API key." ) def update_default_settings(self, radio_model_value, default_settings): # revise default settings # reranking llm default_settings["index.options.1.reranking_llm"] = radio_model_value if radio_model_value == "ollama": default_settings["index.options.1.use_llm_reranking"] = False return default_settings def switch_options_view(self, radio_model_value): components_visible = [gr.update(visible=False) for _ in range(4)] values = ["cohere", "openai", "ollama", "google", None] assert radio_model_value in values, f"Invalid value {radio_model_value}" if radio_model_value is not None: idx = values.index(radio_model_value) components_visible[idx] = gr.update(visible=True) return components_visible ================================================ FILE: libs/ktem/ktem/reasoning/__init__.py ================================================ ================================================ FILE: libs/ktem/ktem/reasoning/base.py ================================================ from typing import Optional from kotaemon.base import BaseComponent class BaseReasoning(BaseComponent): """The reasoning pipeline that handles each of the user chat messages This reasoning pipeline has access to: - the retrievers - the user settings - the message - the conversation id - the message history """ @classmethod def get_info(cls) -> dict: """Get the pipeline information for the app to organize and display Returns: a dictionary that contains the following keys: - "id": the unique id of the pipeline - "name": the human-friendly name of the pipeline - "description": the overview short description of the pipeline, for user to grasp what does the pipeline do """ raise NotImplementedError @classmethod def get_user_settings(cls) -> dict: """Get the default user settings for this pipeline""" return {} @classmethod def 
# NOTE: this docstring is runtime text. It ends up in the JSON schema
# "description" that `DecomposeQuestionPipeline.create_prompt` sends to the
# LLM as the tool description, so it must not be reworded casually.
class SubQuery(BaseModel):
    """Search over a database of insurance rulebooks or financial reports"""

    sub_query: str = Field(
        ...,
        description="A very specific query against the database.",
    )


class DecomposeQuestionPipeline(RewriteQuestionPipeline):
    """Decompose user complex question into multiple sub-questions

    The LLM is offered a single tool built from `SubQuery`; every tool call it
    makes becomes one sub-question.

    Args:
        llm: the language model to rewrite question
        lang: the language of the answer. Currently support English and Japanese
    """

    llm: ChatLLM = Node(
        default_callback=lambda _: llms.get("openai-gpt4-turbo", llms.get_default())
    )
    DECOMPOSE_SYSTEM_PROMPT_TEMPLATE = (
        "You are an expert at converting user complex questions into sub questions. "
        "Perform query decomposition using provided function_call. "
        "Given a user question, break it down into the most specific sub"
        " questions you can (at most 3) "
        "which will help you answer the original question. "
        "Each sub question should be about a single concept/fact/idea. "
        "If there are acronyms or words you are not familiar with, "
        "do not try to rephrase them."
    )
    prompt_template: str = DECOMPOSE_SYSTEM_PROMPT_TEMPLATE

    def create_prompt(self, question):
        """Build the chat messages and the tool-calling arguments.

        Returns:
            A ``(messages, llm_kwargs)`` pair: the system + human messages,
            and the keyword arguments declaring `SubQuery` as a callable tool.
        """
        schema = SubQuery.model_json_schema()
        function = {
            "name": schema["title"],
            "description": schema["description"],
            "parameters": schema,
        }
        llm_kwargs = {
            "tools": [{"type": "function", "function": function}],
            "tool_choice": "auto",
            "tools_pydantic": [SubQuery],
        }

        messages = [
            SystemMessage(content=self.prompt_template),
            HumanMessage(content=question),
        ]

        return messages, llm_kwargs

    def run(self, question: str) -> list:  # type: ignore
        """Ask the LLM to decompose `question`.

        Returns:
            One `Document` per tool call, whose content is the sub-question.
            The list is empty when the LLM response carries no tool calls.
        """
        messages, llm_kwargs = self.create_prompt(question)
        result = self.llm(messages, **llm_kwargs)
        tool_calls = result.additional_kwargs.get("tool_calls", None)
        sub_queries = []
        if tool_calls:
            for tool_call in tool_calls:
                if "function" in tool_call:
                    # openai and cohere format
                    function_output = tool_call["function"]["arguments"]
                else:
                    # anthropic format
                    function_output = tool_call["args"]

                # Arguments arrive either as a JSON string or as an already
                # decoded mapping. The pydantic v2 validators are used to
                # match `model_json_schema()` above (the v1 `parse_raw` /
                # `parse_obj` spellings are deprecated).
                if isinstance(function_output, str):
                    parsed = SubQuery.model_validate_json(function_output)
                else:
                    parsed = SubQuery.model_validate(function_output)

                sub_queries.append(
                    Document(
                        content=parsed.sub_query,
                    )
                )

        return sub_queries
class FewshotRewriteQuestionPipeline(RewriteQuestionPipeline):
    """Rewrite user question, guided by retrieved few-shot examples

    Args:
        llm: the language model to rewrite question
        rewrite_template: the prompt template for llm to paraphrase a text input
        lang: the language of the answer. Currently support English and Japanese
        embedding: the embedding model to encode the question
        vector_store: the vector store to store the encoded question
        doc_store: the document store to store the original question
        k: the number of examples to retrieve for rewriting
    """

    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    rewrite_template: str = DEFAULT_REWRITE_PROMPT
    lang: str = "English"
    embedding: BaseEmbeddings
    vector_store: BaseVectorStore
    doc_store: BaseDocumentStore
    k: int = getattr(flowsettings, "N_PROMPT_OPT_EXAMPLES", 3)

    def add_documents(self, examples, batch_size: int = 50):
        """Embed the examples and store them, `batch_size` at a time.

        Args:
            examples: iterable of mappings with at least an ``"input"`` key;
                the whole mapping is kept as the document metadata.
            batch_size: number of documents embedded and stored per round.
        """
        print("Adding fewshot examples for rewriting")
        documents = [
            Document(text=example["input"], id_=str(uuid.uuid4()), metadata=example)
            for example in examples
        ]

        for start in range(0, len(documents), batch_size):
            batch = documents[start : start + batch_size]
            embeddings = self.embedding(batch)
            ids = [doc.doc_id for doc in batch]
            self.vector_store.add(
                embeddings=embeddings,
                ids=ids,
            )
            self.doc_store.add(batch)

    @classmethod
    def get_pipeline(
        cls,
        embedding,
        example_path=Path(__file__).parent / "rephrase_question_train.json",
        collection_name: str = "fewshot_rewrite_examples",
    ):
        """Build a pipeline backed by the `collection_name` stores.

        The example file is only loaded and indexed when the document store
        reports that it is still empty.

        Args:
            embedding: the embedding model used to index and query examples.
            example_path: JSON file holding the list of rewrite examples.
            collection_name: name passed to the vector/document store factories.
        """
        vector_store = get_vectorstore(collection_name)
        doc_store = get_docstore(collection_name)

        pipeline = cls(
            embedding=embedding, vector_store=vector_store, doc_store=doc_store
        )
        if doc_store.count():
            return pipeline

        # Close the handle deterministically and do not depend on the
        # platform's default text encoding.
        with open(example_path, "r", encoding="utf-8") as example_file:
            examples = json.load(example_file)
        pipeline.add_documents(examples)

        return pipeline

    def run(self, question: str) -> Document:  # type: ignore
        """Rewrite `question` using the `k` most similar stored examples.

        Each retrieved example is replayed as a human/AI message pair before
        the actual question, all formatted with `rewrite_template`.

        Returns:
            Whatever `self.llm` returns for the assembled messages.
        """
        emb = self.embedding(question)[0].embedding
        _, _, ids = self.vector_store.query(embedding=emb, top_k=self.k)
        examples = self.doc_store.get(ids)
        messages = [SystemMessage(content="You are a helpful assistant")]
        for example in examples:
            messages.append(
                HumanMessage(
                    content=self.rewrite_template.format(
                        question=example.metadata["input"], lang=self.lang
                    )
                )
            )
            messages.append(AIMessage(content=example.metadata["output"]))
        messages.append(
            HumanMessage(
                content=self.rewrite_template.format(question=question, lang=self.lang)
            )
        )

        result = self.llm(messages)
        return result
@classmethod
def convert_uml_to_markdown(cls, text: str) -> str:
    """Convert a PlantUML mindmap into Markdown headings.

    Only the text between the last ``@startmindmap`` and the following
    ``@endmindmap`` is kept; when a marker is missing, the text is used from
    its start and/or up to its end. On every line the leading run of ``*``
    markers becomes the same number of ``#``. Asterisks inside a node label
    are left untouched.

    Args:
        text: raw text containing the PlantUML mindmap.

    Returns:
        The Markdown outline, stripped of surrounding whitespace.
    """
    start_phrase = "@startmindmap"
    end_phrase = "@endmindmap"

    # `str.split` always returns at least one element, so neither lookup can
    # raise and no IndexError fallback is needed.
    body = text.split(start_phrase)[-1].split(end_phrase)[0].strip()

    converted = []
    for line in body.split("\n"):
        content = line.lstrip()
        indent = line[: len(line) - len(content)]
        label = content.lstrip("*")
        depth = len(content) - len(label)
        converted.append(indent + "#" * depth + label)

    return "\n".join(converted)
}, { "input": "What was the decrease in the Other net expense in 2019?", "output": "What was the specific decrease amount experienced in the Other net expense during the year 2019?" }, { "input": "What was the increase in interest income?", "output": "What was the exact amount by which the interest income increased?" }, { "input": "What was the increase in interest income?", "output": "What was the exact amount of the increase in interest income?" }, { "input": "What was the average Other expense, net for 2018 and 2019?", "output": "What was the average net Other expense for the years 2018 and 2019 combined?" }, { "input": "In which year was Other expense, net less than 5,000 thousands?", "output": "In what particular year did the amount of Other expense, net fall below 5,000 thousands?" }, { "input": "What were the income tax expenses in the years 2019, 2018, and 2017, respectively?", "output": "What was the income tax expense in 2019,2018 and 2017 respectively?" }, { "input": "\"What were the income tax expenses for the years 2019, 2018, and 2017?\"", "output": "What was the income tax expense in 2019,2018 and 2017 respectively?" }, { "input": "How much money did the company have in unremitted earnings for the year 2019?", "output": "What was the company's unremitted earnings in 2019?" }, { "input": "How much unremitted earnings did the company have in 2019, without releasing or distributing them?", "output": "What was the company's unremitted earnings in 2019?" }, { "input": "Which year witnessed interest and penalties that amounted to less than 20 thousand?", "output": "In which year was Interest and penalties less than 20 thousand?" }, { "input": "What is the net carrying amount in 2019?", "output": "What is the net carrying amount for the year 2019? Please provide details to assist in providing a more accurate response." }, { "input": "What is the net carrying amount in 2019?", "output": "What is the net carrying amount for the year 2019? 
Can you provide more details or context about the specific asset, liability or financial element in consideration?" }, { "input": "What is the change in the debt discount, net of amortization from December 31, 2019 to December 31, 2018?", "output": "What was the net change in the debt discount's value, after taking into account amortization, between December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Net carrying amount from December 31, 2019 to December 31, 2018?", "output": "What is the difference in the net carrying amount between December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Furniture and equipment from December 31, 2019 to December 31, 2018?", "output": "What is the net difference in the value of furniture and equipment between December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Furniture and equipment from December 31, 2019 to December 31, 2018?", "output": "What was the net change in the value of Furniture and equipment between December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in System hardware from December 31, 2019 to December 31, 2018?", "output": "What are the differences in the hardware of the System between December 31, 2019, and December 31, 2018?" }, { "input": "What was the exact percentage increase in the cost of revenue between 2017 and 2018?", "output": "What was the increase in the cost of revenue in 2018?" }, { "input": "What was the percentage increase in the total amount spent on expenses directly related to generating revenue in 2018 compared to the previous year?", "output": "What was the increase in the cost of revenue in 2018?" }, { "input": "In which year was cost of revenue less than 40,000 thousands?", "output": "In what specific year did the cost of revenue fall below the amount of 40,000 thousands?" 
}, { "input": "If a company used its earnings to finance its domestic operations, what are the potential outcomes or consequences of this decision?", "output": "What would happen if earnings were used to fund domestic operations?" }, { "input": "What was the change in Cash, cash equivalents, and restricted cash at the beginning of the period, comparing December 31, 2019, to December 31, 2018?", "output": "What is the change in Cash, cash equivalents and restricted cash at beginning of period from December 31, 2019 to December 31, 2018?" }, { "input": "What is the net difference in Cash provided by operating activities between December 31, 2019 and December 31, 2018?", "output": "What is the change in Cash provided by operating activities from December 31, 2019 to December 31, 2018?" }, { "input": "What is the change in Cash used in investing activities from December 31, 2019 to December 31, 2018?", "output": "What is the difference in the amount of Cash used in investing activities between December 31, 2019, and December 31, 2018?" }, { "input": "What was the percentage change in the Sales and marketing expense from 2017 to 2018?", "output": "What was the increase in the Sales and marketing expense in 2018 compared to 2017?" }, { "input": "In which year was Sales and marketing expenses less than 50,000 thousands?", "output": "In what specific year did the total expenses for sales and marketing fall below 50,000 thousands (50 million) currency units?" }, { "input": "In which year was General and administrative expenses less than 50,000 thousands?", "output": "In which specific year did the amount for General and Administrative expenses fall below 50,000 thousands (50 million) dollars?" }, { "input": "What is the average Carrying Value for the period December 31, 2019 to December 31, 2018?", "output": "What is the average Carrying Value from December 31, 2018 to December 31, 2019?" 
}, { "input": "In which specific year did the fair value of the 2022 Notes measure below 200,000 thousands?", "output": "In which year was the 2022 Notes fair value less than 200,000 thousands?" }, { "input": "In the year when the fair value of the 2022 Notes was below 200,000 thousands, what specific year was it?", "output": "In which year was the 2022 Notes fair value less than 200,000 thousands?" }, { "input": "What was the 2022 Notes cap price?", "output": "\"What was the cap price for the 2022 Notes? Please provide any additional details that could assist in retrieving the information accurately.\"" }, { "input": "What were the average expenses for sales and marketing in the years 2018 and 2019?", "output": "What was the average Sales and marketing expenses for 2018 and 2019?" }, { "input": "What were the average expenses for Sales and Marketing in 2018 and 2019, respectively?", "output": "What was the average Sales and marketing expenses for 2018 and 2019?" }, { "input": "In which year was Sales and marketing expenses less than 90,000 thousands?", "output": "When was the year that the sales and marketing expenses were recorded to be under 90,000 thousands?" }, { "input": "In which year was Sales and marketing expenses less than 90,000 thousands?", "output": "In which specific year did the total expenses for sales and marketing amount to a figure less than 90,000 thousands?" }, { "input": "What was the exact amount of gross profit recorded in the fiscal year of 2018?", "output": "What was the Gross profit in 2018?" }, { "input": "What was the specific value of the gross profit for the year 2018?", "output": "What was the Gross profit in 2018?" }, { "input": "In what specific year did the revenue fall below one hundred thousand thousands (100,000,000)?", "output": "In which year was revenue less than 100,000 thousands?" 
}, { "input": "\"In which specific year did the revenue fall below 100,000 thousands (100 million)?\"", "output": "In which year was revenue less than 100,000 thousands?" }, { "input": "What is the difference in the Gross Profit between the fiscal years 2018 and 2019?", "output": "What is the change in the Gross Profit from 2018 to 2019?" }, { "input": "What were the capitalized software development costs for the company in the years ending on December 31, 2017?", "output": "What was the company capitalized software development costs during the years ended December 31, 2017?" }, { "input": "What does the Adjusted EBITDA represent?", "output": "What is the meaning and purpose of the term Adjusted EBITDA?" }, { "input": "What does the Adjusted EBITDA represent?", "output": "What is the meaning and significance of Adjusted EBITDA?" }, { "input": "\"What were the federal statutory income tax rates in 2019, 2018, and 2017?\"", "output": "What was the Income tax at federal statutory rate in 2019, 2018 and 2017?" }, { "input": "What were the federal statutory income tax rates in 2019, 2018, and 2017?", "output": "What was the Income tax at federal statutory rate in 2019, 2018 and 2017?" }, { "input": "In what year did the State income tax expense, taking into account the federal tax effect, fall below negative one thousand thousands?", "output": "In which year was State income tax expense, net of federal tax effect less than (1,000) thousands?" }, { "input": "What were the amounts of the Other expense, net in 2018 and 2017?", "output": "What was the Other expense, net in 2018 and 2017 respectively?" }, { "input": "What is the difference in the amount of money owed from customers (net accounts receivable) between December 31, 2019, and December 31, 2018?", "output": "What is the change in Net accounts receivable from December 31, 2019 to December 31, 2018?" 
}, { "input": "What are the respective federal income tax expense at statutory rates in 2018 and 2019?", "output": "What were the federal income tax expenses at statutory rates for the years 2018 and 2019?" }, { "input": "What were the specific state income tax rates, after accounting for federal benefit deductions, for the years 2017 and 2018?", "output": "What are the respective state income taxes, net of federal benefit in 2017 and 2018?" }, { "input": "What were the state income tax rates, after accounting for federal benefit, for both the years 2017 and 2018?", "output": "What are the respective state income taxes, net of federal benefit in 2017 and 2018?" }, { "input": "What are the respective service revenue at March 31 and June 30, 2018?", "output": "What were the service revenues as of March 31 and June 30, 2018?" }, { "input": "What are the respective service revenue at June 30 and September 30, 2018?", "output": "What is the service revenue as of June 30 and September 30, 2018?" }, { "input": "What are the respective service revenue at June 30 and September 30, 2018?", "output": "What is the amount of service revenue reported on the financial statements for June 30 and September 30, 2018?" }, { "input": "What percentage of the total revenue generated in North America in 2019 can be attributed to the non-core segment?", "output": "What is the value of the revenue from the non-core segment from North America as a percentage of the total revenue earned in North America in 2019?" }, { "input": "What is the average amount of net cash generated from operating activities for the years 2018 and 2019?", "output": "What is the average net cash provided by operating activities in 2018 and 2019?" 
}, { "input": "What is the amount of cash-based equity compensation expense included in network operations expenses in both 2018 and 2017?", "output": "What are the respective on-cash equity-based compensation expense included in network operations expenses in 2018 and 2017?" }, { "input": "What are the respective service revenue in 2018 and 2019?", "output": "What was the amount of service revenue in the year 2018 and 2019 respectively?" }, { "input": "What is the total amount of interest and principal payments required for the debt relating to the 2022 Notes?", "output": "What is the value of the interest and principal payment obligations included in the debt under 2022 Notes?" }, { "input": "What is the total amount of both interest and principal that needs to be paid for the debt included in the 2022 Notes?", "output": "What is the value of the interest and principal payment obligations included in the debt under 2022 Notes?" }, { "input": "What is the total amount of debt that the company needs to repay within the next 3 years?", "output": "What is the company's total debt due within 3 years?" }, { "input": "What is the aggregate amount of debt that the company needs to repay within the next three years?", "output": "What is the company's total debt due within 3 years?" }, { "input": "What is the total amount of finance lease obligations that the company needs to repay within the next 3 years?", "output": "What is the company's total finance lease obligations due within 3 years?" }, { "input": "What were the specific values of system infrastructure in the years 2018 and 2019?", "output": "What are the respective values of system infrastructure in 2018 and 2019?" }, { "input": "What were the values of system infrastructure in 2018 and 2019?", "output": "What are the respective values of system infrastructure in 2018 and 2019?" 
}, { "input": "What is the percentage increase or decrease in the value of network equipment from 2018 to 2019?", "output": "What is the percentage change in the value of network equipment between 2018 and 2019?" }, { "input": "What is the percentage change in the value of leasehold improvements between 2018 and 2019?", "output": "What is the precise percentage increase or decrease in the value of leasehold improvements from 2018 to 2019?" }, { "input": "What is the percentage change in the value of system infrastructure between 2018 and 2019?", "output": "What is the percentage difference in the value of system infrastructure from 2018 to 2019?" }, { "input": "What were the pre-tax domestic incomes for 2018 and 2019?", "output": "What are the respective domestic income before income taxes in 2018 and 2019?" }, { "input": "What was the mean pre-tax domestic income for individuals in the years 2017 and 2018?", "output": "What is the average domestic income before income taxes in 2017 and 2018?" }, { "input": "What were the average pre-tax domestic incomes in the years 2017 and 2018?", "output": "What is the average domestic income before income taxes in 2017 and 2018?" }, { "input": "What was the average pre-tax domestic income for households in 2018 and 2019?", "output": "What is the average domestic income before income taxes in 2018 and 2019?" }, { "input": "What is the average number of basic weighted average common shares in 2018 and 2019?", "output": "What is the mean value of the basic weighted average common shares for the years 2018 and 2019?" }, { "input": "What is the average number of basic weighted average common shares in 2018 and 2019?", "output": "What is the average number of basic weighted average common shares for the years 2018 and 2019 combined?" }, { "input": "What was the average impact on stock dilution from stock options in 2018 and 2019?", "output": "What is the average dilutive effect of stock options in 2018 and 2019?" 
}, { "input": "What was the average impact of stock options on dilution for the years 2018 and 2019?", "output": "What is the average dilutive effect of stock options in 2018 and 2019?" }, { "input": "What does the cost of operations represent?", "output": "What is the meaning or significance of the term \"cost of operations\"?" }, { "input": "How many quarters had operating revenues that was below $2,000 million?", "output": "How many quarters had operating revenues below $2,000 million? Can you provide a breakdown of the number of quarters where the operating revenues were below $2,000 million?" }, { "input": "What was the exact change in Operating revenues from the Three Months Ended March to the Three Months Ended June?", "output": "What was the change in Operating revenues between Three Months Ended March and June?" }, { "input": "What was the difference in operating revenues for the Three Months Ended March and the Three Months Ended June?", "output": "What was the change in Operating revenues between Three Months Ended March and June?" }, { "input": "What was the amount of net property and equipment in 2015?", "output": "What was the exact monetary value of net property and equipment in the year 2015?" }, { "input": "What was the amount of net property and equipment in 2015?", "output": "What was the specific amount of net property and equipment recorded on the financial statements for the year 2015?" }, { "input": "What was the value of redeemable noncontrolling interests in the year 2019?", "output": "What were the Redeemable noncontrolling interests in 2019?" }, { "input": "What was the value of the redeemable noncontrolling interests in 2019?", "output": "What were the Redeemable noncontrolling interests in 2019?" 
}, { "input": "What was the percent increase or decrease in the equity of American Tower Corporation from 2015 to 2016 in the United States?", "output": "What was the percentage change in Total American Tower Corporation equity between 2015 and 2016?" }, { "input": "What was the percentage increase or decrease in the equity of American Tower Corporation from 2015 to 2016?", "output": "What was the percentage change in Total American Tower Corporation equity between 2015 and 2016?" }, { "input": "What was the difference in the overall value of assets from 2017 to 2018?", "output": "What was the change in total assets between 2017 and 2018?" }, { "input": "What was the total amount received from the sales of senior notes in the year 2019 after deducting any expenses or fees?", "output": "What were the net Proceeds from issuance of senior notes in 2019?" }, { "input": "What was the total amount of money received from the repayment of credit facilities in 2018?", "output": "What were the net Proceeds from (repayments of) credit facilities in 2018?" }, { "input": "\"What was the total amount received or repaid through credit facilities in 2018?\"", "output": "What were the net Proceeds from (repayments of) credit facilities in 2018?" }, { "input": "What was the value of Equipment in 2018?", "output": "What was the total monetary worth or market value of Equipment in the year 2018?" }, { "input": "What was the value of Equipment in 2018?", "output": "What was the monetary worth of the Equipment in the year 2018?" }, { "input": "What was the exact monetary worth of buildings and any related improvements in the year 2019?", "output": "What was the value of buildings and improvements in 2019?" }, { "input": "What was the difference in the value of land and improvements from 2018 to 2019?", "output": "What was the change in Land and improvements between 2018 and 2019?" 
}, { "input": "What was the percentage increase or decrease in the net value of Property and equipment between the years 2018 and 2019?", "output": "What was the percentage change in Property and equipment, net between 2018 and 2019?" }, { "input": "What are the components that make up redevelopment capital expenditures?", "output": "What does Redevelopment capital expenditures include?" }, { "input": "What actions are authorized by the company's Buyback Programs?", "output": "What do the company's Buyback Programs authorize the company to do?" }, { "input": "What was the change in the total number of shares purchased between November and December?", "output": "How much did the total number of shares purchased increase or decrease from November to December?" }, { "input": "What was the total value of the Property, Plant, and Equipment (PPE) in 2016?", "output": "What was the gross PPE in 2016?" }, { "input": "What was the total value of personal protective equipment (PPE) in terms of sales or revenue for the year 2016?", "output": "What was the gross PPE in 2016?" }, { "input": "For how many years has the total value of gross intangibles been above $15,000 million?", "output": "How many years did gross intangibles exceed $15,000 million?" }, { "input": "How many consecutive years did the value of gross intangibles surpass $15,000 million?", "output": "How many years did gross intangibles exceed $15,000 million?" }, { "input": "What was the percentage change in gross goodwill between 2018 and 2019?", "output": "What was the percentage difference in the amount of gross goodwill from 2018 to 2019?" }, { "input": "What was the percentage change in gross goodwill between 2018 and 2019?", "output": "What was the percentage difference in gross goodwill from 2018 to 2019?" }, { "input": "\"What was the overall amount paid in lease payments for the operating lease agreement?\"", "output": "What were the total lease payments for operating lease?" 
}, { "input": "What was the sum total of all lease payments made for an operating lease?", "output": "What were the total lease payments for operating lease?" }, { "input": "\"What was the total amount of operating lease expenses incurred during the fiscal years 2020, 2021, and 2022?\"", "output": "What was the sum of operating lease in fiscal years 2020-2022?" }, { "input": "What is the proportion of non-current lease liability in relation to the total lease liability, expressed as a percentage?", "output": "What is non-current lease liability as a percentage of Total lease liability?" }, { "input": "Which years did the company allocate a valuation allowance?", "output": "In which years did the company provide a valuation allowance?" }, { "input": "For how many years, starting from January 1, was the balance consistently higher than $100 million?", "output": "How many years was the balance as of January 1 above $100 million?" }, { "input": "How many years, starting from January 1, had a balance above $100 million?", "output": "How many years was the balance as of January 1 above $100 million?" }, { "input": "What specific details about the company's taxes can be found in their state tax returns?", "output": "What do the company's state tax returns reflect?" }, { "input": "In 2017, how many instances were there where the deferred Income tax benefit (provision) exceeded $50 million?", "output": "How many of the deferred Income tax benefit (provision) were above $50 million in 2017?" }, { "input": "How many instances occurred in 2017 where the deferred income tax benefit (provision) exceeded $50 million?", "output": "How many of the deferred Income tax benefit (provision) were above $50 million in 2017?" }, { "input": "In 2019, what is the number of income tax benefits (provisions) that exceeded $(4 million)?", "output": "How many of the current Income tax benefit (provision) were above $(4 million) in 2019?" 
}, { "input": "In 2019, what is the total number of income tax benefits (provisions) that exceeded $(4 million)?", "output": "How many of the current Income tax benefit (provision) were above $(4 million) in 2019?" }, { "input": "What was the change in the net cash provided by (used for) by investing activities between 2017 and 2018?", "output": "What was the difference in the net cash provided by (used for) investing activities from 2017 to 2018?" }, { "input": "What was the percentage change in diluted net income per share attributable to common stockholders of American Tower Corporation between the Three Months Ended in March and June?", "output": "What was the change in Diluted net income per share attributable to American Tower Corporation common stockholders between Three Months Ended March and June?" }, { "input": "What was the specific difference in the value of Prepaids and other current assets between the fiscal years 2018 and 2019?", "output": "What was the change in Prepaids and other current assets between 2018 and 2019?" }, { "input": "What was the difference in the amount of Prepaids and other current assets reported in the financial statements between the years 2018 and 2019?", "output": "What was the change in Prepaids and other current assets between 2018 and 2019?" }, { "input": "What was the total amount of property and real estate taxes that were accumulated or owed in 2018?", "output": "What was the Accrued property and real estate taxes in 2018?" }, { "input": "What was the change in Other accrued expenses between 2018 and 2019?", "output": "What was the difference in the amount of Other accrued expenses recorded in 2018 compared to 2019?" }, { "input": "What was the change in Other accrued expenses between 2018 and 2019?", "output": "What was the difference in the amount of Other accrued expenses recorded between the years 2018 and 2019?" 
}, { "input": "What was the total amount of Net operating loss carryforwards recorded for the financial year 2019?", "output": "What were the Net operating loss carryforwards in 2019?" }, { "input": "What were the amounts of the liability decreases included in the company's revisions in estimates for both 2019 and 2018?", "output": "How much were the decreases to the liability included in the revisions in estimates by the company in 2019 and 2018 respectively?" }, { "input": "What was the change in balance as of December 31 between 2018 and 2019?", "output": "What was the difference in the balance as of December 31st, 2018 and December 31st, 2019?" }, { "input": "What was the value of the Eure-et-Loir interests as of December 31, 2019?", "output": "What was the monetary worth of the Eure-et-Loir interests on the specific date of December 31, 2019?" }, { "input": "How many of the non-current liabilities components in 2018 were above $500 million?", "output": "How many components of non-current liabilities in the year 2018 exceeded $500 million in value?" }, { "input": "How many of the non-current liabilities components in 2018 were above $500 million?", "output": "In 2018, how many components of non-current liabilities exceeded $500 million in value?" }, { "input": "How many years has the expense for stock-based compensation for Total exceeded $100 million?", "output": "How many years did Total stock-based compensation expense exceed $100 million?" }, { "input": "For how many years has the expense for Stock-based compensation - Services been higher than $1 million?", "output": "How many years did Stock-based compensation expense - Services exceed $1 million?" }, { "input": "What differentiates the characteristics of Federal and State NOLs from 2025 to 2029?", "output": "What is the difference between Federal and State NOLs in the period 2025 to 2029?" 
}, { "input": "What differentiates Federal and State NOLs in the 2025-2029 timeframe, and how do they vary during this period?", "output": "What is the difference between Federal and State NOLs in the period 2025 to 2029?" }, { "input": "What was the approximate percentage of revenue in the company's property segments were attributable to their communication sites in 2018?", "output": "What was the estimated percentage of the company's total revenue in 2018 that came from their property segments, specifically from their communication sites?" }, { "input": "What was the approximate percentage of revenue in the company's property segments were attributable to their communication sites in 2018?", "output": "What percentage of the company's total revenue in 2018 can be attributed to their communication sites within their property segments?" }, { "input": "What percentage of the total revenue in the United States in 2019 can be attributed to property segments?", "output": "How many percent of total revenue in 2019 was accounted for by property segments in the U.S.?" }, { "input": "How many percent of total revenue in 2018 was accounted for by property segments in Asia?", "output": "What was the percentage of the total revenue in 2018 contributed by the property segments specifically in Asia?" }, { "input": "What were the intangible assets related to the Tower and network location specifically in the year 2019?", "output": "What were the Tower and network location intangible assets in 2019?" }, { "input": "What were the intangible assets related to the tower and network locations observed in 2019?", "output": "What were the Tower and network location intangible assets in 2019?" }, { "input": "How many years were the total impairment charges above $200 million?", "output": "How many years did the total impairment charges exceed or surpass $200 million?" 
}, { "input": "How many years were the total impairment charges above $200 million?", "output": "For how many years did the total impairment charges exceed $200 million?" }, { "input": "What was the percentage change in Total impairment charges between 2018 and 2019?", "output": "What percentage represents the difference in Total impairment charges between the years 2018 and 2019?" }, { "input": "What was the percentage change in Total impairment charges between 2018 and 2019?", "output": "What is the percentage difference in Total impairment charges from 2018 to 2019?" }, { "input": "For how long has the sum of all assets reached or surpassed $40,000 million?", "output": "How many years did the total assets exceed $40,000 million?" }, { "input": "What was the balance as at January 1, 2019?", "output": "What was the amount of money in the account on January 1, 2019?" }, { "input": "What was the difference in account balance on January 1st, comparing 2018 to 2019?", "output": "What was the change in balance as of January 1 between 2018 and 2019?" }, { "input": "What was the difference in the account balance between January 1, 2018 and January 1, 2019?", "output": "What was the change in balance as of January 1 between 2018 and 2019?" }, { "input": "What is the difference in the percentage increase of current year for 2017 and 2018?", "output": "What was the change in current year increases between 2017 and 2018?" }, { "input": "What was the difference in the yearly increases of current year between 2017 and 2018?", "output": "What was the change in current year increases between 2017 and 2018?" }, { "input": "What was the total amount spent on depreciation in the year 2019?", "output": "What was the depreciation cost in 2019?" }, { "input": "What was the change in depreciation between 2018 and 2019?", "output": "What was the difference in the amount of depreciation incurred in 2018 compared to 2019?" 
}, { "input": "What was the gross amount of accumulated depreciation at the beginning of 2017 and how did it change by the beginning of 2018?", "output": "What was the change in Gross amount of accumulated depreciation at beginning between 2017 and 2018?" }, { "input": "Which specific years have the statute of limitations expired on certain unrecognized tax benefits?", "output": "Which years did the statute of limitations on certain unrecognized tax benefits lapse?" }, { "input": "In 2019, what was the total value of receivables that had not yet been invoiced to customers?", "output": "What were the unbilled receivables in 2019?" }, { "input": "What was the change in Unbilled receivables between 2018 and 2019?", "output": "What was the net difference in Unbilled receivables from 2018 to 2019?" }, { "input": "What was the change in the investments into Nippon Yttrium Co., Ltd (\"NYC\") between 2018 and 2019?", "output": "\"What was the difference in the amount of money invested in Nippon Yttrium Co., Ltd (\"NYC\") from 2018 to 2019?\"" }, { "input": "What was the change in the investments into Nippon Yttrium Co., Ltd (\"NYC\") between 2018 and 2019?", "output": "What was the difference in the amount of investments made into Nippon Yttrium Co., Ltd (\"NYC\") during the years 2018 and 2019?" }, { "input": "What was the percentage change in total investments between 2018 and 2019?", "output": "What was the exact percentage difference in the total amount of investments from 2018 to 2019?" }, { "input": "What was the percentage increase or decrease in Pro forma revenues from 2017 to 2018?", "output": "What was the change in Pro forma revenues between 2017 and 2018?" }, { "input": "How many years did net income exceed $100,000 thousand?", "output": "For how many consecutive years has the net income surpassed $100,000 thousand?" 
}, { "input": "How many years did net income exceed $100,000 thousand?", "output": "For how many consecutive years has the net income been greater than $100,000 thousand?" }, { "input": "What was the specific financial difference in adjustments made to the defined benefit post-retirement plan between the years 2017 and 2019?", "output": "What was the change in Defined benefit post-retirement plan adjustments between 2017 and 2019?" }, { "input": "What was the difference in the adjustments made to Defined Benefit post-retirement plans between the years 2017 and 2019?", "output": "What was the change in Defined benefit post-retirement plan adjustments between 2017 and 2019?" }, { "input": "What were the total sales revenues in Hong Kong in the year 2018?", "output": "What was the net sales in Hong Kong in 2018?" }, { "input": "What was the exact amount of net sales generated in Singapore during the year 2017?", "output": "What was the net sales in Singapore in 2017?" }, { "input": "What was the change in net sales in Mexico between 2017 and 2018?", "output": "What was the difference in net sales in Mexico from 2017 to 2018 and how does it impact the overall sales performance in the country?" }, { "input": "In which specific years did the cumulative net sales in every region surpass one million dollars?", "output": "Which years did the total net sales in all regions exceed $1,000,000 thousand?" }, { "input": "What was the percentage change in the net sales from Other Countries between 2018 and 2019?", "output": "What was the percentage difference in net sales from Other Countries in 2019 compared to 2018?" }, { "input": "What was the difference in the amount of Amortization of debt issuance costs between the years 2017 and 2018?", "output": "What was the change in the Amortization of debt issuance costs between 2017 and 2018?" 
}, { "input": "For how many years did the interest expense on capital leases surpass $200 thousand?", "output": "How many years did Interest expense on capital leases exceed $200 thousand?" }, { "input": "By what percentage did the total interest expense increase or decrease from 2018 to 2019?", "output": "What was the percentage change in the total interest expense between 2018 and 2019?" }, { "input": "What was the percentage increase or decrease in the total interest expense from 2018 to 2019?", "output": "What was the percentage change in the total interest expense between 2018 and 2019?" }, { "input": "What was the total net actuarial loss or gain specifically associated with Other Benefits for the year 2018?", "output": "What was the Net actuarial loss (gain) for Other Benefits in 2018?" }, { "input": "What was the change in net sales between 2017 and 2019?", "output": "What is the difference in net sales from 2017 to 2019?" }, { "input": "For how many years was the cost of sales higher than $800,000 thousand?", "output": "How many years did cost of sales exceed $800,000 thousand?" }, { "input": "For how many years has the cost of sales been higher than $800,000 thousand?", "output": "How many years did cost of sales exceed $800,000 thousand?" }, { "input": "What was the precise net value of property, plant, and equipment in Japan specifically for the year 2019?", "output": "What was the net amount of property, plant and equipment in Japan in 2019?" }, { "input": "What was the net amount of property, plant and equipment in Thailand in 2018?", "output": "What was the exact net value of property, plant, and equipment in Thailand during the year 2018?" }, { "input": "How many years did Total net property, plant and equipment from Non-United States regions exceed $400,000 thousand?", "output": "How many years was the total value of property, plant, and equipment from regions outside the United States greater than $400,000 thousand?" 
}, { "input": "What was the change in the net property, plant and equipment in China between 2018 and 2019?", "output": "What was the exact difference in the net value of property, plant, and equipment in China during the period from 2018 to 2019?" }, { "input": "What was the net interest expense in 2019?", "output": "What was the amount of interest expenses incurred in 2019, after subtracting any interest income earned during the same year?" }, { "input": "What was the change in the Loss on early extinguishment of debt between 2018 and 2019?", "output": "What was the difference in the Loss on early extinguishment of debt from 2018 to 2019?" }, { "input": "What was the percentage increase or decrease in the net interest expense from 2018 to 2019?", "output": "What was the percentage change in the net interest expense between 2018 and 2019?" }, { "input": "What percentage increase or decrease occurred in the net interest expense from 2018 to 2019?", "output": "What was the percentage change in the net interest expense between 2018 and 2019?" }, { "input": "What was the change in the Returns reserves between 2018 and 2019?", "output": "What was the difference in the amount allocated to Returns reserves between the fiscal years 2018 and 2019?" }, { "input": "How much did the Additions for tax positions change from 2018 to 2019 in the current year?", "output": "What was the change in the Additions for tax positions of the current year between 2018 and 2019?" }, { "input": "What was the reflection of Pension and other post-retirement benefits?", "output": "What did Pension and other post-retirement benefits reflect?" }, { "input": "What were the interest obligations that had payments due more than 5 years?", "output": "What were the interest obligations that had payments due for a period longer than 5 years?" 
}, { "input": "What was the precise difference in the total amount between Employee separation liability and Restructuring liability?", "output": "What was the difference in the total between Employee separation liability and Restructuring liability?" }, { "input": "What is the discrepancy in the total amount between Employee Separation Liability and Restructuring Liability?", "output": "What was the difference in the total between Employee separation liability and Restructuring liability?" }, { "input": "What was the distinction in the payment amounts expected in Year 1 for interest obligations compared to operating lease obligations?", "output": "What was the difference between the payments due by Year 1 between Interest obligations and operating lease obligations?" }, { "input": "What were the contrasting payment amounts between Year 1 for interest obligations and operating lease obligations?", "output": "What was the difference between the payments due by Year 1 between Interest obligations and operating lease obligations?" }, { "input": "What is the percentage of total debt obligations in relation to the overall contractual obligations?", "output": "What were the total debt obligations as a percentage of the total contractual obligations?" }, { "input": "What was the amount of cash generated or used in financing activities during the year 2017?", "output": "What was the Net cash provided by (used in) financing activities in 2017?" }, { "input": "What was the percentage change in the amount of cash, cash equivalents, and restricted cash from 2018 to 2019?", "output": "What was the percentage change in the Net increase in cash, cash equivalents, and restricted cash between 2018 and 2019?" }, { "input": "How much of a discount did Customer Advances receive?", "output": "What was the amount of discount for Customer Advances?" 
}, { "input": "What was the total amount of discount provided for Customer Advances?", "output": "What was the amount of discount for Customer Advances?" }, { "input": "What was the net gain or loss from acquisitions in the year 2018?", "output": "What was the Acquisition (gain) loss in 2018?" }, { "input": "What were the restructuring charges in 2017?", "output": "\"What were the specific charges related to restructuring expenses incurred during the year 2017?\"" }, { "input": "What were the restructuring charges in 2017?", "output": "What were the specific charges related to the corporate restructuring that occurred in 2017?" }, { "input": "For how many years has the income tax expense from continuing operations exceeded $10,000 thousand?", "output": "How many years did Total current income tax expense from continuing operations exceed $10,000 thousand?" }, { "input": "What was the difference in foreign income tax expense between 2017 and 2018?", "output": "What was the change in current foreign income tax expense between 2017 and 2018?" }, { "input": "What was the difference in the amount of foreign income tax expense recorded for the years 2017 and 2018?", "output": "What was the change in current foreign income tax expense between 2017 and 2018?" }, { "input": "What was the percentage change in the deferred federal income tax expense between 2018 and 2019?", "output": "What was the percentage increase or decrease in the amount of deferred federal income tax expense from 2018 to 2019?" }, { "input": "What were the restructuring charges in 2019?", "output": "What were the specific charges incurred as part of the restructuring efforts in the year 2019?" }, { "input": "What was the change in Personnel reduction costs between 2017 and 2018?", "output": "What was the difference in the costs associated with reducing personnel between the years 2017 and 2018?" 
}, { "input": "What is the percentage change in the costs associated with relocation and exit from 2018 to 2019?", "output": "What was the percentage change in the Relocation and exit costs between 2018 and 2019?" }, { "input": "What is the percentage difference in the Relocation and exit costs from 2018 to 2019?", "output": "What was the percentage change in the Relocation and exit costs between 2018 and 2019?" }, { "input": "What was the change in Foreign income between 2017 and 2018?", "output": "What was the difference in the amount of income from foreign sources recorded for the years 2017 and 2018?" }, { "input": "What were the net sales in 2019?", "output": "What was the total value of sales generated in 2019?" }, { "input": "What were the net sales in 2019?", "output": "What was the total amount of revenue generated from sales in the year 2019?" }, { "input": "\"What was the gross margin percentage for the quarter ending on September 30th?\"", "output": "What was the gross margin for the Sep-30 quarter?" }, { "input": "\"What was the exact gross margin percentage for the quarter ending on September 30th?\"", "output": "What was the gross margin for the Sep-30 quarter?" }, { "input": "\"For which specific time periods did the Gross Margin exceed $120,000 thousand?\"", "output": "Which quarters ended did the Gross Margin exceed $120,000 thousand?" }, { "input": "What was the total value of contract assets in 2019?", "output": "What were the amount of contract assets in 2019?" }, { "input": "What were the specific values of contract assets recorded in the year 2019?", "output": "What were the amount of contract assets in 2019?" }, { "input": "What was the difference in the amount of prepaid expenses between the fiscal years 2018 and 2019?", "output": "What was the change in prepaid expenses between 2018 and 2019?" 
}, { "input": "What was the difference in the amount of prepaid expenses between the years 2018 and 2019?", "output": "What was the change in prepaid expenses between 2018 and 2019?" }, { "input": "What was the percentage difference in the variance between Software Solutions and Data and Analytics?", "output": "What was the difference in the percent variance between Software Solutions and Data and Analytics?" }, { "input": "What was the reported amount of net trade receivables in 2017?", "output": "What were the net trade receivables as reported in 2017?" }, { "input": "What was the adjustmentments for ASC 606 adoption for net computer software?", "output": "What adjustments were made for the adoption of ASC 606 in relation to net computer software? Please provide a detailed explanation of the adjustments made for net computer software under ASC 606." }, { "input": "What was the adjustmentments for ASC 606 adoption for net computer software?", "output": "What were the adjustments made for the adoption of ASC 606 regarding net computer software?" }, { "input": "In which specific years did the total value of Deferred contract costs surpass $30 million?", "output": "Which years did Deferred contract costs exceed $30 million?" }, { "input": "What was the change in cash equivalents between 2018 and 2019?", "output": "What was the difference in the amount of cash equivalents reported for the years 2018 and 2019?" }, { "input": "What was the change in cash equivalents between 2018 and 2019?", "output": "What was the difference in the amount of cash equivalents held between the year 2018 and the year 2019?" }, { "input": "What was the change in Cash between 2018 and 2019?", "output": "What was the difference in the Cash amount reported in the financial statements of the company between the years 2018 and 2019?" }, { "input": "What was the amount of Equity method investments in 2019?", "output": "How much money was invested through the equity method in 2019?" 
}, { "input": "Which years does the table provide information for the EBITDA by segment?", "output": "For which years does the table contain EBITDA information segmented by different categories?" }, { "input": "Which years does the table provide information for the EBITDA by segment?", "output": "For which specific years does the table provide EBITDA data segmented by different categories?" }, { "input": "What was the total value of EBITDA (Earnings Before Interest, Taxes, Depreciation, and Amortization) for the year 2018?", "output": "What was the sum of the EBITDA in 2018?" }, { "input": "\"What was the specific monetary balance amount for Software Solutions in the year 2017?\"", "output": "What was the balance in 2017 for Software Solutions?" }, { "input": "What was the specific balance amount, in USD, for Software Solutions in the year 2017?", "output": "What was the balance in 2017 for Software Solutions?" }, { "input": "What was the specific amount of federal income tax expense incurred in the fiscal year of 2019?", "output": "What was the current federal income tax expense in 2019?" }, { "input": "Which years does the table provide information for depreciation and amortization by segment?", "output": "For which specific periods does the table offer data regarding depreciation and amortization broken down by segment?" }, { "input": "What was the value of the total assets before the implementation of ASC 606 revenue recognition standards?", "output": "What were the total assets without the adoption of ASC 606?" }, { "input": "What was the sum of all assets before the implementation of ASC 606?", "output": "What were the total assets without the adoption of ASC 606?" }, { "input": "How did the adoption of ASC 606 affect the difference between Total Assets and Total Liabilities?", "output": "What was the difference the effect of ASC 606 Adoption between Total Assets and Total Liabilities?" 
}, { "input": "What is the impact of ASC 606 Adoption on the difference between Total Assets and Total Liabilities?", "output": "What was the difference the effect of ASC 606 Adoption between Total Assets and Total Liabilities?" }, { "input": "What was the percentage change in cash flows from operating activities in 2019 compared to 2018?", "output": "What was the cash flows provided by operating activities in 2019 as a percentage of the cash flow in 2018?" }, { "input": "For how many consecutive years did the Net increase in cash and cash equivalents remain positive?", "output": "How many years was the Net increase in cash and cash equivalents positive?" }, { "input": "What was the EBITDA difference specifically recorded between the Software Solutions division and the Data and Analytics division?", "output": "What was the difference in the EBITDA between Software Solutions and Data and Analytics?" }, { "input": "What was the exact difference in EBITDA (earnings before interest, taxes, depreciation, and amortization) between Software Solutions and Data and Analytics?", "output": "What was the difference in the EBITDA between Software Solutions and Data and Analytics?" }, { "input": "What specific expenses were included in the category of operating expenses for Corporate and Other?", "output": "What did operating expenses for Corporate and Other include?" }, { "input": "What was the discrepancy or variance between the overall value of assets and the specific intangible asset of goodwill in the context of data and analytics?", "output": "What was the difference between the total assets and goodwill from data and analytics?" }, { "input": "What was the difference between Operating expenses and Revenues from Software Solutions?", "output": "What differentiates Operating expenses and Revenues from Software Solutions? Please provide a detailed explanation comparing the two." 
}, { "input": "What was the percentage change in the Net earnings margin between 2017 and 2019?", "output": "What was the percentage difference in the Net earnings margin from 2017 to 2019?" }, { "input": "What was the percentage change in the Net earnings margin between 2017 and 2019?", "output": "What was the exact percentage difference in the net earnings margin from 2017 to 2019?" }, { "input": "What was the percentage change in revenues between 2018 and 2019?", "output": "What was the percentage increase or decrease in revenues from 2018 to 2019?" }, { "input": "What was the percentage change in revenues between 2018 and 2019?", "output": "What was the percentage difference in the amount of money earned in revenue from 2018 to 2019?" }, { "input": "What were the specific prepaid expenses incurred during the year 2018?", "output": "What were the prepaid expenses in 2018?" }, { "input": "What were the specific categories of Other current assets in the financial statements for the year 2019?", "output": "What were the Other current assets in 2019?" }, { "input": "What was the change in contract assets between 2018 and 2019?", "output": "What was the difference in the amount of contract assets from 2018 to 2019?" }, { "input": "How many years did prepaid expenses exceed $40.0 million?", "output": "For how many years was the amount of prepaid expenses higher than $40.0 million?" }, { "input": "How many years did the operating margin exceed 20.0%?", "output": "For how many years was the operating margin greater than 20.0%?" }, { "input": "What was the amount of Corporate Services in 2018?", "output": "What was the specific amount of funds allocated to Corporate Services during the year 2018?" }, { "input": "What was the amount of Corporate Services in 2018?", "output": "How much did Corporate Services amount to in the year 2018?" 
}, { "input": "What specific factors or elements were taken into consideration when calculating the debt?", "output": "What did the calculation for Debt include?" }, { "input": "What was the difference between Total Debt and Total Interest on Debt?", "output": "What is the distinction between the total amount of debt and the total interest incurred on that debt?" }, { "input": "How many contracts with a value exceeding $100 million were made during the period of 2021-2022?", "output": "For the period 2021-2022, how many contractual obligations exceeded $100 million?" }, { "input": "What is the dollar difference between the actual cost and the estimated cost for software solutions?", "output": "What was the dollar variance for software solutions?" }, { "input": "What was the specific revenue generated from the field of Data and Analytics during the year 2017?", "output": "What was revenue from Data and Analytics in 2017?" }, { "input": "For how many consecutive years has revenue from Data and Analytics surpassed $150 million?", "output": "How many years did revenue from Data and Analytics exceed $150 million?" }, { "input": "What was the average amount of money earned between the years 2017 and 2018?", "output": "What was the average total revenue between 2017 and 2018?" }, { "input": "What was the average revenue from Corporate and Other between 2017 and 2018?", "output": "What was the average revenue generated from the Corporate and Other category during the two-year period encompassing 2017 and 2018?" }, { "input": "What was the change in purchased software between 2018 and 2019?", "output": "What was the difference in the amount of software that was bought between the years 2018 and 2019?" }, { "input": "For how many years has the internally developed software generated revenue exceeding $800 million?", "output": "How many years did internally developed software exceed $800 million?" 
}, { "input": "For how long did internally developed software generate revenue exceeding $800 million?", "output": "How many years did internally developed software exceed $800 million?" }, { "input": "What was the percentage change in net computer software between 2018 and 2019?", "output": "What was the percentage increase or decrease in net computer software from 2018 to 2019?" }, { "input": "How many years did Percentage of gross lease receivables exceed 2.0%?", "output": "For how many years was the percentage of gross lease receivables above 2.0%?" }, { "input": "How many years did Percentage of gross lease receivables exceed 2.0%?", "output": "For how long has the percentage of gross lease receivables been above 2.0%?" }, { "input": "What was the percentage change in Allowance for doubtful accounts between 2018 and 2019?", "output": "What is the percentage difference in the allowance for doubtful accounts from 2018 to 2019?" }, { "input": "What was the percentage change in Allowance for doubtful accounts between 2018 and 2019?", "output": "What was the percentage increase or decrease in the Allowance for doubtful accounts from 2018 to 2019?" }, { "input": "What was the percentage change in revenue from the Europe, Middle East, and Africa (EMEA) region between 2017 and 2018?", "output": "What was the change for revenue from EMEA between 2017 and 2018?" }, { "input": "What was the percentage change in revenue from the Europe, Middle East, and Africa (EMEA) region from 2017 to 2018?", "output": "What was the change for revenue from EMEA between 2017 and 2018?" }, { "input": "Why would actual maturities differ from the contractual maturities?", "output": "Why do the actual time periods until maturity sometimes vary from the specified contractual time periods?" 
}, { "input": "What is the distinction in amortized cost between debt investments held for less than 1 year compared to those held for a period ranging from 1 year to 5 years?", "output": "What was the difference between the amortized cost of debt investments that were within 1 year and after 1 year through 5 years?" }, { "input": "What was the total amount of money allocated to the gradual write-off of acquired intangible assets throughout the year 2018?", "output": "What was the total amortization of purchased intangible assets in 2018?" }, { "input": "What was the difference in the variance in dollars between Available-for-sale debt investments and net Other gains (losses)?", "output": "What was the variance in dollars between the variance in Available-for-sale debt investments and the variance in net Other gains (losses)?" }, { "input": "What was the percentage change in the net other gains (losses) between 2017 and 2018?", "output": "What was the percent change in the net amount of other gains (losses) from 2017 to 2018?" }, { "input": "What were the dividends in 2018?", "output": "What were the dividend payments made in the year 2018?" }, { "input": "What were the dividends in 2018?", "output": "What was the total amount of dividends paid out in the year 2018?" }, { "input": "What units are being utilized in the table and what purpose do they serve in the given context?", "output": "What are the units used in the table?" }, { "input": "What was the difference in the balance at the start of the fiscal year in 2017 compared to 2018?", "output": "What was the change in Balance at beginning of fiscal year between 2017 and 2018?" }, { "input": "What was the difference in the balance at the start of the fiscal year from 2017 to 2018?", "output": "What was the change in Balance at beginning of fiscal year between 2017 and 2018?" 
}, { "input": "How does the company typically recommend analyzing their liquidity and capital resources for future periods?", "output": "How does the company believe that their liquidity and capital resources in future periods should be analyzed?" }, { "input": "What differentiates operating leases with durations of less than 1 year from those lasting 1 to 3 years?", "output": "What was the difference in operating leases between those that were less than 1 year and 1 to 3 years?" }, { "input": "How did the characteristics of operating leases differ between those with a duration of less than 1 year and those lasting from 1 to 3 years?", "output": "What was the difference in operating leases between those that were less than 1 year and 1 to 3 years?" }, { "input": "What percentage of the total contractual obligations is represented by the senior notes?", "output": "What was the total senior notes as a percentage of total contractual obligations?" }, { "input": "What is the period that has the greatest Operating leases?", "output": "What is the time frame during which Operating leases are most prevalent?" }, { "input": "How does the company manage and minimize the potential financial loss related to credit risk in relation to derivatives?", "output": "How does the company mitigate credit risk associated with derivatives?" }, { "input": "Which years does the table provide information for the company's outstanding derivatives?", "output": "For which specific years does the table contain information regarding the outstanding derivatives of the company?" }, { "input": "Which years does the table provide information for the company's outstanding derivatives?", "output": "For which specific years does the table present information about the company's exceptional derivatives?" 
}, { "input": "What was the change in the Net investment hedging instruments between 2018 and 2019?", "output": "What was the difference in the amount of net investment hedging instruments between 2018 and 2019?" }, { "input": "What was the change in the Net investment hedging instruments between 2018 and 2019?", "output": "What was the difference in the amount of Net investment hedging instruments between 2018 and 2019?" }, { "input": "What was the percentage change in the total amount of outstanding derivatives between 2018 and 2019?", "output": "What was the percentage difference in the total value of outstanding derivatives from 2018 to 2019?" }, { "input": "How much was the total amount spent on Purchased Considerations specifically for BroadSoft?", "output": "What was the amount of Purchased Considerations for BroadSoft?" }, { "input": "What were the differences in terms of Purchase Consideration specifically between Springpath and Broadsoft? Please provide additional details and insights to help understand this comparison better.", "output": "What was the difference in Purchase Consideration between Springpath and Broadsoft?" }, { "input": "What was the specific variance in terms of Purchase Consideration when comparing Springpath and Broadsoft?", "output": "What was the difference in Purchase Consideration between Springpath and Broadsoft?" }, { "input": "What was the amount of unearned income specifically derived from lease receivables in the year 2019?", "output": "What was unearned income from lease receivables in 2019?" }, { "input": "\"What was the amount of unearned income generated specifically from lease receivables during the year 2019?\"", "output": "What was unearned income from lease receivables in 2019?" 
}, { "input": "What was the difference in the reported total between current and noncurrent financing receivables?", "output": "How much greater or lesser was the total reported amount of financing receivables between current and noncurrent categories?" }, { "input": "What were the inclusions of foreign exchange transactions and other related activities?", "output": "What did foreign exchange and other include?" }, { "input": "What was the difference, expressed as a percentage, in the balance of financing receivables at the end of fiscal year 2018 compared to the balance at the end of fiscal year 2019?", "output": "What was the percentage change in the balance at the end of fiscal year for financing receivables between 2018 and 2019?" }, { "input": "What percentage increase or decrease occurred in the balance of financing receivables at the end of the fiscal year from 2018 to 2019?", "output": "What was the percentage change in the balance at the end of fiscal year for financing receivables between 2018 and 2019?" }, { "input": "How many years did Revenue exceed $50,000 million?", "output": "How many consecutive years has the revenue exceeded $50,000 million?" }, { "input": "How many years did Revenue exceed $50,000 million?", "output": "For how many complete years has the revenue consistently surpassed the $50,000 million mark?" }, { "input": "What is the difference in the amount of Goodwill between Duo and Luxtera and how does it impact their overall value?", "output": "What was the difference in Goodwill between Duo and Luxtera?" }, { "input": "What were the variations in the level of Goodwill between Duo and Luxtera?", "output": "What was the difference in Goodwill between Duo and Luxtera?" }, { "input": "What was the discrepancy in the value of purchased intangible assets between Luxtera and other companies?", "output": "What was the difference in Purchased intangible assets between Luxtera and Others?" 
}, { "input": "What were the variations in the value of purchased intangible assets specifically between Luxtera and other companies/entities?", "output": "What was the difference in Purchased intangible assets between Luxtera and Others?" }, { "input": "What is the acquisition with the highest Purchase Consideration?", "output": "What is the highest purchase consideration acquisition?" }, { "input": "What was the total value of technology products purchased before accounting for any deductions or expenses?", "output": "What was the amount of gross purchased technology?" }, { "input": "What is the net value difference between Technology and Customer relationships?", "output": "What was the difference in the net values between Technology and Customer relationships?" }, { "input": "What was the numerical difference in the net values specifically attributed to Technology and Customer relationships in relation to a certain context?", "output": "What was the difference in the net values between Technology and Customer relationships?" }, { "input": "Which years does the table provide information for the company's activity related to their product warranty liability?", "output": "For which specific years does the table display data regarding the company's product warranty liability?" }, { "input": "Which years does the table provide information for the company's activity related to their product warranty liability?", "output": "For which years does the table displayed provide information about the company's activity in terms of their product warranty liability?" }, { "input": "What were the Acquisitions and divestitures in 2019?", "output": "What were the acquisitions and divestitures that took place in the year 2019?" }, { "input": "What was the difference in the amount allocated for warranty provisions issued in 2017 compared to 2018?", "output": "What was the change in Provisions for warranties issued between 2017 and 2018?" 
}, { "input": "What was the difference in the amount allocated for warranties issued between the years 2017 and 2018?", "output": "What was the change in Provisions for warranties issued between 2017 and 2018?" }, { "input": "For which specific years does the table offer data on cash and cash equivalents as well as investments?", "output": "Which years does the table provide information for cash and cash equivalents and investments?" }, { "input": "What was the change in the value of available-for-sale debt investments, indicating whether there was an increase or decrease?", "output": "What was the increase (decrease) in available-for-sale debt investments?" }, { "input": "In 2019, how did the distinction between cash and cash equivalents and available-for-sale debt investments manifest and how were they different from each other?", "output": "What was the difference between cash and cash equivalents and Available-for-sale debt investments in 2019?" }, { "input": "What differentiates cash and cash equivalents from available-for-sale debt investments in the financial records for the year 2019?", "output": "What was the difference between cash and cash equivalents and Available-for-sale debt investments in 2019?" }, { "input": "What was the change in Interest expense between 2017 and 2018?", "output": "What is the difference in the amount of interest expense incurred from 2017 to 2018?" }, { "input": "What was the change in Interest expense between 2017 and 2018?", "output": "What was the difference in the amount of money paid for interest in 2017 compared to 2018?" }, { "input": "What was the difference in the amount of income earned from interest between the years 2018 and 2019?", "output": "What was the change in Interest income between 2018 and 2019?" 
}, { "input": "What is the amount of federal net operating loss carryforwards that the company had for income tax purposes in the year 2019?", "output": "What was the company's federal net operating loss carryforwards for income tax purposes in 2019?" }, { "input": "What was the amount of the company's federal net operating loss carryforwards specifically for income tax purposes in the year 2019?", "output": "What was the company's federal net operating loss carryforwards for income tax purposes in 2019?" }, { "input": "What was the percentage change in total deferred tax assets between 2018 and 2019?", "output": "What is the percentage increase or decrease in the total amount of deferred tax assets from 2018 to 2019?" }, { "input": "What was the percentage change in total deferred tax assets between 2018 and 2019?", "output": "What was the percentage increase or decrease in the total amount of deferred tax assets from 2018 to 2019?" }, { "input": "In fiscal year 2019, what was the total amount of net interest expense recognized by the company?", "output": "How much net interest expense did the company recognize in fiscal year 2019?" }, { "input": "How much interest expense, after deducting interest income, did the company record as a net amount during fiscal year 2019?", "output": "How much net interest expense did the company recognize in fiscal year 2019?" }, { "input": "What were the Additions for tax positions of prior years in 2019?", "output": "What were the additions made to the tax positions of previous years in the year 2019?" }, { "input": "What was the percentage change in net income between 2018 and 2019?", "output": "What was the percentage difference in net income from 2018 to 2019 and how can it be calculated?" 
}, { "input": "Why has the direct effect of foreign currency fluctuations on revenue not been material?", "output": "Why has the direct impact of changes in foreign currency exchange rates on our revenue not had a significant financial effect so far? Please provide an explanation for the lack of material impact despite fluctuations in currency values." }, { "input": "Why has the direct effect of foreign currency fluctuations on revenue not been material?", "output": "Why has the impact of foreign currency fluctuations on revenue not been significant enough to have a material effect?" }, { "input": "What was the difference in the fair value of forward contracts that were sold between the years 2018 and 2019?", "output": "What was the change in the fair value of sold forward contracts between 2018 and 2019?" }, { "input": "Why does the company prioritize inventory and supply chain management as a specific area of focus?", "output": "Why is inventory and supply chain management an area of focus for the company?" }, { "input": "Why does the company prioritize inventory and supply chain management as a focal point for its operations?", "output": "Why is inventory and supply chain management an area of focus for the company?" }, { "input": "What was the total sum of purchase commitments made in the year 2018?", "output": "What were the total purchase commitments in 2018?" }, { "input": "What was the change in purchase commitments that were less than 1 year between 2018 and 2019?", "output": "What was the difference in purchase commitments of less than 1 year between 2018 and 2019?" }, { "input": "What was the change in purchase commitments that were less than 1 year between 2018 and 2019?", "output": "What was the difference in purchase commitments shorter than one year between the year 2018 and the year 2019?" 
}, { "input": "For how long did contracts ranging from 1 to 3 years surpass a cumulative worth of $700 million?", "output": "How many years did commitments that were 1 to 3 years exceed $700 million?" }, { "input": "What was the percentage change in product revenue for the APJC region from 2017 to 2018?", "output": "What was the change in the product revenue from APJC between 2017 and 2018?" }, { "input": "What was the change in the product revenue from Americas between 2017 and 2018?", "output": "What was the difference in the amount of revenue generated from products in the Americas region between the years 2017 and 2018?" }, { "input": "What was the numerical difference in the total value of net property and equipment in International regions between the years 2017 and 2018?", "output": "What was the change in net property and equipment from International regions between 2017 and 2018?" }, { "input": "What is the percentage increase or decrease in the total net property and equipment from 2018 to 2019?", "output": "What was the percentage change in the total net property and equipment between 2018 and 2019?" }, { "input": "What was the percentage change in revenue generated from security services between 2017 and 2018?", "output": "What was the change in revenue from security between 2017 and 2018?" }, { "input": "What was the total amount paid for the shares during the period from April 28, 2019 to May 25, 2019 and May 26, 2019 to June 22, 2019?", "output": "What was the total payment for the shares from April 28, 2019 to May 25, 2019 and from May 26, 2019 to June 22, 2019?" }, { "input": "What was the total amount paid for the shares during the period from April 28, 2019 to May 25, 2019 and May 26, 2019 to June 22, 2019?", "output": "What was the overall payment for the shares between April 28, 2019 and May 25, 2019, as well as between May 26, 2019 and June 22, 2019?" 
}, { "input": "What was the average total amount paid for the shares during the period from April 28, 2019 to May 25, 2019 and May 26, 2019 to June 22, 2019?", "output": "What was the average total payment for shares from April 28, 2019 to June 22, 2019, and how does it compare between the periods April 28, 2019 to May 25, 2019 and May 26, 2019 to June 22, 2019?" }, { "input": "What was the difference in the balance of Americas between the years 2018 and 2019?", "output": "What was the change in balance from Americas between 2018 and 2019?" }, { "input": "What was the difference in America's balance between 2018 and 2019 and how did it change?", "output": "What was the change in balance from Americas between 2018 and 2019?" }, { "input": "Which years does the table provide information for the company's available-for-sale debt investments and equity investments?", "output": "For which time period does the table provide data on the available-for-sale debt investments and equity investments of the company?" }, { "input": "\"What were the total provisions, also known as benefits, provided in this context?", "output": "What were the total provisions (benefits)?" }, { "input": "What was the difference between total provisions (benefits) and net recoveries (write-offs)?", "output": "What differentiates total provisions, also referred to as benefits, from net recoveries, which are write-offs?" }, { "input": "What was the allowance for credit loss for financed service contracts as a percentage of total allowance for credit loss in 2018?", "output": "In 2018, what was the proportion or ratio of the allowance for credit loss specifically assigned to financed service contracts compared to the total allowance for credit loss?" 
}, { "input": "What was the allowance for credit loss for financed service contracts as a percentage of total allowance for credit loss in 2018?", "output": "What was the percentage of the total allowance for credit loss in 2018 that was allocated specifically for financed service contracts?" }, { "input": "What is the percentage difference in Loss from continuing operations before income tax between the years 2018 and 2019?", "output": "What is the percentage change in the Loss from continuing operations before income tax from 2018 to 2019?" }, { "input": "What is the percentage difference in the Loss from continuing operations before income tax between the years 2018 and 2019?", "output": "What is the percentage change in the Loss from continuing operations before income tax from 2018 to 2019?" }, { "input": "What is the percentage change in the income tax benefit from 2018 to 2019?", "output": "What is the percentage difference in the income tax benefit received between 2018 and 2019?" }, { "input": "In which year did the net loss for the year exceed the net loss of any other year?", "output": "In which year is there a greater net loss for the year?" }, { "input": "What does the net deferred tax liabilities include?", "output": "What is included in the calculation of net deferred tax liabilities?" }, { "input": "What is the exact percentage change in the net deferred tax liabilities from 2018 to 2019?", "output": "What is the percentage change in the net deferred tax liabilities from 2018 to 2019?" }, { "input": "What was the parent entity's current ratio in 2019?", "output": "What is the current ratio of the parent entity in 2019?" }, { "input": "What was the parent entity's current ratio in 2019? Please provide the numerical value.", "output": "What is the current ratio of the parent entity in 2019?" }, { "input": "What was the debts to assets ratio of the parent entity in the year 2018? 
Can you provide the specific ratio value or percentage?", "output": "What is the debts to assets ratio of the parent entity in 2018?" }, { "input": "What is the debt-to-assets ratio of the parent company for the year 2018?", "output": "What is the debts to assets ratio of the parent entity in 2018?" }, { "input": "What is the percentage change in the net cash generated or used for investing activities?", "output": "What is the percentage change in the net cash provided from investing activities?" }, { "input": "What is the change in the net cash provided from operating activities from 2018 to 2019?", "output": "What is the difference between the net cash provided from operating activities in 2018 and 2019?" }, { "input": "What is the change in the net cash provided from operating activities from 2018 to 2019?", "output": "What is the difference in the amount of net cash provided from operating activities between 2018 and 2019?" }, { "input": "What is the change in Net change in cash and cash equivalent from 2018 to 2019?", "output": "What is the difference in the amount of cash and cash equivalents between 2018 and 2019?" }, { "input": "What is the change in Net change in cash and cash equivalent from 2018 to 2019?", "output": "What is the difference in the net change in cash and cash equivalents between the years 2018 and 2019?" }, { "input": "What was the percentage increase or decrease in the value of current assets from 2018 to 2019?", "output": "What is the percentage change in current assets from 2018 to 2019?" }, { "input": "What is the current ratio in the year 2019 and what factors should be considered while determining it?", "output": "What is the current ratio in 2019?" }, { "input": "What was the debts to assets ratio for the year 2019?", "output": "What is the debts to assets ratio in 2019?" }, { "input": "\"What was the debts to assets ratio for the year 2019?\"", "output": "What is the debts to assets ratio in 2019?" 
}, { "input": "What is the purpose and significance of trial commissions, and how are they defined?", "output": "What are trial commissions?" }, { "input": "\"What is the purpose and definition of trial commissions?\"", "output": "What are trial commissions?" }, { "input": "\"How does the Group calculate or determine its total income or financial proceeds?\"", "output": "How does the Group determine its revenue?" }, { "input": "\"How does the Group calculate or establish its revenue? Provide details about the methods or criteria used for determining the Group's revenue.\"", "output": "How does the Group determine its revenue?" }, { "input": "What is the percentage difference in the amount of upfront fees charged from 2018 to 2019?", "output": "What is the percentage change in upfront fees from 2018 to 2019?" }, { "input": "What is the percentage difference in upfront fees between 2018 and 2019?", "output": "What is the percentage change in upfront fees from 2018 to 2019?" }, { "input": "What is the change in the Business combination reserve between 2018 and 2019?", "output": "What is the monetary difference in the Business combination reserve from 2018 to 2019?" }, { "input": "What is the percentage change in the total reserves from 2018 to 2019?", "output": "By what percentage did the total reserves increase or decrease from 2018 to 2019?" }, { "input": "What is the percentage increase or decrease in the trail commission asset comparing the values from 2018 and 2019?", "output": "What is the percentage change in the current trail commission asset from 2018 to 2019?" }, { "input": "What is the percentage change in the cash receipts from 2018 to 2019?", "output": "What is the percentage increase or decrease in the cash receipts comparing the amounts in 2018 and 2019?" }, { "input": "What are the payroll tax rates for both 2019 and 2018?", "output": "What is the payroll tax for 2019 and 2018 respectively?" 
}, { "input": "\"What was the amount of post-tax loss incurred from discontinued operations in the year 2019?\"", "output": "What is the Post-tax loss of discontinued operations in 2019?" }, { "input": "\"In which year did the revenue increase compared to other years?\"", "output": "In which year is there a higher revenue?" }, { "input": "In which specific year did the revenue increase, resulting in a higher amount compared to previous years?", "output": "In which year is there a higher revenue?" }, { "input": "What is the percentage difference in the amount of lease liabilities between 2018 and 2019?", "output": "What is the percentage change in the current lease liabilities from 2018 to 2019?" }, { "input": "What is the exact percentage difference in the lease liabilities as of 2018 compared to 2019?", "output": "What is the percentage change in the current lease liabilities from 2018 to 2019?" }, { "input": "What method does the Company use to determine and calculate the amount for its provision?", "output": "How does the Company calculate its provision?" }, { "input": "How does the Company determine the amount of money set aside for provisions and what factors are taken into consideration during the calculation process?", "output": "How does the Company calculate its provision?" }, { "input": "What is the percentage change in the contract assets from 2018 to 2019?", "output": "What is the percentage change in the value of the contract assets from the year 2018 to the year 2019?" }, { "input": "In which year does the number of outstanding shares have a higher value at the start of the period as compared to other years?", "output": "In which year is the number of outstanding shares at the beginning of the period higher?" }, { "input": "What is the pre-tax discount rate that was utilized to determine the value of projected cash flows for the year 2019?", "output": "What is the pre-tax discount rate applied to cash flow projections in 2019?" 
}, { "input": "What was the pre-tax discount rate used to calculate cash flow projections in 2019?", "output": "What is the pre-tax discount rate applied to cash flow projections in 2019?" }, { "input": "In which year is the health CGU higher?", "output": "\"In which specific year does the health Comprehensive Grand Unit (CGU) exhibit a higher value compared to other years?\"" }, { "input": "In which year is the car CGU higher?", "output": "\"In which specific year does the car CGU (Car Gross Utilization) exhibit a higher value compared to other years? Please provide the year.\"" }, { "input": "What is the post-employment benefits in 2018?", "output": "What were the post-employment benefits in 2018, and can you provide more details about them?" }, { "input": "What were the share-based payment transactions and their impact in 2019?", "output": "What is the share-based payments in 2019?" }, { "input": "What were the share-based payments in 2019 and could you provide further details on this topic?", "output": "What is the share-based payments in 2019?" }, { "input": "What is the percentage change in the post-employment benefits from 2018 to 2019?", "output": "What is the percentage difference in post-employment benefits between the years 2018 and 2019?" }, { "input": "What specific expenses or expenditure categories are included in the unallocated corporate costs for the current year?", "output": "What do the unalloacated corporate costs include in the current year?" }, { "input": "What is the exact percentage change in revenue in Australia from the year 2018 to the year 2019?", "output": "What is the percentage change in the revenue in Australia from 2018 to 2019?" }, { "input": "What is the revenue percentage change experienced in Asia between 2018 and 2019?", "output": "What is the percentage change in the revenue in Asia from 2018 to 2019?" 
}, { "input": "What was the percentage increase or decrease in revenue in the Asian region from 2018 to 2019?", "output": "What is the percentage change in the revenue in Asia from 2018 to 2019?" }, { "input": "What was the total number of outstanding stock options for VMware in 2017?", "output": "What was the outstanding number of shares for VMware stock options in 2017?" }, { "input": "What was the total number of shares allotted for VMware stock options in the year 2017?", "output": "What was the outstanding number of shares for VMware stock options in 2017?" }, { "input": "What was the number of granted shares from Pivotal Stock Options in 2018?", "output": "\"How many shares were granted by Pivotal Stock Options during the year 2018?\"" }, { "input": "For how long did the Weighted-Average Exercise Price (per share) for VMware stock options remain above $60.00?", "output": "How many years did the outstanding Weighted-Average Exercise Price (per share) for VMware stock options exceed $60.00?" }, { "input": "What was the difference in the overall amount of money earned by the company in 2018 compared to 2019?", "output": "What was the change in total revenue between 2018 and 2019?" }, { "input": "What was the difference in total revenue between the years 2018 and 2019?", "output": "What was the change in total revenue between 2018 and 2019?" }, { "input": "How many years did total revenue exceed $500 million?", "output": "For how many consecutive years has the total revenue been greater than $500 million?" }, { "input": "What was the percentage change in net income between 2019 and 2020?", "output": "What was the percentage difference in net income from 2019 to 2020, and how can we calculate it accurately?" }, { "input": "\"What was the variation in the increase in goodwill associated with mergers and acquisitions from 2019 to 2020?\"", "output": "What was the change in the Increase in goodwill related to business combinations between 2019 and 2020?" 
}, { "input": "What was the difference in balance at the start of 2019 and 2020?", "output": "What was the change in balance at the beginning of the year between 2019 and 2020?" }, { "input": "What was the difference in balance at the beginning of 2019 and 2020?", "output": "What was the change in balance at the beginning of the year between 2019 and 2020?" }, { "input": "What did the Senior Notes consists of?", "output": "What were the contents or components of the Senior Notes?" }, { "input": "What was the difference between total Senior Notes and total Notes payable to Dell?", "output": "What is the distinction between the combined value of Senior Notes and the combined value of Notes payable to Dell?" }, { "input": "What differentiates the total amount of Term Loans from the total amount of Future Lease Commitments?", "output": "What was the difference between total Term Loans and total Future Lease Commitments?" }, { "input": "What differentiates the total amount of Term Loans from the total value of Future Lease Commitments? Elaborate on the distinctions between these two financial concepts.", "output": "What was the difference between total Term Loans and total Future Lease Commitments?" }, { "input": "Which specific time periods are covered in the table containing data on long-lived assets categorized by geographic area? These assets primarily consist of net property and equipment.", "output": "Which years does the table include information for long-lived assets by geographic area, which primarily include property and equipment, net?" }, { "input": "How long has the outstanding balance owed to related parties been surpassing $100 million?", "output": "How many years did current amounts due to related parties exceed $100 million?" }, { "input": "What was the cumulative amount of sales invoices acknowledged and recorded as revenue throughout the entire fiscal year of 2020?", "output": "What was the total billings recognized during the year ended 2020?" 
}, { "input": "\"What was the exact amount of billings that were recorded and acknowledged as revenue throughout the entirety of the year that concluded in 2020?\"", "output": "What was the total billings recognized during the year ended 2020?" }, { "input": "What was the change in Unearned software maintenance revenue between 2019 and 2020?", "output": "\"What was the difference in Unearned software maintenance revenue from 2019 to 2020 and how did it change?\"" }, { "input": "What was the change in Unearned software maintenance revenue between 2019 and 2020?", "output": "What was the difference in Unearned software maintenance revenue from 2019 to 2020?" }, { "input": "What is the percentage of unearned revenue in the total revenue for the years 2019 and 2020?", "output": "What was the percentage of total unearned revenue between 2019 and 2020?" }, { "input": "\"What was the percentage of unearned revenue as a proportion of the total revenue generated in the years 2019 and 2020?\"", "output": "What was the percentage of total unearned revenue between 2019 and 2020?" }, { "input": "What was the average price per share in 2018, taking into account the weight assigned to each share?", "output": "What was the Weighted-average price per share in 2018?" }, { "input": "For how many years did the repurchase of Class A common stock surpass a cumulative value of $10,000 million?", "output": "How many years did Class A common stock repurchased exceed $10,000 million?" }, { "input": "For how many years did the repurchase of Class A common stock amount to more than $10 billion?", "output": "How many years did Class A common stock repurchased exceed $10,000 million?" 
}, { "input": "For which years does the table provide details regarding the exclusion of weighted-average common share equivalents of Class A common stock from the calculations of diluted net income per share?", "output": "Which years does the table provide information for the weighted-average common share equivalents of Class A common stock that were excluded from the diluted net income per share calculations?" }, { "input": "What were the employee stock options for the year 2019?", "output": "What was the Employee stock options in 2019?" }, { "input": "What were the employee stock options offered in 2019?", "output": "What was the Employee stock options in 2019?" }, { "input": "How does Dell engage in the procurement of products and services from other companies?", "output": "How does Dell purchase products and services from the company?" }, { "input": "What was the percentage change in the internal-use revenue between 2019 and 2020?", "output": "What is the percentage difference in internal-use revenue from 2019 to 2020?" }, { "input": "Which years does the table provide information for accrued expenses and other?", "output": "For which specific years does the provided table offer information related to accrued expenses and other similar items?" }, { "input": "Which years does the table provide information for accrued expenses and other?", "output": "For which specific years does the table contain data regarding accrued expenses and other items?" }, { "input": "What was the difference in the total amount of expenses that accumulated over time between the years 2019 and 2020?", "output": "What was the change in the total accrued expenses between 2019 and 2020?" }, { "input": "What is the difference in the total amount of expenses that accumulated over time between 2019 and 2020?", "output": "What was the change in the total accrued expenses between 2019 and 2020?" 
}, { "input": "In 2019, what were the specific support and administrative costs incurred by Dell's subsidiary?", "output": "What was the Dell subsidiary support and administrative costs in 2019?" }, { "input": "What was the change in Purchases and leases of products and purchases of services between 2018 and 2019?", "output": "What was the difference in the amount spent on purchasing and leasing products as well as purchasing services from 2018 to 2019?" }, { "input": "What was the change in Purchases and leases of products and purchases of services between 2018 and 2019?", "output": "What was the difference in the amount spent on purchasing and leasing products and purchasing services from 2018 to 2019?" }, { "input": "For how many consecutive years did Dell subsidiary incur support and administrative costs exceeding $200 million?", "output": "How many years did Dell subsidiary support and administrative costs exceed $200 million?" }, { "input": "What was the percentage change in the Dell subsidiary support and administrative costs between 2019 and 2020?", "output": "What percentage increase or decrease was observed in the support and administrative costs of Dell's subsidiary from 2019 to 2020?" }, { "input": "What was the significance or meaning of the aggregate intrinsic value?", "output": "What did the aggregate intrinsic value represent?" }, { "input": "What differentiates outstanding options that are both exercisable and vested as well as expected to vest?", "output": "What was the difference between outstanding options that were exercisable and vested and expected to vest?" }, { "input": "What was the specific amount allocated for the federal income tax provision in the year 2019?", "output": "What was the current federal income tax provision in 2019?" }, { "input": "What was the specific provision for federal income tax in 2019?", "output": "What was the current federal income tax provision in 2019?" 
}, { "input": "What was the change in the current federal income tax provision between 2018 and 2019?", "output": "What was the difference in the amount allocated for federal income tax in 2018 compared to 2019?" }, { "input": "What was the percentage change in the Total income tax provision between 2018 and 2019?", "output": "What was the precise percentage increase or decrease in the Total income tax provision from 2018 to 2019?" }, { "input": "What was the percentage change in the Total income tax provision between 2018 and 2019?", "output": "What is the percentage difference in the Total income tax provision from 2018 to 2019?" }, { "input": "What was the specific monetary value of Restricted cash categorized under other current assets in the financial records for the year 2019?", "output": "What was the amount of Restricted cash within other current assets in 2019?" }, { "input": "What were the specific additions made to tax positions related to the current year of 2020 in the field of taxation?", "output": "What were the additions to tax positions related to current year in 2020?" }, { "input": "What categories are encompassed within the scope of purchases and leases of products, and what does this also include in terms of purchases?", "output": "What did Purchases and leases of products and purchases of services include?" }, { "input": "What was the change in Purchases and leases of products and purchases of services between 2018 and 2019?", "output": "What was the year-on-year difference in the amounts spent on purchasing and leasing products and purchasing services between 2018 and 2019?" }, { "input": "What was the change in Purchases and leases of products and purchases of services between 2018 and 2019?", "output": "What was the difference in the amount spent on purchasing and leasing products, as well as purchasing services, from 2018 to 2019?" 
}, { "input": "For how long did the subsidiary of Dell exceed $150 million in annual support and administrative costs?", "output": "How many years did Dell subsidiary support and administrative costs exceed $150 million?" }, { "input": "What specific expenses or liabilities are encompassed within the current balances owed to affiliated individuals or entities?", "output": "What did the current amounts due to related parties include?" }, { "input": "What specific items or transactions are included in the present outstanding balances owed to related parties?", "output": "What did the current amounts due to related parties include?" }, { "input": "For how long has the current outstanding amount owed by related parties been over $1,000 million?", "output": "How many years did current amount due from related parties exceed $1,000 million?" }, { "input": "For how long has the present outstanding balance owed by affiliated organizations been over $1,000 million?", "output": "How many years did current amount due from related parties exceed $1,000 million?" }, { "input": "What was the percentage increase or decrease in the net current amount due from related parties from 2019 to 2020?", "output": "What was the percentage change in the net current amount due from related parities between 2019 and 2020?" }, { "input": "What was the change in Additions to intangible assets between 2019 and 2020?", "output": "What was the difference in the amount of intangible assets added between the years 2019 and 2020?" }, { "input": "What is the total amount of lease liabilities for operating leases recorded on the current financial statements?", "output": "What were the current lease liabilities for operating leases?" }, { "input": "What is the total amount of lease liabilities for operating leases as of the present moment?", "output": "What were the current lease liabilities for operating leases?" 
}, { "input": "How many lease liabilities for operating leases exceeded $500 million?", "output": "How many operating lease liabilities were there that exceeded $500 million in amount?" }, { "input": "What differentiates current lease liabilities from non-current lease liabilities in the context of operating leases?", "output": "What was the difference between current and non-current lease liabilities for operating leases?" }, { "input": "What differentiates current lease liabilities from non-current lease liabilities for operating leases?", "output": "What was the difference between current and non-current lease liabilities for operating leases?" }, { "input": "What was the discrepancy in the aggregate lease obligations between operating leases and finance leases?", "output": "What was the difference in total lease liabilities between operating leases and finance leases?" }, { "input": "Which years does the table provide information for unearned revenue?", "output": "For which specific years does the table contain information regarding unearned revenue?" }, { "input": "Which years does the table provide information for unearned revenue?", "output": "\"For which specific years does the table provide data on unearned revenue?\"" }, { "input": "What was the percentage change in Unearned subscription and SaaS revenue from 2019 to 2020?", "output": "What was the change in Unearned subscription and SaaS revenue between 2019 and 2020?" }, { "input": "What was the foreign income in 2018?", "output": "What was the total amount of income received from foreign sources during the year 2018?" }, { "input": "What is the percentage difference in the total income prior to income tax between the years 2019 and 2020?", "output": "What was the percentage change in the total income before income tax between 2019 and 2020?" 
}, { "input": "What was the total value of construction projects underway in 2020 globally or within a specific country/region?", "output": "What was the amount of construction in progress in 2020?" }, { "input": "What was the difference in the total value of property and equipment from 2019 to 2020?", "output": "What was the change in Total property and equipment between 2019 and 2020?" }, { "input": "What was the change in the Unrealized losses on available-for-sale securities between 2017 and 2018?", "output": "What was the difference in the amount of Unrealized losses on available-for-sale securities from 2017 to 2018?" }, { "input": "What was the percentage change in the Reclassification of realized transactions, net of taxes between 2018 and 2019?", "output": "What percentage represents the difference in the Reclassification of realized transactions, net of taxes between the years 2018 and 2019?" }, { "input": "What impact did the alteration in the balance of net accounts receivable have on the financial situation?", "output": "What was the effect of change in the net accounts receivable?" }, { "input": "What is the ratio of long-term deferred tax liability to accrued liabilities?", "output": "What was the Long-term deferred tax liability as a ratio of Accrued liabilities?" }, { "input": "How did the Other interest expense change between the years 2017 and 2018?", "output": "What was the change in Other interest expense between 2017 and 2018?" }, { "input": "What was the difference between the reported net sales and the reported cost of sales in terms of amount?", "output": "What was the difference in amount as reported between net sales and cost of sales?" }, { "input": "What was the difference between Net income from continuing operations and Income before income taxes?", "output": "What is the distinction between net income from continuing operations and income before income taxes? 
Please provide a comprehensive response, retaining all the details provided in the original question. Keep the question brief, succinct, and in the same language as the original." }, { "input": "What was the impact of stock options and RSUs on dilution in 2019?", "output": "What was the Dilutive effect of stock options and RSUs in 2019?" }, { "input": "How many consecutive years did the balance at the end of each year surpass $2 million?", "output": "How many years did the balance at end of the year exceed $2 million?" }, { "input": "For how many years was the balance at the end of each year above $2 million?", "output": "How many years did the balance at end of the year exceed $2 million?" }, { "input": "How many years did Intercompany prepaid tax asset amortization exceed $7 million?", "output": "For how many years was the amortization of Intercompany prepaid tax asset greater than $7 million?" }, { "input": "What was the service cost in 2018?", "output": "What was the cost of the service provided in 2018?" }, { "input": "What was the service cost in 2018?", "output": "What was the cost of the service in the year 2018 specifically?" }, { "input": "What was the total amount of interest incurred in the year 2019?", "output": "What was the interest cost in 2019?" }, { "input": "What was the total expense incurred as interest in the year 2019?", "output": "What was the interest cost in 2019?" }, { "input": "What was the percentage difference in the Discount rate from 2018 to 2019?", "output": "What was the change in the Discount rate between 2018 and 2019?" }, { "input": "What was the specific difference in the Discount rate from the year 2018 to the year 2019?", "output": "What was the change in the Discount rate between 2018 and 2019?" }, { "input": "Why could the sums fail to equal the overall total?", "output": "Why might amounts may not add to the total?" 
}, { "input": "\"What was the numerical distinction between the total amount obtained from sales after deducting all applicable costs, and the total revenue generated from sales before deducting any expenses or costs?\"", "output": "What was the difference between the total net sales and gross profit?" }, { "input": "What is the calculated percentage change in the operating income from the third quarter to the fourth quarter?", "output": "What was the percentage change in the Operating income between the third and fourth quarter?" }, { "input": "What was the change in the cost of sales between 2017 and 2018?", "output": "How much did the cost of sales change from 2017 to 2018?" }, { "input": "What was the percentage change in the Net deferred tax asset between 2018 and 2019?", "output": "What was the percentage increase or decrease in the amount of Net deferred tax asset from 2018 to 2019?" }, { "input": "What was the percentage change in the Net deferred tax asset between 2018 and 2019?", "output": "What was the percentage increase or decrease in the Net deferred tax asset from 2018 to 2019?" }, { "input": "What is the percentage difference in the total balance from 2018 to 2019? Calculate using the formula for percentage change.", "output": "What was the percentage change in the total balance between 2018 and 2019?" }, { "input": "What was the percentage increase or decrease in the total balance from 2018 to 2019?", "output": "What was the percentage change in the total balance between 2018 and 2019?" }, { "input": "For how long, in terms of number of years, has the cost of sales exceeded 40% of net sales?", "output": "How many years did cost of sales of net sales exceed 40%?" }, { "input": "What is the percentage increase or decrease in operating income as a proportion of net sales from 2018 to 2019?", "output": "What was the percentage change in operating income of net sales between 2018 and 2019?" 
}, { "input": "What was the exact percentage increase or decrease in operating income of net sales from 2018 to 2019?", "output": "What was the percentage change in operating income of net sales between 2018 and 2019?" }, { "input": "How many years have there been increases in tax positions related to the current year that exceeded $30 million?", "output": "How many years did Increases related to current year tax positions exceed $30 million?" }, { "input": "What was the percentage change in the Ending balance between 2018 and 2019?", "output": "What was the percentage difference in the Ending balance from 2018 to 2019?" }, { "input": "What was the percentage change in the Ending balance between 2018 and 2019?", "output": "What is the percent increase or decrease in the Ending balance from 2018 to 2019?" }, { "input": "According to what were the plan benefits provided, and to what were they in accordance with?", "output": "What were plan benefits provided in accordance with?" }, { "input": "In accordance with what, were the plan benefits provided? Can you expand on the specific benefits mentioned in the plan?", "output": "What were plan benefits provided in accordance with?" }, { "input": "What were the total service costs incurred during the year 2018?", "output": "What were the service costs in 2018?" }, { "input": "What were the specific costs associated with services provided in 2018?", "output": "What were the service costs in 2018?" }, { "input": "What were the total interest expenses incurred in the year 2017?", "output": "What were the interest costs in 2017?" }, { "input": "What was the total amount spent on interest expenses in the year 2017?", "output": "What were the interest costs in 2017?" }, { "input": "What was the difference in the amount of Amortization of actuarial loss between 2018 and 2019?", "output": "What was the change in the Amortization of actuarial loss between 2018 and 2019?" 
}, { "input": "What was the difference in the amount allocated for the gradual reduction of actuarial loss in the financial statements between the years 2018 and 2019?", "output": "What was the change in the Amortization of actuarial loss between 2018 and 2019?" }, { "input": "What was the percentage change in the net pension period cost between 2018 and 2019?", "output": "What was the percentage increase or decrease in the net pension period cost from 2018 to 2019?" }, { "input": "What was the percentage change in the net pension period cost between 2018 and 2019?", "output": "What was the percentage difference in the net pension period cost from 2018 to 2019?" }, { "input": "How was the lower of cost and net realizable value determined as the valuation method for inventories? Please provide a detailed explanation of the process.", "output": "How were inventories valued at the lower of cost and net realizable value?" }, { "input": "What was the quantity or volume of raw materials used in the year 2018?", "output": "What was the amount of raw materials in 2018?" }, { "input": "For how long has the value of Finished goods remained above $200 million?", "output": "How many years did the amount of Finished goods exceed $200 million?" }, { "input": "What was the total value of both Core and developed technology assets combined?", "output": "What was the gross amount of Core and developed technology assets?" }, { "input": "What was the value assigned to each share granted on a specific date at March 31, 2017, taking into account their respective weights?", "output": "What was the Weighted Average Grant Date Fair Value for nonvested shares at March 31, 2017?" }, { "input": "What was the percentage increase or decrease in the number of shares that have not yet vested from 2018 to 2019?", "output": "What was the percentage change in the number of nonvested shares between 2018 and 2019?" 
}, { "input": "What was the cumulative intrinsic value of options and stock appreciation rights (SARs) exercised throughout the fiscal years ending on March 31, 2019?", "output": "What was the total intrinsic value of options and SARs exercised during the years ended March 31, 2019?" }, { "input": "In 2019, how many options and SAR (stock appreciation rights) shares were available for exercise?", "output": "What was the number of option and SAR shares exercisable in 2019?" }, { "input": "Which years does the table provide the number of outstanding shares for?", "output": "Which specific years are mentioned in the table that provides the information regarding the number of outstanding shares?" }, { "input": "What was the difference in the Weighted Average Exercise Price per Share for outstanding shares between the years 2017 and 2018?", "output": "What was the change in the Weighted Average Exercise Price per Share for outstanding shares between 2017 and 2018?" }, { "input": "\"What was the calculated percentage difference in the quantity of outstanding shares from 2018 to 2019?\"", "output": "What was the percentage change in the number of outstanding shares between 2018 and 2019?" }, { "input": "For which specific years does the table provide information on Identifiable long-lived assets (property, plant, and equipment net of accumulated amortization) within different geographic areas?", "output": "Which years does the table provide information for Identifiable long-lived assets (consisting of property, plant and equipment net of accumulated amortization) by geographic area?" 
}, { "input": "Which geographic areas are included in the table that provides information on identifiable long-lived assets, specifically property, plant, and equipment (net of accumulated amortization) for different years?", "output": "Which years does the table provide information for Identifiable long-lived assets (consisting of property, plant and equipment net of accumulated amortization) by geographic area?" }, { "input": "What was the total value of assets held in different countries during the year 2019?", "output": "What was the amount of assets in Various other countries in 2019?" }, { "input": "What was the total value of assets in various countries in 2019?", "output": "What was the amount of assets in Various other countries in 2019?" }, { "input": "What percentage increase or decrease occurred in the total value of long-lived assets from 2018 to 2019?", "output": "What was the percentage change in total long-lived assets between 2018 and 2019?" }, { "input": "What was the change in the Deferred expense for State between 2018 and 2019?", "output": "What is the difference in the amount of Deferred expense for the State from 2018 to 2019?" }, { "input": "What was the change in the Deferred expense for State between 2018 and 2019?", "output": "What was the difference in the amount of Deferred expense for State between the years 2018 and 2019?" }, { "input": "What is the cumulative balance of all time deposits?", "output": "What was the total balance time deposits?" }, { "input": "What is the aggregate amount of money held in time deposits at present?", "output": "What was the total balance time deposits?" }, { "input": "What was the specific balance of Inventories on April 1, 2018, as recorded in the financial records or statements?", "output": "What was the balance of Inventories on April 1, 2018?" }, { "input": "What was the adjustment made in ASC 606 for other current assets? 
Please provide details regarding how this adjustment affected the accounting treatment and reporting of other current assets under ASC 606.", "output": "What was the adjustment from ASC 606 for other current assets?" }, { "input": "How many liabilities had a balance on March 31, 2018 that exceeded $300 million?", "output": "How many liabilities, as of March 31, 2018, had a balance surpassing $300 million?" }, { "input": "What was the percentage change in Other assets specifically attributed to the adjustments made?", "output": "What was the percentage change in Other assets due to the adjustments?" }, { "input": "What were the net sales in 2016?", "output": "What was the total amount of sales generated in the year 2016?" }, { "input": "What was the difference in net sales from 2016 to 2017, and how can this information assist in better answering the question?", "output": "What was the change in net sales between 2016 and 2017?" }, { "input": "What was the difference in net sales revenue from 2016 to 2017?", "output": "What was the change in net sales between 2016 and 2017?" }, { "input": "Which years did assets from Thailand exceed $200 million?", "output": "Which specific years did the total value of assets originating from Thailand surpass $200 million?" }, { "input": "Which years did assets from Thailand exceed $200 million?", "output": "In which specific years did the total value of assets originating from Thailand surpass $200 million?" }, { "input": "What was the percentage change in the total long-lived assets between 2018 and 2019?", "output": "What was the percentage difference in the total value of long-lived assets from 2018 to 2019, indicating whether it increased or decreased?" }, { "input": "In which years was the total annual compensation calculated in?", "output": "What are the specific years for which the total annual compensation was calculated?" 
}, { "input": "Which geographic locations are taken into account for calculating the overall current tax expense as mentioned in the table?", "output": "What are the geographic locations in the table considered when calculating the total current tax expense?" }, { "input": "What is the highest recorded tax expense in Germany and in which specific year did it occur?", "output": "In which year was the current tax expense in Germany the largest?" }, { "input": "What is the highest recorded tax expense in Germany, and in which specific year was it observed?", "output": "In which year was the current tax expense in Germany the largest?" }, { "input": "What was the amount of non-current assets in APJ in 2019?", "output": "What was the specific value of non-current assets within the Asia-Pacific-Japan (APJ) region during the year 2019?" }, { "input": "During which specific years were the calculations of Non-Current Assets by Region conducted?", "output": "In which years were the Non-Current Assets by Region calculated?" }, { "input": "In which year was the amount in Rest of Americas larger?", "output": "In what specific year did the Rest of Americas region experience a higher amount compared to previous years?" }, { "input": "What was the numerical difference in the quantity of Rest of Americas in 2019 compared to 2018?", "output": "What was the change in the amount in Rest of Americas in 2019 from 2018?" }, { "input": "What was the difference in the quantity of Rest of Americas in 2019 compared to 2018?", "output": "What was the change in the amount in Rest of Americas in 2019 from 2018?" }, { "input": "How was the Total expense for the share-based payment plans of Executive Board members determined?", "output": "How was the total expense for the share-based payment plans determined specifically for the Executive Board members?" 
}, { "input": "During which specific years is the Total Expense for Share-Based Payment given or provided?", "output": "In which years is the Total Expense for Share-Based Payment provided?" }, { "input": "For which specific years is the Total Expense for Share-Based Payment given?", "output": "In which years is the Total Expense for Share-Based Payment provided?" }, { "input": "What is the specific value or quantity for the EMEA region in 2019?", "output": "What is the amount for EMEA in 2019?" }, { "input": "What is the amount for APJ in 2018?", "output": "What was the total amount of APJ in the year 2018?" }, { "input": "When did Germany have the highest amount?", "output": "In which year was the amount for Germany the largest?" }, { "input": "When did Germany experience its highest amount?", "output": "In which year was the amount for Germany the largest?" }, { "input": "What is the specific monetary figure spent on employee benefits in the year 2019?", "output": "What is the amount of employee benefits expenses in 2019?" }, { "input": "What was the percentage difference in the amount spent on pensions in 2019 compared to 2018?", "output": "What was the percentage change in Pension expenses in 2019 from 2018?" }, { "input": "What was the total cost of services offered in the year 2019?", "output": "What was the Cost of services in 2019?" }, { "input": "What was the specific cost of services rendered in the year 2019?", "output": "What was the Cost of services in 2019?" }, { "input": "\"When did the Cost of services reach its highest value?\"", "output": "In which year was Cost of services largest?" }, { "input": "In which specific year did the Cost of services reach its maximum value, considering all available data?", "output": "In which year was Cost of services largest?" }, { "input": "What was the change in issued capital in 2019 from 2018?", "output": "What was the net change in the amount of issued capital between the years 2018 and 2019?" 
}, { "input": "What was the change in issued capital in 2019 from 2018?", "output": "What was the difference in the amount of issued capital between 2019 and 2018? How much did the issued capital change from 2018 to 2019?" }, { "input": "What is the percentage of unused tax losses in 2019 that can be attributed to state tax loss carryforwards in the United States?", "output": "How much of unused tax losses relate to U.S. state tax loss carryforwards in 2019?" }, { "input": "In which year was the amount Expiring after the following year the largest?", "output": "In what year did the highest amount of expenses expire after the subsequent year?" }, { "input": "What is the meaning and scope of the rights associated with Bill McDermott?", "output": "What do the rights for Bill McDermott refer to? " }, { "input": "What specific types of entitlements are included in the table when referring to annual pension entitlements?", "output": "What kind of entitlements are annual pension entitlements as displayed in the table?" }, { "input": "Which specific years are the annual pension entitlements fully and legally secured to an individual?", "output": "In which years are the annual pension entitlements vested?" }, { "input": "What was the profit before tax in 2019?", "output": "What was the pre-tax profit for the fiscal year of 2019?" }, { "input": "What was the profit before tax in 2019?", "output": "What was the pre-tax profit for the year 2019?" }, { "input": "\"What was the specific tax rate applicable in 2018 for individuals or businesses in a particular country?\"", "output": "What was the applicable tax rate in 2018?" }, { "input": "What was the percentage change in Other in 2019 from 2018?", "output": "What was the percentage difference in the category labeled \"Other\" between the years 2018 and 2019?" }, { "input": "What market is the information specifically targeting and representing for Airtel?", "output": "Which market does the information reflect for Airtel?" 
}, { "input": "What is the percentage increase in the number of mobile customers for AIS over a specific period?", "output": "What is the % growth of mobile customers for AIS?" }, { "input": "Which associate had the best % growth in mobile customers?", "output": "\"Which associate experienced the highest percentage growth in the number of mobile customers over a specific period of time?\"" }, { "input": "What specific components were encompassed within the exceptional items during the fiscal year 2018?", "output": "What was included in the exceptional items in FY2018?" }, { "input": "What specific items were considered exceptional in the financial year 2018?", "output": "What was included in the exceptional items in FY2018?" }, { "input": "What was the amount of cash generated from operations after accounting for capital expenditures and operating expenses in the specified year?", "output": "What was the free cash flow for the year?" }, { "input": "What is the % change in EBITDA margin from 2018 to 2019?", "output": "What is the percentage difference in EBITDA margin between 2018 and 2019?" }, { "input": "What is the % change in EBITDA margin from 2018 to 2019?", "output": "What is the percentage change in EBITDA margin from the year 2018 to the year 2019?" }, { "input": "What is the absolute difference in net profit between 2018 and 2019?", "output": "What is the change in net profit from 2018 to 2019 in absolute numbers?" }, { "input": "What does the line item labeled as 'Others' provide information about in a given context or dataset?", "output": "What information does the line item 'Others' relate to?" }, { "input": "What is the average total amount owed in debt for the duration of the two-year period?", "output": "What is the average gross debt across the 2 years?" }, { "input": "Why does this net deferred gain balance exist?", "output": "Why is there a net deferred gain balance?" 
}, { "input": "What is the reason for the continued presence of a net deferred gain balance after Singtel completed the sale of its entire 100% stake in NLT to NetLink NBN Trust in July 2017?", "output": "Why is there still a balance of net deferred gain since Singtel sold its 100% interest in NLT to NetLink NBN Trust in July 2017?" }, { "input": "What was the year when the net deferred gain balance reached its peak?", "output": "In which year was the net deferred gain balance the highest?" }, { "input": "What is the subject matter or main focus mentioned in note 22?", "output": "What is the topic of note 22?" }, { "input": "What is included in the \"Others\" category in the table?", "output": "What does the line item \"Others\" in the table encompass?" }, { "input": "What is the Singtel's largest joint venture in terms of the proportion of the Group's ownership?", "output": "Which is the largest joint venture of Singtel, in terms of the proportion of Group's ownership?" }, { "input": "Do the non-audit services offered by KPMG LLP have an impact on their independence in conducting audits?", "output": "Does the non-audit services provided by KPMG LLP affect their independence?" }, { "input": "What is included in the compensation for key management personnel?", "output": "What does key management personnel compensation comprise?" }, { "input": "What is included in the compensation package for key management personnel?", "output": "What does key management personnel compensation comprise?" }, { "input": "What are the components included in directors' remuneration?", "output": "What does directors' remuneration comprise of?" }, { "input": "To what topic does the content of note 18.1 correspond?", "output": "What is the subject matter of note 18.1?" 
}, { "input": "What percentage of the company's derivative financial liabilities are classified as non-current liabilities disclosed in their financial statements?", "output": "How many % of the company's derivative financial liabilities are being disclosed as non-current?" }, { "input": "What is Note 21 about?", "output": "What is the topic of note 21?" }, { "input": "Can you please provide details on the terms and conditions of the advances made by the shareholders?", "output": "What are the terms of the shareholders' advances?" }, { "input": "\"What are the specific terms and conditions associated with the advances provided to shareholders?\"", "output": "What are the terms of the shareholders' advances?" }, { "input": "How many factors need to be considered when calculating the balance for subsidiaries?", "output": "How many factors are involved in calculating the balance for subsidiaries?" }, { "input": "\"What is typically included in the category of selling and administrative costs?\"", "output": "What does selling and administrative costs include?" }, { "input": "How many distinct categories of operating expenses exist?", "output": "How many different type of operating expenses are there?" }, { "input": "What is the mean value of the three highest subcategories within operating expenses for the year 2019?", "output": "What is the average of the top 3 operating expenses subcategories in 2019?" }, { "input": "What is the average value of the three highest subcategories of operating expenses in the year 2019?", "output": "What is the average of the top 3 operating expenses subcategories in 2019?" }, { "input": "What are the projected maximum future payouts for Barry Litwin and Thomas Clark under the NEO plan of 2019?", "output": "What is the maximum estimated future payouts under the 2019 NEO plan for Barry Litwin and Thomas Clark?" 
}, { "input": "What is the maximum estimated future payout amount under the 2019 NEO plan specifically designated for Barry Litwin and Thomas Clark?", "output": "What is the maximum estimated future payouts under the 2019 NEO plan for Barry Litwin and Thomas Clark?" }, { "input": "What are the projected highest expected future payouts under the 2019 NEO plan for both Manoj Shetty and Lawrence Reinhold?", "output": "What is the maximum estimated future payouts under the 2019 NEO plan for Manoj Shetty and Lawrence Reinhold?" }, { "input": "What are the audit fees incurred by the company in 2018 and 2019?", "output": "What were the audit fees that the company had to pay in the years 2018 and 2019?" }, { "input": "What is the percentage difference in the total amount of fees paid by the company for all other expenses between the years 2018 and 2019?", "output": "What is the percentage change in all other fees incurred by the company between 2018 and 2019?" }, { "input": "What percentage of common stock does Barry Litwin and Robert D. Rosenthal collectively own?", "output": "What is the percent of common stock owned by Barry Litwin and Robert D. Rosenthal?" }, { "input": "What is the total compensation received by Robert D. Rosenthal and Chad M. Lindbloom respectively during fiscal 2019?", "output": "What were the individual total compensations received by Robert D. Rosenthal and Chad M. Lindbloom in fiscal year 2019?" }, { "input": "What is the total compensation received by Paul S. Pearlman and Lawrence Reinhold respectively during fiscal 2019?", "output": "What were the individual total compensations earned by Paul S. Pearlman and Lawrence Reinhold in fiscal year 2019?" }, { "input": "What is the definition and significance of stock awards?", "output": "What does stock awards refer to?" 
}, { "input": "What are the Thomas Clark's accelerated vesting of stock options and unvested performance restricted stock units respectively?", "output": "Can you please provide specific details about the accelerated vesting of stock options of Thomas Clark? Additionally, what are the details regarding the unvested performance restricted stock units held by Thomas Clark?" }, { "input": "What is the meaning or definition of the \"other\" description in the allowance for sales return in the year 2017?", "output": "What does the \"other\" description in allowance for sales return in 2017 refer to?" }, { "input": "What is the cumulative amount of allowances for sales returns write-offs from 2017 to 2019?", "output": "What is the total allowances for sales returns write-offs between 2017 to 2019?" }, { "input": "What is the total amount of allowances accounted for sales returns and write-offs from 2017 to 2019?", "output": "What is the total allowances for sales returns write-offs between 2017 to 2019?" }, { "input": "What is the percentage difference in the allowance for deferred tax assets at the end of the period in 2018 compared to 2019?", "output": "What is the percentage change in the allowance for deferred tax assets at the end of period between 2018 and 2019?" }, { "input": "What are Manoj Shetty's respective accelerated vesting of stock options and unvested performance restricted stock units respectively?", "output": "What is the accelerated vesting schedule for Manoj Shetty's stock options? Additionally, how many unvested performance restricted stock units does he currently have?" }, { "input": "What are Manoj Shetty's respective accelerated vesting of stock options and unvested performance restricted stock units respectively?", "output": "What is the accelerated vesting timeline for Manoj Shetty's stock options? Additionally, what is the status of his unvested performance restricted stock units?" 
}, { "input": "What is the difference in consolidated gross profits comparing 2019 to 2018, and also 2018 to 2017?", "output": "What is the change in consolidated gross profits between 2019 vs 2018 and 2018 vs 2017 respectively?" }, { "input": "What is the total consolidated net sales in 2019 and 2018?", "output": "What were the combined net sales for the fiscal years of 2019 and 2018?" }, { "input": "What is the total consolidated net sales in 2017 and 2018", "output": "What were the combined net sales for the years 2017 and 2018?" }, { "input": "What is the total consolidated net sales in 2017 and 2018", "output": "What were the combined net sales for 2017 and 2018?" }, { "input": "What is the change in consolidated gross profit between 2017 and 2018?", "output": "What is the difference in consolidated gross profit for the company between the years 2017 and 2018, and how does the value change during this period?" }, { "input": "What is the change in consolidated gross profit between 2017 and 2018?", "output": "What is the specific difference in consolidated gross profit when comparing the financial years of 2017 and 2018?" }, { "input": "What was the percentage of the total population or total something specific (specify) from Singapore in the year 2019?", "output": "What was the percentage of total from Singapore in 2019?" }, { "input": "What was the average amount of money sent from Singapore in the years 2018 and 2019?", "output": "What was the average amount from Singapore in 2018 and 2019?" }, { "input": "What was the average amount of money, in Singapore currency, that was received from Singapore in the years 2018 and 2019 combined?", "output": "What was the average amount from Singapore in 2018 and 2019?" }, { "input": "How is the fair value of a financial instrument of a company determined and defined?", "output": "How is the fair value of the company's financial instrument defined?" 
}, { "input": "In what specific year did the interest rate swap fall below the threshold of 1,000 thousands?", "output": "In which year was interest rate swap less than 1,000 thousands?" }, { "input": "In which specific year did the interest rate swap fall below one million in thousands?", "output": "In which year was interest rate swap less than 1,000 thousands?" }, { "input": "What are the three levels of subjectivity and could you provide more information to assist in providing a comprehensive answer?", "output": "What are the 3 levels of subjectivity?" }, { "input": "What are the 3 distinct levels of subjectivity and how can they be described or defined?", "output": "What are the 3 levels of subjectivity?" }, { "input": "\"What were the additional features or updates introduced in the year 2019? Similarly, what were the new additions or improvements made in 2018?\"", "output": "What were the other additions in 2019 and 2018 respectively?" }, { "input": "\"What are the average number of other additions made in 2018 and 2019?\"", "output": "What is the average other additions for 2018 and 2019?" }, { "input": "\"What is the average number of non-inclusive additions made in the years 2018 and 2019 combined?\"", "output": "What is the average other additions for 2018 and 2019?" }, { "input": "In what specific year did the transfer of developed technology from IPR&D exceed an amount greater than 4,000 thousands?", "output": "In which year was Transfers to developed technology from IPR&D greater than 4,000 thousands?" }, { "input": "What were the average Selling, General, and Administrative expenses for the financial years ending on December 31, 2019 and 2018?", "output": "What is the average Selling, general and administrative for the Years Ended December 31, 2019 to 2018?" 
}, { "input": "\"What is the average amount spent on Selling, general and administrative expenses for the Years Ended December 31, 2019 and 2018?\"", "output": "What is the average Selling, general and administrative for the Years Ended December 31, 2019 to 2018?" }, { "input": "What was the percentage decrease in the Selling, general and administrative expenses during the year 2019?", "output": "What was the decrease in Selling, general and administrative in 2019?" }, { "input": "What was the number of shares granted in 2019?", "output": "How many shares were awarded in the year 2019?" }, { "input": "Define research and development activities.", "output": "What are research and development activities?" }, { "input": "What was the average amount spent on net revenue for the years ending on December 31, 2019 and 2018?", "output": "What is the average Cost of net revenue, for the Years Ended December 31, 2019 to 2018?" }, { "input": "What is the average Cost of net revenue for the period spanning from December 31, 2018, to December 31, 2019?", "output": "What is the average Cost of net revenue, for the Years Ended December 31, 2019 to 2018?" }, { "input": "In which year was Restructuring expense 0 thousands?", "output": "In which specific year did the Restructuring expense amount to zero thousands (thousands referring to the unit) without any incurred costs?" }, { "input": "In which year was Restructuring expense 0 thousands?", "output": "What was the specific year when the Restructuring expense amounted to 0 thousands?" }, { "input": "\"What were the transfers of developed technology from IPR&D in the years 2019 and 2018 respectively?\"", "output": "What were the Transfers to developed technology from IPR&D in 2019 and 2018 respectively?" 
}, { "input": "What was the change in the Transfers to developed technology from IPR&D from 2018 to 2019?", "output": "What was the difference in the amount of transfers to developed technology from the Intangible Property, Research, and Development (IPR&D) category between the years 2018 and 2019?" }, { "input": "In which year was Transfers to developed technology from IPR&D negative?", "output": "In what specific year did Transfers to developed technology from IPR&D show a negative value?" }, { "input": "What is the average Short-term restricted cash for 2018 and 2019?", "output": "What is the average amount of Short-term restricted cash for the years 2018 and 2019?" }, { "input": "What was the average Net cash provided by operating activities from December 31, 2018 to December 31, 2019?", "output": "What is the average Net cash provided by operating activities for the year ended December 31, 2019 to 2018?" }, { "input": "What is the average amount of cash generated from operating activities, received or used, for the period spanning from December 31, 2018 to December 31, 2019?", "output": "What is the average Net cash provided by operating activities for the year ended December 31, 2019 to 2018?" }, { "input": "What was the respective Net cash used in investing activities in 2019 and 2018?", "output": "What were the net amounts of cash used in investing activities for the years 2019 and 2018 respectively?" }, { "input": "What was the respective Net cash used in investing activities in 2019 and 2018?", "output": "\"What was the amount of net cash utilized in investing activities during the years 2019 and 2018?\"" }, { "input": "What was the net value of long-term debt in 2019 after deducting any related expenses or liabilities?", "output": "What was the Net carrying amount of long-term debt in 2019?" 
}, { "input": "What was the specific amount of long-term debt after deducting all necessary expenses from the total amount in 2018?", "output": "What was the Net carrying amount of long-term debt in 2018?" }, { "input": "What was the specific amount of long-term debt, after deducting any reductions or write-offs, recorded on the financial statements for the year 2018?", "output": "What was the Net carrying amount of long-term debt in 2018?" }, { "input": "What is the mean value of the unamortized debt discount for the fiscal years 2018 and 2019?", "output": "What is the average Unamortized debt discount for 2018 and 2019?" }, { "input": "When was the year when the net carrying amount of long-term debt was less than 210,000 thousands?", "output": "In which year was Net carrying amount of long-term debt less than 210,000 thousands?" }, { "input": "What is the average working capital for the period from December 31, 2018 to December 31, 2019?", "output": "What is the average Working capital for December 31, 2019 to 2018?" }, { "input": "What is the average Cash and cash equivalents for December 31, 2019 to 2018?", "output": "What is the average amount of Cash and cash equivalents for the period from December 31, 2018 to December 31, 2019?" }, { "input": "What is the average Cash and cash equivalents for December 31, 2019 to 2018?", "output": "What is the average amount of Cash and cash equivalents as of December 31 for the years 2019 and 2018 combined?" }, { "input": "What is the eligibility of Performance-based restricted stock to vest?", "output": "What criteria must be met for Performance-based restricted stock to become vested and eligible for acquisition?" }, { "input": "What were the adjustments in 2018?", "output": "\"What specific adjustments, if any, occurred during the year 2018?\"" }, { "input": "When was the year when the ending balance fell below $240,000,000?", "output": "In which year was the ending balance less than 240,000 thousands?" 
}, { "input": "What was the total amount of expenses incurred during the period from January 1 to December 31, 2018, after accounting for all deductions and offsets?", "output": "What was the net expense in the year ended December 31, 2018?" }, { "input": "What was the total amount of expenditures for the financial year that concluded on December 31, 2018?", "output": "What was the net expense in the year ended December 31, 2018?" }, { "input": "What was the total amount of expenses incurred during the period from January 1 to December 31, 2019?", "output": "What was the net expense in the year ended December 31, 2019?" }, { "input": "What was the total expenditure for the period that concluded on December 31, 2019?", "output": "What was the net expense in the year ended December 31, 2019?" }, { "input": "What was the percentage increase in the cost of services in 2019?", "output": "How much did Cost of services increase in 2019?" }, { "input": "What was the percentage increase in the cost of services in the year 2019 compared to the previous year?", "output": "How much did Cost of services increase in 2019?" }, { "input": "What was the Less Capital expenditures (including capitalized software) in 2019?", "output": "What was the total amount spent on Less Capital Expenditures, including expenditure on capitalized software, during the year 2019?" }, { "input": "What was the Less Capital expenditures (including capitalized software) in 2019?", "output": "\"What was the total amount of capital expenditures, including capitalized software, that occurred in the year 2019?\"" }, { "input": "What was the free cash flow in 2019?", "output": "What was the amount of free cash flow generated in the year 2019?" }, { "input": "What was the average free cash flow for 2018 and 2019?", "output": "What was the average amount of free cash flow generated for the years 2018 and 2019?" 
}, { "input": "What was the percentage change in the Less Capital expenditures from 2018 to 2019?", "output": "What was the percentage difference in the amount spent on capital expenditures between 2018 and 2019?" }, { "input": "What was the exact total value of the assets owned by a particular entity/company during the year 2019?", "output": "What was the Total assets in 2019?" }, { "input": "What was the total value of assets in the year 2019?", "output": "What was the Total assets in 2019?" }, { "input": "What was the average amount of operating income recorded during the period from 2015 to 2019?", "output": "What was the average operating income for 2015-2019?" }, { "input": "What was the change in the Per common share – basic from 2018 to 2019?", "output": "What was the exact change in the Per common share – basic value from 2018 to 2019? Please provide the specific numerical difference between the two years." }, { "input": "What were the amounts of Service and other revenues for the years ended December 31, 2019 and 2018?", "output": "How much did Service and other revenues amounted for years ended December 31, 2019 and 2018 respectively?" }, { "input": "What were the respective amounts of Service and other revenues for the years ended December 31, 2019 and 2018?", "output": "How much did Service and other revenues amounted for years ended December 31, 2019 and 2018 respectively?" }, { "input": "\"What were the respective revenue amounts generated from Wireless equipment for the years ending December 31, 2019 and 2018?\"", "output": "How much did Wireless equipment revenues amounted for years ended December 31, 2019 and 2018 respectively?" }, { "input": "What was the total amount of revenue generated from Wireless equipment during the years ending December 31, 2019 and 2018?", "output": "How much did Wireless equipment revenues amounted for years ended December 31, 2019 and 2018 respectively?" 
}, { "input": "What was the percentage change in the value of Small and Medium Businesses from 2018 to 2019?", "output": "What is the change in Small and Medium Business value from 2018 to 2019?" }, { "input": "What is the Consolidated Net Income in 2019?", "output": "What is the Consolidated Net Income for the year 2019?" }, { "input": "What is the Consolidated Net Income in 2019?", "output": "What is the exact amount of Consolidated Net Income recorded in the fiscal year of 2019?" }, { "input": "What is the difference in Consolidated Adjusted EBITDA between 2018 and 2019?", "output": "What is the change in Consolidated Adjusted EBITDA from 2018 to 2019?" }, { "input": "What was the net change in accumulated other comprehensive income for the year 2016?", "output": "What was the net increase to accumulated other comprehensive income in 2016?" }, { "input": "What is the difference in the amount of projected employee benefits that a company is obligated to pay, as of 2018 compared to 2019?", "output": "What is the change in the projected benefit obligation from 2018 to 2019?" }, { "input": "\"What was the specific percentage of income tax that individuals had to pay during the tax year of 2018?\"", "output": "What was the effective income tax rate for the period 2018?" }, { "input": "What is the percentage difference in the effective income tax rate between 2018 and 2019?", "output": "What is the change in Effective income tax rate from 2018 to 2019?" }, { "input": "What specific amount, included in the year 2018, would have a positive impact on the effective income tax rate?", "output": "What was the amount that would favorably affect the effective income tax rate if included in 2018?" }, { "input": "What specific amount, included in 2017, would positively impact the effective income tax rate?", "output": "What was the amount that would favorably affect the effective income tax rate if included in 2017?" 
}, { "input": "What specific amount, if included in the 2017 earnings, would have a positive impact on the effective income tax rate?", "output": "What was the amount that would favorably affect the effective income tax rate if included in 2017?" }, { "input": "How did the Additions based on tax positions related to the current year change from 2018 to 2019?", "output": "What was the change in the Additions based on tax positions related to the current year from 2018 to 2019?" }, { "input": "What was the average Additions for tax positions of prior years for 2017-2019?", "output": "What were the average additions made for tax positions in the previous years, specifically from 2017 to 2019?" }, { "input": "What was the total long-term debt?", "output": "\"What was the precise amount of long-term debt accrued by the entity in question, taking into account all outstanding obligations over an extended period of time?\"" }, { "input": "What was the value of the long-term debt that was payable within less than 1 year?", "output": "What was the long-term debt less than 1 year?" }, { "input": "What was the exact amount of the finance lease obligation in its entirety?", "output": "What was the total finance lease obligation?" }, { "input": "What is the difference between the long-term debt due less than 1 year and 1 to 3 years?", "output": "What distinguishes long-term debt due within less than one year from long-term debt due within one to three years?" }, { "input": "What is the difference between the long-term debt due less than 1 year and 1 to 3 years?", "output": "What differentiates long-term debt with a maturity of less than 1 year from debt with a maturity ranging from 1 to 3 years?" }, { "input": "By what percentage did Consumer's total operating revenues increase in the year 2019?", "output": "How much did Consumer’s total operating revenues increase in 2019?" 
}, { "input": "What was the percentage increase in Consumer's total operating revenues in 2019 compared to the previous year?", "output": "How much did Consumer’s total operating revenues increase in 2019?" }, { "input": "What was the percentage increase in Service Revenue for the year 2019 compared to the previous year?", "output": "How much did Service Revenue increase in 2019?" }, { "input": "What is the percentage change in revenue from wireless equipment sales between 2018 and 2019?", "output": "What is the change in Wireless equipment revenue from 2018 to 2019?" }, { "input": "By what percentage did the consolidated revenue increase from 2018 to 2019?", "output": "What was the increase in the consolidated revenue from 2018 to 2019?" }, { "input": "What was the percentage increase/decrease in consumer revenue from 2018 to 2019?", "output": "What was the change in the consumer revenue from 2018 to 2019?" }, { "input": "What is the source of financing for the firm's capital spending requirements?", "output": "How is the capital spending requirements for the firm financed?" }, { "input": "How does the firm obtain financing for its capital spending needs?", "output": "How is the capital spending requirements for the firm financed?" }, { "input": "What was the average cash flow used in investing activities for 2018 and 2019?", "output": "What was the average amount of cash used in investing activities during the years 2018 and 2019?" }, { "input": "What was the percentage change in cash flow used in financing activities from 2018 to 2019?", "output": "What was the percentage change in cash flow utilized for financing activities between the fiscal years 2018 and 2019?" }, { "input": "What was the percentage change in cash flow used in financing activities from 2018 to 2019?", "output": "What was the exact percentage of increase or decrease in cash flow used for financing activities between 2018 and 2019?" 
}, { "input": "What was the percentage difference in the cost of services between 2018 and 2019?", "output": "What was the change in the cost of services from 2018 to 2019?" }, { "input": "What were the adopted standards on January 1, 2018?", "output": "Which Standards were Adopted on January 1, 2018?" }, { "input": "What is the change in Accumulated other comprehensive income from December 31, 2017 to January 1, 2018?", "output": "What is the difference in the amount of Accumulated Other Comprehensive Income between December 31, 2017 and January 1, 2018?" }, { "input": "What is the net change in Noncontrolling interests between December 31, 2017, and January 1, 2018?", "output": "What is the change in Noncontrolling interests from December 31, 2017 to January 1, 2018?" }, { "input": "What was the difference in Noncontrolling interests' value between December 31, 2017, and January 1, 2018?", "output": "What is the change in Noncontrolling interests from December 31, 2017 to January 1, 2018?" }, { "input": "What was the ratio of the amortized cost due within one year or less to the fair value for the same period?", "output": "What was the amortized cost that was due in one year or less as a ratio of the fair value for the same period?" }, { "input": "What was the ratio between the amortized cost to be paid within one year or less and the fair value for the same period?", "output": "What was the amortized cost that was due in one year or less as a ratio of the fair value for the same period?" }, { "input": "What is the specific difference between the amortized cost and fair value, at the end of five years and at the end of ten years?", "output": "What was the difference between the amortized cost and fair value that was due after five years through ten years?" 
}, { "input": "Which specific years can be found in the table that offer additional details pertaining to the stock options of the company?", "output": "Which years does the table provide Additional information related to the company's stock options?" }, { "input": "What was the specific intrinsic value associated with exercises in the year 2019?", "output": "What was the intrinsic value of exercises in 2019?" }, { "input": "What were the Proceeds received from exercises in 2018?", "output": "What was the total amount of money received from exercises held in 2018?" }, { "input": "What were the Proceeds received from exercises in 2018?", "output": "What was the total amount of money received from exercises in the year 2018?" }, { "input": "What was the gross income in 2017, prior to deducting income taxes?", "output": "What was the total income before income taxes in 2017?" }, { "input": "What was the change in foreign income before income taxes between 2017 and 2018?", "output": "What was the difference in the amount of income from foreign sources prior to income taxes, comparing the years 2017 and 2018? Please provide the change in figures for better understanding." }, { "input": "What was the specific difference in foreign income, without considering income taxes, from 2018 to 2019?", "output": "What was the change in foreign income before income taxes between 2018 and 2019?" }, { "input": "What were the short-term investment options available in the year 2018?", "output": "What were the short-term investments in 2018?" }, { "input": "What were the types of short-term investments that were prevalent in 2018?", "output": "What were the short-term investments in 2018?" }, { "input": "What years does the table provide information for total assets?", "output": "For which time period does the table provide data on total assets?" 
}, { "input": "What years does the table provide information for total assets?", "output": "For which years does the table present data regarding the total assets?" }, { "input": "What was the total debt in 2015?", "output": "What amount of debt, in terms of total outstanding obligations, was recorded in the year 2015?" }, { "input": "What was the change in working capital between 2015 and 2016?", "output": "What was the difference in the amount of working capital between 2015 and 2016?" }, { "input": "By what percentage did the Total stockholders' equity change from 2018 to 2019?", "output": "What was the percentage change in Total stockholders' equity between 2018 and 2019?" }, { "input": "Which years does the table provide information for the funded status of the company's postretirement health care and other defined benefit plans?", "output": "For which specific years does the provided table present data relating to the funded status of the company's postretirement health care and other defined benefit plans?" }, { "input": "What was the specific amount of fair value attributed to the plan assets during the year 2019?", "output": "What was the fair value of plan assets in 2019?" }, { "input": "What was the fair value measurements using Level 2 for Corporate Bonds?", "output": "What are the fair value measurements using Level 2 for Corporate Bonds?" }, { "input": "What was the combined value of cash, cash equivalents, and short-term investments?", "output": "What were the total cash, cash equivalents and short-term investments?" }, { "input": "What was the sum of all cash, cash equivalents, and short-term investments?", "output": "What were the total cash, cash equivalents and short-term investments?" }, { "input": "What was the difference in the fair value for U.S. Treasury and government debt securities between Level 1 and Level 2?", "output": "What was the discrepancy in the fair value of U.S. 
Treasury and government debt securities when classified as Level 1 and Level 2 assets?" }, { "input": "What was the fair value of Level 2 Total cash, cash equivalents and short-term investments as a percentage of the total cash, cash equivalents and short-term investments?", "output": "What is the fair value, expressed as a percentage, of Level 2 Total cash, cash equivalents, and short-term investments in relation to the overall total of cash, cash equivalents, and short-term investments?" }, { "input": "What were the distinctions between the levels of Total cash, cash equivalents, and short-term investments in terms of Level 1 and Level 2?", "output": "What was the difference between the Total cash, cash equivalents and short-term investments for Level 1 and Level 2?" }, { "input": "What were the disparities in the amounts of Total Cash, Cash Equivalents, and Short-Term Investments between Level 1 and Level 2?", "output": "What was the difference between the Total cash, cash equivalents and short-term investments for Level 1 and Level 2?" }, { "input": "On what basis was the calculation for days inventory outstanding determined?", "output": "What was days inventory outstanding based on?" }, { "input": "What was the Days sales outstanding for three months ended april 2019?", "output": "\"What was the specific value of Days Sales Outstanding (DSO) specifically for the three-month period ending in April 2019?\"" }, { "input": "What was the change in day sales outstanding between 2018 and 2019?", "output": "What was the difference in the number of days it took to collect sales revenue between the years 2018 and 2019?" }, { "input": "What was the change in day sales outstanding between 2018 and 2019?", "output": "What was the difference in the number of days it took to collect sales revenue between 2018 and 2019?" 
}, { "input": "For how long was the days inventory outstanding higher than 20 days ?", "output": "How many years did days inventory outstanding exceed 20 days?" }, { "input": "What was the total amount spent on purchases in 2018?", "output": "What was the Aggregate purchase price in 2018?" }, { "input": "What was the difference in the total purchase price for all items combined in 2018 and 2019?", "output": "What was the change in the Aggregate purchase price between 2018 and 2019?" }, { "input": "What was the amount of finished goods in 2019?", "output": "What was the specific quantity of fully completed and ready-for-sale products in the year 2019?" }, { "input": "What was the exact quantity of inventories during the year 2018?", "output": "What was the amount of inventories in 2018?" }, { "input": "What was the change in finished goods between 2018 and 2019?", "output": "How did the amount of finished goods change from 2018 to 2019?" }, { "input": "What were the additions made to tax positions related to the current year in 2019, and how do they impact the overall financial situation?", "output": "What were the Additions based on tax positions related to the current year in 2019?" }, { "input": "What was the change in the Balance at beginning of period between 2017 and 2018?", "output": "What was the difference in the starting balance from the beginning of the period in 2017 compared to 2018?" }, { "input": "What is the overall difference in the total amount of Additions for tax positions of prior years from 2017 to 2019?", "output": "What was the total change in Additions for tax positions of prior years between 2017 and 2019?" }, { "input": "What was the difference in the total expenses recorded between 2018 and 2019?", "output": "What was the change in expense accrued during the period between 2018 and 2019?" 
}, { "input": "What was the difference in the amount of expenses recorded from 2018 to 2019?", "output": "What was the change in expense accrued during the period between 2018 and 2019?" }, { "input": "What was the percentage change in the Warranty costs incurred between 2018 and 2019?", "output": "What was the difference in warranty costs incurred between 2018 and 2019, expressed as a percentage change?" }, { "input": "What were the total assets in 2018?", "output": "What was the total value of assets in the year 2018?" }, { "input": "What was the percentage change in Total deferred commissions between 2018 and 2019?", "output": "What was the percentage change in the total amount of deferred commissions from 2018 to 2019?" }, { "input": "What was the final balance at the end of the specified period in the year 2018?", "output": "What was the balance at the end of period in 2018?" }, { "input": "What was the final balance at the end of period in 2018? Please provide the balance from the specified period in 2018.", "output": "What was the balance at the end of period in 2018?" }, { "input": "What was the change in the balance at beginning of period between 2018 and 2019?", "output": "What was the difference in the balance at the start of the period between the years 2018 and 2019?" }, { "input": "For how many years did the ending balance surpass $3,500 million?", "output": "How many years did balance at end of period exceed $3,500 million?" }, { "input": "How many years did interest income exceed $50 million?", "output": "For how many years was the interest income greater than $50 million?" }, { "input": "How many years did interest income exceed $50 million?", "output": "How many years was the total interest income greater than $50 million?" }, { "input": "What was the percentage change in net revenues between 2018 and 2019?", "output": "What was the exact percentage difference in net revenues from 2018 to 2019, and how can I calculate it accurately?" 
}, { "input": "Which years does the table provide information for net property and equipment?", "output": "For which specific years is information regarding net property and equipment provided in the table?" }, { "input": "Which years does the table provide information for net property and equipment?", "output": "For which specific years does the table present data regarding net property and equipment?" }, { "input": "What was the total revenue generated from operational activities during the year 2017?", "output": "What was the income from operations in 2017?" }, { "input": "What was the percentage change in the net income between 2018 and 2019?", "output": "What was the precise percentage increase or decrease in the net income from 2018 to 2019?" }, { "input": "What was the percentage change in the net income between 2018 and 2019?", "output": "What is the percentage difference in the net income from 2018 to 2019?" }, { "input": "How much money was in circulation in the form of physical cash in the year 2019?", "output": "What was the amount of cash in 2019?" }, { "input": "In 2019, how much cash was there?", "output": "What was the amount of cash in 2019?" }, { "input": "What was the specific value or quantity of cash equivalents recorded in the financial records for the year 2018?", "output": "What was the amount of cash equivalents in 2018?" }, { "input": "Which years does the table provide information for property and equipment information for geographic areas based on the physical location of the assets?", "output": "For which specific years does the table provide information related to property and equipment in geographic areas, considering the assets' physical locations?" }, { "input": "Which years does the table provide information for property and equipment information for geographic areas based on the physical location of the assets?", "output": "For which years does the table present property and equipment information? 
This information is specifically related to geographic areas, and is based on the physical location of the assets." }, { "input": "What was the total combined value of property and equipment in the United States during the year 2019?", "output": "What was the amount of property and equipment in U.S. in 2019?" }, { "input": "How many years did International property and equipment exceed $150 million?", "output": "For how many consecutive years has the value of International property and equipment been greater than $150 million?" }, { "input": "What was the financed unearned services revenue in 2019?", "output": "\"What was the amount of unearned revenue from financed services in 2019?\"" }, { "input": "What was the financed unearned services revenue in 2019?", "output": "What was the specific amount of unearned services revenue that was financed during the year 2019?" }, { "input": "For how many years has the amount of financed unearned services revenue been greater than $100 million?", "output": "How many years did financed unearned services revenue exceed $100 million?" }, { "input": "For how many consecutive years has the unearned services revenue in financed exceeded $100 million?", "output": "How many years did financed unearned services revenue exceed $100 million?" }, { "input": "What was the change in deferred services revenue between 2018 and 2019?", "output": "What was the difference in the amount of deferred services revenue recorded in 2018 and 2019?" }, { "input": "What was the sum of the net revenues from the last two quarters?", "output": "What is the total net revenue for the most recent two quarters?" }, { "input": "What was the change in the Dividends per share declared between 2018 and 2019?", "output": "What was the difference in the amount of Dividends per share declared between the years 2018 and 2019?" 
}, { "input": "What was the change in the Dividends per share declared between 2018 and 2019?", "output": "What was the difference in the amount of dividends per share declared for the years 2018 and 2019?" }, { "input": "What was the change in the Dividend payments allocated to retained earnings (accumulated deficit) between 2017 and 2018?", "output": "What was the difference in the amounts of dividends allocated to retained earnings (accumulated deficit) from 2017 to 2018?" }, { "input": "What was the change in the Dividend payments allocated to retained earnings (accumulated deficit) between 2017 and 2018?", "output": "What was the variation in the amount of dividends transferred to retained earnings (accumulated deficit) from 2017 to 2018?" }, { "input": "What factors contributed to the rise in the cash inflows generated from day-to-day business operations during the year 2019?", "output": "What caused the increase in the cash flow from operating activities in 2019?" }, { "input": "What is the percentage change in the Free Cash Flow, considering both an increase and decrease in value?", "output": "What is the increase / (decrease) in the Free Cash Flow?" }, { "input": "What is the percentage change in Free Cash Flow?", "output": "What is the increase / (decrease) in the Free Cash Flow?" }, { "input": "What are the definitions of underlying operating profit and underlying operating margin, and how do they differ from each other?", "output": "What is the definition of the Underlying operating profit and underlying operating margin?" }, { "input": "What does the underlying effective tax rate measure reflects?", "output": "What is the definition and significance of the underlying effective tax rate and what does it reflect in terms of a company's financial situation or performance?" }, { "input": "What does the underlying effective tax rate measure reflects?", "output": "What is the meaning of the underlying effective tax rate measure and what does it reflect?" 
}, { "input": "What is the methodology used to calculate the effective tax rate?", "output": "How is the underlying effective tax rate calculated?" }, { "input": "What is the exact difference in the underlying effective tax rate between 2018 and 2019?", "output": "What is the Underlying effective tax rate change from 2018 to 2019?" }, { "input": "What is the percentage change in the underlying effective tax rate between 2018 and 2019?", "output": "What is the Underlying effective tax rate change from 2018 to 2019?" }, { "input": "By what percentage did the Biogas production increase from the year 2018 to 2019?", "output": "What is the increase in the Biogas from 2018 to 2019?" }, { "input": "What is the average total energy?", "output": "What is the average amount of energy, when considering all types of energy combined?" }, { "input": "\"What is the percentage change in the price of Fuel Oils from 2018 to 2019? Please specify if it is an increase or decrease.\"", "output": "What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?" }, { "input": "What is the percentage change in the price of Fuel Oils between 2018 and 2019?", "output": "What is the percentage increase / (decrease) in Fuel Oils from 2018 to 2019?" }, { "input": "What method is used to calculate ROIC (Return on Invested Capital)? Please provide a detailed explanation of the calculation process.", "output": "How is ROIC calculated?" }, { "input": "What is the difference in the diluted combined average number of share units (in millions) between 2018 and 2019?", "output": "What is the change in the Diluted combined average number of share units (millions of units) from 2018 to 2019?" }, { "input": "What is the difference in the diluted combined average number of share units (measured in millions) between 2018 and 2019?", "output": "What is the change in the Diluted combined average number of share units (millions of units) from 2018 to 2019?" 
}, { "input": "What was the specific price quotation for investments on 31 December 2018?", "output": "What was the price quotation for investments as of 31 December 2018?" }, { "input": "What was the average value of investments made by listed companies in the year 2019?", "output": "What is the average amount of investments of listed companies in 2019?" }, { "input": "What was the average investment amount of listed companies in 2019?", "output": "What is the average amount of investments of listed companies in 2019?" }, { "input": "What was the average percentage of ownership or voting rights held by shareholders in listed companies during the year 2019?", "output": "What is the average Percentage of ownership or voting rights of listed companies in 2019?" }, { "input": "What is the average Unsecured other loans?", "output": "What is the average amount of unsecured loans that fall under the category of \"other loans\"?" }, { "input": "What is the average value or rating of Integrated Device Manufacturers (IDMs)?", "output": "What is the average of Integrated device manufacturers?" }, { "input": "What is the average performance or profitability of Integrated device manufacturers?", "output": "What is the average of Integrated device manufacturers?" }, { "input": "What is the increase / (decrease) in the Integrated device manufacturers from 2017 to 2018?", "output": "What is the net change, either increase or decrease, in the number of Integrated device manufacturers from 2017 to 2018?" }, { "input": "What is the average amount of compensation received by Directors?", "output": "What is the average Directors’ compensation?" }, { "input": "What is the percentage change in compensation for Directors from the year 2018 to 2019?", "output": "What is the increase/ (decrease) in Directors’ compensation from 2018 to 2019?" 
}, { "input": "What calculation method is employed to determine the total sum of the Company's shares?", "output": "What method is used to calculate the aggregate amount of the Company’s share?" }, { "input": "What specific method is employed to calculate the total combined value of the shares held by the Company?", "output": "What method is used to calculate the aggregate amount of the Company’s share?" }, { "input": "What is the average revenue generated from the sale of goods and services from December 31, 2018 to December 31, 2019?", "output": "What are the average Sales of goods and services for December 31, 2018 to 2019?" }, { "input": "What is the increase/ (decrease) in Sales of goods and services for December 31, 2018 to 2019?", "output": "What is the percentage change in the sales of goods and services when comparing December 31, 2018, to December 31, 2019?" }, { "input": "What is the percentage change in the sales of goods and services from January 1, 2018, to December 31, 2018?", "output": "What is the increase/ (decrease) in Sales of goods and services for January 1, 2018 to December 31, 2018?" }, { "input": "What is the percentage change in the sales of goods and services from January 1, 2018 to December 31, 2018?", "output": "What is the increase/ (decrease) in Sales of goods and services for January 1, 2018 to December 31, 2018?" }, { "input": "\"What is the average amount of lease obligation for the period Less than 1 Year and 1-3 Years? Please provide the average lease obligations separately for both time periods.\"", "output": "What is the average lease obligation for the period Less than 1 Year and 1-3 Years?" }, { "input": "What does Purchase obligations incorporate?", "output": "What is included in purchase obligations?" 
}, { "input": "What is the criteria for an employee to receive additional bonuses?", "output": "What are the specific requirements or factors that determine if an employee is eligible to receive extra bonuses in addition to their regular compensation?" }, { "input": "What is the difference in Time Deposits: Non-Trading Purpose between Carrying Amount and Fair Amount?", "output": "What is the distinction between the carrying amount and fair amount of Time Deposits used for non-trading purposes?" }, { "input": "What is the difference between Bonds: Non-Trading Purpose Carrying Amount and Fair Amount?", "output": "What distinguishes Bonds: Non-Trading Purpose Carrying Amount from Fair Amount in terms of their meaning, significance, and financial implications?" }, { "input": "What is the average quantity of supplies and spare parts in general?", "output": "What was the average Supplies and spare parts?" }, { "input": "What was the average quantity of supplies and spare parts used?", "output": "What was the average Supplies and spare parts?" }, { "input": "What was the percentage increase / (decrease) in the Finished goods from 2018 to 2019?", "output": "What was the percentage change in the amount of Finished goods from the year 2018 to 2019, indicating whether it increased or decreased?" }, { "input": "What was the percentage increase / (decrease) in the Finished goods from 2018 to 2019?", "output": "What was the percentage change in the value of Finished goods from 2018 to 2019, indicating whether it increased or decreased?" }, { "input": "What is the percentage change in the value of financial assets at fair value through profit or loss between 2018 and 2019?", "output": "What is the increase / (decrease) in the Financial assets at fair value through profit or loss from 2018 to 2019?" }, { "input": "What is the increase / (decrease) in the Short-term loans from 2018 to 2019?", "output": "What was the percentage change in Short-term loans from 2018 to 2019?" 
}, { "input": "What is the increase / (decrease) in the Short-term loans from 2018 to 2019?", "output": "What was the percentage change in Short-term loans between 2018 and 2019?" }, { "input": "What measures does the company implement to effectively manage, maintain, or adapt its capital structure in response to changes in the business environment?", "output": "What steps does the company take To maintain or adjust the capital structure?" }, { "input": "What is the change in the total liabilities, expressed as a percentage, between 2018 and 2019?", "output": "What is the increase / (decrease) in the Total liabilities from 2018 to 2019?" }, { "input": "What is the percentage change in Total Capital from 2018 to 2019?", "output": "What is the percentage increase / (decrease) of Total Capital from 2018 to 2019?" }, { "input": "How do non-GAAP measures assist investors in making informed investment decisions and assessing a company's financial performance?", "output": "How do these non-GAAP measures aid investors?" }, { "input": "How much would diluted earnings per share for fiscal year 2018 be without the net charge (benefit) related to the enactment of the TCJA?", "output": "What would be the diluted earnings per share for fiscal year 2018 if we exclude the net charge (benefit) associated with the enactment of the Tax Cuts and Jobs Act (TCJA)?" }, { "input": "How much would diluted earnings per share for fiscal year 2018 be without the net charge (benefit) related to the enactment of the TCJA?", "output": "What would be the diluted earnings per share for fiscal year 2018, excluding the impact of the net charge (benefit) resulting from the implementation of the Tax Cuts and Jobs Act (TCJA)?" }, { "input": "What was the average revenue over the 3 year period from 2017 to 2019?", "output": "What was the average revenue from 2017 to 2019, calculated over the span of three years?" 
}, { "input": "What factors contributed to the change in the federal statutory rate during fiscal year 2017?", "output": "Why did the federal statutory rate in fiscal year 2017? " }, { "input": "How many items accounted for the difference between income taxes computed at the US federal statutory rate and the company's effective rate?", "output": "How many specific items contributed to the variance between the income taxes calculated using the official US federal statutory rate and the actual rate paid by the company?" }, { "input": "What was the average federal statutory rate over the 3 year period from 2017 to 2019? ", "output": "What was the average federal statutory rate from 2017 to 2019?" }, { "input": "What was the average effective rate for the years 2017, 2018, and 2019?", "output": "What was the average effective rate over the 3 year period from 2017 to 2019? " }, { "input": "How much were the top 3 components of property and equipment as a % of the total at cost, property and equipment for 2019?", "output": "What were the percentages for the top 3 components of property and equipment in relation to the total at cost, property, and equipment in 2019?" }, { "input": "What percentage of the overall revenue in 2019 was not generated by the commercial cloud revenue?", "output": "How much of the total revenue in 2019 did not come from commercial cloud revenue?" }, { "input": "Which were the bottom 2 revenue items for 2017?", "output": "\"What were the two lowest revenue-generating items in 2017?\"" }, { "input": "What was the average EPS (earnings per share) for the years 2017, 2018, and 2019?", "output": "What was the average basic earnings per share over the 3 year period from 2017 to 2019?" }, { "input": "\"Which specific locations does the company have both operating leases and finance leases in place?\"", "output": "Which places does the company have operating and finance leases?" 
}, { "input": "What is the topic or content covered in note 15?", "output": "What is note 15 about?" }, { "input": "\"What is the ascending order of the years when sorted by their total cost of finance leases?\"", "output": "What are the years sorted by total finance lease cost, in ascending order?" }, { "input": "How many different items are included when calculating operating income (loss)?", "output": "How many items are there for operating income (loss)?" }, { "input": "How many different items are included in the calculation of operating income (loss)?", "output": "How many items are there for operating income (loss)?" }, { "input": "How is the allocation of amortization and depreciation determined and distributed?", "output": "How is the portion of amortization and depreciation allocated?" }, { "input": "How is the allocation of the portion of amortization and depreciation determined and distributed?", "output": "How is the portion of amortization and depreciation allocated?" }, { "input": "How many countries have Long-lived assets classified based on the controlling statutory company's location, excluding financial instruments and tax assets? Also, can you identify the countries where the assets amount to over 10% of the total assets?", "output": "How many countries have Long-lived assets, excluding financial instruments and tax assets, classified by the location of the controlling statutory company and with countriesover 10% of the total assets?" }, { "input": "What was the percentage change in the total long-lived assets from 2017 to 2018?", "output": "What was the precise percentage difference in the value of total long-lived assets between the years 2017 and 2018?" }, { "input": "What was the percentage change in the total long-lived assets from 2017 to 2018?", "output": "What was the percentage increase or decrease in the value of the total long-lived assets between the years 2017 and 2018?" 
}, { "input": "What is the definition and significance of purchase commitments?", "output": "What do the purchase commitments represent?" }, { "input": "What is the meaning and significance of purchase commitments?", "output": "What do the purchase commitments represent?" }, { "input": "Where can the long-term debt be found?", "output": "Where is the location to find the information regarding long-term debt?" }, { "input": "How much do the top 3 contractual obligation terms add up to in 2020?", "output": "What is the total value of the top three contractual obligation terms in 2020?" }, { "input": "How much do the top 3 contractual obligation terms add up to in 2020?", "output": "In 2020, what is the sum of the top three contractual obligation terms?" }, { "input": "What was the number of stock awards that were granted in 2019, not including the PSUs granted?", "output": "Excluding the PSUs, how many stock awards were granted in the year 2019?" }, { "input": "What was the number of stock awards that were granted in 2019, not including the PSUs granted?", "output": "How many stock awards were granted in 2019, excluding the granted PSUs?" }, { "input": "What was the average fair value of the stock awards that were vested between 2017 and 2019 over a three-year period?", "output": "What was the average fair value of stock awards vested over the 3 year period from 2017 to 2019?" }, { "input": "What was the intelligent cloud as a percentage of total revenue in 2019?", "output": "What percentage of the total revenue in 2019 was attributed to the intelligent cloud?" }, { "input": "What was the intelligent cloud as a percentage of total revenue in 2019?", "output": "What percentage of total revenue in 2019 was attributed to the intelligent cloud?" }, { "input": "How many revenue categories are there?", "output": "How many different categories of revenue exist in the given context?" 
}, { "input": "What is the company's net loss in 2019?", "output": "What was the net financial loss incurred by the company during the year 2019?" }, { "input": "What was the cumulative unrealized gain on marketable securities for the company from 2017 to 2019?", "output": "What is the company's total unrealized gain on marketable securities between 2017 to 2019?" }, { "input": "What is the percentage difference in the total stock base compensation, categorized by type of award, from 2018 to 2019?", "output": "What is the percentage change in total stock base compensation by type of award between 2018 and 2019?" }, { "input": "What is the percentage change in the total stock base compensation for each type of award from 2018 to 2019?", "output": "What is the percentage change in total stock base compensation by type of award between 2018 and 2019?" }, { "input": "What is the cumulative amount of stock based compensation from 2017 through 2019?", "output": "What is the total stock based compensation amount between 2017 to 2019?" }, { "input": "What is the sum of stock-based compensation expenses for the years 2017 to 2019?", "output": "What is the total stock based compensation amount between 2017 to 2019?" }, { "input": "As of December 31, 2018, how many shares were granted and are currently outstanding?", "output": "What is the number of outstanding shares granted as of December 31, 2018? " }, { "input": "As of December 31, 2018, how many shares were granted and still remain outstanding?", "output": "What is the number of outstanding shares granted as of December 31, 2018? " }, { "input": "What was the change in the number of outstanding shares between December 31, 2018, and December 31, 2019?", "output": "What is the difference in outstanding shares as of December 31, 2018 and 2019?" 
}, { "input": "What data is shown in the table?", "output": "\"What specific information does the table display?\"" }, { "input": "What is the units for the data shown in the table?", "output": "What units are being used to represent the information presented in the table?" }, { "input": "What is the total revenue earned by the company between 2017 to 2019?", "output": "What is the cumulative revenue generated by the company from 2017 to 2019?" }, { "input": "What is the exact amount of revenue generated in the EMEA region from 2017 to 2019?", "output": "What is the total revenue earned in EMEA between 2017 to 2019?" }, { "input": "What is the change in total revenue between 2018 to 2019?", "output": "What is the difference in total revenue from 2018 compared to 2019, and how can this information assist in better understanding the financial performance of the company during this period?" }, { "input": "What is the sum of all non-current liabilities excluding total other non-current liabilities as of 31 December 2019?", "output": "What is the total other non-current liabilities as at 31 December 2019?" }, { "input": "What is the aggregate amount of all non-current liabilities, excluding any current liabilities, as of December 31, 2019?", "output": "What is the total other non-current liabilities as at 31 December 2019?" }, { "input": "What is the depreciation expense on property and equipment as at December 31, 2018?", "output": "What is the specific amount of the depreciation expense recorded for property and equipment on the financial statement as at the closing date of December 31, 2018?" }, { "input": "What is the depreciation expense on property and equipment as at December 31, 2018?", "output": "What was the amount of depreciation expense booked for property and equipment up until December 31st, 2018?" 
}, { "input": "What is the depreciation expense on property and equipment as at December 31, 2019?", "output": "As of December 31, 2019, what is the recorded depreciation expense amount for property and equipment?" }, { "input": "What is the depreciation expense on property and equipment as at December 31, 2019?", "output": "What is the specific amount of depreciation expense recorded for property and equipment as on December 31, 2019?" }, { "input": "What is the total depreciation expense on property and equipment from 2017 to 2019?", "output": "From 2017 to 2019, what was the cumulative amount of depreciation expense incurred on property and equipment?" }, { "input": "What is the total depreciation expense on property and equipment from 2017 to 2019?", "output": "What is the cumulative depreciation expense on property and equipment for the years 2017, 2018, and 2019?" }, { "input": "What is the allowance for doubtful accounts as at December 31, 2018?", "output": "What is the amount set aside for doubtful accounts on the balance sheet as of December 31, 2018?" }, { "input": "What are some examples of financial instruments that can subject the company to credit risk?", "output": "What financial instruments expose companies to credit risk? Can you provide some examples of such instruments?" }, { "input": "What are some examples of financial instruments that can subject the company to credit risk?", "output": "What financial instruments can expose a company to credit risk? Can you provide some examples?" }, { "input": "How does the company effectively manage and reduce its credit risk associated with accounts receivable?", "output": "How does the company mitigate its credit risk pertaining to accounts receivable?" }, { "input": "How does the company identify and determine its important or major customers in its business operations?", "output": "How does the company determine its significant customers?" 
}, { "input": "How does the company identify and assess its significant customers for business purposes?", "output": "How does the company determine its significant customers?" }, { "input": "What was the monetary value of the company's losses within its own country during the year 2019?", "output": "What is the company's domestic loss in 2019?" }, { "input": "\"What was the total financial loss incurred by the company within its own country during the year 2019?\"", "output": "What is the company's domestic loss in 2019?" }, { "input": "What is the amount of money that the company lost within its own country during the year 2017?", "output": "What is the company's domestic loss in 2017?" }, { "input": "\"What was the fiscal year 2017 domestic financial loss incurred by the company?\"", "output": "What is the company's domestic loss in 2017?" }, { "input": "What is the company's total loss before income taxes between 2017 to 2019?", "output": "What is the cumulative pre-tax loss of the company from 2017 to 2019?" }, { "input": "What is the company's change in foreign income between 2018 and 2019?", "output": "What was the percentage change in the company's foreign income from 2018 to 2019?" }, { "input": "What is the cumulative amount of loss incurred by the company domestically during the period from 2017 to 2019?", "output": "What is the company's total domestic loss between 2017 to 2019?" }, { "input": "What is the aggregate amount of losses incurred by the company within its domestic markets during the period from 2017 to 2019?", "output": "What is the company's total domestic loss between 2017 to 2019?" }, { "input": "What is the percentage change in total deferred revenue 2018 and 2019?", "output": "What is the percentage difference in the total deferred revenue between 2018 and 2019?" 
}, { "input": "What is the percentage change in total deferred revenue 2018 and 2019?", "output": "What is the percentage difference in the total amount of deferred revenue between the years 2018 and 2019?" }, { "input": "What is the proportion of the cost of revenue and research and development expenses as a percentage of the total restructuring expense?", "output": "What is the ratio of the combined cost of revenue and research and development expenses to the total restructuring expense, expressed as a percentage?" }, { "input": "What is the total legal fees and facilities expenses?", "output": "What are the combined expenses for legal fees and facilities?" }, { "input": "What is the total legal fees and facilities expenses?", "output": "What is the cumulative sum of the expenses incurred in legal fees and facilities?" }, { "input": "What is the total increases related to tax positions taken during current year in 2018 and 2019?", "output": "What is the cumulative increase in tax positions taken during the current year across 2018 and 2019?" }, { "input": "What is the total increases related to tax positions taken during current year in 2018 and 2019?", "output": "What is the combined amount of increments in relation to tax positions that were assumed during the present year in both 2018 and 2019?" }, { "input": "As of 31st December 2018, what is the sum of all liabilities that have accumulated or been incurred by the company up to that date?", "output": "What is the total accrued liabilities as at 31 December 2018?" }, { "input": "What are the units used to measure the values in the table?", "output": "What is the units that the values in the table are measured in?" }, { "input": "What is the percentage increase or decrease in the total amount of accrued liabilities from 2018 to 2019?", "output": "What is the percentage change in total accrued liabilities between 2018 and 2019?" 
}, { "input": "What is the percentage increase or decrease in the total amount of liabilities that have accumulated between the years 2018 and 2019?", "output": "What is the percentage change in total accrued liabilities between 2018 and 2019?" }, { "input": "What is the total amount of long-lived asset in Japan in both 2018 and 2019?", "output": "What is the total value of long-lived assets in Japan for the years 2018 and 2019 combined?" }, { "input": "What were the capitalized costs of software development in 2019?", "output": "What was the Capitalized software development costs in 2019?" }, { "input": "What were the average net costs of software development for both the years 2018 and 2019?", "output": "What was the average Software development costs, net for 2018 and 2019?" }, { "input": "What were the net average costs for software development in both 2018 and 2019?", "output": "What was the average Software development costs, net for 2018 and 2019?" }, { "input": "What was the Cash and cash equivalents in 2019 and 2018 respectively?", "output": "What were the cash and cash equivalents figures for the years 2019 and 2018, respectively?" }, { "input": "What was the Cash and cash equivalents in 2019 and 2018 respectively?", "output": "What were the amounts of Cash and cash equivalents for the years 2019 and 2018, respectively?" }, { "input": "What are the average amounts of Restricted cash for the years 2018 and 2019?", "output": "What is the average Restricted cash for 2018 and 2019?" }, { "input": "What was the Compensation and related benefits in 2019 and 2018 respectively?", "output": "What were the compensation and benefits offered in 2019 and 2018?" }, { "input": "\"In what specific year did the expenses for professional and legal services amount to a value below 500 thousand dollars?\"", "output": "In which year was Professional and legal fees less than 500 thousands?" 
}, { "input": "What was the percentage change in Compensation and related benefits from 2018 to 2019?", "output": "What is the change in the Compensation and related benefits from 2018 to 2019?" }, { "input": "What were the average professional and legal fees in 2018 and 2019?", "output": "What was the average Professional and legal fees for 2018 and 2019?" }, { "input": "What were the factors that led to the decrease in net pool allocation? Can you provide an explanation and further details regarding the decline in net pool allocation?", "output": "What accounted for the decrease in net pool allocation?" }, { "input": "What are the components of vessel operating and supervision costs?", "output": "What are the key components that make up the costs associated with vessel operations and supervision?" }, { "input": "How much did the technical maintenance expenses differ between 2017 and 2018?", "output": "What was the change in technical maintenance expenses from 2017 to 2018?" }, { "input": "What was the exact monetary difference in technical maintenance expenses between the years 2017 and 2018?", "output": "What was the change in technical maintenance expenses from 2017 to 2018?" }, { "input": "What was the percentage increase or decrease in the combined costs of operating and supervising vessels from 2018 to 2019?", "output": "What was the percentage change in total vessel operating and supervision costs from 2018 to 2019?" }, { "input": "Why does the Group choose to retain ownership of the trade receivables instead of selling them or assigning them to a third party?", "output": "Why does the Group hold the trade receivables?" }, { "input": "What does accrued income represent?", "output": "\"What is the meaning and significance of accrued income?\"" }, { "input": "What does accrued income represent?", "output": "What is the meaning and significance of accrued income?" 
}, { "input": "What was the change in trade receivables from 2018 to 2019?", "output": "What was the difference in the amount of trade receivables between 2018 and 2019, indicating whether it increased or decreased?" }, { "input": "What was the exact percentage increase or decrease in the total amount of money owed to a company from 2018 to 2019?", "output": "What was the percentage change in total receivables from 2018 to 2019?" }, { "input": "In which years was the fair value of the derivative assets recorded for?", "output": "For which specific years were the fair values of the derivative assets recorded? Please provide all relevant information regarding the years for accurate response." }, { "input": "In which specific year did the fair value of interest rate swaps experience an increase compared to other years?", "output": "In which year was the fair value of Interest rate swaps higher?" }, { "input": "When was the fair value of interest rate swaps higher: in which specific year did the fair value of interest rate swaps surpass that of other years?", "output": "In which year was the fair value of Interest rate swaps higher?" }, { "input": "In which year did the company record the highest amount of expenses pertaining to share-based compensation?", "output": "In which year was the total expense recognized in respect of share-based compensation the highest?" }, { "input": "What was the highest year in terms of total recognized expense for share-based compensation?", "output": "In which year was the total expense recognized in respect of share-based compensation the highest?" }, { "input": "What was the difference in the amount of money earned by The Cool Pool Limited in 2017 compared to 2018?", "output": "What was the change in revenues from The Cool Pool Limited from 2017 to 2018?" 
}, { "input": "What was the exact difference in the amount of money earned by The Cool Pool Limited in the year 2017 compared to the year 2018?", "output": "What was the change in revenues from The Cool Pool Limited from 2017 to 2018?" }, { "input": "For which years were the movements in tangible fixed assets and vessels under construction documented?", "output": "In which years was the movements in tangible fixed assets and vessels under construction recorded for?" }, { "input": "What was the percentage change in the cost of vessels from the end of 2018 to the end of 2019?", "output": "What was the change in vessel cost from end 2018 to end 2019?" }, { "input": "In which year was the other non-current assets recorded for?", "output": "What is the specific year in which the recording of non-current assets other than the current ones occurred?" }, { "input": "In which year did the value of long-term assets, other than the current year, surpass that of the current year?", "output": "In which year was the other long-term assets higher?" }, { "input": "What was the percentage change in the total value of other non-current assets from the year 2018 to the year 2019?", "output": "What was the percentage change in total other non-current assets from 2018 to 2019?" }, { "input": "For which years were the earnings per share (EPS) or losses per share (LPS) recorded?", "output": "In which years was the earnings/(losses) per share recorded for?" }, { "input": "What was the amount of ship management client accounts in 2018?", "output": "How many ship management client accounts were there in the year 2018?" }, { "input": "What was the change in current accounts from 2018 to 2019?", "output": "What is the difference between the current accounts in 2018 and 2019?" }, { "input": "\"What was the specific amount of compensation or payment given to individuals or employees in the year 2018?\"", "output": "What was the remuneration in 2018?" 
}, { "input": "What was the amount of compensation received in 2018?", "output": "What was the remuneration in 2018?" }, { "input": "Which year was the short-term benefits the highest?", "output": "What was the peak year for short-term benefits?" }, { "input": "Which year was the short-term benefits the highest?", "output": "In which year did the short-term benefits reach their peak level?" }, { "input": "For which specific years were the costs of constructing the vessels recorded?", "output": "In which years was the vessels under construction costs recorded for?" }, { "input": "For which years were the recorded construction costs of the vessels?", "output": "In which years was the vessels under construction costs recorded for?" }, { "input": "What is the significance or meaning behind vessels that are currently being constructed?", "output": "What does vessels under construction represent?" }, { "input": "In which year were the installment payments for the progress shipyard lower compared to other years?", "output": "Which year was the progress shipyard installments lower?" }, { "input": "In which specific year did the progress shipyard experience a decrease in the number of installments being made?", "output": "Which year was the progress shipyard installments lower?" }, { "input": "What was the difference in the average daily hire rate between 2017 and 2018?", "output": "What was the change in average daily hire rate from 2017 to 2018?" }, { "input": "What is the change in pool gross revenues from 2017 to 2018?", "output": "What was the difference in the total revenue generated from pools in 2017 compared to 2018?" }, { "input": "What is the change in pool gross revenues from 2017 to 2018?", "output": "What is the difference in total gross revenues generated by the pool in 2017 compared to 2018?" }, { "input": "In which years was the other payables and accruals recorded for?", "output": "For which years was the recording of other payables and accruals made?" 
}, { "input": "How many vessels were hired for employment in the year 2018?", "output": "How many vessels were hired in 2018?" }, { "input": "In 2018, what was the total number of vessels that were hired?", "output": "How many vessels were hired in 2018?" }, { "input": "In which year was the accrued interest higher?", "output": "\"In which year did the accumulated interest amount exceed the interest accrued in other years?\"" }, { "input": "What is the percentage difference in the combined value of total payables and accruals between 2018 and 2019?", "output": "What was the percentage change in total payables and accruals from 2018 to 2019?" }, { "input": "What was the difference in the ratio of a company's long-term debt to its equity from 2018 to 2019?", "output": "What was the change in gearing ratio from 2018 to 2019?" }, { "input": "In 2019, how many operating days were there and what was the total revenue generated during those days?", "output": "What was the total revenue operating days in 2019?" }, { "input": "How many operating days were there in 2019 and what was the total revenue generated during those days?", "output": "What was the total revenue operating days in 2019?" }, { "input": "What was the change in average daily hire rate from 2018 to 2019?", "output": "What was the specific difference in the average rate of daily hiring between 2018 and 2019? How did the average daily hire rate change from one year to the other?" }, { "input": "\"What was the specific monetary value of the cash distributions made in the year 2019 for common units?\"", "output": "What was the amount of cash distributions in 2019 for common units?" }, { "input": "How much did the size of the ATM Programme change?", "output": "What was the change in size of the ATM Programme?" 
}, { "input": "\"When was the specific year when the general and administrative expenses were officially documented or accounted for?\"", "output": "In which year was the general and administrative expenses recorded for?" }, { "input": "Where does the management intend to relocate its employees?", "output": "\"What are the specific plans of the management regarding the relocation of its employees?\"" }, { "input": "What was the selling price of Methane Julia Louise?", "output": "How much was Methane Julia Louise sold for?" }, { "input": "Which company was the vessel sold to?", "output": "To which company was the vessel sold? Please provide the name of the company that purchased the vessel." }, { "input": "Which company was the vessel sold to?", "output": "\"To which company was the vessel sold?\"" }, { "input": "What was the percentage change in total right-of use assets from start to end 2019?", "output": "What was the percentage change in the total value of right-of-use assets between the beginning and end of 2019?" }, { "input": "How much was the change in the repayment amount for the bank loan?", "output": "By how much did the bank loan repayment change?" }, { "input": "What year experienced a decrease in audit fees compared to previous years?", "output": "In which year was the audit fees lower?" }, { "input": "What was the percentage change in audit fees between 2018 and 2019?", "output": "What was the change in audit fees from 2018 to 2019?" }, { "input": "What was the percentage change in audit fees from the year 2018 to the year 2019?", "output": "What was the change in audit fees from 2018 to 2019?" }, { "input": "What are the components subjected to credit risk?", "output": "What are the various components that are exposed to the risk of credit?" 
}, { "input": "What is the significance of bunkers' consumption and other voyage expenses in relation to overall expenses for a voyage?", "output": "What does bunkers’ consumption and other voyage expenses represent?" }, { "input": "What is the highest year for brokers' commissions on revenue?", "output": "In which year was the brokers' commissions on revenue the highest?" }, { "input": "What was the change in bunkers’ consumption and other voyage expenses from 2018 to 2019?", "output": "What was the difference in the consumption of bunkers and other voyage expenses between 2018 and 2019?" }, { "input": "What is the specific monetary value that The Cool Pool Limited is required to pay in 2018?", "output": "What is the amount due from The Cool Pool Limited in 2018?" }, { "input": "In which years was the dividends receivable and other amounts due from related parties recorded for?", "output": "For which years were the recorded dividends receivable and other amounts due from related parties?" }, { "input": "In which years was the dividends receivable and other amounts due from related parties recorded for?", "output": "In which specific years were the dividends receivable and other outstanding amounts due from related parties recorded?" }, { "input": "In which years was the investments recorded for?", "output": "For which specific years were the investments recorded?" }, { "input": "What was the numerical difference in the number of additions made between 2018 and 2019?", "output": "What was the change in additions from 2018 to 2019?" }, { "input": "What was the difference in the number of additions between 2018 and 2019?", "output": "What was the change in additions from 2018 to 2019?" }, { "input": "What were the specific Level 2 municipal and corporate bonds held on December 31, 2018?", "output": "What are the respective Level 2 municipal and corporate bonds as at December 31, 2018?" 
}, { "input": "What is the value of municipal bonds as a percentage of the total marketable securities?", "output": "What proportion of the overall marketable securities is represented by municipal bonds?" }, { "input": "What are the respective goodwill amounts in 2015 and 2016?", "output": "What were the goodwill amounts in 2015 and 2016, respectively?" }, { "input": "What were the average amounts of total cash, cash equivalents, and marketable securities in the years 2015 and 2016?", "output": "What is the average total cash, cash equivalents, and marketable securities in 2015 and 2016?" }, { "input": "What is the proportion of shares bought in November out of the total shares purchased in the last three months of 2019?", "output": "What percentage of the total shares purchased in the last three months of 2019 are bought in November?" }, { "input": "What is the proportion of shares bought in November out of the total shares purchased during the last three months of 2019?", "output": "What percentage of the total shares purchased in the last three months of 2019 are bought in November?" }, { "input": "What is the percentage change in revenue from term license between 2018 and 2019?", "output": "What is the percent increase or decrease in revenue from term license for the year 2019 compared to 2018?" }, { "input": "What is the percentage increase or decrease in revenue generated from maintenance services from 2018 to 2019?", "output": "What is the percentage change in revenue from maintenance between 2018 and 2019?" }, { "input": "What were the pre-tax losses in 2018 and 2019 for the company?", "output": "What are the respective loss before income tax benefits in 2018 and 2019?" }, { "input": "What were the pre-tax domestic and foreign earnings in 2017?", "output": "What are the respective domestic and foreign income before income taxes in 2017?" 
}, { "input": "\"What were the pre-tax incomes in 2017, separated by domestic and foreign sources?\"", "output": "What are the respective domestic and foreign income before income taxes in 2017?" }, { "input": "\"What were the total losses incurred in the domestic market and international markets in 2019, excluding any income tax benefits received?\"", "output": "What are the respective domestic and foreign losses before income tax benefits in 2019?" }, { "input": "What is the sum of all outstanding receivables that are yet to be invoiced, and are expected to be collected within a period of two years?", "output": "What is the total unbilled receivables due within 2 years?" }, { "input": "What is the sum of all outstanding amounts receivable that have not been invoiced yet, and are expected to be collected within the next 1 to 5 years?", "output": "What is the total unbilled receivables due between 1 to 5 years?" }, { "input": "What is the cumulative value of unrealized billings that are yet to be accounted for and are expected to be collected within a period ranging from 1 to 5 years?", "output": "What is the total unbilled receivables due between 1 to 5 years?" }, { "input": "What is the difference between the company's federal and state net operating losses?", "output": "What differentiates the federal and state net operating losses of the company?" }, { "input": "What were the amounts that the company is authorised to repurchase in 2019 and 2018 respectively?", "output": "What were the authorized repurchase amounts for the company in both 2019 and 2018?" }, { "input": "What were the respective amounts of stock repurchases made by the company as at January 1, 2018 and 2019 respectively?", "output": "What were the specific amounts of stock repurchases made by the company as of January 1, 2018 and January 1, 2019, respectively?" 
}, { "input": "What was the average amount of stock repurchases as of January 1st in both 2017 and 2018?", "output": "What is the average stock repurchases as at January 1, 2017 and 2018?" }, { "input": "What is the average amount of stock repurchases as of January 1, 2017, and January 1, 2018?", "output": "What is the average stock repurchases as at January 1, 2017 and 2018?" }, { "input": "What is the exact percentage difference in the amount of long-term deferred revenue from 2018 to 2019?", "output": "What is the percentage change in long-term deferred revenue between 2018 and 2019?" }, { "input": "What are the respective values for Term in 2018 and 2019?", "output": "What are the values for Term in the year 2018? What are the values for Term in the year 2019?" }, { "input": "What are the company's respective stock-based compensation for cost of revenues in 2019 and 2018?", "output": "What were the stock-based compensation expenses for the cost of revenues incurred by the company in both 2019 and 2018?" }, { "input": "What are the company's respective stock-based compensation for cost of revenues in 2019 and 2018?", "output": "\"What were the stock-based compensation expenses for cost of revenues in 2019 and 2018 for the company?\"" }, { "input": "What is the average amount of stock-based compensation for the cost of revenue that the company has incurred from 2017 to 2019?", "output": "What is the company's average stock-based compensation for the cost of revenue between 2017 to 2019?" }, { "input": "What is the average amount of stock-based compensation granted by the company for the selling and marketing department in both 2018 and 2019?", "output": "What is the company's average stock-based compensation for selling and marketing in 2018 and 2019?" 
}, { "input": "\"What was the average amount of stock-based compensation granted to employees in the selling and marketing department of the company in both 2018 and 2019?\"", "output": "What is the company's average stock-based compensation for selling and marketing in 2018 and 2019?" }, { "input": "What is the percentage change in revenue earned from term license from 2015 to 2016?", "output": "What is the percentage change in the revenue earned from term license between 2015 and 2016?" }, { "input": "What is the total revenue earned from maintenance in 2017 and 2018?", "output": "What is the combined revenue generated from maintenance services in the years 2017 and 2018?" }, { "input": "What is the total revenue earned from maintenance in 2017 and 2018?", "output": "What was the combined revenue generated from maintenance services in the years 2017 and 2018?" }, { "input": "How did actuarial calculations impact the resulting changes in parameters?", "output": "What did the changes in parameters on the basis of actuarial calculations lead to?" }, { "input": "What was the precise amount of the actual return on plan assets in the fiscal year 2019?", "output": "How much did the actual return on plan assets amount to in FY2019?" }, { "input": "What items or expenses are typically excluded from the expected payments?", "output": "What is not included in expected payments?" }, { "input": "Explain what expenses or items are typically not encompassed within the estimated payments.", "output": "What is not included in expected payments?" }, { "input": "What was the change in interest income in FY2019 from FY2018?", "output": "What was the difference in the amount of income generated from interest in the fiscal year 2019 compared to the fiscal year 2018?" }, { "input": "What was the change in interest income in FY2019 from FY2018?", "output": "What was the specific difference in interest income between the fiscal year 2019 and the fiscal year 2018?" 
}, { "input": "In which year did Asia experience a higher year-on-year percentage change in GDP compared to 2018 during the period from 2018 to 2019?", "output": "In which year was the Year-on-year percentage change of GDP in Asia larger from 2018 to 2019?" }, { "input": "What was the change in the year-on-year percentage change in GDP for Germany from 2018 to 2019?", "output": "What was the difference in the percentage change of Germany's GDP from one year to the next, comparing 2018 and 2019?" }, { "input": "In which year did the total sum of taxes collected exceed the amount collected in all other years?", "output": "In which year was the total amount of taxes larger?" }, { "input": "What was the exact difference in the total amount of taxes paid in the time period from 2018 to 2019 compared to the time period from 2017 to 2018?", "output": "What was the change in total taxes in 2018/2019 from 2017/2018?" }, { "input": "What was the percentage increase or decrease in the total amount of taxes in 2018/2019 compared to 2017/2018?", "output": "What was the percentage change in total taxes in 2018/2019 from 2017/2018?" }, { "input": "What factors or criteria are used as a foundation for the measurements being taken?", "output": "What is the basis for the measurements?" }, { "input": "What is the fundamental basis or principle used to determine the measurements being referred to in the context?", "output": "What is the basis for the measurements?" }, { "input": "In which specific year did the amount for Belgium surpass or exceed the previous value?", "output": "In which year was the amount for Belgium larger?" }, { "input": "In which specific year did the total amount (not specified) exceed the amount for Belgium?", "output": "In which year was the amount for Belgium larger?" }, { "input": "What was the amount of cash and cash equivalents in FY2019?", "output": "How much cash and cash equivalents were reported in the financial statements for the fiscal year 2019?" 
}, { "input": "What was the exact percentage difference in Cash and cash equivalents between FY2018 and FY2019 after considering adjustments?", "output": "What was the percentage change in Cash and cash equivalents in FY2019 from FY2018 adjusted?" }, { "input": "What was the calculated percentage difference in Cash and cash equivalents between FY2019 and FY2018, after adjusting for any relevant factors?", "output": "What was the percentage change in Cash and cash equivalents in FY2019 from FY2018 adjusted?" }, { "input": "In which specific year did the absolute value of the Earnings per share exceed the value in another year?", "output": "In which year was the absolute value of the Earnings per share larger?" }, { "input": "\"What specific factor was used to calculate the net cost of the current service?\"", "output": "What was the current service cost netted against?" }, { "input": "\"In which specific year did the total sales exceed the sales of other years in terms of maximum revenue generated?\"", "output": "In which year was the Total sales larger?" }, { "input": "In what year did the headcount in METRO Asia exceed the corresponding headcount in other years?", "output": "In which year was the amount in METRO Asia headcount larger?" }, { "input": "What was the year-over-year change in the number of employees at METRO AG from 2018 to 2019?", "output": "What was the change in METRO AG headcount in 2019 from 2018?" }, { "input": "With which companies did METRO maintain business relations with, and who were the parties involved in these relations?", "output": "Who were the parties to which METRO maintained business relations with related companies?" }, { "input": "What were the business relations to related companies listed in the table?", "output": "What were the business relations between the listed companies in the table and other related entities or organizations?" 
}, { "input": "What was the difference in the amount and type of services received in the fiscal year 2019 compared to the fiscal year 2018?", "output": "What was the change in services received in FY2019 from FY2018?" }, { "input": "What is the exact percentage change in the number of services received during fiscal year 2019 compared to fiscal year 2018?", "output": "What was the percentage change in services received in FY2019 from FY2018?" }, { "input": "What was the exact percentage difference in the quantity of services received during the fiscal year 2019 compared to the fiscal year 2018?", "output": "What was the percentage change in services received in FY2019 from FY2018?" }, { "input": "In what format is the cash flow statement presented in the table?", "output": "What version is the cash flow statement in the table in?" }, { "input": "What specific expenses related to income tax were officially acknowledged in the year 2019?", "output": "What are the recognised income tax expenses in 2019?" }, { "input": "What was the percentage change in recognised income tax expenses in 2018/2019 from 2017/2018?", "output": "What was the percentage increase or decrease in recognised income tax expenses for the financial year 2018/2019 compared to the financial year 2017/2018?" }, { "input": "What was the percentage change in recognised income tax expenses in 2018/2019 from 2017/2018?", "output": "What was the percentage increase or decrease in recognised income tax expenses for the fiscal year 2018/2019 compared to the fiscal year 2017/2018?" }, { "input": "What was the change in EBT in 2018/2019 from 2017/2018?", "output": "What was the difference in EBT (Earnings Before Taxes) between 2018 and 2019 when compared to 2017 and 2018?" }, { "input": "What does the other operating income include?", "output": "What type of income is included under \"other operating income\"? Please provide a comprehensive explanation of the various sources that fall under this category." 
}, { "input": "In which specific year did the amount of Services surpass the quantity in other years?", "output": "In which year was the amount of Services larger?" }, { "input": "During which year did the quantity of services surpass the quantity in previous years?", "output": "In which year was the amount of Services larger?" }, { "input": "What was the exact amount of equity during the fiscal year 2019?", "output": "What was the amount of equity in FY2019?" }, { "input": "What proportion of the total earnings in FY2019 can be attributed to the Noncontrolling interests?", "output": "How much of earnings does the Noncontrolling interests account for in FY2019?" }, { "input": "What proportion of the FY2019 earnings is represented by the Noncontrolling interests account?", "output": "How much of earnings does the Noncontrolling interests account for in FY2019?" }, { "input": "What was the exact numerical difference in sales between FY2019 and FY2018?", "output": "What was the change in Sales in FY2019 from FY2018?" }, { "input": "What was the difference in the total amount of sales between fiscal year 2019 and fiscal year 2018?", "output": "What was the change in Sales in FY2019 from FY2018?" }, { "input": "When were the calculations of employee numbers by segments performed?", "output": "When were the employee numbers by segments calculated?" }, { "input": "What were the specific segments or categories mentioned under the \"METRO\" component in the table that accounted for the number of employees?", "output": "What were the components under METRO in the table when accounting for the employee numbers by segments?" }, { "input": "What was the change in METRO AG in 2019 from 2018?", "output": "What was the year-on-year change in METRO AG's performance in 2019 compared to 2018?" }, { "input": "What was the percentage difference in METRO AG's performance in 2019 compared to 2018?", "output": "What was the percentage change in METRO AG in 2019 from 2018?" 
}, { "input": "What was the percentage difference in the performance of METRO AG in 2019 compared to 2018?", "output": "What was the percentage change in METRO AG in 2019 from 2018?" }, { "input": "In which year was the amount of total assets larger?", "output": "\"When comparing the amount of total assets, which year had a larger value?\"" }, { "input": "What was the percentage change in total assets in FY2019 from FY2018?", "output": "What was the percentage increase or decrease in the total assets for the fiscal year 2019 compared to the total assets for the fiscal year 2018?" }, { "input": "What is the source of revenue that compensates for the expenses incurred in providing logistics services?", "output": "What is offset by income from logistics services?" }, { "input": "What are the components under Other operating expenses in the table?", "output": "What specific components are included in the category of \"Other operating expenses\" as mentioned in the table?" }, { "input": "What are the components under Other operating expenses in the table?", "output": "What specifically are the individual components included in the category of Other operating expenses in the provided table?" }, { "input": "What was the change in Losses from the disposal of fixed assets in 2018/2019 from 2017/2018?", "output": "\"What is the difference in losses resulting from the sale or disposal of fixed assets between the fiscal years 2017/2018 and 2018/2019?\"" }, { "input": "What was the change in Losses from the disposal of fixed assets in 2018/2019 from 2017/2018?", "output": "What was the difference in the amount of losses incurred from selling fixed assets between the fiscal years 2018/2019 and 2017/2018?" 
}, { "input": "What was the percentage increase or decrease in losses resulting from the disposal of fixed assets during the period of 2018/2019 compared to the period of 2017/2018?", "output": "What was the percentage change in Losses from the disposal of fixed assets in 2018/2019 from 2017/2018?" }, { "input": "What was the percentage change in losses incurred from the disposal of fixed assets in the fiscal year 2018/2019 compared to the fiscal year 2017/2018?", "output": "What was the percentage change in Losses from the disposal of fixed assets in 2018/2019 from 2017/2018?" }, { "input": "What was the change in interest carry-forwards in FY2019 from FY2018?", "output": "What was the difference in the amount of interest carry-forwards between the fiscal years of 2019 and 2018?" }, { "input": "What are the primary elements within the table that are utilized to determine the outcome of the interest calculation?", "output": "What are the broad components in the table which are used to calculate the interest result?" }, { "input": "What are the various broad components included in the table that are utilized for calculating the outcome of interest?", "output": "What are the broad components in the table which are used to calculate the interest result?" }, { "input": "What was the percentage change in the amount of income earned from interest during the period between 2018 and 2019 compared to the previous period between 2017 and 2018?", "output": "What was the percentage change in interest income in 2018/2019 from 2017/2018?" }, { "input": "In which specific year did the total sum of taxes other than general administrative expenses exceed the amount incurred in those administrative expenses?", "output": "In which year was the amount of other taxes thereof from general administrative expenses larger?" 
}, { "input": "In which years were the reserves retained from earnings calculated in?", "output": "During which specific years were the calculations for retained reserves from earnings conducted?" }, { "input": "In which years were the reserves retained from earnings calculated in?", "output": "During which specific years were the calculations for retaining reserves from earnings carried out?" }, { "input": "What was the change in the Income tax on components of other comprehensive income in FY2019 from FY2018?", "output": "What was the difference in the income tax related to the components of other comprehensive income from fiscal year 2018 to fiscal year 2019?" }, { "input": "What was the change in operating income in the Communications Solutions segment in 2019?", "output": "What was the specific amount of increase or decrease in operating income for the Communications Solutions segment during the year 2019?" }, { "input": "What was the change in operating income in the Communications Solutions segment in 2019?", "output": "What was the percentage change in operating income specifically for the Communications Solutions segment during the year 2019 compared to the previous year?" }, { "input": "Why did operating income decrease in fiscal 2019?", "output": "\"What were the reasons for the decrease in operating income during fiscal year 2019?\"" }, { "input": "Why did operating income decrease in fiscal 2019?", "output": "What were the reasons for the decrease in operating income in fiscal 2019?" }, { "input": "In which year did the volume or quantity of raw materials reach its highest point?", "output": "In which year was Raw materials larger?" }, { "input": "In which specific year did the total volume or quantity of raw materials experience a greater increase or reach a higher value compared to previous years?", "output": "In which year was Raw materials larger?" 
}, { "input": "In which specific years was the calculation of the Repurchase value conducted?", "output": "Which years was the Repurchase value calculated in?" }, { "input": "What was the exact difference in the total number of common shares repurchased in the year 2019 compared to 2018?", "output": "What was the change in the Number of common shares repurchased in 2019 from 2018?" }, { "input": "What was the calculation for determining the percentage change in the quantity of common shares repurchased during the year 2019 in comparison to 2018?", "output": "What was the percentage change in the Number of common shares repurchased in 2019 from 2018?" }, { "input": "What are the specific categories or divisions for which the net sales figures are displayed in the provided table?", "output": "What are the segments for which the net sales are presented in the table?" }, { "input": "What year did Industrial Solutions experience a larger size or growth compared to other years?", "output": "In which year was Industrial Solutions larger?" }, { "input": "What specific changes occurred in the Industrial Solutions sector between 2018 and 2019?", "output": "What was the change in Industrial Solutions in 2019 from 2018?" }, { "input": "In 2019, what specific changes occurred in the Industrial Solutions sector compared to the previous year of 2018?", "output": "What was the change in Industrial Solutions in 2019 from 2018?" }, { "input": "In which years was the basic earnings per share calculated for?", "output": "For which specific years was the metric of basic earnings per share calculated and used for evaluation?" }, { "input": "In which years was the basic earnings per share calculated for?", "output": "For which years was the calculation of basic earnings per share carried out? Please provide specific details in your answer." 
}, { "input": "What was the change in Dilutive impact of share-based compensation arrangements in 2019 from 2018?", "output": "What was the difference in the dilutive impact of share-based compensation arrangements between 2018 and 2019?" }, { "input": "What was the year-on-year percentage change in the dilutive impact caused by share-based compensation arrangements in 2019 compared to 2018?", "output": "What was the percentage change in Dilutive impact of share-based compensation arrangements in 2019 from 2018?" }, { "input": "What was the exact percentage change in the dilutive impact of share-based compensation arrangements between 2018 and 2019?", "output": "What was the percentage change in Dilutive impact of share-based compensation arrangements in 2019 from 2018?" }, { "input": "In which specific year did the percentage of industrial solutions reach its lowest point?", "output": "In which year was the percentage of industrial solutions the lowest in?" }, { "input": "What was the percentage change in the Weighted-Average Grant-Date Fair Value for nonvested shares between 2018 and 2019?", "output": "What was the change in the Weighted-Average Grant-Date Fair Value for nonvested shares in 2019 from 2018?" }, { "input": "In the table, what segments were considered when calculating the backlog?", "output": "What were the segments for which backlog was calculated in the table?" }, { "input": "For which segments in the table was the backlog calculated?", "output": "What were the segments for which backlog was calculated in the table?" }, { "input": "What was the change in total backlog in 2019 from 2018?", "output": "What was the difference in the total backlog between 2019 and 2018?" }, { "input": "What was the percentage difference in the total amount of unfinished work in 2019 compared to 2018?", "output": "What was the percentage change in total backlog in 2019 from 2018?" 
}, { "input": "In which specific year did the amount spent on acquisition and integration costs exceed any other year?", "output": "In which year was Acquisition and integration costs larger?" }, { "input": "In which specific year did the expenses incurred for acquisition and integration costs exceed the expenses in other years?", "output": "In which year was Acquisition and integration costs larger?" }, { "input": "What are the components under U.S. in the table?", "output": "Which components are listed under the U.S. category in the table?" }, { "input": "What are the different categories of Income Tax Expense (Benefit) mentioned in the table?", "output": "What are the types of Income Tax Expense (Benefit) in the table?" }, { "input": "In which specific year did Non-U.S. experience the highest income tax expense (benefit)?", "output": "Which year was the current income tax expense (benefit) for Non-U.S. the largest?" }, { "input": "In which year did the Non-U.S. current income tax expense (benefit) reach its highest amount?", "output": "Which year was the current income tax expense (benefit) for Non-U.S. the largest?" }, { "input": "What was the change in Current income tax expense (benefit) in 2019 from 2018?", "output": "What was the difference in Current income tax expense (benefit) between 2019 and 2018?" }, { "input": "What is the company's policy regarding the allocation and management of funds?", "output": "What is the company's funding policy?" }, { "input": "What is the minimum amount of contributions that individuals are required to make to the pension plan outside of the United States?", "output": "How much minimum required contributions is expected to be made to the non-U.S. pension plan?" }, { "input": "Which fiscal year between 2020 and 2024 will witness the highest benefit payments under the United States plans?", "output": "In which Fiscal year from 2020 to 2024 would the benefit payments under the U.S Plans be the largest?" 
}, { "input": "During which fiscal year from 2020 to 2024 will the benefit payments under the United States Plans reach their maximum extent?", "output": "In which Fiscal year from 2020 to 2024 would the benefit payments under the U.S Plans be the largest?" }, { "input": "What is typically not included in the net earnings of companies operating outside the United States?", "output": "What do the Non-U.S. net earnings exclude?" }, { "input": "What is excluded from the net earnings of countries other than the U.S. and why?", "output": "What do the Non-U.S. net earnings exclude?" }, { "input": "What was the percentage increase or decrease in the expense for U.S. federal income tax at the statutory rate in 2019 compared to 2018?", "output": "What was the percentage change in Notional U.S. federal income tax expense at the statutory rate in 2019 from 2018?" }, { "input": "What are the specific elements or factors taken into account when calculating Restructuring and Other Charges, Net?", "output": "What are the components considered under Restructuring and Other Charges, Net?" }, { "input": "When did the lowest value of net restructuring charges occur?", "output": "In which year was Restructuring charges, net the lowest?" }, { "input": "When was the year with the lowest net restructuring charges recorded?", "output": "In which year was Restructuring charges, net the lowest?" }, { "input": "What was the change in Restructuring charges, net in 2019 from 2018?", "output": "What was the difference in the amount of Restructuring charges, net between 2019 and 2018?" }, { "input": "What was the change in Restructuring charges, net in 2019 from 2018?", "output": "What was the net change in restructuring charges in 2019 compared to 2018?" }, { "input": "In 2019, what was the percentage difference in Restructuring charges, net compared to 2018?", "output": "What was the percentage change in Restructuring charges, net in 2019 from 2018?" 
}, { "input": "What specific regions do the net sales to external customers correspond to?", "output": "What are the net sales by geographic region to external customers attributed to?" }, { "input": "In which specific year did the percentage in Americas reach its highest point compared to other years?", "output": "In which year was the percentage in Americas the largest?" }, { "input": "What is the specific year when the largest percentage was recorded in the Americas?", "output": "In which year was the percentage in Americas the largest?" }, { "input": "What was the average net sales in Asia-Pacific as a percentage of total net sales across 2017, 2018 and 2019?", "output": "What was the average proportion of net sales generated in the Asia-Pacific region, compared to the total net sales, for the years 2017, 2018, and 2019?" }, { "input": "What was the average net sales in Asia-Pacific as a percentage of total net sales across 2017, 2018 and 2019?", "output": "What was the average percentage of total net sales in the Asia-Pacific region for the years 2017, 2018, and 2019 combined?" }, { "input": "What was the percentage change in the amount of US dollar-denominated long-term debt from 2018 to 2019?", "output": "What was the increase / (decrease) in US dollar-denominated long-term debt from 2018 to 2019?" }, { "input": "What is the increase/ (decrease) in Numerator (basic) – Net income from 2018 to 2019?", "output": "What is the change in Numerator (basic) – Net income from 2018 to 2019?" }, { "input": "What is the increase/ (decrease) in Numerator (basic) – Net income from 2018 to 2019?", "output": "What is the absolute change in Numerator (basic) – Net income from the year 2018 to the year 2019?" }, { "input": "What is the percentage change in Basic Earnings per share from 2018 to 2019?", "output": "What is the increase/ (decrease) in Basic Earnings per share from 2018 to 2019?" 
}, { "input": "What is the percentage change in Diluted Earnings per share from 2018 to 2019?", "output": "What is the increase/ (decrease) in Diluted Earnings per share from 2018 to 2019?" }, { "input": "What is the percentage change in Diluted Earnings per share (EPS) between 2018 and 2019?", "output": "What is the increase/ (decrease) in Diluted Earnings per share from 2018 to 2019?" }, { "input": "What was the specific percentage of taxes paid in the year 2019, taking into account all applicable taxes and deductions?", "output": "What was the effective tax rate in 2019?" }, { "input": "What was the average Computed income tax expense?", "output": "What was the average expense for income tax computed?" }, { "input": "What was the percentage change in the amount of interest on borrowings from 2018 to 2019?", "output": "What was the increase / (decrease) in Interest on borrowings from 2018 to 2019?" }, { "input": "What was the percentage change in Interest on borrowings from 2018 to 2019?", "output": "What was the increase / (decrease) in Interest on borrowings from 2018 to 2019?" }, { "input": "What was the increase / (decrease) in the Finance costs before interest on lease liabilities from 2018 to 2019?", "output": "What was the change in the Finance costs before interest on lease liabilities from 2018 to 2019?" }, { "input": "What was the percentage change in Adjusted EBITDA between 2018 and 2019?", "output": "What was the increase / (decrease) in Adjusted EBITDA from 2018 to 2019?" }, { "input": "What was the average Depreciation and amortization?", "output": "What was the average amount of depreciation and amortization expenses incurred over a specific period?" }, { "input": "What factors contributed to the overall rise in both depreciation and amortization expenses?", "output": "What caused the increase in the total depreciation and amortization?" 
}, { "input": "What was the Depreciation of right-of-use assets in 2019?", "output": "What was the amount of depreciation recorded for right-of-use assets during the year 2019?" }, { "input": "What was the Depreciation of right-of-use assets in 2019?", "output": "\"What was the amount of depreciation for right-of-use assets recorded in the financial statements for the year 2019?\"" }, { "input": "What was the increase / (decrease) in the Depreciation from 2018 to 2019?", "output": "What was the net change in Depreciation, either an increase or decrease, between the years 2018 and 2019?" }, { "input": "What was the increase / (decrease) in the Depreciation from 2018 to 2019?", "output": "What was the difference in the Depreciation amount from 2018 to 2019?" }, { "input": "What was the percentage change in the total depreciation and amortization from 2018 to 2019?", "output": "What was the increase / (decrease) in the Total depreciation and amortization from 2018 to 2019?" }, { "input": "What was the percentage change in Total depreciation and amortization from 2018 to 2019?", "output": "What was the increase / (decrease) in the Total depreciation and amortization from 2018 to 2019?" }, { "input": "What was the change in the coupon rate between November 12, 2019, and April 30, 2019?", "output": "What was the increase / (decrease) in the coupon rate from November 12, 2019 to April 30, 2019?" }, { "input": "What was the change in the coupon rate from November 12, 2019 to April 30, 2019?", "output": "What was the increase / (decrease) in the coupon rate from November 12, 2019 to April 30, 2019?" }, { "input": "What was the percentage change in the Equivalent (Cdn$) value from 2018 to November 2019?", "output": "What was the increase / (decrease) in the Equivalent (Cdn$) between 2018 and November 2019?" 
}, { "input": "What is the percentage change in the amount of interest on borrowings from 2018 to 2019?", "output": "What is the increase/ (decrease) in Interest on borrowings from 2018 to 2019?" }, { "input": "By what percentage did the Interest on borrowings change from 2018 to 2019? Please state whether there was an increase or decrease.", "output": "What is the increase/ (decrease) in Interest on borrowings from 2018 to 2019?" }, { "input": "What is the purpose of utilizing valuation-related analysis and making capital structure-related decisions?", "output": "What is used for to conduct valuation-related analysis and make capital structure-related decisions?" }, { "input": "What items are considered when calculating the adjusted net debt?", "output": "What is included in the adjusted net debt?" }, { "input": "In which context or industry is the valuation of debt derivatives without incorporating credit risk used as an evaluation tool?", "output": "Where is debt derivatives valued without adjustment for credit risk used for evaluation?" }, { "input": "What is the net change in long-term debt from December 31, 2018, to December 31, 2019?", "output": "What is the increase/ (decrease) in Long-term debt from December 31, 2018 to December 31, 2019?" }, { "input": "What is the change in the amount of Long-term debt from December 31, 2018 to December 31, 2019?", "output": "What is the increase/ (decrease) in Long-term debt from December 31, 2018 to December 31, 2019?" }, { "input": "What is the change in the value of net debt derivative assets without credit risk adjustment from December 31, 2018 to December 31, 2019?", "output": "What is the increase/ (decrease) in Net debt derivative assets valued without any adjustment for credit risk from December 31, 2018 to December 31, 2019?" }, { "input": "What is the method of measuring accounts receivable and what factors are considered in the measurement process?", "output": "How is accounts receivable measured?" 
}, { "input": "What is the percentage change in the amount of customer accounts receivable from 2018 to 2019?", "output": "What is the increase/ (decrease) in Customer accounts receivable from 2018 to 2019?" }, { "input": "\"What is the method employed to settle the principal amount of Convertible Notes in cash upon conversion, and how does it factor into the computation of diluted earnings?\"", "output": "Which method is used for to settle the principal of the Convertible Notes in cash on conversion and calculates diluted earnings?" }, { "input": "What are the consequences when a loss is incurred that can be attributed to shareholders?", "output": "What happens if a loss attributable to shareholders has been incurred?" }, { "input": "What was the Convertible Notes that had an anti-dilutive effect on the calculation of diluted earnings per common share in the year end 2019, 2018 and 2017 respectively?", "output": "What were the Convertible Notes that impacted diluted earnings per common share calculation in 2019, 2018, and 2017, specifically in terms of their anti-dilutive effects?" }, { "input": "What is the change in Loss per common share - basic and diluted from 2018 to 2019?", "output": "What is the difference in the loss per common share - basic and diluted between 2018 and 2019?" }, { "input": "What is the change in Loss per common share - basic and diluted from 2018 to 2019?", "output": "What is the difference in basic and diluted Loss per common share between 2018 and 2019?" }, { "input": "How are the recognition processes for realized and unrealized losses different?", "output": "How are the realized and unrealized losses recognized?" }, { "input": "How are both realized and unrealized losses recognized? Please provide a detailed explanation of the recognition process for each type of loss.", "output": "How are the realized and unrealized losses recognized?" 
}, { "input": "In which specific year did the Gain on sale or write-down of a cost-accounted investment amount to less than 1,000 thousands?", "output": "In which year is the Gain on sale / (write-down) of cost-accounted investment less than 1,000 thousands?" }, { "input": "\"In which fiscal year did the Gain on sale or write-down of a cost-accounted investment amount to less than 1,000 thousands (1 million)?\"", "output": "In which year is the Gain on sale / (write-down) of cost-accounted investment less than 1,000 thousands?" }, { "input": "What were the total fees paid by Teekay Tankers to KPMG LLP in 2019 and 2018, respectively?", "output": "What fees was paid to KPMG LLP by Teekay Tankers during 2019 and 2018 respectively?" }, { "input": "What were the fees paid by Teekay Tankers to KPMG LLP in 2019 and 2018?", "output": "What fees was paid to KPMG LLP by Teekay Tankers during 2019 and 2018 respectively?" }, { "input": "What are the components or factors included in the income tax expense calculation?", "output": "What is included in the income tax expense?" }, { "input": "What items are incorporated into the income tax expense?", "output": "What is included in the income tax expense?" }, { "input": "What is the difference in the number of positions taken in previous years between December 31, 2019, and December 31, 2018?", "output": "What is the change in Changes for positions taken in prior years from December 31, 2019 to December 31, 2018?" }, { "input": "What is the net change in the number of positions taken in prior years from December 31, 2019 to December 31, 2018?", "output": "What is the change in Changes for positions taken in prior years from December 31, 2019 to December 31, 2018?" 
}, { "input": "What led to increase in Consolidated net cash flow from operating activities for the year ended December 31, 2019?", "output": "What factors contributed to the rise in Consolidated net cash flow from operating activities during the year ending on December 31, 2019?" }, { "input": "How much did Brookfield receive from the sale of its interests in Altera in 2019?", "output": "How much was received from Brookfield for the sale of interests in Altera during 2019?" }, { "input": "What is the change in Net financing cash flows from Year Ended December 31, 2019 to December 31, 2018?", "output": "What is the difference in the amount of cash generated or used for financing activities in the financial statements for the years ending December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Net financing cash flows from Year Ended December 31, 2019 to December 31, 2018?", "output": "What is the difference in net cash flows from financing activities between the fiscal years ended on December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Net (loss) income from Year Ended December 31, 2019 to December 31, 2018?", "output": "What was the difference in the Net (loss) income between the fiscal years ending on December 31, 2019 and December 31, 2018?" }, { "input": "What is the change in Net (loss) income from Year Ended December 31, 2019 to December 31, 2018?", "output": "What is the difference in the amount of net (loss) income between the fiscal year ending on December 31, 2019, and the fiscal year ending on December 31, 2018?" }, { "input": "What is the change in Income tax expense (recovery) from Year Ended December 31, 2019 to December 31, 2018?", "output": "What is the difference in the amount of income tax expense or recovery reported for the fiscal year ended on December 31, 2019, compared to the fiscal year ended on December 31, 2018?" 
}, { "input": "What is the change in Income tax expense (recovery) from Year Ended December 31, 2019 to December 31, 2018?", "output": "How does the income tax expense (recovery) change between the years ended December 31, 2019 and December 31, 2018?" }, { "input": "What is the sum of all minimum lease payments expected to be received in the year 2019?", "output": "What was the total Total minimum lease payments to be received in 2019?" }, { "input": "What is the total sum of minimum lease payments that were expected to be received in the year 2019?", "output": "What was the total Total minimum lease payments to be received in 2019?" }, { "input": "What is the change in Initial direct costs and other from December 31, 2019 to December 31, 2018?", "output": "How did the initial direct costs and other expenses change from December 31, 2018, to December 31, 2019?" }, { "input": "When did Teekay sell to Brookfield all of the Company’s remaining interests in Altera?", "output": "\"When was the complete sale of Teekay's remaining holdings in Altera to Brookfield executed?\"" }, { "input": "When did Teekay sell to Brookfield all of the Company’s remaining interests in Altera?", "output": "\"When did Teekay sell its remaining interests in Altera to Brookfield?\"" }, { "input": "What were the Fair values at the start of the year for 2019 and 2018, respectively? Please provide the Fair value figures for both years separately.", "output": "What was the Fair value at the beginning of the year for 2019 and 2018 respectively?" }, { "input": "In which year was Fair value at the beginning of the year less than 15,000 thousands?", "output": "In which year did the fair value at the beginning of the year fall below 15,000,000 (15 million) dollars?" }, { "input": "\"What items are typically included in the calculation of current income tax expense, and what factors can affect this expense?\"", "output": "What is included in current income tax expense?" 
}, { "input": "What components are typically encompassed in the calculation of contemporary income tax expense?", "output": "What is included in current income tax expense?" }, { "input": "What is the change in Current income tax expense from December 31, 2019 to December 31, 2018?", "output": "How much has the Current income tax expense changed from December 31, 2019, to December 31, 2018?" }, { "input": "What is the change in Current income tax expense from December 31, 2019 to December 31, 2018?", "output": "What was the difference in Current income tax expense between December 31, 2019 and December 31, 2018?" }, { "input": "\"Between two years, which one exhibits a higher expense for current income tax?\"", "output": "Which year has higher current income tax expense?" }, { "input": "What were the Lease and Non-Lease commitments in the year 2021, and can you provide specific details about each commitment?", "output": "What was the Lease and Non-Lease commitment in 2021 respectively?" }, { "input": "What was the amount of lease commitment for the fiscal year 2022?", "output": "What was the Lease commitment in 2022?" }, { "input": "What was the total value of lease commitments in the year 2022?", "output": "What was the Lease commitment in 2022?" }, { "input": "In which year was the Lease commitment less than 10,000 thousands?", "output": "In which specific year did the Lease commitment amount to less than 10,000 thousand units?" }, { "input": "In which year was the Lease commitment less than 10,000 thousands?", "output": "In which specific year did the Lease commitment amount fall below 10,000 thousands? (For better answering, please state the year and the unit of the Lease commitment.)" }, { "input": "What is the difference in the lease commitment amount between 2020 and 2021?", "output": "What is the change in the Lease commitment from 2020 to 2021?" 
}, { "input": "What is the average duration of lease agreements from 2020 to 2022?", "output": "What is the average Lease Commitment from 2020 to 2022?" }, { "input": "What is the average duration of lease commitments between 2020 and 2022?", "output": "What is the average Lease Commitment from 2020 to 2022?" }, { "input": "How much was the equity loss for the year ended December 31, 2019?", "output": "What was the amount of equity lost during the period from January 1 to December 31, 2019?" }, { "input": "How much was the equity loss for the year ended December 31, 2019?", "output": "What was the total amount of equity loss recorded for the period of the year ending December 31, 2019?" }, { "input": "What is the difference in the combined amount of cash and restricted cash on December 31, 2019, compared to December 31, 2018?", "output": "What is the change in Cash and restricted cash from December 31, 2019 to December 31, 2018?" }, { "input": "What was the increase or decrease in the category of Other assets – current on the company's balance sheet from December 31, 2019 to December 31, 2018?", "output": "What is the change in Other assets – current from December 31, 2019 to December 31, 2018?" }, { "input": "What were the values of the other current assets in the years 2019 and 2018?", "output": "What was the other assets - current in 2019 and 2018 respectively?" }, { "input": "What were the respective amounts of non-current liabilities in 2019 and 2018 aside from the liabilities mentioned earlier?", "output": "What was the other liabilities - non current in 2019 and 2018 respectively?" }, { "input": "What were the amounts of non-current liabilities in 2019 and 2018, respectively, excluding any other liabilities that may be present on the financial statements?", "output": "What was the other liabilities - non current in 2019 and 2018 respectively?" 
}, { "input": "What is the difference in the number of Consolidated LNG carriers between 2020 and 2021?", "output": "What is the change in Consolidated LNG carriers from 2020 to 2021?" }, { "input": "What is the net difference in the number of Consolidated LNG carriers between 2020 and 2021?", "output": "What is the change in Consolidated LNG carriers from 2020 to 2021?" }, { "input": "What is the annual growth or decline in the number of Consolidated LNG carriers from 2021 to 2022?", "output": "What is the change in Consolidated LNG carriers from 2021 to 2022?" }, { "input": "What is the average number of Consolidated LNG carriers for the period from 2020 to 2021?", "output": "What is the average Consolidated LNG carriers, for the year 2020 to 2021?" }, { "input": "What is the average number of Consolidated LNG carriers between the years 2020 and 2021?", "output": "What is the average Consolidated LNG carriers, for the year 2020 to 2021?" }, { "input": "What is the change in Realized (losses) gains relating to Interest rate swap agreements from Year Ended December 31, 2019 to December 31, 2018?", "output": "What was the difference in Realized (losses) gains associated with Interest rate swap agreements between December 31, 2019 and December 31, 2018?" }, { "input": "What is the difference in Realized losses and gains from Forward freight agreements between Year Ended December 31, 2019 and December 31, 2018?", "output": "What is the change in Realized (losses) gains relating to Forward freight agreements from Year Ended December 31, 2019 to December 31, 2018?" }, { "input": "What was the difference in revenue between 2019 and 2018?", "output": "What is the change in Revenues from, 2019 to 2018?" }, { "input": "What is the change in Voyage expenses from, 2019 to 2018?", "output": "What is the difference in expenses for voyages comparing 2019 to 2018?" 
}, { "input": "How is the expense of share-based compensation quantified and evaluated?", "output": "How is share-based compensation expense measured?" }, { "input": "How is the expense related to share-based compensation quantified and determined?", "output": "How is share-based compensation expense measured?" }, { "input": "What are the various types of share-based compensation plans in existence?", "output": "What are the types of share-based compensation plans?" }, { "input": "By what percentage did the stock-based compensation expense for general and administrative operations decrease from 2018 to 2019?", "output": "How much did the stock-based compensation expense in the general and administrative operations decreased from 2018 to 2019?" }, { "input": "What is the ending outstanding number of vested RSUs in 2019?", "output": "What is the final count of vested Restricted Stock Units (RSUs) at the conclusion of 2019?" }, { "input": "What is the ending outstanding number of vested RSUs in 2019?", "output": "How many RSUs were left as outstanding in the year 2019 once they had vested?" }, { "input": "What was the exact amount of depreciation expense recorded in the financial statements for the year 2018?", "output": "What was the depreciation expense in 2018?" }, { "input": "What are the accepted accounting methods for handling and recording disposed assets?", "output": "What are the accounting treatments for disposed assets?" }, { "input": "What are the accounting procedures to be followed specifically for assets that have been disposed of?", "output": "What are the accounting treatments for disposed assets?" }, { "input": "What is the net total for property, plant and equipment in 2019?", "output": "What is the total value of property, plant, and equipment after deducting accumulated depreciation in 2019?" 
}, { "input": "What is the net total for property, plant and equipment in 2019?", "output": "What is the total value of property, plant, and equipment after deducting accumulated depreciation and any impairment losses, as reported in the financial statements for the year 2019?" }, { "input": "What is the percentage decrease in the net value of Property, Plant, and Equipment between 2018 and 2019?", "output": "What is the percentage decrease in Net Total Property, Plant and Equipment from 2018 to 2019?" }, { "input": "How much was the increase in Federal R&D tax credits from 2018 to 2019?", "output": "How much did R&D tax credits from the Federal increased from 2018 to 2019?" }, { "input": "What was the percentage increase in R&D tax credits provided by the Federal government from 2018 to 2019?", "output": "How much did R&D tax credits from the Federal increased from 2018 to 2019?" }, { "input": "\"What was the amount recorded as a long-term liability in the financial statements for the year 2018?\"", "output": "How much was reflected as a long-term liability in 2018?" }, { "input": "What is the percentage difference in Net deferred tax assets from 2018 to 2019?", "output": "What was the percentage change in Net deferred tax assets between 2018 and 2019?" }, { "input": "What was the total cost of the service provided in the year 2018?", "output": "What was the service cost in 2018?" }, { "input": "What was the cost of the service provided in the year 2018?", "output": "What was the service cost in 2018?" }, { "input": "What was the difference in the cost of services from 2018 to 2019?", "output": "What was the change in service cost between 2018 and 2019?" }, { "input": "What was the change in interest cost between 2018 and 2019?", "output": "What was the difference in the amount of money spent on interest between the years 2018 and 2019?" 
}, { "input": "What was the change in interest cost between 2018 and 2019?", "output": "How did the interest cost change from 2018 to 2019, and what is the difference between the two years?" }, { "input": "What was the change in balances at the end of period between 2018 and 2019?", "output": "What is the difference in balances at the end of the period between 2018 and 2019, and what caused this change?" }, { "input": "What was the specific monetary value or total sum allocated for Machinery and equipment in the year 2018?", "output": "What was the amount for Machinery and equipment in 2018?" }, { "input": "\"What was the specific monetary value allocated to Machinery and equipment in the year 2018?\"", "output": "What was the amount for Machinery and equipment in 2018?" }, { "input": "What was the change in Net cash provided by (used in) operating activities from continuing operations between 2018 and 2019?", "output": "What was the difference in Net cash provided by (used in) operating activities from continuing operations between the years 2018 and 2019?" }, { "input": "What was the difference in the amount of Net cash provided by (used in) operating activities between the years 2018 and 2019?", "output": "What was the change in Net cash provided by (used in) operating activities between 2018 and 2019?" }, { "input": "What was the percentage change in Income (loss) from discontinued operations, net of income taxes between 2018 and 2019?", "output": "What was the percentage change in net income (loss) from discontinued operations, after accounting for income taxes, between the fiscal years 2018 and 2019?" }, { "input": "Which periods does the company's effective tax rates differ from the U.S. federal statutory rate?", "output": "\"During which specific time periods does the effective tax rate of the company deviate from the U.S. 
federal statutory rate?\"" }, { "input": "What was the specific framework or arrangement regarding Federal taxes in the year 2018, including any applicable laws, regulations, rates, and deductions?", "output": "What was the current provision for Federal taxes in 2018?" }, { "input": "What were the existing federal tax regulations and provisions for the year 2018?", "output": "What was the current provision for Federal taxes in 2018?" }, { "input": "What was the specific provision made for foreign taxes in the year 2019?", "output": "What was the current provision for Foreign taxes in 2019?" }, { "input": "What was the percentage increase or decrease in the fair value of intangible assets from 2018 to 2019?", "output": "What was the percentage change in the fair value of intangible assets between 2018 and 2019?" }, { "input": "What was the change in Pro Forma total sales between 2018 and 2019?", "output": "What was the difference in the total sales of Pro Forma from 2018 to 2019?" }, { "input": "What was the change in Pro Forma total sales between 2018 and 2019?", "output": "What was the difference in Pro Forma total sales from 2018 to 2019?" }, { "input": "What was the exact fair value of the property and equipment owned by LumaSense?", "output": "What was the fair value of Property and equipment from LumaSense?" }, { "input": "What was the fair value of Property and equipment acquired by LumaSense?", "output": "What was the fair value of Property and equipment from LumaSense?" }, { "input": "What was the difference in the assumed exercise of dilutive stock options and restricted stock units between the years 2018 and 2019?", "output": "What was the change in Assumed exercise of dilutive stock options and restricted stock units between 2018 and 2019?" 
}, { "input": "What was the disparity in the fair value of cash paid for acquisition between Trek and the Electrostatic Product Line?", "output": "What was the difference in the fair value of cash paid for acquisition between Trek and Electrostatic Product Line?" }, { "input": "What was the diluted earnings per share (EPS) specifically for Continuing Operations for the quarter ending in September?", "output": "What was the diluted earnings per share of Continuing Operations in Quarter Ended September?" }, { "input": "What was the diluted earnings per share of Continuing Operations in the quarter ending in September?", "output": "What was the diluted earnings per share of Continuing Operations in Quarter Ended September?" }, { "input": "What was the total Income (loss) from discontinued operations, net of income taxes in 2019?", "output": "In 2019, what was the net income (loss) from discontinued operations, adjusted for income taxes?" }, { "input": "What was the difference in total operating expense between the fiscal years 2018 and 2019?", "output": "What was the change in total operating expense between 2018 and 2019?" }, { "input": "What was the difference in the amount of money spent on operational expenses from 2018 to 2019?", "output": "What was the change in total operating expense between 2018 and 2019?" }, { "input": "How much money was owed to the company for goods or services provided in 2019 but not yet invoiced?", "output": "What was the amount of unbilled receivables in 2019?" }, { "input": "What were the precise amounts that were billed during the year 2018 after considering all deductions, fees, and payments?", "output": "What were the net amounts billed in 2018?" }, { "input": "What was the change in unbilled receivables between 2018 and 2019?", "output": "What was the difference in the amount of unbilled receivables from 2018 to 2019?" 
}, { "input": "What was the difference in the total amount billed between 2018 and 2019?", "output": "What was the change in net amounts billed between 2018 and 2019?" }, { "input": "What was the accumulated amortization of customer relationships in 2018?", "output": "\"What was the total amount of accumulated amortization recorded for customer relationships during the year 2018?\"" }, { "input": "What were the top two segments with the highest net carrying amounts in the year 2018?", "output": "What was the two highest net carrying amounts segments in 2018?" }, { "input": "What were the two segments with the highest net carrying amounts in 2018?", "output": "What was the two highest net carrying amounts segments in 2018?" }, { "input": "How much did the service cost in 2019 and can you provide more details about the specific service being referred to?", "output": "What was service cost in 2019?" }, { "input": "What was the total amount spent on interest expenses in the year 2018?", "output": "What was interest cost in 2018?" }, { "input": "What was the expected rate of return on plan assets for the year 2017?", "output": "What was the Expected return on plan assets in 2017?" }, { "input": "What was the anticipated return on assets for the plan in 2017?", "output": "What was the Expected return on plan assets in 2017?" }, { "input": "What was the change in Amortization of actuarial gains and losses between 2017 and 2018?", "output": "What was the difference in the amount of amortization of actuarial gains and losses recorded between the years 2017 and 2018?" }, { "input": "What was the change in Amortization of actuarial gains and losses between 2017 and 2018?", "output": "What was the difference in the amount of Amortization of actuarial gains and losses between the fiscal years 2017 and 2018?" 
}, { "input": "What were the specific expenses incurred for severance payments and any associated costs during the year 2019?", "output": "What were the severance and related charges in 2019?" }, { "input": "What was the amount of severance and associated expenses recorded as charges in the year 2019?", "output": "What were the severance and related charges in 2019?" }, { "input": "What was the change in Severance and related charges between 2018 and 2019?", "output": "What was the year-on-year difference in the amount spent on Severance and related charges between 2018 and 2019?" }, { "input": "What was the change in Severance and related charges between 2018 and 2019?", "output": "What was the difference in the amount of Severance and related charges incurred in 2018 compared to 2019?" }, { "input": "What is the proportion of facility relocation and closure charges in the total amount of restructuring charges incurred in 2019?", "output": "What percentage of total restructuring charges in 2019 consisted of Facility relocation and closure charges?" }, { "input": "What was the contrast between the combined assets in the Level 1 and Level 2 categories for the year 2019?", "output": "What was the difference in the total assets in the Level 1 Level 2 categories for 2019?" }, { "input": "What was the difference between the total assets in the Level 1 and Level 2 categories for the year 2019?", "output": "What was the difference in the total assets in the Level 1 Level 2 categories for 2019?" }, { "input": "What was the combined value of the top three assets in the Level 2 Category for the year 2019?", "output": "What was the total of the three highest assets in the Level 2 Category for 2019?" 
}, { "input": "What percentage of total assets across all categories consisted of assets from the Level One category for 2019?", "output": "What was the proportion or share, in terms of percentage, of Level One assets out of the total assets across all categories for the year 2019?" }, { "input": "What was the specific value or monetary figure of long-lived assets in the United States during the year 2019?", "output": "What was the amount of long-lived assets in United States in 2019?" }, { "input": "What was the net difference in the value of long-term assets in the United States from 2018 to 2019?", "output": "What was the change in the amount of long-lived assets between 2018 and 2019 in United States?" }, { "input": "\"What was the combined value of the two longest-lasting assets in the year 2018?\"", "output": "What is the sum of the highest two long-lived assets in 2018?" }, { "input": "\"What is the total value, in monetary terms, of the two long-lived assets with the highest worth that were recorded in the year 2018?\"", "output": "What is the sum of the highest two long-lived assets in 2018?" }, { "input": "What was the initial estimated value of property and equipment as of December 31, 2019 in relation to their market worth?", "output": "What was the Preliminary fair value of property and equipment in December 31, 2019?" }, { "input": "What was the initial estimated value of property and equipment as of December 31, 2019, before any adjustments or final calculations were made?", "output": "What was the Preliminary fair value of property and equipment in December 31, 2019?" }, { "input": "What was the initial estimated value assigned to inventories as of September 10th, 2019?", "output": "What was the Preliminary fair value of inventories in September 10, 2019?" }, { "input": "\"What was the initial estimated market value of the inventories as of September 10, 2019?\"", "output": "What was the Preliminary fair value of inventories in September 10, 2019?" 
}, { "input": "In 2019, what was the specific amount of revenue that was recognized for product sales and the provision of services at a particular point in time?", "output": "What was the Product and service revenue recognized at point in time in 2019?" }, { "input": "What specific extended warranty and service contracts were acknowledged and recorded throughout the year of 2018?", "output": "What was the Extended warranty and service contracts recognized over time in 2018?" }, { "input": "What were the specific extended warranty and service contracts that gained recognition throughout the year 2018?", "output": "What was the Extended warranty and service contracts recognized over time in 2018?" }, { "input": "What was the change in Product and service revenue recognized at point in time between 2018 and 2019?", "output": "What was the specific difference in revenue from the sales of products and services recognized at a specific moment between the years 2018 and 2019?" }, { "input": "What were the sales figures for the Industrial & Medical sectors in the year 2017?", "output": "What was the sales for Industrial & Medical in 2017?" }, { "input": "What were the sales figures specifically for Industrial & Medical sectors in the year 2017?", "output": "What was the sales for Industrial & Medical in 2017?" }, { "input": "What was the percentage change in sales of Semiconductor Equipment from 2017 to 2018?", "output": "What was the change in sales of Semiconductor Equipment between 2017 and 2018?" }, { "input": "What were the income taxes for the 2018 tax year, based on the federal statutory tax rates?", "output": "What was the Income taxes per federal statutory rate in 2018?" }, { "input": "What was the State income taxes, net of federal deduction in 2017?", "output": "In 2017, what was the amount of state income taxes paid after deducting the federal deduction?" 
}, { "input": "What were the specific modifications that occurred in the GILTI Tax legislation from 2018 to 2019?", "output": "What was the change in GILTI Tax between 2018 and 2019?" }, { "input": "What was the difference in the GILTI Tax rates from 2018 to 2019, and how did the change impact tax calculations?", "output": "What was the change in GILTI Tax between 2018 and 2019?" }, { "input": "What was the change in Unremitted earnings between 2018 and 2019?", "output": "What was the difference in Unremitted earnings from 2018 to 2019? Please provide the specific amount of change in Unremitted earnings between these two years." }, { "input": "What was the percentage difference in Withholding taxes from 2018 to 2019?", "output": "What was the percentage change in Withholding taxes between 2018 and 2019?" }, { "input": "What is the estimated rate of forfeiture?", "output": "What was the forfeiture rate estimated to be?" }, { "input": "What is total non-cash compensation expense related to stock-based awards in 2019?", "output": "\"What was the amount of expenses incurred in 2019 for stock-based awards as a form of total non-cash compensation?\"" }, { "input": "What is the range of years covered in the table?", "output": "What are the years included in the table?" }, { "input": "\"What is the final balance as of December 31, 2019?\"", "output": "What is the balance at end of year for 2019?" }, { "input": "In which specific markets does the company focus its marketing efforts and aim to capture a significant share of customers?", "output": "Which markets does the company target?" }, { "input": "What specific types of research and development (R&D) activities are sponsored by companies? Please provide a comprehensive list of these activities as outlined in the table.", "output": "What are the types of company-sponsored R&D activities in the table?" 
}, { "input": "What types of research and development (R&D) activities supported by companies are listed in the table?", "output": "What are the types of company-sponsored R&D activities in the table?" }, { "input": "What was the difference in the total expenditure on research and development sponsored by the company from 2018 to 2019?", "output": "What is the change in the total company-sponsored research and development expense in 2019 from 2018?" }, { "input": "What is the change in total backlog in 2019?", "output": "What was the overall increase or decrease in the total backlog during the year 2019?" }, { "input": "What are the segments under Total Backlog in the table?", "output": "What specific categories or divisions are included within the Total Backlog section of the table?" }, { "input": "How many distinct segments are there included in the Total Backlog?", "output": "How many segments are there under Total Backlog?" }, { "input": "How many distinct segments are included within the category of Total Backlog?", "output": "How many segments are there under Total Backlog?" }, { "input": "What is the specific maturity date or time period for the long-term debt that is expected to be due in 2021?", "output": "What is the maturity of long-term debt for 2021?" }, { "input": "What are the components recorded under income (loss) from continuing operations before income taxes?", "output": "What specific items are included in the income (loss) from continuing operations category before taxes are deducted?" }, { "input": "In which year was the amount under Foreign the smallest?", "output": "\"When was the lowest recorded value for the amount under the category 'Foreign' observed?\"" }, { "input": "What is the change in the amount under Foreign in 2019 from 2018?", "output": "What is the difference in the amount under the category \"Foreign\" between 2019 and 2018?" 
}, { "input": "What is the change in the amount under Foreign in 2019 from 2018?", "output": "How much did the Foreign amount change from 2018 to 2019?" }, { "input": "In which year was the amount of sales in Other the largest?", "output": "In which year did the category labeled as \"Other\" witness the highest recorded sales volume?" }, { "input": "What is the percentage change in the amount of Other in 2019 from 2018?", "output": "What is the percentage difference in the amount of Other expenses or income in the year 2019 compared to 2018, and how can it be calculated?" }, { "input": "What is the tax expense for 2019 calculated at the U.S. statutory rate?", "output": "What is the Tax expense at U.S. statutory rate for 2019?" }, { "input": "What was the tax expense for the year 2019 in the United States at the statutory rate?", "output": "What is the Tax expense at U.S. statutory rate for 2019?" }, { "input": "What is the percentage change in nondeductible expenses in 2019 from 2018?", "output": "What is the percentage difference in the amount of expenses that cannot be deducted in 2019 compared to 2018?" }, { "input": "What is the percentage change in nondeductible expenses in 2019 from 2018?", "output": "What is the percentage increase or decrease in expenses that cannot be deducted from taxes in the year 2019 compared to the year 2018?" }, { "input": "\"When were the costs associated with purchasing and developing the ERP system recorded?\"", "output": "In which years were the incurred costs related to the purchase and development of the ERP system recorded?" }, { "input": "In which year did the total value of property, plant, and equipment reach its highest level?", "output": "In which year was the total amount of property, plant and equipment larger?" }, { "input": "In which specific year did the combined value of property, plant, and equipment reach its highest point?", "output": "In which year was the total amount of property, plant and equipment larger?" 
}, { "input": "What is the change in leasehold improvements from 2018 to 2019?", "output": "What was the year-on-year change in the value of leasehold improvements between 2018 and 2019?" }, { "input": "What is the percentage change in leasehold improvements from 2018 to 2019?", "output": "What is the percentage difference in the value of leasehold improvements from 2018 to 2019?" }, { "input": "What is the percentage change in CGD sales in 2018?", "output": "What was the percentage increase or decrease in CGD sales in 2018 compared to the previous year?" }, { "input": "What is the percentage change in CGD sales in 2018?", "output": "What is the exact percentage difference in sales of CGD products between the years 2018 and the previous year?" }, { "input": "For which years is the amortization of purchased intangibles included in the CGD results recorded?", "output": "In the recorded CGD results, during which specific years is the amortization of purchased intangibles included?" }, { "input": "In which year does the inclusion of the amortization of purchased intangibles result in a higher amount in the Comprehensive Gain or Loss From Discontinued Operations (CGD)?", "output": "In which year is the amortization of purchased intangibles included in the CGD results larger?" }, { "input": "\"What criteria or requirements must be met in order to be eligible for receiving an extra cash payment amounting to $3.0 million?\"", "output": "What is the eligibility to receive an additional cash payment of $3.0 million based on?" }, { "input": "During which years was the method of calculating net income (loss) per common share using the weighted-average number of shares outstanding implemented and documented?", "output": "For which years was the weighted-average number of shares outstanding used to compute net income (loss) per common share recorded?" 
}, { "input": "What is the percentage change in adjusted EBITDA in 2019 from 2018?", "output": "What is the exact percentage change in adjusted EBITDA between the years 2018 and 2019?" }, { "input": "What is the percentage change in adjusted EBITDA in 2019 from 2018?", "output": "What is the percentage difference in adjusted EBITDA between 2019 and 2018? How can this information be used to analyze the financial performance in these two years?" }, { "input": "What resulted in a higher operating income?", "output": "What factor led to an increase in operating income?" }, { "input": "How many finished products were produced in the year 2019?", "output": "What is the amount of finished products in 2019?" }, { "input": "What was the total quantity of completed products manufactured in the year 2019?", "output": "What is the amount of finished products in 2019?" }, { "input": "In which year does the value of finished products surpass the value of finished products in other years, resulting in a higher overall value?", "output": "In which year is the value of finished products higher?" }, { "input": "What are the differences in the types of materials and purchased parts between 2018 and 2019?", "output": "What is the change in materials and purchased parts from 2018 to 2019?" }, { "input": "\"What factors contributed to the increase in operating profits?\"", "output": "What resulted in the improvement in operating profits?" }, { "input": "For which fiscal year(s) is the amount of amortization of purchased intangibles included in the CGD results recorded?", "output": "In which fiscal year(s) is the CGD results recorded and does it include the amortization amount of purchased intangibles?" 
}, { "input": "For which fiscal year(s) is the amount of amortization of purchased intangibles included in the CGD results recorded?", "output": "In which fiscal year(s) is the amortization amount of purchased intangibles included in the recorded results of the Comprehensive Growth and Development (CGD)?" }, { "input": "Which year, from the given weighted-average assumptions for determining benefit obligation as of September 30, exhibits the highest rate of compensation increase among all years?", "output": "For the weighted-average assumptions used to determine benefit obligation at September 30, which year has the largest rate of compensation increase?" }, { "input": "What was the change in the discount rate between 2018 and 2019 for determining the benefit obligation at September 30?", "output": "For the weighted-average assumptions used to determine benefit obligation at September 30, what is the change in the discount rate in 2019 from 2018?" }, { "input": "What is the weighted-average discount rate used to calculate the benefit obligation as of September 30 for the years 2017, 2018, and 2019?", "output": "For the weighted-average assumptions used to determine benefit obligation at September 30, what is the average discount rate across 2017, 2018 and 2019?" }, { "input": "In what specific year did the rate differential for foreign exchange exceed 10.0?", "output": "In which year was Foreign rate differential greater than 10.0?" }, { "input": "What was the average amount of state taxes, excluding federal benefits, for the years 2017 to 2019?", "output": "What was the average State taxes net of federal benefit for 2017-2019?" }, { "input": "How much did the Research and Development credits shift between 2018 and 2019?", "output": "What is the change in the Research and development credits from 2018 to 2019?" 
}, { "input": "What is the difference in the amount of Research and Development credits between 2018 and 2019?", "output": "What is the change in the Research and development credits from 2018 to 2019?" }, { "input": "What were the respective gains in 2019 and 2018?", "output": "What were the gains achieved in 2019 and 2018, respectively?" }, { "input": "What were the respective gains in 2019 and 2018?", "output": "What were the gains in 2019 and 2018, respectively?" }, { "input": "What is the specific year when the Loss from discontinued operations was exactly zero?", "output": "In which year was Loss from discontinued operations 0?" }, { "input": "What was the average amount of money lost from 2017 to 2019?", "output": "What was the average Net loss for 2017 to 2019?" }, { "input": "What is the change in the Weighted average common shares outstanding-basic from 2018 to 2019?", "output": "What is the difference in the number of weighted average common shares outstanding-basic between 2018 and 2019?" }, { "input": "What was the depreciation and amortization expense related to property and equipment for 2019?", "output": "What was the specific amount of depreciation and amortization expense attributed to property and equipment for the fiscal year 2019?" }, { "input": "\"What was the precise monetary worth or estimated value, expressed in the currency of the specific country or region, of the category of Machinery and Equipment during the calendar year of 2019?\"", "output": "What was the value of Machinery and equipment in 2019?" }, { "input": "What was the exact monetary value of Machinery and equipment in the year 2019?", "output": "What was the value of Machinery and equipment in 2019?" }, { "input": "In what year did the construction in process have a value of less than 30,000 thousands?", "output": "In which year was value of Construction in process less than 30,000 thousands?" 
}, { "input": "In which specific year did the value of Construction in process fall below 30,000 thousands?", "output": "In which year was value of Construction in process less than 30,000 thousands?" }, { "input": "What were the average values of Machinery and equipment in both 2018 and 2019?", "output": "What was the average value of Machinery and equipment for 2018 and 2019?" }, { "input": "What is the average telecommunication expenditure during the fiscal years 2018 and 2019?", "output": "What is the average Telecom for fiscal year 2018 and 2019?" }, { "input": "During which year did the number of Telecom users fall below 200,000 thousand?", "output": "In which year was Telecom less than 200,000 thousand?" }, { "input": "What was the exact percentage decrease of the revenue in 2019 compared to the previous year?", "output": "What was the decrease in the revenue in 2019?" }, { "input": "What was the specific percentage decrease in revenue for the fiscal year of 2019?", "output": "What was the decrease in the revenue in 2019?" }, { "input": "What is the average value of the Telecom industry for the fiscal years 2017 and 2018 combined?", "output": "What is the average Telecom value for fiscal year 2017 and 2018?" }, { "input": "What was the average value of the telecommunications industry for the fiscal years 2017 and 2018?", "output": "What is the average Telecom value for fiscal year 2017 and 2018?" }, { "input": "In which year was Telecom value less than 300,000 thousand?", "output": "In what specific year did the value of Telecom drop below 300,000 thousand?" }, { "input": "What were the values of divestments and acquisitions in 2019, 2018, and 2017?", "output": "What was the (Divested)/acquired values in 2019, 2018 and 2017 respectively?" }, { "input": "In what specific year did the beginning balance fall below 5,000,000?", "output": "In which year was Balance — beginning of year less than 5,000 thousands?" 
}, { "input": "In what specific year did the beginning balance equate to less than 5,000 thousands (5 million)?", "output": "In which year was Balance — beginning of year less than 5,000 thousands?" }, { "input": "What was the difference in the amount of expenses allocated to provisions between the years 2017 and 2018?", "output": "What was the change in the Provisions/(expense) from 2017 to 2018?" }, { "input": "What was the difference in the Provisions/(expense) between 2017 and 2018?", "output": "What was the change in the Provisions/(expense) from 2017 to 2018?" }, { "input": "What was the amount of charge-offs recorded in the year 2019?", "output": "What was the charge-offs in 2019?" }, { "input": "\"What was the total amount of charge-offs incurred in 2019?\"", "output": "What was the charge-offs in 2019?" }, { "input": "\"What is the average net Provision for the years 2017, 2018, and 2019?\"", "output": "What was the average Provision, net for 2017-2019?" }, { "input": "What was the average net provision for the years 2017, 2018, and 2019 combined?", "output": "What was the average Provision, net for 2017-2019?" }, { "input": "What factors determine the value that is obtained upon vesting?", "output": "What is the value realized on vesting based on?" }, { "input": "What were the specific equity awards held by each named executive officer throughout the year 2019?", "output": "What were the equity awards held by named executives during 2019?" }, { "input": "What were the equity awards granted to the named executives in 2019?", "output": "What were the equity awards held by named executives during 2019?" }, { "input": "What is the significance of the number of shares obtained upon vesting?", "output": "What does the number of shares acquired on vesting represent?" }, { "input": "Where was the reported combined net loss of $82 million disclosed or published?", "output": "Where was the combined net loss of $82 million reported in?" 
}, { "input": "What was the percentage difference in the Adjusted EBITDA margin from 2017 to 2019?", "output": "What is the total change in the Adjusted EBITDA margin between 2019 and 2017?" }, { "input": "What specific expenses are included in the integration and transformation costs that have an impact on the Adjusted EBITDA?", "output": "What does the integration and transformation costs impacting Adjusted EBITDA include?" }, { "input": "What is the change in total special items impacting adjusted EBITDA between 2018 and 2019?", "output": "What was the difference in the total number of special items affecting adjusted EBITDA from 2018 to 2019?" }, { "input": "What is the change in total special items impacting adjusted EBITDA between 2018 and 2019?", "output": "How did the total special items affecting adjusted EBITDA change from 2018 to 2019?" }, { "input": "What is the percentage change in total special items impacting adjusted EBITDA between 2018 and 2019?", "output": "What is the percentage difference in the amount of special items affecting adjusted EBITDA from 2018 to 2019?" }, { "input": "What is the percentage change in total special items impacting adjusted EBITDA between 2018 and 2019?", "output": "What is the percentage difference in the total number of special items affecting adjusted EBITDA from 2018 to 2019?" }, { "input": "How is the amortization process for deferred acquisition and fulfillment costs conducted?", "output": "How are the deferred acquisition and fulfillment costs amortized?" }, { "input": "What is the percentage change for the end of period balance for Fulfillment Costs when comparing the beginning of period balance?", "output": "What is the percentage change in the end of period balance for Fulfillment Costs when comparing it with the beginning of period balance?" 
}, { "input": "What was the total value of options that were available to be exercised and outstanding as of December 31, 2019?", "output": "What was the aggregate intrinsic value of options outstanding and exercisable at December 31, 2019? " }, { "input": "Which year has a higher weighted-average exercise price for options outstanding and exercisable?", "output": "Which year exhibits a greater weighted-average exercise price for options that are currently outstanding and available for exercise?" }, { "input": "What does the deferred income tax benefit (expense) under post-retirement benefit plans currently recognize? ", "output": "What recognition does the deferred income tax benefit (expense) currently provide for under post-retirement benefit plans?" }, { "input": "What is the percentage increase or decrease in the deferred income tax benefit or expense for post-retirement benefit plans between 2017 and 2018?", "output": "What is the percentage change in the deferred income tax benefit (expense) for post-retirement benefit plans in 2018 from 2017?" }, { "input": "What was the percentage difference in the deferred income tax benefit (expense) for post-retirement benefit plans between 2018 and 2017?", "output": "What is the percentage change in the deferred income tax benefit (expense) for post-retirement benefit plans in 2018 from 2017?" }, { "input": "How is the Adjusted EBITDA Run Rate determined?", "output": "How is the Adjusted EBITDA Run Rate calculated and what factors are considered in determining it?" }, { "input": "What are the different performance levels?", "output": "What are the various levels of performance and can you provide more details about each level?" }, { "input": "How many levels of performance would result in a payout exceeding 50% of the target award?", "output": "How many performance levels would the payout as % of target award be above 50%?" 
}, { "input": "How many performance levels have a payout as a percentage of the target award that exceeds 50%?", "output": "How many performance levels would the payout as % of target award be above 50%?" }, { "input": "What is the main factor or aspect that goodwill is specifically connected or associated with?", "output": "What is goodwill attributable to?" }, { "input": "What is property, plant and equipment expressed as a ratio of the total estimated aggregate consideration under the balance as of December 31, 2017?", "output": "What is the ratio of property, plant, and equipment to the total estimated aggregate consideration shown in the balance as of December 31, 2017?" }, { "input": "What is property, plant and equipment expressed as a ratio of the total estimated aggregate consideration under the balance as of December 31, 2017?", "output": "What is the ratio of property, plant and equipment to the total estimated aggregate consideration on the balance sheet as of December 31, 2017?" }, { "input": "What is the percentage change in other non current assets in 2018?", "output": "What was the percentage change in other non-current assets during the year 2018?" }, { "input": "Under what conditions would officers be entitled to the years of welfare benefits?", "output": "When would officers have the right to receive welfare benefits for a prolonged period of time?" }, { "input": "Which types of officers are listed in the table?", "output": "\"What types of officers are included in the table displayed?\"" }, { "input": "Which type of officer has the longest protected period?", "output": "Which rank of officer within a specific profession or organization is entitled to the longest period of protection?" }, { "input": "Which type of officer has the longest protected period?", "output": "Which type of officer is granted the longest period of protection against any form of termination or dismissal?" 
}, { "input": "What was the gross debt amount in the year 2019?", "output": "What is the gross debt in 2019?" }, { "input": "What was the difference in the total amount of debt in 2019 compared to 2018?", "output": "What is the change in gross debt in 2019 from 2018?" }, { "input": "What is the percentage change in gross debt in 2019 from 2018?", "output": "What was the percentage increase/decrease in the gross debt from 2018 to 2019?" }, { "input": "What is the percentage change in gross debt in 2019 from 2018?", "output": "What is the percentage difference in the amount of gross debt from 2018 to 2019?" }, { "input": "On what basis are the estimated benefit payments made, considering the underlying assumptions?", "output": "What assumptions are the estimated benefit payments based on?" }, { "input": "How many separate periods are highlighted in the table?", "output": "How many different period segments are highlighted in the table?" }, { "input": "How many distinct period segments are specifically emphasized in the provided table?", "output": "How many different period segments are highlighted in the table?" }, { "input": "What is the percentage change in combined pension plan in 2021 from 2020?", "output": "What is the percentage difference in the combined pension plan between 2021 and 2020?" }, { "input": "What was the extent or percentage of the business that falls under the category of Operations and Other that was sold?", "output": "What portion of business under Operations and Other was sold?" }, { "input": "What is the total segment revenue in 2019?", "output": "What is the exact amount of revenue generated by each segment in 2019 and what is the sum of these revenues overall?" }, { "input": "What components are under operating revenue?", "output": "What specific components contribute to the total operating revenue of a company or organization?" 
}, { "input": "What additional information does Note 17 provide?", "output": "What is the additional information provided in Note 17?" }, { "input": "What additional information does Note 17 provide?", "output": "What is the content of Note 17 and how does it provide additional information?" }, { "input": "\"What was the exact sum of adjusted EBITDA for the entire year of 2019?\"", "output": "What is the total adjusted EBITDA in 2019?" }, { "input": "What is the change in the adjusted EBITDA under Wholesale in 2019 from 2018?", "output": "What was the difference in adjusted EBITDA for the Wholesale category between 2019 and 2018?" }, { "input": "What is the average total adjusted EBITDA over the three years?", "output": "What is the mean value of the total adjusted EBITDA for the three-year period?" }, { "input": "How many executives among the present ones in the organization receive a short-term incentive bonus that exceeds $500,000 in amount?", "output": "How many current executives have a STI bonus amount greater than $500,000?" }, { "input": "How many executives presently receive a Short-Term Incentive (STI) bonus exceeding $500,000?", "output": "How many current executives have a STI bonus amount greater than $500,000?" }, { "input": "What is the salary earned by Indraneel Dev in 2019, and how does it compare to his/her STI bonus amount, expressed as a ratio?", "output": "What is Indraneel Dev's salary earned during 2019 expressed as a ratio of his/her STI bonus amount?" }, { "input": "What is included in the category of 'Support assets'? Please provide a detailed explanation of the various components that constitute these assets.", "output": "What do 'Support assets' consist of?" }, { "input": "What is the specific figure for Reported Balances' operating revenue?", "output": "What is the Reported Balances operating revenue?" 
}, { "input": "What was the variation in the deferred income tax benefit (expense) related to pension plans between 2019 and 2018?", "output": "What is the change in the deferred income tax benefit (expense) for pension plans in 2019 from 2018?" }, { "input": "What is the percentage change in the deferred income tax benefit (expense) for pension plans in 2019 from 2018?", "output": "What is the exact percentage difference in the deferred income tax benefit or expense for pension plans between the years 2018 and 2019?" }, { "input": "What was the outcome of the continuous efforts made to downsize the company's workforce?", "output": "What was recognized as a result of ongoing efforts to reduce the company's workforce?" }, { "input": "What is the average amount of service costs across 2017, 2018 and 2019?", "output": "What is the mean service cost from 2017 to 2019?" }, { "input": "What was the total amount of money owed by customers that had not been received as of December 31, 2019?", "output": "What was the gross customer receivables at December 31, 2019?" }, { "input": "What was the total amount of customer receivables as recorded on the balance sheet at the end of December 31, 2019, including all outstanding invoices and amounts owed to the company by its customers?", "output": "What was the gross customer receivables at December 31, 2019?" }, { "input": "What is included in the total value of long-term debt?", "output": "What does total long-term debt include?" }, { "input": "What is included in the total amount of long-term debt?", "output": "What does total long-term debt include?" }, { "input": "What was the approach or method adopted in 2019 to determine the total assets of an entity?", "output": "Regarding the total assets, what was adopted in 2019?" }, { "input": "How many different dates declared had a dividend per share of $0.250 in 2018 and 2019?", "output": "In 2018 and 2019, how many dates were declared with a dividend per share of $0.250?" 
}, { "input": "\"What specific categories of expenses were documented in the records for the year 2018?\"", "output": "What types of expenses were recorded during 2018?" }, { "input": "What were the recorded expenses in 2018 and what categories do they fall under?", "output": "What types of expenses were recorded during 2018?" }, { "input": "\"What is the total amount of non-cash goodwill impairment charges that are not tax-deductible, for the years 2019 and 2018?\"", "output": "What is the sum of non-cash, non-tax-deductible goodwill impairment charges for 2019 and 2018?" }, { "input": "How much were the cumulative acquisition-related expenses incurred specifically by Level 3 in the year 2019?", "output": "What was the amount of cumulative acquisition-related expenses incurred for Level 3 in 2019?" }, { "input": "What was the total amount of expenses related to acquisitions that Level 3 incurred in the year 2019? Please provide the specific cumulative value.", "output": "What was the amount of cumulative acquisition-related expenses incurred for Level 3 in 2019?" }, { "input": "In which year did the company record the smallest total expenditure associated with acquisitions?", "output": "Which year incurred the lowest amount of total acquisition-related expenses?" }, { "input": "In which specific year did the company experience the least amount of expenses related to acquisition?", "output": "Which year incurred the lowest amount of total acquisition-related expenses?" }, { "input": "Against what were the estimates offset for the years 2019, 2018, and 2017?", "output": "What are the 2019, 2018 and 2017 change in estimates offset against?" }, { "input": "What factors were the changes in estimates for 2019, 2018, and 2017 set off against?", "output": "What are the 2019, 2018 and 2017 change in estimates offset against?" }, { "input": "In what year was the highest accretion expense recorded?", "output": "Which year has the largest accretion expense?" 
}, { "input": "What is the year with the highest accretion expense?", "output": "Which year has the largest accretion expense?" }, { "input": "What is the average accretion expense across 2017, 2018 and 2019?", "output": "What is the average expense for accretion incurred during the years 2017, 2018, and 2019?" }, { "input": "Where is the recording location for the current portion of the post-retirement benefit obligations?", "output": "Where is the current portion of the post-retirement benefit obligations recorded?" }, { "input": "In which specific year is the fair value of plan assets under post-retirement benefit plans higher than in other years?", "output": "In which year is the fair value of plan assets under post-retirement benefit plans higher?" }, { "input": "\"What were the depreciation and amortization figures for the fiscal year 2019?\"", "output": "What is the Depreciation and amortization for 2019?" }, { "input": "What is the total adjusted EBITDA for each segment in 2019?", "output": "What is the total segment adjusted EBITDA in 2019?" }, { "input": "What is the total amount of income tax expense (benefit) recorded in the financial statements for the years 2018 and 2019?", "output": "What is the sum of income tax expense (benefit) in 2018 and 2019?" }, { "input": "What is the percentage change in income tax expense (benefit) in 2019 from 2018?", "output": "What is the percentage difference in income tax expense (benefit) between 2018 and 2019?" }, { "input": "What was the total amount of interest expenses incurred in the year 2019?", "output": "What is the total interest expense in 2019?" }, { "input": "What was the total amount of interest paid in expenses during the year 2019?", "output": "What is the total interest expense in 2019?" }, { "input": "What segments of interest expense are presented in the table?", "output": "Which specific segments of interest expense are displayed in the table provided?" 
}, { "input": "What is the specific monetary difference in the total interest expense for the year 2019 compared to 2018?", "output": "What is the change in the gross interest expense in 2019 from 2018?" }, { "input": "What was the difference in the amount of money spent on interest expenses in 2019 compared to 2018?", "output": "What is the change in the gross interest expense in 2019 from 2018?" }, { "input": "What is the average amount of interest paid over the years 2017, 2018, and 2019?", "output": "What is the average total interest expense for 2017 to 2019?" }, { "input": "\"What is the average annual value of unvested restricted stock awards that were excluded from antidilutive considerations in the years 2017, 2018, and 2019?\"", "output": "What is the average annual amount of unvested restricted stock awards that are antidilutive excluded in 2017, 2018 and 2019?" }, { "input": "What were the respective average annual amounts of unvested restricted stock awards that were excluded due to being antidilutive in 2017, 2018, and 2019?", "output": "What is the average annual amount of unvested restricted stock awards that are antidilutive excluded in 2017, 2018 and 2019?" }, { "input": "Which year has a larger amount of contract acquisition costs?", "output": "In which specific year do the contract acquisition costs exhibit a greater magnitude or quantity compared to other years?" }, { "input": "What is the change in contract assets in 2019?", "output": "What is the net increase or decrease in contract assets during the year 2019?" }, { "input": "What percentage of Level 3 goodwill is specifically allocated to the consumer segment?", "output": "How much of Level 3 goodwill is allocated to consumer?" }, { "input": "What is the ratio of goodwill to the total amount as of December 31, 2018, in the context of Business?", "output": "What is the amount of goodwill under Business as a ratio of the Total amount as of December 31, 2018?" 
}, { "input": "As of December 31, 2018, what is the ratio of goodwill to the total amount in Business?", "output": "What is the amount of goodwill under Business as a ratio of the Total amount as of December 31, 2018?" }, { "input": "What is the combined goodwill figure for the years 2017 and 2018?", "output": "What is the sum of the total amount of goodwill for 2017 and 2018?" }, { "input": "What is the combined value of goodwill for both 2017 and 2018?", "output": "What is the sum of the total amount of goodwill for 2017 and 2018?" }, { "input": "\"What specific items are encompassed within the current liabilities section of the consolidated balance sheets?\"", "output": "What do the current liabilities reflected in the consolidated balance sheets include?" }, { "input": "What specific items are included under current liabilities in the consolidated balance sheets?", "output": "What do the current liabilities reflected in the consolidated balance sheets include?" }, { "input": "What items are typically categorized as Other current liabilities in financial statements?", "output": "What are the items included under Other current liabilities?" }, { "input": "In which specific year do capital expenditures account for a greater value of accounts payable?", "output": "Which year has a larger amount of accounts payable associated with capital expenditures?" }, { "input": "Which specific year indicates a higher level of accounts payable specifically linked to capital expenditures?", "output": "Which year has a larger amount of accounts payable associated with capital expenditures?" }, { "input": "What distinguishes the total grant values of Indraneel Dev and Stacey W. Goff?", "output": "What is the difference between Indraneel Dev and Stacey W. Goff's total grant values?" }, { "input": "What is the disparity in total grant amounts between Indraneel Dev and Stacey W. Goff?", "output": "What is the difference between Indraneel Dev and Stacey W. Goff's total grant values?" 
}, { "input": "What is the monthly average for the number of shares that are typically withheld for tax purposes?", "output": "What is the average number of shares withheld for taxes per month?" }, { "input": "\"What is the monthly average amount of shares that are typically withheld for tax purposes?\"", "output": "What is the average number of shares withheld for taxes per month?" }, { "input": "What is the total amount of minimum lease payments expected for non-cancellable operating leases in 2019?", "output": "How much is the 2019 total future minimum lease payments under non-cancellable operating leases?" }, { "input": "What was the percentage change in future minimum lease payments due within one year between 2018 and 2019?", "output": "How much did future minimum lease payments due within one year change by between 2018 and 2019?" }, { "input": "Which year, 2018 or 2019, had a higher value of capital?", "output": "Between 2018 and 2019, which year had a greater amount of capital?" }, { "input": "In terms of owned assets, which year, 2018 or 2019, had a higher overall value or quantity of assets?", "output": "Between 2018 and 2019, which year had a greater amount of owned assets?" }, { "input": "Which year, 2018 or 2019, had a higher total value of assets owned?", "output": "Between 2018 and 2019, which year had a greater amount of owned assets?" }, { "input": "What were the average values of leased assets for the years 2018 and 2019?", "output": "What is the average leased assets for 2018 and 2019?" }, { "input": "What was the average value of leased assets for the years 2018 and 2019?", "output": "What is the average leased assets for 2018 and 2019?" }, { "input": "What is the average trade receivables included within non-current assets?", "output": "What is the average amount of trade receivables included within the category of non-current assets?" 
}, { "input": "What is the average prepayments included in non-current assets?", "output": "What is the average amount of prepayments that are included in non-current assets?" }, { "input": "What financial items does guidance basis comprise of?", "output": "What specific components does the guidance basis in financial terms consist of?" }, { "input": "Which year, either 2018 or 2019, had a higher adjusted EBITDA on a guidance basis?", "output": "Between 2018 and 2019, which year had higher adjusted EBITDA, guidance basis?" }, { "input": "Which year, 2018 or 2019, had a higher adjusted EBITDA on a guidance basis? Please provide the comparison between the two specified years.", "output": "Between 2018 and 2019, which year had higher adjusted EBITDA, guidance basis?" }, { "input": "What is the 2019 average adjusted EBITDA, guidance basis?", "output": "\"What was the average adjusted EBITDA on a guidance basis for the year 2019?\"" }, { "input": "What is the average amount of cash and cash equivalents that are typically reported in the statement of cash flows?", "output": "What is the average cash and cash equivalents presented in the statement of cash flows?" }, { "input": "What is the average amount of cash and cash equivalents reported in the statement of cash flows?", "output": "What is the average cash and cash equivalents presented in the statement of cash flows?" }, { "input": "What are the different types of financial items that are included in the data of a consolidated income statement?", "output": "What financial items does the consolidated income statement data comprise of?" }, { "input": "\"What is the mean total income for the fiscal years 2018 and 2019?\"", "output": "What is the average revenue for 2018 and 2019?" }, { "input": "What were the average revenues for the years 2018 and 2019?", "output": "What is the average revenue for 2018 and 2019?" 
}, { "input": "What was the average revenue in 2017 and 2018?", "output": "What is the average revenue for 2017 and 2018?" }, { "input": "\"What was the average revenue in both 2017 and 2018? Please provide the specific revenue figures for each year.\"", "output": "What is the average revenue for 2017 and 2018?" }, { "input": "\"What is the revenue change reported specifically for Europe?\"", "output": "What is the reported change in revenue - europe?" }, { "input": "What is the reported change in revenue for Europe? How has the revenue in Europe been affected according to the reports?", "output": "What is the reported change in revenue - europe?" }, { "input": "\"Which country, Germany or Italy, has experienced a greater overall growth in the organic sector over a specified period of time?\"", "output": "Between Germany and Italy, which one has a higher organic change?" }, { "input": "What does cash consideration paid comprise of?", "output": "What is included in the cash consideration paid?" }, { "input": "\"In the table, what is the timeframe for the financial years whose information is displayed?\"", "output": "Which financial years' information is shown in the table?" }, { "input": "What is the total value of acquisitions made in 2019 throughout the year?", "output": "How much is the 2019 acquisitions during the year ?" }, { "input": "What was the total value of acquisitions made in 2019 throughout the year?", "output": "How much is the 2019 acquisitions during the year ?" }, { "input": "Which year, 2018 or 2019, witnessed a higher number of acquisitions throughout the entire year?", "output": "Between 2018 and 2019, which year had a greater amount of acquisitions during the year?" }, { "input": "What are the total operating lease commitments?", "output": "What is the sum of all the commitments related to operating leases?" }, { "input": "What are the total operating lease commitments?", "output": "What is the sum of all commitments made for operating leases?" 
}, { "input": "What is the total outstanding amount of financial liabilities that need to be paid within the shortest timeframe?", "output": "How much financial liabilities are due for payment soonest?" }, { "input": "What is the proportion of financial liabilities in relation to the overall sum of contractual obligations and commitments?", "output": "What percentage of total contractual obligations and commitments is the financial liabilities?" }, { "input": "\"What is the average amount of revenue generated from services according to IAS 18 for the years 2018 and 2019?\"", "output": "What is the average service revenue between 2018 and 2019 IAS 18?" }, { "input": "What is the difference between average service revenue and average other revenue?", "output": "What distinguishes average service revenue from average other revenue?" }, { "input": "Which specific financial years are being represented in the provided table?", "output": "Which financial years' information is shown in the table?" }, { "input": "What type of short-term investments are shown in the table?", "output": "\"What specific types of short-term investments are displayed in the provided table?\"" }, { "input": "What is the percentage change in the value of managed investment funds from 2018 to 2019?", "output": "What is the change in managed investment funds between 2018 and 2019?" }, { "input": "What is the difference in the value of managed investment funds from 2018 to 2019?", "output": "What is the change in managed investment funds between 2018 and 2019?" }, { "input": "What type of long-term borrowings are shown in the table?", "output": "What are the specific types of long-term borrowings that are presented in the table?" }, { "input": "What type of long-term borrowings are shown in the table?", "output": "What is the specific category of long-term borrowings that is displayed in the table?" 
}, { "input": "What is the percentage change in the number of shares acquired from February to March 2019?", "output": "What is the percentage change between shares purchased in February and March 2019?" }, { "input": "Which financial years' information is shown in the table?", "output": "Which fiscal years' data is displayed in the given table?" }, { "input": "Which financial years' information is shown in the table?", "output": "Which specific financial year's information is depicted in the table?" }, { "input": "How much is the 2019 audit fees ?", "output": "What is the exact cost of the audit fees for the year 2019?" }, { "input": "What were the average fees for audits in 2018 and 2019?", "output": "What is the average audit fees for 2018 and 2019?" }, { "input": "What were the average audit fees in both 2018 and 2019?", "output": "What is the average audit fees for 2018 and 2019?" }, { "input": "\"What is the average total fees for the years 2018 and 2019 combined?\"", "output": "What is the average total fees for 2018 and 2019?" }, { "input": "What is the average sum of fees for the years 2018 and 2019 combined?", "output": "What is the average total fees for 2018 and 2019?" }, { "input": "What is the discrepancy between the average fees charged for audits and the average total fees incurred in 2018 and 2019?", "output": "What is the difference between the average audit fees and the average total fees for 2018 and 2019?" }, { "input": "What is the disparity between the mean charges for audits and the mean overall charges, comparing the years 2018 and 2019?", "output": "What is the difference between the average audit fees and the average total fees for 2018 and 2019?" }, { "input": "What is the amount of the non-current liabilities' 2018 other payables?", "output": "How much is the 2018 other payables included within non-current liabilities?" 
}, { "input": "What is the value of the 2019 other payables included in current liabilities, excluding the amount payable for the share buyback?", "output": "How much is the 2019 other payables included within current liabilities excluding the amount payable in relation to the share buyback?" }, { "input": "What is the value of the 2019 other payables included in the current liabilities, except for the amount payable related to the share buyback?", "output": "How much is the 2019 other payables included within current liabilities excluding the amount payable in relation to the share buyback?" }, { "input": "What is the total amount of money that is owed to subsidiaries within one year as of 2019?", "output": "What is the 2019 amounts owed to subsidiaries within one year?" }, { "input": "Which year experiences a greater number of bank loans and other loans that are due within one year?", "output": "Which year has a higher amount of bank loans and other loans under amounts falling due within one year?" }, { "input": "How much is 2019 cash at bank and in hand ?", "output": "What is the total amount of cash that was available in the bank and on hand in 2019?" }, { "input": "Between 2018 and 2019, which year had a greater amount of money market funds?", "output": "Which year, 2018 or 2019, had a higher value of money market funds?" }, { "input": "Between 2018 and 2019, which year had a greater amount of money market funds?", "output": "Which year had a higher volume of funds invested in money market funds: 2018 or 2019?" }, { "input": "Which year, 2018 or 2019, witnessed a higher quantity of government securities being issued?", "output": "Between 2018 and 2019, which year had a greater amount of government securities?" }, { "input": "Between 2018 and 2019, which year had more ordinary shares allotted?", "output": "In the period spanning from 2018 to 2019, which year witnessed a higher allocation of ordinary shares?" 
}, { "input": "What is the current cost of servicing for the year 2019?", "output": "How much is the 2019 current service cost?" }, { "input": "What specific types of investments that are no longer held are displayed in the table?", "output": "What type of non-current investments are shown in the table?" }, { "input": "In the table, which types of investments that are not currently being utilized or held are depicted?", "output": "What type of non-current investments are shown in the table?" }, { "input": "How much is the 2018 debt securities?", "output": "What is the value or price of the debt securities issued in 2018?" }, { "input": "How much is the 2018 debt securities?", "output": "What is the cost of the 2018 debt securities?" }, { "input": "Which financial years' information is shown in the table?", "output": "Which financial years are displayed in the provided table?" }, { "input": "What is the total projected income and expenditure for the year 2018?", "output": "How much is the 2018 aggregated expected income expense ?" }, { "input": "What is the projected total income expense for 2018 after aggregating all relevant information?", "output": "How much is the 2018 aggregated expected income expense ?" }, { "input": "What is the 2018 deferred tax on overseas earnings, excluding the 15€m charge relating to the combination of Vodafone India with Idea Cellular?", "output": "What is the amount of deferred tax on overseas earnings in 2018, excluding the charge of 15€m related to the merger of Vodafone India and Idea Cellular?" }, { "input": "What is the difference in average income tax expense between the years 2017-2018 and 2018-2019?", "output": "What is the change between 2017-2018 and 2018-2019 average income tax expense?" }, { "input": "How much is the excluded depreciation and amortisation for 2019 adjusted EBITDA?", "output": "What is the amount of depreciation and amortization that is excluded from the adjusted EBITDA calculation for the year 2019?" 
}, { "input": "How much is the excluded depreciation and amortisation for 2018 adjusted EBITDA?", "output": "What is the total amount of depreciation and amortization that were not included in calculating the adjusted EBITDA for the year 2018?" }, { "input": "\"What is the exact amount of the adjusted profit before tax for the year 2018?\"", "output": "How much is the adjusted profit before tax in 2018?" }, { "input": "Which specific financial year's information is displayed in the provided table?", "output": "Which financial years' information is shown in the table?" }, { "input": "What is the distinction between the average net book value and the average total costs of shares in Group undertakings as of March 31, 2019? How can analyzing this difference assist in better understanding financial performance?", "output": "What is the difference between 2019 average net book value and 2019 average total costs of shares in Group undertakings as at 31 March?" }, { "input": "What is the significance of the recorded amount for developed technology?", "output": "What does the amount recorded for developed technology represent?" }, { "input": "What does the recorded amount for developed technology signify, and what is its purpose or significance?", "output": "What does the amount recorded for developed technology represent?" }, { "input": "What is the useful life for Developed technology?", "output": "What is the average duration of usefulness or lifespan for technology that has been fully developed and is in practical use?" }, { "input": "What is the difference in useful life between developed technology and customer relationships?", "output": "What distinguishes the lifespan of developed technology from that of customer relationships?" }, { "input": "What is the difference in fair value between developed technology and customer relationships?", "output": "What distinguishes the fair value of developed technology from that of customer relationships?" 
}, { "input": "What identifiable intangible assets have a useful life exceeding five years?", "output": "What are the identifiable intangible assets with a useful life above 5 years?" }, { "input": "What intangible assets can be identified and have a useful life of more than 5 years?", "output": "What are the identifiable intangible assets with a useful life above 5 years?" }, { "input": "What does the amount recorded for developed technology represent?", "output": "\"What is the meaning or significance of the recorded amount for developed technology?\"" }, { "input": "What is the useful life (in years) of developed technology?", "output": "What is the average duration, measured in years, of the lifespan of technology that has been developed?" }, { "input": "What is the difference in fair value between developed technology and customer relationships?", "output": "What distinguishes the fair value of developed technology from that of customer relationships, and how do they differ from each other?" }, { "input": "What are the intangible assets that have a fair value of above $20,000 thousands?", "output": "What are the intangible assets with a fair value exceeding $20,000 thousands? Please provide a comprehensive list of these assets." }, { "input": "What are the intangible assets that have a fair value of above $20,000 thousands?", "output": "What are the intangible assets with a fair value exceeding $20 million dollars?" }, { "input": "What is the distinction between the total amount invested in debt and equity, when considering amortized cost as the basis for evaluation?", "output": "What is the difference in the total debt investments and the total equity investments, based on amortized cost?" }, { "input": "What is the percentage composition of U.S. treasury securities in relation to the total debt investments when considering fair value?", "output": "Based on fair value, what is the percentage constitution of U.S. 
treasury securities among the total debt investments?" }, { "input": "What is the number of potentially anti-dilutive shares that were excluded for fiscal years ended June 30, 2017, 2018 and 2019 respectively? ", "output": "What was the excluded number of potentially anti-dilutive shares for the fiscal years ending June 30, 2017, 2018, and 2019?" }, { "input": "What is the number of potentially anti-dilutive shares that were excluded for fiscal years ended June 30, 2017, 2018 and 2019 respectively? ", "output": "What is the total count of potentially anti-dilutive shares excluded for the fiscal years ending on June 30, 2017, 2018, and 2019, respectively?" }, { "input": "What is the variance in the weighted-average ordinary shares outstanding for basic and diluted calculations in the fiscal year that ended on June 30, 2019?", "output": "In fiscal year ended June 30, 2019, what is the difference in the weighted-average ordinary shares outstanding between the basic and the diluted?" }, { "input": "What is the average total non-current operating assets for fiscal years ended June 30, 2018 and 2019?", "output": "\"What is the average total value of non-current operating assets for the fiscal years that ended on June 30, 2018 and June 30, 2019?\"" }, { "input": "How many geographic regions had non-current operating assets exceeding $10,000 thousand in the fiscal year ending on June 30, 2019?", "output": "In fiscal year ended June 30, 2019, how many geographic regions have non-current operating assets of more than $10,000 thousand?" }, { "input": "What percentage of the total non-current operating assets owned by the Group in the United States constituted the non-current operating assets in the fiscal year ending June 30, 2018?", "output": "In fiscal year ended June 30, 2018, what is the percentage constitution of the non-current operating assets in the United States among the total non-current operating assets owned by the Group?" 
}, { "input": "\"What specific adjustments were made to figures in the year 2018?\"", "output": "What was the adjustment done to figures in 2018?" }, { "input": "How much did the value of security deposits change from fiscal year 2018 to 2019?", "output": "What is the change in value between security deposits between fiscal years 2018 and 2019?" }, { "input": "What is the Balance at the beginning of 2019?", "output": "What is the balance amount in the accounts at the start of the year 2019?" }, { "input": "What is the average deferred tax expense for fiscal years 2018 and 2019?", "output": "What is the average amount of deferred tax expense incurred during fiscal years 2018 and 2019?" }, { "input": "What is the percentage change of deferred tax expenses between fiscal year 2018 to 2019?", "output": "What is the percentage difference in deferred tax expenses between fiscal year 2018 and fiscal year 2019?" }, { "input": "What is the amount recorded for short-term investments as of June 30, 2018, on the consolidated statements of financial position for the Group?", "output": "As of June 30, 2018, what is the value of short-term investments on the Group's consolidated statements of financial position?" }, { "input": "Based on fair value, what is the difference in value between U.S. treasury securities and agency securities?", "output": "What is the quantifiable difference between U.S. treasury securities and agency securities in terms of their fair market value?" }, { "input": "What are the specific revenue figures for the United Kingdom for the fiscal years that concluded in 2017, 2018, and 2019?", "output": "What are the revenue amounts from the United Kingdom for fiscal years ended 2017, 2018 and 2019 respectively?" }, { "input": "What is the difference in the revenues from Asia Pacific between fiscal years ended 2018 and 2019?", "output": "How does the revenue for the Asia Pacific region differ between the fiscal years ending in 2018 and 2019?" 
}, { "input": "What is the balance of capped call as of June 30, 2019?", "output": "What is the specific capped call balance as of June 30, 2019?" }, { "input": "What is the difference in the capped call balance between the fiscal year ending on June 30, 2018, and the fiscal year ending on June 30, 2019?", "output": "What is the change in the balance of capped call between fiscal year ended June 30, 2018 and 2019?" }, { "input": "What is the difference in balance of capped call and non-marketable investments as of June 30, 2019?", "output": "What is the difference in the balance of capped call and non-marketable investments as of June 30, 2019, and how do these two types of investments vary in terms of their marketability?" }, { "input": "What is the difference in balance of capped call and non-marketable investments as of June 30, 2019?", "output": "What is the balance difference between capped call and non-marketable investments as of June 30, 2019?" }, { "input": "\"What is the chronological order of the share capital amounts from June 30, 2015 to 2019?\"", "output": "What are the share capital for the years as of June 30, 2015 to 2019 in chronological order?" }, { "input": "List the short-term investments from June 30, 2015 to 2019 in chronological order.", "output": "What are the short-term investments for the years as of June 30, 2015 to 2019 in chronological order?" }, { "input": "What is the difference in the value of short-term investments between fiscal years 2018 and 2019?", "output": "What is the variance in the monetary worth of short-term investments from fiscal year 2018 to fiscal year 2019?" }, { "input": "What is the average Cash and cash equivalents for 2015-2019?", "output": "What is the average amount of Cash and cash equivalents from 2015 to 2019?" }, { "input": "What is the definition of other fees?", "output": "What does the term \"other fees\" mean? 
Please provide a clear definition of this term, including any relevant details, to facilitate a more accurate response." }, { "input": "What is the average amount of taxes paid in fees for the fiscal years 2018 and 2019?", "output": "What is the average tax fees for fiscal years 2018 and 2019?" }, { "input": "What is the disparity in the aggregate of obligations between operating leases and other types of contractual commitments?", "output": "What is the difference in the total commitments between that of operating leases and other contractual commitments?" }, { "input": "What percentage of the total commitments for operating leases for the year ending in 2020 is constituted by the commitments specifically for operating leases?", "output": "What is the percentage constitution of the commitments for operating leases for year ending 2020 among the total commitments for operating leases?" }, { "input": "What portion of the total commitments for operating leases for the year ending 2020 represents the percentage constitution?", "output": "What is the percentage constitution of the commitments for operating leases for year ending 2020 among the total commitments for operating leases?" }, { "input": "What was the percentage increase in maintenance revenues from the fiscal year ending in 2018 to the fiscal year ending in 2019?", "output": "What was the increase of maintenance revenues from fiscal year ended 2018 to 2019?" }, { "input": "What was the exact percentage increase in perpetual license revenues from the fiscal year ended 2018 to the fiscal year ended 2019?", "output": "What was the increase of perpetual license revenues from fiscal year ended 2018 to 2019?" }, { "input": "What percentage of total revenue for the fiscal year ending in 2018 is made up of perpetual license sales?", "output": "For fiscal year ended 2018, what is the percentage constitution of perpetual license among the total revenue?" 
}, { "input": "What percentage of the total revenue for the fiscal year ended 2018 was generated from perpetual licenses?", "output": "For fiscal year ended 2018, what is the percentage constitution of perpetual license among the total revenue?" }, { "input": "\"What are the average maintenance revenues for the fiscal years ending in 2018 and 2019, respectively?\"", "output": "What is the average maintenance revenues for fiscal year ended 2018 and 2019?" }, { "input": "What is the specific provision for dilapidation expenses allocated for the fiscal year 2019?", "output": "What is the dilapidation provision for fiscal year 2019?" }, { "input": "What is the formula to calculate free cash flow, and can you further explain the steps involved in deriving this measure?", "output": "How is free cash flow calculated?" }, { "input": "What was the percentage increase in free cash flow from fiscal year 2018 to 2019?", "output": "What was the increase in free cash flow between fiscal year 2018 and 2019?" }, { "input": "What is the average net cash flow generated by operating activities for the years 2017, 2018, and 2019?", "output": "What is the average net cash provided by operating activities from 2017-2019?" }, { "input": "\"What is the average amount of net cash generated from operating activities for the years 2017, 2018, and 2019?\"", "output": "What is the average net cash provided by operating activities from 2017-2019?" }, { "input": "What is the percentage difference in the amount of available cash generated by the company between the years 2017 and 2018?", "output": "What is the percentage change in free cash flow between 2017 and 2018?" }, { "input": "What is the change in capital expenditures between 2018 and 2019?", "output": "What was the difference in capital expenditures from 2018 to 2019?" 
}, { "input": "What is the change in capital expenditures between 2018 and 2019?", "output": "What was the difference in the amount of money spent on capital expenditures from 2018 to 2019?" }, { "input": "As of June 30, 2019, what is the total value or monetary amount of outstanding payments owed by customers or business partners for goods or services provided?", "output": "As of June 30, 2019, what is the amount of trade receivables?" }, { "input": "How does the net cash provided by operating activities compare between the fiscal years ending in 2018 and 2019?", "output": "What is the difference in net cash provided by operating activities between fiscal year ended 2018 and 2019?" }, { "input": "What is the change in net cash provided by operating activities between the fiscal years ending in 2018 and 2019?", "output": "What is the difference in net cash provided by operating activities between fiscal year ended 2018 and 2019?" }, { "input": "What are the specific components included in the \"other current assets\" category in the table?", "output": "What are the components under other current assets in the table?" }, { "input": "What was the difference in the total value of other current assets between 2019 and 2018?", "output": "What was the change in total other current assets in 2019 from 2018?" }, { "input": "Which items in the table include the cumulative effect of the 2017 Tax Reform Act in their 2017 amounts?", "output": "For which items in the table does the amount for 2017 includes the sum related to the impact of the 2017 Tax Reform Act?" }, { "input": "Which items in the table include the sum related to the impact of the 2017 Tax Reform Act in their amount for 2017?", "output": "For which items in the table does the amount for 2017 includes the sum related to the impact of the 2017 Tax Reform Act?" 
}, { "input": "What was the year when the Diluted Net Income per Common Share reached its highest value?", "output": "In which year was the Diluted Net Income per Common Share largest?" }, { "input": "In which year did the exclusion of anti-dilutive shares from the calculation of diluted earnings per share reach its highest magnitude?", "output": "In which year was the Anti-dilutive shares excluded from the diluted earnings per share calculation largest?" }, { "input": "What was the year when the exclusion of anti-dilutive shares from the calculation of diluted earnings per share reached its highest value?", "output": "In which year was the Anti-dilutive shares excluded from the diluted earnings per share calculation largest?" }, { "input": "Where is Income tax benefit included?", "output": "Where is the income tax benefit typically included in financial statements or accounting records?" }, { "input": "Where is Income tax benefit included?", "output": "Where can income tax benefit be found in financial statements or reports?" }, { "input": "When is the calculation of income tax benefit typically performed annually?", "output": "In which years is income tax benefit calculated?" }, { "input": "What does restricted cash include?", "output": "What is included in restricted cash and can you provide further details about its components?" }, { "input": "In which year was Restricted cash included in \"Other current assets\" the lowest?", "output": "In which year did the inclusion of Restricted cash in the category of \"Other current assets\" reach its lowest level?" }, { "input": "What was the difference in the amount of Restricted cash included in the category of \"Deferred charges and other assets\" in the year 2017 compared to 2016?", "output": "What was the change in Restricted cash included in \"Deferred charges and other assets\" in 2017 from 2016?" 
}, { "input": "What is the largest recorded year for Net periodic benefit cost?", "output": "In which year was Net periodic benefit cost the largest?" }, { "input": "What was the difference in the amount of money spent on interest in 2019 compared to 2018?", "output": "What was the change in interest cost in 2019 from 2018?" }, { "input": "What was the difference in the cost of interest between 2019 and 2018?", "output": "What was the change in interest cost in 2019 from 2018?" }, { "input": "\"In which year did the amount of deferred revenue reach its highest value and why?\"", "output": "In which year was deferred revenue larger?" }, { "input": "During which year was the accumulated income received in advance, known as deferred revenue, higher?", "output": "In which year was deferred revenue larger?" }, { "input": "What was the specific dollar amount of the change in deferred revenue between 2019 and 2018?", "output": "What was the change in deferred revenue in 2019 from 2018?" }, { "input": "What was the percentage change in deferred revenue in 2019 from 2018?", "output": "What is the percentage increase or decrease in deferred revenue from 2018 to 2019?" }, { "input": "What was the change in Fair value of share awards vested in 2019 from 2018?", "output": "What was the variation in the fair value of share awards that were vested in the year 2019 compared to 2018?" }, { "input": "What was the change in Fair value of share awards vested in 2019 from 2018?", "output": "What was the difference in fair value of share awards that were granted in 2018 but vested in 2019?" }, { "input": "During which years was information regarding the credit agreements provided?", "output": "In which years is information related to the credit agreements provided?" }, { "input": "What is the least year in which the Interest expense reached its minimum amount?", "output": "In which year was the amount of Interest expense the smallest?" 
}, { "input": "When was the least amount of interest expense recorded?", "output": "In which year was the amount of Interest expense the smallest?" }, { "input": "What was the change in Interest expense in 2019 from 2018?", "output": "What was the difference in the amount of money spent on interest payments in 2019 compared to 2018?" }, { "input": "What was the percentage increase or decrease in the amount spent on interest in 2019 compared to 2018?", "output": "What was the percentage change in Interest expense in 2019 from 2018?" }, { "input": "What was the value of purchases that were accumulated or outstanding as of the end of 2019?", "output": "What was the amount of Accrued purchases in 2019?" }, { "input": "What was the specific amount of Accrued legal and professional fees incurred during the year 2018?", "output": "What was the amount of Accrued legal and professional fees in 2018?" }, { "input": "\"How much money was recorded for Accrued legal and professional fees during the year 2018?\"", "output": "What was the amount of Accrued legal and professional fees in 2018?" }, { "input": "What was the percentage difference in Accrued roadside assistance claim costs between 2019 and 2018?", "output": "What was the change in Accrued roadside assistance claim costs in 2019 from 2018?" }, { "input": "\"What was the specific amount charged for the service in the year 2019?\"", "output": "What was the service cost in 2019?" }, { "input": "\"What was the cost of the service provided in the year 2019?\"", "output": "What was the service cost in 2019?" }, { "input": "What was the total cost of interest expenses in the year 2018?", "output": "What was the Interest cost in 2018?" }, { "input": "What was the total amount spent on interest expenses in 2018?", "output": "What was the Interest cost in 2018?" }, { "input": "\"In which specific year did the cost of services exceed the cost of services in the other years?\"", "output": "In which year was service cost larger?" 
}, { "input": "Which year had a greater service cost compared to other years?", "output": "In which year was service cost larger?" }, { "input": "What was the difference in the amount spent on interest in 2019 compared to 2018?", "output": "What was the change in interest cost in 2019 from 2018?" }, { "input": "\"What was the gross revenue generated by overseas operations, excluding any income taxes, in the fiscal year 2018?\"", "output": "What was the Foreign income before income taxes in 2018?" }, { "input": "What was the total income from foreign sources in 2018, before any deductions for income taxes were made?", "output": "What was the Foreign income before income taxes in 2018?" }, { "input": "What are the components under income before income taxes?", "output": "What are the individual components that make up the category of income before income taxes?" }, { "input": "In which specific year, prior to the implementation of income taxes, did individuals and/or organizations experience the highest level of income?", "output": "In which year was income before income taxes the largest?" }, { "input": "What is the largest recorded income before income taxes, and in which specific year was it achieved?", "output": "In which year was income before income taxes the largest?" }, { "input": "In which specific year did the quantity of an item categorized as \"Other\" exceed the quantity of another item, considering all provided information?", "output": "In which year was the amount of Other larger?" }, { "input": "In which year did the quantity of a different item exceed the quantity of the rest?", "output": "In which year was the amount of Other larger?" }, { "input": "What was the year-over-year difference in accrued severance and associated expenses between 2018 and 2019?", "output": "What was the change in Accrued severance and related costs in 2019 from 2018?" 
}, { "input": "What was the total payment made by the Company in the year 2019?", "output": "How much has the Company paid in total in 2019?" }, { "input": "What was the total amount paid by the Company in 2019?", "output": "How much has the Company paid in total in 2019?" }, { "input": "When did the amount of non-cash impairment charges exceed previous years?", "output": "In which year was Non-cash impairment charges larger?" }, { "input": "Which year had a greater amount of non-cash impairment charges compared to other years?", "output": "In which year was Non-cash impairment charges larger?" }, { "input": "During which years is the fair value of restricted shares or restricted stock units (RSUs) vested determined?", "output": "In which years is the Fair value of restricted shares/RSUs vested calculated?" }, { "input": "What was the change in Number of restricted shares/ RSUs granted in 2019 from 2018?", "output": "What was the difference in the number of restricted shares/RSUs granted in 2019 compared to 2018?" }, { "input": "What was the change in Number of restricted shares/ RSUs granted in 2019 from 2018?", "output": "What was the difference in the number of restricted shares/ RSUs granted in 2019 compared to 2018?" }, { "input": "What was the percentage change in Number of restricted shares/ RSUs granted in 2019 from 2018?", "output": "What was the percentage change in the number of restricted shares/ RSUs granted from 2018 to 2019?" }, { "input": "As of July 1, 2019, what was the total value of stapled securities of NSR listed on the Australian Securities Exchange?", "output": "How much did NSR had stapled securities quoted on the Australian Securities Exchange as at 1 July 2019?" }, { "input": "As of July 1, 2019, what was the total value of NSR stapled securities quoted on the Australian Securities Exchange?", "output": "How much did NSR had stapled securities quoted on the Australian Securities Exchange as at 1 July 2019?" 
}, { "input": "What would happen if the price of a security dropped by 2.5% and how would this affect the overall market?", "output": "What would be the impact if security price decreased by 2.5%?" }, { "input": "What is the change in Other intangible assets Opening net book value from 2018 to 2019?", "output": "What is the difference in the net book value of Other intangible assets at the beginning of 2018 compared to the beginning of 2019?" }, { "input": "What is the change in Other intangible assets Opening net book value from 2018 to 2019?", "output": "What is the difference in the opening net book value of Other intangible assets between 2018 and 2019?" }, { "input": "How many distinct business segments were there in the years 2019 and 2018?", "output": "How many business segments were present in 2019 and 2018?" }, { "input": "\"What is the average revenue generated from New Zealand in both 2018 and 2019?\"", "output": "What is the average revenue from New Zealand for 2018 and 2019?" }, { "input": "What is the average revenue generated from New Zealand in both 2018 and 2019?", "output": "What is the average revenue from New Zealand for 2018 and 2019?" }, { "input": "In which specific year did the revenue from New Zealand amount to less than 10,000 thousands?", "output": "In which year was revenue from New Zealand under 10,000 thousands?" }, { "input": "In what specific year did the revenue generated from New Zealand fall below 10,000 thousands (in the same currency)?", "output": "In which year was revenue from New Zealand under 10,000 thousands?" }, { "input": "What was the total NLA in Brisbane?", "output": "What was the total number of Newly Listed Apartments (NLA) in Brisbane?" }, { "input": "What is the difference in the NLA between Sunshine Cost and Brisbane?", "output": "What are the NLA differences between the Sunshine Coast and Brisbane, and how do they vary from each other in terms of specific aspects?" 
}, { "input": "In which year is the Net investment hedge negative?", "output": "Which specific year does the Net investment hedge demonstrate a negative value?" }, { "input": "What is the change in the Taxation impact on revaluation from 2018 to 2019?", "output": "What is the difference in the taxation impact on revaluation between 2018 and 2019?" }, { "input": "What is the change in the Taxation impact on revaluation from 2018 to 2019?", "output": "What is the difference in the effect of taxation on revaluation between 2018 and 2019?" }, { "input": "What was the share of profit from associates representing NSRs in 2019 and 2018?", "output": "What were the percentages of profit from associates, specifically those representing NSRs, for the years 2019 and 2018?" }, { "input": "What is the difference in the percentage of profit attributed to associates between 2018 and 2019?", "output": "What is the change in the Share of profit from associates from 2018 to 2019?" }, { "input": "What is the difference in the percentage of profit allocated to associates between 2018 and 2019?", "output": "What is the change in the Share of profit from associates from 2018 to 2019?" }, { "input": "When was the year when the closing balance on June 30th was below 11,000 thousand units?", "output": "In which year was the Closing balance at 30 June less than 11,000 thousands?" }, { "input": "What is the year-over-year change in face value for New Zealand Dollar (NZD) interest rate swaps from 2018 to 2019?", "output": "What is the change in Interest rate swaps (NZD) at face value for Current interest rate swaps from 2018 to 2019?" }, { "input": "What was the percentage change in face value for current New Zealand Dollar (NZD) interest rate swaps from 2018 to 2019?", "output": "What is the change in Interest rate swaps (NZD) at face value for Current interest rate swaps from 2018 to 2019?" 
}, { "input": "What was the assets held for sale under Level 1, Level 2 and Level 3 in 2018?", "output": "In 2018, what were the assets classified as held for sale and valued at Level 1, Level 2, and Level 3 according to their respective classifications?" }, { "input": "What was the amount of senior secured term loan for fiscal years 2019 and 2018?", "output": "What is the value of senior secured term loan for fiscal years 2019 and 2018 respectively?" }, { "input": "What is the percentage shift in the amount of finance leases and other financing obligations from fiscal year 2018 to fiscal year 2019?", "output": "What is the percentage change in the finance leases and other financing obligations between fiscal years 2018 and 2019?" }, { "input": "What is the average finance leases and other financing obligations for fiscal years 2018 and 2019?", "output": "What is the mean value of finance leases and other financing obligations for fiscal years 2018 and 2019?" }, { "input": "What are the income tax expenses for the years 2019, 2018, and 2017?", "output": "What is the total current income tax expense for 2019, 2018 and 2017 respectively?" }, { "input": "What is the amount of deferred income tax expense (benefit) recorded for the years 2019, 2018, and 2017, respectively?", "output": "What is the Total deferred income tax expense (benefit) for 2019, 2018 and 2017 respectively?" }, { "input": "What is the average total income tax expense for the years 2017, 2018, and 2019?", "output": "What is the average Total income tax expense for 2019, 2018 and 2017?" }, { "input": "What is the average aggregate size of the facilities in Arizona and Washington?", "output": "What is the average total size of the facilities located in both Arizona and Washington?" }, { "input": "\"How many states in the United States have a total of more than 5 facilities?\"", "output": "How many states have more than 5 facilities?" 
}, { "input": "What percentage of the total contractual obligations does the total inventory purchase commitments represent?", "output": "What is the percentage constitution of total inventory purchase commitments among the total contractual obligations?" }, { "input": "What percentage of the total contractual obligations is constituted by the inventory purchase commitments in the overall inventory?", "output": "What is the percentage constitution of total inventory purchase commitments among the total contractual obligations?" }, { "input": "\"What specific component within a contractual obligation holds the highest overall value?\"", "output": "Which component of contractual obligation has the highest total value?" }, { "input": "At December 29, 2017, how many shares were still unvested?", "output": "What is the number of shares that are Unvested at December 29, 2017?" }, { "input": "What is the length of time that buildings are typically considered useful for?", "output": "What is the useful lives of buildings?" }, { "input": "What are the estimated useful lives or lifespans of machinery and equipment commonly used in various industries or sectors?", "output": "What is the useful lives of Machinery and equipment?" }, { "input": "What are the estimated useful lifespans of machinery and equipment? How long can machinery and equipment be expected to remain functional and productive?", "output": "What is the useful lives of Machinery and equipment?" }, { "input": "What is the average monetary worth of buildings in the years 2018 and 2019?", "output": "What is the average value of buildings for 2018 and 2019?" }, { "input": "What is the average monetary worth of buildings in the years 2018 and 2019 combined?", "output": "What is the average value of buildings for 2018 and 2019?" 
}, { "input": "What is the average adjusted net income from 2017-2019?", "output": "\"What is the average adjusted net income for the years 2017, 2018, and 2019?\"" }, { "input": "What were the values of receivables and inventory in the years 2019 and 2018, respectively?", "output": "What is the value of Receivables and inventory for 2019 and 2018 respectively?" }, { "input": "What is the value of Accrued expenses for 2019 and 2018 respectively?", "output": "What is the amount of Accrued expenses in 2019 and 2018?" }, { "input": "What is the value of Self-insurance reserves for 2019 and 2018 respectively?", "output": "What are the self-insurance reserve values for the years 2019 and 2018?" }, { "input": "What are the average values of receivables and inventory for the years 2019 and 2018?", "output": "What is the average value of Receivables and inventory for 2019 and 2018?" }, { "input": "What are the average values of Receivables and inventory for the years 2019 and 2018?", "output": "What is the average value of Receivables and inventory for 2019 and 2018?" }, { "input": "In which year did the Accrued expenses reach the highest level?", "output": "Which year has the highest Accrued expenses?" }, { "input": "In which specific year did the Accrued expenses reach their highest value?", "output": "Which year has the highest Accrued expenses?" }, { "input": "Which year recorded the highest Senior Secured Term Loan amount?", "output": "Which year has the highest Senior secured term loan?" }, { "input": "What information does the table present?", "output": "What data is being shown in the table and what details can be derived from it?" }, { "input": "What is the change in long-term portion of contingent earn-out liabilities for 2018 and 2019?", "output": "What is the difference in the long-term portion of contingent earn-out liabilities between 2018 and 2019?" }, { "input": "What were the statutory U.S. 
Federal tax rates for the years 2019, 2018, and 2017?", "output": "What is the Statutory U.S. Federal tax for 2019, 2018 and 2017 respectively?" }, { "input": "What are the respective statutory federal tax rates for the years 2019, 2018, and 2017 in the United States?", "output": "What is the Statutory U.S. Federal tax for 2019, 2018 and 2017 respectively?" }, { "input": "How much did the valuation allowance change in 2017?", "output": "What is the change in valuation allowance for 2017?" }, { "input": "What was the difference in valuation allowance between 2016 and 2017?", "output": "What is the change in valuation allowance for 2017?" }, { "input": "What has been the average amount of Statutory U.S. Federal tax paid annually for the years 2017, 2018, and 2019?", "output": "What is the average Statutory U.S. Federal tax from 2017-2019?" }, { "input": "What was the average statutory U.S. federal tax rate for the years 2017-2019?", "output": "What is the average Statutory U.S. Federal tax from 2017-2019?" }, { "input": "What is the Working capital, excluding cash and cash equivalents for fiscal years 2019, 2018 and 2017 respectively?", "output": "What is the year-end working capital, excluding cash and cash equivalents, for the fiscal years 2019, 2018, and 2017, respectively?" }, { "input": "What is the Working capital, excluding cash and cash equivalents for fiscal years 2019, 2018 and 2017 respectively?", "output": "What is the Working Capital, excluding cash and cash equivalents, for the fiscal years 2019, 2018, and 2017?" }, { "input": "What are the average values of Cash and cash equivalents for fiscal years 2019, 2018, and 2017?", "output": "What is the average value for the Cash and cash equivalents for fiscal years 2019, 2018 and 2017?" 
}, { "input": "What are the average values of the Availability under asset-based loan facility for the fiscal years 2019, 2018, and 2017?", "output": "What is the average value for the Availability under asset-based loan facility for fiscal years 2019, 2018 and 2017?" }, { "input": "What were the net income (or net losses) for the years 2019, 2018, and 2017 respectively?", "output": "What was the Net (loss) income in 2019, 2018 and 2017 respectively?" }, { "input": "What were the respective net losses or incomes for the years 2019, 2018, and 2017?", "output": "What was the Net (loss) income in 2019, 2018 and 2017 respectively?" }, { "input": "In which year was Adjusted EBITDA less than 20,000 thousands?", "output": "\"In which specific year, out of the given years, was the Adjusted EBITDA figure lower than 20,000 thousands?\"" }, { "input": "In which year was Adjusted EBITDA less than 20,000 thousands?", "output": "In which specific year was the Adjusted EBITDA, expressed in thousands, less than 20,000?" }, { "input": "What was the percentage increase in the expense incurred for Research and Development in the year 2018 compared to the previous period?", "output": "What was the increase in Research and development expense in 2018?" }, { "input": "\"What was the percentage increase in Research and Development (R&D) expense during the year 2018 compared to the previous year?\"", "output": "What was the increase in Research and development expense in 2018?" }, { "input": "What were the average research and development expenses in both 2017 and 2018?", "output": "What was the average Research and development expenses in 2017 and 2018?" }, { "input": "What were the respective average amounts spent on research and development in 2017 and 2018?", "output": "What was the average Research and development expenses in 2017 and 2018?" 
}, { "input": "What was the change in the Net operating loss carryforwards from 2018 to 2019?", "output": "What was the numerical difference between the Net operating loss carryforwards in 2018 and 2019?" }, { "input": "\"In which specific year did the value of Deferred revenue fall below 2,000?\"", "output": "In which year was Deferred revenue less than 2,000?" }, { "input": "What is the difference in the amount of accumulated depreciation between March 31, 2018, and March 31, 2019?", "output": "What is the change in Accumulated depreciation from March 31, 2018 to March 31, 2019?" }, { "input": "How much did the Accumulated depreciation increase between March 31, 2018 and March 31, 2019?", "output": "What is the change in Accumulated depreciation from March 31, 2018 to March 31, 2019?" }, { "input": "What was the percentage change in the General and administrative expenses in 2019 compared to the previous year?", "output": "What was the increase in the General and administrative expenses in 2019?" }, { "input": "What was the percentage increase in the General and administrative expenses from 2018 to 2019?", "output": "What was the increase in the General and administrative expenses in 2019?" }, { "input": "\"Who has the authority over the management and administration of the company's operations in a specific country?\"", "output": "Which country controls the company's affairs?" }, { "input": "\"Who has jurisdiction over the overall management and operations of the company?\"", "output": "Which country controls the company's affairs?" }, { "input": "What was the change in the U.S. state taxes, net of federal from 2018 to 2019?", "output": "What was the difference in the amount of state taxes paid in the United States, after deducting federal taxes, from 2018 to 2019?" }, { "input": "What was the change in the U.S. 
state taxes, net of federal from 2018 to 2019?", "output": "What was the difference in the amount of state taxes, after deducting federal taxes, between the years 2018 and 2019 in the United States?" }, { "input": "In which year was Tax credits less than 10.0?", "output": "When was the year when tax credits were less than 10.0 units?" }, { "input": "In which year was Tax credits less than 10.0?", "output": "In which specific year did the amount of Tax credits fall below 10.0?" }, { "input": "What was the exact amount of share-based compensation expense incurred during the year 2019?", "output": "What was the Total share-based compensation expense in 2019?" }, { "input": "\"What changes, if any, took place once the IPO concluded?\"", "output": "What became effective upon the closing of the IPO?" }, { "input": "What were the initial balances in the years 2019 and 2018, respectively?", "output": "What was the Beginning balance in 2019 and 2018 respectively?" }, { "input": "What were the starting balances for the year 2019 and 2018?", "output": "What was the Beginning balance in 2019 and 2018 respectively?" }, { "input": "How much did the Company recorded amortization expense for the year ended March 31, 2018?", "output": "What was the amount of amortization expense recorded by the Company for the fiscal year ending on March 31, 2018?" }, { "input": "How much did the Company recorded amortization expense for the year ended March 31, 2018?", "output": "What was the amount of amortization expense recorded by the Company for the fiscal year that ended on March 31, 2018?" }, { "input": "How much did the Company recorded amortization expense for the year ended March 31, 2017?", "output": "What was the amount of amortization expense recorded by the Company for the fiscal year that ended on March 31, 2017?" }, { "input": "What are the main sources of liquidity in financial markets and institutions?", "output": "What are the principal sources of liquidity?" 
}, { "input": "What was the total value of outstanding letters of credit associated with specific operating leases as of March 31, 2018?", "output": "How much was the outstanding letters of credit related to certain operating leases as of March 31, 2018?" }, { "input": "What were the revenue retention rates for the years 2019, 2018, and 2017 in chronological order?", "output": "What was the Revenue retention rate in 2019, 2018 and 2017 respectively?" }, { "input": "What is the average number of customers in total from 2017 to 2019?", "output": "What is the average Total customers between 2017-2019?" }, { "input": "\"In which specific year did the Adjusted EBITDA amount to less than 20,000 thousands?\"", "output": "In which year was Adjusted EBITDA less than 20,000 thousands?" }, { "input": "In which specific year did the Adjusted EBITDA amount to less than 20,000 thousands?", "output": "In which year was Adjusted EBITDA less than 20,000 thousands?" }, { "input": "What was the amount of Depreciation and amortization expense recorded for the fiscal year that ended on March 31, 2017?", "output": "How much was Depreciation and amortization expense for the years ended March 31, 2017?" }, { "input": "What was the total amount of Depreciation and amortization expense recorded for the fiscal years ending on March 31, 2017?", "output": "How much was Depreciation and amortization expense for the years ended March 31, 2017?" }, { "input": "What is the net increase or decrease in the Accumulated Amortization balance from March 31, 2018, to March 31, 2019?", "output": "What is the change in Accumulated amortization from March 31, 2018 to March 31, 2019?" }, { "input": "What is the net increase or decrease in the amount of Accumulated Amortization between March 31, 2018, and March 31, 2019?", "output": "What is the change in Accumulated amortization from March 31, 2018 to March 31, 2019?" 
}, { "input": "What was the percentage change in the Risk-free interest rate from 2018 to 2019?", "output": "What was the change in the Risk-free interest rate from 2018 to 2019?" }, { "input": "What was the average level of expected volatility recorded between 2017 and 2019?", "output": "What was the average Expected volatility between 2017-2019?" }, { "input": "What was the average expected volatility during the period of 2017-2019?", "output": "What was the average Expected volatility between 2017-2019?" }, { "input": "In which specific year did the grant date fair value per ordinary share fall below 30.0?", "output": "In which year was the Grant date fair value per ordinary share less than 30.0?" }, { "input": "When was the year when the grant date fair value per ordinary share fell below 30.0?", "output": "In which year was the Grant date fair value per ordinary share less than 30.0?" }, { "input": "\"What was the specific quantity or value of work that was at the intermediate stage of completion in the year 2018?\"", "output": "What was the amount of work in process in 2018?" }, { "input": "What was the change in finished goods between 2018 and 2019?", "output": "What was the numerical difference in the amount of finished goods produced between the years 2018 and 2019?" }, { "input": "What was the specific amount of interest incurred in the year 2018?", "output": "What was the interest cost in 2018?" }, { "input": "What was the change in interest cost between 2018 and 2019?", "output": "By how much did the interest cost change from 2018 to 2019?" }, { "input": "For how many years did the actuarial loss surpass $1,000 thousand and receive recognition?", "output": "How many years did the recognized actuarial loss exceed $1,000 thousand?" }, { "input": "What was the total amount of insurance proceeds received during the fiscal year that ended on August 31, 2019?", "output": "What were the insurance proceeds for the fiscal year ended August 31, 2019?" 
}, { "input": "\"What was the total amount of insurance claims paid out during the fiscal year that concluded on August 31, 2019?\"", "output": "What were the insurance proceeds for the fiscal year ended August 31, 2019?" }, { "input": "What was the specific difference in the amount of Restructuring and related charges incurred by the company in the years 2018 and 2019?", "output": "What was the change in Restructuring and related charges between 2018 and 2019?" }, { "input": "What were the reductions for tax positions primarily related to?", "output": "\"What was the primary reason for the reductions in tax positions?\"" }, { "input": "What is the company's definition of working capital and how is it calculated?", "output": "How does the company define working capital?" }, { "input": "What was the amount of assets in 2019?", "output": "What were the total assets in 2019?" }, { "input": "What was the value of the total assets in the year 2019?", "output": "What were the total assets in 2019?" }, { "input": "What is the range of years covered in the table?", "output": "What are the years included in the table?" }, { "input": "What was the change in Current installments of notes payable and long-term debt between 2018 and 2019?", "output": "What was the difference in the amount of current installments of notes payable and long-term debt between the year 2018 and 2019?" }, { "input": "What was the change in Current installments of notes payable and long-term debt between 2018 and 2019?", "output": "What is the difference in the amount of current installments of notes payable and long-term debt between the years 2018 and 2019?" }, { "input": "What was the percentage increase or decrease in the stockholders' equity of Total Jabil Inc. from 2018 to 2019?", "output": "What was the percentage change in Total Jabil Inc. stockholders’ equity between 2018 and 2019?" 
}, { "input": "What was the specific net revenue figure reported?", "output": "What was the net revenue as reported?" }, { "input": "What was the operating income as reported?", "output": "What was the reported operating income amount?" }, { "input": "What was the specific distinction between the reported net revenue and cost of revenue? Please elaborate on the differences observed between these two financial measures.", "output": "What was the difference between net revenue and cost of revenue as reported?" }, { "input": "What was the distinction between the net revenue and cost of revenue as stated in the report, and can you provide further details to assist in answering?", "output": "What was the difference between net revenue and cost of revenue as reported?" }, { "input": "What was the operating income balance expressed as a ratio of the amount reported, before considering the implementation of ASU 2014-09?", "output": "What was the operating income balance without the adoption of ASU 2014-09 as a ratio of the amount as reported?" }, { "input": "What was the ratio of the operating income balance without the adoption of ASU 2014-09 to the amount as reported?", "output": "What was the operating income balance without the adoption of ASU 2014-09 as a ratio of the amount as reported?" }, { "input": "By what amount did the gross profit increase or decrease from May 2018 to August 2018?", "output": "What was the change in gross profit between May 2018 and August 2018?" }, { "input": "What are the distinctions between the September 2018 balances of contract assets and contract liabilities? Please provide a comprehensive explanation while keeping all the details intact.", "output": "What is the difference between the balance in September 2018 for contract assets and contract liabilities?" 
}, { "input": "What was the percentage change for Other accrued expenses due to adjustments by the new standard?", "output": "What was the percentage change specifically for Other accrued expenses as a result of adjustments made under the new standard?" }, { "input": "What was the percentage change for Other accrued expenses due to adjustments by the new standard?", "output": "What was the percentage change in Other accrued expenses after incorporating adjustments made by the new standard? Please provide the specific percentage change." }, { "input": "What is the exact amount of net revenue generated in the fiscal year 2019?", "output": "What was the net revenue in 2019?" }, { "input": "What was the total revenue generated during the year 2019 after deducting all expenses and costs?", "output": "What was the net revenue in 2019?" }, { "input": "What was the gross profit in 2018?", "output": "What was the gross profit generated in the year 2018?" }, { "input": "What was the operating income in 2017?", "output": "What was the operating income for the year 2017?" }, { "input": "What was the change in gross profit between 2017 and 2018?", "output": "What was the difference in gross profit from 2017 to 2018?" }, { "input": "What was the change in Prepaid expenses and other current assets due to the adoption of ASU 2014-09?", "output": "\"What was the impact on Prepaid expenses and other current assets resulting from the adoption of ASU 2014-09? Please provide details and explain the specific changes that occurred as a result of this accounting standard.\"" }, { "input": "How many reported segments have a value exceeding $2,000,000?", "output": "How many segments as reported exceed $2,000,000 thousand?" }, { "input": "What were the items incorporated within the amounts for the fiscal year that concluded on August 31, 2017?", "output": "What did the amounts in fiscal year ended August 31, 2017 include?" 
}, { "input": "What expenses or revenues were included in the amounts reported for the fiscal year that ended on August 31, 2017?", "output": "What did the amounts in fiscal year ended August 31, 2017 include?" }, { "input": "What was the difference in the costs of severance and employee benefits for the company from 2017 to 2018?", "output": "What was the change in Employee severance and benefit costs between 2017 and 2018?" }, { "input": "Which periods does the table include?", "output": "What time periods are encompassed in the table's content?" }, { "input": "\"What specific items or transactions were encompassed within the category of acquisitions and adjustments?\"", "output": "What did Acquisitions and adjustments include?" }, { "input": "What were the variations in acquisitions and adjustments made in fiscal year 2018 between the Electronic Manufacturing Services (EMS) and Document Management Systems (DMS)?", "output": "What is the difference in Acquisitions and adjustments in fiscal 2018 between EMS and DMS?" }, { "input": "What are the distinctions in acquisitions and adjustments, specifically pertaining to fiscal year 2018, when comparing EMS (Emergency Medical Services) and DMS (Document Management Systems)?", "output": "What is the difference in Acquisitions and adjustments in fiscal 2018 between EMS and DMS?" }, { "input": "What was the balance of EMS as in 2018 as a percentage of the total balance at the end of fiscal 2018?", "output": "What percentage of the total balance at the end of fiscal 2018 represented the balance of EMS in 2018?" }, { "input": "What categories of income or loss are included in the table's income (loss) figures before accounting for income tax expense?", "output": "What types of income (loss) before income tax expense is provided by the table?" }, { "input": "What was the foreign income (loss) in 2018?", "output": "What was the amount of foreign income or loss for the year 2018?" 
}, { "input": "What was the change in Foreign income (loss) between 2018 and 2019?", "output": "What was the difference in the amount of income or loss generated from foreign sources, comparing the years 2018 and 2019?" }, { "input": "What was the change in Foreign income (loss) between 2018 and 2019?", "output": "What was the difference in the amount of foreign income or loss reported between 2018 and 2019?" }, { "input": "\"How many annual periods experienced a Foreign income loss that was greater than $800,000 thousand dollars?\"", "output": "How many years did Foreign income (loss) exceed $800,000 thousand?" }, { "input": "For how many years was the amount of Foreign income (loss) greater than $800,000 thousand?", "output": "How many years did Foreign income (loss) exceed $800,000 thousand?" }, { "input": "What was the change in Accumulated benefit obligation between 2018 and 2019?", "output": "What was the specific numerical difference in the Accumulated Benefit Obligation (ABO) amount between the fiscal years of 2018 and 2019?" }, { "input": "\"What were the specific expenses incurred by the company for employee severance and benefits during the year 2018?\"", "output": "What were the Employee severance and benefit costs in 2018?" }, { "input": "What were the severance and benefit costs for employees in the year 2018?", "output": "What were the Employee severance and benefit costs in 2018?" }, { "input": "What was the precise difference in terms of employee severance and benefit costs from 2018 to 2019?", "output": "What was the change in Employee severance and benefit costs between 2018 and 2019?" }, { "input": "How many years did cash proceeds received exceed $5,000 million?", "output": "How many years have the cash proceeds received been above $5,000 million?" }, { "input": "How many years did cash proceeds received exceed $5,000 million?", "output": "For how many years did the cash proceeds received exceed $5,000 million?" 
}, { "input": "What was the percentage increase or decrease in pre-tax losses on the sale of receivables from 2017 to 2018?", "output": "What was the percentage change in Pre-tax losses on sale of receivables between 2017 and 2018?" }, { "input": "What was the difference in the amount of money spent on repurchasing shares in 2016 compared to 2017?", "output": "What was the change in share repurchases between 2016 and 2017?" }, { "input": "What was the percentage increase or decrease in dividend payments from 2018 to 2019?", "output": "What was the percentage change in dividends paid between 2018 and 2019?" }, { "input": "What was the percentage increase or decrease in dividends paid from 2018 to 2019?", "output": "What was the percentage change in dividends paid between 2018 and 2019?" }, { "input": "What was the specific topic of discussion in the DMS segment?", "output": "What was the DMS segment focused on?" }, { "input": "What was the change in the net revenue from EMS between 2018 and 2019?", "output": "What was the difference in net revenue generated by EMS between the years 2018 and 2019?" }, { "input": "What was the change in the net revenue from EMS between 2018 and 2019?", "output": "What was the exact difference in net revenue for EMS between the years 2018 and 2019?" }, { "input": "What was the average year-on-year percentage change in total net revenue from 2017-2019?", "output": "What was the average annual percentage increase or decrease in the total net revenue from 2017 to 2019?" }, { "input": "What was the average year-on-year percentage change in total net revenue from 2017-2019?", "output": "What was the average percentage change in net revenue from year to year for the total period between 2017 and 2019?" 
}, { "input": "What is the difference in the balance of Employee Severance and Benefit Costs as of August between 2017 and 2018?", "output": "What is the change in the balance as of August in Employee Severance and Benefit Costs between 2017 and 2018?" }, { "input": "What were the lease costs in 2017 as a percentage of the total balance in 2017?", "output": "\"What was the percentage of lease costs in 2017 compared to the total balance in 2017?\"" }, { "input": "What were the lease costs in 2017 as a percentage of the total balance in 2017?", "output": "\"What proportion of the total balance in 2017 was allocated towards lease costs?\"" }, { "input": "\"What was the amount of the domestic-federal income tax expense (benefit) recorded for the year 2017?\"", "output": "What was the current domestic-federal income tax expense (benefit) in 2017?" }, { "input": "What was the amount of domestic-federal income tax expense (benefit) recorded in the financial statements for the year 2017?", "output": "What was the current domestic-federal income tax expense (benefit) in 2017?" }, { "input": "What was the amount of income tax expense or benefit incurred for domestic operations in 2019?", "output": "What was the current domestic-state income tax expense (benefit) in 2019?" }, { "input": "In 2019, what was the amount of income tax expense or benefit incurred at the domestic state level?", "output": "What was the current domestic-state income tax expense (benefit) in 2019?" }, { "input": "What is the change in the company's domestic-state income tax expense (benefit) between 2018 and 2019?", "output": "What is the difference in the domestic-state income tax expense (benefit) for the company from 2018 to 2019?" }, { "input": "What is the change in the company's domestic-state income tax expense (benefit) between 2018 and 2019?", "output": "What was the difference in the domestic-state income tax expense (benefit) of the company from 2018 to 2019?" 
}, { "input": "What were the types of land and the associated improvements that existed in the year 2019?", "output": "What were the land and improvements in 2019?" }, { "input": "What was the total monetary worth of all buildings in the year 2018?", "output": "What was the value of buildings in 2018?" }, { "input": "What was the percentage change in total property, plant and equipment between 2018 and 2019?", "output": "What was the percentage increase or decrease in the total value of property, plant, and equipment from 2018 to 2019?" }, { "input": "Based on what factors or criteria were the fair value estimates for Level 3 determined?", "output": "What were the Level 3 fair value estimates based on?" }, { "input": "What factors or criteria were taken into consideration when determining the Level 3 fair value estimates?", "output": "What were the Level 3 fair value estimates based on?" }, { "input": "What was the change in the fair value for the 3.950% Senior Notes between 2018 and 2019?", "output": "What was the difference in fair value for the 3.950% Senior Notes from 2018 to 2019?" }, { "input": "What is the significance of the amount awarded as shares, which is contingent upon meeting specific performance criteria?", "output": "What does the amount for shares granted that are based on the achievement of certain performance criteria represent?" }, { "input": "What was the percentage difference in the number of shares from 2018 to 2019 and how do I calculate it accurately?", "output": "What was the percentage change in the number of shares between 2018 and 2019?" }, { "input": "What was the percentage change in Expected dividend yield between 2017 and 2018?", "output": "What was the percentage adjustment in the expected dividend yield from 2017 to 2018?" }, { "input": "What were the restricted stock units in 2019?", "output": "What is the significance of restricted stock units in 2019, and can you provide more details about them?" 
}, { "input": "What were the details and features of the Employee stock purchase plan (ESPP) implemented in 2018?", "output": "What was the Employee stock purchase plan in 2018?" }, { "input": "What was the change in Other stock-based compensation expenses between 2017 and 2018?", "output": "What was the difference in the expenses related to stock-based compensation for Other category between the years 2017 and 2018?" }, { "input": "\"What was the difference in the number of restricted stock units awarded by a company in 2018 compared to 2019?\"", "output": "What was the change in the restricted stock units between 2018 and 2019?" }, { "input": "What was the net change in the number of restricted stock units from 2018 to 2019?", "output": "What was the change in the restricted stock units between 2018 and 2019?" }, { "input": "What does the key management comprise of?", "output": "What is included in key management?" }, { "input": "What is the combined value of all shares held in joint ventures as of December 31, 2019?", "output": "What is the total value of shares at 31 December 2019 for each joint venture?" }, { "input": "What is the precise percentage change in revenue losses experienced by the United Kingdom from the year 2018 to 2019?", "output": "What is the percentage change in the revenue losses in UK from 2018 to 2019?" }, { "input": "What is the percentage increase or decrease in the revenue losses experienced in the United Kingdom from 2018 to 2019?", "output": "What is the percentage change in the revenue losses in UK from 2018 to 2019?" }, { "input": "What was the net amount of debt in 2019?", "output": "What is the Net debt in 2019?" }, { "input": "What was the net debt amount in the year 2019?", "output": "What is the Net debt in 2019?" }, { "input": "What is the percentage change in the net debt from 2018 to 2019?", "output": "What is the percentage increase or decrease in the net debt from 2018 to 2019?" 
}, { "input": "What were the fees payable by the Group's joint ventures in 2019?", "output": "What were the specific fees that the Group's joint ventures had to pay in the year 2019?" }, { "input": "What were the fees payable by the Group's joint ventures in 2019?", "output": "What were the fees that the Group's joint ventures had to pay in 2019? Could you provide details on the specific amounts payable by each joint venture?" }, { "input": "What proportion of the total borrowings in 2019, including both the principal and interest, will be due within a period of 5 years?", "output": "What is the percentage of borrowings (including interest) that matures over 5 years in the total borrowings in 2019?" }, { "input": "What was the net asset value (NAV) of EPRA in 2018?", "output": "What is the EPRA NAV in 2018?" }, { "input": "What is the precise percentage difference in the overall investment made in joint ventures for the years 2018 and 2019?", "output": "What is the percentage change in the total investment in joint ventures from 2018 to 2019?" }, { "input": "What was the interest rate protection provided in 2019?", "output": "What is the interest rate protection in 2019?" }, { "input": "What is the current interest rate protection in 2019 and how does it work?", "output": "What is the interest rate protection in 2019?" }, { "input": "What is the percentage difference in the fair value of unallocated swaps between 2018 and 2019?", "output": "What is the percentage change in the fair value of unallocated swaps from 2018 to 2019?" }, { "input": "What was the difference in the level of interest rate protection from 2018 to 2019?", "output": "What is the change in interest rate protection between 2018 and 2019?" 
}, { "input": "What is the year-on-year percentage change in the calculation of contingent rents based on tenants' turnover from 2018 to 2019?", "output": "What is the percentage change in the amount of contingent rents calculated by reference to tenants' turnover from 2018 to 2019?" }, { "input": "In what year will the future minimum lease amounts receivable be higher than usual?", "output": "In which year is there higher future minimum lease amounts receivable?" }, { "input": "What were the dividends in respect of these shares have been waived by agreement in 2018?", "output": "What was the specific agreement regarding the waiver of dividends on these shares in 2018?" }, { "input": "What is the percentage change in the disposals value from 2018 to 2019?", "output": "What is the percentage difference in the value of disposals from 2018 to 2019, and how can it be calculated?" }, { "input": "What is the current total quantity of units that the Company possesses?", "output": "What is the total number of units the Company has?" }, { "input": "What is the weighted average exercise price for the outstanding options in the Save As You Earn Scheme as of 31 December 2019?", "output": "What is the weighted average exercise prices of the outstanding options exercisable at 31 December 2019 for the Save As You Earn Scheme?" }, { "input": "What is the calculated average exercise prices of the options that are currently outstanding and can be exercised as of December 31, 2019, under the Save As You Earn Scheme?", "output": "What is the weighted average exercise prices of the outstanding options exercisable at 31 December 2019 for the Save As You Earn Scheme?" }, { "input": "What is the percentage change in the total provided deferred tax provision from 1 January 2018 to 31 December 2019?", "output": "What is the precise percentage change in the total amount of deferred tax provision that was provided from 1 January 2018 to 31 December 2019?" 
}, { "input": "What was the total amount of administration expenses incurred in the year 2019?", "output": "What is the administration expense in 2019?" }, { "input": "What is encompassed in diluted shares?", "output": "What do the diluted shares include?" }, { "input": "What is the percentage of counterparty #5 exposure in the total exposure in 2019?", "output": "What is the proportion of the exposure to counterparty #5 compared to the total exposure in the year 2019?" }, { "input": "What is the percentage of counterparty #5 exposure in the total exposure in 2019?", "output": "What is the proportion of counterparty #5's exposure compared to the total exposure in the year 2019?" }, { "input": "What is the percentage difference in the combined value of cash deposits and derivative financial instrument assets between 2018 and 2019?", "output": "What is the percentage change in the sum of cash deposits and derivative financial instrument assets from 2018 to 2019?" }, { "input": "What is the percentage difference in the loss before tax, including joint ventures and associates, from 2018 to 2019?", "output": "What is the percentage change in the loss before tax, joint ventures and associates from 2018 to 2019?" }, { "input": "What is the net exposure to foreign exchange risk (euro) in 2018?", "output": "\"What is the total amount of foreign exchange risk (in euros) faced by a company in the year 2018?\"" }, { "input": "What is the impact on equity attributable to owners of the Group when there is a 10% depreciation in foreign exchange rates from 2018 to 2019? Specifically, how does this affect the negative movement of money to equity?", "output": "What is the change in the negative movement of money to equity attributable to owners of the Group when there is a 10 per cent depreciation in foreign exchange rates from 2018 to 2019?" 
}, { "input": "What is the impact on the Group's owners' equity when foreign exchange rates depreciate by 10% between 2018 and 2019?", "output": "What is the change in the negative movement of money to equity attributable to owners of the Group when there is a 10 per cent depreciation in foreign exchange rates from 2018 to 2019?" }, { "input": "What is the percentage difference in the euro amount drawn from 2018 to 2019?", "output": "What is the percentage change in the amount drawn in euros from 2018 to 2019?" }, { "input": "What is the percentage increase or decrease in the euro amount withdrawn between 2018 and 2019?", "output": "What is the percentage change in the amount drawn in euros from 2018 to 2019?" }, { "input": "\"In which specific year did the total debt reach its highest level, and is it greater than the total debt in any other year?\"", "output": "In which year is there a greater total debt?" }, { "input": "In which year does the total debt amount exceed or surpass the debt levels of other years?", "output": "In which year is there a greater total debt?" }, { "input": "What is the percentage change in the total debt from 2018 to 2019?", "output": "What is the percentage increase or decrease in the total debt from 2018 to 2019?" }, { "input": "\"What were the average salaries for administrative positions in fiscal years 2019 and 2018?", "output": "What is the average Administrative salaries for fiscal 2019 and 2018?" }, { "input": "\"What was the average annual salary for administrative employees in fiscal years 2019 and 2018?", "output": "What is the average Administrative salaries for fiscal 2019 and 2018?" }, { "input": "What is the weekly processing capacity comparison between the plants located in Laurel, Mississippi and Collins, Mississippi?", "output": "What is the difference in capacity per week between the processing plants at Laurel, Mississippi and Collins, Mississippi? 
" }, { "input": "What are the average amounts for prepaid insurance in both fiscal years 2019 and 2018?", "output": "What is the average Prepaid insurance for fiscal years 2019 and 2018?" }, { "input": "What is the difference in the amount of Total prepaid expenses recorded in the financial statements for fiscal years 2019 and 2018? I need to determine the change in prepaid expenses for these two specific years.", "output": "What is the change in Total prepaid expenses between fiscal years 2019 and 2018?" }, { "input": "What is the difference in the total amount of prepaid expenses between fiscal years 2019 and 2018?", "output": "What is the change in Total prepaid expenses between fiscal years 2019 and 2018?" }, { "input": "What is the difference in the statutory income tax rates between fiscal years 2019 and 2018, and how does this impact income taxes?", "output": "What is the change in Income taxes at statutory rate between fiscal years 2019 and 2018?" }, { "input": "What is the net sales from Fresh, vacuum-sealed chicken for fiscal years 2019 to 2017 respectively?", "output": "What were the net sales of Fresh, vacuum-sealed chicken for fiscal years 2019, 2018, and 2017?" }, { "input": "What is the net sales from Fresh, chill-packed chicken for fiscal years 2019 to 2017 respectively?", "output": "What are the net sales figures for Fresh, chill-packed chicken for fiscal years 2019, 2018, and 2017, respectively?" }, { "input": "What is the net sales from Fresh, chill-packed chicken for fiscal years 2019 to 2017 respectively?", "output": "How much revenue was generated from the sales of Fresh, chill-packed chicken for the fiscal years of 2019, 2018, and 2017, respectively?" }, { "input": "What is the net sales from Fresh, ice-packed chicken for fiscal years 2019 to 2017 respectively?", "output": "What were the net sales of Fresh, ice-packed chicken for fiscal years 2019, 2018, and 2017?" 
}, { "input": "What was the purpose of the company's operating leases, and how did they contribute to the company's operations or financial performance?", "output": "What was the company's operating leases for?" }, { "input": "What is the discrepancy in the overall amount when comparing revenues gained from revolving loan facility fees to expenses incurred from operating leases?", "output": "What was the difference in the total between Revolving loan facility fees and operating leases? " }, { "input": "What was the change in the Net cash used in investing activities between 2017 and 2019?", "output": "What was the difference in the amount of net cash used in investing activities from 2017 to 2019?" }, { "input": "What was the percentage change in the Net cash used in financing activities between 2018 and 2019?", "output": "What was the percentage difference in the amount of cash used in financing activities from 2018 to 2019?" }, { "input": "Which years does the table provide information for the company's Selected Consolidated Statements of Operations Data?", "output": "For which specific time periods does the table present the company's Selected Consolidated Statements of Operations Data?" }, { "input": "What was the exact amount of revenue generated in the year 2015?", "output": "What was the total revenue in 2015?" }, { "input": "What was the specific amount of operating profit generated in the year 2017?", "output": "What was the operating profit in 2017?" }, { "input": "What was the exact amount of operating profit earned during the year 2017?", "output": "What was the operating profit in 2017?" }, { "input": "What is the precise percentage difference in operating profit from 2015 to 2016?", "output": "What was the percentage change in operating profit between 2015 and 2016?" }, { "input": "What was the percentage increase or decrease in operating profit from 2015 to 2016?", "output": "What was the percentage change in operating profit between 2015 and 2016?" 
}, { "input": "What was the cash used in Software in 2019?", "output": "How much money was spent on Software in the year 2019?" }, { "input": "What was the net income in 2019?", "output": "What was the exact net income amount for the fiscal year of 2019?" }, { "input": "What was the specific amount of interest that was recorded as an expense in the year 2018?", "output": "What was the interest expense in 2018?" }, { "input": "What is the amount of money paid as interest in 2018?", "output": "What was the interest expense in 2018?" }, { "input": "For how many consecutive years has the net income exceeded $100,000 thousand?", "output": "How many years did net income exceed $100,000 thousand?" }, { "input": "What was the change in the interest expense between 2017 and 2018?", "output": "What were the differences in the amount of interest expense incurred in 2017 and 2018?" }, { "input": "How much was the change in fair value of the company's servicing asset that was included in the fees charged for servicing?", "output": "How much was the included change in fair value of the company's servicing asset included in its servicing fees?" }, { "input": "For how many consecutive years have Interchange fees surpassed $50,000?", "output": "How many years did Interchange fees exceed $50,000 thousand?" }, { "input": "How many years have the Interchange fees been over $50,000 thousand?", "output": "How many years did Interchange fees exceed $50,000 thousand?" }, { "input": "What was the difference in the fees charged for transactions in 2017 compared to 2019? How did the transaction fees change between 2017 and 2019?", "output": "What was the change in the transaction fees between 2017 and 2019?" }, { "input": "What was the percentage change in the total revenue between 2018 and 2019?", "output": "What is the percentage increase or decrease in the total revenue from 2018 to 2019?" 
}, { "input": "What was the percentage change in the total revenue between 2018 and 2019?", "output": "What was the percentage difference in the overall revenue from 2018 to 2019?" }, { "input": "How much did the company invest in partnerships during the year 2018?", "output": "What was the company's investment in partnership in 2018?" }, { "input": "How much money did the company invest in partnership during the year 2018?", "output": "What was the company's investment in partnership in 2018?" }, { "input": "What is the significance and meaning of the number of Settlements?", "output": "What does the amount of Settlements represent?" }, { "input": "What is the significance or meaning of the number of Settlements?", "output": "What does the amount of Settlements represent?" }, { "input": "How many years did the face value of term loan exceed $300,000 thousand?", "output": "For how many years has the term loan been valued above $300,000 thousand?" }, { "input": "How many years did the face value of term loan exceed $300,000 thousand?", "output": "How many years has the face value of the term loan been consistently higher than $300,000 thousand?" }, { "input": "What was the change in the Loan Servicing Portfolio between 2017 and 2018?", "output": "What was the difference in the size of the Loan Servicing Portfolio from 2017 to 2018?" }, { "input": "What was the total costs and expenses?", "output": "What was the overall amount of costs and expenses associated with the given context or situation?" }, { "input": "What was the exact value of the restricted cash in the year 2017?", "output": "What was the amount of restricted cash in 2017?" }, { "input": "\"What is the range of years covered in the table?\"", "output": "Which years does the table provide?" }, { "input": "What was the change in Cash and cash equivalents between 2018 and 2019?", "output": "What was the difference in the amount of Cash and cash equivalents between 2018 and 2019?" 
}, { "input": "For how long did the amount of Restricted Cash surpass $200,000 thousand?", "output": "How many years did Restricted Cash exceed $200,000 thousand?" }, { "input": "What was the total quantity of Furniture sold or produced in the year 2018?", "output": "What was the amount of Furniture in 2018?" }, { "input": "What was the total quantity or number of furniture items in the year 2018?", "output": "What was the amount of Furniture in 2018?" }, { "input": "What was the total quantity of computer hardware devices manufactured or sold globally in the year 2019?", "output": "What was the amount of Computer hardware in 2019?" }, { "input": "How much computer hardware was present in the year 2019?", "output": "What was the amount of Computer hardware in 2019?" }, { "input": "What was the change in the amount of Software between 2018 and 2019?", "output": "What was the difference in the quantity of Software from 2018 to 2019, specifically looking at the change in amount?" }, { "input": "What was the change in the amount of Software between 2018 and 2019?", "output": "How much did the amount of Software change from 2018 to 2019?" }, { "input": "What was the amount of Receipts in 2017?", "output": "How many receipts were recorded in the year 2017?" }, { "input": "What was the amount of Receipts in 2017?", "output": "\"How many total receipts were recorded in the year 2017?\"" }, { "input": "What was the change in the amount of Settlements between 2017 and 2019?", "output": "What was the net difference in the number of Settlements from 2017 to 2019?" }, { "input": "What was the change in the amount of Settlements between 2017 and 2019?", "output": "What was the difference in the number of Settlements from 2017 to 2019 and how did it change over the two-year period?" }, { "input": "What was the exact amount of revenue generated in the year 2018?", "output": "What was the total revenue in 2018?" 
}, { "input": "What is the total amount of money earned in 2018?", "output": "What was the total revenue in 2018?" }, { "input": "\"What was the specific financial figure measuring earnings from regular business operations, known as operating profit, during the year 2019?\"", "output": "What was the operating profit in 2019?" }, { "input": "What was the operating profit in the first quarter?", "output": "What is the exact amount of operating profit recorded in the first quarter of the fiscal year?" }, { "input": "What was the specific amount of net income generated in the third quarter?", "output": "What was the net income in the third quarter?" }, { "input": "What was the percentage increase or decrease in the basic earnings per share of Class A common stock from the first quarter to the second quarter?", "output": "What was the percentage change in the basic earnings per share of Class A common stock between the first and second quarter?" }, { "input": "\"What is the range of years covered in the table?\"", "output": "Which years does the table provide?" }, { "input": "What was the difference in the amount of net income recorded between the years 2017 and 2018?", "output": "What was the change in net income between 2017 and 2018?" }, { "input": "\"What is the meaning and significance of net fair value changes recognized in other gains (losses)?\"", "output": "What does the net Fair value changes recognized in other gains (losses) represent?" }, { "input": "What is the representation of net fair value changes recognized in other gains (losses)?", "output": "What does the net Fair value changes recognized in other gains (losses) represent?" }, { "input": "What was the beginning balance in 2018?", "output": "\"The starting balance for the year 2018, specifically referring to financial records or accounts, is requested. 
Please provide the initial amount present at the beginning of that year.\"" }, { "input": "Which years did the ending balance exceed $3,000 thousand?", "output": "In which years was the ending balance higher than $3,000 thousand?" }, { "input": "What is the percentage of fair value time deposits compared to the overall fair value of marketable securities?", "output": "What percentage of the total fair value marketable securities is made up of fair value time deposits?" }, { "input": "\"What is the current interest rate specified in the credit facility agreement that governs our credit arrangement?\"", "output": "What is the interest rate on the credit facility agreement?" }, { "input": "\"What is the specific interest rate specified in the credit facility agreement that governs our borrowing terms?\"", "output": "What is the interest rate on the credit facility agreement?" }, { "input": "What factors contributed to the increase in research and development expenses in 2019?", "output": "What are the reasons for higher research and development expense in 2019?" }, { "input": "What is the net difference in research and development expense between 2019 and 2017?", "output": "What is the exact amount of change in research and development expense between the fiscal years of 2019 and 2017?" }, { "input": "How much were the marketable securities worth in 2019?", "output": "What was the amount of marketable securities in 2019?" }, { "input": "What was the precise value of marketable securities held in the year 2019?", "output": "What was the amount of marketable securities in 2019?" }, { "input": "What is the difference in the total amount of shareholders' equity between the years 2018 and 2019?", "output": "What is the change in total shareholders' equity from 2018 to 2019?" }, { "input": "How much did the total shareholders' equity change between 2018 and 2019?", "output": "What is the change in total shareholders' equity from 2018 to 2019?" 
}, { "input": "How does the net (loss) income of 2019 compare to the net (loss) income of 2018?", "output": "What is the difference between Net (loss) income in 2018 and 2019?" }, { "input": "What was the exact percentage increase in gross profit from the year 2018 to the year 2019?", "output": "What was the percentage increase in gross profit from 2018 to 2019?" }, { "input": "What are the different components of expenses that need to be considered when starting production?", "output": "What are components of production start-up expense?" }, { "input": "What is the total change in expenditure for initiating production between the years 2019 and 2017?", "output": "What is the net difference in production start-up expense between 2019 and 2017?" }, { "input": "What is the difference in net sales amount in 2019 and 2018?", "output": "What is the difference in the net sales amount between the years 2019 and 2018? Please provide details and figures to compare the net sales amounts of these two specific years." }, { "input": "What is the value of foreign government obligations as of December 31, 2019?", "output": "What is the total amount of foreign government obligations recorded as of December 31, 2019?" }, { "input": "What is the comparative value of money market funds and foreign debt as of December 31, 2019, and how do they differ?", "output": "What is the difference in the value of money market funds and foreign debt as of December 31, 2019?" }, { "input": "What is the increase in cash provided by operating activities from 2018 to 2019?", "output": "How much did the cash provided by operating activities increase from 2018 to 2019?" }, { "input": "What is the difference in net cash and cash equivalents between 2019 and 2018?", "output": "What is the change in net cash and cash equivalents between the fiscal years 2019 and 2018?" 
}, { "input": "\"What specific items or responsibilities are typically considered as part of other obligations?\"", "output": "What are included in other obligations?" }, { "input": "What does the category of \"other obligations\" encompass and include?", "output": "What are included in other obligations?" }, { "input": "Why was $72.2 million of unrecognized tax benefits excluded?", "output": "Why were the $72.2 million in unrecognized tax benefits excluded from something?" }, { "input": "What percentage of the total contractual obligations is made up of total long-term debt obligations?", "output": "What is the proportion of the overall contractual obligations that consists specifically of the total amount owed in long-term debt obligations?" }, { "input": "What is the distinction between the total amount paid in interest and the total sum of operating lease obligations?", "output": "What difference between total interest payments and total operating lease obligations?" }, { "input": "What is the interest rate for the term loan that I am inquiring about?", "output": "What is the interest rate of the term loan?" }, { "input": "What are the differences in the amounts of notes receivable recorded in 2018 and 2019?", "output": "What is the difference between notes receivables from 2018 to 2019?" }, { "input": "What is the precise percentage by which the total prepaid expenses and other current assets increased from 2018 to 2019?", "output": "What is the percentage increase in total prepaid expenses and other current assets from 2018 to 2019?" }, { "input": "How are the accounting procedures used to record changes in fair value of hedging instruments?", "output": "How are changes in fair value of hedging instruments accounted for?" }, { "input": "What are the types of derivative instruments included in the total for other liabilities on a financial statement?", "output": "What is the total derivate instruments for other liabilities?" 
}, { "input": "What are the types of derivative instruments used to measure the total value of other liabilities?", "output": "What is the total derivate instruments for other liabilities?" }, { "input": "What is the ratio of the total derivatives classified as hedging instruments to the total derivatives not classified as hedging instruments under prepaid expenses and other current assets?", "output": "Under prepaid expenses and other current assets, what is the ratio of the total derivates designated as hedging instruments to those not designated as hedging instruments?" }, { "input": "What was the increase in machinery and equipment from 2018 to 2019?", "output": "What was the percentage change in the value of machinery and equipment between 2018 and 2019?" }, { "input": "What was the increase in machinery and equipment from 2018 to 2019?", "output": "What percentage growth was observed in the value of machinery and equipment between the years 2018 and 2019?" }, { "input": "What are the differences in land statistics comparing the year 2018 and 2019?", "output": "What is the difference between land from 2018 to 2019?" }, { "input": "What are the discrepancies in land conditions between the years 2018 and 2019?", "output": "What is the difference between land from 2018 to 2019?" }, { "input": "What is the net change in selling, general, and administrative expense between the years 2019 and 2017?", "output": "What is the net difference in selling, general and administrative expense between 2019 and 2017?" }, { "input": "What is the current percentage shareholding of Tata Sons Private Limited and Life Insurance Corporation of India, and how does it differ between the two entities?", "output": "What is the difference in percentage shareholding between Tata Sons Private Limited and Life Insurance Corporation of India?" 
}, { "input": "What is the disparity in the quantity of equity shares possessed by Promoters and Insurance Companies?", "output": "What is the difference between number of equity shares held between Promoters and Insurance Companies?" }, { "input": "How many shareholding accounts were there as of March 31, 2019?", "output": "What is the number of existing shareholding accounts as on March 31, 2019? " }, { "input": "As of March 31, 2019, what is the overall sum of all assets owned by someone or something?", "output": "What is the grand total of holdings as on March 31, 2019?" }, { "input": "What is the overall value of assets and investments held as of March 31, 2019?", "output": "What is the grand total of holdings as on March 31, 2019?" }, { "input": "What is the difference in the number of holdings within the share categories of '1-100' and '101-500'? How do the share categories of '1-100' and '101-500' differ in terms of the number of holdings they encompass?", "output": "What is the difference in holdings between the share categories of '1-100' and '101-500'?" }, { "input": "What is the basis for consolidating shareholdings and how does it affect the overall company structure?", "output": "On what basis is shareholding consolidated?" }, { "input": "What is the basis for the consolidation of shareholding?", "output": "On what basis is shareholding consolidated?" }, { "input": "\"What is the proportion of unsecured loans out of the total debt at the start of the financial year?\"", "output": "At the beginning of the financial year, what percentage of total indebtedness is made up of unsecured loans?" }, { "input": "What was the change in the quantity of shares transferred from 2011 to 2012 into the Investor Education and Protection Fund (IEPF)?", "output": "What is the difference in number of shares transferred to IEPF from 2011 to 2012?" 
}, { "input": "What is the total remuneration for Ramakrishnan V?", "output": "What is the complete amount of compensation that Ramakrishnan V receives?" }, { "input": "What is the total remuneration for Ramakrishnan V?", "output": "What is the total amount of money received as payment or compensation for services rendered by Ramakrishnan V?" }, { "input": "What is the monetary worth of the stock option that the Company Secretary receives?", "output": "What is the value of Stock Option given to the Company Secretary?" }, { "input": "What is the monetary worth or amount of Stock Option that has been granted or allocated to the Company Secretary as part of their compensation or benefits package?", "output": "What is the value of Stock Option given to the Company Secretary?" }, { "input": "What is the specific value or percentage of commission that is usually given to the Chief Financial Officer (CFO) of a company?", "output": "What is the value of Commission given to the Chief Financial Officer?" }, { "input": "\"What is the specific amount or percentage of commission that is typically awarded to the Chief Financial Officer (CFO) for their role?\"", "output": "What is the value of Commission given to the Chief Financial Officer?" }, { "input": "What distinguishes the CFO from the Company Secretary in terms of Others and Allowances? Clarify and elaborate on the differences between these two roles regarding their responsibilities, remunerations, and any additional benefits they may receive.", "output": "What is the difference in Others, Allowances between the CFO and Company Secretary?" }, { "input": "Which key managerial personnel had the highest total remuneration?", "output": "\"Who among the key managerial personnel received the highest total remuneration?\"" }, { "input": "How does the ratio of median remuneration between the two Executive Directors differ?", "output": "What is the difference in ratio to median remuneration between the two Executive Directors?" 
}, { "input": "\"What is the difference in the amount of money received from sales or services between the years 2018 and 2019?\"", "output": "What is the change in revenue between 2018 and 2019?" }, { "input": "What is the difference in the amount of money earned by the company between the years 2018 and 2019?", "output": "What is the change in revenue between 2018 and 2019?" }, { "input": "What portion of total unconsolidated revenue in FY 2019 constitutes of export revenue?", "output": "What percentage of the total unconsolidated revenue for the fiscal year 2019 is comprised of revenue from exports?" }, { "input": "What portion of total unconsolidated revenue in FY 2018 constitutes of export revenue?", "output": "What percentage of the overall unconsolidated revenue for the fiscal year 2018 is derived from export sales?" }, { "input": "What is the difference in the amount of money earned from foreign exchanges between the fiscal years of 2018 and 2019?", "output": "What is the change in foreign exchange earnings between FY 2018 and FY 2019?" }, { "input": "What is the change in Cost, Insurance and Freight (CIF) value of imports from FY 2018 to FY 2019?", "output": "How has the Cost, Insurance and Freight (CIF) value of imports changed from FY 2018 to FY 2019?" }, { "input": "What is the specific amount of money charged as a sitting fee for N Chandrasekaran?", "output": "What is the sitting fee of N Chandrasekaran?" }, { "input": "What is the difference in sitting fees between O P Bhatt and Aarthi Subramanian?", "output": "What is the difference in sitting fees between O P Bhatt and Aarthi Subramanian, and in what context are these fees being compared or analyzed?" }, { "input": "What is the difference in sitting fees between O P Bhatt and Aarthi Subramanian?", "output": "What is the difference in the sitting fees charged by O P Bhatt and Aarthi Subramanian?" 
}, { "input": "What is the percentage change in pre-tax margin between 2017 and 2018?", "output": "What is the increase/ (decrease) in Pre-tax margin from 2017 to 2018" }, { "input": "What is the percentage change in Pre-tax margin between 2017 and 2018?", "output": "What is the increase/ (decrease) in Pre-tax margin from 2017 to 2018" }, { "input": "What does Net Income include?", "output": "\"What elements are included in the calculation of Net Income?\"" }, { "input": "What is the increase / (decrease) in revenue from 2018 to 2019?", "output": "How much did the revenue change from 2018 to 2019?" }, { "input": "What is the percentage increase / (decrease) in Current liabilities from 2018 to 2019?", "output": "What is the percentage change in Current Liabilities between 2018 and 2019?" }, { "input": "What caused the decrease in the actual shares?", "output": "What factors led to the decrease in the current number of shares held by individuals or entities?" }, { "input": "What was the percentage change in Earnings per Share of common stock from continuing operations (basic) compared to the previous period?", "output": "What was the increase / (decrease) from the Earnings per share of common stock from continuing operations basic?" }, { "input": "What was the percentage change in the basic earnings per share of common stock from continuing operations?", "output": "What was the increase / (decrease) from the Earnings per share of common stock from continuing operations basic?" }, { "input": "How did the performance of hardware platforms compare from year to year?", "output": "How was the performance of hardware platforms year to year" }, { "input": "What does the Pre-tax income performance implied? ", "output": "What is the meaning of Pre-tax income performance?" 
}, { "input": "What was the pre-tax margin of 2019?", "output": "\"What was the pre-tax margin percentage for the year 2019?\"" }, { "input": "What was the pre-tax margin of 2019?", "output": "\"What was the specific pre-tax margin for the fiscal year of 2019?\"" }, { "input": "What was the External gross profit margin in 2019?", "output": "What was the gross profit margin from sources external to the company for the year 2019?" }, { "input": "What is the average of Balance at January 1?", "output": "What is the average balance as of January 1st for all accounts (or customers, if applicable)?" }, { "input": "What is the average balance amount as of December 31st?", "output": "What is the average of Balance at December 31?" }, { "input": "What is the average balance amount recorded on December 31st for a specific period of time?", "output": "What is the average of Balance at December 31?" }, { "input": "What is the increase / (decrease) in the net capitalized software from 2018 to 2019?", "output": "What is the change in the net capitalized software amount from 2018 to 2019?" }, { "input": "What was the percentage change in pre-tax income from 2018 to 2019?", "output": "What was the increase / (decrease) in the Pre-tax income from 2018 to 2019?" }, { "input": "What is the percentage change in gross profit for External Operating Systems Software from 2018 to 2019?", "output": "What is the increase / (decrease) in the External Operating Systems Software gross profit from 2018 to 2019?" }, { "input": "What was the percentage increase / (decrease) in total short term debt?", "output": "What was the percentage change in the total amount of short term debt?" }, { "input": "What was the percentage change in the pre-tax margin from 2018 to 2019?", "output": "What was the increase / (decrease) in the Pre-tax margin from 2018 to 2019?" 
}, { "input": "What percentage change in the pre-tax margin occurred between 2018 and 2019?", "output": "What was the increase / (decrease) in the Pre-tax margin from 2018 to 2019?" }, { "input": "What is the meaning or significance of the launch expense?", "output": "What does launch expense represent?" }, { "input": "How many types of expenses are listed in the table?", "output": "How many categories of expenses are included in the table?" }, { "input": "What was the average adjusted EBITDA for 2018 and 2019?", "output": "What was the average amount of adjusted EBITDA for both the years 2018 and 2019?" }, { "input": "\"What is the chronological order of the gross profit for each financial year end as depicted in the table?\"", "output": "What is the gross profit for each financial year end shown in the table (in chronological order)?" }, { "input": "What was the year with the highest value for other financial expenses?", "output": "In which year was the amount of other financial expenses the largest?" }, { "input": "What was the net difference in the overall financial expenses incurred during the year 2019 as compared to 2018?", "output": "What was the change in the total financial expenses in 2019 from 2018?" }, { "input": "What was the sum of all financial costs incurred in the year 2019?", "output": "What is the total amount of financial expenses in 2019?" }, { "input": "What was the specific year when the amount of Current lease liabilities was greater than in any other year?", "output": "In which year was the amount of Current lease liabilities larger?" }, { "input": "When did the amount of Current lease liabilities exceed the previous year's amount?", "output": "In which year was the amount of Current lease liabilities larger?" }, { "input": "What is the total financial income for 2019?", "output": "What was the overall financial income for the year 2019?" 
}, { "input": "What was the total number of financial items in the year 2019?", "output": "What is the amount of total financial items in 2019?" }, { "input": "What is the total number of financial items recorded in the year 2019?", "output": "What is the amount of total financial items in 2019?" }, { "input": "What are the primary categories that are examined within the Financial Items section of the table?", "output": "What are the main categories analyzed under Financial Items in the table?" }, { "input": "In which year was the amount of financial income the smallest?", "output": "What is the year with the minimum financial income amount?" }, { "input": "What was the change in the Average invested capital less average impairment in 2019 from 2018?", "output": "What was the difference in the average amount of capital invested excluding average impairment between 2019 and 2018?" }, { "input": "What was the change in the Average invested capital less average impairment in 2019 from 2018?", "output": "What was the difference in the average amount of money invested minus the average amount of impairments in 2019 compared to 2018?" }, { "input": "What was the percentage change in retained earnings in 2019 from 2018?", "output": "What was the precise percentage difference in retained earnings between 2018 and 2019, specifically referring to the change observed in 2019?" }, { "input": "What was the percentage change in retained earnings in 2019 from 2018?", "output": "What was the percentage increase or decrease in the amount of retained earnings from the year 2018 to 2019?" }, { "input": "How is Total Cost of Ownership (TCE) earnings utilized as a widely accepted performance measure for the shipping industry?", "output": "How is TCE earnings used as a standard shipping industry performance measure?" }, { "input": "What is the specific year when the revenue reached its maximum value?", "output": "In which year was Revenue the largest?" 
}, { "input": "What items were included in the freight receivables as of 2019?", "output": "As of 2019, what did freight receivables include?" }, { "input": "What was included in the category of freight receivables as of 2019?", "output": "As of 2019, what did freight receivables include?" }, { "input": "What is the determining basis for the calculation and establishment of an allowance for expected credit loss?", "output": "What is the making of allowance for expected credit loss based on?" }, { "input": "What is the basis for determining the provision for expected credit loss?", "output": "What is the making of allowance for expected credit loss based on?" }, { "input": "What is the specific year when the total gross freight receivables reached their highest amount?", "output": "In which year was the amount of total gross freight receivables the largest?" }, { "input": "What was the net increase or decrease in the total gross freight receivables between 2018 and 2019?", "output": "What was the change in the total net gross freight receivables in 2019 from 2018?" }, { "input": "What information can be derived in note 16?", "output": "What data or details can be obtained from note 16?" }, { "input": "What information can be derived in note 16?", "output": "What kind of information can be obtained from note 16?" }, { "input": "What was the change in the carrying amount as of 31 December from 2018 to 2019?", "output": "What was the difference in the carrying amount as of December 31 between the years 2018 and 2019? Please provide the specific change in the carrying amount." }, { "input": "What are the specific components present in the table that are utilized for liquidity calculations?", "output": "What are the components in the table used to calculate liquidity?" }, { "input": "What was the difference in liquidity levels between 2019 and 2018, and how did it change over the course of the year?", "output": "What was the change in liquidity in 2019 from 2018?" 
}, { "input": "What does TORM have leases for?", "output": "What types of leases does TORM currently have?" }, { "input": "How are leases represented on the balance sheet and what specific details are included for each lease?", "output": "How is each lease reflected on the balance sheet?" }, { "input": "\"What is the asset type that experiences the highest rate of depreciation in a year?\"", "output": "Which asset type has the largest depreciation of the year?" }, { "input": "What was the difference in the amount of gross profit in 2019 compared to 2018?", "output": "What was the change in gross profit in 2019 from 2018?" }, { "input": "What was the exact percentage increase or decrease in the gross profit from 2018 to 2019?", "output": "What was the percentage change in gross profit in 2019 from 2018?" }, { "input": "What was the percentage increase or decrease in gross profit from 2018 to 2019?", "output": "What was the percentage change in gross profit in 2019 from 2018?" }, { "input": "\"What specific details or data can be found in Note 21 of the provided information?\"", "output": "What information is provided in Note 21?" }, { "input": "What was the percentage change in interest rate swaps between the year 2018 and 2019?", "output": "What was the change in interest rate swaps from 2018 to 2019?" }, { "input": "How did interest rate swaps change from 2018 to 2019?", "output": "What was the change in interest rate swaps from 2018 to 2019?" }, { "input": "What was the percentage difference in the interest rate swaps between 2018 and 2019? I would like to know the rate at which the interest rate swaps changed between these two years.", "output": "What was the percentage change in interest rate swaps from 2018 to 2019?" }, { "input": "What was the precise percentage difference in the interest rate swap market between 2018 and 2019?", "output": "What was the percentage change in interest rate swaps from 2018 to 2019?" 
}, { "input": "What is TORM's belief or estimation of the amount of net interest-bearing debt they possess?", "output": "What does TORM believe net interest-bearing debt to be?" }, { "input": "What is the methodology used to calculate the net asset value (NAV) per share in investment funds?", "output": "How is NAV/share calculated?" }, { "input": "What are the types of audit fees in the table?", "output": "What are the different types of audit fees listed in the table?" }, { "input": "In what year did the auditor receive the highest overall compensation?", "output": "In which year was the total remuneration to the auditor the largest?" }, { "input": "For which specific years were the calculations of Other Liabilities amounts conducted?", "output": "In which years were the amounts of Other Liabilities calculated for?" }, { "input": "In which specific year did the amount allocated to Partners and commercial managements surpass the amount allocated in other years?", "output": "In which year was the amount under Partners and commercial managements larger?" }, { "input": "\"What specific expenses are included in the category of operating expenses that are related to staff costs?\"", "output": "What are the staff costs included in operating expenses related to?" }, { "input": "How is the average number of employees determined and what is the methodology behind its calculation?", "output": "How is the average number of employees calculated?" }, { "input": "What was the change in the total number of permanent employees from 2018 to 2019?", "output": "How many permanent employees were added or lost between 2018 and 2019, resulting in the change in the total number of permanent employees?" }, { "input": "What does note 21 include in terms of information?", "output": "What information does note 21 contain?" }, { "input": "What does accounts receivables in the table include?", "output": "What specific components are included in the accounts receivables section of the table?" 
}, { "input": "What specific items or costs are included in the income statement as Port expenses, including bunkers and commissions?", "output": "What is recognized in the income statement under Port expenses, bunkers and commissions?" }, { "input": "How is allowance for expected credit loss of freight receivables calculated?", "output": "How is the calculation for allowance for expected credit loss determined specifically for freight receivables?" }, { "input": "\"What specific components are accounted for in the carrying amount designated for 'Vessels and capitalized dry-docking'?\"", "output": "What is included in the carrying amount for \"Vessels and capitalized dry-docking\"?" }, { "input": "What specific sub-elements, denoted in capital letters, can be found under the category of \"Vessels\" in the table related to dry-docking?", "output": "What are the sub-elements under Vessels and capitalized dry-docking in the table?" }, { "input": "What specific components or sub-elements fall under the category of Vessels and capitalized dry-docking in the table?", "output": "What are the sub-elements under Vessels and capitalized dry-docking in the table?" }, { "input": "\"What is the meaning of the phrase 'company's retirement obligations' and what specific aspects does it encompass?\"", "output": "What does the company's retirement obligations refer to?" }, { "input": "\"What are the retirement obligations of the company and what do they encompass?\"", "output": "What does the company's retirement obligations refer to?" }, { "input": "What is the company's total contractual cash obligations due in more than 5 years?", "output": "What is the total amount of cash that the company is obligated to pay according to its contracts, and when are these payments due, specifically for obligations that extend beyond a period of 5 years?" 
}, { "input": "What is the company's total contractual cash obligations due in more than 5 years?", "output": "\"What is the total amount of cash that the company owes based on contractual agreements with a maturity period of more than 5 years?\"" }, { "input": "How much does operating leases account for total contractual cash obligations for period of less than 1 year?", "output": "What is the percentage of total contractual cash obligations attributable to operating leases with a duration of less than one year?" }, { "input": "What is the specific monetary difference in the company's operating income between the years 2018 and 2019?", "output": "What is the company's increase in income from operating activities between 2018 and 2019? " }, { "input": "What is the cash balance and the amount of cash equivalents held by the company as of August 31, 2019?", "output": "What is the company's cash and cash equivalents as at 31 August 2019?" }, { "input": "What is the change in cash flow from investing activities from 2018 to 2019, specifically referring to any increase or decrease in the amount of cash generated or used in investing activities during this period?", "output": "What is the increase in cash flow from investing activities between 2018 and 2019?" }, { "input": "What is the total cashflow from investing activities in both 2018 and 2019?", "output": "What is the combined amount of cash inflows and outflows from investing activities in 2018 and 2019?" }, { "input": "What is the total cashflow from investing activities in both 2018 and 2019?", "output": "What was the net cash flow from investing activities in 2018 and 2019 combined?" }, { "input": "What is the total effect of exchange rate changes on cash and cash equivalents in both 2018 and 2019?", "output": "What is the cumulative impact of fluctuations in exchange rates on the combined balances of cash and cash equivalents for the years 2018 and 2019?" 
}, { "input": "When was the approval by Accenture's Board of Directors granted for the implementation of an open-market share purchase program specifically designed for the acquisition of Accenture's Class A ordinary shares?", "output": "When did Accenture's Board of Directors authorize and confirm an open-market share purchase program for acquiring Accenture pls Class A ordinary shares?" }, { "input": "How much of the shares purchased in July were part of publicly announced plan or programs?", "output": "What proportion of the shares acquired in the month of July were specifically disclosed as part of publicly announced plan or programs?" }, { "input": "How much of the shares purchased in July were part of publicly announced plan or programs?", "output": "What was the proportion of shares acquired in July that were part of publicly disclosed plans or programs?" }, { "input": "What was the primary source of the company's operational revenue in the year 2019?", "output": "What is the company's main source of operating income in 2019?" }, { "input": "What was the specific amount spent on expenses directly related to the production or provision of goods and services in the year 2019?", "output": "What was the cost of revenue in 2019?" }, { "input": "What was the precise amount spent on expenses directly related to generating revenue in the year 2019?", "output": "What was the cost of revenue in 2019?" }, { "input": "What was the total count of shares that were approved and allocated?", "output": "What was the number of granted shares?" }, { "input": "How many shares were approved and given to individuals or entities as stocks?", "output": "What was the number of granted shares?" }, { "input": "What is the disparity in the weighted-average grant date fair value between shares that have been allotted and subsequently forfeited?", "output": "What is the difference in weighted-average grant date fair value between granted and forfeited shares?" 
}, { "input": "What is the difference in the weighted-average grant date fair value between unvested shares and vested shares in 2018?", "output": "What is the difference in weighted-average grant date fair value for unvested shares in 2018 and vested shares?" }, { "input": "What is the discrepancy in the weighted-average grant date fair value between unvested shares and vested shares in the year 2018?", "output": "What is the difference in weighted-average grant date fair value for unvested shares in 2018 and vested shares?" }, { "input": "What was the previously reported amount of total current assets?", "output": "What was the total current assets as previously reported?" }, { "input": "What is the total value of the deferred commissions, including the current portion, and the total current assets as reported in the previous financial statement?", "output": "What was the sum of deferred commissions, current portion and total current assets as previously reported?" }, { "input": "What was the total amount reported for deferred commissions, the current portion of deferred commissions, and the overall total for current assets in the previous report?", "output": "What was the sum of deferred commissions, current portion and total current assets as previously reported?" }, { "input": "What is the revised percentage of accrued expenses in relation to the total sum of liabilities and stockholders' equity?", "output": "What was accrued expenses as revised as a percentage of total liabilities and stockholders' equity?" }, { "input": "What was the company's bad debt expense for the year ending December 31, 2018?", "output": "What was the exact amount recorded as bad debt expense by the company for the entire year ending on December 31, 2018?" }, { "input": "What was the company's bad debt expense for the year ending December 31, 2018?", "output": "What was the amount of bad debts that the company recorded as an expense for the fiscal year that ended on December 31, 2018?" 
}, { "input": "What was the total amount of interest earned as income in the year 2017?", "output": "What was interest income in 2017?" }, { "input": "What was the net income of others in 2019?", "output": "What was others, net in 2019?" }, { "input": "What was the percentage change in Interest expense between 2017 and 2018?", "output": "What was the percentage increase or decrease in Interest expense from 2017 to 2018?" }, { "input": "What is the average interest income from 2017-2019?", "output": "What is the average income generated from interest for the years 2017 to 2019 inclusive?" }, { "input": "What percentage increase or decrease in interest income occurred from 2018 to 2019?", "output": "What was the percentage change in interest income between 2018 and 2019?" }, { "input": "What is the average amount of net cash provided by operating activities for the years 2017, 2018, and 2019?", "output": "What was the average net cash provided by operating activities from 2017-2019?" }, { "input": "What was the percentage change in the free flow cash margin from 2017 to 2018?", "output": "What was the change in free flow cash margin between 2017 and 2018?" }, { "input": "What was the total amount of federal net operating losses incurred by the company in the year 2019?", "output": "What was the company's federal net operating losses in 2019?" }, { "input": "What was the amount of federal net operating losses incurred by the company during the year 2019 for tax purposes?", "output": "What was the company's federal net operating losses in 2019?" }, { "input": "What was the company's state net operating losses in 2019?", "output": "What was the total amount of net operating losses incurred by the company in the state during the year 2019?" }, { "input": "What is the percentage change in Deferred revenue between 2018 and 2019?", "output": "What is the exact percentage difference in Deferred revenue from 2018 to 2019?" 
}, { "input": "What was the precise amount spent on sales and marketing in the year 2018?", "output": "What was the sales and marketing expense in 2018?" }, { "input": "What was the specific fair value amount of cash that is being referred to in the context of a certain situation or event?", "output": "What was the fair value amount of cash?" }, { "input": "What was the specific fair value figure assigned to the amount of cash? Please provide the determined fair value amount for the cash in question.", "output": "What was the fair value amount of cash?" }, { "input": "What percentage of fair value total cash equivalents consist of agency bonds?", "output": "What is the proportion of agency bonds, in terms of percentage, in the total cash equivalents that make up the fair value?" }, { "input": "What updates and advancements were made in computer equipment and software from 2018 to 2019?", "output": "What is the change in computer equipment and software between 2018 and 2019?" }, { "input": "What are the differences in computer equipment and software between the years 2018 and 2019?", "output": "What is the change in computer equipment and software between 2018 and 2019?" }, { "input": "What was the percentage change in revenue from research and development from 2017 to 2018?", "output": "What was the change in percentage of revenue of research and development between 2017 and 2018?" }, { "input": "What are the main promises or obligations that the company is committed to?", "output": "What are the company's principal commitments?" }, { "input": "What are the main commitments of the company?", "output": "What are the company's principal commitments?" }, { "input": "What is the sum of total operating lease obligations and Long-term debt obligations including interest?", "output": "What is the combined amount of operating lease obligations and long-term debt obligations, including accrued interest?" 
}, { "input": "What portion of the total obligations is attributable to payments that are due within a time frame of 1-3 years?", "output": "What is the percentage of the total obligations that consists of payments due in 1-3 years?" }, { "input": "What is the specific percentage of the overall obligations that are categorized as payments to be made within a period of 1-3 years?", "output": "What is the percentage of the total obligations that consists of payments due in 1-3 years?" }, { "input": "\"What are the regulations or guidelines regarding the payment of Foreign income tax applicable for the year 2019?\"", "output": "What is the provision for Foreign income tax in 2019?" }, { "input": "\"What are the regulations or rules implemented in 2019 regarding the taxation of foreign income?\"", "output": "What is the provision for Foreign income tax in 2019?" }, { "input": "What is the disparity in the weighted-average grant date fair value when comparing shares that have been granted to those that have been forfeited?", "output": "What is the difference in weighted-average grant date fair value between granted and forfeited shares?" }, { "input": "What is the disparity in weighted-average grant date fair value when comparing shares that have been granted with those that have been forfeited?", "output": "What is the difference in weighted-average grant date fair value between granted and forfeited shares?" }, { "input": "What is the Net sales for 2019?", "output": "What was the total global revenue generated from sales in the year 2019?" }, { "input": "What was the percentage change in diluted net income per share from continuing operations from 2017 to 2018?", "output": "What was the change in Diluted Net income per share from continuing operations in 2018 from 2017?" 
}, { "input": "What was the percentage increase or decrease in the Diluted Net income per share from continuing operations between 2018 and 2017?", "output": "What was the percentage change in Diluted Net income per share from continuing operations in 2018 from 2017?" }, { "input": "What does the Euro Term Loan due 2024 consist of?", "output": "What is included in the Euro Term Loan due in 2024?" }, { "input": "What does the Euro Term Loan due 2024 consist of?", "output": "\"What are the components and details of the Euro Term Loan due in 2024?\"" }, { "input": "In which year did the value of the 1.0% State of Connecticut term loan, which has a maturity date in 2023, surpass its previous size?", "output": "In which year was the 1.0% State of Connecticut term loan due 2023 larger?" }, { "input": "In which year did the size of the 1.0% State of Connecticut term loan due in 2023 increase?", "output": "In which year was the 1.0% State of Connecticut term loan due 2023 larger?" }, { "input": "What was the change in Capital lease obligations from 2018 to 2019?", "output": "What was the exact difference in the amount of Capital lease obligations between the years 2018 and 2019?" }, { "input": "In which year was ILS a higher percentage of total net sales?", "output": "In which specific year did the Incremental Lifetime Sales (ILS) constitute a larger proportion of the overall net sales?" }, { "input": "What were the net sales figures for ILS in both 2018 and 2019, and what was the average amount for each year?", "output": "What was the average amount of net sales for ILS in 2018 and 2019?" }, { "input": "What were the average net sales for ILS for the years 2018 and 2019?", "output": "What was the average amount of net sales for ILS in 2018 and 2019?" }, { "input": "What is the estimated duration of a building and any associated enhancements before they are deemed no longer useful?", "output": "What is the useful life of Buildings and improvements?" 
}, { "input": "How is property and equipment stated in the table?", "output": "In what manner is property and equipment presented and described within the table?" }, { "input": "In which year was the amount of Land larger?", "output": "\"When comparing the years, which year had a larger amount of land?\"" }, { "input": "What was the change in Leasehold improvements from 2018 to 2019?", "output": "What was the difference in Leasehold improvements value between 2018 and 2019?" }, { "input": "What was the percentage change in Short-term investments in 2019 from 2018?", "output": "What was the percentage difference in the value of Short-term investments between 2019 and 2018?" }, { "input": "What was the percentage change in Short-term investments in 2019 from 2018?", "output": "What was the exact percentage difference in the value of Short-term investments between 2018 and 2019?" }, { "input": "What was the net income for the year 2019?", "output": "What was Net income in 2019?" }, { "input": "\"In which specific year did the company report the highest amount of comprehensive income?\"", "output": "In which year was Comprehensive income largest?" }, { "input": "What is the largest year for Comprehensive income?", "output": "In which year was Comprehensive income largest?" }, { "input": "What was the difference in the amount of money earned in 2018 compared to 2017 after all expenses and taxes were accounted for?", "output": "What was the change in Net income in 2018 from 2017?" }, { "input": "What was the Projected Benefit Obligation (PBO) amount for the year 2019? Please provide the numerical value for the PBO as of December 31st, 2019.", "output": "What was the Projected benefit obligation in 2019?" }, { "input": "In which years was information on defined benefit plans provided?", "output": "During which specific years was information regarding defined benefit plans provided?" 
}, { "input": "In which year was the amount of Research and development largest?", "output": "In which specific year did Research and development reach its highest amount or expenditure?" }, { "input": "What was the specific amount allocated for income taxes in the year 2019?", "output": "What was the Provision for income taxes in 2019?" }, { "input": "What specific amount was set aside for income taxes in the fiscal year of 2019?", "output": "What was the Provision for income taxes in 2019?" }, { "input": "What has been the modification in the amount of state income taxes that individuals are required to pay in 2019 compared to 2018?", "output": "What was the change in state income taxes that is currently payable in 2019 from 2018?" }, { "input": "How much Microelectronics was present in the year 2019?", "output": "What is the amount of Microelectronics in 2019?" }, { "input": "What is the total volume or quantity of Microelectronics produced or used worldwide in the year 2019?", "output": "What is the amount of Microelectronics in 2019?" }, { "input": "In which years is net sales calculated?", "output": "In which specific years does the calculation of net sales occur?" }, { "input": "In which years is net sales calculated?", "output": "During which specific years is the calculation of net sales undertaken?" }, { "input": "What was the percentage difference in the quantity of OEM components and instrumentation between 2018 and 2019?", "output": "What was the percentage change in the amount of OEM components and instrumentation in 2019 from 2018?" }, { "input": "What was the Net cash provided by operating activities in 2019?", "output": "What was the amount of net cash generated from operating activities during the year 2019?" }, { "input": "What was the Purchases of property and equipment in 2018?", "output": "What was the total value of purchases made for property and equipment in the year 2018?" 
}, { "input": "In which specific years are the instances or applications of physical currency emphasized or showcased in the table?", "output": "In which years are the uses of cash highlighted in the table?" }, { "input": "What years are specified in the table to illustrate the purpose of utilizing cash?", "output": "In which years are the uses of cash highlighted in the table?" }, { "input": "In which year was the Issuance of shares under employee stock plans larger?", "output": "When comparing the issuance of shares under employee stock plans, in which specific year was the volume of shares issued found to be greater?" }, { "input": "In which year were the Additions charged to expenses the largest?", "output": "What is the specific year when the largest Additions were charged to expenses?" }, { "input": "In which year were the Additions charged to expenses the largest?", "output": "What was the year when the largest amount of Additions were charged to expenses?" }, { "input": "In which year did tax fees have a higher value or amount than in other years?", "output": "In which year were tax fees larger?" }, { "input": "What was the difference in the amount of all other fees charged in 2019 compared to 2018?", "output": "What was the change in All other fees in 2019 from 2018?" }, { "input": "What was Long-term taxes payable in 2019?", "output": "What was the amount of Long-term taxes payable in the year 2019?" }, { "input": "In which year was Other long-term liabilities larger?", "output": "\"In which specific year did Other long-term liabilities exhibit a greater amount compared to the other years?\"" }, { "input": "In 2019, what was the exact amount of deferred compensation liability?", "output": "What was the Total deferred compensation liability in 2019?" }, { "input": "For which years was the calculation of Total deferred compensation liability performed?", "output": "In which years was Total deferred compensation liability calculated?" 
}, { "input": "How was the calculation for diluted earnings per share derived?", "output": "How was diluted earnings per share computed?" }, { "input": "In which specific years can we find the earnings per share data that was included in the table?", "output": "In which years was earnings per share provided in the table?" }, { "input": "What year experienced the highest dilutive impact from employee stock awards?", "output": "In which year was the Dilutive effect of employee stock awards largest?" }, { "input": "In what specific year did employee stock awards have the most significant dilutive effect?", "output": "In which year was the Dilutive effect of employee stock awards largest?" }, { "input": "What was the change in Dilutive effect of employee stock awards in 2019 from 2018?", "output": "What was the difference in the dilutive impact caused by employee stock awards between 2019 and 2018?" }, { "input": "What was the change in Dilutive effect of employee stock awards in 2019 from 2018?", "output": "What was the difference in how much employee stock awards had a dilutive effect in 2019 compared to 2018?" }, { "input": "What was the percentage change in Dilutive effect of employee stock awards in 2019 from 2018?", "output": "What was the percentage change in the dilutive effect of employee stock awards from 2018 to 2019?" }, { "input": "In which specific year did the Amortization of intangible assets account for a larger proportion of the overall net sales?", "output": "In which year was Amortization of intangible assets a higher percentage of total net sales?" }, { "input": "What was the change in the amount of Amortization of intangible assets in 2019 from 2018?", "output": "What was the difference in the amount of Amortization of intangible assets between the fiscal years 2019 and 2018?" }, { "input": "What was the value of Net deferred tax assets in the financial year of 2019?", "output": "What was the Net deferred tax assets in 2019?" 
}, { "input": "What was the difference in the initial balance for 2019 compared to 2018?", "output": "What was the change in Beginning balance in 2019 from 2018?" }, { "input": "What was the difference in the starting balance in 2019 compared to 2018?", "output": "What was the change in Beginning balance in 2019 from 2018?" }, { "input": "What was the balance in the accounts as of the start of the year in 2018?", "output": "What was the Balance as of the beginning of the year in 2018?" }, { "input": "\"What was the exact monetary balance in the accounts as of January 1st, 2018, at the start of the year?\"", "output": "What was the Balance as of the beginning of the year in 2018?" }, { "input": "What was the percentage change in Balance as of the beginning of the year in 2019 from 2018?", "output": "What is the percentage change in the Balance as of the start of 2019 compared to the Balance at the start of 2018?" }, { "input": "What was the percentage change in Prepaid and refundable income taxes from 2018 to 2019?", "output": "What was the percentage difference in the amount of income taxes that were prepaid and refundable between the years 2018 and 2019?" }, { "input": "What was the percentage change in Prepaid and refundable income taxes from 2018 to 2019?", "output": "What was the percentage difference in the amount of prepaid and refundable income taxes between 2018 and 2019?" }, { "input": "What does the Current portion of Euro Term Loan consist of?", "output": "What is included in the current portion of the Euro Term Loan?" }, { "input": "What was the net change in the amount of outstanding Capital lease obligations between the years 2018 and 2019?", "output": "What was the change in Capital lease obligations from 2018 to 2019?" }, { "input": "What was the percentage increase or decrease in Capital lease obligations from 2018 to 2019?", "output": "What was the percentage change in Capital lease obligations from 2018 to 2019?" 
}, { "input": "What was the percentage increase or decrease in Capital lease obligations between 2018 and 2019?", "output": "What was the percentage change in Capital lease obligations from 2018 to 2019?" }, { "input": "What are the respective values of the company's work in process inventories on March 31, 2018 and 2019?", "output": "What were the specific values of the company's work in process inventories as of March 31 in both 2018 and 2019?" }, { "input": "What are the respective values of the company's work in process inventories on March 31, 2018 and 2019?", "output": "What is the dollar amount of the work in process inventories for the company on March 31 of both 2018 and 2019?" }, { "input": "What was the value of the company's finished goods on March 31, 2018 and March 31, 2019?", "output": "What are the respective values of the company's finished goods on March 31, 2018 and 2019?" }, { "input": "What is the change in the company's raw materials on March 31, 2018 and 2019?", "output": "What is the difference in the amount of raw materials held by the company as of March 31, 2018 and March 31, 2019?" }, { "input": "What was the mean value of the company's raw materials on March 31 in both 2018 and 2019?", "output": "What is the average value of the company's raw materials on March 31, 2018 and 2019?" }, { "input": "What is the average worth of the company's raw materials specifically on March 31st, both in the years 2018 and 2019?", "output": "What is the average value of the company's raw materials on March 31, 2018 and 2019?" }, { "input": "What is the mean amount of the company's completed products on March 31, in both the years 2018 and 2019?", "output": "What is the average value of the company's finished goods on March 31, 2018 and 2019?" 
}, { "input": "What is the average value of the finished goods held by the company on March 31 in both 2018 and 2019?", "output": "What is the average value of the company's finished goods on March 31, 2018 and 2019?" }, { "input": "What are the company's respective foreign income before taxes in 2018 and 2019?", "output": "What were the pre-tax foreign incomes of the company in 2018 and 2019?" }, { "input": "What is the company's average loss before income taxes from the United States in 2018 and 2019?", "output": "\"What was the average pre-tax loss incurred by the company in the United States for the years 2018 and 2019?\"" }, { "input": "What is the company's average loss before income taxes from the United States in 2018 and 2019?", "output": "What was the average pre-tax income loss for the company in the United States for both 2018 and 2019?" }, { "input": "What is the company's average foreign income before taxes in 2018 and 2019?", "output": "What was the average amount of foreign income before taxes for the company in both 2018 and 2019?" }, { "input": "What are the average pre-tax earnings of the company for the fiscal years 2018 and 2019?", "output": "What is the company's average total income before income taxes in 2018 and 2019?" }, { "input": "What are the specific cost of goods sold for the company in both 2018 and 2019?", "output": "What are the company's respective cost of goods sold in 2018 and 2019?" }, { "input": "What were the cost of goods sold for the company in 2018 and 2019?", "output": "What are the company's respective cost of goods sold in 2018 and 2019?" 
class RewriteQuestionPipeline(BaseComponent):
    """Paraphrase a user question with an LLM before it is answered.

    Args:
        llm: chat model that writes the rephrased question; the node's default
            callback returns `llms.get_default()`
        rewrite_template: prompt template populated with `question` and `lang`
        lang: language the model is asked to give its answer in
    """

    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    rewrite_template: str = DEFAULT_REWRITE_PROMPT

    lang: str = "English"

    def run(self, question: str) -> Document:  # type: ignore
        """Populate the template with `question` and return the LLM's reply."""
        filled_prompt = PromptTemplate(self.rewrite_template).populate(
            question=question, lang=self.lang
        )
        # A fixed system turn followed by the populated rewrite request.
        system_turn = SystemMessage(content="You are a helpful assistant")
        user_turn = HumanMessage(content=filled_prompt)
        return self.llm([system_turn, user_turn])
class SuggestFollowupQuesPipeline(BaseComponent):
    """Suggest a list of follow-up questions based on the chat history.

    The tail of the chat history is replayed to the LLM as alternating
    human/AI messages, followed by one extra human message that asks for
    follow-up questions in JSON form.
    """

    # Chat model that generates the suggestions; the node's default callback
    # returns `llms.get_default()`.
    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())
    # Instruction sent as the final human message. `run` populates only the
    # `{lang}` placeholder.
    SUGGEST_QUESTIONS_PROMPT_TEMPLATE = (
        "Based on the chat history above. "
        "your task is to generate 3 to 5 relevant follow-up questions. "
        "These questions should be simple, very concise, "
        "and designed to guide the conversation further. "
        "Respond in JSON format with 'questions' key. "
        "Answer using the language {lang} same as the question. "
    )
    prompt_template: str = SUGGEST_QUESTIONS_PROMPT_TEMPLATE
    # Appended verbatim AFTER the template has been populated, so this text is
    # never passed through PromptTemplate (presumably so that its literal JSON
    # braces are not read as placeholders).
    # NOTE(review): the line breaks inside this triple-quoted literal are not
    # recoverable from the whitespace-collapsed source; confirm the exact text
    # against the upstream file.
    extra_prompt: str = """Example of valid response: ```json { "questions": ["the weather is good", "what's your favorite city"] } ```"""
    # Language name substituted for `{lang}` in the prompt.
    lang: str = "English"

    def run(self, chat_history: list[tuple[str, str]]) -> Document:
        """Ask the LLM for follow-up questions to the conversation.

        Args:
            chat_history: (human message, AI message) pairs. Only the last
                three pairs of the list are sent to the model.

        Returns:
            Whatever `self.llm` returns for the assembled message list.
        """
        prompt_template = PromptTemplate(self.prompt_template)
        prompt = prompt_template.populate(lang=self.lang) + self.extra_prompt
        messages = []
        # Replay the last three exchanges as alternating human/AI turns.
        for human, ai in chat_history[-3:]:
            messages.append(HumanMessage(content=human))
            messages.append(AIMessage(content=ai))
        # The instruction comes last, which is why the prompt refers to
        # "the chat history above".
        messages.append(HumanMessage(content=prompt))
        return self.llm(messages)
def _run_tool(self, query: AnyStr) -> AnyStr:
    """Search every configured retriever and return the merged evidence.

    Args:
        query: search text, passed to each retriever as `text`.

    Returns:
        The result of `self.prepare_evidence` applied to the de-duplicated
        documents, in the order they were first retrieved.
    """
    unique_docs = []
    seen_ids = []
    for search in self.retrievers:
        for hit in search(text=query):
            # The same document may come back from several retrievers;
            # keep only its first occurrence.
            if hit.doc_id in seen_ids:
                continue
            seen_ids.append(hit.doc_id)
            unique_docs.append(hit)

    return self.prepare_evidence(unique_docs)
Table from {source}\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "chatbot": retrieved_content = retrieved_item.metadata["window"] evidence += ( f"
Chatbot scenario from {filename} (Row {page})\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "image": retrieved_content = retrieved_item.metadata.get("image_origin", "") retrieved_caption = html.escape(retrieved_item.get_content()) evidence += ( f"
Figure from {source}\n" + retrieved_caption + "\n
" ) else: if "window" in retrieved_item.metadata: retrieved_content = retrieved_item.metadata["window"] else: retrieved_content = retrieved_item.text retrieved_content = retrieved_content.replace("\n", " ") if retrieved_content not in evidence: evidence += ( f"
Content from {source}: " + retrieved_content + " \n
" ) print("Retrieved #{}: {}".format(_id, retrieved_content[:100])) print("Score", retrieved_item.metadata.get("reranking_score", None)) # trim context by trim_len if evidence: text_splitter = CharacterTextSplitter.from_tiktoken_encoder( chunk_size=trim_len, chunk_overlap=0, separator=" ", model_name="gpt-3.5-turbo", ) texts = text_splitter.split_text(evidence) evidence = texts[0] return Document(content=evidence) TOOL_REGISTRY = { "Google": GoogleSearchTool(), "Wikipedia": WikipediaTool(), "LLM": LLMTool(), "SearchDoc": DocSearchTool(), } DEFAULT_QA_PROMPT = ( "Answer the following questions as best you can. Give answer in {lang}. " "You have access to the following tools:\n" "{tool_description}\n" "Use the following format:\n\n" "Question: the input question you must answer\n" "Thought: you should always think about what to do\n\n" "Action: the action to take, should be one of [{tool_names}]\n\n" "Action Input: the input to the action, should be different from the action input " "of the same action in previous steps.\n\n" "Observation: the result of the action\n\n" "... (this Thought/Action/Action Input/Observation can repeat N times)\n" "#Thought: I now know the final answer\n" "Final Answer: the final answer to the original input question\n\n" "Begin! After each Action Input.\n\n" "Question: {instruction}\n" "Thought: {agent_scratchpad}\n" ) DEFAULT_REWRITE_PROMPT = ( "Given the following question, rephrase and expand it " "to help you do better answering. Maintain all information " "in the original question. Keep the question as concise as possible. " "Give answer in {lang}\n" "Original question: {question}\n" "Rephrased question: " ) class RewriteQuestionPipeline(BaseComponent): """Rewrite user question Args: llm: the language model to rewrite question rewrite_template: the prompt template for llm to paraphrase a text input lang: the language of the answer. 
async def ainvoke(  # type: ignore
    self, message, conv_id: str, history: list, **kwargs  # type: ignore
) -> Document:
    """Answer `message` with the ReAct agent and report the outputs.

    Args:
        message: the user question. When `self.use_rewrite` is set it is first
            replaced by the rewritten question.
        conv_id: not used by this method.
        history: not used by this method.
        **kwargs: not used by this method.

    Returns:
        The object returned by `self.agent(message)`.
    """
    import inspect  # local import: only needed for the optional rewrite step

    if self.use_rewrite:
        # RewriteQuestionPipeline.run is a plain (non-async) method and
        # `stream()` uses the result of this same call without awaiting it, so
        # an unconditional `await` would fail on a non-awaitable result.
        # Await only when the call really returned an awaitable.
        rewrite = self.rewrite_pipeline(question=message)
        if inspect.isawaitable(rewrite):
            rewrite = await rewrite
        message = rewrite.text

    answer = self.agent(message)
    # Final answer goes to the chat channel ...
    self.report_output(Document(content=answer.text, channel="chat"))

    # ... and the output of every intermediate step to the info channel.
    for _, step_output in answer.intermediate_steps:
        self.report_output(Document(content=step_output, channel="info"))

    # NOTE(review): `None` presumably marks the end of the output stream;
    # confirm against `report_output`.
    self.report_output(None)
    return answer
@classmethod
def get_pipeline(
    cls, settings: dict, states: dict, retrievers: list | None = None
) -> BaseReasoning:
    """Build a ReAct pipeline configured from the user settings.

    Args:
        settings: flat settings mapping. Reads
            `reasoning.options.<id>.llm`, `.max_iterations`, `.tools` and
            `.qa_prompt`, plus `reasoning.lang` and the optional
            `reasoning.max_context_length`.
        states: application state; `states["app"]["regen"]` (default False)
            turns on question rewriting.
        retrievers: retrievers given to the pipeline and to the `SearchDoc`
            tool. `None` is treated as an empty list.

    Returns:
        The configured `ReactAgentPipeline`.

    Raises:
        KeyError: if a required setting is missing or a non-MCP tool name is
            not in `TOOL_REGISTRY`.
    """
    _id = cls.get_info()["id"]
    prefix = f"reasoning.options.{_id}"
    mcp_prefix = "[MCP] "

    # The SearchDoc tool iterates over its retrievers, so never hand it None.
    if retrievers is None:
        retrievers = []

    llm_name = settings[f"{prefix}.llm"]
    llm = llms.get(llm_name, llms.get_default())

    max_context_length_setting = settings.get("reasoning.max_context_length", None)

    pipeline = ReactAgentPipeline(retrievers=retrievers)
    pipeline.agent.llm = llm
    pipeline.agent.max_iterations = settings[f"{prefix}.max_iterations"]

    if max_context_length_setting:
        # The overall context budget is divided by DEFAULT_AGENT_STEPS
        # (presumably to get a per-step budget).
        pipeline.agent.max_context_length = (
            max_context_length_setting // DEFAULT_AGENT_STEPS
        )

    tools = []
    for tool_name in settings[f"{prefix}.tools"]:
        if tool_name.startswith(mcp_prefix):
            entry = mcp_manager.get(tool_name.removeprefix(mcp_prefix))
            if entry:
                # Work on a copy: popping "enabled_tools" from the entry's own
                # dict would remove the key from the entry itself, so a later
                # call could see a config without its tool filter (if the
                # manager hands out its stored dict; not visible here).
                config = dict(entry["config"])
                enabled_tools = config.pop("enabled_tools", None)
                tools.extend(create_tools_from_config(config, enabled_tools))
        else:
            # NOTE(review): TOOL_REGISTRY holds module-level tool instances,
            # so the retrievers / llm assigned below are shared by every
            # pipeline that uses the same tool.
            tool = TOOL_REGISTRY[tool_name]
            if tool_name == "SearchDoc":
                tool.retrievers = retrievers
            elif tool_name == "LLM":
                tool.llm = llm
            tools.append(tool)

    pipeline.agent.plugins = tools
    pipeline.agent.output_lang = SUPPORTED_LANGUAGE_MAP.get(
        settings["reasoning.lang"], "English"
    )
    # Question rewriting is enabled only when the app state flags a regen.
    pipeline.use_rewrite = states.get("app", {}).get("regen", False)
    pipeline.agent.prompt_template = PromptTemplate(settings[f"{prefix}.qa_prompt"])

    return pipeline
def get_user_settings(cls) -> dict:
    """Describe the user-configurable settings of the ReAct pipeline.

    Returns:
        A mapping from setting name (``llm``, ``tools``, ``max_iterations``,
        ``qa_prompt``) to its UI description and default value.
    """
    default_llm = ""

    # the empty value stands for the application default model
    llm_choices = [("(default)", "")]
    try:
        registered = [(name, name) for name in llms.options().keys()]
        llm_choices += registered
    except Exception as exc:
        logger.exception(f"Failed to get LLM options: {exc}")

    tool_choices = ["Wikipedia", "Google", "LLM", "SearchDoc"]
    try:
        tool_choices += mcp_manager.get_enabled_tools()
    except Exception as exc:
        logger.exception(f"Failed to get MCP tool options: {exc}")

    llm_setting = {
        "name": "Language model",
        "value": default_llm,
        "component": "dropdown",
        "choices": llm_choices,
        "special_type": "llm",
        "info": (
            "The language model to use for generating the answer. If None, "
            "the application default language model will be used."
        ),
    }
    tools_setting = {
        "name": "Tools for knowledge retrieval",
        "value": ["SearchDoc", "LLM"],
        "component": "checkboxgroup",
        "choices": tool_choices,
    }
    iterations_setting = {
        "name": "Maximum number of iterations the LLM can go through",
        "value": 5,
        "component": "number",
    }
    prompt_setting = {
        "name": "QA Prompt",
        "value": DEFAULT_QA_PROMPT,
    }

    return {
        "llm": llm_setting,
        "tools": tools_setting,
        "max_iterations": iterations_setting,
        "qa_prompt": prompt_setting,
    }
), } ================================================ FILE: libs/ktem/ktem/reasoning/rewoo.py ================================================ import html import logging from difflib import SequenceMatcher from typing import AnyStr, Generator, Optional, Type from ktem.llms.manager import llms from ktem.mcp.manager import mcp_manager from ktem.reasoning.base import BaseReasoning from ktem.utils.generator import Generator as GeneratorWrapper from ktem.utils.render import Render from langchain.text_splitter import CharacterTextSplitter from pydantic import BaseModel, Field from kotaemon.agents import ( BaseTool, GoogleSearchTool, LLMTool, RewooAgent, WikipediaTool, ) from kotaemon.agents.tools.mcp import create_tools_from_config from kotaemon.base import BaseComponent, Document, HumanMessage, Node, SystemMessage from kotaemon.llms import ChatLLM, PromptTemplate from ..utils import SUPPORTED_LANGUAGE_MAP logger = logging.getLogger(__name__) DEFAULT_AGENT_STEPS = 4 DEFAULT_PLANNER_PROMPT = ( "You are an AI agent who makes step-by-step plans to solve a problem under the " "help of external tools. For each step, make one plan followed by one tool-call, " "which will be executed later to retrieve evidence for that step.\n" "You should store each evidence into a distinct variable #E1, #E2, #E3 ... that " "can be referred to in later tool-call inputs.\n\n" "##Available Tools##\n" "{tool_description}\n\n" "##Output Format (Replace '<...>')##\n" "#Plan1: \n" "#E1: [] (eg. Search[What is Python])\n" "#Plan2: \n" "#E2: []\n" "And so on...\n\n" "##Your Task##\n" "{task}\n\n" "##Now Begin##\n" ) DEFAULT_SOLVER_PROMPT = ( "You are an AI agent who solves a problem with my assistance. I will provide " "step-by-step plans(#Plan) and evidences(#E) that could be helpful.\n" "Your task is to briefly summarize each step, then make a short final conclusion " "for your task. 
def _run_tool(self, query: AnyStr) -> AnyStr:
    """Query every retriever and turn the unique hits into evidence.

    Args:
        query: the search query, passed to each retriever as ``text``.

    Returns:
        Whatever ``self.prepare_evidence`` returns for the de-duplicated
        documents, kept in first-seen order.
    """
    docs = []
    # a set gives O(1) membership tests; assumes ``doc_id`` is hashable
    seen_ids = set()
    for retriever in self.retrievers:
        for doc in retriever(text=query):
            doc_id = doc.doc_id
            if doc_id in seen_ids:
                continue
            seen_ids.add(doc_id)
            docs.append(doc)

    return self.prepare_evidence(docs)
Table from {source}\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "chatbot": retrieved_content = retrieved_item.metadata["window"] evidence += ( f"
Chatbot scenario from {filename} (Row {page})\n" + retrieved_content + "\n
" ) elif retrieved_item.metadata.get("type", "") == "image": retrieved_content = retrieved_item.metadata.get("image_origin", "") retrieved_caption = html.escape(retrieved_item.get_content()) # PWS doesn't support VLM for images, we will just store the caption evidence += ( f"
Figure from {source}\n" + retrieved_caption + "\n
" ) else: if "window" in retrieved_item.metadata: retrieved_content = retrieved_item.metadata["window"] else: retrieved_content = retrieved_item.text retrieved_content = retrieved_content.replace("\n", " ") if retrieved_content not in evidence: evidence += ( f"
Content from {source}: " + retrieved_content + " \n
" ) print("Retrieved #{}: {}".format(_id, retrieved_content)) print("Score", retrieved_item.metadata.get("reranking_score", None)) # trim context by trim_len if evidence: text_splitter = CharacterTextSplitter.from_tiktoken_encoder( chunk_size=trim_len, chunk_overlap=0, separator=" ", model_name="gpt-3.5-turbo", ) texts = text_splitter.split_text(evidence) evidence = texts[0] return Document(content=evidence) TOOL_REGISTRY = { "Google": GoogleSearchTool(), "Wikipedia": WikipediaTool(), "LLM": LLMTool(), "SearchDoc": DocSearchTool(), } DEFAULT_REWRITE_PROMPT = ( "Given the following question, rephrase and expand it " "to help you do better answering. Maintain all information " "in the original question. Keep the question as concise as possible. " "Give answer in {lang}\n" "Original question: {question}\n" "Rephrased question: " ) class RewriteQuestionPipeline(BaseComponent): """Rewrite user question Args: llm: the language model to rewrite question rewrite_template: the prompt template for llm to paraphrase a text input lang: the language of the answer. 
def find_text(llm_output, context):
    """Locate each line of ``llm_output`` inside ``context``.

    For every line of ``llm_output`` the longest block it shares with
    ``context`` is computed with ``difflib.SequenceMatcher``.

    Args:
        llm_output: text whose lines (split on ``"\\n"``) are searched for.
        context: text to search in.

    Returns:
        A list of ``(start, end)`` offsets into ``context``, one per line
        that shares at least one character with it. Lines with no match
        (including blank lines) are skipped: a zero-length ``(0, 0)`` entry
        would make the caller treat offset 0 as the start of a highlighted
        span.
    """
    matches = []
    for sentence in llm_output.split("\n"):
        match = SequenceMatcher(
            None, sentence, context, autojunk=False
        ).find_longest_match()
        if match.size == 0:
            continue
        matches.append((match.b, match.b + match.size))
    return matches
" content.append(line) elif line.startswith("#"): # stop markdown from rendering big headers line = "\\" + line content.append(line) else: content.append(line) if not header: return return Document( channel="info", content=Render.collapsible( header=header, content=Render.table("\n".join(content)), open=False, ), ) def format_info_panel_planner(self, planner_output): planner_output = planner_output.replace("\n", "
") return Document( channel="info", content=Render.collapsible( header="Planner Output", content=planner_output, open=True, ), ) def prepare_citation(self, answer) -> list[Document]: """Prepare citation to show on the UI""" segments = [] split_indices = [ 0, ] start_indices = set() text = "" if "citation" in answer.metadata and answer.metadata["citation"] is not None: context = answer.metadata["worker_log"] for evidence in answer.metadata["citation"].evidences: matches = find_text(evidence, context) for match in matches: split_indices.append(match[0]) split_indices.append(match[1]) start_indices.add(match[0]) split_indices = sorted(list(set(split_indices))) spans = [] prev = 0 for index in split_indices: if index > prev: spans.append(context[prev:index]) prev = index spans.append(context[split_indices[-1] :]) prev = 0 for span, start_idx in list(zip(spans, split_indices)): if start_idx in start_indices: text += Render.highlight(span) else: text += span else: text = answer.metadata["worker_log"] # separate text by detect header: #Plan for line in text.splitlines(): if line.startswith("#Plan"): # line starts with #Plan should be marked as a new segment new_segment = [line] segments.append(new_segment) elif line.startswith("#Action"): # small fix for markdown output line = "\\" + line + "
@classmethod
def get_pipeline(
    cls, settings: dict, states: dict, retrievers: list | None = None
) -> BaseReasoning:
    """Build a ``RewooAgentPipeline`` configured from the user settings.

    Args:
        settings: flat settings mapping. Keys under
            ``reasoning.options.<id>.`` (``planner_llm``, ``solver_llm``,
            ``tools``, ``planner_prompt``, ``solver_prompt``,
            ``highlight_citation``), plus ``reasoning.lang`` and the optional
            ``reasoning.max_context_length``, are read.
        states: application state; ``states["app"]["regen"]`` (default
            ``False``) decides whether the question is rewritten.
        retrievers: retrievers handed to the pipeline and to the
            ``SearchDoc`` tool.

    Returns:
        The configured pipeline.
    """
    _id = cls.get_info()["id"]
    prefix = f"reasoning.options.{_id}"
    pipeline = RewooAgentPipeline(retrievers=retrievers)

    max_context_length_setting = settings.get("reasoning.max_context_length", None)

    planner_llm_name = settings[f"{prefix}.planner_llm"]
    planner_llm = llms.get(planner_llm_name, llms.get_default())
    solver_llm_name = settings[f"{prefix}.solver_llm"]
    solver_llm = llms.get(solver_llm_name, llms.get_default())

    pipeline.agent.planner_llm = planner_llm
    pipeline.agent.solver_llm = solver_llm
    if max_context_length_setting:
        # the total context budget is split evenly across the agent steps
        pipeline.agent.max_context_length = (
            max_context_length_setting // DEFAULT_AGENT_STEPS
        )

    tools = []
    for tool_name in settings[f"{prefix}.tools"]:
        if tool_name.startswith("[MCP] "):
            server_name = tool_name[len("[MCP] ") :]
            entry = mcp_manager.get(server_name)
            if entry:
                # Work on a copy: popping from the object held by the manager
                # would drop "enabled_tools" for every later pipeline build.
                config = dict(entry["config"])
                enabled_tools = config.pop("enabled_tools", None)
                mcp_tools = create_tools_from_config(config, enabled_tools)
                tools.extend(mcp_tools)
        else:
            # NOTE(review): registry entries are module-level instances that
            # are configured in place here — confirm this sharing is intended.
            tool = TOOL_REGISTRY[tool_name]
            if tool_name == "SearchDoc":
                tool.retrievers = retrievers
            elif tool_name == "LLM":
                tool.llm = solver_llm
            tools.append(tool)
    pipeline.agent.plugins = tools
    pipeline.agent.output_lang = SUPPORTED_LANGUAGE_MAP.get(
        settings["reasoning.lang"], "English"
    )
    pipeline.agent.prompt_template["Planner"] = PromptTemplate(
        settings[f"{prefix}.planner_prompt"]
    )
    pipeline.agent.prompt_template["Solver"] = PromptTemplate(
        settings[f"{prefix}.solver_prompt"]
    )

    pipeline.enable_citation = settings[f"{prefix}.highlight_citation"]
    pipeline.use_rewrite = states.get("app", {}).get("regen", False)
    pipeline.rewrite_pipeline.llm = (
        planner_llm  # TODO: separate llm for rewrite if needed
    )
    return pipeline
"Language model for Planner", "value": llm, "component": "dropdown", "choices": llm_choices, "special_type": "llm", "info": ( "The language model to use for planning. " "This model will generate a plan based on the " "instruction to find the answer." ), }, "solver_llm": { "name": "Language model for Solver", "value": llm, "component": "dropdown", "choices": llm_choices, "special_type": "llm", "info": ( "The language model to use for solving. " "This model will generate the answer based on the " "plan generated by the planner and evidences found by the tools." ), }, "highlight_citation": { "name": "Highlight Citation", "value": False, "component": "checkbox", }, "tools": { "name": "Tools for knowledge retrieval", "value": ["SearchDoc", "LLM"], "component": "checkboxgroup", "choices": tool_choices, }, "planner_prompt": { "name": "Planner Prompt", "value": DEFAULT_PLANNER_PROMPT, }, "solver_prompt": { "name": "Solver Prompt", "value": DEFAULT_SOLVER_PROMPT, }, } @classmethod def get_info(cls) -> dict: return { "id": "ReWOO", "name": "ReWOO Agent", "description": ( "Implementing ReWOO paradigm: https://arxiv.org/abs/2305.18323. " "The ReWOO agent makes a step by step plan in the first stage, " "then solves each step in the second stage. The agent can use " "external tools to help in the reasoning process. Once all stages " "are completed, the agent will summarize the answer." 
class AddQueryContextPipeline(BaseComponent):
    """Turn a follow-up question into a standalone search query.

    The LLM is given a fixed instruction, two worked examples, the most
    recent chat turns and the new question, and is asked for a search query.
    """

    # number of most recent (human, ai) turns sent to the LLM
    n_last_interactions: int = 5
    llm: ChatLLM = Node(default_callback=lambda _: llms.get_default())

    def run(self, question: str, history: list) -> Document:
        """Generate the search query for ``question``.

        Args:
            question: the new user question.
            history: sequence of ``(human, ai)`` message pairs.

        Returns:
            A ``Document`` whose content is empty when the LLM answers ``"0"``,
            the original question when it answers ``"1"``, and the raw LLM
            response otherwise.
        """
        messages = [
            SystemMessage(
                content="Below is a history of the conversation so far, and a new "
                "question asked by the user that needs to be answered by searching "
                "in a knowledge base.\nYou have access to a Search index "
                "with 100's of documents.\nGenerate a search query based on the "
                "conversation and the new question.\nDo not include cited source "
                "filenames and document names e.g info.txt or doc.pdf in the search "
                "query terms.\nDo not include any text inside [] or <<>> in the "
                "search query terms.\nDo not include any special characters like "
                "'+'.\nIf the question is not in English, rewrite the query in "
                "the language used in the question.\n If the question contains enough "
                "information, return just the number 1\n If it's unnecessary to do "
                "the searching, return just the number 0."
            ),
            HumanMessage(content="How did crypto do last year?"),
            AIMessage(
                content="Summarize Cryptocurrency Market Dynamics from last year"
            ),
            HumanMessage(content="What are my health plans?"),
            AIMessage(content="Show available health plans"),
        ]

        # ``history[-0:]`` is the whole list, so a limit of 0 (or less) must be
        # handled explicitly to mean "no previous turns".
        n_recent = self.n_last_interactions
        recent_history = history[-n_recent:] if n_recent > 0 else []
        for human, ai in recent_history:
            messages.append(HumanMessage(content=human))
            messages.append(AIMessage(content=ai))

        messages.append(HumanMessage(content=f"Generate search query for: {question}"))

        resp = self.llm(messages).text
        if resp == "0":
            return Document(content="")

        if resp == "1":
            return Document(content=question)

        return Document(content=resp)
history).content # else: # query = message # print(f"Rewritten query: {query}") query = None if not query: # TODO: previously return [], [] because we think this message as something # like "Hello", "I need help"... query = message docs, doc_ids = [], [] plot_docs = [] for idx, retriever in enumerate(self.retrievers): retriever_node = self._prepare_child(retriever, f"retriever_{idx}") retriever_docs = retriever_node(text=query) retriever_docs_text = [] retriever_docs_plot = [] for doc in retriever_docs: if doc.metadata.get("type", "") == "plot": retriever_docs_plot.append(doc) else: retriever_docs_text.append(doc) for doc in retriever_docs_text: if doc.doc_id not in doc_ids: docs.append(doc) doc_ids.append(doc.doc_id) plot_docs.extend(retriever_docs_plot) info = [ Document( channel="info", content=Render.collapsible_with_header(doc, open_collapsible=True), ) for doc in docs ] + [ Document( channel="plot", content=doc.metadata.get("data", ""), ) for doc in plot_docs ] return docs, info def prepare_mindmap(self, answer) -> Document | None: mindmap = answer.metadata["mindmap"] if mindmap: mindmap_text = mindmap.text mindmap_svg = dedent( """
def prepare_citation_viz(self, answer, question, docs) -> Document | None:
    """Build the citation visualization for the "plot" channel, if enabled.

    Args:
        answer: object whose ``metadata["citation_viz"]`` flag enables the plot.
        question: the question passed to the visualization pipeline.
        docs: retrieved documents; their ``text`` is plotted.

    Returns:
        A ``Document`` on the "plot" channel holding the JSON-serialized
        figure, or ``None`` when the flag is off, fewer than two documents
        are given, the pipeline fails, or it yields nothing.
    """
    texts = [item.text for item in docs]

    enabled = answer.metadata["citation_viz"] and len(docs) > 1
    if not enabled:
        return None

    try:
        figure = self.create_citation_viz_pipeline(texts, question)
    except Exception as e:
        # best effort: a failed plot must not break the answer
        print("Failed to create citation plot:", e)
        return None

    if not figure:
        return None

    return Document(channel="plot", content=to_json(figure))
No evidence found.
") else: # clear the Info panel max_llm_rerank_score = max( doc.metadata.get("llm_trulens_score", 0.0) for doc in docs ) has_llm_score = any("llm_trulens_score" in doc.metadata for doc in docs) # clear previous info yield Document(channel="info", content=None) # yield mindmap output if mindmap_output: yield mindmap_output # yield citation plot output if citation_plot_output: yield citation_plot_output # yield warning message if has_llm_score and max_llm_rerank_score < CONTEXT_RELEVANT_WARNING_SCORE: yield Document( channel="info", content=( "
WARNING! Context relevance score is low. " "Double check the model answer for correctness.
" ), ) # show QA score qa_score = ( round(answer.metadata["qa_score"], 2) if answer.metadata.get("qa_score") else None ) if qa_score: yield Document( channel="info", content=f"
Answer confidence: {qa_score}
def stream(  # type: ignore
    self, message: str, conv_id: str, history: list, **kwargs  # type: ignore
) -> Generator[Document, None, Document]:
    """Answer ``message`` and stream the intermediate outputs.

    Steps: optionally rewrite the question, retrieve documents, prepare the
    evidence, start relevance scoring in a background thread, stream the
    answer, then emit the citations and add-ons.

    Args:
        message: the user question.
        conv_id: conversation id, forwarded to the answering pipeline.
        history: chat history, forwarded to retrieval and answering.
        **kwargs: forwarded to ``self.answering_pipeline.stream``.

    Yields:
        The retrieval info documents, whatever the answering pipeline
        streams, and the documents of ``show_citations_and_addons``.

    Returns:
        The answer returned by the answering pipeline's generator.
    """
    # the rewrite only runs when it is both requested and configured
    if self.use_rewrite and self.rewrite_pipeline:
        print("Chosen rewrite pipeline", self.rewrite_pipeline)
        message = self.rewrite_pipeline(question=message).text
        print("Rewrite result", message)

    print(f"Retrievers {self.retrievers}")
    # should populate the context
    docs, infos = self.retrieve(message, history)
    print(f"Got {len(docs)} retrieved documents")
    yield from infos

    evidence_mode, evidence, images = self.evidence_pipeline(docs).content

    def generate_relevant_scores():
        # runs in the scoring thread; rebinds ``docs`` of the enclosing scope
        # to the output of the first retriever's scoring
        nonlocal docs
        docs = self.retrievers[0].generate_relevant_scores(message, docs)

    # score in the background while the answer streams; skipped when there
    # is no evidence or no retriever
    if evidence and self.retrievers:
        scoring_thread = threading.Thread(target=generate_relevant_scores)
        scoring_thread.start()
    else:
        scoring_thread = None

    # delegate to the answering pipeline; its return value is the answer
    answer = yield from self.answering_pipeline.stream(
        question=message,
        history=history,
        evidence=evidence,
        evidence_mode=evidence_mode,
        images=images,
        conv_id=conv_id,
        **kwargs,
    )

    # post-process the answer text (think tag from reasoning models)
    processed_answer = replace_think_tag_with_details(answer.text)
    if processed_answer != answer.text:
        # clear the chat message and render again
        yield Document(channel="chat", content=None)
        yield Document(channel="chat", content=processed_answer)

    # show the evidence; wait for scoring first so the rebound ``docs`` is used
    if scoring_thread:
        scoring_thread.join()

    yield from self.show_citations_and_addons(answer, docs, message)

    return answer
@classmethod
def get_user_settings(cls) -> dict:
    """Describe the user-configurable settings of the simple QA pipeline.

    Returns:
        A mapping from setting name to its UI description and default value.
    """
    from ktem.llms.manager import llms

    default_llm = ""

    # the empty value stands for the application default model
    llm_options = [("(default)", "")]
    try:
        registered = [(name, name) for name in llms.options().keys()]
        llm_options += registered
    except Exception as exc:
        logger.exception(f"Failed to get LLM options: {exc}")

    # citation highlighting is off by default in low-request mode
    if config("USE_LOW_LLM_REQUESTS", default=False, cast=bool):
        default_citation_style = "off"
    else:
        default_citation_style = "highlight"

    def checkbox(label: str) -> dict:
        # every checkbox setting here defaults to unchecked
        return {"name": label, "value": False, "component": "checkbox"}

    return {
        "llm": {
            "name": "Language model",
            "value": default_llm,
            "component": "dropdown",
            "choices": llm_options,
            "special_type": "llm",
            "info": (
                "The language model to use for generating the answer. If None, "
                "the application default language model will be used."
            ),
        },
        "highlight_citation": {
            "name": "Citation style",
            "value": default_citation_style,
            "component": "radio",
            "choices": [
                ("citation: highlight", "highlight"),
                ("citation: inline", "inline"),
                ("no citation", "off"),
            ],
        },
        "create_mindmap": checkbox("Create Mindmap"),
        "create_citation_viz": checkbox("Create Embeddings Visualization"),
        "use_multimodal": checkbox("Use Multimodal Input"),
        "system_prompt": {
            "name": "System Prompt",
            "value": ("This is a question answering system."),
        },
        "qa_prompt": {
            "name": "QA Prompt (contains {context}, {question}, {lang})",
            "value": DEFAULT_QA_TEXT_PROMPT,
        },
        "n_last_interactions": {
            "name": "Number of interactions to include",
            "value": 5,
            "component": "number",
            "info": "The maximum number of chat interactions to include in the LLM",
        },
        "trigger_context": {
            "name": "Maximum message length for context rewriting",
            "value": 150,
            "component": "number",
            "info": (
                "The maximum length of the message to trigger context addition. "
                "Exceeding this length, the message will be used as is."
            ),
        },
    }
), } class FullDecomposeQAPipeline(FullQAPipeline): def answer_sub_questions( self, messages: list, conv_id: str, history: list, **kwargs ): output_str = "" for idx, message in enumerate(messages): yield Document( channel="chat", content=f"
Sub-question {idx + 1}" f"
{message}
Answer
", ) # should populate the context docs, infos = self.retrieve(message, history) print(f"Got {len(docs)} retrieved documents") yield from infos evidence_mode, evidence, images = self.evidence_pipeline(docs).content answer = yield from self.answering_pipeline.stream( question=message, history=history, evidence=evidence, evidence_mode=evidence_mode, images=images, conv_id=conv_id, **kwargs, ) output_str += ( f"Sub-question {idx + 1}-th: '{message}'\nAnswer: '{answer.text}'\n\n" ) return output_str def stream( # type: ignore self, message: str, conv_id: str, history: list, **kwargs # type: ignore ) -> Generator[Document, None, Document]: sub_question_answer_output = "" if self.rewrite_pipeline: print("Chosen rewrite pipeline", self.rewrite_pipeline) result = self.rewrite_pipeline(question=message) print("Rewrite result", result) if isinstance(result, Document): message = result.text elif ( isinstance(result, list) and len(result) > 0 and isinstance(result[0], Document) ): yield Document( channel="chat", content="

Sub questions and their answers

", ) sub_question_answer_output = yield from self.answer_sub_questions( [r.text for r in result], conv_id, history, **kwargs ) yield Document( channel="chat", content=f"

Main question

{message}
Answer
", ) # should populate the context docs, infos = self.retrieve(message, history) print(f"Got {len(docs)} retrieved documents") yield from infos evidence_mode, evidence, images = self.evidence_pipeline(docs).content answer = yield from self.answering_pipeline.stream( question=message, history=history, evidence=evidence + "\n" + sub_question_answer_output, evidence_mode=evidence_mode, images=images, conv_id=conv_id, **kwargs, ) # show the evidence with_citation, without_citation = self.answering_pipeline.prepare_citations( answer, docs ) if not with_citation and not without_citation: yield Document(channel="info", content="
No evidence found.
@classmethod
def get_user_settings(cls) -> dict:
    """Return the parent settings plus the ``decompose_prompt`` entry.

    Returns:
        The mapping returned by the parent class, with an extra
        ``decompose_prompt`` setting whose default is
        ``DecomposeQuestionPipeline.DECOMPOSE_SYSTEM_PROMPT_TEMPLATE``.
    """
    merged = super().get_user_settings()
    merged.update(
        decompose_prompt={
            "name": "Decompose Prompt",
            "value": DecomposeQuestionPipeline.DECOMPOSE_SYSTEM_PROMPT_TEMPLATE,
        }
    )
    return merged
class BaseRerankingTable(Base):
    """Base table to store rerankings model"""

    __abstract__ = True

    # model name; also the primary key
    name = Column(String, primary_key=True, unique=True)
    # model specification stored as JSON; a callable default is used so that
    # rows do not share one mutable dict object as their default value
    spec = Column(JSON, default=dict)
    # whether this model is the default one
    default = Column(Boolean, default=False)
def get_random_name(self) -> str:
    """Get the name of a random model

    Returns:
        str: random model name in the pool

    Raises:
        ValueError: if the pool contains no model
    """
    import random

    if not self._models:
        # keep the wording aligned with get_default_name
        raise ValueError("No models in pool")

    return random.choice(list(self._models))
def add(self, name: str, spec: dict, default: bool):
    """Add a model to the pool and persist it in the database

    Args:
        name: unique, non-empty name of the model
        spec: specification of the model
        default: whether the new model becomes the default one

    Raises:
        ValueError: if the name is empty or the model cannot be stored
    """
    if not name:
        raise ValueError("Name must not be empty")

    try:
        with Session(engine) as sess:
            if default:
                # turn all models to non-default; deliberately not committed
                # on its own, so that a failing insert below does not leave
                # the pool without any default model
                sess.query(RerankingTable).update({"default": False})

            item = RerankingTable(name=name, spec=spec, default=default)
            sess.add(item)
            # single commit: the flag reset and the insert succeed or fail
            # together
            sess.commit()
    except Exception as e:
        raise ValueError(f"Failed to add model {name}: {e}") from e

    # refresh the in-memory pool from the database
    self.load()
def format_description(cls):
    """Render the docstring and the parameters of a class as markdown

    Args:
        cls: class exposing `describe()`, which returns a dict whose "params"
            entry maps each parameter name to a dict with the keys
            "auto_callback", "type" and "help"

    Returns:
        str: the class docstring (empty when the class has none), a blank
            line, then a markdown table with one row per listed parameter
    """
    params = cls.describe()["params"]
    params_lines = ["| Name | Type | Description |", "| --- | --- | --- |"]
    for key, value in params.items():
        # parameters whose `auto_callback` is a string are not listed
        if isinstance(value["auto_callback"], str):
            continue
        params_lines.append(f"| {key} | {value['type']} | {value['help']} |")

    # `__doc__` is None for a class without docstring: don't render "None"
    doc = cls.__doc__ or ""
    return f"{doc}\n\n" + "\n".join(params_lines)
) self.on_building_ui() def on_building_ui(self): with gr.Tab(label="View"): self.rerank_list = gr.DataFrame( headers=["name", "vendor", "default"], interactive=False, column_widths=[30, 40, 30], ) with gr.Column(visible=False) as self._selected_panel: self.selected_rerank_name = gr.Textbox(value="", visible=False) with gr.Row(): with gr.Column(): self.edit_default = gr.Checkbox( label="Set default", info=( "Set this Reranking model as default. This default " "Reranking will be used by other components by default " "if no Reranking is specified for such components." ), ) self.edit_name = gr.Textbox( label="Name", info="Edit to rename this Reranking model.", ) self.edit_spec = gr.Textbox( label="Specification", info="Specification of the Embedding model in YAML format", lines=10, ) with gr.Accordion( label="Test connection", visible=False, open=False ) as self._check_connection_panel: with gr.Row(): with gr.Column(scale=4): self.connection_logs = gr.HTML( "Logs", ) with gr.Column(scale=1): self.btn_test_connection = gr.Button("Test") with gr.Row(visible=False) as self._selected_panel_btn: with gr.Column(): self.btn_edit_save = gr.Button( "Save", min_width=10, variant="primary" ) with gr.Column(): self.btn_delete = gr.Button( "Delete", min_width=10, variant="stop" ) with gr.Row(): self.btn_delete_yes = gr.Button( "Confirm Delete", variant="stop", visible=False, min_width=10, ) self.btn_delete_no = gr.Button( "Cancel", visible=False, min_width=10 ) with gr.Column(): self.btn_close = gr.Button("Close", min_width=10) with gr.Column(): self.edit_spec_desc = gr.Markdown("# Spec description") with gr.Tab(label="Add"): with gr.Row(): with gr.Column(scale=2): self.name = gr.Textbox( label="Name", info=( "Must be unique and non-empty. " "The name will be used to identify the reranking model." ), ) self.rerank_choices = gr.Dropdown( label="Vendors", info=( "Choose the vendor of the Reranking model. Each vendor " "has different specification." 
def on_rerank_vendor_change(self, vendor):
    """Build the spec template and the description of the chosen vendor

    Args:
        vendor: key of the vendor in `reranking_models_manager.vendors()`

    Returns:
        tuple: YAML dump of the required parameters (each mapped to its
            default, or None when it has none) and the formatted description
            of the vendor
    """
    vendor_cls = reranking_models_manager.vendors()[vendor]

    # only the parameters flagged as required end up in the template
    spec_template: dict = {
        param: meta.get("default", None)
        for param, meta in vendor_cls.describe()["params"].items()
        if meta.get("required", False)
    }

    return yaml.dump(spec_template), format_description(vendor_cls)
def create_rerank(self, name, choices, spec, default):
    """Create a Reranking model from the values of the "Add" form

    Args:
        name: name of the new model; surrounding whitespace is stripped
        choices: key of the selected vendor in
            `reranking_models_manager.vendors()`
        spec: specification of the model in YAML format
        default: whether the new model becomes the default one

    Raises:
        gr.Error: if the input is invalid or the model cannot be created
    """
    try:
        name = name.strip()

        vendors = reranking_models_manager.vendors()
        if choices not in vendors:
            raise ValueError("Please select a vendor for the Reranking model")
        vendor = vendors[choices]

        # an empty specification is loaded as None by yaml
        spec = yaml.load(spec, Loader=YAMLNoDateSafeLoader) or {}
        if not isinstance(spec, dict):
            raise ValueError("Specification must be a YAML mapping")

        spec["__type__"] = f"{vendor.__module__}.{vendor.__qualname__}"

        reranking_models_manager.add(name, spec=spec, default=default)
        gr.Info(f'Reranking model "{name}" created successfully')
    except ValueError as e:
        # validation errors already carry a user-facing message
        raise gr.Error(str(e))
    except Exception as e:
        raise gr.Error(f"Failed to create Reranking model '{name}': {e}")
def on_btn_delete_click(self):
    """Hide the delete button and show the confirm / cancel buttons

    Returns:
        tuple: updates for the delete, confirm-delete and cancel buttons,
            in this order
    """
    delete_visible, confirm_visible, cancel_visible = False, True, True
    return (
        gr.update(visible=delete_visible),
        gr.update(visible=confirm_visible),
        gr.update(visible=cancel_visible),
    )
" yield log_content # Parse content & init model info = deepcopy(reranking_models_manager.info()[selected_rerank_name]) # Parse content & create dummy response spec = yaml.load(selected_spec, Loader=YAMLNoDateSafeLoader) info["spec"].update(spec) rerank = deserialize(info["spec"], safe=False) if rerank is None: raise Exception(f"Can not found model: {selected_rerank_name}") log_content += "- Sending a message ([`Hello`], `Hi`)
" yield log_content _ = rerank([Document(content="Hello")], "Hi") log_content += ( "- Connection success. " "
def delete_rerank(self, selected_rerank_name):
    """Delete the selected Reranking model

    Args:
        selected_rerank_name: name of the model to delete

    Returns:
        str: empty string once the model is deleted

    Raises:
        gr.Error: if the model cannot be deleted
    """
    try:
        reranking_models_manager.delete(selected_rerank_name)
    except Exception as e:
        # gr.Error is an exception: merely instantiating it reports nothing,
        # it has to be raised (as create_rerank and save_rerank do)
        raise gr.Error(
            f'Failed to delete Reranking model "{selected_rerank_name}": {e}'
        ) from e

    return ""
def flatten(self) -> dict:
    """Render the setting group into value

    This is the `flatten` of `SettingIndexOption`, which holds an `indexing`
    and a `retrieval` setting group.

    Returns:
        dict: flattened settings of both groups, with the keys prefixed by
            `indexing.` and `retrieval.` respectively
    """
    output = {}
    # flatten() returns a dict: iterate over its items, not over its keys
    for key, value in self.indexing.flatten().items():
        output[f"indexing.{key}"] = value
    for key, value in self.retrieval.flatten().items():
        output[f"retrieval.{key}"] = value

    return output
def get_setting_item(self, path: str) -> "SettingItem":
    """Get the item based on dot notation

    This is the `get_setting_item` of `SettingGroup`: the first component of
    the path selects the `application`, `index` or `reasoning` group, the
    rest of the path is resolved by that group.

    Args:
        path: dotted path of the item; leading/trailing dots are ignored

    Returns:
        SettingItem: the item found by the selected group

    Raises:
        ValueError: if the path doesn't start with a known group, or if it
            names a group only
    """
    path = path.strip(".")
    key, _, sub_path = path.partition(".")
    if key not in ["application", "index", "reasoning"]:
        raise ValueError(
            f"Invalid key {path}. Should start with `application.*`, "
            "`index.*` or `reasoning.*`"
        )
    if not sub_path:
        # a bare group name doesn't point to any item
        raise ValueError(f"Invalid key {path}. Missing the item path after `{key}.`")

    value = getattr(self, key)
    return value.get_setting_item(sub_path)
class Generator:
    """A generator that stores return value from another generator

    Iterating over the instance yields every item of the wrapped generator;
    once it is exhausted, its return value is available as `value`.
    """

    def __init__(self, gen):
        self.gen = gen
        # defined up front, so that reading it before the iteration is over
        # gives None instead of raising AttributeError
        self.value = None

    def __iter__(self):
        # `yield from` evaluates to the return value of the wrapped generator
        self.value = yield from self.gen
        return self.value
def filter_recommendations(recommendations, max_paper_count=5):
    """Keep the recommendations that have an ArXiv id

    Args:
        recommendations: list of paper dicts; the ArXiv id is looked up
            under `externalIds` -> `ArXiv`
        max_paper_count: maximum number of papers to return

    Returns:
        list: at most `max_paper_count` papers with an ArXiv id, in their
            original order
    """
    # include only arxiv papers; entries whose `externalIds` is missing or
    # None are dropped rather than raising
    arxiv_paper = [
        r
        for r in recommendations
        if (r.get("externalIds") or {}).get("ArXiv") is not None
    ]

    return arxiv_paper[:max_paper_count]
"upvotes": item.get("paper", {}).get("upvotes"), } for item in recent_items[:top_n] ] except Exception as e: print(e) return [] return output_items ================================================ FILE: libs/ktem/ktem/utils/lang.py ================================================ SUPPORTED_LANGUAGE_MAP = { "en": "English", "ja": "Japanese", "vi": "Vietnamese", "es": "Spanish", "fr": "French", "de": "German", "zh": "Chinese", "ru": "Russian", "ar": "Arabic", "pt": "Portuguese", "hi": "Hindi", "bn": "Bengali", "pa": "Punjabi", "ko": "Korean", "it": "Italian", "nl": "Dutch", "tr": "Turkish", "pl": "Polish", "uk": "Ukrainian", "ro": "Romanian", "el": "Greek", "hu": "Hungarian", "sv": "Swedish", "cs": "Czech", "fi": "Finnish", "da": "Danish", "no": "Norwegian", "he": "Hebrew", "th": "Thai", "id": "Indonesian", "ms": "Malay", } ================================================ FILE: libs/ktem/ktem/utils/plantuml.py ================================================ #!/usr/bin/env python from __future__ import print_function import base64 import string from zlib import compress import httplib2 import six # type: ignore if six.PY2: from string import maketrans else: maketrans = bytes.maketrans plantuml_alphabet = ( string.digits + string.ascii_uppercase + string.ascii_lowercase + "-_" ) base64_alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits + "+/" b64_to_plantuml = maketrans( base64_alphabet.encode("utf-8"), plantuml_alphabet.encode("utf-8") ) class PlantUMLError(Exception): """ Error in processing. """ class PlantUMLConnectionError(PlantUMLError): """ Error connecting or talking to PlantUML Server. """ class PlantUMLHTTPError(PlantUMLConnectionError): """ Request to PlantUML server returned HTTP Error. 
def __init__(self, url="http://www.plantuml.com/plantuml/svg/", request_opts=None):
    """Set up the connection to a PlantUML server (`PlantUML.__init__`)

    :param str url: URL to the PlantUML server image CGI. defaults to
                    http://www.plantuml.com/plantuml/svg/
    :param dict request_opts: Extra options to be passed off to the
                    httplib2.Http().request() call. None (the default)
                    means no extra option.
    """
    self.HttpLib2Error = httplib2.HttpLib2Error
    self.http = httplib2.Http()

    self.url = url
    # a fresh dict per instance, rather than one default dict shared by
    # every instance created without `request_opts`
    self.request_opts = {} if request_opts is None else request_opts
:param str plantuml_text: The plantuml markup to render :returns: the raw image data """ url = self.get_url(plantuml_text) try: response, content = self.http.request(url, **self.request_opts) except self.HttpLib2Error as e: raise PlantUMLConnectionError(e) if response.status != 200: raise PlantUMLHTTPError(response, content) svg_content = content.decode("utf-8") svg_content = svg_content.replace("" f"{svg_content}" ) return svg_content ================================================ FILE: libs/ktem/ktem/utils/rate_limit.py ================================================ from collections import defaultdict from datetime import datetime, timedelta import gradio as gr from decouple import config # In-memory store for rate limiting (for demonstration purposes) rate_limit_store: dict[str, dict] = defaultdict(dict) # Rate limit configuration RATE_LIMIT = config("RATE_LIMIT", default=20, cast=int) RATE_LIMIT_PERIOD = timedelta(hours=24) def check_rate_limit(limit_type: str, request: gr.Request): if request is None: raise ValueError("This feature is not available") user_id = None try: import gradiologin as grlogin user = grlogin.get_user(request) if user: user_id = user.get("email") except (ImportError, AssertionError): pass if not user_id: raise ValueError("Please sign-in to use this feature") now = datetime.now() user_data = rate_limit_store[limit_type].get( user_id, {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD} ) if now >= user_data["reset_time"]: # Reset the rate limit for the user user_data = {"count": 0, "reset_time": now + RATE_LIMIT_PERIOD} if user_data["count"] >= RATE_LIMIT: raise ValueError("Rate limit exceeded. 
def is_close(val1, val2, tolerance=1e-9):
    """Tell whether two values differ by no more than `tolerance`"""
    distance = abs(val1 - val2)
    return tolerance >= distance
" f"{header}{content}" "

" ) @staticmethod def table(text: str) -> str: """Render table from markdown format into HTML""" text = replace_mardown_header(text) return markdown.markdown( text, extensions=[ "markdown.extensions.tables", "markdown.extensions.fenced_code", ], ) @staticmethod def table_preserve_linebreaks(text: str) -> str: """Render table from markdown format into HTML""" return markdown.markdown( text, extensions=[ "markdown.extensions.tables", "markdown.extensions.fenced_code", ], ).replace("\n", "
") @staticmethod def preview( html_content: str, doc: RetrievedDocument, highlight_text: str | None = None, ) -> str: text = doc.content pdf_path = doc.metadata.get("file_path", "") if not os.path.isfile(pdf_path): print(f"pdf-path: {pdf_path} does not exist") return html_content is_pdf = doc.metadata.get("file_type", "") == "application/pdf" page_idx = int(doc.metadata.get("page_label", 1)) if not is_pdf: print("Document is not pdf") return html_content if page_idx < 0: print("Fail to extract page number") return html_content if not highlight_text: phrase = "false" try: lang = detect(text.replace("\n", " "))["lang"] if lang not in ["ja", "cn"]: highlight_words = [ t[:-1] if t.endswith("-") else t for t in text.split("\n") ] highlight_text = highlight_words[0] phrase = "true" highlight_text = ( text.replace("\n", "").replace('"', "").replace("'", "") ) except Exception as e: print(e) highlight_text = text else: phrase = "true" return f""" {html_content} [Preview] """ # noqa @staticmethod def highlight(text: str, elem_id: str | None = None) -> str: """Highlight text""" id_text = f" id='mark-{elem_id}'" if elem_id else "" return f"{text}
" @staticmethod def image(url: str, text: str = "") -> str: """Render an image""" img = f'
' if text: caption = f"

{text}

" return f"
{img}{caption}

" return img @staticmethod def collapsible_with_header( doc: RetrievedDocument, open_collapsible: bool = False, ) -> str: header = f"{get_header(doc)}" if doc.metadata.get("type", "") == "image": doc_content = Render.image(url=doc.metadata["image_origin"], text=doc.text) elif doc.metadata.get("type", "") == "table_raw": doc_content = Render.table_preserve_linebreaks(doc.text) else: doc_content = Render.table(doc.text) return Render.collapsible( header=Render.preview(header, doc), content=doc_content, open=open_collapsible, ) @staticmethod def collapsible_with_header_score( doc: RetrievedDocument, override_text: str | None = None, highlight_text: str | None = None, open_collapsible: bool = False, ) -> str: """Format the retrieval score and the document""" # score from doc_store (Elasticsearch) if is_close(doc.score, -1.0): vectorstore_score = "" text_search_str = " (full-text search)
" else: vectorstore_score = str(round(doc.score, 2)) text_search_str = "
" llm_reranking_score = ( round(doc.metadata["llm_trulens_score"], 2) if doc.metadata.get("llm_trulens_score") is not None else 0.0 ) reranking_score = ( round(doc.metadata["reranking_score"], 2) if doc.metadata.get("reranking_score") is not None else 0.0 ) item_type_prefix = doc.metadata.get("type", "") item_type_prefix = item_type_prefix.capitalize() if item_type_prefix: item_type_prefix += " from " if "raw" in item_type_prefix: item_type_prefix = "" if llm_reranking_score > 0: relevant_score = llm_reranking_score elif reranking_score > 0: relevant_score = reranking_score else: relevant_score = 0.0 rendered_score = Render.collapsible( header=f" Relevance score: {relevant_score:.1f}", content="  Vectorstore score:" f" {vectorstore_score}" f"{text_search_str}" "  LLM relevant score:" f" {llm_reranking_score}
" "  Reranking score:" f" {reranking_score}
", ) text = doc.text if not override_text else override_text if doc.metadata.get("type", "") == "image": rendered_doc_content = Render.image( url=doc.metadata["image_origin"], text=text, ) elif doc.metadata.get("type", "") == "table_raw": rendered_doc_content = Render.table_preserve_linebreaks(doc.text) else: rendered_doc_content = Render.table(text) rendered_header = Render.preview( f"{item_type_prefix}{get_header(doc)}" f" [score: {llm_reranking_score}]", doc, highlight_text=highlight_text, ) rendered_doc_content = ( f"
{rendered_doc_content}
class CreateCitationVizPipeline(BaseComponent):
    """Creating PlotData for visualizing query results

    ``run`` embeds the context chunks and the question, fits a UMAP model on
    the chunk embeddings, projects both onto two coordinates and returns a
    Plotly scatter figure with one trace per category.
    """

    # Callable used in ``run`` to embed a list of texts or a single question;
    # each returned item exposes an ``.embedding`` attribute.
    embedding: BaseEmbeddings
    # Fitted UMAP model; ``None`` until ``run`` assigns it.
    projector: umap.UMAP = None

    def _set_up_umap(self, embeddings: np.ndarray):
        """Fit a default-configured ``umap.UMAP`` on ``embeddings`` and return it."""
        umap_transform = umap.UMAP().fit(embeddings)
        return umap_transform

    def _project_embeddings(self, embeddings, umap_transform) -> np.ndarray:
        """Project each embedding with ``umap_transform``, one at a time.

        Returns an array of shape ``(len(embeddings), 2)``.
        """
        umap_embeddings = np.empty((len(embeddings), 2))
        for i, embedding in enumerate(embeddings):
            # transform() receives a one-item batch; its result fills row i.
            umap_embeddings[i] = umap_transform.transform([embedding])
        return umap_embeddings

    def _get_projections(self, embeddings, umap_transform):
        """Return the projected embeddings as separate x and y columns."""
        projections = self._project_embeddings(embeddings, umap_transform)
        x = projections[:, 0]
        y = projections[:, 1]
        return x, y

    def _prepare_projection_df(
        self,
        document_projections: Tuple[np.ndarray, np.ndarray],
        document_text: List[str],
        plot_size: int = 3,
    ) -> pd.DataFrame:
        """Prepares a DataFrame for visualization from projections and texts.

        Args:
            document_projections (Tuple[np.ndarray, np.ndarray]):
                Tuple of X and Y coordinates of document projections.
            document_text (List[str]): List of document texts.
            plot_size (int): Value stored in the ``size`` column of every row.

        Returns:
            pd.DataFrame: columns ``x``, ``y``, ``document``,
            ``document_cleaned``, ``size`` and ``category`` (always
            ``"Retrieved"``).
        """
        df = pd.DataFrame({"x": document_projections[0], "y": document_projections[1]})
        df["document"] = document_text
        # Hover text: wrap at 50 characters, swap the wrap newlines for the
        # replacement literal, keep the first 512 characters. The "..." suffix
        # is appended to every row, including texts shorter than 512 characters.
        # NOTE(review): the replacement literal is unreadable in the copy under
        # review (it shows up as a bare line break). "<br>" is assumed because
        # this column is used as Plotly hover text - confirm against the
        # repository.
        df["document_cleaned"] = df.document.str.wrap(50).apply(
            lambda x: x.replace("\n", "<br>")[:512] + "..."
        )
        df["size"] = plot_size
        df["category"] = "Retrieved"
        return df

    def _plot_embeddings(self, df: pd.DataFrame) -> go.Figure:
        """
        Creates a Plotly figure to visualize the embeddings.

        Args:
            df (pd.DataFrame): DataFrame containing the data to visualize.

        Returns:
            go.Figure: A Plotly figure object for visualization.
        """
        fig = go.Figure()

        # One scatter trace per distinct category.
        for category in df["category"].unique():
            category_df = df[df["category"] == category]
            # Categories missing from VISUALIZATION_SETTINGS get a grey circle.
            settings = VISUALIZATION_SETTINGS.get(
                category,
                {"color": "grey", "opacity": 1, "symbol": "circle", "size": 10},
            )
            fig.add_trace(
                go.Scatter(
                    x=category_df["x"],
                    y=category_df["y"],
                    mode="markers",
                    name=category,
                    marker=dict(
                        color=settings["color"],
                        opacity=settings["opacity"],
                        symbol=settings["symbol"],
                        # Marker size comes from the settings; the DataFrame's
                        # "size" column is not read here.
                        size=settings["size"],
                        line_width=0,
                    ),
                    hoverinfo="text",
                    text=category_df["document_cleaned"],
                )
            )

        fig.update_layout(
            height=500,
            legend=dict(y=100, x=0.5, xanchor="center", yanchor="top", orientation="h"),
        )
        return fig

    def run(self, context: List[str], question: str):
        """Build the scatter figure for ``context`` chunks and ``question``.

        Args:
            context (List[str]): Texts of the chunks to plot.
            question (str): The query text, plotted as "Original Query".

        Returns:
            The figure produced by ``_plot_embeddings``.
        """
        embed_contexts = self.embedding(context)
        context_embeddings = np.array([d.embedding for d in embed_contexts])

        # The UMAP model is fitted on the context embeddings only and kept on
        # the instance.
        self.projector = self._set_up_umap(embeddings=context_embeddings)

        embed_query = self.embedding(question)
        # Only the first embedding returned for the question is projected.
        query_projection = self._get_projections(
            embeddings=[embed_query[0].embedding], umap_transform=self.projector
        )
        viz_query_df = pd.DataFrame(
            {
                "x": [query_projection[0][0]],
                "y": [query_projection[1][0]],
                "document_cleaned": question,
                "category": "Original Query",
                "size": 5,
            }
        )

        context_projections = self._get_projections(
            embeddings=context_embeddings, umap_transform=self.projector
        )
        viz_base_df = self._prepare_projection_df(
            document_projections=context_projections, document_text=context
        )

        # Context rows first, then the single query row.
        visualization_df = pd.concat([viz_base_df, viz_query_df], axis=0)
        fig = self._plot_embeddings(visualization_df)
        return fig
================================================ FILE: libs/ktem/ktem_tests/resources/embedding_openai.json ================================================ { "object": "list", "data": [ { "object": "embedding", "index": 0, "embedding": [ 0.006555966101586819, 0.003670461941510439, -0.011642491444945335, -0.026776473969221115, -0.012383491732180119, -0.0014341175556182861, -0.013375678099691868, 0.009356695227324963, -0.006364436354488134, -0.0294390507042408, 0.023950627073645592, 0.0029859787318855524, -0.023234745487570763, -0.009205983020365238, 0.006744355894625187, 0.0011790062999352813, 0.02607315219938755, -0.018437083810567856, 0.008904559537768364, 0.009620440192520618, -0.01306169480085373, -0.0011358336778357625, 0.007253008428961039, 0.00875384733080864, -0.012710033915936947, 0.0037206991109997034, 0.005419347435235977, -0.017243949696421623, 0.036246202886104584, -0.0266759991645813, 0.012647237628698349, -0.008552898652851582, -0.00762350857257843, -0.012546762824058533, 0.007083457428961992, -0.014078999869525433, 0.0048761568032205105, -0.013689660467207432, 0.018211016431450844, -0.014367864467203617, 0.008307991549372673, 0.006022194866091013, 0.005457025486975908, -0.006320478860288858, -0.03609549254179001, 0.012986338697373867, 0.00017592862423043698, -0.016427593305706978, -0.004734864458441734, 0.0257717277854681, 0.01944182999432087, -0.00211938563734293, -0.0147195253521204, 0.01332544069737196, -0.017670966684818268, 0.0013956546317785978, -0.04024006798863411, 0.024754423648118973, 0.032051388174295425, -0.018713390454649925, 0.011265711858868599, 0.012370931915938854, -0.013576626777648926, 0.0040974789299070835, -0.002153923735022545, 0.009915584698319435, 0.006637601647526026, 0.011353626847267151, -0.019919084385037422, 0.030142372474074364, 0.01966789737343788, -0.008213796652853489, -0.004213652573525906, -0.009162025526165962, 0.012446288019418716, 0.008979915641248226, -0.02098662778735161, 0.003113141981884837, 
0.0018838982796296477, -0.0008289152756333351, 0.02722861059010029, -0.03355850651860237, -0.004408321809023619, 0.017859356477856636, 0.019893966615200043, -0.0014137086691334844, -0.0021602034103125334, 0.017959831282496452, -0.0114603815600276, -0.020069796591997147, 0.007604669313877821, 0.001281835837289691, 0.01132222916930914, 0.006668999791145325, -0.01743233948945999, 0.0048761568032205105, 0.0016923686489462852, 0.02356128767132759, -0.005796127021312714, -0.041470881551504135, -0.01292354241013527, -0.005824385676532984, -0.010700542479753494, -0.017005322501063347, -0.007950050756335258, 0.011560855433344841, -0.005485283676534891, -0.0002590360236354172, 0.028434304520487785, 0.007447678130120039, -0.026198744773864746, 0.009306457825005054, -0.00500488979741931, -0.03107176162302494, -0.0007555217016488314, -0.008056805469095707, 0.007479076273739338, 0.003978165332227945, -0.005984516814351082, -0.02778122015297413, 0.015272135846316814, 0.029338575899600983, 0.010964288376271725, -0.023548727855086327, 0.007014381233602762, 0.0014137086691334844, -0.03380969539284706, -0.016741575673222542, -0.005230957642197609, -0.004794521257281303, 0.0315992534160614, 0.0041100382804870605, 0.010644025169312954, -0.0014294078573584557, -0.035367049276828766, 0.0246916264295578, -0.007096016779541969, 0.02096150815486908, -0.017030440270900726, -0.011906237341463566, 0.006273381412029266, 0.021237812936306, 0.01757049188017845, -0.013224965892732143, -0.005092805251479149, 0.01288586389273405, 0.004486817866563797, -0.016339678317308426, 0.008917118422687054, -0.015083746053278446, 0.008377067744731903, 0.0185626782476902, 0.012961219996213913, -0.002535413019359112, 0.0050143091939389706, 0.016025694087147713, 0.004618690814822912, 0.0205470509827137, 0.008000288158655167, -0.013074253685772419, -0.0007264782907441258, -0.0004474258457776159, 0.0033125211484730244, -0.03084569424390793, 0.009400652721524239, 0.015787066891789436, 0.02607315219938755, 
0.003689300734549761, -0.013940847478806973, -0.02604803442955017, -0.011454101651906967, 0.007340923883020878, -0.043178949505090714, 0.02660064399242401, -0.01593777909874916, 0.00514304218813777, -0.001305384561419487, 0.022820288315415382, -0.012697474099695683, -0.017269067466259003, -0.03024284727871418, 0.011617372743785381, 0.008992474526166916, 0.025897322222590446, 0.00629850011318922, -0.00014511904737446457, 0.009890465997159481, -0.006364436354488134, 0.00384629238396883, -0.017030440270900726, 0.016666220501065254, 0.03333244100213051, 0.021011745557188988, 0.01844964362680912, -0.6892555952072144, -0.007196491584181786, 0.025319593027234077, 0.02024562656879425, 0.006245122756808996, 0.009909304790198803, 0.012176262214779854, 0.027806337922811508, 0.006518288049846888, 0.0274546779692173, -0.02107454277575016, 0.013011457398533821, -0.015083746053278446, -0.010719381272792816, -0.0002598209830466658, -0.00823263544589281, 0.0055763390846550465, -0.012339534237980843, -0.011881118640303612, 0.015196779742836952, -0.010141652077436447, 0.03129782900214195, -0.022104406729340553, -0.014568813145160675, 0.009928143583238125, 0.011730406433343887, 0.0025102945510298014, -0.009739753790199757, -0.006964143831282854, 0.030192609876394272, -0.0021272350568324327, 0.0114603815600276, 9.394961671205238e-05, 0.014154355973005295, 0.06611227244138718, 0.007610949221998453, -0.005463304929435253, 0.008088203147053719, 0.009827669709920883, 0.028986915946006775, -0.019077610224485397, -0.022267676889896393, 0.012559321708977222, -0.01662854291498661, -0.002763050841167569, 0.009205983020365238, 0.012659796513617039, -0.0024051100481301546, 0.0022543983068317175, 0.0017849936848506331, 0.014568813145160675, -0.004851038102060556, 0.010857533663511276, 0.019504627212882042, -0.005343991331756115, 0.0071588135324418545, 0.02151411771774292, 0.011962753720581532, -0.002935741562396288, 0.004521355964243412, 0.0005718416068702936, 0.017168592661619186, 
-0.0016280021518468857, 0.0038494321051985025, -0.022443508729338646, 0.02381247468292713, -0.021313169971108437, 0.01400364376604557, -0.008929678238928318, -0.012050669640302658, -0.0018368008313700557, -0.007994008250534534, -0.018047746270895004, -0.014744644053280354, 0.013965966179966927, 0.028107762336730957, 0.01681693270802498, -0.0033219405449926853, -0.006160347256809473, 0.01966789737343788, 0.010926609858870506, 0.013237525708973408, -0.014066440984606743, -0.01764584705233574, 0.02349849045276642, -0.019780931994318962, -0.030067017301917076, -0.016276881098747253, 0.010430516675114632, 0.011479220353066921, 0.030192609876394272, 0.0033250804990530014, -0.012370931915938854, -0.01980605162680149, 0.02883620373904705, 0.0005384809337556362, -0.011548296548426151, 0.003821173682808876, 0.027303965762257576, 0.0004238771216478199, 0.014933033846318722, -0.0028038686141371727, 0.0003752097545657307, 0.00652456795796752, 0.010279805399477482, 0.006656440440565348, -0.00809448305517435, 0.033156611025333405, 0.02444044128060341, -0.030142372474074364, 0.006317338906228542, 0.004094338975846767, -0.04071732237935066, 0.02121269516646862, 0.008471262641251087, -0.032001152634620667, 0.0013022447237744927, 0.016013136133551598, 0.026374576613307, -0.014694406650960445, 0.023159390315413475, 0.0020424597896635532, 0.013664542697370052, 0.0057458896189928055, 0.0041539957746863365, 0.005303173791617155, 0.0003883577883243561, 0.004342385567724705, -0.01690484769642353, 0.001676669460721314, 0.020697763189673424, 0.005133622791618109, 0.00046273251064121723, -0.005651694722473621, 0.009808829985558987, 0.00202676048502326, 0.009896745905280113, -0.017884474247694016, 0.00862825382500887, 0.020308423787355423, -0.007994008250534534, 0.005127343349158764, -0.0029388812836259604, -0.0006303209811449051, 0.011818322353065014, -0.018399406224489212, -0.012025550939142704, 0.008678491227328777, 0.00016974708705674857, -0.013677101582288742, 0.01983116939663887, 
-0.007918652147054672, -0.03245328739285469, 0.021162457764148712, -0.008044245652854443, -0.0002005566784646362, 0.0036076651886105537, -0.029187863692641258, -0.03569359332323074, -0.028961796313524246, 0.0027096737176179886, 0.016364796087145805, -0.0071022966876626015, -0.005174440797418356, -0.01400364376604557, -0.023787355050444603, -0.024566033855080605, 0.011648771353065968, 0.005579478573054075, -0.029012033715844154, 0.010857533663511276, -0.0049420930445194244, -0.025093525648117065, -0.0024961652234196663, -0.0030016780365258455, 0.010694262571632862, -0.010681703686714172, -0.015724271535873413, -0.003249724628403783, -0.01731930486857891, 0.006876228842884302, -0.009601601399481297, -0.009821389801800251, 0.012176262214779854, 0.02582196518778801, -0.010185610502958298, -0.0005722341011278331, 0.02747979573905468, -0.019793491810560226, -0.0016405613860115409, 0.017005322501063347, 0.0072027710266411304, -0.02220488153398037, 0.0006103045307099819, -0.007912373170256615, -0.004559034015983343, -0.0010808866936713457, 0.0038023346569389105, 0.01679181307554245, 0.01718115247786045, 0.018713390454649925, 0.004100618418306112, 0.017859356477856636, -0.026022914797067642, -0.016013136133551598, -0.028811084106564522, -0.0023705719504505396, -0.030343322083353996, 0.003858851734548807, 0.013890610076487064, 0.011969033628702164, -0.002329754177480936, -0.0014223431935533881, 0.01764584705233574, 0.016276881098747253, 0.02692718617618084, 0.0021821821574121714, -0.020923830568790436, -0.02855989895761013, -0.007510474417358637, -0.002681415295228362, 0.008326830342411995, -0.01458137296140194, -0.0007727907504886389, 0.01720627024769783, 0.032604001462459564, 0.004417741671204567, 0.03554287925362587, 0.003953046631067991, -0.0322021022439003, -0.02505584806203842, 0.009231101721525192, 0.012659796513617039, 0.009563923813402653, -0.016842050477862358, -0.005513542331755161, 0.0041414364241063595, -0.014405542053282261, 0.02916274592280388, 
0.012163703329861164, -0.008502661250531673, 0.010185610502958298, 0.028710609301924706, -0.0177839994430542, 0.00950112659484148, 0.028384067118167877, 0.014744644053280354, -0.009444610215723515, 0.005055127199739218, 0.02803240716457367, -0.0009089809027500451, 0.0004933458403684199, -0.014405542053282261, -0.008433585055172443, 0.0012111896649003029, -0.031498778611421585, 0.0004556679050438106, -0.021388525143265724, 0.019605102017521858, 0.03569359332323074, 0.013689660467207432, 0.01161109283566475, -0.0064680506475269794, -0.005783567670732737, 0.009205983020365238, -0.013413355685770512, -0.001551076304167509, -0.013488711789250374, 0.003160239430144429, -0.005114783998578787, -0.012408610433340073, -0.003158669453114271, 0.0335836261510849, -0.026274101808667183, 0.009319016709923744, 0.0005424057017080486, -0.0011326938401907682, -0.008841762319207191, -0.005205838941037655, 0.0030566249042749405, -0.013036576099693775, -0.022443508729338646, -0.0017865635454654694, 0.011830881237983704, 0.007096016779541969, -0.01605081371963024, -0.024038542062044144, -0.005039427895098925, -0.0011122849537059665, -0.0032779830507934093, -0.008571737445890903, 0.0041194576770067215, 0.014292508363723755, -0.02342313528060913, 0.015485644340515137, -0.002646877197548747, 0.020722880959510803, 0.00680087273940444, -0.009865347295999527, -0.00600649556145072, 0.021036865189671516, 0.020371221005916595, -0.0016107329865917563, -0.019291117787361145, 0.013287762179970741, -0.004006423521786928, -0.00762350857257843, -0.015334932133555412, 0.0008681631297804415, -0.008295431733131409, -0.0007543442770838737, 0.011523177847266197, -0.0027818898670375347, 0.0037332584615796804, 0.01304913591593504, -0.005237237084656954, -0.00340043636970222, 0.016691338270902634, 0.02612338960170746, -0.002295216079801321, -0.008333110250532627, -0.009683237411081791, -0.022305356338620186, -0.00935041531920433, 0.06972935795783997, -0.001001605880446732, -0.0031319810077548027, 
0.019617659971117973, -0.019014813005924225, -0.012848186306655407, -0.04051637277007103, -0.018223576247692108, -0.01665366068482399, -0.0031979172490537167, -0.004436580464243889, 0.004458559211343527, -0.013802695088088512, 0.01994420401751995, 0.013551508076488972, 0.009080389514565468, 0.00340043636970222, -0.027052778750658035, -0.011341067962348461, 0.0048667374067008495, -0.0151088647544384, 0.010110254399478436, 0.00046077012666501105, 0.009990940801799297, 0.0205470509827137, 0.008307991549372673, 0.00277247023768723, 0.01720627024769783, -0.007460237015038729, -0.0038965295534580946, 0.008483821526169777, -0.00610383041203022, 0.01745745725929737, 0.008778966031968594, 0.013312880881130695, 0.014078999869525433, 0.0029043431859463453, 0.03780356049537659, 0.008464982733130455, -0.005384809337556362, 0.011868558824062347, 0.005287474486976862, -0.006179186515510082, -0.006009635515511036, 0.0142673896625638, 0.0023344638757407665, -0.0057710083201527596, 0.028660371899604797, -0.0014239131705835462, -0.028685491532087326, 0.023121710866689682, 0.007742822170257568, -0.009218541905283928, -0.023071475327014923, -0.01187483873218298, 0.018072864040732384, 0.00027355772908777, -0.013928287662565708, -0.010844974778592587, -0.017796559259295464, -0.014744644053280354, -0.01513398252427578, -0.014694406650960445, -0.001071467180736363, -0.031172236427664757, -0.019190644845366478, -0.02369944006204605, 0.002376851625740528, -0.007466516923159361, 0.001561280689202249, -0.0048981355503201485, -0.03863247483968735, -0.0125404829159379, -0.0036516229156404734, 0.01909017004072666, 0.0004517431079875678, -0.001898027490824461, -0.006486889906227589, 0.004044101573526859, 0.013802695088088512, -0.006047313567250967, -0.029012033715844154, 0.011271991766989231, -0.006292220205068588, 0.015611236914992332, 0.0031727987807244062, 0.00666272034868598, 0.009111788123846054, -0.01690484769642353, 0.029589762911200523, 0.008521500043570995, 0.015071186237037182, 
0.026299219578504562, 0.0003149642434436828, 0.018650593236088753, 0.004537055268883705, 0.0050582666881382465, 0.001783423707820475, -0.019906524568796158, -1.6802998288767412e-05, -0.008307991549372673, -0.01179320365190506, -0.0020785678643733263, 0.004810220096260309, -0.0034695127978920937, 0.01676669530570507, 0.01690484769642353, 0.01568659394979477, -0.015372609719634056, -0.0016327118501067162, 0.034211594611406326, -0.024088779464364052, 0.016251763328909874, -0.0064806099981069565, 0.0001965337578440085, 0.013438474386930466, 0.021564355120062828, 0.03504050895571709, -0.004694046452641487, -0.03443766012787819, -0.005557499825954437, -0.025281915441155434, 0.008490101434290409, 0.0157996267080307, 0.009130626916885376, 0.0012418029364198446, -0.0018619195325300097, -0.022192321717739105, -0.006907626986503601, -0.0032748430967330933, -0.005548080429434776, 0.019542304798960686, -0.006304779555648565, -0.0023250444792211056, -0.015309813432395458, -0.006499449256807566, -0.018324051052331924, 0.005968817975372076, -0.006851110141724348, -0.017859356477856636, -0.02425205148756504, -0.0063236188143491745, -0.006568525452166796, -0.013890610076487064, -0.04551498219370842, -0.044786542654037476, -0.016163846477866173, 0.00394362723454833, -0.017444897443056107, 0.016364796087145805, -0.003949906677007675, -0.000322813808452338, -0.019981881603598595, -0.010625186376273632, -0.006342457607388496, -0.01692996546626091, -0.0006311059114523232, -0.006116389762610197, 0.027002541348338127, 0.027931932359933853, 0.025369830429553986, 0.013978525064885616, 0.0114603815600276, 0.006474330555647612, -0.008559177629649639, -0.01662854291498661, 0.004559034015983343, -0.012094627134501934, -0.014204593375325203, 0.012710033915936947, 0.020999185740947723, 0.028459424152970314, -0.007026940584182739, 0.009595322422683239, 0.0048447586596012115, -0.008188677951693535, -0.020873593166470528, 0.0010173050686717033, -0.013928287662565708, -0.014405542053282261, 
-0.020974067971110344, -0.0010471334680914879, 0.002153923735022545, 0.010958008468151093, 0.011234313249588013, -0.015548440627753735, 0.011812042444944382, 0.008596856147050858, 0.02079823799431324, 0.003050345228984952, 0.026299219578504562, -0.01969301700592041, 0.02742955833673477, 0.007265567779541016, 0.020597288385033607, -0.013162169605493546, 0.007711423560976982, -0.006794593296945095, 0.016917407512664795, 0.009262500330805779, 0.021451322361826897, 0.02770586498081684, -0.012270457111299038, -0.010593787766993046, 0.005884042475372553, 0.026173627004027367, 0.0031162817031145096, 0.007510474417358637, 0.004194813314825296, -0.007862135767936707, 0.0027096737176179886, -0.020195389166474342, -0.004998610354959965, -0.013501270674169064, 0.004712885711342096, -0.012201380915939808, -0.01872594840824604, 0.018625473603606224, -0.011881118640303612, -0.029087388888001442, -0.015787066891789436, -0.0003167304093949497, 0.048001728951931, 0.005959398113191128, 0.012100907042622566, 0.01939159259200096, 0.0028556757606565952, -0.005940559320151806, 0.02068520337343216, -0.018675711005926132, -0.005522961728274822, 0.039813049137592316, 0.0015455815009772778, -0.0063236188143491745, -0.02416413463652134, 0.004684627056121826, 0.004078639671206474, -0.015071186237037182, -0.024051101878285408, 0.008050525560975075, 0.028760846704244614, -0.002221429953351617, -0.014619050547480583, 0.006113249808549881, -0.033131491392850876, 0.02471674606204033, 0.003482071915641427, -0.021363407373428345, -0.015058627352118492, 0.012440008111298084, -0.023071475327014923, 0.0088919997215271, -0.0305191520601511, 0.02720349095761776, 0.03413623571395874, -0.005836945027112961, -0.006668999791145325, 0.003796054981648922, 0.00045880774268880486, -0.0013477721950039268, -0.007686304859817028, 0.02833382971584797, -0.007058338727802038, 0.0036641822662204504, 0.01891433820128441, -0.0021272350568324327, -0.02079823799431324, -0.005208978895097971, -0.012320694513618946, 
0.022581661120057106, -0.017331864684820175, 0.005482144188135862, -0.011899957433342934, 0.011328508146107197, -0.003271703375503421, 0.012075788341462612, -0.013790135271847248, -0.00652456795796752, -0.006210584659129381, -0.0016672499477863312, 0.02665088139474392, 2.0819775272684637e-06, -0.001816391944885254, -0.01485767774283886, -0.017984949052333832, -0.024503236636519432, -0.04712257534265518, -0.004483677912503481, 0.0015801197150722146, -0.029589762911200523, -0.004659508354961872, -0.005466444883495569, 0.009551363997161388, 0.02565869502723217, 0.016779253259301186, -0.012358373031020164, 0.012553042732179165, 0.023247305303812027, -0.0181733388453722, -0.0057301907800138, 0.018688270822167397, 0.015975456684827805, -0.02584708482027054, 0.016251763328909874, -0.0035699873697012663, -0.019178085029125214, 0.004577872809022665, -0.014104118570685387, -0.004160275217145681, 0.011585974134504795, 0.013388236984610558, 0.00026119465474039316, -0.016113610938191414, 0.006348737049847841, 0.02936369553208351, 0.016867170110344887, 0.009036432020366192, -0.007322084624320269, -0.003296822076663375, 0.037050001323223114, -0.014217152260243893, 0.0041539957746863365, -0.007334643974900246, -0.012697474099695683, 0.019655339419841766, -0.007052059285342693, -0.007667466066777706, -0.0007700434071011841, -0.03576894849538803, -0.003962466027587652, -0.0019985020626336336, 0.008138440549373627, 0.014342745766043663, -0.015837304294109344, -0.03210162743926048, -0.0056108771823346615, -0.016703898087143898, 0.00034538135514594615, 0.009984660893678665, -0.01095172856003046, 0.001340707647614181, 0.013840372674167156, 0.016716457903385162, 0.00038973146001808345, 0.005940559320151806, -0.011535737663507462, -0.01093916967511177, -0.016314558684825897, -0.02427716925740242, -0.012634677812457085, -0.0012151143746450543, 0.041244812309741974, 0.006957864388823509, -0.018110541626811028, -0.0088919997215271, -0.005127343349158764, -0.02747979573905468, 
-0.03227745741605759, -0.002741072094067931, -0.0029655699618160725, -0.0056234365329146385, 0.016741575673222542, -0.008527779951691628, 0.022016491740942, 0.006263962015509605, -0.0022748070769011974, -0.00584008451551199, -0.019630219787359238, 0.011824601329863071, 0.004703465849161148, 0.01513398252427578, -0.000919970334507525, -0.02328498288989067, 0.002163343131542206, 0.0026296081487089396, 0.011912517249584198, -0.0042073726654052734, 0.00936297420412302, -0.009331576526165009, -0.010487033985555172, -0.014355304650962353, -0.0040284027345478535, -0.002763050841167569, 0.007253008428961039, 0.016364796087145805, -0.0157996267080307, 0.001935705542564392, 0.0050582666881382465, -0.003422415116801858, 0.003422415116801858, -0.00468148710206151, 0.008433585055172443, -0.015360050834715366, 0.015837304294109344, -0.019981881603598595, -0.006320478860288858, 0.003088023280724883, -0.02336033806204796, 0.015159101225435734, -0.025420067831873894, 0.00652456795796752, 0.0294390507042408, 0.01068798266351223, 0.00756699126213789, -0.0008603135356679559, -0.0018933177925646305, 0.0024160996545106173, 0.005488423630595207, -0.00935041531920433, -0.011315949261188507, -2.2285437808022834e-05, -0.007372322026640177, -0.021149897947907448, -0.02966511808335781, 0.0021617733873426914, -0.025721492245793343, -0.014330185949802399, 0.001420773332938552, 0.03476420417428017, 0.015787066891789436, -0.026550406590104103, -0.01319984719157219, -0.023159390315413475, 0.013011457398533821, 0.009406931698322296, -0.013388236984610558, 0.001562850666232407, -0.00962672010064125, -0.0042324913665652275, 0.021916016936302185, -0.01762072928249836, 0.005290614441037178, 0.018537558615207672, 0.016176406294107437, -0.004251330625265837, 0.039260439574718475, 0.2337038666009903, -0.012791668996214867, 0.0035228899214416742, 0.027354203164577484, 0.02217976190149784, 0.013551508076488972, 0.010210729204118252, 0.0017080678371712565, 0.006982983089983463, 0.0028289873152971268, 
-0.01159853395074606, 0.0012826207093894482, -0.00470974575728178, 0.006756915245205164, 0.005450745578855276, 0.00941949151456356, -0.02107454277575016, -0.019793491810560226, -0.030921051278710365, -0.024503236636519432, 0.006279660854488611, -0.03295566141605377, -0.025256795808672905, -0.014305067248642445, 0.017495134845376015, -0.0072969659231603146, -0.008446143940091133, 0.0011868559522554278, 0.03363386541604996, 0.009746033698320389, 0.004763122648000717, -0.011573415249586105, 0.011228034272789955, 0.011246873065829277, -0.014367864467203617, -0.019567424431443214, 0.03604525327682495, 0.013363118283450603, 0.03084569424390793, 0.003871411085128784, -0.016553185880184174, -0.025068406015634537, -0.004518216010183096, -0.017633287236094475, 0.0027238030452281237, 0.011096160858869553, 0.007246728986501694, -0.026173627004027367, 0.018123101443052292, 0.033985525369644165, 0.004656368400901556, 0.007014381233602762, 0.02300867810845375, 0.044032983481884, 0.00041406514355912805, -0.015950338914990425, 0.008653372526168823, 0.016666220501065254, -0.005259216297417879, -0.000553002639207989, 0.0005902881384827197, 0.03185044229030609, -2.5854542400338687e-05, 0.03943626955151558, -0.023598965257406235, 0.007472796365618706, -0.02519400045275688, -0.002959290286526084, 0.003968745935708284, -0.0035166102461516857, -0.01667878031730652, -0.0014144936576485634, 0.00603475421667099, -0.00840846635401249, -0.029514405876398087, -0.009519966319203377, 0.010499592870473862, 0.0036798813380301, 0.03335756063461304, 0.0274546779692173, -0.009388092905282974, -0.0015463664894923568, -0.009859067387878895, -0.02010747417807579, -0.038657594472169876, -0.029037151485681534, 0.0022512583527714014, -0.00729068648070097, -0.012107186019420624, -0.023372897878289223, -0.014531135559082031, -0.010254686698317528, 0.001805402571335435, -0.006069292314350605, 0.023686882108449936, 0.026198744773864746, -0.0034569534473121166, 0.010612627491354942, -0.02091127075254917, 
0.012364652007818222, -0.005510402377694845, -0.00246790680103004, 7.820140308467671e-05, 0.0090866694226861, 0.0022292796056717634, 0.009903025813400745, -0.0071148560382425785, 0.0007563066901639104, -0.00300324778072536, -0.01563635654747486, -0.022280236706137657, -0.0016735296230763197, -0.006807152647525072, -0.01108988095074892, -0.00680087273940444, 0.00976487249135971, -0.005080245900899172, -0.00265472661703825, -0.0003416528052184731, -0.008885719813406467, -0.003959326073527336, -0.013915728777647018, -0.014794881455600262, 0.002414529677480459, -0.006169767118990421, -0.014807440340518951, -0.006888788193464279, 0.00915574561804533, 0.014531135559082031, -0.030016779899597168, 0.00854661874473095, -0.0002739502233453095, 0.021237812936306, 0.008113321848213673, -0.006989262532442808, 0.007334643974900246, 0.016992762684822083, -0.0028666651342064142, -0.016879728063941002, 0.0035448686685413122, -0.0021131059620529413, -0.010003499686717987, 0.0088919997215271, -0.0073534827679395676, -0.014920474030077457, -0.01649039052426815, 0.03112199902534485, -0.027555152773857117, -0.010970567353069782, 0.002863525412976742, -0.04272681474685669, -0.018185898661613464, -0.013903168961405754, -0.0035385889932513237, 0.027931932359933853, -0.020735440775752068, -0.019680457189679146, -0.02485489845275879, -0.006210584659129381, 0.009896745905280113, -0.012502805329859257, 0.009645558893680573, 0.02996654249727726, -0.007862135767936707, -0.038657594472169876, -0.013149609789252281, -0.16045789420604706, 0.014430660754442215, 0.011523177847266197, -0.012760271318256855, 0.02066008374094963, 0.015171661041676998, 0.024641389027237892, 0.004643809515982866, -0.009067830629646778, 0.018575238063931465, 0.009859067387878895, -0.01288586389273405, -0.014945592731237411, -0.014305067248642445, 0.005290614441037178, -0.01612616889178753, -0.008182398043572903, 0.012352093122899532, 0.029740475118160248, 0.028685491532087326, 0.035115864127874374, -0.007334643974900246, 
-0.0009168304968625307, -0.00477882195264101, 0.013714779168367386, -0.007811898365616798, 0.0007111715967766941, 0.03393528610467911, -0.010053737089037895, -0.007020661141723394, -0.016000576317310333, -0.02888644114136696, 0.02582196518778801, 0.004559034015983343, 0.021966254338622093, -0.006430373061448336, 0.012446288019418716, -0.042601220309734344, -0.01635223627090454, 0.024930253624916077, 0.032855186611413956, 0.018625473603606224, 0.019228322431445122, -0.013526389375329018, -0.015410288237035275, -0.005224677734076977, 0.020609848201274872, 0.015083746053278446, 0.01621408388018608, -0.0070897373370826244, 0.008119601756334305, -0.003028366481885314, -0.0005110073834657669, -0.00622942391782999, 0.019981881603598595, 0.029489288106560707, 0.006888788193464279, 0.02452835626900196, 0.0024773261975497007, -0.008458703756332397, -0.0002564848982729018, -0.012370931915938854, -0.006254542153328657, -0.004706605803221464, 0.006050453521311283, -0.005224677734076977, 0.0014686556532979012, 0.009689517319202423, -0.00024942029267549515, 0.018349168822169304, -0.008910838514566422, -0.023084033280611038, -0.01039911899715662, -0.02554566040635109, 0.006825991440564394, 0.008389626629650593, -0.03378457576036453, 0.03109688125550747, -0.0114603815600276, -0.014832559041678905, -0.008295431733131409, 0.021313169971108437, -0.0024836058728396893, 0.007070898078382015, -0.023121710866689682, 0.00038973146001808345, 0.0032465846743434668, 0.012653516605496407, -0.03134806826710701, -0.027002541348338127, 0.007453957572579384, -0.017984949052333832, -0.014166914857923985, -0.018022626638412476, 0.00543504673987627, 0.015598678030073643, 0.012534203007817268, 0.01623920351266861, -0.01571171171963215, -0.019893966615200043, -0.002260677982121706, 0.017143474891781807, -0.016013136133551598, 0.011190355755388737, 0.03049403429031372, -0.005469584837555885, 0.017532814294099808, 0.007981449365615845, 0.022405831143260002, -0.03169972822070122, -0.03217698261141777, 
0.006631322205066681, 0.014694406650960445, 0.026148507371544838, 0.01623920351266861, 0.026449931785464287, -0.013262644410133362, -0.01734442450106144, 0.0048321993090212345, -0.013099372386932373, 0.052045829594135284, 0.006223144009709358, -0.04420881345868111, -0.0010887362295761704, 0.0028792244847863913, 0.0002376459160586819, -0.08409722149372101, -0.03438742458820343, 0.004712885711342096, 0.014405542053282261, -0.001269276486709714, 0.03835617005825043, 0.004216792527586222, -0.0017881334060803056, 0.0019875126890838146, 0.005535521078854799, 0.008923398330807686, -0.03049403429031372, -0.01787191443145275, -0.010568669065833092, 0.04071732237935066, -0.02720349095761776, 0.012779110111296177, 0.0010651875054463744, -0.033960405737161636, 0.011604813858866692, -0.008678491227328777, 0.007868414744734764, 0.0028478263411670923, -0.001828951295465231, -0.02222999930381775, 0.012201380915939808, -0.022971000522375107, 0.03127271309494972, 0.0016358516877517104, -0.004926394205540419, -0.016754135489463806, -0.024063661694526672, -0.014166914857923985, -0.008307991549372673, -0.004449139814823866, 0.013626864179968834, -0.04099362716078758, 0.03102152608335018, 0.013413355685770512, -0.04106898233294487, 0.027253728359937668, 0.010593787766993046, -0.006461771205067635, -0.05581362545490265, -0.012659796513617039, 0.017168592661619186, -0.0019137266790494323, 0.03777844086289406, 0.01870083063840866, -0.009048991836607456, -0.03222722187638283, -0.03139830380678177, -0.021916016936302185, 0.010832414962351322, 0.01048075407743454, -0.0017535953084006906, 0.008056805469095707, 0.028911558911204338, -0.009921864606440067, 0.011962753720581532, 0.013375678099691868, 0.006147787906229496, -0.02038377895951271, 0.01704300008714199, -0.012559321708977222, 0.018575238063931465, -0.012697474099695683, 0.0232221856713295, 0.0201577115803957, 0.00969579629600048, -0.01563635654747486, 0.028409186750650406, 0.006245122756808996, 0.010053737089037895, 
-0.03792915120720863, -0.005469584837555885, -0.015837304294109344, -0.020258186385035515, 0.006606203503906727, -0.005811826325953007, -0.03310637176036835, -0.020735440775752068, 0.009218541905283928, -0.008182398043572903, 0.026022914797067642, 0.01360174547880888, -0.01054983027279377, 0.0057710083201527596, -0.015347491018474102, -0.03544240817427635, -0.03054427169263363, 0.01035516057163477, 0.006989262532442808, -0.011510618962347507, 0.007529313676059246, 0.014757202938199043, -0.01180576253682375, -0.0010895212180912495, 0.01607593148946762, 0.010807296261191368, -0.02485489845275879, -0.011824601329863071, -0.0650070533156395, 0.032352812588214874, -0.016113610938191414, -0.019353915005922318, 0.0009600031771697104, -0.022016491740942, 0.006687839049845934, -0.01512142363935709, 0.002095836913213134, -0.00881036464124918, -0.018675711005926132, -0.0048635974526405334, -0.002117815660312772, 0.012615839019417763, -0.02798216976225376, 0.0032685634214431047, 0.020182831212878227, 0.014192033559083939, 0.005880902521312237, -0.002144504338502884, 0.00037756460369564593, -0.03245328739285469, -0.016502948477864265, 0.001036928966641426, -0.023774797096848488, 0.011177796870470047, -0.0314234234392643, 0.0005577123956754804, -0.002411389723420143, -0.00874128844588995, 0.0064366525039076805, -0.042576100677251816, 0.0064021144062280655, 0.022682135924696922, 0.0008108612382784486, -0.010122813284397125, 0.006794593296945095, 0.015083746053278446, 0.007366042118519545, 0.020333541557192802, -0.030895931646227837, -0.033181726932525635, 0.015611236914992332, -0.010832414962351322, 0.0036013855133205652, 0.004634389653801918, -0.005340851843357086, -0.0033941566944122314, 0.030418677255511284, 0.009808829985558987, -0.003227745648473501, 0.029263220727443695, -0.028409186750650406, -0.013036576099693775, -0.012841906398534775, -0.008351949043571949, -0.017143474891781807, -0.0003757984668482095, -0.008333110250532627, -0.04493725299835205, 0.02248118631541729, 
0.00018220828496851027, 0.02966511808335781, -0.007868414744734764, 0.005246656946837902, 0.010813576169312, -0.0205470509827137, -0.002692404668778181, -0.014619050547480583, -0.02234303392469883, -0.006920186337083578, -0.00593741936609149, 0.00517130084335804, 0.005312593188136816, 0.019630219787359238, 0.020647525787353516, -0.003180648200213909, 0.0170932374894619, -0.040616847574710846, 0.037728201597929, 0.020258186385035515, 0.021112220361828804, -0.030318202450871468, -0.0004332966054789722, 0.034563254565000534, 0.006088131107389927, -0.006254542153328657, -0.01731930486857891, -0.009205983020365238, 0.00310058263130486, -0.03607037290930748, -0.0266759991645813, 0.006939025595784187, 0.024289729073643684, -0.0038368727546185255, 0.00473800441250205, 0.017520254477858543, 0.00783073715865612, 0.017658406868577003, 0.023372897878289223, 0.0023721419274806976, 0.009438330307602882, -0.016000576317310333, -0.004552754107862711, -0.008785245940089226, 0.005924860015511513, -0.006289080251008272, -0.037878915667533875, -0.007328364532440901, 0.012898423708975315, 0.011812042444944382, 0.029790712520480156, -0.006361296400427818, 0.015046067535877228, -0.02499305084347725, -0.00584008451551199, 0.02609827183187008, -0.0014945593429729342, -0.028434304520487785, 0.04242539033293724, -0.008973635733127594, 0.0035794067662209272, 0.02770586498081684, -0.015071186237037182, 0.008433585055172443, 0.002970279660075903, -0.007793059106916189, -0.0035228899214416742, 0.01905249059200287, -0.0050017498433589935, 0.011717847548425198, -0.010041178204119205, -0.016452711075544357, -0.015410288237035275, 0.0009136906592175364, 0.01372733898460865, -0.008351949043571949, 0.024339966475963593, -0.0161889661103487, 0.05631599947810173, -0.012722592800855637, -0.0041634151712059975, 0.02287052571773529, 0.014292508363723755, 0.020735440775752068, 0.024427881464362144, 0.018072864040732384, -0.008559177629649639, -0.02024562656879425, -0.0038494321051985025, 
0.006756915245205164, -0.0008289152756333351, -0.02259422093629837, -0.011127559468150139, 0.004289008677005768, -0.01591266132891178, 0.004398902412503958, -0.019630219787359238, 0.01292354241013527, 0.018324051052331924, 0.008942237123847008, 0.0243902038782835, 0.013212407007813454, -0.018072864040732384, -0.019931644201278687, 0.00423877127468586, 0.0012331684119999409, -0.018349168822169304, -0.016151288524270058, 0.017243949696421623, 0.0007649412145838141, -0.0012857605470344424, -0.01260955911129713, 0.006656440440565348, 0.002959290286526084, -0.007108576130121946, -0.009513686411082745, -0.0036422032862901688, 0.010562390089035034, 0.02289564348757267, 0.0006240413058549166, -0.018097983673214912, -0.03295566141605377, 0.006813432089984417, 0.002750491490587592, -0.02936369553208351, -0.005384809337556362, -0.02317194826900959 ] } ], "model": "ada", "usage": { "prompt_tokens": 3, "total_tokens": 3 } } ================================================ FILE: libs/ktem/ktem_tests/test_qa.py ================================================ import json from pathlib import Path from unittest.mock import patch import pytest from index import ReaderIndexingPipeline from openai.resources.embeddings import Embeddings from openai.types.chat.chat_completion import ChatCompletion from kotaemon.llms import AzureChatOpenAI with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = json.load(f) _openai_chat_completion_response = ChatCompletion.parse_obj( { "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", "object": "chat.completion", "created": 1692338378, "model": "gpt-35-turbo", "system_fingerprint": None, "choices": [ { "index": 0, "finish_reason": "stop", "message": { "role": "assistant", "content": "Hello! 
def _canned_embedding(*args, **kwargs):
    """Stand-in for ``Embeddings.create``: ignore the call, return the fixture."""
    return openai_embedding


def _canned_chat_completion(*args, **kwargs):
    """Stand-in for ``Completions.create``: ignore the call, return the fixture."""
    return _openai_chat_completion_response


@pytest.fixture(scope="function")
def mock_openai_embedding(monkeypatch):
    """Patch ``Embeddings.create`` so it returns the JSON payload loaded above."""
    monkeypatch.setattr(Embeddings, "create", _canned_embedding)


@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=_canned_chat_completion,
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
    """Index a dummy PDF, retrieve from it, then run the QA pipeline on it.

    Both OpenAI entry points are stubbed: embeddings through the
    ``mock_openai_embedding`` fixture, chat completions through the ``@patch``
    decorator (whose mock object arrives as the ``patch`` argument).
    """
    indexer = ReaderIndexingPipeline(storage_path=tmp_path)
    indexer.indexing_vector_pipeline.embedding.openai_api_key = "some-key"
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"

    # Ingest the document, then derive a retriever from the same pipeline.
    indexer(pdf_path, force_reindex=True)
    retriever = indexer.to_retrieving_pipeline()

    hits = retriever("This is a query")
    assert len(hits) == 1

    # The QA pipeline needs an LLM object; every credential here is a dummy.
    azure_llm = AzureChatOpenAI(
        api_key="dummy",
        api_version="2024-05-01-preview",
        azure_deployment="gpt-4o",
        azure_endpoint="https://test.openai.azure.com/",
    )
    qa = indexer.to_qa_pipeline(llm=azure_llm, openai_api_key="some-key")
    answer = qa("Summarize this document.")
    assert answer
config = context.config

# Configure Python logging from the Alembic .ini file, when one is in use.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Metadata that 'autogenerate' compares against: every SQLModel table that was
# registered by the model imports at the top of this module.
target_metadata = SQLModel.metadata


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The context is configured with the database URL from
    ``settings.KH_DATABASE`` only; no Engine is created in this function.
    """
    offline_options = {
        "url": settings.KH_DATABASE,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    An Engine is built from the ini section (with the URL overridden by
    ``settings.KH_DATABASE``) and a live connection is bound to the context.
    """
    ini_section = config.get_section(config.config_ini_section, {})
    ini_section["sqlalchemy.url"] = settings.KH_DATABASE
    engine = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with engine.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


# Alembic tells us which mode it was invoked in; dispatch accordingly.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
description = "RAG-based Question and Answering Application" dependencies = [ "click>=8.1.7,<9", "platformdirs>=4.2.1,<5", "pluggy>=1.5.0,<2", "python-decouple>=3.8,<4", "SQLAlchemy>=2.0.29,<3", "sqlmodel>=0.0.16,<0.1", "tiktoken>=0.6.0,<1", "gradio>=4.31.0,<5", "gradiologin", "python-multipart==0.0.12", # required for gradio, pinning to avoid yanking issues with micropip (fixed in gradio >= 5.4.0) "markdown>=3.6,<4", "tzlocal>=5.0", "mcp>=1.0.0", ] authors = [ { name = "@trducng", email = "john@cinnamon.is" }, { name = "@lone17", email = "ian@cinnamon.is" }, { name = "@taprosoft", email = "tadashi@cinnamon.is" }, { name = "@cin-albert", email = "albert@cinnamon.is" }, ] classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] ================================================ FILE: libs/ktem/requirements.txt ================================================ platformdirs tzlocal ================================================ FILE: mkdocs.yml ================================================ repo_name: Cinnamon/kotaemon repo_url: https://github.com/Cinnamon/kotaemon site_name: kotaemon Docs edit_uri: edit/main/docs/ nav: - Getting Started: - Quick Start: index.md - Basic Usage: usage.md # - Application: # - Customize UI: pages/app/customize-ui.md # - Functional description: pages/app/functional-description.md - Development: - development/index.md # - Data & Data Structure Components: development/data-components.md # - Features: pages/app/features.md - Customize flow logic: pages/app/customize-flows.md - Creating a Component: development/create-a-component.md - Components: - Index: - File index: pages/app/index/file.md - Settings: - pages/app/settings/overview.md - pages/app/settings/user-settings.md - Extension: - User management: pages/app/ext/user-management.md - Contributing: development/contributing.md # generated using gen-files + literate-nav - API Reference: reference/ - Changelogs: 
https://github.com/Cinnamon/kotaemon/releases - Issue Tracker: https://github.com/Cinnamon/kotaemon/issues - Live Demo: https://huggingface.co/spaces/cin-model/kotaemon-demo markdown_extensions: - admonition - md_in_html - pymdownx.highlight: use_pygments: true anchor_linenums: true line_spans: __span linenums: true pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.snippets - pymdownx.details - pymdownx.extra - pymdownx.tabbed: alternate_style: true - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid format: !!python/name:pymdownx.superfences.fence_code_format - toc: permalink: true title: Page contents - mdx_truly_sane_lists plugins: - search - gen-files: scripts: - docs/scripts/generate_reference_docs.py - literate-nav: nav_file: Summary.md - mkdocstrings: handlers: python: options: docstring_options: ignore_init_summary: false filters: - "!^_" members_order: source separate_signature: true paths: [libs/kotaemon/kotaemon] - git-revision-date-localized: enable_creation_date: true type: timeago fallback_to_build_date: true - section-index - mkdocs-video - include-markdown theme: features: - content.action.edit - content.tabs.link - content.code.annotate - content.code.annotations - content.code.copy - navigation.tabs - navigation.top - navigation.instant - navigation.indexes - toc.follow - search.share - search.highlight - search.suggest name: material custom_dir: docs/theme palette: scheme: dracula primary: deep purple accent: deep purple icon: repo: fontawesome/brands/github edit: material/pencil view: material/eye extra_css: - extra/css/code_select.css - assets/pymdownx-extras/extra-fb5a2a1c86.css extra_javascript: - assets/pymdownx-extras/extra-loader-MCFnu0Wd.js validation: absolute_links: warn omitted_files: warn unrecognized_links: warn ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools >= 61.0", "wheel", 
"setuptools-git-versioning>=2.0,<3"] build-backend = "setuptools.build_meta" [tool.setuptools] include-package-data = false packages.find.include = [] [tool.setuptools-git-versioning] enabled = true dev_template = "{tag}" dirty_template = "{tag}" tag_filter = "v?\\d+(\\.\\d+)*.*" [project] name = "kotaemon-app" dynamic = ["version"] requires-python = ">= 3.10" description = "Kotaemon App" dependencies = [ "kotaemon[all]", "ktem" ] authors = [ { name = "@trducng", email = "john@cinnamon.is" }, { name = "@lone17", email = "ian@cinnamon.is" }, { name = "@taprosoft", email = "tadashi@cinnamon.is" }, { name = "@cin-albert", email = "albert@cinnamon.is" }, ] classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ] [tool.uv.sources] kotaemon = { workspace = true } ktem = { workspace = true } # uv workspace configuration [tool.uv.workspace] members = ["libs/kotaemon", "libs/ktem"] [dependency-groups] dev = [ "black", "coverage", "flake8", "ipython", "pre-commit", "pytest", "pytest-mock", "sphinx" ] [project.urls] Homepage = "https://cinnamon.github.io/kotaemon/" Repository = "https://github.com/Cinnamon/kotaemon/" Documentation = "https://cinnamon.github.io/kotaemon/" [tool.codespell] skip = "*.js,*.css,*.map" # `llm` abbreviation for large language models ignore-words-list = "llm,fo" quiet-level = 3 check-filenames = "" [tool.isort] known_first_party = ["kotaemon"] ================================================ FILE: scripts/download_pdfjs.sh ================================================ #!/bin/bash set -eo pipefail # Check and capture input argument for PDFJS_VERSION_DIST if [ -z "$1" ]; then echo "Usage: $0 " exit 1 fi pdfjs_version_dist=$1 function check_path_for_spaces() { if [[ $PWD =~ \ ]]; then echo "The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later." 
def _init_resource(private: bool = True, id: int = 1):
    """Build ORM classes mirroring the two tables of one index.

    The schemas are hard-coded in this script. The classes are created on a
    fresh declarative base on every call, so different index ids never clash.

    Args:
        private: when True the source table carries a composite unique
            constraint on ``(name, user)``; otherwise ``name`` alone is unique.
        id: index id interpolated into the table names
            ``index__{id}__source`` and ``index__{id}__index``.

    Returns:
        A dict with the generated classes under the keys ``"Source"`` and
        ``"Index"``.
    """
    Base = declarative_base()

    source_attrs = {
        "__tablename__": f"index__{id}__source",
        "id": Column(
            String,
            primary_key=True,
            default=lambda: str(uuid.uuid4()),
            unique=True,
        ),
        "name": Column(String) if private else Column(String, unique=True),
        "path": Column(String),
        "size": Column(Integer, default=0),
        # A callable default is evaluated per insert; passing the already
        # computed ``datetime.now(...)`` would freeze one timestamp forever.
        "date_created": Column(
            DateTime(timezone=True),
            default=lambda: datetime.now(get_localzone()),
        ),
        "user": Column(Integer, default=1),
        "note": Column(
            MutableDict.as_mutable(JSON),  # type: ignore
            default={},
        ),
    }
    if private:
        source_attrs["__table_args__"] = (
            UniqueConstraint("name", "user", name="_name_user_uc"),
        )
    Source = type("Source", (Base,), source_attrs)

    Index = type(
        "IndexTable",
        (Base,),
        {
            "__tablename__": f"index__{id}__index",
            "id": Column(Integer, primary_key=True, autoincrement=True),
            "source_id": Column(String),
            "target_id": Column(String),
            "relation_type": Column(String),
            "user": Column(Integer, default=1),
        },
    )

    return {"Source": Source, "Index": Index}


def get_chromadb_collection(
    db_dir: str = "../ktem_app_data/user_data/vectorstore",
    collection_name: str = "index_1",
):
    """Open the persistent Chroma store at *db_dir* and return a collection.

    The collection is fetched with ``get_or_create_collection``, so a missing
    collection is created rather than reported as an error.
    """
    client = chromadb.PersistentClient(path=db_dir)
    return client.get_or_create_collection(collection_name)


def update_metadata(metadata, file_id):
    """Set ``metadata["file_id"]`` to *file_id* and return the mapping.

    The given mapping is modified in place. ``None`` (an entry stored without
    metadata) is treated as an empty mapping, in which case a new dict is
    returned.
    """
    if metadata is None:
        metadata = {}
    metadata["file_id"] = file_id
    return metadata


def update_collection_file_id(collection, vector_ids, file_id) -> int:
    """Write *file_id* into the metadata of the given vectors.

    Args:
        collection: object exposing ``get(ids=..., include=...)`` and
            ``update(ids=..., metadatas=...)`` (a Chroma collection).
        vector_ids: ids of the vectors belonging to one source document.
        file_id: value stored under the ``"file_id"`` metadata key.

    Returns:
        The number of entries sent to ``collection.update``.
    """
    if not vector_ids:
        return 0

    batch = collection.get(ids=vector_ids, include=["metadatas"])
    found_ids = batch["ids"]
    if not found_ids:
        return 0

    # The update has to go through the collection. ``batch`` is the plain
    # mapping returned by ``get``; calling ``.update`` on it only edits that
    # in-memory mapping and never reaches the vector store.
    collection.update(
        ids=found_ids,
        metadatas=[update_metadata(meta, file_id) for meta in batch["metadatas"]],
    )
    return len(found_ids)


def migrate_chroma_db(
    chroma_db_dir: str, sqlite_path: str, is_private: bool = True, int_index: int = 1
):
    """Update chromadb with metadata.file_id

    For every source document of index *int_index*, look up the ids of its
    vectors (relation type ``"vector"``) in the SQL index table and store the
    document id as ``file_id`` in the matching Chroma metadata.

    Args:
        chroma_db_dir: directory of the persistent Chroma store.
        sqlite_path: SQLAlchemy URL of the application database.
        is_private: forwarded to ``_init_resource`` to pick the source schema.
        int_index: index id; selects both the SQL tables and the Chroma
            collection ``index_{int_index}``.
    """
    chroma_collection_name = f"index_{int_index}"
    engine = create_engine(sqlite_path)
    resource = _init_resource(private=is_private, id=int_index)
    print("Load sqlalchemy engine successfully!")
    chroma_db_collection = get_chromadb_collection(
        db_dir=chroma_db_dir, collection_name=chroma_collection_name
    )
    print(
        f"Load chromadb collection: {chroma_collection_name}, "
        f"path: {chroma_db_dir} successfully!"
    )

    source_table = resource["Source"]
    index_table = resource["Index"]

    # Load docs id of user
    with Session(engine) as session:
        rows = session.execute(select(source_table)).all()
        doc_ids = [row[0].id for row in rows]
    print(f"Retrieve n-docs: {len(doc_ids)}")
    print(doc_ids)

    for doc_id in doc_ids:
        print("-")
        # Find corresponding vector ids
        with Session(engine) as session:
            stmt = select(index_table).where(
                index_table.relation_type == "vector",
                index_table.source_id == doc_id,
            )
            vs_ids = [row[0].target_id for row in session.execute(stmt).all()]
        print(f"Got {len(vs_ids)} vs_ids for doc {doc_id}")

        # Update file_id
        if vs_ids:
            update_collection_file_id(chroma_db_collection, vs_ids, doc_id)
            print(f"doc-{doc_id} got updated")


def main(chroma_db_dir: str, sqlite_path: str):
    """Run ``migrate_chroma_db`` for every row of the ``Index`` table.

    Each row supplies the index id and, through ``config["private"]``, the
    schema variant to use.
    """
    engine = create_engine(sqlite_path)
    with Session(engine) as session:
        file_indices = [row[0] for row in session.execute(select(Index)).all()]

    for file_index in file_indices:
        _id = file_index.id
        _is_private = file_index.config["private"]
        print(f"Migrating for Index id: {_id}, is_private: {_is_private}")
        migrate_chroma_db(
            chroma_db_dir=chroma_db_dir,
            sqlite_path=sqlite_path,
            is_private=_is_private,
            int_index=_id,
        )


if __name__ == "__main__":
    chroma_db_dir: str = "./vectorstore/kan_db"
    sqlite_path: str = "sqlite:///../ktem_app_data/user_data/sql.db"
    main(chroma_db_dir, sqlite_path)
# Create the conda environment at $env_dir unless that directory already exists.
#   $1 - Python version passed to `conda create` as python=<version>
# Reads the globals conda_root and env_dir. If creation fails, the partially
# created environment directory is removed and the script exits with status 1.
function create_conda_env() {
    local python_version="${1}"

    if [ ! -d "${env_dir}" ]; then
        echo "Creating conda environment with python=$python_version in $env_dir"
        "${conda_root}/bin/conda" create -y -k --prefix "$env_dir" python="$python_version" || {
            echo "Failed to create conda environment."
            echo "Will delete the ${env_dir} (if exist) and exit now..."
            # Quoted, with `--`, so the path is always passed to rm as a
            # single operand and can never be split, globbed or read as an
            # option.
            rm -rf -- "$env_dir"
            exit 1
        }
    else
        echo "Conda environment exists at $env_dir"
    fi
}
# Deactivate conda, but only when the active environment ($CONDA_PREFIX) is the
# one managed by this installer ($env_dir); otherwise do nothing.
function deactivate_conda_env() {
    [ "$CONDA_PREFIX" == "$env_dir" ] || return 0

    conda deactivate
    echo "Deactivate conda environment at $env_dir"
}
# Run the repository's local-model setup script with the active python.
function setup_local_model() {
    python "$(pwd)/scripts/serve_local.py"
}

# Download a ZIP archive and extract it.
#   $1 - URL of the archive
#   $2 - destination directory
# An existing destination directory is taken to mean "already downloaded" and
# nothing is fetched. Returns 1 when the directory cannot be created or when
# the download or the extraction fails; in the latter two cases the
# destination is removed again so that the next run retries instead of
# skipping.
function download_and_unzip() {
    local url=$1
    local dest_dir=$2

    if [ -d "$dest_dir" ]; then
        echo "Destination directory $dest_dir already exists. Skipping download."
        return
    fi

    mkdir -p "$dest_dir" || {
        echo "Failed to create destination directory $dest_dir"
        return 1
    }

    # Download the ZIP file
    local zip_file="${dest_dir}/downloaded.zip"
    echo "Downloading $url to $zip_file"
    if ! curl -L -o "$zip_file" "$url"; then
        echo "Failed to download $url"
        rm -rf -- "$dest_dir"
        return 1
    fi

    # Unzip the file to the destination directory
    echo "Unzipping $zip_file to $dest_dir"
    if ! unzip -o "$zip_file" -d "$dest_dir"; then
        echo "Failed to unzip $zip_file"
        rm -rf -- "$dest_dir"
        return 1
    fi

    # Clean up the downloaded ZIP file
    rm "$zip_file"

    echo "Download and unzip completed successfully."
}

# Start the application with PDFJS_PREBUILT_DIR pointing at the prebuilt
# PDF.js directory.
#   $1 - directory exported to app.py as PDFJS_PREBUILT_DIR
# Exits the script with status 1 when app.py returns a non-zero status.
function launch_ui() {
    local pdfjs_prebuilt_dir=$1
    PDFJS_PREBUILT_DIR="$pdfjs_prebuilt_dir" python "$(pwd)/app.py" || {
        echo "" && echo "Will exit now..."
        exit 1
    }
}

# Print a message framed by two rows of asterisks.
#   $1 - message text
function print_highlight() {
    local message="${1}"
    echo "" && echo "******************************************************"
    # Quoted: messages such as "... web UI? [Y/N]" contain glob characters
    # that would otherwise be matched against files in the working directory.
    echo "$message"
    echo "******************************************************" && echo ""
}
# Abort the installer when the working directory path contains a space
# character; later steps of this script pass paths around unquoted.
function check_path_for_spaces() {
    case "$PWD" in
    *" "*)
        echo "The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later."
        exit 1
        ;;
    esac
}
-d "$install_dir/miniconda_installer.sh" ]; then local miniconda_url="https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-${sys_arch}.sh" echo "Downloading Miniconda from $miniconda_url" mkdir -p "$install_dir" curl -Lk "$miniconda_url" >"$install_dir/miniconda_installer.sh" fi echo "Installing Miniconda to $conda_root" chmod u+x "$install_dir/miniconda_installer.sh" bash "$install_dir/miniconda_installer.sh" -b -p "$conda_root" rm -rf "$install_dir/miniconda_installer.sh" fi echo "Miniconda is installed at $conda_root" # test conda echo "Conda version: " "$conda_root/bin/conda" --version || { echo "Conda not found. Will exit now..." exit 1 } } function create_conda_env() { local python_version="${1}" if [ ! -d "${env_dir}" ]; then echo "Creating conda environment with python=$python_version in $env_dir" "${conda_root}/bin/conda" create -y -k --prefix "$env_dir" python="$python_version" || { echo "Failed to create conda environment." echo "Will delete the ${env_dir} (if exist) and exit now..." rm -rf $env_dir exit 1 } else echo "Conda environment exists at $env_dir" fi } function activate_conda_env() { # deactivate the current env(s) to avoid conflicts { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null # check if conda env is broken (because of interruption during creation) if [ ! -f "$env_dir/bin/python" ]; then echo "Conda environment appears to be broken. You may need to remove $env_dir and run the installer again." exit 1 fi source "$conda_root/etc/profile.d/conda.sh" # conda init conda activate "$env_dir" || { echo "Failed to activate environment. Please remove $env_dir and run the installer again." 
exit 1 } echo "Activate conda environment at $CONDA_PREFIX" } function deactivate_conda_env() { # Conda deactivate if we are in the right env if [[ "$CONDA_PREFIX" == "$env_dir" ]]; then conda deactivate echo "Deactivate conda environment at $env_dir" fi } function install_dependencies() { # check if the env is already setup by finding 'kotaemon' in 'pip list' if pip list 2>/dev/null | grep -q "kotaemon"; then echo "Requirements are already installed" else local kotaemon_root="$(pwd)/libs/kotaemon" local ktem_root="$(pwd)/libs/ktem/" if [ -f "$(pwd)/VERSION" ]; then local app_version=$(<"$(pwd)/VERSION") else local app_version="latest" fi if [ -f "pyproject.toml" ]; then echo "Found pyproject.toml. Installing from source" echo "" && echo "Installing libs/kotaemon" python -m pip install -e "$kotaemon_root" echo "" && echo "Installing libs/ktem" python -m pip install -e "$ktem_root" python -m pip install --no-deps -e . else echo "Installing Kotaemon $app_version" # Work around for versioning control python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/kotaemon" python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/ktem" python -m pip install --no-deps "git+https://github.com/Cinnamon/kotaemon.git@$app_version" fi if ! pip list 2>/dev/null | grep -q "kotaemon"; then echo "Installation failed. You may need to run the installer again." deactivate_conda_env exit 1 else print_highlight "Install finished successfully. Clear cache..." "$conda_root/bin/conda" clean --all -y python -m pip cache purge print_highlight "Do you want to launch the web UI? [Y/N]" read -p "Input (yes/no)> " launch # Convert user input to lowercase local launch=${launch:l} if [[ "$launch" != "yes" && "$launch" != "y" && "$launch" != "true" ]]; then echo "Will exit now..." deactivate_conda_env echo "Please run the installer again to launch the UI." 
exit 0 fi fi fi } function setup_local_model() { python $(pwd)/scripts/serve_local.py } function download_and_unzip() { local url=$1 local dest_dir=$2 # Check if the destination directory exists, create if it doesn't if [ -d "$dest_dir" ]; then echo "Destination directory $dest_dir already exists. Skipping download." return fi mkdir -p "$dest_dir" # Download the ZIP file local zip_file="${dest_dir}/downloaded.zip" echo "Downloading $url to $zip_file" curl -L -o "$zip_file" "$url" # Unzip the file to the destination directory echo "Unzipping $zip_file to $dest_dir" unzip -o "$zip_file" -d "$dest_dir" # Clean up the downloaded ZIP file rm "$zip_file" echo "Download and unzip completed successfully." } function launch_ui() { local pdfjs_prebuilt_dir=$1 PDFJS_PREBUILT_DIR="$pdfjs_prebuilt_dir" python $(pwd)/app.py || { echo "" && echo "Will exit now..." exit 1 } } function print_highlight() { local message="${1}" echo "" && echo "******************************************************" echo $message echo "******************************************************" && echo "" } # Main script execution # move two levels up from the dir where this script resides cd "$( cd -- "$(dirname "$0")" >/dev/null 2>&1 pwd -P )" && cd .. 
# Installation layout, all relative to the repository root (the current dir)
install_dir="$(pwd)/install_dir"
conda_root="${install_dir}/conda"
env_dir="${install_dir}/env"
python_version="3.10"

# Prebuilt PDF.js distribution that is downloaded for the UI
pdf_js_version="4.0.379"
pdf_js_dist_name="pdfjs-${pdf_js_version}-dist"
pdf_js_dist_url="https://github.com/mozilla/pdf.js/releases/download/v${pdf_js_version}/${pdf_js_dist_name}.zip"
target_pdf_js_dir="$(pwd)/libs/ktem/ktem/assets/prebuilt/${pdf_js_dist_name}"

check_path_for_spaces

print_highlight "Setting up Miniconda"
install_miniconda

print_highlight "Creating conda environment"
create_conda_env "$python_version"
activate_conda_env

print_highlight "Installing requirements"
install_dependencies

print_highlight "Downloading and unzipping PDF.js"
# arguments are quoted so they always reach the function as exactly two words
download_and_unzip "$pdf_js_dist_url" "$target_pdf_js_dir"

print_highlight "Setting up a local model"
setup_local_model

print_highlight "Launching Kotaemon in your browser, please wait..."
launch_ui "$target_pdf_js_dir"

deactivate_conda_env

read -p "Press enter to continue"

================================================
FILE: scripts/run_windows.bat
================================================
@ECHO off

:: Main script execution

CD /D "%~dp0\.."
:: Read the version to install from the VERSION file; fall back to "latest"
:: when the file cannot be read
SET /p app_version=<"%CD%\VERSION" || SET app_version=latest
:: Everything the installer creates lives under install_dir
SET install_dir=%CD%\install_dir
SET conda_root=%install_dir%\conda
SET env_dir=%install_dir%\env
SET python_version=3.10
SET miniconda_download_url=https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe
SET git_install_dir=%install_dir%\Git
SET seven_zip_dir=%install_dir%\7zip

:: Determine if the machine is 32-bit or 64-bit
IF "%PROCESSOR_ARCHITECTURE%"=="x86" (
    SET seven_zip_url=https://7-zip.org/a/7z2408.exe
    SET git_download_url=https://github.com/git-for-windows/git/releases/download/v2.46.0.windows.1/PortableGit-2.46.0-32-bit.7z.exe
) ELSE (
    SET seven_zip_url=https://7-zip.org/a/7z2408-x64.exe
    SET git_download_url=https://github.com/git-for-windows/git/releases/download/v2.46.0.windows.1/PortableGit-2.46.0-64-bit.7z.exe
)

:: FINDSTR sets ERRORLEVEL 0 when the current directory contains a space
ECHO %CD%| FINDSTR /C:" " >nul 2>&1
IF %ERRORLEVEL% EQU 0 (
    ECHO The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later.
    GOTO :end
)

IF NOT EXIST "%install_dir%" (
    MKDIR "%install_dir%"
)

:: Each step below is a subroutine of this script; a non-zero ERRORLEVEL from
:: a step jumps to :end
CALL :print_highlight "Setting up Git"
CALL :download_and_install_git
IF ERRORLEVEL 1 GOTO :end

:: Temporarily add Portable Git to PATH
SET "PATH=%git_install_dir%\bin;%PATH%"

CALL :print_highlight "Setting up Miniconda"
CALL :download_and_install_miniconda
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Creating conda environment"
CALL :create_conda_environment
IF ERRORLEVEL 1 GOTO :end

CALL :activate_environment
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Installing Kotaemon"
CALL :install_dependencies
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Setting up a local model"
CALL :setup_local_model
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Downloading and extracting PDF.js"
CALL :download_and_extract_pdf_js
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Launching Kotaemon in your browser, please wait..."
CALL :launch_ui

CALL :deactivate_environment
GOTO :end_success


:download_and_install_7zip
:: Install 7-Zip into %seven_zip_dir% unless 7z.exe is already there
IF NOT EXIST "%seven_zip_dir%\7z.exe" (
    ECHO Downloading 7-Zip from %seven_zip_url%
    CALL curl -Lk "%seven_zip_url%" -o "%install_dir%\7zip_installer.exe" || (
        ECHO. && ECHO Failed to download 7-Zip. Aborting...
        GOTO :exit_func_with_error
    )

    ECHO Installing 7-Zip to %seven_zip_dir%
    CALL "%install_dir%\7zip_installer.exe" /S /D=%seven_zip_dir%
    DEL "%install_dir%\7zip_installer.exe"
)
ECHO 7-Zip is installed at %seven_zip_dir%
GOTO :eof

:uninstall_7zip
:: Remove the temporary 7-Zip installation
IF EXIST "%seven_zip_dir%\Uninstall.exe" (
    CALL "%seven_zip_dir%\Uninstall.exe" /S
) ELSE (
    ECHO. && ECHO Uninstaller not found. Manually deleting 7-Zip directory...
    RMDIR /S /Q "%seven_zip_dir%"
)
GOTO :eof

:download_and_install_git
:: Install Portable Git into %git_install_dir% unless git.exe already works there.
:: Errors leave through :exit_func_with_error so that only the main routine
:: reaches :end (and its PAUSE).
CALL "%git_install_dir%\bin\git.exe" --version >nul 2>&1
IF %ERRORLEVEL% NEQ 0 (
    ECHO Install 7-Zip to extract Portable Git. It will be uninstalled automatically after Git installation. && ECHO.
    CALL :download_and_install_7zip
    IF ERRORLEVEL 1 GOTO :exit_func_with_error

    ECHO. && ECHO Downloading Portable Git from %git_download_url%
    CALL curl -Lk "%git_download_url%" -o "%install_dir%\portable_git.7z.exe" || (
        ECHO. && ECHO Failed to download Git. Aborting...
        GOTO :exit_func_with_error
    )

    ECHO Extracting Git to %git_install_dir%...
    CALL "%seven_zip_dir%\7z.exe" x "%install_dir%\portable_git.7z.exe" -o"%git_install_dir%" -y >nul || (
        ECHO. && ECHO Failed to extract Git. Aborting...
        GOTO :exit_func_with_error
    )

    DEL "%install_dir%\portable_git.7z.exe"

    ECHO. && ECHO Uninstalling 7-Zip...
    CALL :uninstall_7zip
    IF ERRORLEVEL 1 GOTO :exit_func_with_error
)
ECHO Git is installed at %git_install_dir%

:: Recheck Git installation
CALL "%git_install_dir%\bin\git.exe" --version || (
    ECHO. && ECHO Git not found. Aborting...
    GOTO :exit_func_with_error
)

SET "PATH=%git_install_dir%\bin;%PATH%"
ECHO Git is added to PATH for this session
GOTO :eof

:download_and_install_miniconda
:: If conda has been installed at the conda_root, don't need to reinstall it
CALL "%conda_root%\_conda.exe" --version >nul 2>&1
IF %ERRORLEVEL% NEQ 0 (
    IF NOT EXIST "%install_dir%\miniconda_installer.exe" (
        ECHO Downloading Miniconda from %miniconda_download_url%
        CALL curl -Lk "%miniconda_download_url%" -o "%install_dir%\miniconda_installer.exe" || (
            ECHO. && ECHO Failed to download Miniconda. Aborting...
            GOTO :exit_func_with_error
        )
    )
    ECHO Installing Miniconda to %conda_root%
    START /wait "" "%install_dir%\miniconda_installer.exe" /InstallationType=JustMe /NoShortcuts=1 /AddToPath=0 /RegisterPython=0 /NoRegistry=1 /S /D=%conda_root%
    DEL "%install_dir%\miniconda_installer.exe"
)
ECHO Conda is installed at %conda_root%

:: Recheck conda
ECHO Conda version:
CALL "%conda_root%\_conda.exe" --version || ( ECHO. && ECHO Conda not found. Aborting... && GOTO :exit_func_with_error )
GOTO :eof

:create_conda_environment
:: Create new conda environment if it doesn't exist
IF NOT EXIST %env_dir% (
    ECHO Creating conda environment with python=%python_version% in %env_dir%
    REM Create conda environment. If the interruption happens, rollback and remove the env_dir
    REM (REM is used here because "::" comments are unreliable inside parentheses)
    CALL "%conda_root%\_conda.exe" create --no-shortcuts -y -k --prefix %env_dir% python=%python_version% || (
        ECHO. && ECHO Failed to create conda environment. Will delete the %env_dir% and abort now...
        RMDIR /s /q %env_dir%
        GOTO :exit_func_with_error
    )
    ECHO Conda environment created successfully
) ELSE (
    ECHO Conda environment exists at %env_dir%
)
GOTO :eof

:activate_environment
:: Deactivate existing conda env(s) to avoid conflicts
IF EXIST "%conda_root%\condabin\conda.bat" (
    CALL "%conda_root%\condabin\conda.bat" deactivate
    CALL "%conda_root%\condabin\conda.bat" deactivate
    CALL "%conda_root%\condabin\conda.bat" deactivate
)

CALL "%env_dir%\python.exe" --version >nul 2>&1 || (
    ECHO The environment appears to be broken. You may need to remove %env_dir% and run the installer again.
    GOTO :exit_func_with_error
)

CALL "%conda_root%\condabin\conda.bat" activate %env_dir% || (
    ECHO Failed to activate environment. You may need to remove %env_dir% and run the installer again.
    GOTO :exit_func_with_error
)
ECHO Activate conda environment at %env_dir%
GOTO :eof

:deactivate_environment
:: Conda deactivate if we are in the right env
IF "%CONDA_PREFIX%" == "%env_dir%" (
    CALL "%conda_root%\condabin\conda.bat" deactivate
    ECHO Deactivate conda environment at %env_dir%
)
GOTO :eof

:install_dependencies
:: Install the kotaemon packages unless 'pip list' already shows them
pip list | findstr /C:"kotaemon" >NUL 2>&1
IF %ERRORLEVEL% == 0 (
    ECHO Dependencies are already installed
) ELSE (
    IF EXIST "pyproject.toml" (
        ECHO Found pyproject.toml. Installing from source...

        ECHO Installing libs\kotaemon
        python -m pip install -e "%CD%\libs\kotaemon"

        ECHO Installing libs\ktem
        python -m pip install -e "%CD%\libs\ktem"

        python -m pip install --no-deps -e .
    ) ELSE (
        ECHO Installing Kotaemon %app_version%
        @REM Work around for versioning control
        python -m pip install git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"#subdirectory=libs/kotaemon
        python -m pip install git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"#subdirectory=libs/ktem
        python -m pip install --no-deps git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"
    )

    ( CALL pip list | findstr /C:"kotaemon" >NUL 2>&1 ) || (
        ECHO. && ECHO Installation failed. You may need to run the installer again.
        CALL :deactivate_environment
        GOTO :exit_func_with_error
    )

    CALL :print_highlight "Install successfully. Clear cache..."
    REM conda.bat must be started with CALL: running another batch file
    REM without CALL does not return control to this script afterwards
    CALL "%conda_root%\condabin\conda.bat" clean --all -y
    python -m pip cache purge
)
GOTO :eof

:download_and_extract_pdf_js
:: Download the prebuilt PDF.js ZIP and extract it under libs\ktem\ktem\assets\prebuilt.
:: Sets target_pdf_js_dir, which :launch_ui reads later.
REM Define variables
set "pdf_js_version=4.0.379"
set "pdf_js_dist_name=pdfjs-%pdf_js_version%-dist"
set "pdf_js_dist_url=https://github.com/mozilla/pdf.js/releases/download/v%pdf_js_version%/%pdf_js_dist_name%.zip"
for /f "delims=" %%i in ('cd') do set "current_dir=%%i"
set "target_pdf_js_dir=%current_dir%\libs\ktem\ktem\assets\prebuilt\%pdf_js_dist_name%"

REM Create the target directory if it does not exist (including parent folders)
if not exist "%target_pdf_js_dir%" (
    echo Creating directory %target_pdf_js_dir%
    mkdir "%target_pdf_js_dir%"
) else (
    echo Directory already exists: %target_pdf_js_dir%
    GOTO :eof
)

REM Download the ZIP file using PowerShell
set "zip_file=%temp%\downloaded.zip"
echo Downloading %pdf_js_dist_url% to %zip_file%
powershell -Command "Invoke-WebRequest -Uri '%pdf_js_dist_url%' -OutFile '%zip_file%'" || GOTO :pdf_js_failed

REM Extract the ZIP file using PowerShell
echo Extracting %zip_file% to %target_pdf_js_dir%
powershell -Command "Expand-Archive -Path '%zip_file%' -DestinationPath '%target_pdf_js_dir%'" || GOTO :pdf_js_failed

REM Clean up the downloaded ZIP file
del "%zip_file%"
echo Download and extraction completed successfully.
goto :eof

:pdf_js_failed
:: Roll back so that an empty target directory is not mistaken for a finished
:: download on the next run
ECHO. && ECHO Failed to download or extract PDF.js. Aborting...
IF EXIST "%zip_file%" DEL "%zip_file%"
RMDIR /S /Q "%target_pdf_js_dir%"
GOTO :exit_func_with_error

:setup_local_model
python "%CD%\scripts\serve_local.py"
GOTO :eof

:launch_ui
:: Workaround for diskcache path with folder start with .
SET THEFLOW_TEMP_PATH=flow_tmp
SET PDFJS_PREBUILT_DIR=%target_pdf_js_dir%
ECHO Starting Kotaemon UI... (prebuilt PDF.js is at %PDFJS_PREBUILT_DIR%)
CALL python -Xutf8 "%CD%\app.py" || ( ECHO. && ECHO Will exit now... && GOTO :exit_func_with_error )
GOTO :eof

:print_highlight
:: Print the first argument framed by two rows of asterisks
ECHO. && ECHO ******************************************************
ECHO %~1
ECHO ****************************************************** && ECHO.
GOTO :eof

:exit_func_with_error
:: Called inside functions when error happens, then back to the main routine with error code 1
EXIT /B 1

:end_success
:: Exit the script main routine with error code 0 (success)
ECHO Script completed successfully.
PAUSE
EXIT /B 0

:end
:: Exit the script main routine with error code 1 (fail)
PAUSE
EXIT /B 1

================================================
FILE: scripts/serve_local.py
================================================
import platform
import subprocess
from inspect import currentframe, getframeinfo
from pathlib import Path

from decouple import config

system_name = platform.system()

cur_frame = currentframe()
if cur_frame is None:
    raise ValueError("Cannot get the current frame.")
this_file = getframeinfo(cur_frame).filename
this_dir = Path(this_file).parent


def serve_llamacpp_python(local_model_file: Path, **kwargs):
    """Start the llama-cpp-python server for a local model in the background.

    The platform-specific ``server_llamacpp_*`` script located next to this
    file is launched without waiting for it to finish.

    Args:
        local_model_file: path of the model file passed as ``--model``.
        **kwargs: extra server options; each ``key=value`` is forwarded as
            ``--key value``. ``port`` defaults to 31415.

    Raises:
        ValueError: if the current operating system is not supported.
    """

    def guess_chat_format(local_model_file):
        # compare case-insensitively so that e.g. "Qwen1.5-...gguf" is recognised
        model_name = local_model_file.stem.lower()

        # handle known cases that the server backends handle incorrectly
        # this is highly heuristic, should be expand later
        # server backends usually has logic for this but they could still be wrong
        if "qwen" in model_name:
            return "qwen"

        return None

    # default port
    if "port" not in kwargs:
        kwargs["port"] = 31415

    chat_format = guess_chat_format(local_model_file)
    if chat_format:
        kwargs = {**kwargs, "chat_format": chat_format}

    # these scripts create a separate conda env and run the server
    if system_name == "Windows":
        script_file = this_dir / "server_llamacpp_windows.bat"
    elif system_name == "Linux":
        script_file = this_dir / "server_llamacpp_linux.sh"
    elif system_name == "Darwin":
        script_file = this_dir / "server_llamacpp_macos.sh"
    else:
        raise ValueError(f"Unsupported system: {system_name}")

    # Build an argument list instead of a shell string so that paths or values
    # containing spaces or shell metacharacters reach the script unchanged.
    cmd = [str(script_file), "--model", str(local_model_file)]
    for key, value in kwargs.items():
        cmd += [f"--{key}", str(value)]
    if system_name != "Windows":
        # run through bash explicitly so the script's executable bit is not required
        cmd.insert(0, "bash")

    subprocess.Popen(cmd)


def main():
    """Offer to serve the model named by LOCAL_MODEL, if one is configured."""
    local_model_file = config("LOCAL_MODEL", default="")

    if not local_model_file:
        print("LOCAL_MODEL not set in the `.env` file.")
        return

    local_model_file = Path(local_model_file)
    if not local_model_file.exists():
        print(f"Local model not found: {local_model_file}")
        return

    print(f"Local model found: {local_model_file}")
    will_start_server = input("Do you want to use this local model ? (y/n): ")

    if will_start_server.lower().strip() not in ["y", "yes"]:
        return

    print("Starting the local server...")
    if local_model_file.suffix == ".gguf":
        serve_llamacpp_python(local_model_file)
    else:
        raise ValueError(f"Unsupported model file type: {local_model_file.suffix}")


if __name__ == "__main__":
    main()

================================================
FILE: scripts/server_llamacpp_linux.sh
================================================
#!/bin/bash

# functions used in the main code execution

# Print a message framed by two rows of asterisks.
#   $1 - message to print
function print_highlight() {
    local message="${1}"
    echo "" && echo "******************************************************"
    # quoted so the message is printed verbatim (no word splitting or globbing)
    echo "$message"
    echo "******************************************************" && echo ""
}

# Abort when the working directory contains a space.
function path_sanity_check() {
    echo "Path sanity checking"
    if [[ $PWD == *" "* ]]; then
        print_highlight "This script relies on Miniconda which can't be silently installed under a path with spaces. Please run it from a path without spaces."
        exit 1
    fi
}

function deactivate_environment() {
    echo "Deactivate existing environment(s)"
    # deactivate existing conda envs as needed to avoid conflicts
    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
}

# Abort when the conda binary under $CONDA_ROOT_PREFIX does not run.
function check_conda_existence() {
    echo "Check for conda existence"
    conda_exists="F"

    # figure out whether conda exists
    if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi

    # verify if conda is installed by the main app, if not then raise error
    if [ "$conda_exists" == "F" ]; then
        # test the conda binary
        print_highlight "conda is not installed, seems like the app wasn't installed correctly."
        # explicit failure status: a bare 'exit' would report success here
        exit 1
    fi
}

# Create the server's conda environment at $INSTALL_ENV_DIR if it is missing.
function create_conda_environment() {
    # create the environment if needed
    if [ ! -e "$INSTALL_ENV_DIR" ]; then
        echo "Create conda environment"
        "$CONDA_ROOT_PREFIX/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python="$PYTHON_VERSION" || {
            echo && print_highlight "Conda environment creation failed." && exit 1
        }
    fi

    # check if conda environment was actually created
    if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
        print_highlight "Conda environment was not correctly created."
        exit 1
    fi
}

function isolate_environment() {
    echo "Isolate environment"
    export PYTHONNOUSERSITE=1
    unset PYTHONPATH
    unset PYTHONHOME
}

function activate_environment() {
    echo "Activate conda environment"
    source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
    conda activate "$INSTALL_ENV_DIR"
}

# main code execution

cd "$(dirname "${BASH_SOURCE[0]}")/.."
echo "Changed the current directory to: $(pwd)"

path_sanity_check
deactivate_environment

# config
ENV_NAME="llama-cpp-python-server"
PYTHON_VERSION="3.10"
CONDA_ROOT_PREFIX="$(pwd)/install_dir/conda"
INSTALL_ENV_DIR="$(pwd)/install_dir/server_envs/${ENV_NAME}"

check_conda_existence
create_conda_environment
isolate_environment
activate_environment

# install dependencies
# ver 0.2.56 produces segment error for /embeddings on MacOS
# (quoted: the square brackets would otherwise be treated as a glob pattern)
python -m pip install "llama-cpp-python[server]==0.2.55"

# start the server with passed params
# ("$@" keeps every argument intact, including values that contain spaces)
python -m llama_cpp.server "$@"

conda deactivate

================================================
FILE: scripts/server_llamacpp_macos.sh
================================================
#!/bin/bash

# functions used in the main code execution

# Print a message framed by two rows of asterisks.
#   $1 - message to print
function print_highlight() {
    local message="${1}"
    echo "" && echo "******************************************************"
    # quoted so the message is printed verbatim (no word splitting or globbing)
    echo "$message"
    echo "******************************************************" && echo ""
}

# Abort when the working directory contains a space.
function path_sanity_check() {
    echo "Path sanity checking"
    if [[ "$(pwd)" =~ " " ]]; then
        print_highlight "This script relies on Miniconda which can't be silently installed under a path with spaces. Please run it from a path without spaces."
        exit 1
    fi
}

function deactivate_environment() {
    echo "Deactivate existing environment(s)"
    # deactivate existing conda envs as needed to avoid conflicts
    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null
}

# Abort when the conda binary under $CONDA_ROOT_PREFIX does not run.
function check_conda_existence() {
    echo "Check for conda existence"
    conda_exists="F"

    # figure out whether conda exists
    if "$CONDA_ROOT_PREFIX/bin/conda" --version &>/dev/null; then conda_exists="T"; fi

    # verify if conda is installed by the main app, if not then raise error
    if [ "$conda_exists" == "F" ]; then
        # test the conda binary
        print_highlight "conda is not installed, seems like the app wasn't installed correctly."
        # explicit failure status: a bare 'exit' would report success here
        exit 1
    fi
}

# Create the server's conda environment at $INSTALL_ENV_DIR if it is missing.
function create_conda_environment() {
    # create the environment if needed
    if [ ! -d "${INSTALL_ENV_DIR}" ]; then
        echo "Create conda environment"
        # braces, not parentheses: 'exit' inside ( ... ) would only leave the
        # subshell and the script would carry on
        "${CONDA_ROOT_PREFIX}/bin/conda" create -y -k --prefix "$INSTALL_ENV_DIR" python="$PYTHON_VERSION" || {
            echo && print_highlight "Conda environment creation failed." && exit 1
        }
    fi

    # check if conda environment was actually created
    if [ ! -f "$INSTALL_ENV_DIR/bin/python" ]; then
        print_highlight "Conda environment was not correctly created."
        exit 1
    fi
}

function isolate_environment() {
    echo "Isolate environment"
    export PYTHONNOUSERSITE=1
    unset PYTHONPATH
    unset PYTHONHOME
}

function activate_environment() {
    echo "Activate conda environment"
    source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh" # otherwise conda complains about 'shell not initialized' (needed when running in a script)
    conda activate "$INSTALL_ENV_DIR"
}

# main code execution

cd "$(
    cd -- "$(dirname "$0")" >/dev/null 2>&1
    pwd -P
)" && cd ..
echo "Changed the current directory to: $(pwd)"

path_sanity_check
deactivate_environment

# config
ENV_NAME="llama-cpp-python-server"
PYTHON_VERSION="3.10"
CONDA_ROOT_PREFIX="$(pwd)/install_dir/conda"
INSTALL_ENV_DIR="$(pwd)/install_dir/server_envs/${ENV_NAME}"

check_conda_existence
create_conda_environment
isolate_environment
activate_environment

# install dependencies
# ver 0.2.56 produces segment error for /embeddings on MacOS
# (quoted: the square brackets would otherwise be treated as a glob pattern)
python -m pip install "llama-cpp-python[server]==0.2.55"

# start the server with passed params
# ("$@" keeps every argument intact, including values that contain spaces)
python -m llama_cpp.server "$@"

conda deactivate

================================================
FILE: scripts/server_llamacpp_windows.bat
================================================
@echo off

@rem main code execution

call :print_highlight "Starting inference server for llama-cpp"

cd /D "%~dp0\.."
echo "Change the current directory to: %cd%"

@rem Subroutines that can fail return 1 via "exit /b 1"; the main routine
@rem stops on the first failure instead of carrying on with a broken setup.
call :path_sanity_check || goto :end
call :deactivate_environment

@rem config
set ENV_NAME=llama-cpp-python-server
set PYTHON_VERSION=3.10
set CONDA_ROOT_PREFIX=%cd%\install_dir\conda
set INSTALL_ENV_DIR=%cd%\install_dir\server_envs\%ENV_NAME%

echo "Python version: %PYTHON_VERSION%"
echo "Conda prefix: %CONDA_ROOT_PREFIX%"
echo "Environment path: %INSTALL_ENV_DIR%"

@rem handle conda environment
call :check_conda_existence || goto :end
call :create_conda_environment || goto :end
call :isolate_environment
call :activate_environment || goto :end

@rem install dependencies
@rem ver 0.2.56 produces segment error for /embeddings on MacOS
call python -m pip install llama-cpp-python[server]==0.2.55

@rem start the server with passed params
call python -m llama_cpp.server %*

call conda deactivate

goto :end
@rem the end of main code execution


@rem below are the functions used in the above execution


:print_highlight
@rem print the first argument framed by two rows of asterisks
echo.
echo ******************************************************
echo %~1
echo ******************************************************
echo.
goto :eof


:path_sanity_check
@rem returns 1 when the current directory contains a space, 0 otherwise
echo "Path sanity checking"
echo "%cd%"| findstr /C:" " >nul
if not errorlevel 1 (
    call :print_highlight "This script relies on Miniconda which can not be silently installed under a path with spaces."
    exit /b 1
)
exit /b 0


:deactivate_environment
echo "Deactivate existing environment(s)"
(call conda deactivate && call conda deactivate && call conda deactivate) 2>nul
goto :eof


:check_conda_existence
@rem returns 1 when the conda installed by the main app cannot be run
echo "Check for conda existence"
set conda_exists=F

@rem figure out whether conda exists
call "%CONDA_ROOT_PREFIX%\_conda.exe" --version >nul 2>&1
if "%ERRORLEVEL%" EQU "0" set conda_exists=T

@rem verify if conda is installed by the main app, if not then raise error
if "%conda_exists%" == "F" (
    call :print_highlight "conda is not installed, seems like the app wasn't installed correctly."
    exit /b 1
)
exit /b 0


:create_conda_environment
@rem create the environment if needed; returns 1 when it cannot be created
if not exist "%INSTALL_ENV_DIR%" (
    echo "Create conda environment"
    call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python="%PYTHON_VERSION%" || (
        echo.
        call :print_highlight "Conda environment creation failed."
        exit /b 1
    )
)

@rem check if conda environment was actually created
if not exist "%INSTALL_ENV_DIR%\python.exe" (
    call :print_highlight "Conda environment was not correctly created."
    exit /b 1
)
exit /b 0


:isolate_environment
echo "Isolate environment"
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
goto :eof


:activate_environment
@rem returns 1 when the environment cannot be activated
echo "Activate conda environment"
call "%CONDA_ROOT_PREFIX%\condabin\conda.bat" activate "%INSTALL_ENV_DIR%" || (
    echo.
    call :print_highlight "Miniconda hook not found."
    exit /b 1
)
exit /b 0

:end

================================================
FILE: scripts/update_linux.sh
================================================
#!/bin/bash

# functions for better code organization

# Abort when the working directory contains a space.
function check_path_for_spaces() {
    if [[ $PWD == *" "* ]]; then
        echo "The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later."
        exit 1
    fi
}

# Activate the conda environment at $env_dir; exits 1 if it is unusable.
function activate_conda_env() {
    # deactivate the current env(s) to avoid conflicts
    { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null

    # check if conda env is broken (because of interruption during creation)
    if [ ! -f "$env_dir/bin/python" ]; then
        echo "Conda environment appears to be broken. You may need to remove $env_dir and run the installer again."
        exit 1
    fi

    source "$conda_root/etc/profile.d/conda.sh" # conda init
    conda activate "$env_dir" || {
        echo "Failed to activate environment. Please remove $env_dir and run the installer again"
        exit 1
    }
    echo "Activate conda environment at $CONDA_PREFIX"
}

# Deactivate conda only when the active environment is the one at $env_dir.
function deactivate_conda_env() {
    # Conda deactivate if we are in the right env
    if [ "$CONDA_PREFIX" == "$env_dir" ]; then
        conda deactivate
        echo "Deactivate conda environment at $env_dir"
    fi
}

# Reinstall the kotaemon packages from GitHub at $app_version.
# Refuses to run in a source checkout (pyproject.toml present). Exits 1 on
# failure after deactivating the environment.
function update_latest() {
    current_version=$(pip list | awk '/kotaemon-app/ {print $2}')
    echo "Current version $current_version"

    if [ -f "pyproject.toml" ]; then
        echo "Source files detected. Please perform git pull manually."
        deactivate_conda_env
        exit 1
    else
        echo "Installing version: $app_version"
        # Work around for versioning control
        # The installs are chained so a failure in any of them is detected,
        # not only a failure of the last one.
        python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/kotaemon" &&
            python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/ktem" &&
            python -m pip install --no-deps "git+https://github.com/Cinnamon/kotaemon.git@$app_version"
        if [ $? -ne 0 ]; then
            echo
            echo "Update failed. You may need to run the update again."
            deactivate_conda_env
            exit 1
        fi
    fi
}

# Print a message framed by two rows of asterisks.
#   $1 - message to print
function print_highlight() {
    local message="${1}"
    echo "" && echo "******************************************************"
    # quoted so the message is printed verbatim (no word splitting or globbing)
    echo "$message"
    echo "******************************************************" && echo ""
}

# Main script execution

# move one level up from the dir where this script resides
cd "$(dirname "${BASH_SOURCE[0]}")" && cd ..
app_version="latest" install_dir="$(pwd)/install_dir" conda_root="${install_dir}/conda" env_dir="${install_dir}/env" check_path_for_spaces print_highlight "Activating conda environment" activate_conda_env print_highlight "Updating Kotaemon to latest" update_latest deactivate_conda_env read -p "Press enter to continue" ================================================ FILE: scripts/update_macos.sh ================================================ #!/bin/bash # functions for better code organization function check_path_for_spaces() { if [[ $PWD =~ \ ]]; then echo "The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later." exit 1 fi } function activate_conda_env() { # deactivate the current env(s) to avoid conflicts { conda deactivate && conda deactivate && conda deactivate; } 2>/dev/null # check if conda env is broken (because of interruption during creation) if [ ! -f "$env_dir/bin/python" ]; then echo "Conda environment appears to be broken. You may need to remove $env_dir and run the installer again." exit 1 fi source "$conda_root/etc/profile.d/conda.sh" # conda init conda activate "$env_dir" || { echo "Failed to activate environment. Please remove $env_dir and run the installer again" exit 1 } echo "Activate conda environment at $CONDA_PREFIX" } function deactivate_conda_env() { # Conda deactivate if we are in the right env if [ "$CONDA_PREFIX" == "$env_dir" ]; then conda deactivate echo "Deactivate conda environment at $env_dir" fi } function update_latest() { current_version=$(pip list | awk '/kotaemon-app/ {print $2}') echo "Current version $current_version" if [ -f "pyproject.toml" ]; then echo "Source files detected. Please perform git pull manually." 
deactivate_conda_env exit 1 else echo "Installing version: $app_version" # Work around for versioning control python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/kotaemon" python -m pip install "git+https://github.com/Cinnamon/kotaemon.git@$app_version#subdirectory=libs/ktem" python -m pip install --no-deps git+https://github.com/Cinnamon/kotaemon.git@$app_version if [ $? -ne 0 ]; then echo echo "Update failed. You may need to run the update again." deactivate_conda_env exit 1 fi fi } function print_highlight() { local message="${1}" echo "" && echo "******************************************************" echo $message echo "******************************************************" && echo "" } # Main script execution # move two levels up from the dir where this script resides cd "$(dirname "${BASH_SOURCE[0]}")" && cd .. app_version="latest" install_dir="$(pwd)/install_dir" conda_root="${install_dir}/conda" env_dir="${install_dir}/env" check_path_for_spaces print_highlight "Activating conda environment" activate_conda_env print_highlight "Updating Kotaemon to latest" update_latest deactivate_conda_env read -p "Press enter to continue" ================================================ FILE: scripts/update_windows.bat ================================================ @ECHO off :: Main script execution CD /D "%~dp0\.." SET app_version=latest SET install_dir=%CD%\install_dir SET conda_root=%install_dir%\conda SET env_dir=%install_dir%\env ECHO %CD%| FINDSTR /C:" " >nul 2>&1 IF %ERRORLEVEL% EQU 0 ( ECHO The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later. 
GOTO :end
)

:: Run the steps in order; a non-zero ERRORLEVEL from a step jumps to :end,
:: which pauses and exits the script with code 1.
CALL :print_highlight "Activating conda environment"
CALL :activate_environment
IF ERRORLEVEL 1 GOTO :end

CALL :print_highlight "Updating Kotaemon to latest"
CALL :update_latest
IF ERRORLEVEL 1 GOTO :end

CALL :deactivate_environment
GOTO :end_success


:: ---------------------------------------------------------------------------
:: :activate_environment
:: Checks that %env_dir%\python.exe runs, then activates the conda environment
:: at %env_dir%. Returns error code 1 (via :exit_func_with_error) on failure.
:: ---------------------------------------------------------------------------
:activate_environment
:: deactivate existing conda env(s) to avoid conflicts
( CALL conda deactivate && CALL conda deactivate && CALL conda deactivate ) 2> nul

CALL "%env_dir%\python.exe" --version >nul 2>&1 || (
    ECHO The environment appears to be broken. You may need to remove %env_dir% and run the installer again.
    GOTO :exit_func_with_error
)

CALL "%conda_root%\condabin\conda.bat" activate %env_dir% || (
    ECHO Failed to activate environment. You may need to remove %env_dir% and run the installer again.
    GOTO :exit_func_with_error
)
ECHO Activate conda environment at %env_dir%
GOTO :eof


:: ---------------------------------------------------------------------------
:: :deactivate_environment
:: Deactivates conda only when the active environment is %env_dir%.
:: ---------------------------------------------------------------------------
:deactivate_environment
:: Conda deactivate if we are in the right env
IF "%CONDA_PREFIX%" == "%env_dir%" (
    CALL "%conda_root%\condabin\conda.bat" deactivate
    ECHO Deactivate conda environment at %env_dir%
)
GOTO :eof


:: ---------------------------------------------------------------------------
:: :update_latest
:: Prints the installed kotaemon-app version, refuses to run when a
:: pyproject.toml exists in the current directory, otherwise pip-installs the
:: kotaemon packages at the git ref %app_version% and records the resulting
:: version in the VERSION file.
:: NOTE(review): the "||" handler below looks like it only reacts to the exit
:: code of the last pip command in the ELSE block, so a failure of the first
:: two installs appears to go undetected - confirm and check each install.
:: ---------------------------------------------------------------------------
:update_latest
FOR /F "tokens=1,2" %%a in ('pip list') do if "%%a"=="kotaemon-app" set current_version=%%b
ECHO Current version %current_version%

IF EXIST "pyproject.toml" (
    ECHO Source files detected. Please perform git pull manually.
    CALL :deactivate_environment
    GOTO :exit_func_with_error
) ELSE (
    ECHO Installing version: %app_version%
    @REM Work around for versioning control
    python -m pip install git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"#subdirectory=libs/kotaemon
    python -m pip install git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"#subdirectory=libs/ktem
    python -m pip install --no-deps git+https://github.com/Cinnamon/kotaemon.git@"%app_version%"
) || (
    ECHO. && ECHO Update failed. You may need to run the update again.
    CALL :deactivate_environment
    GOTO :exit_func_with_error
)

CALL :print_highlight "Update successfully."
:: Read the version again after the install and store it in the VERSION file.
FOR /F "tokens=1,2" %%a in ('pip list') do if "%%a"=="kotaemon-app" set updated_version=%%b
ECHO Updated version %updated_version%
ECHO %updated_version% > VERSION
GOTO :eof


:: ---------------------------------------------------------------------------
:: :print_highlight
:: Prints the first argument (surrounding quotes removed by %~1) framed by
:: two lines of asterisks.
:: ---------------------------------------------------------------------------
:print_highlight
ECHO. && ECHO ******************************************************
ECHO %~1
ECHO ****************************************************** && ECHO.
GOTO :eof


:exit_func_with_error
:: Called inside functions when error happens, then back to the main routine with error code 1
EXIT /B 1


:end_success
:: Exit the script main routine with error code 0 (success)
ECHO Script completed successfully.
PAUSE
EXIT /B 0


:end
:: Exit the script main routine with error code 1 (fail)
PAUSE
EXIT /B 1

================================================
FILE: settings.yaml.example
================================================
# This is a sample GraphRAG settings.yaml file that allows users to run the GraphRAG index process with their customized parameters. # The parameters in this file will only take effect when the USE_CUSTOMIZED_GRAPHRAG_SETTING is true in .env file. # For a comprehensive understanding of GraphRAG parameters, please refer to: https://microsoft.github.io/graphrag/config/json_yaml/. encoding_model: cl100k_base skip_workflows: [] llm: api_key: ${GRAPHRAG_API_KEY} type: openai_chat # or azure_openai_chat api_base: http://127.0.0.1:11434/v1 model: qwen2 model_supports_json: true # recommended if this is available for your model.
# max_tokens: 4000 request_timeout: 1800.0 # api_base: https://.openai.azure.com # api_version: 2024-02-15-preview # organization: # deployment_name: # tokens_per_minute: 150_000 # set a leaky bucket throttle # requests_per_minute: 10_000 # set a leaky bucket throttle # max_retries: 10 # max_retry_wait: 10.0 # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times concurrent_requests: 5 # the number of parallel inflight requests that may be made # temperature: 0 # temperature for sampling # top_p: 1 # top-p sampling # n: 1 # Number of completions to generate parallelization: stagger: 0.3 # num_threads: 50 # the number of threads to use for parallel processing async_mode: threaded # or asyncio embeddings: ## parallelization: override the global parallelization settings for embeddings async_mode: threaded # or asyncio # target: required # or all # batch_size: 16 # the number of documents to send in a single request # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request llm: api_base: http://localhost:11434/v1 api_key: ${GRAPHRAG_API_KEY} model: nomic-embed-text type: openai_embedding # api_base: https://.openai.azure.com # api_version: 2024-02-15-preview # organization: # deployment_name: # tokens_per_minute: 150_000 # set a leaky bucket throttle # requests_per_minute: 10_000 # set a leaky bucket throttle # max_retries: 10 # max_retry_wait: 10.0 # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times # concurrent_requests: 25 # the number of parallel inflight requests that may be made chunks: size: 1200 overlap: 100 group_by_columns: [id] # by default, we don't allow chunks to cross documents input: type: file # or blob file_type: text # or csv base_dir: "input" file_encoding: utf-8 file_pattern: ".*\\.txt$" cache: type: file # or blob base_dir: "cache" # connection_string: # container_name: storage: type: file # or blob base_dir: "output" # connection_string: # 
container_name: reporting: type: file # or console, blob base_dir: "output" # connection_string: # container_name: entity_extraction: ## strategy: fully override the entity extraction strategy. ## type: one of graph_intelligence, graph_intelligence_json and nltk ## llm: override the global llm settings for this task ## parallelization: override the global parallelization settings for this task ## async_mode: override the global async_mode settings for this task prompt: "prompts/entity_extraction.txt" entity_types: [organization,person,geo,event] max_gleanings: 1 summarize_descriptions: ## llm: override the global llm settings for this task ## parallelization: override the global parallelization settings for this task ## async_mode: override the global async_mode settings for this task prompt: "prompts/summarize_descriptions.txt" max_length: 500 claim_extraction: ## llm: override the global llm settings for this task ## parallelization: override the global parallelization settings for this task ## async_mode: override the global async_mode settings for this task # enabled: true prompt: "prompts/claim_extraction.txt" description: "Any claims or facts that could be relevant to information discovery." 
max_gleanings: 1 community_reports: ## llm: override the global llm settings for this task ## parallelization: override the global parallelization settings for this task ## async_mode: override the global async_mode settings for this task prompt: "prompts/community_report.txt" max_length: 2000 max_input_length: 8000 cluster_graph: max_cluster_size: 10 embed_graph: enabled: false # if true, will generate node2vec embeddings for nodes # num_walks: 10 # walk_length: 40 # window_size: 2 # iterations: 3 # random_seed: 597832 umap: enabled: false # if true, will generate UMAP embeddings for nodes snapshots: graphml: false raw_entities: false top_level_nodes: false local_search: # text_unit_prop: 0.5 # community_prop: 0.1 # conversation_history_max_turns: 5 # top_k_mapped_entities: 10 # top_k_relationships: 10 # llm_temperature: 0 # temperature for sampling # llm_top_p: 1 # top-p sampling # llm_n: 1 # Number of completions to generate # max_tokens: 12000 global_search: # llm_temperature: 0 # temperature for sampling # llm_top_p: 1 # top-p sampling # llm_n: 1 # Number of completions to generate # max_tokens: 12000 # data_max_tokens: 12000 # map_max_tokens: 1000 # reduce_max_tokens: 2000 # concurrency: 32 ================================================ FILE: sso_app.py ================================================ import os import gradiologin as grlogin from decouple import config from fastapi import FastAPI from fastapi.responses import FileResponse from theflow.settings import settings as flowsettings KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) AUTHENTICATION_METHOD = config("AUTHENTICATION_METHOD", "GOOGLE") # override GRADIO_TEMP_DIR if it's not set if GRADIO_TEMP_DIR is None: GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR # for authentication with Google GOOGLE_CLIENT_ID = config("GOOGLE_CLIENT_ID", default="") GOOGLE_CLIENT_SECRET 
= config("GOOGLE_CLIENT_SECRET", default="") # for authentication with Open ID by keycloak KEYCLOAK_SERVER_URL = config("KEYCLOAK_SERVER_URL", default="") KEYCLOAK_REALM = config("KEYCLOAK_REALM", default="") KEYCLOAK_CLIENT_ID = config("KEYCLOAK_CLIENT_ID", default="") KEYCLOAK_CLIENT_SECRET = config("KEYCLOAK_CLIENT_SECRET", default="") from ktem.main import App # noqa gradio_app = App() demo = gradio_app.make() app = FastAPI() if AUTHENTICATION_METHOD == "KEYCLOAK": # for authentication with Open ID by keycloak grlogin.register( name="keycloak", server_metadata_url=( f"{KEYCLOAK_SERVER_URL}/realms/{KEYCLOAK_REALM}/" ".well-known/openid-configuration" ), client_id=KEYCLOAK_CLIENT_ID, client_secret=KEYCLOAK_CLIENT_SECRET, client_kwargs={ "scope": "openid email profile", }, ) else: # for authentication with Google grlogin.register( name="google", server_metadata_url=( "https://accounts.google.com/.well-known/openid-configuration" ), client_id=GOOGLE_CLIENT_ID, client_secret=GOOGLE_CLIENT_SECRET, client_kwargs={ "scope": "openid email profile", }, ) @app.get("/favicon.ico", include_in_schema=False) async def favicon(): return FileResponse(gradio_app._favicon) grlogin.mount_gradio_app( app, demo, "/app", allowed_paths=[ "libs/ktem/ktem/assets", GRADIO_TEMP_DIR, ], ) ================================================ FILE: sso_app_demo.py ================================================ import os import gradio as gr from authlib.integrations.starlette_client import OAuth, OAuthError from decouple import config from fastapi import FastAPI, Request from fastapi.responses import FileResponse from starlette.config import Config from starlette.middleware.sessions import SessionMiddleware from starlette.responses import RedirectResponse from theflow.settings import settings as flowsettings KH_DEMO_MODE = getattr(flowsettings, "KH_DEMO_MODE", False) KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".") GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None) # override 
GRADIO_TEMP_DIR if it's not set if GRADIO_TEMP_DIR is None: GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp") os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR GOOGLE_CLIENT_ID = config("GOOGLE_CLIENT_ID", default="") GOOGLE_CLIENT_SECRET = config("GOOGLE_CLIENT_SECRET", default="") SECRET_KEY = config("SECRET_KEY", default="default-secret-key") def add_session_middleware(app): config_data = { "GOOGLE_CLIENT_ID": GOOGLE_CLIENT_ID, "GOOGLE_CLIENT_SECRET": GOOGLE_CLIENT_SECRET, } starlette_config = Config(environ=config_data) oauth = OAuth(starlette_config) oauth.register( name="google", server_metadata_url=( "https://accounts.google.com/" ".well-known/openid-configuration" ), client_kwargs={"scope": "openid email profile"}, ) app.add_middleware(SessionMiddleware, secret_key=SECRET_KEY) return oauth from ktem.main import App # noqa gradio_app = App() main_demo = gradio_app.make() app = FastAPI() oauth = add_session_middleware(app) @app.get("/") def public(request: Request): root_url = gr.route_utils.get_root_url(request, "/", None) return RedirectResponse(url=f"{root_url}/app/") @app.get("/favicon.ico", include_in_schema=False) async def favicon(): return FileResponse(gradio_app._favicon) @app.route("/logout") async def logout(request: Request): request.session.pop("user", None) return RedirectResponse(url="/") @app.route("/login") async def login(request: Request): root_url = gr.route_utils.get_root_url(request, "/login", None) redirect_uri = f"{root_url}/auth" return await oauth.google.authorize_redirect(request, redirect_uri) @app.route("/auth") async def auth(request: Request): try: access_token = await oauth.google.authorize_access_token(request) except OAuthError: return RedirectResponse(url="/") request.session["user"] = dict(access_token)["userinfo"] return RedirectResponse(url="/") app = gr.mount_gradio_app( app, main_demo, path="/app", allowed_paths=[ "libs/ktem/ktem/assets", GRADIO_TEMP_DIR, ], ) ================================================ 
FILE: templates/component-default/README.md ================================================ ================================================ FILE: templates/project-default/cookiecutter.json ================================================ { "project_name": "prj_kotaemon", "ptl": "john" } ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/.gitattributes ================================================ .gitattributes text eol=lf .gitignore text eol=lf *.build text eol=lf *.c text eol=lf *.cmake text eol=lf *.cpp text eol=lf *.csv text eol=lf *.f text eol=lf *.f90 text eol=lf *.for text eol=lf *.grc text eol=lf *.h text eol=lf *.ipynb text eol=lf *.m text eol=lf *.md text eol=lf *.pas text eol=lf *.py text eol=lf *.rst text eol=lf *.sh text eol=lf *.txt text eol=lf *.yml text eol=lf Makefile text eol=lf *.html linguist-documentation ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/.gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm ### Emacs ### # -*- mode: gitignore; -*- *~ \#*\# /.emacs.desktop /.emacs.desktop.lock *.elc auto-save-list tramp .\#* # Org-mode .org-id-locations *_archive # flymake-mode *_flymake.* # eshell files /eshell/history /eshell/lastdir # elpa packages /elpa/ # reftex files *.rel # AUCTeX auto folder /auto/ # cask packages .cask/ dist/ # Flycheck flycheck_*.el # server auth directory /server/ # projectiles files .projectile # directory configuration .dir-locals.el # network security /network-security.data ### Linux ### # temporary files which can be created if a process still has a handle open of a deleted file .fuse_hidden* # KDE directory preferences .directory # Linux 
trash folder which might appear on any partition or disk .Trash-* # .nfs files are created when an open file is removed but is still being accessed .nfs* ### macOS ### # General .DS_Store .AppleDouble .LSOverride # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk ### macOS Patch ### # iCloud generated files *.icloud ### PyCharm ### # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 # User-specific stuff .idea/**/workspace.xml .idea/**/tasks.xml .idea/**/usage.statistics.xml .idea/**/dictionaries .idea/**/shelf # AWS User-specific .idea/**/aws.xml # Generated files .idea/**/contentModel.xml # Sensitive or high-churn files .idea/**/dataSources/ .idea/**/dataSources.ids .idea/**/dataSources.local.xml .idea/**/sqlDataSources.xml .idea/**/dynamic.xml .idea/**/uiDesigner.xml .idea/**/dbnavigator.xml # Gradle .idea/**/gradle.xml .idea/**/libraries # Gradle and Maven with auto-import # When using Gradle or Maven with auto-import, you should exclude module files, # since they will be recreated, and may cause churn. Uncomment if using # auto-import. 
# .idea/artifacts # .idea/compiler.xml # .idea/jarRepositories.xml # .idea/modules.xml # .idea/*.iml # .idea/modules # *.iml # *.ipr # CMake cmake-build-*/ # Mongo Explorer plugin .idea/**/mongoSettings.xml # File-based project format *.iws # IntelliJ out/ # mpeltonen/sbt-idea plugin .idea_modules/ # JIRA plugin atlassian-ide-plugin.xml # Cursive Clojure plugin .idea/replstate.xml # SonarLint plugin .idea/sonarlint/ # Crashlytics plugin (for Android Studio and IntelliJ) com_crashlytics_export_strings.xml crashlytics.properties crashlytics-build.properties fabric.properties # Editor-based Rest Client .idea/httpRequests # Android studio 3.1+ serialized cache file .idea/caches/build_file_checksums.ser ### PyCharm Patch ### # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 # *.iml # modules.xml # .idea/misc.xml # *.ipr # Sonarlint plugin # https://plugins.jetbrains.com/plugin/7973-sonarlint .idea/**/sonarlint/ # SonarQube Plugin # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin .idea/**/sonarIssues.xml # Markdown Navigator plugin # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced .idea/**/markdown-navigator.xml .idea/**/markdown-navigator-enh.xml .idea/**/markdown-navigator/ # Cache file creation bug # See https://youtrack.jetbrains.com/issue/JBR-2257 .idea/$CACHE_FILE$ # CodeStream plugin # https://plugins.jetbrains.com/plugin/12206-codestream .idea/codestream.xml # Azure Toolkit for IntelliJ plugin # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij .idea/**/azureSettings.xml ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller 
builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ ### Python Patch ### # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration poetry.toml # ruff .ruff_cache/ # LSP config files pyrightconfig.json ### Vim ### # Swap [._]*.s[a-v][a-z] !*.svg # comment out if you don't need vector files [._]*.sw[a-p] [._]s[a-rt-v][a-z] [._]ss[a-gi-z] [._]sw[a-p] # Session Session.vim Sessionx.vim # Temporary .netrwhist # Auto-generated tag files tags # Persistent undo [._]*.un~ ### VisualStudioCode ### .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json !.vscode/*.code-snippets # Local History for Visual Studio Code .history/ # Built Visual Studio Code Extensions *.vsix ### VisualStudioCode Patch ### # Ignore all local history of files .history .ionide ### Windows ### # Windows thumbnail cache files Thumbs.db Thumbs.db:encryptable ehthumbs.db ehthumbs_vista.db # Dump file *.stackdump # Folder config file [Dd]esktop.ini # Recycle Bin used on file shares $RECYCLE.BIN/ # Windows Installer files *.cab *.msi *.msix *.msm *.msp # Windows shortcuts *.lnk .theflow/ # End of 
https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm logs/ .gitsecret/keys/random_seed !*.secret credentials.txt S.gpg-agent* .vscode/settings.json ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml ================================================ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 hooks: - id: check-yaml - id: check-toml - id: end-of-file-fixer - id: trailing-whitespace - id: detect-aws-credentials args: ["--allow-missing-credentials"] - id: detect-private-key - id: check-added-large-files - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - id: black language_version: python3 - repo: https://github.com/pycqa/isort rev: 5.12.0 hooks: - id: isort args: ["--profile", "black"] language_version: python3.10 - repo: https://github.com/pycqa/flake8 rev: 4.0.1 hooks: - id: flake8 args: ["--max-line-length", "88", "--extend-ignore", "E203"] - repo: https://github.com/myint/autoflake rev: v1.4 hooks: - id: autoflake args: [ "--in-place", "--remove-unused-variables", "--remove-all-unused-imports", "--ignore-init-module-imports", "--exclude=tests/*", ] - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.7.1 hooks: - id: prettier types_or: [markdown, yaml] - repo: https://github.com/pre-commit/mirrors-mypy rev: "v1.5.1" hooks: - id: mypy additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] args: ["--check-untyped-defs", "--ignore-missing-imports"] ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/README.md ================================================
# Project {{ cookiecutter.project_name }} [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/Cinnamon/kotaemon)
# Install ```bash # Create new conda env (optional) conda create -n {{ cookiecutter.project_name }} python=3.10 conda activate {{ cookiecutter.project_name }} # Clone and install the project git clone "<{{ cookiecutter.project_name }}-repo>" cd "<{{ cookiecutter.project_name }}-repo>" pip install -e . # Generate the project structure cd .. kh start-project ``` # Usage - Build the pipeline in `pipeline.py` For supported utilities and tools, refer: https://github.com/Cinnamon/kotaemon/wiki/Utilities # Contribute - For project issues and errors, please report in this repo issues. - For kotaemon issues and errors, please report or make PR fixes in https://github.com/Cinnamon/kotaemon.git - If the template for this project has issues and errors, please report or make PR fixes in https://github.com/Cinnamon/kotaemon/tree/main/templates/project-default ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/setup.py ================================================ import setuptools setuptools.setup( name="{{ cookiecutter.project_name }}", version="0.0.1", author="{{ cookiecutter.ptl }}", author_email="{{ cookiecutter.ptl }}@cinnamon.is", description="Project {{ cookiecutter.project_name }}", long_description="Project {{ cookiecutter.project_name }}", url="https://github.com/Cinnamon/kotaemon", python_requires=">=3", classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], install_requires=[ "kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git", ], ) ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py ================================================ ================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py ================================================ 
================================================ FILE: templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py ================================================ import os from typing import List from kotaemon.base import BaseComponent, Document, LLMInterface, Node, Param, lazy from kotaemon.contribs.promptui.logs import ResultLog from kotaemon.embeddings import LCAzureOpenAIEmbeddings from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.llms import LCAzureChatOpenAI from kotaemon.storages import ChromaVectorStore, SimpleFileDocumentStore class QAResultLog(ResultLog): @staticmethod def _get_prompt(obj): return obj["prompt"] class QuestionAnsweringPipeline(BaseComponent): _promptui_resultlog = QAResultLog _promptui_outputs: list = [ { "step": ".prompt", "getter": "_get_prompt", "component": "text", "params": {"label": "Constructed prompt to LLM"}, }, { "step": ".", "getter": "_get_output", "component": "text", "params": {"label": "Answer"}, }, ] retrieval_top_k: int = 1 llm: LCAzureChatOpenAI = LCAzureChatOpenAI.withx( azure_endpoint="https://bleh-dummy-2.openai.azure.com/", openai_api_key=os.environ.get("OPENAI_API_KEY", "default-key"), openai_api_version="2023-03-15-preview", deployment_name="dummy-q2-gpt35", temperature=0, request_timeout=60, ) retrieving_pipeline: VectorRetrieval = Node( VectorRetrieval.withx( vector_store=lazy(ChromaVectorStore).withx(path="./tmp"), doc_store=lazy(SimpleFileDocumentStore).withx(path="docstore.json"), embedding=LCAzureOpenAIEmbeddings.withx( model="text-embedding-ada-002", deployment="dummy-q2-text-embedding", azure_endpoint="https://bleh-dummy-2.openai.azure.com/", openai_api_key=os.environ.get("OPENAI_API_KEY", "default-key"), ), ), ignore_ui=True, ) def run(self, text: str) -> LLMInterface: # retrieve relevant documents as context matched_texts: List[str] = [ _.text for _ in self.retrieving_pipeline(text, top_k=int(self.retrieval_top_k)) ] context = 
"\n".join(matched_texts) # generate the answer prompt = f'Answer the following question: "{text}". The context is: \n{context}' self.log_progress(".prompt", prompt=prompt) return self.llm(prompt).text class IndexingPipeline(VectorIndexing): vector_store: ChromaVectorStore = Param( lazy(ChromaVectorStore).withx(path="./tmp"), ignore_ui=True, ) doc_store: SimpleFileDocumentStore = Param( lazy(SimpleFileDocumentStore).withx(path="docstore.json"), ignore_ui=True, ) embedding: LCAzureOpenAIEmbeddings = LCAzureOpenAIEmbeddings.withx( model="text-embedding-ada-002", deployment="dummy-q2-text-embedding", azure_endpoint="https://bleh-dummy-2.openai.azure.com/", openai_api_key=os.environ.get("OPENAI_API_KEY", "default-key"), ) def run(self, text: str) -> Document: """Normally, this indexing pipeline returns nothing. For demonstration, we want it to return something, so let's return the number of documents in the vector store """ super().run(text) if self.doc_store is not None: # persist to local anytime an indexing is created # this can be bypassed when we have a FileDocumentStore self.doc_store.save("docstore.json") return Document(self.vector_store._collection.count())