Repository: arc53/DocsGPT Branch: main Commit: ce5cd5561a65 Files: 642 Total size: 4.1 MB Directory structure: gitextract_kso0ck8g/ ├── .devcontainer/ │ ├── Dockerfile │ ├── devc-welcome.md │ ├── devcontainer.json │ ├── docker-compose-dev.yaml │ ├── docker-compose.override.yaml │ └── post-create-command.sh ├── .env-template ├── .gitattributes ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ └── feature_request.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── dependabot.yml │ ├── holopin.yml │ ├── labeler.yml │ ├── styles/ │ │ ├── DocsGPT/ │ │ │ └── Spelling.yml │ │ └── config/ │ │ └── vocabularies/ │ │ └── DocsGPT/ │ │ └── accept.txt │ └── workflows/ │ ├── bandit.yaml │ ├── ci.yml │ ├── cife.yml │ ├── docker-develop-build.yml │ ├── docker-develop-fe-build.yml │ ├── labeler.yml │ ├── lint.yml │ ├── pytest.yml │ ├── sync_fork.yaml │ └── vale.yml ├── .gitignore ├── .ruff.toml ├── .vale.ini ├── .vscode/ │ └── launch.json ├── AGENTS.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── HACKTOBERFEST.md ├── LICENSE ├── README.md ├── SECURITY.md ├── application/ │ ├── Dockerfile │ ├── __init__.py │ ├── agents/ │ │ ├── __init__.py │ │ ├── agent_creator.py │ │ ├── base.py │ │ ├── classic_agent.py │ │ ├── react_agent.py │ │ ├── tools/ │ │ │ ├── api_body_serializer.py │ │ │ ├── api_tool.py │ │ │ ├── base.py │ │ │ ├── brave.py │ │ │ ├── cryptoprice.py │ │ │ ├── duckduckgo.py │ │ │ ├── mcp_tool.py │ │ │ ├── memory.py │ │ │ ├── notes.py │ │ │ ├── ntfy.py │ │ │ ├── postgres.py │ │ │ ├── read_webpage.py │ │ │ ├── spec_parser.py │ │ │ ├── telegram.py │ │ │ ├── todo_list.py │ │ │ ├── tool_action_parser.py │ │ │ └── tool_manager.py │ │ ├── workflow_agent.py │ │ └── workflows/ │ │ ├── cel_evaluator.py │ │ ├── node_agent.py │ │ ├── schemas.py │ │ └── workflow_engine.py │ ├── api/ │ │ ├── __init__.py │ │ ├── answer/ │ │ │ ├── __init__.py │ │ │ ├── routes/ │ │ │ │ ├── __init__.py │ │ │ │ ├── answer.py │ │ │ │ ├── base.py │ │ │ │ ├── search.py │ │ │ │ └── stream.py │ │ 
│ └── services/ │ │ │ ├── __init__.py │ │ │ ├── compression/ │ │ │ │ ├── __init__.py │ │ │ │ ├── message_builder.py │ │ │ │ ├── orchestrator.py │ │ │ │ ├── prompt_builder.py │ │ │ │ ├── service.py │ │ │ │ ├── threshold_checker.py │ │ │ │ ├── token_counter.py │ │ │ │ └── types.py │ │ │ ├── conversation_service.py │ │ │ ├── prompt_renderer.py │ │ │ └── stream_processor.py │ │ ├── connector/ │ │ │ └── routes.py │ │ ├── internal/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ └── user/ │ │ ├── __init__.py │ │ ├── agents/ │ │ │ ├── __init__.py │ │ │ ├── folders.py │ │ │ ├── routes.py │ │ │ ├── sharing.py │ │ │ └── webhooks.py │ │ ├── analytics/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── attachments/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── base.py │ │ ├── conversations/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── prompts/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── routes.py │ │ ├── sharing/ │ │ │ ├── __init__.py │ │ │ └── routes.py │ │ ├── sources/ │ │ │ ├── __init__.py │ │ │ ├── chunks.py │ │ │ ├── routes.py │ │ │ └── upload.py │ │ ├── tasks.py │ │ ├── tools/ │ │ │ ├── __init__.py │ │ │ ├── mcp.py │ │ │ └── routes.py │ │ ├── utils.py │ │ └── workflows/ │ │ ├── __init__.py │ │ └── routes.py │ ├── app.py │ ├── auth.py │ ├── cache.py │ ├── celery_init.py │ ├── celeryconfig.py │ ├── core/ │ │ ├── __init__.py │ │ ├── json_schema_utils.py │ │ ├── logging_config.py │ │ ├── model_configs.py │ │ ├── model_settings.py │ │ ├── model_utils.py │ │ ├── mongo_db.py │ │ ├── settings.py │ │ └── url_validation.py │ ├── error.py │ ├── index.faiss │ ├── index.pkl │ ├── llm/ │ │ ├── __init__.py │ │ ├── anthropic.py │ │ ├── base.py │ │ ├── docsgpt_provider.py │ │ ├── google_ai.py │ │ ├── groq.py │ │ ├── handlers/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── google.py │ │ │ ├── handler_creator.py │ │ │ └── openai.py │ │ ├── llama_cpp.py │ │ ├── llm_creator.py │ │ ├── novita.py │ │ ├── open_router.py │ 
│ ├── openai.py │ │ ├── premai.py │ │ └── sagemaker.py │ ├── logging.py │ ├── parser/ │ │ ├── __init__.py │ │ ├── chunking.py │ │ ├── connectors/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── connector_creator.py │ │ │ ├── google_drive/ │ │ │ │ ├── __init__.py │ │ │ │ ├── auth.py │ │ │ │ └── loader.py │ │ │ └── share_point/ │ │ │ ├── __init__.py │ │ │ ├── auth.py │ │ │ └── loader.py │ │ ├── embedding_pipeline.py │ │ ├── file/ │ │ │ ├── __init__.py │ │ │ ├── audio_parser.py │ │ │ ├── base.py │ │ │ ├── base_parser.py │ │ │ ├── bulk.py │ │ │ ├── constants.py │ │ │ ├── docling_parser.py │ │ │ ├── docs_parser.py │ │ │ ├── epub_parser.py │ │ │ ├── html_parser.py │ │ │ ├── image_parser.py │ │ │ ├── json_parser.py │ │ │ ├── markdown_parser.py │ │ │ ├── openapi3_parser.py │ │ │ ├── pptx_parser.py │ │ │ ├── rst_parser.py │ │ │ └── tabular_parser.py │ │ ├── remote/ │ │ │ ├── base.py │ │ │ ├── crawler_loader.py │ │ │ ├── crawler_markdown.py │ │ │ ├── github_loader.py │ │ │ ├── reddit_loader.py │ │ │ ├── remote_creator.py │ │ │ ├── s3_loader.py │ │ │ ├── sitemap_loader.py │ │ │ ├── telegram.py │ │ │ └── web_loader.py │ │ └── schema/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── schema.py │ ├── prompts/ │ │ ├── chat_combine_creative.txt │ │ ├── chat_combine_default.txt │ │ ├── chat_combine_strict.txt │ │ ├── chat_reduce_prompt.txt │ │ ├── compression/ │ │ │ └── v1.0.txt │ │ ├── react_final_prompt.txt │ │ └── react_planning_prompt.txt │ ├── requirements.txt │ ├── retriever/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── classic_rag.py │ │ └── retriever_creator.py │ ├── security/ │ │ ├── __init__.py │ │ └── encryption.py │ ├── seed/ │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── config/ │ │ │ ├── agents_template.yaml │ │ │ └── premade_agents.yaml │ │ └── seeder.py │ ├── storage/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── local.py │ │ ├── s3.py │ │ └── storage_creator.py │ ├── stt/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── constants.py │ │ ├── faster_whisper_stt.py │ │ ├── 
live_session.py │ │ ├── openai_stt.py │ │ ├── stt_creator.py │ │ └── upload_limits.py │ ├── templates/ │ │ ├── __init__.py │ │ ├── namespaces.py │ │ └── template_engine.py │ ├── tts/ │ │ ├── base.py │ │ ├── elevenlabs.py │ │ ├── google_tts.py │ │ └── tts_creator.py │ ├── usage.py │ ├── utils.py │ ├── vectorstore/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── document_class.py │ │ ├── elasticsearch.py │ │ ├── embeddings_local.py │ │ ├── faiss.py │ │ ├── lancedb.py │ │ ├── milvus.py │ │ ├── mongodb.py │ │ ├── pgvector.py │ │ ├── qdrant.py │ │ └── vector_creator.py │ ├── worker.py │ └── wsgi.py ├── codecov.yml ├── deployment/ │ ├── docker-compose-azure.yaml │ ├── docker-compose-dev.yaml │ ├── docker-compose-hub.yaml │ ├── docker-compose-local.yaml │ ├── docker-compose.yaml │ ├── k8s/ │ │ ├── deployments/ │ │ │ ├── docsgpt-deploy.yaml │ │ │ ├── mongo-deploy.yaml │ │ │ ├── qdrant-deploy.yaml │ │ │ └── redis-deploy.yaml │ │ ├── docsgpt-secrets.yaml │ │ └── services/ │ │ ├── docsgpt-service.yaml │ │ ├── mongo-service.yaml │ │ ├── qdrant-service.yaml │ │ └── redis-service.yaml │ └── optional/ │ ├── docker-compose.optional.ollama-cpu.yaml │ └── docker-compose.optional.ollama-gpu.yaml ├── docs/ │ ├── README.md │ ├── app/ │ │ ├── [[...mdxPath]]/ │ │ │ └── page.jsx │ │ └── layout.jsx │ ├── components/ │ │ ├── DeploymentCards.jsx │ │ └── ToolCards.jsx │ ├── content/ │ │ ├── Agents/ │ │ │ ├── _meta.js │ │ │ ├── api.mdx │ │ │ ├── basics.mdx │ │ │ ├── nodes.mdx │ │ │ └── webhooks.mdx │ │ ├── Deploying/ │ │ │ ├── Amazon-Lightsail.mdx │ │ │ ├── Development-Environment.mdx │ │ │ ├── Docker-Deploying.mdx │ │ │ ├── DocsGPT-Settings.mdx │ │ │ ├── Hosting-the-app.mdx │ │ │ ├── Kubernetes-Deploying.mdx │ │ │ ├── Railway.mdx │ │ │ └── _meta.js │ │ ├── Extensions/ │ │ │ ├── Chatwoot-extension.mdx │ │ │ ├── Chrome-extension.mdx │ │ │ ├── _meta.js │ │ │ ├── api-key-guide.mdx │ │ │ ├── chat-widget.mdx │ │ │ └── search-widget.mdx │ │ ├── Guides/ │ │ │ ├── Architecture.mdx │ │ │ ├── 
Customising-prompts.mdx │ │ │ ├── How-to-train-on-other-documentation.mdx │ │ │ ├── How-to-use-different-LLM.mdx │ │ │ ├── Integrations/ │ │ │ │ ├── _meta.js │ │ │ │ └── google-drive-connector.mdx │ │ │ ├── My-AI-answers-questions-using-external-knowledge.mdx │ │ │ ├── _meta.js │ │ │ ├── compression.md │ │ │ └── ocr.mdx │ │ ├── Models/ │ │ │ ├── _meta.js │ │ │ ├── cloud-providers.mdx │ │ │ ├── embeddings.md │ │ │ └── local-inference.mdx │ │ ├── Tools/ │ │ │ ├── _meta.js │ │ │ ├── api-tool.mdx │ │ │ ├── basics.mdx │ │ │ └── creating-a-tool.mdx │ │ ├── _meta.js │ │ ├── changelog.mdx │ │ ├── index.mdx │ │ └── quickstart.mdx │ ├── mdx-components.jsx │ ├── next.config.js │ ├── package.json │ ├── public/ │ │ ├── favicons/ │ │ │ └── site.webmanifest │ │ └── llms.txt │ └── theme.config.jsx ├── extensions/ │ ├── chatwoot/ │ │ ├── .env_sample │ │ ├── __init__.py │ │ └── app.py │ ├── chrome/ │ │ ├── _locales/ │ │ │ └── en/ │ │ │ └── messages.json │ │ ├── dist/ │ │ │ └── output.css │ │ ├── js/ │ │ │ └── jquery/ │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── bower.json │ │ │ ├── component.json │ │ │ ├── composer.json │ │ │ ├── jquery.js │ │ │ └── package.json │ │ ├── manifest.json │ │ ├── package.json │ │ ├── popup.html │ │ ├── popup.js │ │ ├── src/ │ │ │ └── bg/ │ │ │ └── service-worker.js │ │ ├── styles.css │ │ └── tailwind.config.js │ ├── discord/ │ │ ├── __init__.py │ │ └── bot.py │ ├── react-widget/ │ │ ├── .gitignore │ │ ├── .parcelrc │ │ ├── README.md │ │ ├── custom.d.ts │ │ ├── package.json │ │ ├── publish.sh │ │ ├── src/ │ │ │ ├── App.tsx │ │ │ ├── browser.tsx │ │ │ ├── components/ │ │ │ │ ├── DocsGPTWidget.tsx │ │ │ │ └── SearchBar.tsx │ │ │ ├── index.html │ │ │ ├── index.ts │ │ │ ├── main.tsx │ │ │ ├── requests/ │ │ │ │ ├── searchAPI.ts │ │ │ │ └── streamingApi.ts │ │ │ ├── types/ │ │ │ │ └── index.ts │ │ │ └── utils/ │ │ │ └── helper.ts │ │ └── tsconfig.json │ ├── slack-bot/ │ │ ├── .gitignore │ │ ├── Readme.md │ │ ├── app.py │ │ └── requirements.txt │ └── 
web-widget/ │ ├── README.md │ ├── dist/ │ │ ├── chat-widget.js │ │ └── output.css │ ├── index.html │ ├── package.json │ ├── src/ │ │ ├── html/ │ │ │ └── widget.html │ │ ├── input.css │ │ └── js/ │ │ └── script.js │ └── tailwind.config.js ├── frontend/ │ ├── .husky/ │ │ └── pre-commit │ ├── .prettierignore │ ├── Dockerfile │ ├── components.json │ ├── eslint.config.js │ ├── index.html │ ├── package.json │ ├── postcss.config.cjs │ ├── prettier.config.cjs │ ├── src/ │ │ ├── App.tsx │ │ ├── Hero.tsx │ │ ├── Navigation.tsx │ │ ├── PageNotFound.tsx │ │ ├── agents/ │ │ │ ├── AgentCard.tsx │ │ │ ├── AgentLogs.tsx │ │ │ ├── AgentPreview.tsx │ │ │ ├── AgentsList.tsx │ │ │ ├── FolderCard.tsx │ │ │ ├── NewAgent.tsx │ │ │ ├── SharedAgent.tsx │ │ │ ├── SharedAgentCard.tsx │ │ │ ├── SharedAgentGate.tsx │ │ │ ├── WorkflowBuilder.tsx │ │ │ ├── agentPreviewSlice.ts │ │ │ ├── agents.config.ts │ │ │ ├── components/ │ │ │ │ └── AgentTypeModal.tsx │ │ │ ├── hooks/ │ │ │ │ ├── useAgentSearch.ts │ │ │ │ └── useAgentsFetch.ts │ │ │ ├── index.tsx │ │ │ ├── types/ │ │ │ │ ├── index.ts │ │ │ │ └── workflow.ts │ │ │ └── workflow/ │ │ │ ├── WorkflowBuilder.tsx │ │ │ ├── WorkflowPreview.tsx │ │ │ ├── components/ │ │ │ │ ├── MobileBlocker.tsx │ │ │ │ └── PromptTextArea.tsx │ │ │ ├── nodes/ │ │ │ │ ├── BaseNode.tsx │ │ │ │ ├── ConditionNode.tsx │ │ │ │ ├── SetStateNode.tsx │ │ │ │ └── index.tsx │ │ │ └── workflowPreviewSlice.ts │ │ ├── api/ │ │ │ ├── client.ts │ │ │ ├── endpoints.ts │ │ │ └── services/ │ │ │ ├── conversationService.ts │ │ │ ├── modelService.ts │ │ │ └── userService.ts │ │ ├── components/ │ │ │ ├── Accordion.tsx │ │ │ ├── ActionButtons.tsx │ │ │ ├── AgentImage.tsx │ │ │ ├── ArtifactSidebar.tsx │ │ │ ├── Avatar.tsx │ │ │ ├── Chunks.tsx │ │ │ ├── ConfigFields.tsx │ │ │ ├── ConnectedStateSkeleton.tsx │ │ │ ├── ConnectorAuth.tsx │ │ │ ├── ConnectorTree.tsx │ │ │ ├── ContextMenu.tsx │ │ │ ├── CopyButton.tsx │ │ │ ├── DocumentPagination.tsx │ │ │ ├── Dropdown.tsx │ │ │ ├── 
DropdownMenu.tsx │ │ │ ├── DropdownModel.tsx │ │ │ ├── FilePicker.tsx │ │ │ ├── FileSelectionSkeleton.tsx │ │ │ ├── FileTree.tsx │ │ │ ├── FileUpload.tsx │ │ │ ├── GoogleDrivePicker.tsx │ │ │ ├── Head.tsx │ │ │ ├── Help.tsx │ │ │ ├── Input.tsx │ │ │ ├── MermaidRenderer.tsx │ │ │ ├── MessageInput.tsx │ │ │ ├── MultiSelectPopup.tsx │ │ │ ├── Notification.tsx │ │ │ ├── RetryIcon.tsx │ │ │ ├── SearchableDropdown.tsx │ │ │ ├── SendArrowIcon.tsx │ │ │ ├── SettingsBar.tsx │ │ │ ├── Sidebar.tsx │ │ │ ├── SkeletonLoader.tsx │ │ │ ├── SourcesPopup.tsx │ │ │ ├── Spinner.tsx │ │ │ ├── Table.tsx │ │ │ ├── TextToSpeechButton.tsx │ │ │ ├── ToggleSwitch.tsx │ │ │ ├── ToolsPopup.tsx │ │ │ ├── UploadToast.tsx │ │ │ ├── types/ │ │ │ │ ├── Dropdown.types.ts │ │ │ │ └── index.ts │ │ │ └── ui/ │ │ │ ├── alert.tsx │ │ │ ├── button.tsx │ │ │ ├── command.tsx │ │ │ ├── dialog.tsx │ │ │ ├── input.tsx │ │ │ ├── label.tsx │ │ │ ├── multi-select.tsx │ │ │ ├── popover.tsx │ │ │ ├── select.tsx │ │ │ └── sheet.tsx │ │ ├── constants/ │ │ │ └── fileUpload.ts │ │ ├── conversation/ │ │ │ ├── Conversation.tsx │ │ │ ├── ConversationBubble.module.css │ │ │ ├── ConversationBubble.tsx │ │ │ ├── ConversationMessages.tsx │ │ │ ├── ConversationTile.tsx │ │ │ ├── SharedConversation.tsx │ │ │ ├── conversationHandlers.ts │ │ │ ├── conversationModels.ts │ │ │ ├── conversationSlice.ts │ │ │ ├── sharedConversationSlice.ts │ │ │ └── types/ │ │ │ └── index.ts │ │ ├── hooks/ │ │ │ ├── index.ts │ │ │ ├── useDataInitializer.ts │ │ │ └── useTokenAuth.ts │ │ ├── index.css │ │ ├── lib/ │ │ │ └── utils.ts │ │ ├── locale/ │ │ │ ├── de.json │ │ │ ├── en.json │ │ │ ├── es.json │ │ │ ├── i18n.ts │ │ │ ├── jp.json │ │ │ ├── ru.json │ │ │ ├── zh-TW.json │ │ │ └── zh.json │ │ ├── main.tsx │ │ ├── modals/ │ │ │ ├── AddActionModal.tsx │ │ │ ├── AddToolModal.tsx │ │ │ ├── AgentDetailsModal.tsx │ │ │ ├── ConfigToolModal.tsx │ │ │ ├── ConfirmationModal.tsx │ │ │ ├── DeleteConvModal.tsx │ │ │ ├── FolderManagementModal.tsx │ │ │ ├── 
ImportSpecModal.tsx │ │ │ ├── JWTModal.tsx │ │ │ ├── MCPServerModal.tsx │ │ │ ├── MoveToFolderModal.tsx │ │ │ ├── ShareConversationModal.tsx │ │ │ ├── WrapperModal.tsx │ │ │ └── types/ │ │ │ └── index.ts │ │ ├── models/ │ │ │ ├── misc.ts │ │ │ └── types.ts │ │ ├── preferences/ │ │ │ ├── PromptsModal.tsx │ │ │ ├── preferenceApi.ts │ │ │ ├── preferenceSlice.ts │ │ │ └── types/ │ │ │ └── index.ts │ │ ├── settings/ │ │ │ ├── Analytics.tsx │ │ │ ├── General.tsx │ │ │ ├── Logs.tsx │ │ │ ├── Prompts.tsx │ │ │ ├── Sources.tsx │ │ │ ├── ToolConfig.tsx │ │ │ ├── Tools.tsx │ │ │ ├── index.tsx │ │ │ └── types/ │ │ │ └── index.ts │ │ ├── store.ts │ │ ├── upload/ │ │ │ ├── Upload.tsx │ │ │ ├── types/ │ │ │ │ └── ingestor.ts │ │ │ └── uploadSlice.ts │ │ ├── utils/ │ │ │ ├── browserUtils.ts │ │ │ ├── chartUtils.ts │ │ │ ├── dateTimeUtils.ts │ │ │ ├── objectUtils.ts │ │ │ ├── providerUtils.ts │ │ │ └── stringUtils.ts │ │ └── vite-env.d.ts │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts ├── md-gen.py ├── pytest.ini ├── scripts/ │ ├── migrate_conversation_id_dbref_to_objectid.py │ └── migrate_to_v1_vectorstore.py ├── setup.ps1 ├── setup.sh └── tests/ ├── __init__.py ├── agents/ │ ├── __init__.py │ ├── test_agent_creator.py │ ├── test_base_agent.py │ ├── test_classic_agent.py │ ├── test_get_artifact.py │ ├── test_react_agent.py │ ├── test_tool_action_parser.py │ ├── test_tool_manager.py │ ├── test_workflow_engine.py │ └── test_workflow_template.py ├── api/ │ ├── __init__.py │ ├── answer/ │ │ ├── __init__.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ └── test_search.py │ │ └── services/ │ │ ├── __init__.py │ │ ├── test_conversation_service.py │ │ ├── test_prompt_renderer.py │ │ └── test_stream_processor.py │ ├── conftest.py │ └── user/ │ ├── attachments/ │ │ └── test_routes.py │ ├── sources/ │ │ ├── __init__.py │ │ ├── test_audio_upload.py │ │ └── test_routes.py │ ├── test_base.py │ └── test_exception_sanitization.py ├── conftest.py ├── core/ │ 
└── test_url_validation.py ├── integration/ │ ├── __init__.py │ ├── base.py │ ├── run_all.py │ ├── test_agents.py │ ├── test_analytics.py │ ├── test_chat.py │ ├── test_connectors.py │ ├── test_conversations.py │ ├── test_mcp.py │ ├── test_misc.py │ ├── test_prompts.py │ ├── test_sources.py │ └── test_tools.py ├── llm/ │ ├── handlers/ │ │ ├── test_google.py │ │ ├── test_handler_creator.py │ │ ├── test_llm_handlers.py │ │ └── test_openai.py │ ├── test_anthropic_llm.py │ ├── test_google_llm.py │ ├── test_openai_llm.py │ └── test_sagemaker.py ├── parser/ │ ├── file/ │ │ ├── test_audio_parser.py │ │ ├── test_docs_parser.py │ │ ├── test_embedding_pipeline.py │ │ ├── test_epub_parser.py │ │ ├── test_html_parser.py │ │ ├── test_image_parser.py │ │ ├── test_json_parser.py │ │ ├── test_markdown_parser.py │ │ ├── test_pptx_parser.py │ │ ├── test_rst_parser.py │ │ └── test_tabular_parser.py │ └── remote/ │ ├── test_crawler_loader.py │ ├── test_crawler_markdown.py │ ├── test_github_loader.py │ ├── test_reddit_loader.py │ ├── test_s3_loader.py │ ├── test_share_point_loader.py │ └── test_web_loader.py ├── requirements.txt ├── security/ │ └── test_encryption.py ├── storage/ │ ├── test_local_storage.py │ └── test_s3_storage.py ├── stt/ │ ├── test_live_session.py │ ├── test_stt_creator.py │ └── test_upload_limits.py ├── test_agent_token_tracking.py ├── test_app.py ├── test_attachment_worker_audio.py ├── test_cache.py ├── test_celery.py ├── test_compression_service.py ├── test_error.py ├── test_integration.py ├── test_memory_tool.py ├── test_model_validation.py ├── test_notes_tool.py ├── test_openapi3.yaml ├── test_openapi3parser.py ├── test_todo_tool.py ├── test_token_management.py ├── test_usage.py ├── test_zip_extraction_security.py └── tts/ ├── test_elevenlabs_tts.py ├── test_google_tts.py └── test_tts_creator.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: 
.devcontainer/Dockerfile ================================================ FROM python:3.12-bookworm # Install Node.js 20.x RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ && apt-get install -y nodejs \ && rm -rf /var/lib/apt/lists/* # Install global npm packages RUN npm install -g husky vite # Create and activate Python virtual environment RUN python -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" WORKDIR /workspace ================================================ FILE: .devcontainer/devc-welcome.md ================================================ # Welcome to DocsGPT Devcontainer Welcome to the DocsGPT development environment! This guide will help you get started quickly. ## Starting Services To run DocsGPT, you need to start three main services: Flask (backend), Celery (task queue), and Vite (frontend). Here are the commands to start each service within the devcontainer: ### Vite (Frontend) ```bash cd frontend npm run dev -- --host ``` ### Flask (Backend) ```bash flask --app application/app.py run --host=0.0.0.0 --port=7091 ``` ### Celery (Task Queue) ```bash celery -A application.app.celery worker -l INFO ``` ## GitHub Codespaces Instructions ### 1. Make Ports Public: Go to the "Ports" panel in Codespaces (usually located at the bottom of the VS Code window). For both ports 5173 and 7091, right-click on the port and select "Make Public". ![CleanShot 2025-02-12 at 09 46 14@2x](https://github.com/user-attachments/assets/00a34b16-a7ef-47af-9648-87a7e3008475) ### 2. Update VITE_API_HOST: After making port 7091 public, copy the public URL provided by Codespaces for port 7091. Open the file frontend/.env.development. Find the line VITE_API_HOST=http://localhost:7091. Replace http://localhost:7091 with the public URL you copied from Codespaces. 
![CleanShot 2025-02-12 at 09 46 56@2x](https://github.com/user-attachments/assets/c472242f-1079-4cd8-bc0b-2d78db22b94c) ================================================ FILE: .devcontainer/devcontainer.json ================================================ { "name": "DocsGPT Dev Container", "dockerComposeFile": ["docker-compose-dev.yaml", "docker-compose.override.yaml"], "service": "dev", "workspaceFolder": "/workspace", "postCreateCommand": ".devcontainer/post-create-command.sh", "forwardPorts": [7091, 5173, 6379, 27017], "customizations": { "vscode": { "extensions": [ "ms-python.python", "ms-toolsai.jupyter", "esbenp.prettier-vscode", "dbaeumer.vscode-eslint" ] }, "codespaces": { "openFiles": [ ".devcontainer/devc-welcome.md", "CONTRIBUTING.md" ] } } } ================================================ FILE: .devcontainer/docker-compose-dev.yaml ================================================ services: redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 volumes: - mongodb_data_container:/data/db volumes: mongodb_data_container: ================================================ FILE: .devcontainer/docker-compose.override.yaml ================================================ version: '3.8' services: dev: build: context: . 
dockerfile: Dockerfile volumes: - ../:/workspace:cached command: sleep infinity depends_on: redis: condition: service_healthy mongo: condition: service_healthy environment: - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - CACHE_REDIS_URL=redis://redis:6379/2 networks: - default redis: healthcheck: test: ["CMD", "redis-cli", "ping"] interval: 5s timeout: 30s retries: 5 mongo: healthcheck: test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')"] interval: 5s timeout: 30s retries: 5 networks: default: name: docsgpt-dev-network ================================================ FILE: .devcontainer/post-create-command.sh ================================================
#!/bin/bash
# Devcontainer post-create hook. It:
#   1. seeds frontend/.env.development from the root .env-template (first run only),
#   2. points VITE_API_HOST at the backend (Codespaces public URL or localhost),
#   3. downloads the local embeddings model if it is not present yet,
#   4. installs the Python and frontend dependencies.

set -e  # Exit immediately if a command exits with a non-zero status

ENV_FILE=frontend/.env.development

if [ ! -f "$ENV_FILE" ]; then
  cp -n .env-template "$ENV_FILE" || true  # Assuming .env-template is in the root
fi

# Store VITE_API_HOST=<$1> in $ENV_FILE.
# An existing entry is rewritten in place; when the file has no entry (for
# example right after being copied from .env-template, which does not define
# the key) a new line is appended, because sed alone would change nothing.
set_vite_api_host() {
  if grep -q 'VITE_API_HOST=' "$ENV_FILE" 2>/dev/null; then
    sed -i "s|VITE_API_HOST=.*|VITE_API_HOST=$1|" "$ENV_FILE"
  else
    # Leading newline guards against a file whose last line is unterminated.
    printf '\nVITE_API_HOST=%s\n' "$1" >> "$ENV_FILE"
  fi
}

# Determine VITE_API_HOST based on environment
if [ -n "$CODESPACES" ]; then
  # Running in Codespaces.
  # CODESPACES is only a flag ("true"); the codespace name is supplied by the
  # platform in CODESPACE_NAME, so it is used as-is instead of being derived
  # from (and overwritten with) the flag value.
  PUBLIC_API_HOST="https://${CODESPACE_NAME}-7091.${GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN}"
  echo "Setting VITE_API_HOST for Codespaces: $PUBLIC_API_HOST in frontend/.env.development"
  set_vite_api_host "$PUBLIC_API_HOST"
else
  # Not running in Codespaces (local devcontainer)
  DEFAULT_API_HOST="http://localhost:7091"
  echo "Setting VITE_API_HOST for local dev: $DEFAULT_API_HOST in frontend/.env.development"
  set_vite_api_host "$DEFAULT_API_HOST"
fi

# Fetch the embeddings model once; later container rebuilds reuse the unpacked copy.
mkdir -p model
if [ ! -d model/all-mpnet-base-v2 ]; then
  wget -q https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip -O model/mpnet-base-v2.zip
  unzip -q model/mpnet-base-v2.zip -d model
  rm model/mpnet-base-v2.zip
fi

pip install -r application/requirements.txt
cd frontend
npm install --include=dev
================================================ FILE: .env-template ================================================ API_KEY= LLM_NAME=docsgpt VITE_API_STREAMING=true INTERNAL_KEY= # Remote Embeddings (Optional - for using a remote embeddings API instead of local SentenceTransformer) # When set, the app will use the remote API and won't load SentenceTransformer (saves RAM) EMBEDDINGS_BASE_URL= EMBEDDINGS_KEY= #For Azure (you can delete it if you don't use Azure) OPENAI_API_BASE= OPENAI_API_VERSION= AZURE_DEPLOYMENT_NAME= AZURE_EMBEDDINGS_DEPLOYMENT_NAME= #Azure AD Application (client) ID MICROSOFT_CLIENT_ID=your-azure-ad-client-id #Azure AD Application client secret MICROSOFT_CLIENT_SECRET=your-azure-ad-client-secret #Azure AD Tenant ID (or 'common' for multi-tenant) MICROSOFT_TENANT_ID=your-azure-ad-tenant-id #If you are using a Microsoft Entra ID tenant, #configure the AUTHORITY variable as #"https://login.microsoftonline.com/TENANT_GUID" #or "https://login.microsoftonline.com/contoso.onmicrosoft.com". #Alternatively, use "https://login.microsoftonline.com/common" for multi-tenant app. 
MICROSOFT_AUTHORITY=https://{tenantId}.ciamlogin.com/{tenantId} ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: arc53 ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: "🐛 Bug Report" description: "Submit a bug report to help us improve" title: "🐛 Bug Report: " labels: ["type: bug"] body: - type: markdown attributes: value: We value your time and your efforts to submit this bug report is appreciated. 🙏 - type: textarea id: description validations: required: true attributes: label: "📜 Description" description: "A clear and concise description of what the bug is." placeholder: "It bugs out when ..." - type: textarea id: steps-to-reproduce validations: required: true attributes: label: "👟 Reproduction steps" description: "How do you trigger this bug? Please walk us through it step by step." placeholder: "1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error" - type: textarea id: expected-behavior validations: required: true attributes: label: "👍 Expected behavior" description: "What did you think should happen?" placeholder: "It should ..." - type: textarea id: actual-behavior validations: required: true attributes: label: "👎 Actual Behavior with Screenshots" description: "What did actually happen? Add screenshots, if applicable." placeholder: "It actually ..." - type: dropdown id: operating-system attributes: label: "💻 Operating system" description: "What OS is your app running on?" 
options: - Linux - MacOS - Windows - Something else validations: required: true - type: dropdown id: browsers attributes: label: What browsers are you seeing the problem on? multiple: true options: - Firefox - Chrome - Safari - Microsoft Edge - Something else - type: dropdown id: dev-environment validations: required: true attributes: label: "🤖 What development environment are you experiencing this bug on?" options: - Docker - Local dev server - type: textarea id: env-vars validations: required: false attributes: label: "🔒 Did you set the correct environment variables in the right path? List the environment variable names (not values please!)" description: "Please refer to the [Project setup instructions](https://github.com/arc53/DocsGPT#quickstart) if you are unsure." placeholder: "It actually ..." - type: textarea id: additional-context validations: required: false attributes: label: "📃 Provide any additional context for the Bug." description: "Add any other context about the problem here." placeholder: "It actually ..." - type: textarea id: logs validations: required: false attributes: label: 📖 Relevant log output description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. render: shell - type: checkboxes id: no-duplicate-issues attributes: label: "👀 Have you spent some time to check if this bug has been raised before?" options: - label: "I checked and didn't find similar issue" required: true - type: dropdown id: willing-to-submit-pr attributes: label: 🔗 Are you willing to submit PR? description: This is absolutely not required, but we are happy to guide you in the contribution process. options: # Added options key - "Yes, I am willing to submit a PR!" 
- "No" validations: required: false - type: checkboxes id: terms attributes: label: 🧑‍⚖️ Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/arc53/DocsGPT/blob/main/CODE_OF_CONDUCT.md) options: - label: I agree to follow this project's Code of Conduct required: true ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: 🚀 Feature description: "Submit a proposal for a new feature" title: "🚀 Feature: " labels: [feature] body: - type: markdown attributes: value: We value your time and your efforts to submit this bug report is appreciated. 🙏 - type: textarea id: feature-description validations: required: true attributes: label: "🔖 Feature description" description: "A clear and concise description of what the feature is." placeholder: "You should add ..." - type: textarea id: pitch validations: required: true attributes: label: "🎤 Why is this feature needed ?" description: "Please explain why this feature should be implemented and how it would be used. Add examples, if applicable." placeholder: "In my use-case, ..." - type: textarea id: solution validations: required: true attributes: label: "✌️ How do you aim to achieve this?" description: "A clear and concise description of what you want to happen." placeholder: "I want this feature to, ..." - type: textarea id: alternative validations: required: false attributes: label: "🔄️ Additional Information" description: "A clear and concise description of any alternative solutions or additional solutions you've considered." placeholder: "I tried, ..." - type: checkboxes id: no-duplicate-issues attributes: label: "👀 Have you spent some time to check if this feature request has been raised before?" options: - label: "I checked and didn't find similar issue" required: true - type: dropdown id: willing-to-submit-pr attributes: label: Are you willing to submit PR? 
description: This is absolutely not required, but we are happy to guide you in the contribution process. options: - "Yes I am willing to submit a PR!" ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ - **What kind of change does this PR introduce?** (Bug fix, feature, docs update, ...) - **Why was this change needed?** (You can also link to an open issue here) - **Other information**: ================================================ FILE: .github/dependabot.yml ================================================ # To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/application" # Location of package manifests schedule: interval: "daily" - package-ecosystem: "npm" # See documentation for possible values directory: "/frontend" # Location of package manifests schedule: interval: "daily" - package-ecosystem: "npm" directory: "/extensions/react-widget" schedule: interval: "daily" - package-ecosystem: "github-actions" directory: "/" schedule: interval: "daily" ================================================ FILE: .github/holopin.yml ================================================ organization: docsgpt defaultSticker: cm1ulwkkl180570cl82rtzympu stickers: - id: cm1ulwkkl180570cl82rtzympu alias: contributor2024 - id: cm1ureg8o130450cl8c1po6mil alias: api - id: cm1urhmag148240cl8yvqxkthx alias: lpc - id: cm1urlcpq622090cl2tvu4w71y alias: lexeu ================================================ FILE: .github/labeler.yml ================================================ repo: - changed-files: - 
any-glob-to-any-file: '*' github: - changed-files: - any-glob-to-any-file: '.github/**/*' application: - changed-files: - any-glob-to-any-file: 'application/**/*' docs: - changed-files: - any-glob-to-any-file: 'docs/**/*' extensions: - changed-files: - any-glob-to-any-file: 'extensions/**/*' frontend: - changed-files: - any-glob-to-any-file: 'frontend/**/*' scripts: - changed-files: - any-glob-to-any-file: 'scripts/**/*' tests: - changed-files: - any-glob-to-any-file: 'tests/**/*' ================================================ FILE: .github/styles/DocsGPT/Spelling.yml ================================================ extends: spelling level: warning message: "Did you really mean '%s'?" ignore: - "**/node_modules/**" - "**/dist/**" - "**/build/**" - "**/coverage/**" - "**/public/**" - "**/static/**" vocab: DocsGPT ================================================ FILE: .github/styles/config/vocabularies/DocsGPT/accept.txt ================================================ Ollama Qdrant Milvus Chatwoot Nextra VSCode npm LLMs APIs Groq SGLang LMDeploy OAuth Vite LLM JSONPath UIs configs uncomment qdrant vectorstore docsgpt llm GPUs kubectl Lightsail enqueues chatbot VSCode's Shareability feedbacks automations Premade Signup Repo repo env URl agentic llama_cpp parsable SDKs boolean bool hardcode EOL ================================================ FILE: .github/workflows/bandit.yaml ================================================ name: Bandit Security Scan on: push: branches: - main pull_request: types: [opened, synchronize, reopened] jobs: bandit_scan: if: ${{ github.repository == 'arc53/DocsGPT' }} runs-on: ubuntu-latest permissions: security-events: write actions: read contents: read steps: - name: Checkout code uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.12' - name: Install dependencies run: | python -m pip install --upgrade pip pip install bandit # Bandit is needed for this action if [ -f 
application/requirements.txt ]; then pip install -r application/requirements.txt; fi - name: Run Bandit scan uses: PyCQA/bandit-action@v1 with: severity: medium confidence: medium targets: application/ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/ci.yml ================================================ name: Build and push DocsGPT Docker image on: release: types: [published] jobs: build: if: github.repository == 'arc53/DocsGPT' strategy: matrix: include: - platform: linux/amd64 runner: ubuntu-latest suffix: amd64 - platform: linux/arm64 runner: ubuntu-24.04-arm suffix: arm64 runs-on: ${{ matrix.runner }} permissions: contents: read packages: write steps: - uses: actions/checkout@v4 - name: Set up QEMU # Only needed for emulation, not for native arm64 builds if: matrix.platform == 'linux/arm64' uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push platform-specific images uses: docker/build-push-action@v6 with: file: './application/Dockerfile' platforms: ${{ matrix.platform }} context: ./application push: true tags: | ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }}-${{ matrix.suffix }} ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }}-${{ matrix.suffix }} provenance: false sbom: false cache-from: type=registry,ref=${{ secrets.DOCKER_USERNAME }}/docsgpt:latest cache-to: type=inline manifest: if: github.repository == 'arc53/DocsGPT' needs: build runs-on: ubuntu-latest permissions: packages: write steps: - name: Set up Docker 
Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create and push manifest for DockerHub run: | set -e docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }} \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }}-amd64 \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }}-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }} docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }}-amd64 \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt:${{ github.event.release.tag_name }}-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest - name: Create and push manifest for ghcr.io run: | set -e docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }} \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }}-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }}-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }} docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt:latest \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }}-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:${{ github.event.release.tag_name }}-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt:latest 
================================================ FILE: .github/workflows/cife.yml ================================================ name: Build and push DocsGPT-FE Docker image on: release: types: [published] jobs: build: if: github.repository == 'arc53/DocsGPT' strategy: matrix: include: - platform: linux/amd64 runner: ubuntu-latest suffix: amd64 - platform: linux/arm64 runner: ubuntu-24.04-arm suffix: arm64 runs-on: ${{ matrix.runner }} permissions: contents: read packages: write steps: - uses: actions/checkout@v4 - name: Set up QEMU # Only needed for emulation, not for native arm64 builds if: matrix.platform == 'linux/arm64' uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push platform-specific images uses: docker/build-push-action@v6 with: file: './frontend/Dockerfile' platforms: ${{ matrix.platform }} context: ./frontend push: true tags: | ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }}-${{ matrix.suffix }} ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }}-${{ matrix.suffix }} provenance: false sbom: false cache-from: type=registry,ref=${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:latest cache-to: type=inline manifest: if: github.repository == 'arc53/DocsGPT' needs: build runs-on: ubuntu-latest permissions: packages: write steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: 
Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create and push manifest for DockerHub run: | set -e docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }} \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }}-amd64 \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }}-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }} docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:latest \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }}-amd64 \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:${{ github.event.release.tag_name }}-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:latest - name: Create and push manifest for ghcr.io run: | set -e docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }} \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }}-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }}-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }} docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt-fe:latest \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }}-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:${{ github.event.release.tag_name }}-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt-fe:latest ================================================ FILE: .github/workflows/docker-develop-build.yml ================================================ name: Build and push multi-arch DocsGPT Docker image on: 
workflow_dispatch: push: branches: - main jobs: build: if: github.repository == 'arc53/DocsGPT' strategy: matrix: include: - platform: linux/amd64 runner: ubuntu-latest suffix: amd64 - platform: linux/arm64 runner: ubuntu-24.04-arm suffix: arm64 runs-on: ${{ matrix.runner }} permissions: contents: read packages: write steps: - uses: actions/checkout@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push platform-specific images uses: docker/build-push-action@v6 with: file: './application/Dockerfile' platforms: ${{ matrix.platform }} context: ./application push: true tags: | ${{ secrets.DOCKER_USERNAME }}/docsgpt:develop-${{ matrix.suffix }} ghcr.io/${{ github.repository_owner }}/docsgpt:develop-${{ matrix.suffix }} provenance: false sbom: false cache-from: type=registry,ref=${{ secrets.DOCKER_USERNAME }}/docsgpt:develop cache-to: type=inline manifest: if: github.repository == 'arc53/DocsGPT' needs: build runs-on: ubuntu-latest permissions: packages: write steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create and push manifest for DockerHub run: | docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt:develop \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt:develop-amd64 \ --amend ${{ secrets.DOCKER_USERNAME 
}}/docsgpt:develop-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt:develop - name: Create and push manifest for ghcr.io run: | docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt:develop \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:develop-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt:develop-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt:develop ================================================ FILE: .github/workflows/docker-develop-fe-build.yml ================================================ name: Build and push DocsGPT FE Docker image for development on: workflow_dispatch: push: branches: - main jobs: build: if: github.repository == 'arc53/DocsGPT' strategy: matrix: include: - platform: linux/amd64 runner: ubuntu-latest suffix: amd64 - platform: linux/arm64 runner: ubuntu-24.04-arm suffix: arm64 runs-on: ${{ matrix.runner }} permissions: contents: read packages: write steps: - uses: actions/checkout@v4 - name: Set up QEMU # Only needed for emulation, not for native arm64 builds if: matrix.platform == 'linux/arm64' uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push platform-specific images uses: docker/build-push-action@v6 with: file: './frontend/Dockerfile' platforms: ${{ matrix.platform }} context: ./frontend push: true tags: | ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:develop-${{ matrix.suffix }} ghcr.io/${{ github.repository_owner }}/docsgpt-fe:develop-${{ matrix.suffix }} provenance: false sbom: false cache-from: type=registry,ref=${{ secrets.DOCKER_USERNAME 
}}/docsgpt-fe:develop cache-to: type=inline manifest: if: github.repository == 'arc53/DocsGPT' needs: build runs-on: ubuntu-latest permissions: packages: write steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver: docker-container install: true - name: Login to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Login to ghcr.io uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Create and push manifest for DockerHub run: | docker manifest create ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:develop \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:develop-amd64 \ --amend ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:develop-arm64 docker manifest push ${{ secrets.DOCKER_USERNAME }}/docsgpt-fe:develop - name: Create and push manifest for ghcr.io run: | docker manifest create ghcr.io/${{ github.repository_owner }}/docsgpt-fe:develop \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:develop-amd64 \ --amend ghcr.io/${{ github.repository_owner }}/docsgpt-fe:develop-arm64 docker manifest push ghcr.io/${{ github.repository_owner }}/docsgpt-fe:develop ================================================ FILE: .github/workflows/labeler.yml ================================================ # https://github.com/actions/labeler name: Pull Request Labeler on: - pull_request_target jobs: triage: if: github.repository == 'arc53/DocsGPT' permissions: contents: read pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/labeler@v5 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" sync-labels: true ================================================ FILE: .github/workflows/lint.yml ================================================ name: Python linting on: push: branches: - '*' pull_request: types: [ opened, synchronize ] permissions: contents: read jobs: ruff: runs-on: 
ubuntu-latest steps: - uses: actions/checkout@v4 - name: Lint with Ruff uses: chartboost/ruff-action@v1 ================================================ FILE: .github/workflows/pytest.yml ================================================ name: Run python tests with pytest on: [push, pull_request] permissions: contents: read jobs: pytest_and_coverage: name: Run tests and count coverage runs-on: ubuntu-latest strategy: matrix: python-version: ["3.12"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip cd application if [ -f requirements.txt ]; then pip install -r requirements.txt; fi cd ../tests if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest and generate coverage report run: | python -m pytest --cov=application --cov-report=xml --cov-report=term-missing - name: Upload coverage reports to Codecov if: github.event_name == 'pull_request' && matrix.python-version == '3.12' uses: codecov/codecov-action@v5 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} ================================================ FILE: .github/workflows/sync_fork.yaml ================================================ name: Upstream Sync permissions: contents: write on: schedule: - cron: "0 0 * * *" # once a day at 00:00 UTC workflow_dispatch: jobs: sync_latest_from_upstream: name: Sync latest commits from upstream repo runs-on: ubuntu-latest if: ${{ github.event.repository.fork }} steps: # Step 1: run a standard checkout action - name: Checkout target repo uses: actions/checkout@v4 # Step 2: run the sync action - name: Sync upstream changes id: sync uses: aormsby/Fork-Sync-With-Upstream-action@v3.4 with: # set your upstream repo and branch upstream_sync_repo: arc53/DocsGPT upstream_sync_branch: main target_sync_branch: main target_repo_token: ${{ secrets.GITHUB_TOKEN }} # 
automatically generated, no need to set # Set test_mode true to run tests instead of the true action!! test_mode: false - name: Sync check if: failure() run: | echo "::error::由于权限不足,导致同步失败(这是预期的行为),请前往仓库首页手动执行[Sync fork]。" echo "::error::Due to insufficient permissions, synchronization failed (as expected). Please go to the repository homepage and manually perform [Sync fork]." exit 1 ================================================ FILE: .github/workflows/vale.yml ================================================ name: Vale Documentation Linter on: pull_request: paths: - 'docs/**/*.md' - 'docs/**/*.mdx' - '**/*.md' - '.vale.ini' - '.github/styles/**' permissions: contents: read pull-requests: write jobs: vale: runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v4 - name: Vale linter uses: errata-ai/vale-action@v2 with: files: docs fail_on_error: false version: 3.0.5 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class experiments/ experiments # C extensions *.so *.next # Distribution / packaging .Python build/ develop-eggs/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ docs/public/_pagefind/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints **/*.ipynb # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ .flaskenv # Spyder project settings .spyderproject .spyproject .jwt_secret_key # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ #pycharm .idea/ # macOS .DS_Store #frontend # Logs frontend/logs frontend/*.log frontend/npm-debug.log* frontend/yarn-debug.log* frontend/yarn-error.log* frontend/pnpm-debug.log* frontend/lerna-debug.log* # Keep frontend utility helpers tracked (overrides global lib/ ignore) !frontend/src/lib/ !frontend/src/lib/** frontend/node_modules frontend/dist frontend/dist-ssr frontend/*.local # Editor directories and files frontend/.vscode/* frontend/!.vscode/extensions.json frontend/.idea frontend/.DS_Store frontend/*.suo frontend/*.ntvs* frontend/*.njsproj frontend/*.sln frontend/*.sw? 
application/vectors/ **/inputs **/indexes **/temp **/yarn.lock node_modules/ .vscode/settings.json /models/ model/ ================================================ FILE: .ruff.toml ================================================ # Allow lines to be as long as 120 characters. line-length = 120 [lint.per-file-ignores] # Integration tests use sys.path.insert() before imports for standalone execution "tests/integration/*.py" = ["E402"] ================================================ FILE: .vale.ini ================================================ MinAlertLevel = warning StylesPath = .github/styles [*.{md,mdx}] BasedOnStyles = DocsGPT ================================================ FILE: .vscode/launch.json ================================================ { "version": "0.2.0", "configurations": [ { "name": "Frontend Debug (npm)", "type": "node-terminal", "request": "launch", "command": "npm run dev", "cwd": "${workspaceFolder}/frontend" }, { "name": "Flask Debugger", "type": "debugpy", "request": "launch", "module": "flask", "env": { "FLASK_APP": "application/app.py", "PYTHONPATH": "${workspaceFolder}", "FLASK_ENV": "development", "FLASK_DEBUG": "1", "FLASK_RUN_PORT": "7091", "FLASK_RUN_HOST": "0.0.0.0" }, "args": [ "run", "--no-debugger" ], "cwd": "${workspaceFolder}", }, { "name": "Celery Debugger", "type": "debugpy", "request": "launch", "module": "celery", "env": { "PYTHONPATH": "${workspaceFolder}", }, "args": [ "-A", "application.app.celery", "worker", "-l", "INFO", "--pool=solo" ], "cwd": "${workspaceFolder}" }, { "name": "Dev Containers (Mongo + Redis)", "type": "node-terminal", "request": "launch", "command": "docker compose -f deployment/docker-compose-dev.yaml up --build", "cwd": "${workspaceFolder}" } ], "compounds": [ { "name": "DocsGPT: Full Stack", "configurations": [ "Frontend Debug (npm)", "Flask Debugger", "Celery Debugger" ], "presentation": { "group": "DocsGPT", "order": 1 } } ] } ================================================ FILE: AGENTS.md 
================================================ # AGENTS.md - Read `CONTRIBUTING.md` before making non-trivial changes. - For day-to-day development and feature work, follow the development-environment workflow rather than defaulting to `setup.sh` / `setup.ps1`. - Avoid using the setup scripts during normal feature work unless the user explicitly asks for them. Users configure `.env` usually. - Try to follow red/green TDD ### Check existing dev prerequisites first For feature work, do **not** assume the environment needs to be recreated. - Check whether the user already has a Python virtual environment such as `venv/` or `.venv/`. - Check whether MongoDB is already running. - Check whether Redis is already running. - Reuse what is already working. Do not stop or recreate MongoDB, Redis, or the Python environment unless the task is environment setup or troubleshooting. ## Normal local development commands Use these commands once the dev prerequisites above are satisfied. ### Backend ```bash source .venv/bin/activate # macOS/Linux uv pip install -r application/requirements.txt # or: pip install -r application/requirements.txt ``` Run the Flask API (if needed): ```bash flask --app application/app.py run --host=0.0.0.0 --port=7091 ``` Run the Celery worker in a separate terminal (if needed): ```bash celery -A application.app.celery worker -l INFO ``` On macOS, prefer the solo pool for Celery: ```bash python -m celery -A application.app.celery worker -l INFO --pool=solo ``` ### Frontend Install dependencies only when needed, then run the dev server: ```bash cd frontend npm install --include=dev npm run dev ``` ### Docs site ```bash cd docs npm install ``` ### Python / backend changes validation ```bash ruff check . python -m pytest ``` ### Frontend changes ```bash cd frontend && npm run lint cd frontend && npm run build ``` ### Documentation changes ```bash cd docs && npm run build ``` If Vale is installed locally and you edited prose, also run: ```bash vale . 
``` ## Repository map - `application/`: Flask backend, API routes, agent logic, retrieval, parsing, security, storage, Celery worker, and WSGI entrypoints. - `tests/`: backend unit/integration tests and test-only Python dependencies. - `frontend/`: Vite + React + TypeScript application. - `frontend/src/`: main UI code, including `components`, `conversation`, `hooks`, `locale`, `settings`, `upload`, and Redux store wiring in `store.ts`. - `docs/`: separate documentation site built with Next.js/Nextra. - `extensions/`: integrations and widgets such as Chatwoot, Chrome, Discord, React widget, Slack bot, and web widget. - `deployment/`: Docker Compose variants and Kubernetes manifests. ## Coding rules ### Backend - Follow PEP 8 and keep Python line length at or under 120 characters. - Use type hints for function arguments and return values. - Add Google-style docstrings to new or substantially changed functions and classes. - Add or update tests under `tests/` for backend behavior changes. - Keep changes narrow in `api`, `auth`, `security`, `parser`, `retriever`, and `storage` areas. ### Backend Abstractions - LLM providers implement a common interface in `application/llm/` (add new providers by extending the base class). - Vector stores are abstracted in `application/vectorstore/`. - Parsers live in `application/parser/` and handle different document formats in the ingestion stage. - Agents and tools are in `application/agents/` and `application/agents/tools/`. - Celery setup/config lives in `application/celery_init.py` and `application/celeryconfig.py`. - Settings and env vars are managed via Pydantic in `application/core/settings.py`. ### Frontend - Follow the existing ESLint + Prettier setup. - Prefer small, reusable functional components and hooks. - If shared state must be added, use Redux rather than introducing a new global state library. - Avoid broad UI refactors unless the task explicitly asks for them. 
- Do not re-create components if we already have some in the app. ## PR readiness Before opening a PR: - run the relevant validation commands above - confirm backend changes still work end-to-end after ingesting sample data when applicable - clearly summarize user-visible behavior changes - mention any config, dependency, or deployment implications - Ask your user to attach a screenshot or a video to it ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors and leaders pledge to make participation in our community, a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive and a healthy community. ## Our Standards Examples of behavior that contribute to a positive environment for our community include: ## Demonstrating empathy and kindness towards other people 1. Being respectful and open to differing opinions, viewpoints, and experiences 2. Giving and gracefully accepting constructive feedback 3. Taking accountability and offering apologies to those who have been impacted by our errors, while also gaining insights from the situation 4. Focusing on what is best not just for us as individuals but for the community as a whole Examples of unacceptable behavior include: 1. The use of sexualized language or imagery, and sexual attention or advances of any kind 2. Trolling, insulting or derogatory comments, and personal or political attacks 3. Public or private harassment 4. 
Publishing other's private information, such as a physical or email address, without their explicit permission 5. Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at contact@arc53.com. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to be respectful towards the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action that they deem in violation of this Code of Conduct: ### 1. Correction * **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community space. * **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. 
A public apology may be requested. ### 2. Warning * **Community Impact**: A violation through a single incident or series of actions. * **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban * **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. * **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban * **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression towards or disparagement of classes of individuals. * **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. 
================================================ FILE: CONTRIBUTING.md ================================================ # Welcome to DocsGPT Contributing Guidelines Thank you for choosing to contribute to DocsGPT! We are all very grateful! # We accept different types of contributions 📣 **Discussions** - Engage in conversations, start new topics, or help answer questions. 🐞 **Issues** - This is where we keep track of tasks. It could be bugs, fixes or suggestions for new features. 🛠️ **Pull requests** - Suggest changes to our repository, either by working on existing issues or adding new features. 📚 **Wiki** - This is where our documentation resides. ## 🐞 Issues and Pull requests - We value contributions in the form of discussions or suggestions. We recommend taking a look at existing issues and our [roadmap](https://github.com/orgs/arc53/projects/2). - If you're interested in contributing code, here are some important things to know: - We have a frontend built on React (Vite) and a backend in Python. > **Required for every PR:** Please attach screenshots or a short screen > recording that shows the working version of your changes. This makes the > requirement visible to reviewers and helps them quickly verify what you are > submitting. Before creating issues, please check out how the latest version of our app looks and works by launching it via [Quickstart](https://github.com/arc53/DocsGPT#quickstart). Note that the version on our live demo is slightly modified and includes login. Your issues should relate to the version you can launch via [Quickstart](https://github.com/arc53/DocsGPT#quickstart). ### 👨‍💻 If you're interested in contributing code, here are some important things to know: For instructions on setting up a development environment, please refer to our [Development Deployment Guide](https://docs.docsgpt.cloud/Deploying/Development-Environment). 
Tech Stack Overview: - 🌐 Frontend: Built with React (Vite) ⚛️, - 🖥 Backend: Developed in Python 🐍 ### 🌐 Frontend Contributions (⚛️ React, Vite) * The updated Figma design can be found [here](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1). Please try to follow the guidelines. * **Coding Style:** We follow a strict coding style enforced by ESLint and Prettier. Please ensure your code adheres to the configuration provided in our repository's `frontend/.eslintrc.js` file. We recommend configuring your editor with ESLint and Prettier to help with this. * **Component Structure:** Strive for small, reusable components. Favor functional components and hooks over class components where possible. * **State Management:** If you need to add stores, please use Redux. ### 🖥 Backend Contributions (🐍 Python) - Review our issues and contribute to [`/application`](https://github.com/arc53/DocsGPT/tree/main/application) - All new code should be covered with unit tests ([pytest](https://github.com/pytest-dev/pytest)). Please find tests under [`/tests`](https://github.com/arc53/DocsGPT/tree/main/tests) folder. - Before submitting your Pull Request, ensure it can be queried after ingesting some test data. - **Coding Style:** We adhere to the [PEP 8](https://www.python.org/dev/peps/pep-0008/) style guide for Python code. We use `ruff` as our linter and code formatter. Please ensure your code is formatted correctly and passes `ruff` checks before submitting. - **Type Hinting:** Please use type hints for all function arguments and return values. This improves code readability and helps catch errors early. Example: ```python def my_function(name: str, count: int) -> list[str]: ... ``` - **Docstrings:** All functions and classes should have docstrings explaining their purpose, parameters, and return values. We prefer the [Google style docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). 
Example: ```python def my_function(name: str, count: int) -> list[str]: """Does something with a name and a count. Args: name: The name to use. count: The number of times to do it. Returns: A list of strings. """ ... ``` ### Testing To run unit tests from the root of the repository, execute: ``` python -m pytest ``` ## Workflow 📈 Here's a step-by-step guide on how to contribute to DocsGPT: 1. **Fork the Repository:** - Click the "Fork" button at the top-right of this repository to create your fork. 2. **Clone the Forked Repository:** - Clone the repository using: ``` shell git clone https://github.com/<your-github-username>/DocsGPT.git ``` 3. **Keep your Fork in Sync:** - Before you make any changes, make sure that your fork is in sync to avoid merge conflicts using: ```shell git remote add upstream https://github.com/arc53/DocsGPT.git git pull upstream main ``` 4. **Create and Switch to a New Branch:** - Create a new branch for your contribution using: ```shell git checkout -b your-branch-name ``` 5. **Make Changes:** - Make the required changes in your branch. 6. **Add Changes to the Staging Area:** - Add your changes to the staging area using: ```shell git add . ``` 7. **Commit Your Changes:** - Commit your changes with a descriptive commit message using: ```shell git commit -m "Your descriptive commit message" ``` 8. **Push Your Changes to the Remote Repository:** - Push your branch with changes to your fork on GitHub using: ```shell git push origin your-branch-name ``` 9. **Submit a Pull Request (PR):** - Create a Pull Request from your branch to the main repository. Make sure to include a detailed description of your changes, reference any related issues, and attach screenshots or a screen recording showing the working version. 10. **Collaborate:** - Be responsive to comments and feedback on your PR. - Make necessary updates as suggested. - Once your PR is approved, it will be merged into the main repository. 11. 
**Testing:** - Before submitting a Pull Request, ensure your code passes all unit tests. - To run unit tests from the root of the repository, execute: ```shell python -m pytest ``` *Note: You should run the unit test only after making the changes to the backend code.* 12. **Questions and Collaboration:** - Feel free to join our Discord. We're very friendly and welcoming to new contributors, so don't hesitate to reach out. Thank you for considering contributing to DocsGPT! 🙏 ## Questions/collaboration Feel free to join our [Discord](https://discord.gg/vN7YFfdMpj). We're very friendly and welcoming to new contributors, so don't hesitate to reach out. # Thank you so much for considering contributing to DocsGPT!🙏 ================================================ FILE: HACKTOBERFEST.md ================================================ # **🎉 Join the Hacktoberfest with DocsGPT and win a Free T-shirt for a meaningful PR! 🎉** Welcome, contributors! We're excited to announce that DocsGPT is participating in Hacktoberfest. Get involved by submitting meaningful pull requests. All meaningful contributors with accepted PRs that were created for issues with the `hacktoberfest` label (set by our maintainer team: dartpain, siiddhantt, pabik, ManishMadan2882) will receive a cool T-shirt! 🤩. hacktoberfest-mocks-preview Please fill in [this form](https://forms.gle/Npaba4n9Epfyx56S8) after your PR has been merged. If you are in doubt, don't hesitate to ping us on Discord; you can ping me directly - Alex (dartpain). ## 📜 Here's How to Contribute: ```text 🛠️ Code: This is the golden ticket! Make meaningful contributions through PRs. 🧩 API extension: Build an app utilising DocsGPT API. We prefer submissions that showcase original ideas and turn the API into an AI agent. They can be completely separate repos. For example: https://github.com/arc53/tg-bot-docsgpt-extenstion or https://github.com/arc53/DocsGPT-cli Non-Code Contributions: 📚 Wiki: Improve our documentation, create a guide. 
🖥️ Design: Improve the UI/UX or design a new feature. ``` ### 📝 Guidelines for Pull Requests: - Familiarize yourself with the current contributions and our [Roadmap](https://github.com/orgs/arc53/projects/2). - Before contributing, check existing [issues](https://github.com/arc53/DocsGPT/issues) or [create](https://github.com/arc53/DocsGPT/issues/new/choose) an issue and wait to get assigned. - Once you are finished with your contribution, please fill in this [form](https://forms.gle/Npaba4n9Epfyx56S8). - Refer to the [Documentation](https://docs.docsgpt.cloud/). - Feel free to join our [Discord](https://discord.gg/vN7YFfdMpj) server. We're here to help newcomers, so don't hesitate to jump in! Join us [here](https://discord.gg/vN7YFfdMpj). Thank you very much for considering contributing to DocsGPT during Hacktoberfest! 🙏 Your contributions (not just simple typos) could earn you a stylish new t-shirt. We will publish the t-shirt design later in October. ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 arc53 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

DocsGPT 🦖

Private AI for agents, assistants and enterprise search

DocsGPT is an open-source AI platform for building intelligent agents and assistants. Features Agent Builder, deep research tools, document analysis (PDF, Office, web content, and audio), Multi-model support (choose your provider or run locally), and rich API connectivity for agents with actionable tools and integrations. Deploy anywhere with complete privacy control.

![link to main GitHub showing Stars number](https://img.shields.io/github/stars/arc53/docsgpt?style=social) ![link to main GitHub showing Forks number](https://img.shields.io/github/forks/arc53/docsgpt?style=social) ![link to license file](https://img.shields.io/github/license/arc53/docsgpt) ![link to discord](https://img.shields.io/discord/1070046503302877216) ![X (formerly Twitter) URL](https://img.shields.io/twitter/follow/docsgptai) ⚡️ Quickstart☁️ Cloud Version💬 Discord
📖 Documentation👫 Contribute🗞 Blog

video-example-of-docs-gpt

Key Features:

  • 🗂️ Wide Format Support: Reads PDF, DOCX, CSV, XLSX, EPUB, MD, RST, HTML, MDX, JSON, PPTX, images, and audio files such as MP3, WAV, M4A, OGG, and WebM.
  • 🎙️ Speech Workflows: Record voice input into chat, transcribe audio on the backend, and ingest meeting recordings or voice notes as searchable knowledge.
  • 🌐 Web & Data Integration: Ingests from URLs, sitemaps, Reddit, GitHub and web crawlers.
  • ✅ Reliable Answers: Get accurate, hallucination-free responses with source citations viewable in a clean UI.
  • 🔑 Streamlined API Keys: Generate keys linked to your settings, documents, and models, simplifying chatbot and integration setup.
  • 🔗 Actionable Tooling: Connect to APIs, tools, and other services to enable LLM actions.
  • 🧩 Pre-built Integrations: Use readily available HTML/React chat widgets, search tools, Discord/Telegram bots, and more.
  • 🔌 Flexible Deployment: Works with major LLMs (OpenAI, Google, Anthropic) and local models (Ollama, llama_cpp).
  • 🏢 Secure & Scalable: Run privately and securely with Kubernetes support, designed for enterprise-grade reliability.
## Roadmap - [x] Add OAuth 2.0 authentication for MCP ( September 2025 ) - [x] Deep Agents ( October 2025 ) - [x] Prompt Templating ( October 2025 ) - [x] Full api tooling ( Dec 2025 ) - [ ] Agent scheduling ( Jan 2026 ) You can find our full roadmap [here](https://github.com/orgs/arc53/projects/2). Please don't hesitate to contribute or create issues, it helps us improve DocsGPT! ### Production Support / Help for Companies: We're eager to provide personalized assistance when deploying your DocsGPT to a live environment. [Get a Demo :wave:](https://www.docsgpt.cloud/contact)⁠ [Send Email :email:](mailto:support@docsgpt.cloud?subject=DocsGPT%20support%2Fsolutions) ## Join the Lighthouse Program 🌟 Calling all developers and GenAI innovators! The **DocsGPT Lighthouse Program** connects technical leaders actively deploying or extending DocsGPT in real-world scenarios. Collaborate directly with our team to shape the roadmap, access priority support, and build enterprise-ready solutions with exclusive community insights. [Learn More & Apply →](https://docs.google.com/forms/d/1KAADiJinUJ8EMQyfTXUIGyFbqINNClNR3jBNWq7DgTE) ## QuickStart > [!Note] > Make sure you have [Docker](https://docs.docker.com/engine/install/) installed A more detailed [Quickstart](https://docs.docsgpt.cloud/quickstart) is available in our documentation 1. **Clone the repository:** ```bash git clone https://github.com/arc53/DocsGPT.git cd DocsGPT ``` **For macOS and Linux:** 2. **Run the setup script:** ```bash ./setup.sh ``` **For Windows:** 2. **Run the PowerShell setup script:** ```powershell PowerShell -ExecutionPolicy Bypass -File .\setup.ps1 ``` Either script will guide you through setting up DocsGPT. Five options available: using the public API, running locally, connecting to a local inference engine, using a cloud API provider, or build the docker image locally. Scripts will automatically configure your `.env` file and handle necessary downloads and installations based on your chosen option. 
**Navigate to http://localhost:5173/** To stop DocsGPT, open a terminal in the `DocsGPT` directory and run: ```bash docker compose -f deployment/docker-compose.yaml down ``` (or use the specific `docker compose down` command shown after running the setup script). > [!Note] > For development environment setup instructions, please refer to the [Development Environment Guide](https://docs.docsgpt.cloud/Deploying/Development-Environment). ## Contributing Please refer to the [CONTRIBUTING.md](CONTRIBUTING.md) file for information about how to get involved. We welcome issues, questions, and pull requests. ## Architecture ![Architecture chart](https://github.com/user-attachments/assets/fc6a7841-ddfc-45e6-b5a0-d05fe648cbe2) ## Project Structure - Application - Flask app (main application). - Extensions - Extensions, like react widget or discord bot. - Frontend - Frontend uses Vite and React. - Scripts - Miscellaneous scripts. ## Code Of Conduct We as members, contributors, and leaders, pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. Please refer to the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file for more information about contributing. ## Many Thanks To Our Contributors⚡ Contributors ## License The source code license is [MIT](https://opensource.org/license/mit/), as described in the [LICENSE](LICENSE) file. ## This project is supported by:

color

================================================ FILE: SECURITY.md ================================================ # Security Policy ## Supported Versions Supported Versions: Currently, we support security patches by committing changes and bumping the version published on Github. ## Reporting a Vulnerability Found a vulnerability? Please email us: security@arc53.com ================================================ FILE: application/Dockerfile ================================================ # Builder Stage FROM ubuntu:24.04 as builder ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ apt-get install -y --no-install-recommends gcc g++ wget unzip libc6-dev python3.12 python3.12-venv python3.12-dev && \ rm -rf /var/lib/apt/lists/* # Verify Python installation and setup symlink RUN if [ -f /usr/bin/python3.12 ]; then \ ln -s /usr/bin/python3.12 /usr/bin/python; \ else \ echo "Python 3.12 not found"; exit 1; \ fi # Download and unzip the model RUN wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip && \ unzip mpnet-base-v2.zip -d models && \ rm mpnet-base-v2.zip # Install Rust RUN wget -q -O - https://sh.rustup.rs | sh -s -- -y # Clean up to reduce container size RUN apt-get remove --purge -y wget unzip && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* # Copy requirements.txt COPY requirements.txt . 
# Setup Python virtual environment RUN python3.12 -m venv /venv # Activate virtual environment and install Python packages ENV PATH="/venv/bin:$PATH" # Install Python packages RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir tiktoken && \ pip install --no-cache-dir -r requirements.txt # Final Stage FROM ubuntu:24.04 as final RUN apt-get update && \ apt-get install -y software-properties-common && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && apt-get install -y --no-install-recommends \ python3.12 \ libgl1 \ libglib2.0-0 \ poppler-utils \ && \ ln -s /usr/bin/python3.12 /usr/bin/python && \ rm -rf /var/lib/apt/lists/* # Set working directory WORKDIR /app # Create a non-root user: `appuser` (Feel free to choose a name) RUN groupadd -r appuser && \ useradd -r -g appuser -d /app -s /sbin/nologin -c "Docker image user" appuser # Copy the virtual environment and model from the builder stage COPY --from=builder /venv /venv COPY --from=builder /models /app/models # Copy your application code COPY . 
/app/application # Change the ownership of the /app directory to the appuser RUN mkdir -p /app/application/inputs/local RUN chown -R appuser:appuser /app # Set environment variables ENV FLASK_APP=app.py \ FLASK_DEBUG=true \ PATH="/venv/bin:$PATH" # Expose the port the app runs on EXPOSE 7091 # Switch to non-root user USER appuser # Start Gunicorn CMD ["gunicorn", "-w", "1", "--timeout", "120", "--bind", "0.0.0.0:7091", "--preload", "application.wsgi:app"] ================================================ FILE: application/__init__.py ================================================ ================================================ FILE: application/agents/__init__.py ================================================ ================================================ FILE: application/agents/agent_creator.py ================================================ import logging from application.agents.classic_agent import ClassicAgent from application.agents.react_agent import ReActAgent from application.agents.workflow_agent import WorkflowAgent logger = logging.getLogger(__name__) class AgentCreator: agents = { "classic": ClassicAgent, "react": ReActAgent, "workflow": WorkflowAgent, } @classmethod def create_agent(cls, type, *args, **kwargs): agent_class = cls.agents.get(type.lower()) if not agent_class: raise ValueError(f"No agent class found for type {type}") return agent_class(*args, **kwargs) ================================================ FILE: application/agents/base.py ================================================ import logging import uuid from abc import ABC, abstractmethod from typing import Dict, Generator, List, Optional from bson.objectid import ObjectId from application.agents.tools.tool_action_parser import ToolActionParser from application.agents.tools.tool_manager import ToolManager from application.core.json_schema_utils import ( JsonSchemaValidationError, normalize_json_schema_payload, ) from application.core.mongo_db import MongoDB from 
application.core.settings import settings from application.llm.handlers.handler_creator import LLMHandlerCreator from application.llm.llm_creator import LLMCreator from application.logging import build_stack_data, log_activity, LogContext from application.security.encryption import decrypt_credentials logger = logging.getLogger(__name__) class BaseAgent(ABC): def __init__( self, endpoint: str, llm_name: str, model_id: str, api_key: str, agent_id: Optional[str] = None, user_api_key: Optional[str] = None, prompt: str = "", chat_history: Optional[List[Dict]] = None, retrieved_docs: Optional[List[Dict]] = None, decoded_token: Optional[Dict] = None, attachments: Optional[List[Dict]] = None, json_schema: Optional[Dict] = None, limited_token_mode: Optional[bool] = False, token_limit: Optional[int] = settings.DEFAULT_AGENT_LIMITS["token_limit"], limited_request_mode: Optional[bool] = False, request_limit: Optional[int] = settings.DEFAULT_AGENT_LIMITS["request_limit"], compressed_summary: Optional[str] = None, ): self.endpoint = endpoint self.llm_name = llm_name self.model_id = model_id self.api_key = api_key self.agent_id = agent_id self.user_api_key = user_api_key self.prompt = prompt self.decoded_token = decoded_token or {} self.user: str = self.decoded_token.get("sub") self.tool_config: Dict = {} self.tools: List[Dict] = [] self.tool_calls: List[Dict] = [] self.chat_history: List[Dict] = chat_history if chat_history is not None else [] self.llm = LLMCreator.create_llm( llm_name, api_key=api_key, user_api_key=user_api_key, decoded_token=decoded_token, model_id=model_id, agent_id=agent_id, ) self.retrieved_docs = retrieved_docs or [] self.llm_handler = LLMHandlerCreator.create_handler( llm_name if llm_name else "default" ) self.attachments = attachments or [] self.json_schema = None if json_schema is not None: try: self.json_schema = normalize_json_schema_payload(json_schema) except JsonSchemaValidationError as exc: logger.warning("Ignoring invalid JSON schema payload: 
%s", exc) self.limited_token_mode = limited_token_mode self.token_limit = token_limit self.limited_request_mode = limited_request_mode self.request_limit = request_limit self.compressed_summary = compressed_summary self.current_token_count = 0 self.context_limit_reached = False @log_activity() def gen( self, query: str, log_context: LogContext = None ) -> Generator[Dict, None, None]: yield from self._gen_inner(query, log_context) @abstractmethod def _gen_inner( self, query: str, log_context: LogContext ) -> Generator[Dict, None, None]: pass def _get_tools(self, api_key: str = None) -> Dict[str, Dict]: mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] agents_collection = db["agents"] tools_collection = db["user_tools"] agent_data = agents_collection.find_one({"key": api_key or self.user_api_key}) tool_ids = agent_data.get("tools", []) if agent_data else [] tools = ( tools_collection.find( {"_id": {"$in": [ObjectId(tool_id) for tool_id in tool_ids]}} ) if tool_ids else [] ) tools = list(tools) tools_by_id = {str(tool["_id"]): tool for tool in tools} if tools else {} return tools_by_id def _get_user_tools(self, user="local"): mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] user_tools_collection = db["user_tools"] user_tools = user_tools_collection.find({"user": user, "status": True}) user_tools = list(user_tools) return {str(i): tool for i, tool in enumerate(user_tools)} def _build_tool_parameters(self, action): params = {"type": "object", "properties": {}, "required": []} for param_type in ["query_params", "headers", "body", "parameters"]: if param_type in action and action[param_type].get("properties"): for k, v in action[param_type]["properties"].items(): if v.get("filled_by_llm", True): params["properties"][k] = { key: value for key, value in v.items() if key not in ("filled_by_llm", "value", "required") } if v.get("required", False): params["required"].append(k) return params def _prepare_tools(self, tools_dict): self.tools = [ { 
"type": "function", "function": { "name": f"{action['name']}_{tool_id}", "description": action["description"], "parameters": self._build_tool_parameters(action), }, } for tool_id, tool in tools_dict.items() if ( (tool["name"] == "api_tool" and "actions" in tool.get("config", {})) or (tool["name"] != "api_tool" and "actions" in tool) ) for action in ( tool["config"]["actions"].values() if tool["name"] == "api_tool" else tool["actions"] ) if action.get("active", True) ] def _execute_tool_action(self, tools_dict, call): parser = ToolActionParser(self.llm.__class__.__name__) tool_id, action_name, call_args = parser.parse_args(call) call_id = getattr(call, "id", None) or str(uuid.uuid4()) # Check if parsing failed if tool_id is None or action_name is None: error_message = f"Error: Failed to parse LLM tool call. Tool name: {getattr(call, 'name', 'unknown')}" logger.error(error_message) tool_call_data = { "tool_name": "unknown", "call_id": call_id, "action_name": getattr(call, "name", "unknown"), "arguments": call_args or {}, "result": f"Failed to parse tool call. Invalid tool name format: {getattr(call, 'name', 'unknown')}", } yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}} self.tool_calls.append(tool_call_data) return "Failed to parse tool call.", call_id # Check if tool_id exists in available tools if tool_id not in tools_dict: error_message = f"Error: Tool ID '{tool_id}' extracted from LLM call not found in available tools_dict. Available IDs: {list(tools_dict.keys())}" logger.error(error_message) # Return error result tool_call_data = { "tool_name": "unknown", "call_id": call_id, "action_name": f"{action_name}_{tool_id}", "arguments": call_args, "result": f"Tool with ID {tool_id} not found. 
Available tools: {list(tools_dict.keys())}", } yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}} self.tool_calls.append(tool_call_data) return f"Tool with ID {tool_id} not found.", call_id tool_call_data = { "tool_name": tools_dict[tool_id]["name"], "call_id": call_id, "action_name": f"{action_name}_{tool_id}", "arguments": call_args, } yield {"type": "tool_call", "data": {**tool_call_data, "status": "pending"}} tool_data = tools_dict[tool_id] action_data = ( tool_data["config"]["actions"][action_name] if tool_data["name"] == "api_tool" else next( action for action in tool_data["actions"] if action["name"] == action_name ) ) query_params, headers, body, parameters = {}, {}, {}, {} param_types = { "query_params": query_params, "headers": headers, "body": body, "parameters": parameters, } for param_type, target_dict in param_types.items(): if param_type in action_data and action_data[param_type].get("properties"): for param, details in action_data[param_type]["properties"].items(): if ( param not in call_args and "value" in details and details["value"] ): target_dict[param] = details["value"] for param, value in call_args.items(): for param_type, target_dict in param_types.items(): if param_type in action_data and param in action_data[param_type].get( "properties", {} ): target_dict[param] = value tm = ToolManager(config={}) # Prepare tool_config and add tool_id for memory tools if tool_data["name"] == "api_tool": action_config = tool_data["config"]["actions"][action_name] tool_config = { "url": action_config["url"], "method": action_config["method"], "headers": headers, "query_params": query_params, } if "body_content_type" in action_config: tool_config["body_content_type"] = action_config.get( "body_content_type", "application/json" ) tool_config["body_encoding_rules"] = action_config.get( "body_encoding_rules", {} ) else: tool_config = tool_data["config"].copy() if tool_data["config"] else {} if tool_config.get("encrypted_credentials") and 
self.user: decrypted = decrypt_credentials( tool_config["encrypted_credentials"], self.user ) tool_config.update(decrypted) tool_config["auth_credentials"] = decrypted tool_config.pop("encrypted_credentials", None) tool_config["tool_id"] = str(tool_data.get("_id", tool_id)) if hasattr(self, "conversation_id") and self.conversation_id: tool_config["conversation_id"] = self.conversation_id if tool_data["name"] == "mcp_tool": tool_config["query_mode"] = True tool = tm.load_tool( tool_data["name"], tool_config=tool_config, user_id=self.user, ) resolved_arguments = ( {"query_params": query_params, "headers": headers, "body": body} if tool_data["name"] == "api_tool" else parameters ) if tool_data["name"] == "api_tool": logger.debug( f"Executing api: {action_name} with query_params: {query_params}, headers: {headers}, body: {body}" ) result = tool.execute_action(action_name, **body) else: logger.debug(f"Executing tool: {action_name} with args: {call_args}") result = tool.execute_action(action_name, **parameters) get_artifact_id = ( getattr(tool, "get_artifact_id", None) if tool_data["name"] != "api_tool" else None ) artifact_id = None if callable(get_artifact_id): try: artifact_id = get_artifact_id(action_name, **parameters) except Exception: logger.exception( "Failed to extract artifact_id from tool %s for action %s", tool_data["name"], action_name, ) artifact_id = str(artifact_id).strip() if artifact_id is not None else "" if artifact_id: tool_call_data["artifact_id"] = artifact_id result_full = str(result) tool_call_data["resolved_arguments"] = resolved_arguments tool_call_data["result_full"] = result_full tool_call_data["result"] = ( f"{result_full[:50]}..." 
if len(result_full) > 50 else result_full ) stream_tool_call_data = { key: value for key, value in tool_call_data.items() if key not in {"result_full", "resolved_arguments"} } yield {"type": "tool_call", "data": {**stream_tool_call_data, "status": "completed"}} self.tool_calls.append(tool_call_data) return result, call_id def _get_truncated_tool_calls(self): return [ { "tool_name": tool_call.get("tool_name"), "call_id": tool_call.get("call_id"), "action_name": tool_call.get("action_name"), "arguments": tool_call.get("arguments"), "artifact_id": tool_call.get("artifact_id"), "result": ( f"{str(tool_call['result'])[:50]}..." if len(str(tool_call["result"])) > 50 else tool_call["result"] ), "status": "completed", } for tool_call in self.tool_calls ] def _calculate_current_context_tokens(self, messages: List[Dict]) -> int: """ Calculate total tokens in current context (messages). Args: messages: List of message dicts Returns: Total token count """ from application.api.answer.services.compression.token_counter import ( TokenCounter, ) return TokenCounter.count_message_tokens(messages) def _check_context_limit(self, messages: List[Dict]) -> bool: """ Check if we're approaching context limit (80%). 
Args: messages: Current message list Returns: True if at or above 80% of context limit """ from application.core.model_utils import get_token_limit from application.core.settings import settings try: # Calculate current tokens current_tokens = self._calculate_current_context_tokens(messages) self.current_token_count = current_tokens # Get context limit for model context_limit = get_token_limit(self.model_id) # Calculate threshold (80%) threshold = int(context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE) # Check if we've reached the limit if current_tokens >= threshold: logger.warning( f"Context limit approaching: {current_tokens}/{context_limit} tokens " f"({(current_tokens/context_limit)*100:.1f}%)" ) return True return False except Exception as e: logger.error(f"Error checking context limit: {str(e)}", exc_info=True) return False def _validate_context_size(self, messages: List[Dict]) -> None: """ Pre-flight validation before calling LLM. Logs warnings but never raises errors. Args: messages: Messages to be sent to LLM """ from application.core.model_utils import get_token_limit current_tokens = self._calculate_current_context_tokens(messages) self.current_token_count = current_tokens context_limit = get_token_limit(self.model_id) percentage = (current_tokens / context_limit) * 100 # Log based on usage level if current_tokens >= context_limit: logger.warning( f"Context at limit: {current_tokens:,}/{context_limit:,} tokens " f"({percentage:.1f}%). Model: {self.model_id}" ) elif current_tokens >= int( context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE ): logger.info( f"Context approaching limit: {current_tokens:,}/{context_limit:,} tokens " f"({percentage:.1f}%)" ) def _truncate_text_middle(self, text: str, max_tokens: int) -> str: """ Truncate text by removing content from the middle, preserving start and end. 
Args: text: Text to truncate max_tokens: Maximum tokens allowed Returns: Truncated text with middle removed if needed """ from application.utils import num_tokens_from_string current_tokens = num_tokens_from_string(text) if current_tokens <= max_tokens: return text # Estimate chars per token (roughly 4 chars per token for English) chars_per_token = len(text) / current_tokens if current_tokens > 0 else 4 target_chars = int(max_tokens * chars_per_token * 0.95) # 5% safety margin if target_chars <= 0: return "" # Split: keep 40% from start, 40% from end, remove middle start_chars = int(target_chars * 0.4) end_chars = int(target_chars * 0.4) truncation_marker = "\n\n[... content truncated to fit context limit ...]\n\n" truncated = text[:start_chars] + truncation_marker + text[-end_chars:] logger.info( f"Truncated text from {current_tokens:,} to ~{max_tokens:,} tokens " f"(removed middle section)" ) return truncated def _build_messages( self, system_prompt: str, query: str, ) -> List[Dict]: """Build messages using pre-rendered system prompt""" from application.core.model_utils import get_token_limit from application.utils import num_tokens_from_string # Append compression summary to system prompt if present if self.compressed_summary: compression_context = ( "\n\n---\n\n" "This session is being continued from a previous conversation that " "has been compressed to fit within context limits. 
" "The conversation is summarized below:\n\n" f"{self.compressed_summary}" ) system_prompt = system_prompt + compression_context context_limit = get_token_limit(self.model_id) system_tokens = num_tokens_from_string(system_prompt) # Reserve 10% for response/tools safety_buffer = int(context_limit * 0.1) available_after_system = context_limit - system_tokens - safety_buffer # Max tokens for query: 80% of available space (leave room for history) max_query_tokens = int(available_after_system * 0.8) query_tokens = num_tokens_from_string(query) # Truncate query from middle if it exceeds 80% of available context if query_tokens > max_query_tokens: query = self._truncate_text_middle(query, max_query_tokens) query_tokens = num_tokens_from_string(query) # Calculate remaining budget for chat history available_for_history = max(available_after_system - query_tokens, 0) # Truncate chat history to fit within available budget working_history = self._truncate_history_to_fit( self.chat_history, available_for_history, ) messages = [{"role": "system", "content": system_prompt}] for i in working_history: if "prompt" in i and "response" in i: messages.append({"role": "user", "content": i["prompt"]}) messages.append({"role": "assistant", "content": i["response"]}) if "tool_calls" in i: for tool_call in i["tool_calls"]: call_id = tool_call.get("call_id") or str(uuid.uuid4()) function_call_dict = { "function_call": { "name": tool_call.get("action_name"), "args": tool_call.get("arguments"), "call_id": call_id, } } function_response_dict = { "function_response": { "name": tool_call.get("action_name"), "response": {"result": tool_call.get("result")}, "call_id": call_id, } } messages.append( {"role": "assistant", "content": [function_call_dict]} ) messages.append( {"role": "tool", "content": [function_response_dict]} ) messages.append({"role": "user", "content": query}) return messages def _truncate_history_to_fit( self, history: List[Dict], max_tokens: int, ) -> List[Dict]: """ Truncate chat 
def _llm_gen(self, messages: List[Dict], log_context: Optional[LogContext] = None):
    """
    Start a streaming LLM generation for ``messages``.

    Builds the keyword arguments for ``self.llm.gen_stream``: model and
    messages always, attachments for usage accounting, tools and a
    structured-output format when the LLM advertises support for them.
    The stream is returned as-is. When ``log_context`` is given, a snapshot
    of the LLM is appended to its stacks.

    Args:
        messages: Messages to send to the LLM
        log_context: Optional log context that receives an "llm" stack entry

    Returns:
        Whatever ``self.llm.gen_stream`` returns
    """
    # Pre-flight context validation; per its docstring it only logs.
    self._validate_context_size(messages)

    request = {"model": self.model_id, "messages": messages}

    if self.attachments:
        # Usage accounting only; stripped before provider invocation.
        request["_usage_attachments"] = self.attachments

    if hasattr(self.llm, "_supports_tools"):
        if self.llm._supports_tools and self.tools:
            request["tools"] = self.tools

    structured_ok = (
        self.json_schema
        and hasattr(self.llm, "_supports_structured_output")
        and self.llm._supports_structured_output()
    )
    if structured_ok:
        structured_format = self.llm.prepare_structured_output_format(
            self.json_schema
        )
        if structured_format:
            # Each provider expects the schema under its own keyword.
            provider_keyword = {
                "openai": "response_format",
                "google": "response_schema",
            }.get(self.llm_name)
            if provider_keyword is not None:
                request[provider_keyword] = structured_format

    stream = self.llm.gen_stream(**request)

    if log_context:
        log_context.stacks.append(
            {
                "component": "llm",
                "data": build_stack_data(self.llm, exclude_attributes=["client"]),
            }
        )
    return stream
class ClassicAgent(BaseAgent):
    """A simplified agent with clear execution flow"""

    def _gen_inner(
        self, query: str, log_context: LogContext
    ) -> Generator[Dict, None, None]:
        """
        Run one query: load tools, call the LLM once, stream the handled
        response, then emit the sources and tool calls that were used.

        Args:
            query: The user's query
            log_context: Log context that receives the final "agent" stack entry

        Yields:
            Events from ``_handle_response``, then a ``sources`` dict and a
            ``tool_calls`` dict
        """
        # An API key selects its own tool set; otherwise use the user's tools.
        if self.user_api_key:
            tools_dict = self._get_tools(self.user_api_key)
        else:
            tools_dict = self._get_user_tools(self.user)
        self._prepare_tools(tools_dict)

        messages = self._build_messages(self.prompt, query)
        llm_response = self._llm_gen(messages, log_context)

        yield from self._handle_response(
            llm_response, tools_dict, messages, log_context
        )

        yield {"sources": self.retrieved_docs}
        yield {"tool_calls": self._get_truncated_tool_calls()}

        stack_entry = {
            "component": "agent",
            "data": {"tool_calls": self.tool_calls.copy()},
        }
        log_context.stacks.append(stack_entry)
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) with open( os.path.join(current_dir, "application/prompts", "react_planning_prompt.txt"), "r" ) as f: PLANNING_PROMPT_TEMPLATE = f.read() with open( os.path.join(current_dir, "application/prompts", "react_final_prompt.txt"), "r" ) as f: FINAL_PROMPT_TEMPLATE = f.read() class ReActAgent(BaseAgent): """ Research and Action (ReAct) Agent - Advanced reasoning agent with iterative planning. Implements a think-act-observe loop for complex problem-solving: 1. Creates a strategic plan based on the query 2. Executes tools and gathers observations 3. Iteratively refines approach until satisfied 4. Synthesizes final answer from all observations """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.plan: str = "" self.observations: List[str] = [] def _gen_inner( self, query: str, log_context: LogContext ) -> Generator[Dict, None, None]: """Execute ReAct reasoning loop with planning, action, and observation cycles""" self._reset_state() tools_dict = ( self._get_tools(self.user_api_key) if self.user_api_key else self._get_user_tools(self.user) ) self._prepare_tools(tools_dict) for iteration in range(1, MAX_ITERATIONS_REASONING + 1): yield {"thought": f"Reasoning... 
(iteration {iteration})\n\n"} yield from self._planning_phase(query, log_context) if not self.plan: logger.warning( f"ReActAgent: No plan generated in iteration {iteration}" ) break self.observations.append(f"Plan (iteration {iteration}): {self.plan}") satisfied = yield from self._execution_phase(query, tools_dict, log_context) if satisfied: logger.info("ReActAgent: Goal satisfied, stopping reasoning loop") break yield from self._synthesis_phase(query, log_context) def _reset_state(self): """Reset agent state for new query""" self.plan = "" self.observations = [] def _planning_phase( self, query: str, log_context: LogContext ) -> Generator[Dict, None, None]: """Generate strategic plan for query""" logger.info("ReActAgent: Creating plan...") plan_prompt = self._build_planning_prompt(query) messages = [{"role": "user", "content": plan_prompt}] plan_stream = self.llm.gen_stream( model=self.model_id, messages=messages, tools=self.tools if self.tools else None, ) if log_context: log_context.stacks.append( {"component": "planning_llm", "data": build_stack_data(self.llm)} ) plan_parts = [] for chunk in plan_stream: content = self._extract_content(chunk) if content: plan_parts.append(content) yield {"thought": content} self.plan = "".join(plan_parts) def _execution_phase( self, query: str, tools_dict: Dict, log_context: LogContext ) -> Generator[bool, None, None]: """Execute plan with tool calls and observations""" execution_prompt = self._build_execution_prompt(query) messages = self._build_messages(execution_prompt, query) llm_response = self._llm_gen(messages, log_context) initial_content = self._extract_content(llm_response) if initial_content: self.observations.append(f"Initial response: {initial_content}") processed_response = self._llm_handler( llm_response, tools_dict, messages, log_context ) for tool_call in self.tool_calls: observation = ( f"Executed: {tool_call.get('tool_name', 'Unknown')} " f"with args {tool_call.get('arguments', {})}. 
" f"Result: {str(tool_call.get('result', ''))[:200]}" ) self.observations.append(observation) final_content = self._extract_content(processed_response) if final_content: self.observations.append(f"Response after tools: {final_content}") if log_context: log_context.stacks.append( { "component": "agent_tool_calls", "data": {"tool_calls": self.tool_calls.copy()}, } ) yield {"sources": self.retrieved_docs} yield {"tool_calls": self._get_truncated_tool_calls()} return "SATISFIED" in (final_content or "") def _synthesis_phase( self, query: str, log_context: LogContext ) -> Generator[Dict, None, None]: """Synthesize final answer from all observations""" logger.info("ReActAgent: Generating final answer...") final_prompt = self._build_final_answer_prompt(query) messages = [{"role": "user", "content": final_prompt}] final_stream = self.llm.gen_stream( model=self.model_id, messages=messages, tools=None ) if log_context: log_context.stacks.append( {"component": "final_answer_llm", "data": build_stack_data(self.llm)} ) for chunk in final_stream: content = self._extract_content(chunk) if content: yield {"answer": content} def _build_planning_prompt(self, query: str) -> str: """Build planning phase prompt""" prompt = PLANNING_PROMPT_TEMPLATE.replace("{query}", query) prompt = prompt.replace("{prompt}", self.prompt or "") prompt = prompt.replace("{summaries}", "") prompt = prompt.replace("{observations}", "\n".join(self.observations)) return prompt def _build_execution_prompt(self, query: str) -> str: """Build execution phase prompt with plan and observations""" observations_str = "\n".join(self.observations) if len(observations_str) > 20000: observations_str = observations_str[:20000] + "\n...[truncated]" return ( f"{self.prompt or ''}\n\n" f"Follow this plan:\n{self.plan}\n\n" f"Observations:\n{observations_str}\n\n" f"If sufficient data exists to answer '{query}', respond with 'SATISFIED'. " f"Otherwise, continue executing the plan." 
) def _build_final_answer_prompt(self, query: str) -> str: """Build final synthesis prompt""" observations_str = "\n".join(self.observations) if len(observations_str) > 10000: observations_str = observations_str[:10000] + "\n...[truncated]" logger.warning("ReActAgent: Observations truncated for final answer") return FINAL_PROMPT_TEMPLATE.format(query=query, observations=observations_str) def _extract_content(self, response: Any) -> str: """Extract text content from various LLM response formats""" if not response: return "" collected = [] if isinstance(response, str): return response if hasattr(response, "message") and hasattr(response.message, "content"): if response.message.content: return response.message.content if hasattr(response, "choices") and response.choices: if hasattr(response.choices[0], "message"): content = response.choices[0].message.content if content: return content if hasattr(response, "content") and isinstance(response.content, list): if response.content and hasattr(response.content[0], "text"): return response.content[0].text try: for chunk in response: content_piece = "" if hasattr(chunk, "choices") and chunk.choices: if hasattr(chunk.choices[0], "delta"): delta_content = chunk.choices[0].delta.content if delta_content: content_piece = delta_content elif hasattr(chunk, "type") and chunk.type == "content_block_delta": if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"): content_piece = chunk.delta.text elif isinstance(chunk, str): content_piece = chunk if content_piece: collected.append(content_piece) except (TypeError, AttributeError): logger.debug( f"Response not iterable or unexpected format: {type(response)}" ) except Exception as e: logger.error(f"Error extracting content: {e}") return "".join(collected) ================================================ FILE: application/agents/tools/api_body_serializer.py ================================================ import base64 import json import logging from enum import Enum from typing 
logger = logging.getLogger(__name__)


class ContentType(str, Enum):
    """Supported content types for request bodies."""

    JSON = "application/json"
    FORM_URLENCODED = "application/x-www-form-urlencoded"
    MULTIPART_FORM_DATA = "multipart/form-data"
    TEXT_PLAIN = "text/plain"
    XML = "application/xml"
    OCTET_STREAM = "application/octet-stream"


class RequestBodySerializer:
    """Serializes request bodies according to content-type and OpenAPI 3.1 spec."""

    @staticmethod
    def serialize(
        body_data: Dict[str, Any],
        content_type: str = ContentType.JSON,
        encoding_rules: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> tuple[Optional[Union[str, bytes]], Dict[str, str]]:
        """
        Serialize body data to appropriate format.

        Args:
            body_data: Dictionary of body parameters
            content_type: Content-Type header value (parameters after ";" are ignored)
            encoding_rules: OpenAPI Encoding Object rules per field

        Returns:
            Tuple of (serialized_body, updated_headers_dict); ``(None, {})``
            when ``body_data`` is empty

        Raises:
            ValueError: If serialization fails
        """
        if not body_data:
            return None, {}

        try:
            content_type_lower = content_type.lower().split(";")[0].strip()

            if content_type_lower == ContentType.JSON:
                return RequestBodySerializer._serialize_json(body_data)
            elif content_type_lower == ContentType.FORM_URLENCODED:
                return RequestBodySerializer._serialize_form_urlencoded(
                    body_data, encoding_rules
                )
            elif content_type_lower == ContentType.MULTIPART_FORM_DATA:
                return RequestBodySerializer._serialize_multipart_form_data(
                    body_data, encoding_rules
                )
            elif content_type_lower == ContentType.TEXT_PLAIN:
                return RequestBodySerializer._serialize_text_plain(body_data)
            elif content_type_lower == ContentType.XML:
                return RequestBodySerializer._serialize_xml(body_data)
            elif content_type_lower == ContentType.OCTET_STREAM:
                return RequestBodySerializer._serialize_octet_stream(body_data)
            else:
                logger.warning(
                    f"Unknown content type: {content_type}, treating as JSON"
                )
                return RequestBodySerializer._serialize_json(body_data)
        except Exception as e:
            logger.error(f"Error serializing body: {str(e)}", exc_info=True)
            raise ValueError(f"Failed to serialize request body: {str(e)}") from e

    @staticmethod
    def _serialize_json(body_data: Dict[str, Any]) -> tuple[str, Dict[str, str]]:
        """Serialize body as compact JSON (non-ASCII characters kept as-is)."""
        try:
            serialized = json.dumps(
                body_data, separators=(",", ":"), ensure_ascii=False
            )
            headers = {"Content-Type": ContentType.JSON.value}
            return serialized, headers
        except (TypeError, ValueError) as e:
            raise ValueError(f"Failed to serialize JSON body: {str(e)}") from e

    @staticmethod
    def _serialize_form_urlencoded(
        body_data: Dict[str, Any],
        encoding_rules: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> tuple[str, Dict[str, str]]:
        """
        Serialize body as application/x-www-form-urlencoded.

        Fields whose value is None are skipped. Per-field ``style``,
        ``explode`` and ``contentType`` come from ``encoding_rules``.
        """
        encoding_rules = encoding_rules or {}
        params = []

        for key, value in body_data.items():
            if value is None:
                continue

            rule = encoding_rules.get(key, {})
            style = rule.get("style", "form")
            explode = rule.get("explode", style == "form")
            content_type = rule.get("contentType", "text/plain")

            serialized_value = RequestBodySerializer._serialize_form_value(
                value, style, explode, content_type, key
            )

            if isinstance(serialized_value, list):
                for item in serialized_value:
                    # Exploded objects carry their own parameter names.
                    params.append(item if isinstance(item, tuple) else (key, item))
            else:
                params.append((key, serialized_value))

        # urlencode is the single place where escaping happens (space -> "+").
        # The values above are raw, so nothing is percent-encoded twice.
        serialized = urlencode(params, safe="")
        headers = {"Content-Type": ContentType.FORM_URLENCODED.value}
        return serialized, headers

    @staticmethod
    def _serialize_form_value(
        value: Any, style: str, explode: bool, content_type: str, key: str
    ) -> Union[str, list]:
        """
        Turn one form field into raw (not yet percent-encoded) text.

        Returns:
            A single string, a list of strings (one parameter per item, all
            named ``key``), or a list of ``(name, value)`` tuples for exploded
            objects: ``key[prop]`` names for ``deepObject``, bare ``prop``
            names for ``form``.
        """
        if isinstance(value, dict):
            if content_type == "application/json":
                return json.dumps(value, separators=(",", ":"))
            if content_type == "application/xml":
                return RequestBodySerializer._dict_to_xml(value)
            if style == "deepObject" and explode:
                return [(f"{key}[{k}]", str(v)) for k, v in value.items()]
            if explode:
                return [(str(k), str(v)) for k, v in value.items()]
            return ",".join(f"{k},{v}" for k, v in value.items())

        if isinstance(value, (list, tuple)):
            if explode:
                return [str(item) for item in value]
            return ",".join(str(v) for v in value)

        return str(value)

    @staticmethod
    def _serialize_multipart_form_data(
        body_data: Dict[str, Any],
        encoding_rules: Optional[Dict[str, Dict[str, Any]]] = None,
    ) -> tuple[bytes, Dict[str, str]]:
        """
        Serialize body as multipart/form-data per RFC7578.

        Fields whose value is None are skipped. The returned Content-Type
        header carries the randomly generated boundary.
        """
        import secrets

        encoding_rules = encoding_rules or {}
        boundary = f"----DocsGPT{secrets.token_hex(16)}"
        parts = []

        for key, value in body_data.items():
            if value is None:
                continue

            rule = encoding_rules.get(key, {})
            content_type = rule.get("contentType", "text/plain")
            headers_rule = rule.get("headers", {})

            part = RequestBodySerializer._create_multipart_part(
                key, value, content_type, headers_rule
            )
            parts.append(part)

        # Every part already ends with CRLF, so the closing delimiter follows
        # directly; an extra CRLF would become part of the last field's value.
        body_text = "".join(f"--{boundary}\r\n{part}" for part in parts)
        body_text += f"--{boundary}--\r\n"

        headers = {
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        }
        return body_text.encode("utf-8"), headers

    @staticmethod
    def _create_multipart_part(
        name: str, value: Any, content_type: str, headers_rule: Dict[str, Any]
    ) -> str:
        """
        Create a single multipart/form-data part (headers, blank line, value, CRLF).

        ``headers_rule`` is accepted for interface compatibility but is not
        applied to the part.
        """
        headers = [
            f'Content-Disposition: form-data; name="{RequestBodySerializer._percent_encode(name)}"'
        ]

        if isinstance(value, bytes):
            headers.append(f"Content-Type: {content_type}")
            if content_type == "application/octet-stream":
                value_encoded = base64.b64encode(value).decode("utf-8")
                # Only advertise base64 when the payload really is base64.
                headers.append("Content-Transfer-Encoding: base64")
            else:
                value_encoded = value.decode("utf-8", errors="replace")
        elif isinstance(value, dict):
            if content_type == "application/json":
                value_encoded = json.dumps(value, separators=(",", ":"))
            elif content_type == "application/xml":
                value_encoded = RequestBodySerializer._dict_to_xml(value)
            else:
                value_encoded = str(value)
            headers.append(f"Content-Type: {content_type}")
        elif isinstance(value, str) and content_type != "text/plain":
            # Pre-serialized payload (JSON, XML, ...): sent verbatim.
            value_encoded = value
            headers.append(f"Content-Type: {content_type}")
        else:
            value_encoded = str(value)
            if content_type != "text/plain":
                headers.append(f"Content-Type: {content_type}")

        return "\r\n".join(headers) + "\r\n\r\n" + value_encoded + "\r\n"

    @staticmethod
    def _serialize_text_plain(body_data: Dict[str, Any]) -> tuple[str, Dict[str, str]]:
        """Serialize body as plain text: the lone value, or ``key: value`` lines."""
        headers = {"Content-Type": ContentType.TEXT_PLAIN.value}
        if len(body_data) == 1:
            value = list(body_data.values())[0]
            return str(value), headers
        text = "\n".join(f"{k}: {v}" for k, v in body_data.items())
        return text, headers

    @staticmethod
    def _serialize_xml(body_data: Dict[str, Any]) -> tuple[str, Dict[str, str]]:
        """Serialize body as XML under a ``root`` element."""
        xml_str = RequestBodySerializer._dict_to_xml(body_data)
        return xml_str, {"Content-Type": ContentType.XML.value}

    @staticmethod
    def _serialize_octet_stream(
        body_data: Dict[str, Any],
    ) -> tuple[bytes, Dict[str, str]]:
        """Serialize body as bytes: bytes pass through, str is UTF-8 encoded, anything else is JSON."""
        headers = {"Content-Type": ContentType.OCTET_STREAM.value}
        if isinstance(body_data, bytes):
            return body_data, headers
        if isinstance(body_data, str):
            return body_data.encode("utf-8"), headers
        return json.dumps(body_data).encode("utf-8"), headers

    @staticmethod
    def _percent_encode(value: str, safe_chars: str = "") -> str:
        """
        Percent-encode per RFC3986.

        Args:
            value: String to encode
            safe_chars: Additional characters to not encode
        """
        return quote(value, safe=safe_chars)

    @staticmethod
    def _dict_to_xml(data: Dict[str, Any], root_name: str = "root") -> str:
        """
        Convert dict to simple XML format.

        Dicts become nested elements, scalars become escaped text, and a list
        under key ``names`` becomes repeated ``<name>`` elements (a trailing
        "s" is dropped from the key).
        """

        def build_xml(obj: Any, name: str) -> str:
            if isinstance(obj, dict):
                inner = "".join(build_xml(v, k) for k, v in obj.items())
                return f"<{name}>{inner}</{name}>"
            elif isinstance(obj, (list, tuple)):
                item_name = name[:-1] if name.endswith("s") else name
                return "".join(build_xml(item, item_name) for item in obj)
            else:
                return f"<{name}>{RequestBodySerializer._escape_xml(str(obj))}</{name}>"

        root = build_xml(data, root_name)
        return f'<?xml version="1.0" encoding="UTF-8"?>{root}'

    @staticmethod
    def _escape_xml(value: str) -> str:
        """Escape XML special characters."""
        # "&" must be replaced first so the entities added below are not re-escaped.
        return (
            value.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&apos;")
        )
def _make_api_call(
    self,
    url: str,
    method: str,
    headers: Dict[str, str],
    query_params: Dict[str, Any],
    body: Dict[str, Any],
    content_type: str = ContentType.JSON,
    encoding_rules: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """
    Make an API call with proper body serialization and error handling.

    The URL is validated before and after parameter substitution.
    ``{name}`` placeholders in the URL are filled from ``query_params``;
    the remaining parameters are appended as a query string. Failures are
    reported in the returned dict and never raised.

    Args:
        url: API endpoint URL
        method: HTTP method (GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS),
            matched case-insensitively
        headers: Request headers dict
        query_params: URL query parameters
        body: Request body as dict
        content_type: Content-Type for serialization
        encoding_rules: OpenAPI encoding rules

    Returns:
        Dict with status_code, data, and message
    """
    request_url = url
    request_headers = headers.copy() if headers else {}
    response = None

    # Validate URL to prevent SSRF attacks
    try:
        validate_url(request_url)
    except SSRFError as e:
        logger.error(f"URL validation failed: {e}")
        return {
            "status_code": None,
            "message": f"URL validation error: {e}",
            "data": None,
        }

    try:
        # Normalize once so the Content-Type default and the dispatch agree.
        http_method = (method or "").upper()

        path_params_used = set()
        if query_params:
            for match in re.finditer(r"\{([^}]+)\}", request_url):
                param_name = match.group(1)
                if param_name in query_params:
                    request_url = request_url.replace(
                        f"{{{param_name}}}", str(query_params[param_name])
                    )
                    path_params_used.add(param_name)
            remaining_params = {
                k: v for k, v in query_params.items() if k not in path_params_used
            }
            if remaining_params:
                query_string = urlencode(remaining_params)
                separator = "&" if "?" in request_url else "?"
                request_url = f"{request_url}{separator}{query_string}"

        # Re-validate URL after parameter substitution to prevent SSRF via path params
        try:
            validate_url(request_url)
        except SSRFError as e:
            logger.error(f"URL validation failed after parameter substitution: {e}")
            return {
                "status_code": None,
                "message": f"URL validation error: {e}",
                "data": None,
            }

        # Serialize body based on content type
        if body and body != {}:
            try:
                serialized_body, body_headers = RequestBodySerializer.serialize(
                    body, content_type, encoding_rules
                )
                request_headers.update(body_headers)
            except ValueError as e:
                logger.error(f"Body serialization failed: {str(e)}")
                return {
                    "status_code": None,
                    "message": f"Body serialization error: {str(e)}",
                    "data": None,
                }
        else:
            serialized_body = None
            if "Content-Type" not in request_headers and http_method not in (
                "GET",
                "HEAD",
                "DELETE",
            ):
                request_headers["Content-Type"] = ContentType.JSON.value

        logger.debug(
            f"API Call: {method} {request_url} | Content-Type: {request_headers.get('Content-Type', 'N/A')}"
        )

        # Only these methods send the serialized body.
        senders_with_body = {
            "POST": requests.post,
            "PUT": requests.put,
            "PATCH": requests.patch,
        }
        senders_without_body = {
            "GET": requests.get,
            "DELETE": requests.delete,
            "HEAD": requests.head,
            "OPTIONS": requests.options,
        }

        if http_method in senders_with_body:
            response = senders_with_body[http_method](
                request_url,
                data=serialized_body,
                headers=request_headers,
                timeout=DEFAULT_TIMEOUT,
            )
        elif http_method in senders_without_body:
            response = senders_without_body[http_method](
                request_url, headers=request_headers, timeout=DEFAULT_TIMEOUT
            )
        else:
            return {
                "status_code": None,
                "message": f"Unsupported HTTP method: {method}",
                "data": None,
            }

        response.raise_for_status()

        data = self._parse_response(response)

        return {
            "status_code": response.status_code,
            "data": data,
            "message": "API call successful.",
        }
    except requests.exceptions.Timeout:
        logger.error(f"Request timeout for {request_url}")
        return {
            "status_code": None,
            "message": f"Request timeout ({DEFAULT_TIMEOUT}s exceeded)",
            "data": None,
        }
    except requests.exceptions.ConnectionError as e:
        logger.error(f"Connection error: {str(e)}")
        return {
            "status_code": None,
            "message": f"Connection error: {str(e)}",
            "data": None,
        }
    except requests.exceptions.HTTPError as e:
        logger.error(f"HTTP error {response.status_code}: {str(e)}")
        try:
            error_data = response.json()
        except (json.JSONDecodeError, ValueError):
            error_data = response.text
        return {
            "status_code": response.status_code,
            "message": f"HTTP Error {response.status_code}",
            "data": error_data,
        }
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {str(e)}")
        return {
            # A Response is falsy for 4xx/5xx, so test identity, not truthiness.
            "status_code": response.status_code if response is not None else None,
            "message": f"API call failed: {str(e)}",
            "data": None,
        }
    except Exception as e:
        logger.error(f"Unexpected error in API call: {str(e)}", exc_info=True)
        return {
            "status_code": None,
            "message": f"Unexpected error: {str(e)}",
            "data": None,
        }
class Tool(ABC):
    """Abstract interface that every agent tool implements."""

    @abstractmethod
    def execute_action(self, action_name: str, **kwargs):
        """
        Run the action called ``action_name`` with the given keyword arguments.
        """

    @abstractmethod
    def get_actions_metadata(self):
        """
        Returns a list of JSON objects describing the actions supported by the tool.
        """

    @abstractmethod
    def get_config_requirements(self):
        """
        Returns a dictionary describing the configuration requirements for the tool.
        """
class BraveSearchTool(Tool):
    """
    Brave Search
    A tool for performing web and image searches using the Brave Search API.
    Requires an API key for authentication.
    """

    # Without a timeout, requests waits indefinitely on a stalled connection.
    _REQUEST_TIMEOUT = 30  # seconds

    def __init__(self, config):
        self.config = config
        self.token = config.get("token", "")
        self.base_url = "https://api.search.brave.com/res/v1"

    def execute_action(self, action_name, **kwargs):
        """
        Dispatch ``action_name`` to the matching search method.

        Raises:
            ValueError: If ``action_name`` is not a known action
        """
        actions = {
            "brave_web_search": self._web_search,
            "brave_image_search": self._image_search,
        }
        if action_name in actions:
            return actions[action_name](**kwargs)
        else:
            raise ValueError(f"Unknown action: {action_name}")

    def _web_search(
        self,
        query,
        country="ALL",
        search_lang="en",
        count=10,
        offset=0,
        safesearch="off",
        freshness=None,
        result_filter=None,
        extra_snippets=False,
        summary=False,
    ):
        """
        Performs a web search using the Brave Search API.

        ``count`` is capped at 20 and ``offset`` at 9 before being sent.

        Returns:
            Dict with status_code and message, plus results on HTTP 200
        """
        logger.debug("Performing Brave web search for: %s", query)

        params = {
            "q": query,
            "country": country,
            "search_lang": search_lang,
            "count": min(count, 20),
            "offset": min(offset, 9),
            "safesearch": safesearch,
        }

        # Optional filters are only sent when set.
        if freshness:
            params["freshness"] = freshness
        if result_filter:
            params["result_filter"] = result_filter
        if extra_snippets:
            params["extra_snippets"] = 1
        if summary:
            params["summary"] = 1

        return self._search(
            "/web/search",
            params,
            "Search completed successfully.",
            "Search failed with status code",
        )

    def _image_search(
        self,
        query,
        country="ALL",
        search_lang="en",
        count=5,
        safesearch="off",
        spellcheck=False,
    ):
        """
        Performs an image search using the Brave Search API.

        Returns:
            Dict with status_code and message, plus results on HTTP 200
        """
        logger.debug("Performing Brave image search for: %s", query)

        params = {
            "q": query,
            "country": country,
            "search_lang": search_lang,
            "count": min(count, 100),  # API max is 100
            "safesearch": safesearch,
            "spellcheck": 1 if spellcheck else 0,
        }

        return self._search(
            "/images/search",
            params,
            "Image search completed successfully.",
            "Image search failed with status code",
        )

    def _search(self, path, params, success_message, failure_prefix):
        """Send one authenticated GET to ``path`` and wrap the outcome in a result dict."""
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip",
            "X-Subscription-Token": self.token,
        }
        response = requests.get(
            f"{self.base_url}{path}",
            params=params,
            headers=headers,
            timeout=self._REQUEST_TIMEOUT,
        )

        if response.status_code == 200:
            return {
                "status_code": response.status_code,
                "results": response.json(),
                "message": success_message,
            }
        return {
            "status_code": response.status_code,
            "message": f"{failure_prefix}: {response.status_code}.",
        }

    def get_actions_metadata(self):
        """Return the JSON-schema style descriptions of the two search actions."""
        return [
            {
                "name": "brave_web_search",
                "description": "Perform a web search using Brave Search",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "The search query (max 400 characters, 50 words)",
                        },
                        "search_lang": {
                            "type": "string",
                            "description": "The search language preference (default: en)",
                        },
                        "freshness": {
                            "type": "string",
                            "description": "Time filter for results (pd: last 24h, pw: last week, pm: last month, py: last year)",
                        },
                    },
                    "required": ["query"],
                    "additionalProperties": False,
                },
            },
            {
                "name": "brave_image_search",
                "description": "Perform an image search using Brave Search",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "The search query (max 400 characters, 50 words)",
                        },
                        "count": {
                            "type": "integer",
                            "description": "Number of results to return (max 100, default: 5)",
                        },
                    },
                    "required": ["query"],
                    "additionalProperties": False,
                },
            },
        ]

    def get_config_requirements(self):
        """Return the configuration fields this tool needs (the Brave API key)."""
        return {
            "token": {
                "type": "string",
                "label": "API Key",
                "description": "Brave Search API key for authentication",
                "required": True,
                "secret": True,
                "order": 1,
            },
        }
application/agents/tools/cryptoprice.py ================================================ import requests from application.agents.tools.base import Tool class CryptoPriceTool(Tool): """ CryptoPrice A tool for retrieving cryptocurrency prices using the CryptoCompare public API """ def __init__(self, config): self.config = config def execute_action(self, action_name, **kwargs): actions = {"cryptoprice_get": self._get_price} if action_name in actions: return actions[action_name](**kwargs) else: raise ValueError(f"Unknown action: {action_name}") def _get_price(self, symbol, currency): """ Fetches the current price of a given cryptocurrency symbol in the specified currency. Example: symbol = "BTC" currency = "USD" returns price in USD. """ url = f"https://min-api.cryptocompare.com/data/price?fsym={symbol.upper()}&tsyms={currency.upper()}" response = requests.get(url) if response.status_code == 200: data = response.json() if currency.upper() in data: return { "status_code": response.status_code, "price": data[currency.upper()], "message": f"Price of {symbol.upper()} in {currency.upper()} retrieved successfully.", } else: return { "status_code": response.status_code, "message": f"Couldn't find price for {symbol.upper()} in {currency.upper()}.", } else: return { "status_code": response.status_code, "message": "Failed to retrieve price.", } def get_actions_metadata(self): return [ { "name": "cryptoprice_get", "description": "Retrieve the price of a specified cryptocurrency in a given currency", "parameters": { "type": "object", "properties": { "symbol": { "type": "string", "description": "The cryptocurrency symbol (e.g. BTC)", }, "currency": { "type": "string", "description": "The currency in which you want the price (e.g. 
USD)", }, }, "required": ["symbol", "currency"], "additionalProperties": False, }, } ] def get_config_requirements(self): # No specific configuration needed for this tool as it just queries a public endpoint return {} ================================================ FILE: application/agents/tools/duckduckgo.py ================================================ import logging import time from typing import Any, Dict, Optional from application.agents.tools.base import Tool logger = logging.getLogger(__name__) MAX_RETRIES = 3 RETRY_DELAY = 2.0 DEFAULT_TIMEOUT = 15 class DuckDuckGoSearchTool(Tool): """ DuckDuckGo Search A tool for performing web and image searches using DuckDuckGo. """ def __init__(self, config): self.config = config self.timeout = config.get("timeout", DEFAULT_TIMEOUT) def _get_ddgs_client(self): from ddgs import DDGS return DDGS(timeout=self.timeout) def _execute_with_retry(self, operation, operation_name: str) -> Dict[str, Any]: last_error = None for attempt in range(1, MAX_RETRIES + 1): try: results = operation() return { "status_code": 200, "results": list(results) if results else [], "message": f"{operation_name} completed successfully.", } except Exception as e: last_error = e error_str = str(e).lower() if "ratelimit" in error_str or "429" in error_str: if attempt < MAX_RETRIES: delay = RETRY_DELAY * attempt logger.warning( f"{operation_name} rate limited, retrying in {delay}s (attempt {attempt}/{MAX_RETRIES})" ) time.sleep(delay) continue logger.error(f"{operation_name} failed: {e}") break return { "status_code": 500, "results": [], "message": f"{operation_name} failed: {str(last_error)}", } def execute_action(self, action_name, **kwargs): actions = { "ddg_web_search": self._web_search, "ddg_image_search": self._image_search, "ddg_news_search": self._news_search, } if action_name not in actions: raise ValueError(f"Unknown action: {action_name}") return actions[action_name](**kwargs) def _web_search( self, query: str, max_results: int = 5, region: 
str = "wt-wt", safesearch: str = "moderate", timelimit: Optional[str] = None, ) -> Dict[str, Any]: logger.info(f"DuckDuckGo web search: {query}") def operation(): client = self._get_ddgs_client() return client.text( query, region=region, safesearch=safesearch, timelimit=timelimit, max_results=min(max_results, 20), ) return self._execute_with_retry(operation, "Web search") def _image_search( self, query: str, max_results: int = 5, region: str = "wt-wt", safesearch: str = "moderate", timelimit: Optional[str] = None, ) -> Dict[str, Any]: logger.info(f"DuckDuckGo image search: {query}") def operation(): client = self._get_ddgs_client() return client.images( query, region=region, safesearch=safesearch, timelimit=timelimit, max_results=min(max_results, 50), ) return self._execute_with_retry(operation, "Image search") def _news_search( self, query: str, max_results: int = 5, region: str = "wt-wt", safesearch: str = "moderate", timelimit: Optional[str] = None, ) -> Dict[str, Any]: logger.info(f"DuckDuckGo news search: {query}") def operation(): client = self._get_ddgs_client() return client.news( query, region=region, safesearch=safesearch, timelimit=timelimit, max_results=min(max_results, 20), ) return self._execute_with_retry(operation, "News search") def get_actions_metadata(self): return [ { "name": "ddg_web_search", "description": "Search the web using DuckDuckGo. Returns titles, URLs, and snippets.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "Search query", }, "max_results": { "type": "integer", "description": "Number of results (default: 5, max: 20)", }, "region": { "type": "string", "description": "Region code (default: wt-wt for worldwide, us-en for US)", }, "timelimit": { "type": "string", "description": "Time filter: d (day), w (week), m (month), y (year)", }, }, "required": ["query"], }, }, { "name": "ddg_image_search", "description": "Search for images using DuckDuckGo. 
Returns image URLs and metadata.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "Image search query", }, "max_results": { "type": "integer", "description": "Number of results (default: 5, max: 50)", }, "region": { "type": "string", "description": "Region code (default: wt-wt for worldwide)", }, }, "required": ["query"], }, }, { "name": "ddg_news_search", "description": "Search for news articles using DuckDuckGo. Returns recent news.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "News search query", }, "max_results": { "type": "integer", "description": "Number of results (default: 5, max: 20)", }, "timelimit": { "type": "string", "description": "Time filter: d (day), w (week), m (month)", }, }, "required": ["query"], }, }, ] def get_config_requirements(self): return {} ================================================ FILE: application/agents/tools/mcp_tool.py ================================================ import asyncio import base64 import concurrent.futures import json import logging import time from typing import Any, Dict, List, Optional from urllib.parse import parse_qs, urlparse from fastmcp import Client from fastmcp.client.auth import BearerAuth from fastmcp.client.transports import ( SSETransport, StdioTransport, StreamableHttpTransport, ) from mcp.client.auth import OAuthClientProvider, TokenStorage from mcp.shared.auth import OAuthClientInformationFull, OAuthClientMetadata, OAuthToken from pydantic import AnyHttpUrl, ValidationError from redis import Redis from application.agents.tools.base import Tool from application.api.user.tasks import mcp_oauth_status_task, mcp_oauth_task from application.cache import get_redis_instance from application.core.mongo_db import MongoDB from application.core.settings import settings from application.security.encryption import decrypt_credentials logger = logging.getLogger(__name__) mongo = MongoDB.get_client() db = 
mongo[settings.MONGO_DB_NAME] _mcp_clients_cache = {} class MCPTool(Tool): """ MCP Tool Connect to remote Model Context Protocol (MCP) servers to access dynamic tools and resources. """ def __init__(self, config: Dict[str, Any], user_id: Optional[str] = None): """ Initialize the MCP Tool with configuration. Args: config: Dictionary containing MCP server configuration: - server_url: URL of the remote MCP server - transport_type: Transport type (auto, sse, http, stdio) - auth_type: Type of authentication (bearer, oauth, api_key, basic, none) - encrypted_credentials: Encrypted credentials (if available) - timeout: Request timeout in seconds (default: 30) - headers: Custom headers for requests - command: Command for STDIO transport - args: Arguments for STDIO transport - oauth_scopes: OAuth scopes for oauth auth type - oauth_client_name: OAuth client name for oauth auth type - query_mode: If True, use non-interactive OAuth (fail-fast on 401) user_id: User ID for decrypting credentials (required if encrypted_credentials exist) """ self.config = config self.user_id = user_id self.server_url = config.get("server_url", "") self.transport_type = config.get("transport_type", "auto") self.auth_type = config.get("auth_type", "none") self.timeout = config.get("timeout", 30) self.custom_headers = config.get("headers", {}) self.auth_credentials = {} if config.get("encrypted_credentials") and user_id: self.auth_credentials = decrypt_credentials( config["encrypted_credentials"], user_id ) else: self.auth_credentials = config.get("auth_credentials", {}) self.oauth_scopes = config.get("oauth_scopes", []) self.oauth_task_id = config.get("oauth_task_id", None) self.oauth_client_name = config.get("oauth_client_name", "DocsGPT-MCP") self.redirect_uri = self._resolve_redirect_uri(config.get("redirect_uri")) self.available_tools = [] self._cache_key = self._generate_cache_key() self._client = None self.query_mode = config.get("query_mode", False) if self.server_url and self.auth_type != 
"oauth": self._setup_client() def _resolve_redirect_uri(self, configured_redirect_uri: Optional[str]) -> str: if configured_redirect_uri: return configured_redirect_uri.rstrip("/") explicit = getattr(settings, "MCP_OAUTH_REDIRECT_URI", None) if explicit: return explicit.rstrip("/") connector_base = getattr(settings, "CONNECTOR_REDIRECT_BASE_URI", None) if connector_base: parsed = urlparse(connector_base) if parsed.scheme and parsed.netloc: return f"{parsed.scheme}://{parsed.netloc}/api/mcp_server/callback" return f"{settings.API_URL.rstrip('/')}/api/mcp_server/callback" def _generate_cache_key(self) -> str: """Generate a unique cache key for this MCP server configuration.""" auth_key = "" if self.auth_type == "oauth": scopes_str = ",".join(self.oauth_scopes) if self.oauth_scopes else "none" auth_key = ( f"oauth:{self.oauth_client_name}:{scopes_str}:{self.redirect_uri}" ) elif self.auth_type in ["bearer"]: token = self.auth_credentials.get( "bearer_token", "" ) or self.auth_credentials.get("access_token", "") auth_key = f"bearer:{token[:10]}..." if token else "bearer:none" elif self.auth_type == "api_key": api_key = self.auth_credentials.get("api_key", "") auth_key = f"apikey:{api_key[:10]}..." 
if api_key else "apikey:none" elif self.auth_type == "basic": username = self.auth_credentials.get("username", "") auth_key = f"basic:{username}" else: auth_key = "none" return f"{self.server_url}#{self.transport_type}#{auth_key}" def _setup_client(self): global _mcp_clients_cache if self._cache_key in _mcp_clients_cache: cached_data = _mcp_clients_cache[self._cache_key] if time.time() - cached_data["created_at"] < 300: self._client = cached_data["client"] return else: del _mcp_clients_cache[self._cache_key] transport = self._create_transport() auth = None if self.auth_type == "oauth": redis_client = get_redis_instance() if self.query_mode: auth = NonInteractiveOAuth( mcp_url=self.server_url, scopes=self.oauth_scopes, redis_client=redis_client, redirect_uri=self.redirect_uri, db=db, user_id=self.user_id, ) else: auth = DocsGPTOAuth( mcp_url=self.server_url, scopes=self.oauth_scopes, redis_client=redis_client, redirect_uri=self.redirect_uri, task_id=self.oauth_task_id, db=db, user_id=self.user_id, ) elif self.auth_type == "bearer": token = self.auth_credentials.get( "bearer_token", "" ) or self.auth_credentials.get("access_token", "") if token: auth = BearerAuth(token) self._client = Client(transport, auth=auth) _mcp_clients_cache[self._cache_key] = { "client": self._client, "created_at": time.time(), } def _create_transport(self): """Create appropriate transport based on configuration.""" headers = {"Content-Type": "application/json", "User-Agent": "DocsGPT-MCP/1.0"} headers.update(self.custom_headers) if self.auth_type == "api_key": api_key = self.auth_credentials.get("api_key", "") header_name = self.auth_credentials.get("api_key_header", "X-API-Key") if api_key: headers[header_name] = api_key elif self.auth_type == "basic": username = self.auth_credentials.get("username", "") password = self.auth_credentials.get("password", "") if username and password: credentials = base64.b64encode( f"{username}:{password}".encode() ).decode() headers["Authorization"] = 
f"Basic {credentials}" if self.transport_type == "auto": if "sse" in self.server_url.lower() or self.server_url.endswith("/sse"): transport_type = "sse" else: transport_type = "http" else: transport_type = self.transport_type if transport_type == "stdio": raise ValueError("STDIO transport is disabled") if transport_type == "sse": headers.update({"Accept": "text/event-stream", "Cache-Control": "no-cache"}) return SSETransport(url=self.server_url, headers=headers) elif transport_type == "http": return StreamableHttpTransport(url=self.server_url, headers=headers) elif transport_type == "stdio": command = self.config.get("command", "python") args = self.config.get("args", []) env = self.auth_credentials if self.auth_credentials else None return StdioTransport(command=command, args=args, env=env) else: return StreamableHttpTransport(url=self.server_url, headers=headers) def _format_tools(self, tools_response) -> List[Dict]: """Format tools response to match expected format.""" if hasattr(tools_response, "tools"): tools = tools_response.tools elif isinstance(tools_response, list): tools = tools_response else: tools = [] tools_dict = [] for tool in tools: if hasattr(tool, "name"): tool_dict = { "name": tool.name, "description": tool.description, } if hasattr(tool, "inputSchema"): tool_dict["inputSchema"] = tool.inputSchema tools_dict.append(tool_dict) elif isinstance(tool, dict): tools_dict.append(tool) else: if hasattr(tool, "model_dump"): tools_dict.append(tool.model_dump()) else: tools_dict.append({"name": str(tool), "description": ""}) return tools_dict async def _execute_with_client(self, operation: str, *args, **kwargs): """Execute operation with FastMCP client.""" if not self._client: raise Exception("FastMCP client not initialized") async with self._client: if operation == "ping": return await self._client.ping() elif operation == "list_tools": tools_response = await self._client.list_tools() self.available_tools = self._format_tools(tools_response) return 
self.available_tools elif operation == "call_tool": tool_name = args[0] tool_args = kwargs return await self._client.call_tool(tool_name, tool_args) elif operation == "list_resources": return await self._client.list_resources() elif operation == "list_prompts": return await self._client.list_prompts() else: raise Exception(f"Unknown operation: {operation}") _ERROR_MAP = [ (concurrent.futures.TimeoutError, lambda op, t, _: f"Timed out after {t}s"), (ConnectionRefusedError, lambda *_: "Connection refused"), ] _ERROR_PATTERNS = { ("403", "Forbidden"): "Access denied (403 Forbidden)", ("401", "Unauthorized"): "Authentication failed (401 Unauthorized)", ("ECONNREFUSED",): "Connection refused", ("SSL", "certificate"): "SSL/TLS error", } def _run_async_operation(self, operation: str, *args, **kwargs): try: try: asyncio.get_running_loop() with concurrent.futures.ThreadPoolExecutor() as executor: future = executor.submit( self._run_in_new_loop, operation, *args, **kwargs ) return future.result(timeout=self.timeout) except RuntimeError: return self._run_in_new_loop(operation, *args, **kwargs) except Exception as e: raise self._map_error(operation, e) from e raise self._map_error(operation, e) from e def _run_in_new_loop(self, operation, *args, **kwargs): loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: return loop.run_until_complete( self._execute_with_client(operation, *args, **kwargs) ) finally: loop.close() def _map_error(self, operation: str, exc: Exception) -> Exception: for exc_type, msg_fn in self._ERROR_MAP: if isinstance(exc, exc_type): return Exception(msg_fn(operation, self.timeout, exc)) error_msg = str(exc) for patterns, friendly in self._ERROR_PATTERNS.items(): if any(p.lower() in error_msg.lower() for p in patterns): return Exception(friendly) logger.error("MCP %s failed: %s", operation, exc) return exc def discover_tools(self) -> List[Dict]: """ Discover available tools from the MCP server using FastMCP. 
Returns: List of tool definitions from the server """ if not self.server_url: return [] if not self._client: self._setup_client() try: tools = self._run_async_operation("list_tools") self.available_tools = tools return self.available_tools except Exception as e: raise Exception(f"Failed to discover tools from MCP server: {str(e)}") def execute_action(self, action_name: str, **kwargs) -> Any: if not self.server_url: raise Exception("No MCP server configured") if not self._client: self._setup_client() cleaned_kwargs = {} for key, value in kwargs.items(): if value == "" or value is None: continue cleaned_kwargs[key] = value try: result = self._run_async_operation( "call_tool", action_name, **cleaned_kwargs ) return self._format_result(result) except Exception as e: error_msg = str(e) lower_msg = error_msg.lower() is_auth_error = ( "401" in error_msg or "unauthorized" in lower_msg or "session expired" in lower_msg or "re-authorize" in lower_msg ) if is_auth_error: if self.auth_type == "oauth": raise Exception( f"Action '{action_name}' failed: OAuth session expired. " "Please re-authorize this MCP server in tool settings." ) from e global _mcp_clients_cache _mcp_clients_cache.pop(self._cache_key, None) self._client = None self._setup_client() try: result = self._run_async_operation( "call_tool", action_name, **cleaned_kwargs ) return self._format_result(result) except Exception as retry_e: raise Exception( f"Action '{action_name}' failed after re-auth attempt: {retry_e}. " "Your credentials may have expired — please re-authorize in tool settings." 
) from retry_e raise Exception( f"Failed to execute action '{action_name}': {error_msg}" ) from e def _format_result(self, result) -> Dict: """Format FastMCP result to match expected format.""" if hasattr(result, "content"): content_list = [] for content_item in result.content: if hasattr(content_item, "text"): content_list.append({"type": "text", "text": content_item.text}) elif hasattr(content_item, "data"): content_list.append({"type": "data", "data": content_item.data}) else: content_list.append( {"type": "unknown", "content": str(content_item)} ) return { "content": content_list, "isError": getattr(result, "isError", False), } else: return result def test_connection(self) -> Dict: if not self.server_url: return { "success": False, "message": "No server URL configured", "tools_count": 0, } try: parsed = urlparse(self.server_url) if parsed.scheme not in ("http", "https"): return { "success": False, "message": f"Invalid URL scheme '{parsed.scheme}' — use http:// or https://", "tools_count": 0, } except Exception: return { "success": False, "message": "Invalid URL format", "tools_count": 0, } if not self._client: try: self._setup_client() except Exception as e: return { "success": False, "message": f"Client init failed: {str(e)}", "tools_count": 0, } try: if self.auth_type == "oauth": return self._test_oauth_connection() else: return self._test_regular_connection() except Exception as e: return { "success": False, "message": f"Connection failed: {str(e)}", "tools_count": 0, } def _test_regular_connection(self) -> Dict: ping_ok = False ping_error = None try: self._run_async_operation("ping") ping_ok = True except Exception as e: ping_error = str(e) try: tools = self.discover_tools() except Exception as e: return { "success": False, "message": f"Connection failed: {ping_error or str(e)}", "tools_count": 0, } if not tools and not ping_ok: return { "success": False, "message": f"Connection failed: {ping_error or 'No tools found'}", "tools_count": 0, } return { 
"success": True, "message": f"Connected — found {len(tools)} tool{'s' if len(tools) != 1 else ''}.", "tools_count": len(tools), "tools": [ { "name": tool.get("name", "unknown"), "description": tool.get("description", ""), } for tool in tools ], } def _test_oauth_connection(self) -> Dict: storage = DBTokenStorage( server_url=self.server_url, user_id=self.user_id, db_client=db ) loop = asyncio.new_event_loop() try: tokens = loop.run_until_complete(storage.get_tokens()) finally: loop.close() if tokens and tokens.access_token: self.query_mode = True _mcp_clients_cache.pop(self._cache_key, None) self._client = None self._setup_client() try: tools = self.discover_tools() return { "success": True, "message": f"Connected — found {len(tools)} tool{'s' if len(tools) != 1 else ''}.", "tools_count": len(tools), "tools": [ { "name": t.get("name", "unknown"), "description": t.get("description", ""), } for t in tools ], } except Exception as e: logger.warning("OAuth token validation failed: %s", e) _mcp_clients_cache.pop(self._cache_key, None) self._client = None return self._start_oauth_task() def _start_oauth_task(self) -> Dict: task_config = self.config.copy() task_config.pop("query_mode", None) result = mcp_oauth_task.delay(task_config, self.user_id) return { "success": False, "requires_oauth": True, "task_id": result.id, "message": "OAuth authorization required.", "tools_count": 0, } def get_actions_metadata(self) -> List[Dict]: """ Get metadata for all available actions. 
Returns: List of action metadata dictionaries """ actions = [] for tool in self.available_tools: input_schema = ( tool.get("inputSchema") or tool.get("input_schema") or tool.get("schema") or tool.get("parameters") ) parameters_schema = { "type": "object", "properties": {}, "required": [], } if input_schema: if isinstance(input_schema, dict): if "properties" in input_schema: parameters_schema = { "type": input_schema.get("type", "object"), "properties": input_schema.get("properties", {}), "required": input_schema.get("required", []), } for key in ["additionalProperties", "description"]: if key in input_schema: parameters_schema[key] = input_schema[key] else: parameters_schema["properties"] = input_schema action = { "name": tool.get("name", ""), "description": tool.get("description", ""), "parameters": parameters_schema, } actions.append(action) return actions def get_config_requirements(self) -> Dict: return { "server_url": { "type": "string", "label": "Server URL", "description": "URL of the remote MCP server", "required": True, "secret": False, "order": 1, }, "auth_type": { "type": "string", "label": "Authentication Type", "description": "Authentication method for the MCP server", "enum": ["none", "bearer", "oauth", "api_key", "basic"], "default": "none", "required": True, "secret": False, "order": 2, }, "api_key": { "type": "string", "label": "API Key", "description": "API key for authentication", "required": False, "secret": True, "order": 3, "depends_on": {"auth_type": "api_key"}, }, "api_key_header": { "type": "string", "label": "API Key Header", "description": "Header name for API key (default: X-API-Key)", "default": "X-API-Key", "required": False, "secret": False, "order": 4, "depends_on": {"auth_type": "api_key"}, }, "bearer_token": { "type": "string", "label": "Bearer Token", "description": "Bearer token for authentication", "required": False, "secret": True, "order": 3, "depends_on": {"auth_type": "bearer"}, }, "username": { "type": "string", "label": 
"Username", "description": "Username for basic authentication", "required": False, "secret": False, "order": 3, "depends_on": {"auth_type": "basic"}, }, "password": { "type": "string", "label": "Password", "description": "Password for basic authentication", "required": False, "secret": True, "order": 4, "depends_on": {"auth_type": "basic"}, }, "oauth_scopes": { "type": "string", "label": "OAuth Scopes", "description": "Comma-separated OAuth scopes to request", "required": False, "secret": False, "order": 3, "depends_on": {"auth_type": "oauth"}, }, "timeout": { "type": "number", "label": "Timeout (seconds)", "description": "Request timeout in seconds (1-300)", "default": 30, "required": False, "secret": False, "order": 10, }, } class DocsGPTOAuth(OAuthClientProvider): """ Custom OAuth handler for DocsGPT that uses frontend redirect instead of browser. """ def __init__( self, mcp_url: str, redirect_uri: str, redis_client: Redis | None = None, redis_prefix: str = "mcp_oauth:", task_id: str = None, scopes: str | list[str] | None = None, client_name: str = "DocsGPT-MCP", user_id=None, db=None, additional_client_metadata: dict[str, Any] | None = None, skip_redirect_validation: bool = False, ): self.redirect_uri = redirect_uri self.redis_client = redis_client self.redis_prefix = redis_prefix self.task_id = task_id self.user_id = user_id self.db = db parsed_url = urlparse(mcp_url) self.server_base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" if isinstance(scopes, list): scopes = " ".join(scopes) client_metadata = OAuthClientMetadata( client_name=client_name, redirect_uris=[AnyHttpUrl(redirect_uri)], grant_types=["authorization_code", "refresh_token"], response_types=["code"], scope=scopes, **(additional_client_metadata or {}), ) storage = DBTokenStorage( server_url=self.server_base_url, user_id=self.user_id, db_client=self.db, expected_redirect_uri=None if skip_redirect_validation else redirect_uri, ) super().__init__( server_url=self.server_base_url, 
client_metadata=client_metadata, storage=storage, redirect_handler=self.redirect_handler, callback_handler=self.callback_handler, ) self.auth_url = None self.extracted_state = None def _process_auth_url(self, authorization_url: str) -> tuple[str, str]: """Process authorization URL to extract state""" try: parsed_url = urlparse(authorization_url) query_params = parse_qs(parsed_url.query) state_params = query_params.get("state", []) if state_params: state = state_params[0] else: raise ValueError("No state in auth URL") return authorization_url, state except Exception as e: raise Exception(f"Failed to process auth URL: {e}") async def redirect_handler(self, authorization_url: str) -> None: """Store auth URL and state in Redis for frontend to use.""" auth_url, state = self._process_auth_url(authorization_url) logger.info("Processed auth_url: %s, state: %s", auth_url, state) self.auth_url = auth_url self.extracted_state = state if self.redis_client and self.extracted_state: key = f"{self.redis_prefix}auth_url:{self.extracted_state}" self.redis_client.setex(key, 600, auth_url) logger.info("Stored auth_url in Redis: %s", key) if self.task_id: status_key = f"mcp_oauth_status:{self.task_id}" status_data = { "status": "requires_redirect", "message": "Authorization required", "authorization_url": self.auth_url, "state": self.extracted_state, "requires_oauth": True, "task_id": self.task_id, } self.redis_client.setex(status_key, 600, json.dumps(status_data)) async def callback_handler(self) -> tuple[str, str | None]: """Wait for auth code from Redis using the state value.""" if not self.redis_client or not self.extracted_state: raise Exception("Redis client or state not configured for OAuth") poll_interval = 1 max_wait_time = 300 code_key = f"{self.redis_prefix}code:{self.extracted_state}" if self.task_id: status_key = f"mcp_oauth_status:{self.task_id}" status_data = { "status": "awaiting_callback", "message": "Waiting for authorization...", "authorization_url": self.auth_url, 
"state": self.extracted_state, "requires_oauth": True, "task_id": self.task_id, } self.redis_client.setex(status_key, 600, json.dumps(status_data)) start_time = time.time() while time.time() - start_time < max_wait_time: code_data = self.redis_client.get(code_key) if code_data: code = code_data.decode() returned_state = self.extracted_state self.redis_client.delete(code_key) self.redis_client.delete( f"{self.redis_prefix}auth_url:{self.extracted_state}" ) self.redis_client.delete( f"{self.redis_prefix}state:{self.extracted_state}" ) if self.task_id: status_data = { "status": "callback_received", "message": "Completing authentication...", "task_id": self.task_id, } self.redis_client.setex(status_key, 600, json.dumps(status_data)) return code, returned_state error_key = f"{self.redis_prefix}error:{self.extracted_state}" error_data = self.redis_client.get(error_key) if error_data: error_msg = error_data.decode() self.redis_client.delete(error_key) self.redis_client.delete( f"{self.redis_prefix}auth_url:{self.extracted_state}" ) self.redis_client.delete( f"{self.redis_prefix}state:{self.extracted_state}" ) raise Exception(f"OAuth error: {error_msg}") await asyncio.sleep(poll_interval) self.redis_client.delete(f"{self.redis_prefix}auth_url:{self.extracted_state}") self.redis_client.delete(f"{self.redis_prefix}state:{self.extracted_state}") raise Exception("OAuth timeout: no code received within 5 minutes") class NonInteractiveOAuth(DocsGPTOAuth): """OAuth provider that fails fast on 401 instead of starting interactive auth. Used during query execution to prevent the streaming response from blocking while waiting for user authorization that will never come. """ def __init__(self, **kwargs): kwargs.setdefault("task_id", None) kwargs["skip_redirect_validation"] = True super().__init__(**kwargs) async def redirect_handler(self, authorization_url: str) -> None: raise Exception( "OAuth session expired — please re-authorize this MCP server in tool settings." 
) async def callback_handler(self) -> tuple[str, str | None]: raise Exception( "OAuth session expired — please re-authorize this MCP server in tool settings." ) class DBTokenStorage(TokenStorage): def __init__( self, server_url: str, user_id: str, db_client, expected_redirect_uri: Optional[str] = None, ): self.server_url = server_url self.user_id = user_id self.db_client = db_client self.expected_redirect_uri = expected_redirect_uri self.collection = db_client["connector_sessions"] @staticmethod def get_base_url(url: str) -> str: parsed = urlparse(url) return f"{parsed.scheme}://{parsed.netloc}" def get_db_key(self) -> dict: return { "server_url": self.get_base_url(self.server_url), "user_id": self.user_id, } async def get_tokens(self) -> OAuthToken | None: doc = await asyncio.to_thread(self.collection.find_one, self.get_db_key()) if not doc or "tokens" not in doc: return None try: return OAuthToken.model_validate(doc["tokens"]) except ValidationError as e: logger.error("Could not load tokens: %s", e) return None async def set_tokens(self, tokens: OAuthToken) -> None: await asyncio.to_thread( self.collection.update_one, self.get_db_key(), {"$set": {"tokens": tokens.model_dump()}}, True, ) logger.info("Saved tokens for %s", self.get_base_url(self.server_url)) async def get_client_info(self) -> OAuthClientInformationFull | None: doc = await asyncio.to_thread(self.collection.find_one, self.get_db_key()) if not doc or "client_info" not in doc: logger.debug( "No client_info in DB for %s", self.get_base_url(self.server_url) ) return None try: client_info = OAuthClientInformationFull.model_validate(doc["client_info"]) if self.expected_redirect_uri: stored_uris = [ str(uri).rstrip("/") for uri in client_info.redirect_uris ] expected_uri = self.expected_redirect_uri.rstrip("/") if expected_uri not in stored_uris: logger.warning( "Redirect URI mismatch for %s: expected=%s stored=%s — clearing.", self.get_base_url(self.server_url), expected_uri, stored_uris, ) await 
asyncio.to_thread( self.collection.update_one, self.get_db_key(), {"$unset": {"client_info": "", "tokens": ""}}, ) return None return client_info except ValidationError as e: logger.error("Could not load client info: %s", e) return None def _serialize_client_info(self, info: dict) -> dict: if "redirect_uris" in info and isinstance(info["redirect_uris"], list): info["redirect_uris"] = [str(u) for u in info["redirect_uris"]] return info async def set_client_info(self, client_info: OAuthClientInformationFull) -> None: serialized_info = self._serialize_client_info(client_info.model_dump()) await asyncio.to_thread( self.collection.update_one, self.get_db_key(), {"$set": {"client_info": serialized_info}}, True, ) logger.info("Saved client info for %s", self.get_base_url(self.server_url)) async def clear(self) -> None: await asyncio.to_thread(self.collection.delete_one, self.get_db_key()) logger.info("Cleared OAuth cache for %s", self.get_base_url(self.server_url)) @classmethod async def clear_all(cls, db_client) -> None: collection = db_client["connector_sessions"] await asyncio.to_thread(collection.delete_many, {}) logger.info("Cleared all OAuth client cache data.") class MCPOAuthManager: """Manager for handling MCP OAuth callbacks.""" def __init__(self, redis_client: Redis | None, redis_prefix: str = "mcp_oauth:"): self.redis_client = redis_client self.redis_prefix = redis_prefix def handle_oauth_callback( self, state: str, code: str, error: Optional[str] = None ) -> bool: """ Handle OAuth callback from provider. 
def get_oauth_status(self, task_id: str) -> Dict[str, Any]:
    """Report the state of an OAuth flow identified by ``task_id``.

    A truthy ``task_id`` is forwarded to ``mcp_oauth_status_task`` and its
    result is returned as-is; a missing/empty id yields a fixed
    ``not_started`` payload.
    """
    if task_id:
        return mcp_oauth_status_task(task_id)
    return {"status": "not_started", "message": "OAuth flow not started"}
def execute_action(self, action_name: str, **kwargs: Any) -> str:
    """Route a memory action to the matching private handler.

    Args:
        action_name: One of view, create, str_replace, insert, delete, rename.
        **kwargs: Action parameters; anything omitted falls back to the
            per-action default used below.

    Returns:
        Whatever the selected handler returns, or an error string when the
        tool has no user id or the action name is not recognised.
    """
    if not self.user_id:
        return "Error: MemoryTool requires a valid user_id."

    arg = kwargs.get
    # Handlers are wrapped in lambdas so only the selected one is evaluated.
    handlers = {
        "view": lambda: self._view(arg("path", "/"), arg("view_range")),
        "create": lambda: self._create(arg("path", ""), arg("file_text", "")),
        "str_replace": lambda: self._str_replace(
            arg("path", ""), arg("old_str", ""), arg("new_str", "")
        ),
        "insert": lambda: self._insert(
            arg("path", ""), arg("insert_line", 1), arg("insert_text", "")
        ),
        "delete": lambda: self._delete(arg("path", "")),
        "rename": lambda: self._rename(arg("old_path", ""), arg("new_path", "")),
    }

    run = handlers.get(action_name)
    if run is None:
        return f"Unknown action: {action_name}"
    return run()
"description": "Path to file or directory (e.g., /notes.txt or /project/ or /)." }, "view_range": { "type": "array", "items": {"type": "integer"}, "description": "Optional [start_line, end_line] to view specific lines (1-indexed)." } }, "required": ["path"] }, }, { "name": "create", "description": "Create or overwrite a file.", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path to create (e.g., /notes.txt or /project/task.txt)." }, "file_text": { "type": "string", "description": "Content to write to the file." } }, "required": ["path", "file_text"] }, }, { "name": "str_replace", "description": "Replace text in a file.", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path (e.g., /notes.txt)." }, "old_str": { "type": "string", "description": "String to find." }, "new_str": { "type": "string", "description": "String to replace with." } }, "required": ["path", "old_str", "new_str"] }, }, { "name": "insert", "description": "Insert text at a specific line in a file.", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path (e.g., /notes.txt)." }, "insert_line": { "type": "integer", "description": "Line number to insert at (1-indexed)." }, "insert_text": { "type": "string", "description": "Text to insert." } }, "required": ["path", "insert_line", "insert_text"] }, }, { "name": "delete", "description": "Delete a file or directory.", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "Path to delete (e.g., /notes.txt or /project/)." } }, "required": ["path"] }, }, { "name": "rename", "description": "Rename or move a file/directory.", "parameters": { "type": "object", "properties": { "old_path": { "type": "string", "description": "Current path (e.g., /old.txt)." }, "new_path": { "type": "string", "description": "New path (e.g., /new.txt)." 
} }, "required": ["old_path", "new_path"] }, }, ] def get_config_requirements(self) -> Dict[str, Any]: """Return configuration requirements.""" return {} # ----------------------------- # Path validation # ----------------------------- def _validate_path(self, path: str) -> Optional[str]: """Validate and normalize path. Args: path: User-provided path. Returns: Normalized path or None if invalid. """ if not path: return None # Remove any leading/trailing whitespace path = path.strip() # Preserve whether path ends with / (indicates directory) is_directory = path.endswith("/") # Ensure path starts with / for consistency if not path.startswith("/"): path = "/" + path # Check for directory traversal patterns if ".." in path or path.count("//") > 0: return None # Normalize the path try: # Convert to Path object and resolve to canonical form normalized = str(Path(path).as_posix()) # Ensure it still starts with / if not normalized.startswith("/"): return None # Preserve trailing slash for directories if is_directory and not normalized.endswith("/") and normalized != "/": normalized = normalized + "/" return normalized except Exception: return None # ----------------------------- # Internal helpers # ----------------------------- def _view(self, path: str, view_range: Optional[List[int]] = None) -> str: """View directory contents or file contents.""" validated_path = self._validate_path(path) if not validated_path: return "Error: Invalid path." 
# Check if viewing directory (ends with / or is root) if validated_path == "/" or validated_path.endswith("/"): return self._view_directory(validated_path) # Otherwise view file return self._view_file(validated_path, view_range) def _view_directory(self, path: str) -> str: """List files in a directory.""" # Ensure path ends with / for proper prefix matching search_path = path if path.endswith("/") else path + "/" # Find all files that start with this directory path query = { "user_id": self.user_id, "tool_id": self.tool_id, "path": {"$regex": f"^{re.escape(search_path)}"} } docs = list(self.collection.find(query, {"path": 1})) if not docs: return f"Directory: {path}\n(empty)" # Extract filenames relative to the directory files = [] for doc in docs: file_path = doc["path"] # Remove the directory prefix if file_path.startswith(search_path): relative = file_path[len(search_path):] if relative: files.append(relative) files.sort() file_list = "\n".join(f"- {f}" for f in files) return f"Directory: {path}\n{file_list}" def _view_file(self, path: str, view_range: Optional[List[int]] = None) -> str: """View file contents with optional line range.""" doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id, "path": path}) if not doc or not doc.get("content"): return f"Error: File not found: {path}" content = str(doc["content"]) # Apply view_range if specified if view_range and len(view_range) == 2: lines = content.split("\n") start, end = view_range # Convert to 0-indexed start_idx = max(0, start - 1) end_idx = min(len(lines), end) if start_idx >= len(lines): return f"Error: Line range out of bounds. File has {len(lines)} lines." 
selected_lines = lines[start_idx:end_idx] # Add line numbers (enumerate with 1-based start) numbered_lines = [f"{i}: {line}" for i, line in enumerate(selected_lines, start=start)] return "\n".join(numbered_lines) return content def _create(self, path: str, file_text: str) -> str: """Create or overwrite a file.""" validated_path = self._validate_path(path) if not validated_path: return "Error: Invalid path." if validated_path == "/" or validated_path.endswith("/"): return "Error: Cannot create a file at directory path." self.collection.update_one( {"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path}, { "$set": { "content": file_text, "updated_at": datetime.now() } }, upsert=True ) return f"File created: {validated_path}" def _str_replace(self, path: str, old_str: str, new_str: str) -> str: """Replace text in a file.""" validated_path = self._validate_path(path) if not validated_path: return "Error: Invalid path." if not old_str: return "Error: old_str is required." doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path}) if not doc or not doc.get("content"): return f"Error: File not found: {validated_path}" current_content = str(doc["content"]) # Check if old_str exists (case-insensitive) if old_str.lower() not in current_content.lower(): return f"Error: String '{old_str}' not found in file." # Replace the string (case-insensitive) import re as regex_module updated_content = regex_module.sub(regex_module.escape(old_str), new_str, current_content, flags=regex_module.IGNORECASE) self.collection.update_one( {"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path}, { "$set": { "content": updated_content, "updated_at": datetime.now() } } ) return f"File updated: {validated_path}" def _insert(self, path: str, insert_line: int, insert_text: str) -> str: """Insert text at a specific line.""" validated_path = self._validate_path(path) if not validated_path: return "Error: Invalid path." 
if not insert_text: return "Error: insert_text is required." doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path}) if not doc or not doc.get("content"): return f"Error: File not found: {validated_path}" current_content = str(doc["content"]) lines = current_content.split("\n") # Convert to 0-indexed index = insert_line - 1 if index < 0 or index > len(lines): return f"Error: Invalid line number. File has {len(lines)} lines." lines.insert(index, insert_text) updated_content = "\n".join(lines) self.collection.update_one( {"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path}, { "$set": { "content": updated_content, "updated_at": datetime.now() } } ) return f"Text inserted at line {insert_line} in {validated_path}" def _delete(self, path: str) -> str: """Delete a file or directory.""" validated_path = self._validate_path(path) if not validated_path: return "Error: Invalid path." if validated_path == "/": # Delete all files for this user and tool result = self.collection.delete_many({"user_id": self.user_id, "tool_id": self.tool_id}) return f"Deleted {result.deleted_count} file(s) from memory." # Check if it's a directory (ends with /) if validated_path.endswith("/"): # Delete all files in directory result = self.collection.delete_many({ "user_id": self.user_id, "tool_id": self.tool_id, "path": {"$regex": f"^{re.escape(validated_path)}"} }) return f"Deleted directory and {result.deleted_count} file(s)." # Try to delete as directory first (without trailing slash) # Check if any files start with this path + / search_path = validated_path + "/" directory_result = self.collection.delete_many({ "user_id": self.user_id, "tool_id": self.tool_id, "path": {"$regex": f"^{re.escape(search_path)}"} }) if directory_result.deleted_count > 0: return f"Deleted directory and {directory_result.deleted_count} file(s)." 
# Delete single file result = self.collection.delete_one({ "user_id": self.user_id, "tool_id": self.tool_id, "path": validated_path }) if result.deleted_count: return f"Deleted: {validated_path}" return f"Error: File not found: {validated_path}" def _rename(self, old_path: str, new_path: str) -> str: """Rename or move a file/directory.""" validated_old = self._validate_path(old_path) validated_new = self._validate_path(new_path) if not validated_old or not validated_new: return "Error: Invalid path." if validated_old == "/" or validated_new == "/": return "Error: Cannot rename root directory." # Check if renaming a directory if validated_old.endswith("/"): # Ensure validated_new also ends with / for proper path replacement if not validated_new.endswith("/"): validated_new = validated_new + "/" # Find all files in the old directory docs = list(self.collection.find({ "user_id": self.user_id, "tool_id": self.tool_id, "path": {"$regex": f"^{re.escape(validated_old)}"} })) if not docs: return f"Error: Directory not found: {validated_old}" # Update paths for all files for doc in docs: old_file_path = doc["path"] new_file_path = old_file_path.replace(validated_old, validated_new, 1) self.collection.update_one( {"_id": doc["_id"]}, {"$set": {"path": new_file_path, "updated_at": datetime.now()}} ) return f"Renamed directory: {validated_old} -> {validated_new} ({len(docs)} files)" # Rename single file doc = self.collection.find_one({ "user_id": self.user_id, "tool_id": self.tool_id, "path": validated_old }) if not doc: return f"Error: File not found: {validated_old}" # Check if new path already exists existing = self.collection.find_one({ "user_id": self.user_id, "tool_id": self.tool_id, "path": validated_new }) if existing: return f"Error: File already exists at {validated_new}" # Delete the old document and create a new one with the new path self.collection.delete_one({"user_id": self.user_id, "tool_id": self.tool_id, "path": validated_old}) self.collection.insert_one({ 
def execute_action(self, action_name: str, **kwargs: Any) -> str:
    """Route a notes action to the matching private handler.

    Args:
        action_name: One of view, overwrite, str_replace, insert, delete.
        **kwargs: Action parameters; anything omitted falls back to the
            per-action default used below.

    Returns:
        The handler's result, or an error string when the tool has no user id
        or the action name is not recognised.
    """
    if not self.user_id:
        return "Error: NotesTool requires a valid user_id."

    # Every accepted call starts from a clean slate; handlers set the
    # artifact id again when they touch a stored note.
    self._last_artifact_id = None

    arg = kwargs.get
    handlers = {
        "view": lambda: self._get_note(),
        "overwrite": lambda: self._overwrite_note(arg("text", "")),
        "str_replace": lambda: self._str_replace(arg("old_str", ""), arg("new_str", "")),
        "insert": lambda: self._insert(arg("line_number", 1), arg("text", "")),
        "delete": lambda: self._delete_note(),
    }

    run = handlers.get(action_name)
    if run is None:
        return f"Unknown action: {action_name}"
    return run()
str, **kwargs: Any) -> Optional[str]: return self._last_artifact_id # ----------------------------- # Internal helpers (single-note) # ----------------------------- def _get_note(self) -> str: doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id}) if not doc or not doc.get("note"): return "No note found." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) return str(doc["note"]) def _overwrite_note(self, content: str) -> str: content = (content or "").strip() if not content: return "Note content required." result = self.collection.find_one_and_update( {"user_id": self.user_id, "tool_id": self.tool_id}, {"$set": {"note": content, "updated_at": datetime.utcnow()}}, upsert=True, return_document=True, ) if result and result.get("_id") is not None: self._last_artifact_id = str(result.get("_id")) return "Note saved." def _str_replace(self, old_str: str, new_str: str) -> str: if not old_str: return "old_str is required." doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id}) if not doc or not doc.get("note"): return "No note found." current_note = str(doc["note"]) # Case-insensitive search if old_str.lower() not in current_note.lower(): return f"String '{old_str}' not found in note." # Case-insensitive replacement import re updated_note = re.sub(re.escape(old_str), new_str, current_note, flags=re.IGNORECASE) result = self.collection.find_one_and_update( {"user_id": self.user_id, "tool_id": self.tool_id}, {"$set": {"note": updated_note, "updated_at": datetime.utcnow()}}, return_document=True, ) if result and result.get("_id") is not None: self._last_artifact_id = str(result.get("_id")) return "Note updated." def _insert(self, line_number: int, text: str) -> str: if not text: return "Text is required." doc = self.collection.find_one({"user_id": self.user_id, "tool_id": self.tool_id}) if not doc or not doc.get("note"): return "No note found." 
current_note = str(doc["note"]) lines = current_note.split("\n") # Convert to 0-indexed and validate index = line_number - 1 if index < 0 or index > len(lines): return f"Invalid line number. Note has {len(lines)} lines." lines.insert(index, text) updated_note = "\n".join(lines) result = self.collection.find_one_and_update( {"user_id": self.user_id, "tool_id": self.tool_id}, {"$set": {"note": updated_note, "updated_at": datetime.utcnow()}}, return_document=True, ) if result and result.get("_id") is not None: self._last_artifact_id = str(result.get("_id")) return "Text inserted." def _delete_note(self) -> str: doc = self.collection.find_one_and_delete( {"user_id": self.user_id, "tool_id": self.tool_id} ) if not doc: return "No note found to delete." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) return "Note deleted." ================================================ FILE: application/agents/tools/ntfy.py ================================================ import requests from application.agents.tools.base import Tool class NtfyTool(Tool): """ Ntfy Tool A tool for sending notifications to ntfy topics on a specified server. """ def __init__(self, config): """ Initialize the NtfyTool with configuration. Args: config (dict): Configuration dictionary containing the access token. """ self.config = config self.token = config.get("token", "") def execute_action(self, action_name, **kwargs): """ Execute the specified action with given parameters. Args: action_name (str): Name of the action to execute. **kwargs: Parameters for the action, including server_url. Returns: dict: Result of the action with status code and message. Raises: ValueError: If the action name is unknown. 
""" actions = { "ntfy_send_message": self._send_message, } if action_name in actions: return actions[action_name](**kwargs) else: raise ValueError(f"Unknown action: {action_name}") def _send_message(self, server_url, message, topic, title=None, priority=None): """ Send a message to an ntfy topic on the specified server. Args: server_url (str): Base URL of the ntfy server (e.g., https://ntfy.sh). message (str): The message text to send. topic (str): The topic to send the message to. title (str, optional): Title of the notification. priority (int, optional): Priority of the notification (1-5). Returns: dict: Response with status code and a confirmation message. Raises: ValueError: If priority is not an integer between 1 and 5. """ url = f"{server_url.rstrip('/')}/{topic}" headers = {} if title: headers["X-Title"] = title if priority: try: priority = int(priority) except (ValueError, TypeError): raise ValueError("Priority must be convertible to an integer") if priority < 1 or priority > 5: raise ValueError("Priority must be an integer between 1 and 5") headers["X-Priority"] = str(priority) if self.token: headers["Authorization"] = f"Basic {self.token}" data = message.encode("utf-8") response = requests.post(url, headers=headers, data=data) return {"status_code": response.status_code, "message": "Message sent"} def get_actions_metadata(self): """ Provide metadata about available actions. Returns: list: List of dictionaries describing each action. 
""" return [ { "name": "ntfy_send_message", "description": "Send a notification to an ntfy topic", "parameters": { "type": "object", "properties": { "server_url": { "type": "string", "description": "Base URL of the ntfy server", }, "message": { "type": "string", "description": "Text to send in the notification", }, "topic": { "type": "string", "description": "Topic to send the notification to", }, "title": { "type": "string", "description": "Title of the notification (optional)", }, "priority": { "type": "integer", "description": "Priority of the notification (1-5, optional)", }, }, "required": ["server_url", "message", "topic"], "additionalProperties": False, }, }, ] def get_config_requirements(self): return { "token": { "type": "string", "label": "Access Token", "description": "Ntfy access token for authentication", "required": True, "secret": True, "order": 1, }, } ================================================ FILE: application/agents/tools/postgres.py ================================================ import logging import psycopg2 from application.agents.tools.base import Tool logger = logging.getLogger(__name__) class PostgresTool(Tool): """ PostgreSQL Database Tool A tool for connecting to a PostgreSQL database using a connection string, executing SQL queries, and retrieving schema information. """ def __init__(self, config): self.config = config self.connection_string = config.get("token", "") def execute_action(self, action_name, **kwargs): actions = { "postgres_execute_sql": self._execute_sql, "postgres_get_schema": self._get_schema, } if action_name not in actions: raise ValueError(f"Unknown action: {action_name}") return actions[action_name](**kwargs) def _execute_sql(self, sql_query): """ Executes an SQL query against the PostgreSQL database using a connection string. 
""" conn = None try: conn = psycopg2.connect(self.connection_string) cur = conn.cursor() cur.execute(sql_query) conn.commit() if sql_query.strip().lower().startswith("select"): column_names = ( [desc[0] for desc in cur.description] if cur.description else [] ) results = [] rows = cur.fetchall() for row in rows: results.append(dict(zip(column_names, row))) response_data = {"data": results, "column_names": column_names} else: row_count = cur.rowcount response_data = { "message": f"Query executed successfully, {row_count} rows affected." } cur.close() return { "status_code": 200, "message": "SQL query executed successfully.", "response_data": response_data, } except psycopg2.Error as e: error_message = f"Database error: {e}" logger.error("PostgreSQL execute_sql error: %s", e) return { "status_code": 500, "message": "Failed to execute SQL query.", "error": error_message, } finally: if conn: conn.close() def _get_schema(self, db_name): """ Retrieves the schema of the PostgreSQL database using a connection string. 
def get_actions_metadata(self):
    """Describe the two PostgreSQL actions as tool-schema dictionaries.

    Each action takes exactly one required string parameter and forbids
    additional properties.
    """

    def describe(name, description, parameter, parameter_description):
        # Both actions share the same single-string-parameter schema shape.
        return {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    parameter: {
                        "type": "string",
                        "description": parameter_description,
                    },
                },
                "required": [parameter],
                "additionalProperties": False,
            },
        }

    execute_sql = describe(
        "postgres_execute_sql",
        "Execute an SQL query against the PostgreSQL database and return the results. Use this tool to interact with the database, e.g., retrieve specific data or perform updates. Only SELECT queries will return data, other queries will return execution status.",
        "sql_query",
        "The SQL query to execute.",
    )
    get_schema = describe(
        "postgres_get_schema",
        "Retrieve the schema of the PostgreSQL database, including tables and their columns. Use this to understand the database structure before executing queries. db_name is 'default' if not provided.",
        "db_name",
        "The name of the database to retrieve the schema for.",
    )
    return [execute_sql, get_schema]
def get_actions_metadata(self):
    """
    Describe the single ``read_webpage`` action as a tool-schema dictionary.
    """
    url_parameter = {
        "type": "string",
        "description": "The fully qualified URL of the webpage to read (e.g., 'https://www.example.com').",
    }
    read_webpage = {
        "name": "read_webpage",
        "description": "Fetches the HTML content of a given URL and returns it as clean Markdown text. Input must be a valid URL.",
        "parameters": {
            "type": "object",
            "properties": {"url": url_parameter},
            "required": ["url"],
            "additionalProperties": False,
        },
    }
    return [read_webpage]
""" import json import logging import re from typing import Any, Dict, List, Optional, Tuple import yaml logger = logging.getLogger(__name__) SUPPORTED_METHODS = frozenset( {"get", "post", "put", "delete", "patch", "head", "options"} ) def parse_spec(spec_content: str) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: """ Parse an API specification and convert operations to action definitions. Supports OpenAPI 3.x and Swagger 2.0 formats in JSON or YAML. Args: spec_content: Raw specification content as string Returns: Tuple of (metadata dict, list of action dicts) Raises: ValueError: If the spec is invalid or uses an unsupported format """ spec = _load_spec(spec_content) _validate_spec(spec) is_swagger = "swagger" in spec metadata = _extract_metadata(spec, is_swagger) actions = _extract_actions(spec, is_swagger) return metadata, actions def _load_spec(content: str) -> Dict[str, Any]: """Parse spec content from JSON or YAML string.""" content = content.strip() if not content: raise ValueError("Empty specification content") try: if content.startswith("{"): return json.loads(content) return yaml.safe_load(content) except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON format: {e.msg}") except yaml.YAMLError as e: raise ValueError(f"Invalid YAML format: {e}") def _validate_spec(spec: Dict[str, Any]) -> None: """Validate spec version and required fields.""" if not isinstance(spec, dict): raise ValueError("Specification must be a valid object") openapi_version = spec.get("openapi", "") swagger_version = spec.get("swagger", "") if not (openapi_version.startswith("3.") or swagger_version == "2.0"): raise ValueError( "Unsupported specification version. 
Expected OpenAPI 3.x or Swagger 2.0" ) if "paths" not in spec or not spec["paths"]: raise ValueError("No API paths defined in the specification") def _extract_metadata(spec: Dict[str, Any], is_swagger: bool) -> Dict[str, Any]: """Extract API metadata from specification.""" info = spec.get("info", {}) base_url = _get_base_url(spec, is_swagger) return { "title": info.get("title", "Untitled API"), "description": (info.get("description", "") or "")[:500], "version": info.get("version", ""), "base_url": base_url, } def _get_base_url(spec: Dict[str, Any], is_swagger: bool) -> str: """Extract base URL from spec (handles both OpenAPI 3.x and Swagger 2.0).""" if is_swagger: schemes = spec.get("schemes", ["https"]) host = spec.get("host", "") base_path = spec.get("basePath", "") if host: scheme = schemes[0] if schemes else "https" return f"{scheme}://{host}{base_path}".rstrip("/") return "" servers = spec.get("servers", []) if servers and isinstance(servers, list) and servers[0].get("url"): return servers[0]["url"].rstrip("/") return "" def _extract_actions(spec: Dict[str, Any], is_swagger: bool) -> List[Dict[str, Any]]: """Extract all API operations as action definitions.""" actions = [] paths = spec.get("paths", {}) base_url = _get_base_url(spec, is_swagger) components = spec.get("components", {}) definitions = spec.get("definitions", {}) for path, path_item in paths.items(): if not isinstance(path_item, dict): continue path_params = path_item.get("parameters", []) for method in SUPPORTED_METHODS: operation = path_item.get(method) if not isinstance(operation, dict): continue try: action = _build_action( path=path, method=method, operation=operation, path_params=path_params, base_url=base_url, components=components, definitions=definitions, is_swagger=is_swagger, ) actions.append(action) except Exception as e: logger.warning( f"Failed to parse operation {method.upper()} {path}: {e}" ) continue return actions def _build_action( path: str, method: str, operation: Dict[str, 
Any], path_params: List[Dict], base_url: str, components: Dict[str, Any], definitions: Dict[str, Any], is_swagger: bool, ) -> Dict[str, Any]: """Build a single action from an API operation.""" action_name = _generate_action_name(operation, method, path) full_url = f"{base_url}{path}" if base_url else path all_params = path_params + operation.get("parameters", []) query_params, headers = _categorize_parameters(all_params, components, definitions) body, body_content_type = _extract_request_body( operation, components, definitions, is_swagger ) description = operation.get("summary", "") or operation.get("description", "") return { "name": action_name, "url": full_url, "method": method.upper(), "description": (description or "")[:500], "query_params": {"type": "object", "properties": query_params}, "headers": {"type": "object", "properties": headers}, "body": {"type": "object", "properties": body}, "body_content_type": body_content_type, "active": True, } def _generate_action_name(operation: Dict[str, Any], method: str, path: str) -> str: """Generate a valid action name from operationId or method+path.""" if operation.get("operationId"): name = operation["operationId"] else: path_slug = re.sub(r"[{}]", "", path) path_slug = re.sub(r"[^a-zA-Z0-9]", "_", path_slug) path_slug = re.sub(r"_+", "_", path_slug).strip("_") name = f"{method}_{path_slug}" name = re.sub(r"[^a-zA-Z0-9_-]", "_", name) return name[:64] def _categorize_parameters( parameters: List[Dict], components: Dict[str, Any], definitions: Dict[str, Any], ) -> Tuple[Dict, Dict]: """Categorize parameters into query params and headers.""" query_params = {} headers = {} for param in parameters: resolved = _resolve_ref(param, components, definitions) if not resolved or "name" not in resolved: continue location = resolved.get("in", "query") prop = _param_to_property(resolved) if location in ("query", "path"): query_params[resolved["name"]] = prop elif location == "header": headers[resolved["name"]] = prop return 
query_params, headers def _param_to_property(param: Dict) -> Dict[str, Any]: """Convert an API parameter to an action property definition.""" schema = param.get("schema", {}) param_type = schema.get("type", param.get("type", "string")) mapped_type = "integer" if param_type in ("integer", "number") else "string" return { "type": mapped_type, "description": (param.get("description", "") or "")[:200], "value": "", "filled_by_llm": param.get("required", False), "required": param.get("required", False), } def _extract_request_body( operation: Dict[str, Any], components: Dict[str, Any], definitions: Dict[str, Any], is_swagger: bool, ) -> Tuple[Dict, str]: """Extract request body schema and content type.""" content_types = [ "application/json", "application/x-www-form-urlencoded", "multipart/form-data", "text/plain", "application/xml", ] if is_swagger: consumes = operation.get("consumes", []) body_param = next( (p for p in operation.get("parameters", []) if p.get("in") == "body"), None ) if not body_param: return {}, "application/json" selected_type = consumes[0] if consumes else "application/json" schema = body_param.get("schema", {}) else: request_body = operation.get("requestBody", {}) if not request_body: return {}, "application/json" request_body = _resolve_ref(request_body, components, definitions) content = request_body.get("content", {}) selected_type = "application/json" schema = {} for ct in content_types: if ct in content: selected_type = ct schema = content[ct].get("schema", {}) break if not schema and content: first_type = next(iter(content)) selected_type = first_type schema = content[first_type].get("schema", {}) properties = _schema_to_properties(schema, components, definitions) return properties, selected_type def _schema_to_properties( schema: Dict, components: Dict[str, Any], definitions: Dict[str, Any], depth: int = 0, ) -> Dict[str, Any]: """Convert schema to action body properties (limited depth to prevent recursion).""" if depth > 3: return {} 
schema = _resolve_ref(schema, components, definitions) if not schema or not isinstance(schema, dict): return {} properties = {} schema_type = schema.get("type", "object") if schema_type == "object": required_fields = set(schema.get("required", [])) for prop_name, prop_schema in schema.get("properties", {}).items(): resolved = _resolve_ref(prop_schema, components, definitions) if not isinstance(resolved, dict): continue prop_type = resolved.get("type", "string") mapped_type = "integer" if prop_type in ("integer", "number") else "string" properties[prop_name] = { "type": mapped_type, "description": (resolved.get("description", "") or "")[:200], "value": "", "filled_by_llm": prop_name in required_fields, "required": prop_name in required_fields, } return properties def _resolve_ref( obj: Any, components: Dict[str, Any], definitions: Dict[str, Any], ) -> Optional[Dict]: """Resolve $ref references in the specification.""" if not isinstance(obj, dict): return obj if isinstance(obj, dict) else None if "$ref" not in obj: return obj ref_path = obj["$ref"] if ref_path.startswith("#/components/"): parts = ref_path.replace("#/components/", "").split("/") return _traverse_path(components, parts) elif ref_path.startswith("#/definitions/"): parts = ref_path.replace("#/definitions/", "").split("/") return _traverse_path(definitions, parts) logger.debug(f"Unsupported ref path: {ref_path}") return None def _traverse_path(obj: Dict, parts: List[str]) -> Optional[Dict]: """Traverse a nested dictionary using path parts.""" try: for part in parts: obj = obj[part] return obj if isinstance(obj, dict) else None except (KeyError, TypeError): return None ================================================ FILE: application/agents/tools/telegram.py ================================================ import logging import requests from application.agents.tools.base import Tool logger = logging.getLogger(__name__) class TelegramTool(Tool): """ Telegram Bot A flexible Telegram tool for performing 
class TelegramTool(Tool):
    """
    Telegram Bot
    A flexible Telegram tool for performing various actions (e.g., sending messages, images).
    Requires a bot token and chat ID for configuration
    """

    # NOTE: only the bot token comes from the tool configuration; the chat ID
    # is supplied with each action call.
    _API_BASE_URL = "https://api.telegram.org"
    # Seconds to wait for the Telegram API so a stalled request cannot hang
    # the caller indefinitely.
    _REQUEST_TIMEOUT = 10

    def __init__(self, config):
        """Store the configuration and read the bot token from it."""
        self.config = config
        self.token = config.get("token", "")

    def execute_action(self, action_name, **kwargs):
        """Dispatch ``action_name`` to its handler.

        Raises:
            ValueError: If ``action_name`` is not a supported action.
        """
        actions = {
            "telegram_send_message": self._send_message,
            "telegram_send_image": self._send_image,
        }
        if action_name not in actions:
            raise ValueError(f"Unknown action: {action_name}")
        return actions[action_name](**kwargs)

    def _call_api(self, method, payload, success_message):
        """POST ``payload`` to a Bot API method and summarize the outcome.

        Returns a dict with the HTTP ``status_code`` and a ``message`` that is
        ``success_message`` only when the API answered with a success status.
        """
        url = f"{self._API_BASE_URL}/bot{self.token}/{method}"
        response = requests.post(url, data=payload, timeout=self._REQUEST_TIMEOUT)
        if response.ok:
            message = success_message
        else:
            # Surface the API's own error body instead of claiming success.
            message = f"Telegram API request failed: {response.text}"
        return {"status_code": response.status_code, "message": message}

    def _send_message(self, text, chat_id):
        """Send a text message to ``chat_id``."""
        logger.debug("Sending Telegram message to chat_id=%s", chat_id)
        payload = {"chat_id": chat_id, "text": text}
        return self._call_api("sendMessage", payload, "Message sent")

    def _send_image(self, image_url, chat_id):
        """Send the image at ``image_url`` to ``chat_id``."""
        logger.debug("Sending Telegram image to chat_id=%s", chat_id)
        payload = {"chat_id": chat_id, "photo": image_url}
        return self._call_api("sendPhoto", payload, "Image sent")

    def get_actions_metadata(self):
        """Return the JSON-schema style description of the supported actions."""
        return [
            {
                "name": "telegram_send_message",
                "description": "Send a notification to Telegram chat",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "text": {
                            "type": "string",
                            "description": "Text to send in the notification",
                        },
                        "chat_id": {
                            "type": "string",
                            "description": "Chat ID to send the notification to",
                        },
                    },
                    # chat_id is a mandatory handler argument, so declare it.
                    "required": ["text", "chat_id"],
                    "additionalProperties": False,
                },
            },
            {
                "name": "telegram_send_image",
                "description": "Send an image to the Telegram chat",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "image_url": {
                            "type": "string",
                            "description": "URL of the image to send",
                        },
                        "chat_id": {
                            "type": "string",
                            "description": "Chat ID to send the image to",
                        },
                    },
                    "required": ["image_url", "chat_id"],
                    "additionalProperties": False,
                },
            },
        ]

    def get_config_requirements(self):
        """Return the configuration fields this tool needs (the bot token)."""
        return {
            "token": {
                "type": "string",
                "label": "Bot Token",
                "description": "Telegram bot token for authentication",
                "required": True,
                "secret": True,
                "order": 1,
            },
        }
"description": "Telegram bot token for authentication", "required": True, "secret": True, "order": 1, }, } ================================================ FILE: application/agents/tools/todo_list.py ================================================ from datetime import datetime from typing import Any, Dict, List, Optional import uuid from .base import Tool from application.core.mongo_db import MongoDB from application.core.settings import settings class TodoListTool(Tool): """Todo List Manages todo items for users. Supports creating, viewing, updating, and deleting todos. """ def __init__(self, tool_config: Optional[Dict[str, Any]] = None, user_id: Optional[str] = None) -> None: """Initialize the tool. Args: tool_config: Optional tool configuration. Should include: - tool_id: Unique identifier for this todo list tool instance (from user_tools._id) This ensures each user's tool configuration has isolated todos user_id: The authenticated user's id (should come from decoded_token["sub"]). """ self.user_id: Optional[str] = user_id # Get tool_id from configuration (passed from user_tools._id in production) # In production, tool_id is the MongoDB ObjectId string from user_tools collection if tool_config and "tool_id" in tool_config: self.tool_id = tool_config["tool_id"] elif user_id: # Fallback for backward compatibility or testing self.tool_id = f"default_{user_id}" else: # Last resort fallback (shouldn't happen in normal use) self.tool_id = str(uuid.uuid4()) db = MongoDB.get_client()[settings.MONGO_DB_NAME] self.collection = db["todos"] self._last_artifact_id: Optional[str] = None # ----------------------------- # Action implementations # ----------------------------- def execute_action(self, action_name: str, **kwargs: Any) -> str: """Execute an action by name. Args: action_name: One of list, create, get, update, complete, delete. **kwargs: Parameters for the action. Returns: A human-readable string result. 
""" if not self.user_id: return "Error: TodoListTool requires a valid user_id." self._last_artifact_id = None if action_name == "list": return self._list() if action_name == "create": return self._create(kwargs.get("title", "")) if action_name == "get": return self._get(kwargs.get("todo_id")) if action_name == "update": return self._update( kwargs.get("todo_id"), kwargs.get("title", "") ) if action_name == "complete": return self._complete(kwargs.get("todo_id")) if action_name == "delete": return self._delete(kwargs.get("todo_id")) return f"Unknown action: {action_name}" def get_actions_metadata(self) -> List[Dict[str, Any]]: """Return JSON metadata describing supported actions for tool schemas.""" return [ { "name": "list", "description": "List all todos for the user.", "parameters": {"type": "object", "properties": {}}, }, { "name": "create", "description": "Create a new todo item.", "parameters": { "type": "object", "properties": { "title": { "type": "string", "description": "Title of the todo item." } }, "required": ["title"], }, }, { "name": "get", "description": "Get a specific todo by ID.", "parameters": { "type": "object", "properties": { "todo_id": { "type": "integer", "description": "The ID of the todo to retrieve." } }, "required": ["todo_id"], }, }, { "name": "update", "description": "Update a todo's title by ID.", "parameters": { "type": "object", "properties": { "todo_id": { "type": "integer", "description": "The ID of the todo to update." }, "title": { "type": "string", "description": "The new title for the todo." } }, "required": ["todo_id", "title"], }, }, { "name": "complete", "description": "Mark a todo as completed.", "parameters": { "type": "object", "properties": { "todo_id": { "type": "integer", "description": "The ID of the todo to mark as completed." 
} }, "required": ["todo_id"], }, }, { "name": "delete", "description": "Delete a specific todo by ID.", "parameters": { "type": "object", "properties": { "todo_id": { "type": "integer", "description": "The ID of the todo to delete." } }, "required": ["todo_id"], }, }, ] def get_config_requirements(self) -> Dict[str, Any]: """Return configuration requirements.""" return {} def get_artifact_id(self, action_name: str, **kwargs: Any) -> Optional[str]: return self._last_artifact_id # ----------------------------- # Internal helpers # ----------------------------- def _coerce_todo_id(self, value: Optional[Any]) -> Optional[int]: """Convert todo identifiers to sequential integers.""" if value is None: return None if isinstance(value, int): return value if value > 0 else None if isinstance(value, str): stripped = value.strip() if stripped.isdigit(): numeric_value = int(stripped) return numeric_value if numeric_value > 0 else None return None def _get_next_todo_id(self) -> int: """Get the next sequential todo_id for this user and tool. Returns a simple integer (1, 2, 3, ...) scoped to this user/tool. With 5-10 todos max, scanning is negligible. """ query = {"user_id": self.user_id, "tool_id": self.tool_id} todos = list(self.collection.find(query, {"todo_id": 1})) # Find the maximum todo_id max_id = 0 for todo in todos: todo_id = self._coerce_todo_id(todo.get("todo_id")) if todo_id is not None: max_id = max(max_id, todo_id) return max_id + 1 def _list(self) -> str: """List all todos for the user.""" query = {"user_id": self.user_id, "tool_id": self.tool_id} todos = list(self.collection.find(query)) if not todos: return "No todos found." 
result_lines = ["Todos:"] for doc in todos: todo_id = doc.get("todo_id") title = doc.get("title", "Untitled") status = doc.get("status", "open") line = f"[{todo_id}] {title} ({status})" result_lines.append(line) return "\n".join(result_lines) def _create(self, title: str) -> str: """Create a new todo item.""" title = (title or "").strip() if not title: return "Error: Title is required." now = datetime.now() todo_id = self._get_next_todo_id() doc = { "todo_id": todo_id, "user_id": self.user_id, "tool_id": self.tool_id, "title": title, "status": "open", "created_at": now, "updated_at": now, } insert_result = self.collection.insert_one(doc) inserted_id = getattr(insert_result, "inserted_id", None) or doc.get("_id") if inserted_id is not None: self._last_artifact_id = str(inserted_id) return f"Todo created with ID {todo_id}: {title}" def _get(self, todo_id: Optional[Any]) -> str: """Get a specific todo by ID.""" parsed_todo_id = self._coerce_todo_id(todo_id) if parsed_todo_id is None: return "Error: todo_id must be a positive integer." query = {"user_id": self.user_id, "tool_id": self.tool_id, "todo_id": parsed_todo_id} doc = self.collection.find_one(query) if not doc: return f"Error: Todo with ID {parsed_todo_id} not found." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) title = doc.get("title", "Untitled") status = doc.get("status", "open") result = f"Todo [{parsed_todo_id}]:\nTitle: {title}\nStatus: {status}" return result def _update(self, todo_id: Optional[Any], title: str) -> str: """Update a todo's title by ID.""" parsed_todo_id = self._coerce_todo_id(todo_id) if parsed_todo_id is None: return "Error: todo_id must be a positive integer." title = (title or "").strip() if not title: return "Error: Title is required." 
query = {"user_id": self.user_id, "tool_id": self.tool_id, "todo_id": parsed_todo_id} doc = self.collection.find_one_and_update( query, {"$set": {"title": title, "updated_at": datetime.now()}}, ) if not doc: return f"Error: Todo with ID {parsed_todo_id} not found." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) return f"Todo {parsed_todo_id} updated to: {title}" def _complete(self, todo_id: Optional[Any]) -> str: """Mark a todo as completed.""" parsed_todo_id = self._coerce_todo_id(todo_id) if parsed_todo_id is None: return "Error: todo_id must be a positive integer." query = {"user_id": self.user_id, "tool_id": self.tool_id, "todo_id": parsed_todo_id} doc = self.collection.find_one_and_update( query, {"$set": {"status": "completed", "updated_at": datetime.now()}}, ) if not doc: return f"Error: Todo with ID {parsed_todo_id} not found." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) return f"Todo {parsed_todo_id} marked as completed." def _delete(self, todo_id: Optional[Any]) -> str: """Delete a specific todo by ID.""" parsed_todo_id = self._coerce_todo_id(todo_id) if parsed_todo_id is None: return "Error: todo_id must be a positive integer." query = {"user_id": self.user_id, "tool_id": self.tool_id, "todo_id": parsed_todo_id} doc = self.collection.find_one_and_delete(query) if not doc: return f"Error: Todo with ID {parsed_todo_id} not found." if doc.get("_id") is not None: self._last_artifact_id = str(doc.get("_id")) return f"Todo {parsed_todo_id} deleted." 
class ToolActionParser:
    """Split an LLM tool call into ``(tool_id, action_name, call_args)``.

    Tool names are expected to look like ``<action_name>_<tool_id>``, where
    the part after the last underscore is the tool id.
    """

    def __init__(self, llm_type):
        """Remember the LLM type and register the provider-specific parsers."""
        self.llm_type = llm_type
        self.parsers = {
            "OpenAILLM": self._parse_openai_llm,
            "GoogleLLM": self._parse_google_llm,
        }

    def parse_args(self, call):
        """Parse ``call`` with the parser for this LLM type.

        Unknown LLM types fall back to the OpenAI parser. Returns
        ``(tool_id, action_name, call_args)`` or ``(None, None, None)`` when
        the call cannot be parsed.
        """
        parser = self.parsers.get(self.llm_type, self._parse_openai_llm)
        return parser(call)

    @staticmethod
    def _split_tool_name(name):
        """Split ``<action_name>_<tool_id>`` into ``(tool_id, action_name)``.

        Returns ``(None, None)`` when the name has no underscore. A
        non-numeric tool id is only logged, not rejected.
        """
        tool_parts = name.split("_")

        # If the tool name doesn't contain an underscore, it's likely a hallucinated tool
        if len(tool_parts) < 2:
            logger.warning(
                f"Invalid tool name format: {name}. Expected format: action_name_tool_id"
            )
            return None, None

        tool_id = tool_parts[-1]
        action_name = "_".join(tool_parts[:-1])

        # Validate that tool_id looks like a numerical ID
        if not tool_id.isdigit():
            logger.warning(
                f"Tool ID '{tool_id}' is not numerical. This might be a hallucinated tool call."
            )
        return tool_id, action_name

    def _parse_openai_llm(self, call):
        """Parse a call whose ``arguments`` attribute is a JSON string."""
        try:
            raw_arguments = call.arguments
            # Parameterless tools may arrive with missing or empty arguments;
            # treat that as "no arguments" rather than as a malformed call.
            if raw_arguments is None or (
                isinstance(raw_arguments, str) and not raw_arguments.strip()
            ):
                call_args = {}
            else:
                call_args = json.loads(raw_arguments)
            tool_id, action_name = self._split_tool_name(call.name)
        except (AttributeError, TypeError, json.JSONDecodeError) as e:
            logger.error(f"Error parsing OpenAI LLM call: {e}")
            return None, None, None

        if tool_id is None:
            return None, None, None
        return tool_id, action_name, call_args

    def _parse_google_llm(self, call):
        """Parse a call whose ``arguments`` attribute is used as-is."""
        try:
            call_args = call.arguments
            tool_id, action_name = self._split_tool_name(call.name)
        except (AttributeError, TypeError) as e:
            logger.error(f"Error parsing Google LLM call: {e}")
            return None, None, None

        if tool_id is None:
            return None, None, None
        return tool_id, action_name, call_args
class ToolManager:
    """Discovers the tool modules next to this file and runs their actions."""

    # Tools that are re-instantiated per call with the caller's user id.
    _USER_SCOPED_TOOLS = frozenset({"mcp_tool", "notes", "memory", "todo_list"})

    def __init__(self, config):
        """Keep ``config`` (tool name -> tool config) and load every tool."""
        self.config = config
        self.tools = {}
        self.load_tools()

    @staticmethod
    def _tool_classes(module):
        """Yield each class in ``module`` that is a Tool subclass other than Tool."""
        for _, candidate in inspect.getmembers(module, inspect.isclass):
            if candidate is not Tool and issubclass(candidate, Tool):
                yield candidate

    def load_tools(self):
        """Import each sibling module (except ``base``) and instantiate its tools."""
        tools_dir = os.path.dirname(__file__)
        for _, module_name, _ in pkgutil.iter_modules([tools_dir]):
            if module_name.startswith("__") or module_name == "base":
                continue
            module = importlib.import_module(
                f"application.agents.tools.{module_name}"
            )
            for tool_cls in self._tool_classes(module):
                # Registered under the module name: a later class overwrites
                # an earlier one found in the same module.
                self.tools[module_name] = tool_cls(self.config.get(module_name, {}))

    def load_tool(self, tool_name, tool_config, user_id=None):
        """Store ``tool_config`` and build a fresh instance of ``tool_name``.

        User-scoped tools also receive ``user_id`` when one is given. Returns
        None when the module defines no Tool subclass.
        """
        self.config[tool_name] = tool_config
        module = importlib.import_module(f"application.agents.tools.{tool_name}")
        needs_user = bool(user_id) and tool_name in self._USER_SCOPED_TOOLS
        for tool_cls in self._tool_classes(module):
            if needs_user:
                return tool_cls(tool_config, user_id)
            return tool_cls(tool_config)

    def execute_action(self, tool_name, action_name, user_id=None, **kwargs):
        """Run ``action_name`` on ``tool_name`` and return its result.

        Raises:
            ValueError: If ``tool_name`` was not loaded.
        """
        if tool_name not in self.tools:
            raise ValueError(f"Tool '{tool_name}' not loaded")

        tool = self.tools[tool_name]
        if user_id and tool_name in self._USER_SCOPED_TOOLS:
            tool = self.load_tool(tool_name, self.config.get(tool_name, {}), user_id)
        return tool.execute_action(action_name, **kwargs)

    def get_all_actions_metadata(self):
        """Return the action metadata of every loaded tool as one flat list."""
        return [
            entry
            for tool in self.tools.values()
            for entry in tool.get_actions_metadata()
        ]
class WorkflowAgent(BaseAgent):
    """A specialized agent that executes predefined workflows."""

    def __init__(
        self,
        *args,
        workflow_id: Optional[str] = None,
        workflow: Optional[Dict[str, Any]] = None,
        workflow_owner: Optional[str] = None,
        **kwargs,
    ):
        """Create the agent.

        Args:
            workflow_id: Id of a stored workflow to load from the database.
            workflow: Embedded workflow definition; takes precedence over
                ``workflow_id`` when both are given.
            workflow_owner: Owner used to scope the database lookup.
        """
        super().__init__(*args, **kwargs)
        self.workflow_id = workflow_id
        self.workflow_owner = workflow_owner
        self._workflow_data = workflow
        self._engine: Optional[WorkflowEngine] = None
        # Set right before execution so the saved run has a real start time.
        self._run_started_at: Optional[datetime] = None

    @log_activity()
    def gen(
        self, query: str, log_context: LogContext = None
    ) -> Generator[Dict[str, str], None, None]:
        """Execute the workflow for ``query`` and stream its events."""
        yield from self._gen_inner(query, log_context)

    def _gen_inner(
        self, query: str, log_context: LogContext
    ) -> Generator[Dict[str, str], None, None]:
        """Load the graph, run it through the engine, then persist the run."""
        graph = self._load_workflow_graph()
        if not graph:
            yield {"type": "error", "error": "Failed to load workflow configuration."}
            return

        self._engine = WorkflowEngine(graph, self)
        self._run_started_at = datetime.now(timezone.utc)
        yield from self._engine.execute({}, query)
        self._save_workflow_run(query)

    def _load_workflow_graph(self) -> Optional[WorkflowGraph]:
        """Return the graph from the embedded data, else from the database."""
        if self._workflow_data:
            return self._parse_embedded_workflow()
        if self.workflow_id:
            return self._load_from_database()
        return None

    def _parse_embedded_workflow(self) -> Optional[WorkflowGraph]:
        """Build a graph from the embedded workflow dict; None if it is invalid."""
        try:
            nodes_data = self._workflow_data.get("nodes", [])
            edges_data = self._workflow_data.get("edges", [])

            workflow = Workflow(
                name=self._workflow_data.get("name", "Embedded Workflow"),
                description=self._workflow_data.get("description"),
            )

            nodes = []
            for n in nodes_data:
                node_config = n.get("data", {})
                nodes.append(
                    WorkflowNode(
                        id=n["id"],
                        workflow_id=self.workflow_id or "embedded",
                        type=n["type"],
                        title=n.get("title", "Node"),
                        description=n.get("description"),
                        position=n.get("position", {"x": 0, "y": 0}),
                        config=node_config,
                    )
                )

            edges = []
            for e in edges_data:
                # Edge endpoints are accepted in camelCase or snake_case form.
                edges.append(
                    WorkflowEdge(
                        id=e["id"],
                        workflow_id=self.workflow_id or "embedded",
                        source=e.get("source") or e.get("source_id"),
                        target=e.get("target") or e.get("target_id"),
                        sourceHandle=e.get("sourceHandle") or e.get("source_handle"),
                        targetHandle=e.get("targetHandle") or e.get("target_handle"),
                    )
                )

            return WorkflowGraph(workflow=workflow, nodes=nodes, edges=edges)
        except Exception as e:
            logger.error(f"Invalid embedded workflow: {e}")
            return None

    def _find_graph_docs(self, collection, graph_version: int) -> list:
        """Fetch this workflow's documents for ``graph_version``.

        For version 1, documents stored without any ``graph_version`` field
        are used as a fallback when no versioned document exists.
        """
        docs = list(
            collection.find(
                {"workflow_id": self.workflow_id, "graph_version": graph_version}
            )
        )
        if not docs and graph_version == 1:
            docs = list(
                collection.find(
                    {
                        "workflow_id": self.workflow_id,
                        "graph_version": {"$exists": False},
                    }
                )
            )
        return docs

    def _load_from_database(self) -> Optional[WorkflowGraph]:
        """Load the workflow owned by the current owner; None on any failure."""
        try:
            from bson.objectid import ObjectId

            if not self.workflow_id or not ObjectId.is_valid(self.workflow_id):
                logger.error(f"Invalid workflow ID: {self.workflow_id}")
                return None
            owner_id = self.workflow_owner
            if not owner_id and isinstance(self.decoded_token, dict):
                owner_id = self.decoded_token.get("sub")
            if not owner_id:
                logger.error(
                    f"Workflow owner not available for workflow load: {self.workflow_id}"
                )
                return None

            mongo = MongoDB.get_client()
            db = mongo[settings.MONGO_DB_NAME]

            workflows_coll = db["workflows"]
            workflow_nodes_coll = db["workflow_nodes"]
            workflow_edges_coll = db["workflow_edges"]

            workflow_doc = workflows_coll.find_one(
                {"_id": ObjectId(self.workflow_id), "user": owner_id}
            )
            if not workflow_doc:
                logger.error(
                    f"Workflow {self.workflow_id} not found or inaccessible for user {owner_id}"
                )
                return None
            workflow = Workflow(**workflow_doc)

            # Missing, non-numeric or non-positive versions fall back to 1.
            graph_version = workflow_doc.get("current_graph_version", 1)
            try:
                graph_version = int(graph_version)
                if graph_version <= 0:
                    graph_version = 1
            except (ValueError, TypeError):
                graph_version = 1

            nodes_docs = self._find_graph_docs(workflow_nodes_coll, graph_version)
            nodes = [WorkflowNode(**doc) for doc in nodes_docs]

            edges_docs = self._find_graph_docs(workflow_edges_coll, graph_version)
            edges = [WorkflowEdge(**doc) for doc in edges_docs]

            return WorkflowGraph(workflow=workflow, nodes=nodes, edges=edges)
        except Exception as e:
            logger.error(f"Failed to load workflow from database: {e}")
            return None

    def _save_workflow_run(self, query: str) -> None:
        """Persist a record of the finished run; failures are only logged."""
        if not self._engine:
            return
        try:
            mongo = MongoDB.get_client()
            db = mongo[settings.MONGO_DB_NAME]
            workflow_runs_coll = db["workflow_runs"]

            completed_at = datetime.now(timezone.utc)
            run = WorkflowRun(
                workflow_id=self.workflow_id or "unknown",
                status=self._determine_run_status(),
                inputs={"query": query},
                outputs=self._serialize_state(self._engine.state),
                steps=self._engine.get_execution_summary(),
                # Fall back to the completion time if no start was recorded.
                created_at=self._run_started_at or completed_at,
                completed_at=completed_at,
            )

            workflow_runs_coll.insert_one(run.to_mongo_doc())
        except Exception as e:
            logger.error(f"Failed to save workflow run: {e}")

    def _determine_run_status(self) -> ExecutionStatus:
        """Return FAILED if any execution-log entry failed, else COMPLETED."""
        if not self._engine or not self._engine.execution_log:
            return ExecutionStatus.COMPLETED

        for log in self._engine.execution_log:
            if log.get("status") == ExecutionStatus.FAILED.value:
                return ExecutionStatus.FAILED
        return ExecutionStatus.COMPLETED

    def _serialize_state(self, state: Dict[str, Any]) -> Dict[str, Any]:
        """Serialize every value of the engine state for storage."""
        return {
            key: self._serialize_state_value(value) for key, value in state.items()
        }

    def _serialize_state_value(self, value: Any) -> Any:
        """Recursively convert ``value`` into plain, storable data.

        Dict keys become strings, lists and tuples become lists, datetimes
        become ISO strings, and any other non-primitive is stringified.
        """
        if isinstance(value, dict):
            return {
                str(dict_key): self._serialize_state_value(dict_value)
                for dict_key, dict_value in value.items()
            }
        if isinstance(value, (list, tuple)):
            return [self._serialize_state_value(item) for item in value]
        if isinstance(value, datetime):
            return value.isoformat()
        if isinstance(value, (str, int, float, bool, type(None))):
            return value
        return str(value)
class CelEvaluationError(Exception):
    """Raised when a CEL expression is empty, fails to compile or fails to evaluate."""


def _convert_value(value: Any) -> Any:
    """Wrap a Python value in the matching ``celpy.celtypes`` type.

    ``None`` becomes ``BoolType(False)`` and any unrecognized type is
    converted through ``str()``.
    """
    cel = celpy.celtypes

    if value is None:
        return cel.BoolType(False)

    # Order matters: bool is a subclass of int and must be matched first.
    scalar_wrappers = (
        (bool, cel.BoolType),
        (int, cel.IntType),
        (float, cel.DoubleType),
        (str, cel.StringType),
    )
    for python_type, cel_type in scalar_wrappers:
        if isinstance(value, python_type):
            return cel_type(value)

    if isinstance(value, list):
        return cel.ListType([_convert_value(element) for element in value])

    if isinstance(value, dict):
        converted = {
            cel.StringType(key): _convert_value(item) for key, item in value.items()
        }
        return cel.MapType(converted)

    return cel.StringType(str(value))


def build_activation(state: Dict[str, Any]) -> Dict[str, Any]:
    """Convert every value of ``state`` for use as CEL variable bindings."""
    activation: Dict[str, Any] = {}
    for name, raw in state.items():
        activation[name] = _convert_value(raw)
    return activation


def evaluate_cel(expression: str, state: Dict[str, Any]) -> Any:
    """Evaluate ``expression`` against ``state`` and return a Python value.

    Raises:
        CelEvaluationError: If the expression is empty or blank, or if
            compiling or evaluating it fails.
    """
    if not expression or not expression.strip():
        raise CelEvaluationError("Empty expression")

    try:
        environment = celpy.Environment()
        compiled = environment.compile(expression)
        runner = environment.program(compiled)
        bindings = build_activation(state)
        result = runner.evaluate(bindings)
    except celpy.CELEvalError as exc:
        raise CelEvaluationError(f"CEL evaluation error: {exc}") from exc
    except Exception as exc:
        raise CelEvaluationError(f"CEL error: {exc}") from exc

    return cel_to_python(result)


def cel_to_python(value: Any) -> Any:
    """Convert a CEL result back into plain Python data.

    Values of types not listed here are returned unchanged.
    """
    cel = celpy.celtypes

    # Same check order as the wrappers used when converting into CEL.
    scalar_unwrappers = (
        (cel.BoolType, bool),
        (cel.IntType, int),
        (cel.DoubleType, float),
        (cel.StringType, str),
    )
    for cel_type, python_type in scalar_unwrappers:
        if isinstance(value, cel_type):
            return python_type(value)

    if isinstance(value, cel.ListType):
        return [cel_to_python(element) for element in value]

    if isinstance(value, cel.MapType):
        return {str(key): cel_to_python(item) for key, item in value.items()}

    return value
class ToolFilterMixin:
    """Mixin that restricts the tools an agent fetches to an allow-list.

    Both overrides call the next implementation in the MRO through
    ``super()`` and then keep only the tools whose stringified ``_id`` is
    listed in ``self._allowed_tool_ids``. An empty or unset allow-list
    yields no tools at all.
    """

    _allowed_tool_ids: List[str]

    def _filter_allowed_tools(
        self, all_tools: Dict[str, Dict[str, Any]]
    ) -> Dict[str, Dict[str, Any]]:
        """Return the entries of ``all_tools`` whose ``_id`` is allow-listed."""
        # The attribute is only annotated on the class, never assigned there.
        # An instance that has not set it yet is treated as "nothing allowed"
        # instead of raising AttributeError.
        allowed_ids = getattr(self, "_allowed_tool_ids", None)
        if not allowed_ids:
            return {}
        return {
            tool_id: tool
            for tool_id, tool in all_tools.items()
            if str(tool.get("_id", "")) in allowed_ids
        }

    def _get_user_tools(self, user: str = "local") -> Dict[str, Dict[str, Any]]:
        """Fetch the user's tools from the parent class, then apply the allow-list."""
        return self._filter_allowed_tools(super()._get_user_tools(user))

    def _get_tools(self, api_key: Optional[str] = None) -> Dict[str, Dict[str, Any]]:
        """Fetch the API key's tools from the parent class, then apply the allow-list."""
        return self._filter_allowed_tools(super()._get_tools(api_key))
class WorkflowNodeAgentFactory:
    """Creates the tool-filtered agent implementation for a workflow node."""

    # Agent type -> node agent class to instantiate for it.
    _agents: Dict[AgentType, Type[BaseAgent]] = {
        AgentType.CLASSIC: WorkflowNodeClassicAgent,
        AgentType.REACT: WorkflowNodeReActAgent,
    }

    @classmethod
    def create(
        cls,
        agent_type: AgentType,
        endpoint: str,
        llm_name: str,
        model_id: str,
        api_key: str,
        tool_ids: Optional[List[str]] = None,
        **kwargs,
    ) -> BaseAgent:
        """Instantiate the agent class registered for ``agent_type``.

        All arguments, including any extra keyword arguments, are forwarded
        to the selected class unchanged.

        Raises:
            ValueError: If no class is registered for ``agent_type``.
        """
        selected_class = cls._agents.get(agent_type)
        if selected_class:
            return selected_class(
                endpoint=endpoint,
                llm_name=llm_name,
                model_id=model_id,
                api_key=api_key,
                tool_ids=tool_ids,
                **kwargs,
            )
        raise ValueError(f"Unsupported agent type: {agent_type}")
class WorkflowEdge(WorkflowEdgeCreate):
    """Workflow edge as loaded from storage: creation fields plus the ``_id``."""

    mongo_id: Optional[str] = Field(None, alias="_id")

    @field_validator("mongo_id", mode="before")
    @classmethod
    def convert_objectid(cls, v: Any) -> Optional[str]:
        """Turn an ``ObjectId`` into its string form; pass other values through."""
        return str(v) if isinstance(v, ObjectId) else v

    def to_mongo_doc(self) -> Dict[str, Any]:
        """Build the document to persist; ``mongo_id`` is not included."""
        persisted_fields = (
            "id",
            "workflow_id",
            "source_id",
            "target_id",
            "source_handle",
            "target_handle",
        )
        return {
            field_name: getattr(self, field_name) for field_name in persisted_fields
        }
class WorkflowGraph(BaseModel):
    """A workflow together with its nodes and edges, plus lookup helpers."""

    workflow: Workflow
    nodes: List[WorkflowNode] = Field(default_factory=list)
    edges: List[WorkflowEdge] = Field(default_factory=list)

    def get_node_by_id(self, node_id: str) -> Optional[WorkflowNode]:
        """Return the first node whose ``id`` equals ``node_id``, else ``None``."""
        return next(
            (candidate for candidate in self.nodes if candidate.id == node_id),
            None,
        )

    def get_start_node(self) -> Optional[WorkflowNode]:
        """Return the first node of type ``NodeType.START``, else ``None``."""
        return next(
            (candidate for candidate in self.nodes if candidate.type == NodeType.START),
            None,
        )

    def get_outgoing_edges(self, node_id: str) -> List[WorkflowEdge]:
        """Return, in stored order, every edge whose source is ``node_id``."""
        outgoing: List[WorkflowEdge] = []
        for candidate in self.edges:
            if candidate.source_id == node_id:
                outgoing.append(candidate)
        return outgoing
class WorkflowRun(BaseModel):
    """Record of one workflow execution: inputs, outputs and per-node steps."""

    model_config = ConfigDict(extra="allow")

    id: Optional[str] = Field(None, alias="_id")
    workflow_id: str
    status: ExecutionStatus = ExecutionStatus.PENDING
    inputs: Dict[str, str] = Field(default_factory=dict)
    outputs: Dict[str, Any] = Field(default_factory=dict)
    steps: List[NodeExecutionLog] = Field(default_factory=list)
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    completed_at: Optional[datetime] = None

    @field_validator("id", mode="before")
    @classmethod
    def convert_objectid(cls, v: Any) -> Optional[str]:
        """Turn an ``ObjectId`` into its string form; pass other values through."""
        return str(v) if isinstance(v, ObjectId) else v

    def to_mongo_doc(self) -> Dict[str, Any]:
        """Build the document to persist; ``id`` is not included.

        The status is stored as its enum value and each step is dumped
        with ``model_dump()``.
        """
        document: Dict[str, Any] = {"workflow_id": self.workflow_id}
        document["status"] = self.status.value
        document["inputs"] = self.inputs
        document["outputs"] = self.outputs
        document["steps"] = [entry.model_dump() for entry in self.steps]
        document["created_at"] = self.created_at
        document["completed_at"] = self.completed_at
        return document
def __init__(self, graph: WorkflowGraph, agent: "BaseAgent"):
    """Bind the engine to a workflow graph and the agent that runs it.

    Args:
        graph: The workflow, its nodes and its edges.
        agent: The parent agent whose attributes the engine reads while
            executing nodes.
    """
    self.graph, self.agent = graph, agent

    # Per-run data; every container starts out empty.
    self.state: WorkflowState = {}
    self.execution_log: List[Dict[str, Any]] = []

    # Source handle chosen by the most recent condition node, if any.
    self._condition_result: Optional[str] = None

    # Helpers used when rendering prompt and output templates.
    self._template_engine = TemplateEngine()
    self._namespace_manager = NamespaceManager()
dict(self.state), "output": node_output, } except Exception as e: logger.error(f"Error executing node {node.id}: {e}", exc_info=True) log_entry["status"] = ExecutionStatus.FAILED.value log_entry["error"] = str(e) log_entry["completed_at"] = datetime.now(timezone.utc) log_entry["state_snapshot"] = dict(self.state) self.execution_log.append(log_entry) user_friendly_error = sanitize_api_error(e) yield { "type": "workflow_step", "node_id": node.id, "node_type": node.type.value, "node_title": node.title, "status": "failed", "state_snapshot": dict(self.state), "error": user_friendly_error, } yield {"type": "error", "error": user_friendly_error} break log_entry["state_snapshot"] = dict(self.state) self.execution_log.append(log_entry) if node.type == NodeType.END: break current_node_id = self._get_next_node_id(current_node_id) if current_node_id is None and node.type != NodeType.END: logger.warning( f"Branch ended at node '{node.title}' ({node.id}) without reaching an end node" ) steps += 1 if steps >= self.MAX_EXECUTION_STEPS: logger.warning( f"Workflow reached max steps limit ({self.MAX_EXECUTION_STEPS})" ) def _initialize_state(self, initial_inputs: WorkflowState, query: str) -> None: self.state.update(initial_inputs) self.state["query"] = query self.state["chat_history"] = str(self.agent.chat_history) def _create_log_entry(self, node: WorkflowNode) -> Dict[str, Any]: return { "node_id": node.id, "node_type": node.type.value, "started_at": datetime.now(timezone.utc), "completed_at": None, "status": ExecutionStatus.RUNNING.value, "error": None, "state_snapshot": {}, } def _get_next_node_id(self, current_node_id: str) -> Optional[str]: node = self.graph.get_node_by_id(current_node_id) edges = self.graph.get_outgoing_edges(current_node_id) if not edges: return None if node and node.type == NodeType.CONDITION and self._condition_result: target_handle = self._condition_result self._condition_result = None for edge in edges: if edge.source_handle == target_handle: return 
def _execute_node(
    self, node: WorkflowNode
) -> Generator[Dict[str, str], None, None]:
    """Log the node being executed and delegate to the handler for its type.

    Yields whatever the selected handler yields. A node whose type has no
    entry in the dispatch table yields nothing.
    """
    logger.info(f"Executing node {node.id} ({node.type.value})")

    dispatch = dict(
        (
            (NodeType.START, self._execute_start_node),
            (NodeType.NOTE, self._execute_note_node),
            (NodeType.AGENT, self._execute_agent_node),
            (NodeType.STATE, self._execute_state_node),
            (NodeType.CONDITION, self._execute_condition_node),
            (NodeType.END, self._execute_end_node),
        )
    )
    run_handler = dispatch.get(node.type)
    if not run_handler:
        return
    yield from run_handler(node)
model_id=node_model_id, api_key=node_api_key, tool_ids=node_config.tools, prompt=node_config.system_prompt, chat_history=self.agent.chat_history, decoded_token=self.agent.decoded_token, json_schema=node_json_schema, ) full_response_parts: List[str] = [] structured_response_parts: List[str] = [] has_structured_response = False first_chunk = True for event in node_agent.gen(formatted_prompt): if "answer" in event: chunk = str(event["answer"]) full_response_parts.append(chunk) if event.get("structured"): has_structured_response = True structured_response_parts.append(chunk) if node_config.stream_to_user: if first_chunk and hasattr(self, "_has_streamed"): yield {"answer": "\n\n"} first_chunk = False yield event if node_config.stream_to_user: self._has_streamed = True full_response = "".join(full_response_parts).strip() output_value: Any = full_response if has_structured_response: structured_response = "".join(structured_response_parts).strip() response_to_parse = structured_response or full_response parsed_success, parsed_structured = self._parse_structured_output( response_to_parse ) output_value = parsed_structured if parsed_success else response_to_parse if node_json_schema: self._validate_structured_output(node_json_schema, output_value) elif node_json_schema: parsed_success, parsed_structured = self._parse_structured_output( full_response ) if not parsed_success: raise ValueError( "Structured output was expected but response was not valid JSON" ) output_value = parsed_structured self._validate_structured_output(node_json_schema, output_value) default_output_key = f"node_{node.id}_output" self.state[default_output_key] = output_value if node_config.output_variable: self.state[node_config.output_variable] = output_value def _execute_state_node( self, node: WorkflowNode ) -> Generator[Dict[str, str], None, None]: config = node.config.get("config", node.config) for op in config.get("operations", []): expression = op.get("expression", "") target_variable = 
op.get("target_variable", "") if expression and target_variable: self.state[target_variable] = evaluate_cel(expression, self.state) yield from () def _execute_condition_node( self, node: WorkflowNode ) -> Generator[Dict[str, str], None, None]: config = ConditionNodeConfig(**node.config.get("config", node.config)) matched_handle = None for case in config.cases: if not case.expression.strip(): continue try: if evaluate_cel(case.expression, self.state): matched_handle = case.source_handle break except CelEvaluationError: continue self._condition_result = matched_handle or "else" yield from () def _execute_end_node( self, node: WorkflowNode ) -> Generator[Dict[str, str], None, None]: config = node.config.get("config", node.config) output_template = str(config.get("output_template", "")) if output_template: formatted_output = self._format_template(output_template) yield {"answer": formatted_output} def _parse_structured_output(self, raw_response: str) -> tuple[bool, Optional[Any]]: normalized_response = raw_response.strip() if not normalized_response: return False, None try: return True, json.loads(normalized_response) except json.JSONDecodeError: logger.warning( "Workflow agent returned structured output that was not valid JSON" ) return False, None def _normalize_node_json_schema( self, schema: Optional[Dict[str, Any]], node_title: str ) -> Optional[Dict[str, Any]]: if schema is None: return None try: return normalize_json_schema_payload(schema) except JsonSchemaValidationError as exc: raise ValueError( f'Invalid JSON schema for node "{node_title}": {exc}' ) from exc def _validate_structured_output(self, schema: Dict[str, Any], output_value: Any) -> None: if jsonschema is None: logger.warning( "jsonschema package is not available, skipping structured output validation" ) return try: normalized_schema = normalize_json_schema_payload(schema) except JsonSchemaValidationError as exc: raise ValueError(f"Invalid JSON schema: {exc}") from exc try: 
jsonschema.validate(instance=output_value, schema=normalized_schema) except jsonschema.exceptions.ValidationError as exc: raise ValueError(f"Structured output did not match schema: {exc.message}") from exc except jsonschema.exceptions.SchemaError as exc: raise ValueError(f"Invalid JSON schema: {exc.message}") from exc def _format_template(self, template: str) -> str: context = self._build_template_context() try: return self._template_engine.render(template, context) except TemplateRenderError as e: logger.warning( "Workflow template rendering failed, using raw template: %s", str(e) ) return template def _build_template_context(self) -> Dict[str, Any]: docs, docs_together = self._get_source_template_data() passthrough_data = ( self.state.get("passthrough") if isinstance(self.state.get("passthrough"), dict) else None ) tools_data = ( self.state.get("tools") if isinstance(self.state.get("tools"), dict) else None ) context = self._namespace_manager.build_context( user_id=getattr(self.agent, "user", None), request_id=getattr(self.agent, "request_id", None), passthrough_data=passthrough_data, docs=docs, docs_together=docs_together, tools_data=tools_data, ) agent_context: Dict[str, Any] = {} for key, value in self.state.items(): if not isinstance(key, str): continue normalized_key = key.strip() if not normalized_key: continue agent_context[normalized_key] = value context["agent"] = agent_context # Keep legacy top-level variables working while namespaced variables are adopted. 
for key, value in agent_context.items(): if key in TEMPLATE_RESERVED_NAMESPACES: context[f"agent_{key}"] = value continue if key not in context: context[key] = value return context def _get_source_template_data(self) -> tuple[Optional[List[Dict[str, Any]]], Optional[str]]: docs = getattr(self.agent, "retrieved_docs", None) if not isinstance(docs, list) or len(docs) == 0: return None, None docs_together_parts: List[str] = [] for doc in docs: if not isinstance(doc, dict): continue text = doc.get("text") if not isinstance(text, str): continue filename = doc.get("filename") or doc.get("title") or doc.get("source") if isinstance(filename, str) and filename.strip(): docs_together_parts.append(f"{filename}\n{text}") else: docs_together_parts.append(text) docs_together = "\n\n".join(docs_together_parts) if docs_together_parts else None return docs, docs_together def get_execution_summary(self) -> List[NodeExecutionLog]: return [ NodeExecutionLog( node_id=log["node_id"], node_type=log["node_type"], status=ExecutionStatus(log["status"]), started_at=log["started_at"], completed_at=log.get("completed_at"), error=log.get("error"), state_snapshot=log.get("state_snapshot", {}), ) for log in self.execution_log ] ================================================ FILE: application/api/__init__.py ================================================ from flask_restx import Api api = Api( version="1.0", title="DocsGPT API", description="API for DocsGPT", ) ================================================ FILE: application/api/answer/__init__.py ================================================ from flask import Blueprint from application.api import api from application.api.answer.routes.answer import AnswerResource from application.api.answer.routes.base import answer_ns from application.api.answer.routes.search import SearchResource from application.api.answer.routes.stream import StreamResource answer = Blueprint("answer", __name__) api.add_namespace(answer_ns) def init_answer_routes(): 
def init_answer_routes():
    """Register the stream, answer and search resources on the shared API."""
    route_table = (
        (StreamResource, "/stream"),
        (AnswerResource, "/api/answer"),
        (SearchResource, "/api/search"),
    )
    for resource_class, url in route_table:
        api.add_resource(resource_class, url)
save the conversation", ), "model_id": fields.String( required=False, description="Model ID to use for this request", ), "passthrough": fields.Raw( required=False, description="Dynamic parameters to inject into prompt template", ), }, ) @api.expect(answer_model) @api.doc(description="Provide a response based on the question and retriever") def post(self): data = request.get_json() if error := self.validate_request(data): return error decoded_token = getattr(request, "decoded_token", None) processor = StreamProcessor(data, decoded_token) try: processor.initialize() if not processor.decoded_token: return make_response({"error": "Unauthorized"}, 401) docs_together, docs_list = processor.pre_fetch_docs( data.get("question", "") ) tools_data = processor.pre_fetch_tools() agent = processor.create_agent( docs_together=docs_together, docs=docs_list, tools_data=tools_data, ) if error := self.check_usage(processor.agent_config): return error stream = self.complete_stream( question=data["question"], agent=agent, conversation_id=processor.conversation_id, user_api_key=processor.agent_config.get("user_api_key"), decoded_token=processor.decoded_token, isNoneDoc=data.get("isNoneDoc"), index=None, should_save_conversation=data.get("save_conversation", True), agent_id=processor.agent_id, is_shared_usage=processor.is_shared_usage, shared_token=processor.shared_token, model_id=processor.model_id, ) stream_result = self.process_response_stream(stream) if len(stream_result) == 7: ( conversation_id, response, sources, tool_calls, thought, error, structured_info, ) = stream_result else: conversation_id, response, sources, tool_calls, thought, error = ( stream_result ) structured_info = None if error: return make_response({"error": error}, 400) result = { "conversation_id": conversation_id, "answer": response, "sources": sources, "tool_calls": tool_calls, "thought": thought, } if structured_info: result.update(structured_info) except Exception as e: logger.error( f"/api/answer - error: 
{str(e)} - traceback: {traceback.format_exc()}", extra={"error": str(e), "traceback": traceback.format_exc()}, ) return make_response({"error": "An error occurred processing your request"}, 500) return make_response(result, 200) ================================================ FILE: application/api/answer/routes/base.py ================================================ import datetime import json import logging from typing import Any, Dict, Generator, List, Optional from flask import jsonify, make_response, Response from flask_restx import Namespace from application.api.answer.services.conversation_service import ConversationService from application.core.model_utils import ( get_api_key_for_provider, get_default_model_id, get_provider_from_model_id, ) from application.core.mongo_db import MongoDB from application.core.settings import settings from application.error import sanitize_api_error from application.llm.llm_creator import LLMCreator from application.utils import check_required_fields logger = logging.getLogger(__name__) answer_ns = Namespace("answer", description="Answer related operations", path="/") class BaseAnswerResource: """Shared base class for answer endpoints""" def __init__(self): mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] self.db = db self.user_logs_collection = db["user_logs"] self.default_model_id = get_default_model_id() self.conversation_service = ConversationService() def validate_request( self, data: Dict[str, Any], require_conversation_id: bool = False ) -> Optional[Response]: """Common request validation""" required_fields = ["question"] if require_conversation_id: required_fields.append("conversation_id") if missing_fields := check_required_fields(data, required_fields): return missing_fields return None @staticmethod def _prepare_tool_calls_for_logging( tool_calls: Optional[List[Dict[str, Any]]], max_chars: int = 10000 ) -> List[Dict[str, Any]]: if not tool_calls: return [] prepared = [] for tool_call in tool_calls: 
if not isinstance(tool_call, dict): prepared.append({"result": str(tool_call)[:max_chars]}) continue item = dict(tool_call) for key in ("result", "result_full"): value = item.get(key) if isinstance(value, str) and len(value) > max_chars: item[key] = value[:max_chars] prepared.append(item) return prepared def check_usage(self, agent_config: Dict) -> Optional[Response]: """Check if there is a usage limit and if it is exceeded Args: agent_config: The config dict of agent instance Returns: None or Response if either of limits exceeded. """ api_key = agent_config.get("user_api_key") if not api_key: return None agents_collection = self.db["agents"] agent = agents_collection.find_one({"key": api_key}) if not agent: return make_response( jsonify({"success": False, "message": "Invalid API key."}), 401 ) limited_token_mode_raw = agent.get("limited_token_mode", False) limited_request_mode_raw = agent.get("limited_request_mode", False) limited_token_mode = ( limited_token_mode_raw if isinstance(limited_token_mode_raw, bool) else limited_token_mode_raw == "True" ) limited_request_mode = ( limited_request_mode_raw if isinstance(limited_request_mode_raw, bool) else limited_request_mode_raw == "True" ) token_limit = int( agent.get("token_limit", settings.DEFAULT_AGENT_LIMITS["token_limit"]) ) request_limit = int( agent.get("request_limit", settings.DEFAULT_AGENT_LIMITS["request_limit"]) ) token_usage_collection = self.db["token_usage"] end_date = datetime.datetime.now() start_date = end_date - datetime.timedelta(hours=24) match_query = { "timestamp": {"$gte": start_date, "$lte": end_date}, "api_key": api_key, } if limited_token_mode: token_pipeline = [ {"$match": match_query}, { "$group": { "_id": None, "total_tokens": { "$sum": {"$add": ["$prompt_tokens", "$generated_tokens"]} }, } }, ] token_result = list(token_usage_collection.aggregate(token_pipeline)) daily_token_usage = token_result[0]["total_tokens"] if token_result else 0 else: daily_token_usage = 0 if 
limited_request_mode: daily_request_usage = token_usage_collection.count_documents(match_query) else: daily_request_usage = 0 if not limited_token_mode and not limited_request_mode: return None token_exceeded = ( limited_token_mode and token_limit > 0 and daily_token_usage >= token_limit ) request_exceeded = ( limited_request_mode and request_limit > 0 and daily_request_usage >= request_limit ) if token_exceeded or request_exceeded: return make_response( jsonify( { "success": False, "message": "Exceeding usage limit, please try again later.", } ), 429, ) return None def complete_stream( self, question: str, agent: Any, conversation_id: Optional[str], user_api_key: Optional[str], decoded_token: Dict[str, Any], isNoneDoc: bool = False, index: Optional[int] = None, should_save_conversation: bool = True, attachment_ids: Optional[List[str]] = None, agent_id: Optional[str] = None, is_shared_usage: bool = False, shared_token: Optional[str] = None, model_id: Optional[str] = None, ) -> Generator[str, None, None]: """ Generator function that streams the complete conversation response. 
Args: question: The user's question agent: The agent instance retriever: The retriever instance conversation_id: Existing conversation ID user_api_key: User's API key if any decoded_token: Decoded JWT token isNoneDoc: Flag for document-less responses index: Index of message to update should_save_conversation: Whether to persist the conversation attachment_ids: List of attachment IDs agent_id: ID of agent used is_shared_usage: Flag for shared agent usage shared_token: Token for shared agent model_id: Model ID used for the request retrieved_docs: Pre-fetched documents for sources (optional) Yields: Server-sent event strings """ try: response_full, thought, source_log_docs, tool_calls = "", "", [], [] is_structured = False schema_info = None structured_chunks = [] for line in agent.gen(query=question): if "answer" in line: response_full += str(line["answer"]) if line.get("structured"): is_structured = True schema_info = line.get("schema") structured_chunks.append(line["answer"]) else: data = json.dumps({"type": "answer", "answer": line["answer"]}) yield f"data: {data}\n\n" elif "sources" in line: truncated_sources = [] source_log_docs = line["sources"] for source in line["sources"]: truncated_source = source.copy() if "text" in truncated_source: truncated_source["text"] = ( truncated_source["text"][:100].strip() + "..." 
) truncated_sources.append(truncated_source) if truncated_sources: data = json.dumps( {"type": "source", "source": truncated_sources} ) yield f"data: {data}\n\n" elif "tool_calls" in line: tool_calls = line["tool_calls"] data = json.dumps({"type": "tool_calls", "tool_calls": tool_calls}) yield f"data: {data}\n\n" elif "thought" in line: thought += line["thought"] data = json.dumps({"type": "thought", "thought": line["thought"]}) yield f"data: {data}\n\n" elif "type" in line: if line.get("type") == "error": sanitized_error = { "type": "error", "error": sanitize_api_error(line.get("error", "An error occurred")) } data = json.dumps(sanitized_error) else: data = json.dumps(line) yield f"data: {data}\n\n" if is_structured and structured_chunks: structured_data = { "type": "structured_answer", "answer": response_full, "structured": True, "schema": schema_info, } data = json.dumps(structured_data) yield f"data: {data}\n\n" if isNoneDoc: for doc in source_log_docs: doc["source"] = "None" provider = ( get_provider_from_model_id(model_id) if model_id else settings.LLM_PROVIDER ) system_api_key = get_api_key_for_provider(provider or settings.LLM_PROVIDER) llm = LLMCreator.create_llm( provider or settings.LLM_PROVIDER, api_key=system_api_key, user_api_key=user_api_key, decoded_token=decoded_token, model_id=model_id, agent_id=agent_id, ) if should_save_conversation: conversation_id = self.conversation_service.save_conversation( conversation_id, question, response_full, thought, source_log_docs, tool_calls, llm, model_id or self.default_model_id, decoded_token, index=index, api_key=user_api_key, agent_id=agent_id, is_shared_usage=is_shared_usage, shared_token=shared_token, attachment_ids=attachment_ids, ) # Persist compression metadata/summary if it exists and wasn't saved mid-execution compression_meta = getattr(agent, "compression_metadata", None) compression_saved = getattr(agent, "compression_saved", False) if conversation_id and compression_meta and not compression_saved: 
try: self.conversation_service.update_compression_metadata( conversation_id, compression_meta ) self.conversation_service.append_compression_message( conversation_id, compression_meta ) agent.compression_saved = True logger.info( f"Persisted compression metadata for conversation {conversation_id}" ) except Exception as e: logger.error( f"Failed to persist compression metadata: {str(e)}", exc_info=True, ) else: conversation_id = None id_data = {"type": "id", "id": str(conversation_id)} data = json.dumps(id_data) yield f"data: {data}\n\n" tool_calls_for_logging = self._prepare_tool_calls_for_logging( getattr(agent, "tool_calls", tool_calls) or tool_calls ) log_data = { "action": "stream_answer", "level": "info", "user": decoded_token.get("sub"), "api_key": user_api_key, "agent_id": agent_id, "question": question, "response": response_full, "sources": source_log_docs, "tool_calls": tool_calls_for_logging, "attachments": attachment_ids, "timestamp": datetime.datetime.now(datetime.timezone.utc), } if is_structured: log_data["structured_output"] = True if schema_info: log_data["schema"] = schema_info # Clean up text fields to be no longer than 10000 characters for key, value in log_data.items(): if isinstance(value, str) and len(value) > 10000: log_data[key] = value[:10000] self.user_logs_collection.insert_one(log_data) data = json.dumps({"type": "end"}) yield f"data: {data}\n\n" except GeneratorExit: logger.info(f"Stream aborted by client for question: {question[:50]}... 
") # Save partial response if should_save_conversation and response_full: try: if isNoneDoc: for doc in source_log_docs: doc["source"] = "None" llm = LLMCreator.create_llm( settings.LLM_PROVIDER, api_key=settings.API_KEY, user_api_key=user_api_key, decoded_token=decoded_token, agent_id=agent_id, ) self.conversation_service.save_conversation( conversation_id, question, response_full, thought, source_log_docs, tool_calls, llm, model_id or self.default_model_id, decoded_token, index=index, api_key=user_api_key, agent_id=agent_id, is_shared_usage=is_shared_usage, shared_token=shared_token, attachment_ids=attachment_ids, ) compression_meta = getattr(agent, "compression_metadata", None) compression_saved = getattr(agent, "compression_saved", False) if conversation_id and compression_meta and not compression_saved: try: self.conversation_service.update_compression_metadata( conversation_id, compression_meta ) self.conversation_service.append_compression_message( conversation_id, compression_meta ) agent.compression_saved = True logger.info( f"Persisted compression metadata for conversation {conversation_id} (partial stream)" ) except Exception as e: logger.error( f"Failed to persist compression metadata (partial stream): {str(e)}", exc_info=True, ) except Exception as e: logger.error( f"Error saving partial response: {str(e)}", exc_info=True ) raise except Exception as e: logger.error(f"Error in stream: {str(e)}", exc_info=True) data = json.dumps( { "type": "error", "error": "Please try again later. 
def process_response_stream(self, stream):
    """Collapse a server-sent-event stream into one result tuple.

    Used by the non-streaming endpoint: every item of ``stream`` is expected
    to be a ``"data: <json>"`` string whose JSON object carries a ``type``
    key.

    Args:
        stream: Iterable of SSE strings.

    Returns:
        On success ``(conversation_id, response, sources, tool_calls,
        thought, None)``; when a ``structured_answer`` event was seen a
        seventh element ``{"structured": True, "schema": ...}`` is appended.
        On an ``error`` event, or when the stream finishes without an
        ``end`` event, ``(None, None, None, None, <error message>, None)``.
    """
    sse_prefix = "data: "

    conversation_id = ""
    response_full = ""
    source_log_docs = []
    tool_calls = []
    thought = ""
    stream_ended = False
    is_structured = False
    schema_info = None

    for line in stream:
        try:
            event_data = line.strip()
            # Drop only the leading SSE field name. A blanket replace() would
            # also delete "data: " occurring inside the JSON payload itself.
            if event_data.startswith(sse_prefix):
                event_data = event_data[len(sse_prefix):]
            event = json.loads(event_data)

            event_type = event["type"]
            if event_type == "id":
                conversation_id = event["id"]
            elif event_type == "answer":
                response_full += event["answer"]
            elif event_type == "structured_answer":
                # Carries the complete answer, so it replaces the accumulator.
                response_full = event["answer"]
                is_structured = True
                schema_info = event.get("schema")
            elif event_type == "source":
                source_log_docs = event["source"]
            elif event_type == "tool_calls":
                tool_calls = event["tool_calls"]
            elif event_type == "thought":
                # Thoughts arrive in chunks, like answers: accumulate them
                # instead of keeping only the last chunk.
                thought += event["thought"]
            elif event_type == "error":
                logger.error(f"Error from stream: {event['error']}")
                return None, None, None, None, event["error"], None
            elif event_type == "end":
                stream_ended = True
        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Error parsing stream event: {e}, line: {line}")
            continue

    if not stream_ended:
        logger.error("Stream ended unexpectedly without an 'end' event.")
        return None, None, None, None, "Stream ended unexpectedly", None

    result = (
        conversation_id,
        response_full,
        source_log_docs,
        tool_calls,
        thought,
        None,
    )
    if is_structured:
        result = result + ({"structured": True, "schema": schema_info},)
    return result
def _get_sources_from_api_key(self, api_key: str) -> List[str]:
    """Collect the source IDs attached to the agent that owns ``api_key``.

    The agent's ``sources`` list is consulted first; only ``DBRef`` entries
    that dereference to a document contribute an ID. When that yields
    nothing, the legacy single ``source`` field is used instead. The
    ``"default"`` placeholder is never returned because it is not an actual
    vectorstore.

    Args:
        api_key: Agent API key to look up in the agents collection.

    Returns:
        List of source IDs; empty when the key matches no agent or the
        agent has no usable source.
    """
    agent_record = self.agents_collection.find_one({"key": api_key})
    if not agent_record:
        return []

    def _referenced_id(reference):
        # Resolve a DBRef to the string form of its document id, if any.
        document = self.db.dereference(reference)
        return str(document["_id"]) if document else None

    resolved = []

    # Multiple sources (only if a non-empty list).
    multi_sources = agent_record.get("sources", [])
    if isinstance(multi_sources, list) and multi_sources:
        for entry in multi_sources:
            if entry != "default" and isinstance(entry, DBRef):
                entry_id = _referenced_id(entry)
                if entry_id is not None:
                    resolved.append(entry_id)
    if resolved:
        return resolved

    # Legacy single source: used when `sources` was empty or yielded nothing.
    legacy_source = agent_record.get("source")
    if isinstance(legacy_source, DBRef):
        legacy_id = _referenced_id(legacy_source)
        if legacy_id is not None:
            resolved.append(legacy_id)
    elif legacy_source and legacy_source != "default":
        resolved.append(legacy_source)
    return resolved
@answer_ns.expect(search_model)
@answer_ns.doc(description="Search for relevant documents based on query")
def post(self):
    """Handle ``POST /api/search``.

    Expects a JSON object with ``question`` and ``api_key`` and an optional
    ``chunks`` (default 5) giving the maximum number of results.

    Returns:
        200 with a list of ``{"text", "title", "source"}`` results (empty
        when the agent has no sources); 400 for a malformed body, a missing
        field or an invalid ``chunks``; 401 for an unknown API key; 500 when
        the search itself fails.
    """
    data = request.get_json()
    # A body such as `null` or `[]` parses fine but has no `.get`; report it
    # as a client error instead of failing with AttributeError.
    if not isinstance(data, dict):
        return make_response({"error": "Request body must be a JSON object"}, 400)

    question = data.get("question")
    api_key = data.get("api_key")
    chunks = data.get("chunks", 5)

    if not question:
        return make_response({"error": "question is required"}, 400)
    if not api_key:
        return make_response({"error": "api_key is required"}, 400)

    # Validate API key
    agent = self.agents_collection.find_one({"key": api_key})
    if not agent:
        return make_response({"error": "Invalid API key"}, 401)

    # `chunks` is used for integer division and slicing further down, so a
    # string, null or non-positive value is a client error rather than a
    # "Search failed" 500. bool is an int subclass and is rejected too.
    if isinstance(chunks, bool) or not isinstance(chunks, int) or chunks < 1:
        return make_response({"error": "chunks must be a positive integer"}, 400)

    try:
        # Get sources connected to this API key
        source_ids = self._get_sources_from_api_key(api_key)
        if not source_ids:
            return make_response([], 200)

        # Perform search
        results = self._search_vectorstores(question, source_ids, chunks)
        return make_response(results, 200)
    except Exception as e:
        logger.error(
            f"/api/search - error: {str(e)}",
            extra={"error": str(e)},
            exc_info=True,
        )
        return make_response({"error": "Search failed"}, 500)
@api.expect(stream_model)
@api.doc(description="Stream a response based on the question and retriever")
def post(self):
    """Handle ``POST /stream`` and answer with a server-sent-event stream.

    The request is validated, a ``StreamProcessor`` prepares documents,
    tools and the agent, the agent's usage limits are checked, and the
    response body is then produced lazily by ``complete_stream``.

    Returns:
        The validation or usage-limit response when one applies; a 401 SSE
        error when no decoded token is available; a 400 SSE error on
        ``ValueError`` ("Malformed request body") or on any other exception
        ("Unknown error occurred"); otherwise the streaming response.
    """
    sse_mimetype = "text/event-stream"

    def sse_error(message, status):
        # One-event error stream with the given HTTP status.
        return Response(
            self.error_stream_generate(message),
            status=status,
            mimetype=sse_mimetype,
        )

    data = request.get_json()
    validation_error = self.validate_request(data, "index" in data)
    if validation_error:
        return validation_error

    processor = StreamProcessor(data, getattr(request, "decoded_token", None))
    try:
        processor.initialize()
        if not processor.decoded_token:
            return sse_error("Unauthorized", 401)

        docs_together, docs_list = processor.pre_fetch_docs(data["question"])
        agent = processor.create_agent(
            docs_together=docs_together,
            docs=docs_list,
            tools_data=processor.pre_fetch_tools(),
        )

        usage_error = self.check_usage(processor.agent_config)
        if usage_error:
            return usage_error

        event_stream = self.complete_stream(
            question=data["question"],
            agent=agent,
            conversation_id=processor.conversation_id,
            user_api_key=processor.agent_config.get("user_api_key"),
            decoded_token=processor.decoded_token,
            isNoneDoc=data.get("isNoneDoc"),
            index=data.get("index"),
            should_save_conversation=data.get("save_conversation", True),
            attachment_ids=data.get("attachments", []),
            agent_id=processor.agent_id,
            is_shared_usage=processor.is_shared_usage,
            shared_token=processor.shared_token,
            model_id=processor.model_id,
        )
        return Response(event_stream, mimetype=sse_mimetype)
    except ValueError as exc:
        message = "Malformed request body"
        detail = str(exc)
        trace = traceback.format_exc()
        logger.error(
            f"/stream - error: {message} - specific error: {detail} - traceback: {trace}",
            extra={"error": detail, "traceback": trace},
        )
        return sse_error(message, 400)
    except Exception as exc:
        detail = str(exc)
        trace = traceback.format_exc()
        logger.error(
            f"/stream - error: {detail} - traceback: {trace}",
            extra={"error": detail, "traceback": trace},
        )
        return sse_error("Unknown error occurred", 400)
@staticmethod
def _append_compression_context(
    system_prompt: str, compressed_summary: str, context_type: str = "pre_request"
) -> str:
    """
    Append compression context to system prompt.

    A compression block previously appended by this method is removed first,
    so repeated calls replace the summary instead of stacking summaries.
    Only that block is removed: a ``---`` separator that belongs to the
    original system prompt is preserved.

    Args:
        system_prompt: Original system prompt
        compressed_summary: Summary to append
        context_type: Type of compression context; ``"mid_execution"``
            selects the mid-execution wording, any other value the
            pre-request wording

    Returns:
        Updated system prompt
    """
    separator = "\n\n---\n\n"
    # Opening words of the two context messages built below.
    block_openers = (
        "This session is being continued",
        "Context window limit reached",
    )

    # Remove existing compression context if present. Cut at the earliest
    # "<separator><opener>" occurrence rather than at the first separator,
    # which may be part of the prompt itself.
    block_starts = [
        position
        for position in (
            system_prompt.find(separator + opener) for opener in block_openers
        )
        if position != -1
    ]
    if block_starts:
        system_prompt = system_prompt[: min(block_starts)]

    # Build appropriate context message based on type
    if context_type == "mid_execution":
        context_message = (
            "\n\n---\n\n"
            "Context window limit reached during execution. "
            "Previous conversation has been compressed to fit within limits. "
            "The conversation is summarized below:\n\n"
            f"{compressed_summary}"
        )
    else:  # pre_request
        context_message = (
            "\n\n---\n\n"
            "This session is being continued from a previous conversation that "
            "has been compressed to fit within context limits. "
            "The conversation is summarized below:\n\n"
            f"{compressed_summary}"
        )

    return system_prompt + context_message
Args: messages: Original message list compressed_summary: Compressed summary recent_queries: Recent uncompressed queries include_current_execution: Whether to preserve current execution messages include_tool_calls: Whether to include tool calls from history Returns: Rebuilt message list or None if failed """ # Find the system message system_message = next( (msg for msg in messages if msg.get("role") == "system"), None ) if not system_message: logger.warning("No system message found in messages list") return None # Update system message with compressed summary if compressed_summary: content = system_message.get("content", "") system_message["content"] = MessageBuilder._append_compression_context( content, compressed_summary, "mid_execution" ) logger.info( "Appended compression summary to system prompt (truncated): %s", ( compressed_summary[:500] + "..." if len(compressed_summary) > 500 else compressed_summary ), ) rebuilt_messages = [system_message] # Add recent history from compressed context for query in recent_queries: if "prompt" in query and "response" in query: rebuilt_messages.append({"role": "user", "content": query["prompt"]}) rebuilt_messages.append( {"role": "assistant", "content": query["response"]} ) # Add tool calls from history if present if include_tool_calls and "tool_calls" in query: for tool_call in query["tool_calls"]: call_id = tool_call.get("call_id") or str(uuid.uuid4()) function_call_dict = { "function_call": { "name": tool_call.get("action_name"), "args": tool_call.get("arguments"), "call_id": call_id, } } function_response_dict = { "function_response": { "name": tool_call.get("action_name"), "response": {"result": tool_call.get("result")}, "call_id": call_id, } } rebuilt_messages.append( {"role": "assistant", "content": [function_call_dict]} ) rebuilt_messages.append( {"role": "tool", "content": [function_response_dict]} ) # If no recent queries (everything was compressed), add a continuation user message if len(recent_queries) == 0 and 
compressed_summary: rebuilt_messages.append({ "role": "user", "content": "Please continue with the remaining tasks based on the context above." }) logger.info("Added continuation user message to maintain proper turn-taking after full compression") if include_current_execution: # Preserve any messages that were added during the current execution cycle recent_msg_count = 1 # system message for query in recent_queries: if "prompt" in query and "response" in query: recent_msg_count += 2 if "tool_calls" in query: recent_msg_count += len(query["tool_calls"]) * 2 if len(messages) > recent_msg_count: current_execution_messages = messages[recent_msg_count:] rebuilt_messages.extend(current_execution_messages) logger.info( f"Preserved {len(current_execution_messages)} messages from current execution cycle" ) logger.info( f"Messages rebuilt: {len(messages)} → {len(rebuilt_messages)} messages. " f"Ready to continue tool execution." ) return rebuilt_messages ================================================ FILE: application/api/answer/services/compression/orchestrator.py ================================================ """High-level compression orchestration.""" import logging from typing import Any, Dict, Optional from application.api.answer.services.compression.service import CompressionService from application.api.answer.services.compression.threshold_checker import ( CompressionThresholdChecker, ) from application.api.answer.services.compression.types import CompressionResult from application.api.answer.services.conversation_service import ConversationService from application.core.model_utils import ( get_api_key_for_provider, get_provider_from_model_id, ) from application.core.settings import settings from application.llm.llm_creator import LLMCreator logger = logging.getLogger(__name__) class CompressionOrchestrator: """ Facade for compression operations. Coordinates between all compression components and provides a simple interface for callers. 
def compress_if_needed(
    self,
    conversation_id: str,
    user_id: str,
    model_id: str,
    decoded_token: Dict[str, Any],
    current_query_tokens: int = 500,
) -> CompressionResult:
    """
    Main entry point: compress the conversation only when the threshold
    checker says it is necessary.

    Args:
        conversation_id: Conversation ID
        user_id: User ID
        model_id: Model being used for conversation
        decoded_token: User's decoded JWT token
        current_query_tokens: Estimated tokens for current query

    Returns:
        The result of ``_perform_compression`` when compression is needed;
        a no-compression result carrying the conversation's ``queries``
        otherwise; a failure result when the conversation cannot be loaded
        or any exception is raised.
    """
    try:
        loaded = self.conversation_service.get_conversation(
            conversation_id, user_id
        )
        if not loaded:
            logger.warning(
                f"Conversation {conversation_id} not found for user {user_id}"
            )
            return CompressionResult.failure("Conversation not found")

        needs_compression = self.threshold_checker.should_compress(
            loaded, model_id, current_query_tokens
        )
        if needs_compression:
            return self._perform_compression(
                conversation_id, loaded, model_id, decoded_token
            )

        # Below the threshold: hand back the full, uncompressed history.
        return CompressionResult.success_no_compression(loaded.get("queries", []))
    except Exception as e:
        logger.error(f"Error in compress_if_needed: {str(e)}", exc_info=True)
        return CompressionResult.failure(str(e))
Args: conversation_id: Conversation ID conversation: Conversation document model_id: Model ID for conversation decoded_token: User token Returns: CompressionResult """ try: # Determine which model to use for compression compression_model = ( settings.COMPRESSION_MODEL_OVERRIDE if settings.COMPRESSION_MODEL_OVERRIDE else model_id ) # Get provider and API key for compression model provider = get_provider_from_model_id(compression_model) api_key = get_api_key_for_provider(provider) # Create compression LLM compression_llm = LLMCreator.create_llm( provider, api_key=api_key, user_api_key=None, decoded_token=decoded_token, model_id=compression_model, agent_id=conversation.get("agent_id"), ) # Create compression service with DB update capability compression_service = CompressionService( llm=compression_llm, model_id=compression_model, conversation_service=self.conversation_service, ) # Compress all queries up to the latest queries_count = len(conversation.get("queries", [])) compress_up_to = queries_count - 1 if compress_up_to < 0: logger.warning("No queries to compress") return CompressionResult.success_no_compression([]) logger.info( f"Initiating compression for conversation {conversation_id}: " f"compressing all {queries_count} queries (0-{compress_up_to})" ) # Perform compression and save to DB metadata = compression_service.compress_and_save( conversation_id, conversation, compress_up_to ) logger.info( f"Compression successful - ratio: {metadata.compression_ratio:.1f}x, " f"saved {metadata.original_token_count - metadata.compressed_token_count} tokens" ) # Reload conversation with updated metadata conversation = self.conversation_service.get_conversation( conversation_id, user_id=decoded_token.get("sub") ) # Get compressed context compressed_summary, recent_queries = ( compression_service.get_compressed_context(conversation) ) return CompressionResult.success_with_compression( compressed_summary, recent_queries, metadata ) except Exception as e: logger.error(f"Error 
def compress_mid_execution(
    self,
    conversation_id: str,
    user_id: str,
    model_id: str,
    decoded_token: Dict[str, Any],
    current_conversation: Optional[Dict[str, Any]] = None,
) -> CompressionResult:
    """
    Compress unconditionally while tools are executing (no threshold check).

    Args:
        conversation_id: Conversation ID
        user_id: User ID
        model_id: Model ID
        decoded_token: User token
        current_conversation: Pre-loaded conversation (optional); when it is
            missing or empty the conversation is loaded from the
            conversation service instead

    Returns:
        The result of ``_perform_compression``, or a failure result when no
        conversation is available or an exception is raised.
    """
    try:
        # Prefer the caller's copy; fall back to loading it.
        target = current_conversation or self.conversation_service.get_conversation(
            conversation_id, user_id
        )
        if target:
            return self._perform_compression(
                conversation_id, target, model_id, decoded_token
            )

        logger.warning(
            f"Could not load conversation {conversation_id} for mid-execution compression"
        )
        return CompressionResult.failure("Conversation not found")
    except Exception as e:
        logger.error(
            f"Error in mid-execution compression: {str(e)}", exc_info=True
        )
        return CompressionResult.failure(str(e))
def build_prompt(
    self,
    queries: List[Dict[str, Any]],
    existing_compressions: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, str]]:
    """
    Assemble the system/user message pair for the compression LLM call.

    The user message holds, in order: an optional preamble listing earlier
    compression summaries, a fixed heading, and the conversation as
    formatted by ``_format_conversation``.

    Args:
        queries: List of query objects to compress
        existing_compressions: List of previous compression points; each
            entry's ``query_index`` and ``compressed_summary`` are quoted

    Returns:
        List of message dicts for LLM
    """
    transcript = self._format_conversation(queries)

    user_sections = []
    if existing_compressions:
        user_sections.append(
            "\n\nIMPORTANT: This conversation has been compressed before. "
            "Previous compression summaries:\n\n"
        )
        for number, earlier in enumerate(existing_compressions, start=1):
            covered_up_to = earlier.get("query_index", "unknown")
            earlier_summary = earlier.get("compressed_summary", "")
            user_sections.append(
                f"--- Compression {number} (up to message {covered_up_to}) ---\n"
                f"{earlier_summary}\n\n"
            )
        user_sections.append(
            "Your task is to create a NEW summary that incorporates the context from "
            "previous compressions AND the new messages below. The final summary should "
            "be comprehensive and include all important information from both previous "
            "compressions and new messages.\n\n"
        )

    user_sections.append("Here is the conversation to summarize:\n\n")
    user_sections.append(f"{transcript}")

    return [
        {"role": "system", "content": self.system_prompt},
        {"role": "user", "content": "".join(user_sections)},
    ]
def compress_conversation(
    self,
    conversation: Dict[str, Any],
    compress_up_to_index: int,
) -> CompressionMetadata:
    """
    Summarize the first ``compress_up_to_index + 1`` queries of a conversation.

    The selected queries (plus any compression points already recorded on the
    conversation) are turned into a prompt, sent to ``self.llm``, and the
    cleaned-up reply is packaged with before/after token counts.

    Args:
        conversation: Full conversation document
        compress_up_to_index: Last query index to include in compression

    Returns:
        CompressionMetadata with compression details

    Raises:
        ValueError: If compress_up_to_index is invalid
        Exception: Any other failure is logged and re-raised unchanged
    """
    try:
        all_queries = conversation.get("queries", [])

        if not 0 <= compress_up_to_index < len(all_queries):
            raise ValueError(
                f"Invalid compress_up_to_index: {compress_up_to_index} "
                f"(conversation has {len(all_queries)} queries)"
            )

        # Slice is inclusive of the requested index
        selected = all_queries[: compress_up_to_index + 1]

        # Earlier summaries are fed back in so the new one supersedes them
        prior_points = conversation.get("compression_metadata", {}).get(
            "compression_points", []
        )
        if prior_points:
            logger.info(
                f"Found {len(prior_points)} previous compression(s) - "
                f"will incorporate into new summary"
            )

        tokens_before = TokenCounter.count_query_tokens(selected)
        self._log_tool_call_stats(selected)

        llm_messages = self.prompt_builder.build_prompt(selected, prior_points)

        logger.info(
            f"Starting compression: {len(selected)} queries "
            f"(messages 0-{compress_up_to_index}, {tokens_before} tokens) "
            f"using model {self.model_id}"
        )

        raw_reply = self.llm.gen(
            model=self.model_id, messages=llm_messages, max_tokens=4000
        )
        summary_text = self._extract_summary(raw_reply)

        tokens_after = TokenCounter.count_message_tokens(
            [{"content": summary_text}]
        )

        # Guard the division: an empty summary yields a ratio of 0
        if tokens_after > 0:
            ratio = tokens_before / tokens_after
        else:
            ratio = 0

        logger.info(
            f"Compression complete: {tokens_before} → {tokens_after} tokens "
            f"({ratio:.1f}x compression)"
        )

        return CompressionMetadata(
            timestamp=datetime.now(timezone.utc),
            query_index=compress_up_to_index,
            compressed_summary=summary_text,
            original_token_count=tokens_before,
            compressed_token_count=tokens_after,
            compression_ratio=ratio,
            model_used=self.model_id,
            compression_prompt_version=self.prompt_builder.version,
        )

    except Exception as exc:
        logger.error(f"Error compressing conversation: {str(exc)}", exc_info=True)
        raise
Args: conversation: Full conversation document Returns: (compressed_summary, recent_messages) """ try: compression_metadata = conversation.get("compression_metadata", {}) if not compression_metadata.get("is_compressed"): logger.debug("No compression metadata found - using full history") queries = conversation.get("queries", []) if queries is None: logger.error("Conversation queries is None - returning empty list") return None, [] return None, queries compression_points = compression_metadata.get("compression_points", []) if not compression_points: logger.debug("No compression points found - using full history") queries = conversation.get("queries", []) if queries is None: logger.error("Conversation queries is None - returning empty list") return None, [] return None, queries # Get the most recent compression point latest_compression = compression_points[-1] compressed_summary = latest_compression.get("compressed_summary") last_compressed_index = latest_compression.get("query_index") compressed_tokens = latest_compression.get("compressed_token_count", 0) original_tokens = latest_compression.get("original_token_count", 0) # Get only messages after compression point queries = conversation.get("queries", []) total_queries = len(queries) recent_queries = queries[last_compressed_index + 1 :] logger.info( f"Using compressed context: summary ({compressed_tokens} tokens, " f"compressed from {original_tokens}) + {len(recent_queries)} recent messages " f"(messages {last_compressed_index + 1}-{total_queries - 1})" ) return compressed_summary, recent_queries except Exception as e: logger.error( f"Error getting compressed context: {str(e)}", exc_info=True ) queries = conversation.get("queries", []) if queries is None: return None, [] return None, queries def _extract_summary(self, llm_response: str) -> str: """ Extract clean summary from LLM response. 
Args: llm_response: Raw LLM response Returns: Cleaned summary text """ try: # Try to extract content within tags summary_match = re.search( r"(.*?)", llm_response, re.DOTALL ) if summary_match: summary = summary_match.group(1).strip() else: # If no summary tags, remove analysis tags and use the rest summary = re.sub( r".*?", "", llm_response, flags=re.DOTALL ).strip() return summary except Exception as e: logger.warning(f"Error extracting summary: {str(e)}, using full response") return llm_response def _log_tool_call_stats(self, queries: List[Dict[str, Any]]) -> None: """Log statistics about tool calls in queries.""" total_tool_calls = 0 total_tool_result_chars = 0 tool_call_breakdown = {} for q in queries: for tc in q.get("tool_calls", []): total_tool_calls += 1 tool_name = tc.get("tool_name", "unknown") action_name = tc.get("action_name", "unknown") key = f"{tool_name}.{action_name}" tool_call_breakdown[key] = tool_call_breakdown.get(key, 0) + 1 # Track total tool result size result = tc.get("result", "") if result: total_tool_result_chars += len(str(result)) if total_tool_calls > 0: tool_breakdown_str = ", ".join( f"{tool}({count})" for tool, count in sorted(tool_call_breakdown.items()) ) tool_result_kb = total_tool_result_chars / 1024 logger.info( f"Tool call breakdown: {tool_breakdown_str} " f"(total result size: {tool_result_kb:.1f} KB, {total_tool_result_chars:,} chars)" ) ================================================ FILE: application/api/answer/services/compression/threshold_checker.py ================================================ """Compression threshold checking logic.""" import logging from typing import Any, Dict from application.core.model_utils import get_token_limit from application.core.settings import settings from application.api.answer.services.compression.token_counter import TokenCounter logger = logging.getLogger(__name__) class CompressionThresholdChecker: """Determines if compression is needed based on token thresholds.""" def 
def should_compress(
    self,
    conversation: Dict[str, Any],
    model_id: str,
    current_query_tokens: int = 500,
) -> bool:
    """
    Decide whether the conversation has grown enough to need compression.

    The conversation's token count plus ``current_query_tokens`` is compared
    against ``self.threshold_percentage`` of the model's token limit.

    Args:
        conversation: Full conversation document
        model_id: Target model for this request
        current_query_tokens: Estimated tokens for current query

    Returns:
        True if tokens >= threshold% of context window; False otherwise,
        including when any step of the check raises (the error is logged)
    """
    try:
        used_tokens = (
            TokenCounter.count_conversation_tokens(conversation)
            + current_query_tokens
        )
        window_size = get_token_limit(model_id)
        cutoff = int(window_size * self.threshold_percentage)

        over_cutoff = used_tokens >= cutoff
        usage_pct = (used_tokens / window_size) * 100

        if not over_cutoff:
            logger.info(
                f"Compression check: {used_tokens}/{window_size} tokens "
                f"({usage_pct:.1f}% used, threshold: {self.threshold_percentage * 100:.0f}%) - No compression needed"
            )
        else:
            logger.warning(
                f"COMPRESSION TRIGGERED: {used_tokens} tokens / {window_size} limit "
                f"({usage_pct:.1f}% used, threshold: {self.threshold_percentage * 100:.0f}%)"
            )

        return over_cutoff

    except Exception as exc:
        logger.error(f"Error checking compression need: {str(exc)}", exc_info=True)
        return False
Args: messages: List of message dicts model_id: Target model Returns: True if at or above threshold """ try: current_tokens = TokenCounter.count_message_tokens(messages) context_limit = get_token_limit(model_id) threshold = int(context_limit * self.threshold_percentage) if current_tokens >= threshold: logger.warning( f"Message context limit approaching: {current_tokens}/{context_limit} tokens " f"({(current_tokens/context_limit)*100:.1f}%)" ) return True return False except Exception as e: logger.error(f"Error checking message tokens: {str(e)}", exc_info=True) return False ================================================ FILE: application/api/answer/services/compression/token_counter.py ================================================ """Token counting utilities for compression.""" import logging from typing import Any, Dict, List from application.utils import num_tokens_from_string from application.core.settings import settings logger = logging.getLogger(__name__) class TokenCounter: """Centralized token counting for conversations and messages.""" @staticmethod def count_message_tokens(messages: List[Dict]) -> int: """ Calculate total tokens in a list of messages. Args: messages: List of message dicts with 'content' field Returns: Total token count """ total_tokens = 0 for message in messages: content = message.get("content", "") if isinstance(content, str): total_tokens += num_tokens_from_string(content) elif isinstance(content, list): # Handle structured content (tool calls, etc.) for item in content: if isinstance(item, dict): total_tokens += num_tokens_from_string(str(item)) return total_tokens @staticmethod def count_query_tokens( queries: List[Dict[str, Any]], include_tool_calls: bool = True ) -> int: """ Count tokens across multiple query objects. 
Args: queries: List of query objects from conversation include_tool_calls: Whether to count tool call tokens Returns: Total token count """ total_tokens = 0 for query in queries: # Count prompt and response tokens if "prompt" in query: total_tokens += num_tokens_from_string(query["prompt"]) if "response" in query: total_tokens += num_tokens_from_string(query["response"]) if "thought" in query: total_tokens += num_tokens_from_string(query.get("thought", "")) # Count tool call tokens if include_tool_calls and "tool_calls" in query: for tool_call in query["tool_calls"]: tool_call_string = ( f"Tool: {tool_call.get('tool_name')} | " f"Action: {tool_call.get('action_name')} | " f"Args: {tool_call.get('arguments')} | " f"Response: {tool_call.get('result')}" ) total_tokens += num_tokens_from_string(tool_call_string) return total_tokens @staticmethod def count_conversation_tokens( conversation: Dict[str, Any], include_system_prompt: bool = False ) -> int: """ Calculate total tokens in a conversation. 
@dataclass
class CompressionMetadata:
    """Record describing one completed compression pass."""

    # When the compression was produced
    timestamp: datetime
    # Index of the last query folded into the summary
    query_index: int
    # The summary text itself
    compressed_summary: str
    # Token counts before and after, and their ratio
    original_token_count: int
    compressed_token_count: int
    compression_ratio: float
    # Model and prompt-template version that produced the summary
    model_used: str
    compression_prompt_version: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict (values unchanged) for DB storage."""
        exported = (
            "timestamp",
            "query_index",
            "compressed_summary",
            "original_token_count",
            "compressed_token_count",
            "compression_ratio",
            "model_used",
            "compression_prompt_version",
        )
        return {name: getattr(self, name) for name in exported}


@dataclass
class CompressionResult:
    """Outcome of a compression attempt: success flag plus payload or error."""

    success: bool
    compressed_summary: Optional[str] = None
    recent_queries: List[Dict[str, Any]] = field(default_factory=list)
    metadata: Optional[CompressionMetadata] = None
    error: Optional[str] = None
    compression_performed: bool = False

    @classmethod
    def success_with_compression(
        cls, summary: str, queries: List[Dict], metadata: CompressionMetadata
    ) -> "CompressionResult":
        """Build a successful result that carries a fresh summary."""
        outcome = cls(success=True, compression_performed=True)
        outcome.compressed_summary = summary
        outcome.recent_queries = queries
        outcome.metadata = metadata
        return outcome

    @classmethod
    def success_no_compression(cls, queries: List[Dict]) -> "CompressionResult":
        """Build a successful result for the case where nothing was compressed."""
        return cls(True, recent_queries=queries, compression_performed=False)

    @classmethod
    def failure(cls, error: str) -> "CompressionResult":
        """Build a failed result carrying only the error text."""
        return cls(False, error=error, compression_performed=False)

    def as_history(self) -> List[Dict[str, str]]:
        """
        Reduce ``recent_queries`` to prompt/response pairs.

        Returns:
            One ``{"prompt": ..., "response": ...}`` dict per recent query

        Raises:
            KeyError: If a query lacks 'prompt' or 'response'
        """
        history: List[Dict[str, str]] = []
        for query in self.recent_queries:
            history.append(
                {"prompt": query["prompt"], "response": query["response"]}
            )
        return history
def save_conversation(
    self,
    conversation_id: Optional[str],
    question: str,
    response: str,
    thought: str,
    sources: List[Dict[str, Any]],
    tool_calls: List[Dict[str, Any]],
    llm: Any,
    model_id: str,
    decoded_token: Dict[str, Any],
    index: Optional[int] = None,
    api_key: Optional[str] = None,
    agent_id: Optional[str] = None,
    is_shared_usage: bool = False,
    shared_token: Optional[str] = None,
    attachment_ids: Optional[List[str]] = None,
) -> str:
    """
    Save or update a conversation in the database.

    One of three write paths is taken:

    * ``conversation_id`` and ``index`` both given: overwrite the query at
      ``index`` and drop every query after it.
    * only ``conversation_id`` given: append a new query.
    * no ``conversation_id``: ask ``llm`` for a short title and insert a new
      conversation document.

    Note: the 'text' of each entry in ``sources`` is truncated in place to
    1000 characters before saving, so the caller's list is modified.

    Args:
        conversation_id: Existing conversation ID, or None to create one
        question: User prompt for this query
        response: Assistant response for this query
        thought: Agent thought stored with the query
        sources: Source documents stored with the query
        tool_calls: Tool calls stored with the query
        llm: Object exposing ``gen(model=, messages=, max_tokens=)``; used
            only when a new conversation needs a title
        model_id: Model ID stored with the query and used for the title call
        decoded_token: Auth token payload; its 'sub' is the owning user ID
        index: Position of the query to overwrite (optional)
        api_key: Agent API key (optional, new conversations only)
        agent_id: Agent ID stored on a new conversation when api_key is set
        is_shared_usage: Whether the agent is being used via sharing
        shared_token: Share token stored alongside is_shared_usage
        attachment_ids: Attachment IDs stored with the query

    Returns:
        The conversation ID (the given one, or the newly inserted ID)

    Raises:
        ValueError: If the token is missing or has no 'sub', or if no
            conversation matches the ID/user (and index, when given)
    """
    if decoded_token is None:
        raise ValueError("Invalid or missing authentication token")
    user_id = decoded_token.get("sub")
    if not user_id:
        raise ValueError("User ID not found in token")
    current_time = datetime.now(timezone.utc)

    # Cap the stored text of every source at 1000 characters (mutates `sources`)
    for source in sources:
        if "text" in source and isinstance(source["text"], str):
            source["text"] = source["text"][:1000]

    if conversation_id is not None and index is not None:
        # Overwrite the existing query at `index`; the filter also requires
        # that the slot exists and that the conversation belongs to the user
        result = self.conversations_collection.update_one(
            {
                "_id": ObjectId(conversation_id),
                "user": user_id,
                f"queries.{index}": {"$exists": True},
            },
            {
                "$set": {
                    f"queries.{index}.prompt": question,
                    f"queries.{index}.response": response,
                    f"queries.{index}.thought": thought,
                    f"queries.{index}.sources": sources,
                    f"queries.{index}.tool_calls": tool_calls,
                    f"queries.{index}.timestamp": current_time,
                    f"queries.{index}.attachments": attachment_ids,
                    f"queries.{index}.model_id": model_id,
                }
            },
        )

        if result.matched_count == 0:
            raise ValueError("Conversation not found or unauthorized")
        # Pushing nothing with $slice trims the array to its first
        # index + 1 entries, discarding the queries after the edited one
        self.conversations_collection.update_one(
            {
                "_id": ObjectId(conversation_id),
                "user": user_id,
                f"queries.{index}": {"$exists": True},
            },
            {"$push": {"queries": {"$each": [], "$slice": index + 1}}},
        )
        return conversation_id
    elif conversation_id:
        # Append a new query to a conversation owned by this user
        result = self.conversations_collection.update_one(
            {"_id": ObjectId(conversation_id), "user": user_id},
            {
                "$push": {
                    "queries": {
                        "prompt": question,
                        "response": response,
                        "thought": thought,
                        "sources": sources,
                        "tool_calls": tool_calls,
                        "timestamp": current_time,
                        "attachments": attachment_ids,
                        "model_id": model_id,
                    }
                }
            },
        )
        if result.matched_count == 0:
            raise ValueError("Conversation not found or unauthorized")
        return conversation_id
    else:
        # Create a new conversation; the LLM supplies a short title
        messages_summary = [
            {
                "role": "system",
                "content": "You are a helpful assistant that creates concise conversation titles. "
                "Summarize conversations in 3 words or less using the same language as the user.",
            },
            {
                "role": "user",
                "content": "Summarise following conversation in no more than 3 words, "
                "respond ONLY with the summary, use the same language as the "
                "user query \n\nUser: " + question + "\n\n" + "AI: " + response,
            },
        ]

        completion = llm.gen(
            model=model_id, messages=messages_summary, max_tokens=500
        )

        # Blank title from the LLM: fall back to the start of the question
        if not completion or not completion.strip():
            completion = question[:50] if question else "New Conversation"

        conversation_data = {
            "user": user_id,
            "date": current_time,
            "name": completion,
            "queries": [
                {
                    "prompt": question,
                    "response": response,
                    "thought": thought,
                    "sources": sources,
                    "tool_calls": tool_calls,
                    "timestamp": current_time,
                    "attachments": attachment_ids,
                    "model_id": model_id,
                }
            ],
        }

        # Agent-related fields are only recorded for API-key requests
        if api_key:
            if agent_id:
                conversation_data["agent_id"] = agent_id
                if is_shared_usage:
                    conversation_data["is_shared_usage"] = is_shared_usage
                    conversation_data["shared_token"] = shared_token
            # Store the key only when it resolves to a known agent
            agent = self.agents_collection.find_one({"key": api_key})
            if agent:
                conversation_data["api_key"] = agent["key"]
        result = self.conversations_collection.insert_one(conversation_data)
        return str(result.inserted_id)
def append_compression_message(
    self, conversation_id: str, compression_metadata: Dict[str, Any]
) -> None:
    """
    Push a synthetic "[Context Compression Summary]" query onto the conversation.

    The entry carries the summary as its response so that it is stored in
    the queries array next to ordinary exchanges. Nothing is written when
    the metadata has no (or an empty) 'compressed_summary'. Failures are
    logged and not re-raised.

    Args:
        conversation_id: Conversation ID
        compression_metadata: Compression point data ('compressed_summary',
            optional 'timestamp', optional 'model_used')
    """
    try:
        summary_text = compression_metadata.get("compressed_summary", "")
        if summary_text:
            # Fall back to "now" only when the metadata has no timestamp key
            if "timestamp" in compression_metadata:
                entry_time = compression_metadata["timestamp"]
            else:
                entry_time = datetime.now(timezone.utc)

            synthetic_query = {
                "prompt": "[Context Compression Summary]",
                "response": summary_text,
                "thought": "",
                "sources": [],
                "tool_calls": [],
                "timestamp": entry_time,
                "attachments": [],
                "model_id": compression_metadata.get("model_used"),
            }
            self.conversations_collection.update_one(
                {"_id": ObjectId(conversation_id)},
                {"$push": {"queries": synthetic_query}},
            )
            logger.info(f"Appended compression summary to conversation {conversation_id}")
    except Exception as exc:
        logger.error(
            f"Error appending compression summary: {str(exc)}", exc_info=True
        )
def render_prompt(
    self,
    prompt_content: str,
    user_id: Optional[str] = None,
    request_id: Optional[str] = None,
    passthrough_data: Optional[Dict[str, Any]] = None,
    docs: Optional[list] = None,
    docs_together: Optional[str] = None,
    tools_data: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> str:
    """
    Turn a raw prompt into its final text.

    Prompts without template markers go through the legacy placeholder
    substitution; prompts with them are rendered by the template engine
    against a context assembled by the namespace manager.

    Args:
        prompt_content: Raw prompt template string
        user_id: Current user identifier
        request_id: Unique request identifier
        passthrough_data: Parameters from web request
        docs: RAG retrieved documents
        docs_together: Concatenated document content
        tools_data: Pre-fetched tool results organized by tool name
        **kwargs: Additional parameters for namespace builders

    Returns:
        Rendered prompt string; "" when prompt_content is empty or None

    Raises:
        TemplateRenderError: If template rendering fails (other exceptions
            are logged and wrapped in this type)
    """
    if not prompt_content:
        return ""

    if not self._uses_template_syntax(prompt_content):
        # Old-style prompt: plain placeholder replacement only
        return self._apply_legacy_substitutions(prompt_content, docs_together)

    try:
        render_context = self.namespace_manager.build_context(
            user_id=user_id,
            request_id=request_id,
            passthrough_data=passthrough_data,
            docs=docs,
            docs_together=docs_together,
            tools_data=tools_data,
            **kwargs,
        )
        rendered = self.template_engine.render(prompt_content, render_context)
        return rendered
    except TemplateRenderError:
        # Already the right type: let it through untouched
        raise
    except Exception as exc:
        error_msg = f"Prompt rendering failed: {str(exc)}"
        logger.error(error_msg)
        raise TemplateRenderError(error_msg) from exc
def get_prompt(prompt_id: str, prompts_collection=None) -> str:
    """
    Get a prompt by preset name or MongoDB ID.

    Preset names ("default", "creative", "strict", "reduce") are read from
    text files in the ``prompts`` directory located under ``parents[3]`` of
    this module's resolved path. Any other value is treated as the ``_id``
    of a document in the prompts collection.

    Args:
        prompt_id: Preset name or prompt document ID
        prompts_collection: Collection to query; when None, the "prompts"
            collection of the configured database is used

    Returns:
        Prompt text

    Raises:
        FileNotFoundError: If the file for a preset name is missing
        ValueError: "Prompt with ID ... not found" when no document matches;
            "Invalid prompt ID: ..." when the ID cannot be looked up or the
            document has no usable 'content' (original error is chained)
    """
    current_dir = Path(__file__).resolve().parents[3]
    prompts_dir = current_dir / "prompts"

    preset_mapping = {
        "default": "chat_combine_default.txt",
        "creative": "chat_combine_creative.txt",
        "strict": "chat_combine_strict.txt",
        "reduce": "chat_reduce_prompt.txt",
    }

    if prompt_id in preset_mapping:
        file_path = os.path.join(prompts_dir, preset_mapping[prompt_id])
        try:
            # Explicit UTF-8 so the result does not depend on the platform
            # default encoding
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"Prompt file not found: {file_path}") from exc

    try:
        if prompts_collection is None:
            mongo = MongoDB.get_client()
            db = mongo[settings.MONGO_DB_NAME]
            prompts_collection = db["prompts"]
        prompt_doc = prompts_collection.find_one({"_id": ObjectId(prompt_id)})
    except Exception as e:
        raise ValueError(f"Invalid prompt ID: {prompt_id}") from e

    # Raised outside the try block so the specific "not found" message is
    # not re-wrapped as "Invalid prompt ID"
    if not prompt_doc:
        raise ValueError(f"Prompt with ID {prompt_id} not found")

    try:
        return prompt_doc["content"]
    except Exception as e:
        raise ValueError(f"Invalid prompt ID: {prompt_id}") from e
self._configure_retriever() self._load_conversation_history() self._process_attachments() def _load_conversation_history(self): """Load conversation history either from DB or request""" if self.conversation_id and self.initial_user_id: conversation = self.conversation_service.get_conversation( self.conversation_id, self.initial_user_id ) if not conversation: raise ValueError("Conversation not found or unauthorized") # Check if compression is enabled and needed if settings.ENABLE_CONVERSATION_COMPRESSION: self._handle_compression(conversation) else: # Original behavior - load all history self.history = [ {"prompt": query["prompt"], "response": query["response"]} for query in conversation.get("queries", []) ] else: self.history = limit_chat_history( json.loads(self.data.get("history", "[]")), model_id=self.model_id ) def _handle_compression(self, conversation: Dict[str, Any]): """ Handle conversation compression logic using orchestrator. Args: conversation: Full conversation document """ try: # Use orchestrator to handle all compression logic result = self.compression_orchestrator.compress_if_needed( conversation_id=self.conversation_id, user_id=self.initial_user_id, model_id=self.model_id, decoded_token=self.decoded_token, ) if not result.success: logger.error(f"Compression failed: {result.error}, using full history") self.history = [ {"prompt": query["prompt"], "response": query["response"]} for query in conversation.get("queries", []) ] return # Set compressed summary if compression was performed if result.compression_performed and result.compressed_summary: self.compressed_summary = result.compressed_summary self.compressed_summary_tokens = TokenCounter.count_message_tokens( [{"content": result.compressed_summary}] ) logger.info( f"Using compressed summary ({self.compressed_summary_tokens} tokens) " f"+ {len(result.recent_queries)} recent messages" ) # Build history from recent queries self.history = result.as_history() except Exception as e: logger.error( 
f"Error handling compression, falling back to standard history: {str(e)}", exc_info=True, ) # Fallback to original behavior self.history = [ {"prompt": query["prompt"], "response": query["response"]} for query in conversation.get("queries", []) ] def _process_attachments(self): """Process any attachments in the request""" attachment_ids = self.data.get("attachments", []) self.attachments = self._get_attachments_content( attachment_ids, self.initial_user_id ) def _get_attachments_content(self, attachment_ids, user_id): """ Retrieve content from attachment documents based on their IDs. """ if not attachment_ids: return [] attachments = [] for attachment_id in attachment_ids: try: attachment_doc = self.attachments_collection.find_one( {"_id": ObjectId(attachment_id), "user": user_id} ) if attachment_doc: attachments.append(attachment_doc) except Exception as e: logger.error( f"Error retrieving attachment {attachment_id}: {e}", exc_info=True ) return attachments def _validate_and_set_model(self): """Validate and set model_id from request""" from application.core.model_settings import ModelRegistry requested_model = self.data.get("model_id") if requested_model: if not validate_model_id(requested_model): registry = ModelRegistry.get_instance() available_models = [m.id for m in registry.get_enabled_models()] raise ValueError( f"Invalid model_id '{requested_model}'. 
" f"Available models: {', '.join(available_models[:5])}" + ( f" and {len(available_models) - 5} more" if len(available_models) > 5 else "" ) ) self.model_id = requested_model else: # Check if agent has a default model configured agent_default_model = self.agent_config.get("default_model_id", "") if agent_default_model and validate_model_id(agent_default_model): self.model_id = agent_default_model else: self.model_id = get_default_model_id() def _get_agent_key(self, agent_id: Optional[str], user_id: Optional[str]) -> tuple: """Get API key for agent with access control""" if not agent_id: return None, False, None try: agent = self.agents_collection.find_one({"_id": ObjectId(agent_id)}) if agent is None: raise Exception("Agent not found") is_owner = agent.get("user") == user_id is_shared_with_user = agent.get( "shared_publicly", False ) or user_id in agent.get("shared_with", []) if not (is_owner or is_shared_with_user): raise Exception("Unauthorized access to the agent") if is_owner: self.agents_collection.update_one( {"_id": ObjectId(agent_id)}, { "$set": { "lastUsedAt": datetime.datetime.now(datetime.timezone.utc) } }, ) return str(agent["key"]), not is_owner, agent.get("shared_token") except Exception as e: logger.error(f"Error in get_agent_key: {str(e)}", exc_info=True) raise def _get_data_from_api_key(self, api_key: str) -> Dict[str, Any]: data = self.agents_collection.find_one({"key": api_key}) if not data: raise Exception("Invalid API Key, please generate a new key", 401) source = data.get("source") if isinstance(source, DBRef): source_doc = self.db.dereference(source) if source_doc: data["source"] = str(source_doc["_id"]) data["retriever"] = source_doc.get("retriever", data.get("retriever")) data["chunks"] = source_doc.get("chunks", data.get("chunks")) else: data["source"] = None elif source == "default": data["source"] = "default" else: data["source"] = None # Handle multiple sources sources = data.get("sources", []) if sources and isinstance(sources, list): 
sources_list = []
            for i, source_ref in enumerate(sources):
                if source_ref == "default":
                    processed_source = {
                        "id": "default",
                        "retriever": "classic",
                        "chunks": data.get("chunks", "2"),
                    }
                    sources_list.append(processed_source)
                elif isinstance(source_ref, DBRef):
                    source_doc = self.db.dereference(source_ref)
                    # References that no longer resolve are dropped from the list.
                    if source_doc:
                        processed_source = {
                            "id": str(source_doc["_id"]),
                            "retriever": source_doc.get("retriever", "classic"),
                            "chunks": source_doc.get("chunks", data.get("chunks", "2")),
                        }
                        sources_list.append(processed_source)
            data["sources"] = sources_list
        else:
            data["sources"] = []
        # Preserve model configuration from agent
        data["default_model_id"] = data.get("default_model_id", "")
        return data

    def _configure_source(self):
        """Configure the source based on agent data

        With an API key (from the request or the resolved agent key) the
        agent's ``sources`` list wins over its single ``source``. Without a
        key, the request's ``active_docs`` is used when present. Otherwise
        ``source`` and ``all_sources`` are reset to empty.
        """
        api_key = self.data.get("api_key") or self.agent_key
        if api_key:
            agent_data = self._get_data_from_api_key(api_key)
            if agent_data.get("sources") and len(agent_data["sources"]) > 0:
                source_ids = [
                    source["id"] for source in agent_data["sources"] if source.get("id")
                ]
                if source_ids:
                    self.source = {"active_docs": source_ids}
                else:
                    self.source = {}
                self.all_sources = agent_data["sources"]
            elif agent_data.get("source"):
                self.source = {"active_docs": agent_data["source"]}
                self.all_sources = [
                    {
                        "id": agent_data["source"],
                        "retriever": agent_data.get("retriever", "classic"),
                    }
                ]
            else:
                self.source = {}
                self.all_sources = []
            return
        if "active_docs" in self.data:
            # Only `source` is assigned on this path; `all_sources` is left as it was.
            self.source = {"active_docs": self.data["active_docs"]}
            return
        self.source = {}
        self.all_sources = []

    def _resolve_agent_id(self) -> Optional[str]:
        """Resolve agent_id from request, then fall back to conversation context."""
        request_agent_id = self.data.get("agent_id")
        if request_agent_id:
            return str(request_agent_id)
        if not self.conversation_id or not self.initial_user_id:
            return None
        try:
            conversation = self.conversation_service.get_conversation(
                self.conversation_id, self.initial_user_id
            )
        except Exception:
            # A failed lookup is treated the same as "no agent attached".
            return None
        if not conversation:
            return None
conversation_agent_id = conversation.get("agent_id") if conversation_agent_id: return str(conversation_agent_id) return None def _configure_agent(self): """Configure the agent based on request data""" agent_id = self._resolve_agent_id() self.agent_key, self.is_shared_usage, self.shared_token = self._get_agent_key( agent_id, self.initial_user_id ) self.agent_id = str(agent_id) if agent_id else None api_key = self.data.get("api_key") if api_key: data_key = self._get_data_from_api_key(api_key) if data_key.get("_id"): self.agent_id = str(data_key.get("_id")) self.agent_config.update( { "prompt_id": data_key.get("prompt_id", "default"), "agent_type": data_key.get("agent_type", settings.AGENT_NAME), "user_api_key": api_key, "json_schema": data_key.get("json_schema"), "default_model_id": data_key.get("default_model_id", ""), } ) self.initial_user_id = data_key.get("user") self.decoded_token = {"sub": data_key.get("user")} if data_key.get("source"): self.source = {"active_docs": data_key["source"]} if data_key.get("workflow"): self.agent_config["workflow"] = data_key["workflow"] self.agent_config["workflow_owner"] = data_key.get("user") if data_key.get("retriever"): self.retriever_config["retriever_name"] = data_key["retriever"] if data_key.get("chunks") is not None: try: self.retriever_config["chunks"] = int(data_key["chunks"]) except (ValueError, TypeError): logger.warning( f"Invalid chunks value: {data_key['chunks']}, using default value 2" ) self.retriever_config["chunks"] = 2 elif self.agent_key: data_key = self._get_data_from_api_key(self.agent_key) if data_key.get("_id"): self.agent_id = str(data_key.get("_id")) self.agent_config.update( { "prompt_id": data_key.get("prompt_id", "default"), "agent_type": data_key.get("agent_type", settings.AGENT_NAME), "user_api_key": self.agent_key, "json_schema": data_key.get("json_schema"), "default_model_id": data_key.get("default_model_id", ""), } ) self.decoded_token = ( self.decoded_token if self.is_shared_usage else {"sub": 
data_key.get("user")} ) if data_key.get("source"): self.source = {"active_docs": data_key["source"]} if data_key.get("workflow"): self.agent_config["workflow"] = data_key["workflow"] self.agent_config["workflow_owner"] = data_key.get("user") if data_key.get("retriever"): self.retriever_config["retriever_name"] = data_key["retriever"] if data_key.get("chunks") is not None: try: self.retriever_config["chunks"] = int(data_key["chunks"]) except (ValueError, TypeError): logger.warning( f"Invalid chunks value: {data_key['chunks']}, using default value 2" ) self.retriever_config["chunks"] = 2 else: agent_type = settings.AGENT_NAME if self.data.get("workflow") and isinstance( self.data.get("workflow"), dict ): agent_type = "workflow" self.agent_config["workflow"] = self.data["workflow"] if isinstance(self.decoded_token, dict): self.agent_config["workflow_owner"] = self.decoded_token.get("sub") self.agent_config.update( { "prompt_id": self.data.get("prompt_id", "default"), "agent_type": agent_type, "user_api_key": None, "json_schema": None, "default_model_id": "", } ) def _configure_retriever(self): doc_token_limit = calculate_doc_token_budget(model_id=self.model_id) self.retriever_config = { "retriever_name": self.data.get("retriever", "classic"), "chunks": int(self.data.get("chunks", 2)), "doc_token_limit": doc_token_limit, } api_key = self.data.get("api_key") or self.agent_key if not api_key and "isNoneDoc" in self.data and self.data["isNoneDoc"]: self.retriever_config["chunks"] = 0 def create_retriever(self): return RetrieverCreator.create_retriever( self.retriever_config["retriever_name"], source=self.source, chat_history=self.history, prompt=get_prompt(self.agent_config["prompt_id"], self.prompts_collection), chunks=self.retriever_config["chunks"], doc_token_limit=self.retriever_config.get("doc_token_limit", 50000), model_id=self.model_id, user_api_key=self.agent_config["user_api_key"], agent_id=self.agent_id, decoded_token=self.decoded_token, ) def 
pre_fetch_docs(self, question: str) -> tuple[Optional[str], Optional[list]]: """Pre-fetch documents for template rendering before agent creation""" if self.data.get("isNoneDoc", False) and not self.agent_id: logger.info("Pre-fetch skipped: isNoneDoc=True") return None, None try: retriever = self.create_retriever() logger.info( f"Pre-fetching docs with chunks={retriever.chunks}, doc_token_limit={retriever.doc_token_limit}" ) docs = retriever.search(question) logger.info(f"Pre-fetch retrieved {len(docs) if docs else 0} documents") if not docs: logger.info("Pre-fetch: No documents returned from search") return None, None self.retrieved_docs = docs docs_with_filenames = [] for doc in docs: filename = doc.get("filename") or doc.get("title") or doc.get("source") if filename: chunk_header = str(filename) docs_with_filenames.append(f"{chunk_header}\n{doc['text']}") else: docs_with_filenames.append(doc["text"]) docs_together = "\n\n".join(docs_with_filenames) logger.info(f"Pre-fetch docs_together size: {len(docs_together)} chars") return docs_together, docs except Exception as e: logger.error(f"Failed to pre-fetch docs: {str(e)}", exc_info=True) return None, None def pre_fetch_tools(self) -> Optional[Dict[str, Any]]: """Pre-fetch tool data for template rendering before agent creation Can be controlled via: 1. Global setting: ENABLE_TOOL_PREFETCH in .env 2. 
Per-request: disable_tool_prefetch in request data """ if not settings.ENABLE_TOOL_PREFETCH: logger.info( "Tool pre-fetching disabled globally via ENABLE_TOOL_PREFETCH setting" ) return None if self.data.get("disable_tool_prefetch", False): logger.info("Tool pre-fetching disabled for this request") return None required_tool_actions = self._get_required_tool_actions() filtering_enabled = required_tool_actions is not None try: user_tools_collection = self.db["user_tools"] user_id = self.initial_user_id or "local" user_tools = list( user_tools_collection.find({"user": user_id, "status": True}) ) if not user_tools: return None tools_data = {} for tool_doc in user_tools: tool_name = tool_doc.get("name") tool_id = str(tool_doc.get("_id")) if filtering_enabled: required_actions_by_name = required_tool_actions.get( tool_name, set() ) required_actions_by_id = required_tool_actions.get(tool_id, set()) required_actions = required_actions_by_name | required_actions_by_id if not required_actions: continue else: required_actions = None tool_data = self._fetch_tool_data(tool_doc, required_actions) if tool_data: tools_data[tool_name] = tool_data tools_data[tool_id] = tool_data return tools_data if tools_data else None except Exception as e: logger.warning(f"Failed to pre-fetch tools: {type(e).__name__}") return None def _fetch_tool_data( self, tool_doc: Dict[str, Any], required_actions: Optional[Set[Optional[str]]], ) -> Optional[Dict[str, Any]]: """Fetch and execute tool actions with saved parameters""" try: from application.agents.tools.tool_manager import ToolManager tool_name = tool_doc.get("name") tool_config = tool_doc.get("config", {}).copy() tool_config["tool_id"] = str(tool_doc["_id"]) tool_manager = ToolManager(config={tool_name: tool_config}) user_id = self.initial_user_id or "local" tool = tool_manager.load_tool(tool_name, tool_config, user_id=user_id) if not tool: logger.debug(f"Tool '{tool_name}' failed to load") return None tool_actions = tool.get_actions_metadata() 
if not tool_actions: logger.debug(f"Tool '{tool_name}' has no actions") return None saved_actions = tool_doc.get("actions", []) include_all_actions = required_actions is None or ( required_actions and None in required_actions ) allowed_actions: Set[str] = ( {action for action in required_actions if isinstance(action, str)} if required_actions else set() ) action_results = {} for action_meta in tool_actions: action_name = action_meta.get("name") if action_name is None: continue if ( not include_all_actions and allowed_actions and action_name not in allowed_actions ): continue try: saved_action = None for sa in saved_actions: if sa.get("name") == action_name: saved_action = sa break action_params = action_meta.get("parameters", {}) properties = action_params.get("properties", {}) kwargs = {} for param_name, param_spec in properties.items(): if saved_action: saved_props = saved_action.get("parameters", {}).get( "properties", {} ) if param_name in saved_props: param_value = saved_props[param_name].get("value") if param_value is not None: kwargs[param_name] = param_value continue if param_name in tool_config: kwargs[param_name] = tool_config[param_name] elif "default" in param_spec: kwargs[param_name] = param_spec["default"] result = tool.execute_action(action_name, **kwargs) action_results[action_name] = result except Exception as e: logger.debug( f"Action '{action_name}' execution failed: {type(e).__name__}" ) continue return action_results if action_results else None except Exception as e: logger.debug(f"Tool pre-fetch failed for '{tool_name}': {type(e).__name__}") return None def _get_prompt_content(self) -> Optional[str]: """Retrieve and cache the raw prompt content for the current agent configuration.""" if self._prompt_content is not None: return self._prompt_content prompt_id = ( self.agent_config.get("prompt_id") if isinstance(self.agent_config, dict) else None ) if not prompt_id: return None try: self._prompt_content = get_prompt(prompt_id, 
self.prompts_collection)
        except ValueError as e:
            logger.debug(f"Invalid prompt ID '{prompt_id}': {str(e)}")
            self._prompt_content = None
        except Exception as e:
            logger.debug(f"Failed to fetch prompt '{prompt_id}': {type(e).__name__}")
            self._prompt_content = None
        return self._prompt_content

    def _get_required_tool_actions(self) -> Optional[Dict[str, Set[Optional[str]]]]:
        """Determine which tool actions are referenced in the prompt template

        Returns ``None`` when no prompt content is available, and an empty dict
        when the prompt has no ``{{ }}`` markers or extraction fails.
        The result is cached on the instance.
        """
        if self._required_tool_actions is not None:
            return self._required_tool_actions
        prompt_content = self._get_prompt_content()
        if prompt_content is None:
            return None
        # Cheap pre-check: without template delimiters nothing can be referenced.
        if "{{" not in prompt_content or "}}" not in prompt_content:
            self._required_tool_actions = {}
            return self._required_tool_actions
        try:
            from application.templates.template_engine import TemplateEngine

            template_engine = TemplateEngine()
            usages = template_engine.extract_tool_usages(prompt_content)
            self._required_tool_actions = usages
            return self._required_tool_actions
        except Exception as e:
            logger.debug(f"Failed to extract tool usages: {type(e).__name__}")
            self._required_tool_actions = {}
            return self._required_tool_actions

    def _fetch_memory_tool_data(
        self, tool_doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """Fetch memory tool data for pre-injection into prompt

        Returns ``{"root": <view of "/">, "available": True}``, or ``None`` when
        the view is blank, contains ``"Error:"``, or any step raises.
        """
        try:
            tool_config = tool_doc.get("config", {}).copy()
            tool_config["tool_id"] = str(tool_doc["_id"])
            from application.agents.tools.memory import MemoryTool

            memory_tool = MemoryTool(tool_config, self.initial_user_id)
            root_view = memory_tool.execute_action("view", path="/")
            if "Error:" in root_view or not root_view.strip():
                return None
            return {"root": root_view, "available": True}
        except Exception as e:
            logger.warning(f"Failed to fetch memory tool data: {str(e)}")
            return None

    def create_agent(
        self,
        docs_together: Optional[str] = None,
        docs: Optional[list] = None,
        tools_data: Optional[Dict[str, Any]] = None,
    ):
        """Create and return the configured agent with rendered prompt

        Args:
            docs_together: Joined document text passed to the prompt renderer.
            docs: Document list passed to the prompt renderer.
            tools_data: Pre-fetched tool results passed to the prompt renderer.
        """
        raw_prompt = self._get_prompt_content()
        if raw_prompt is None:
            # The cached getter swallows lookup errors; this direct call lets them surface.
            raw_prompt = get_prompt(
                self.agent_config["prompt_id"], self.prompts_collection
            )
            self._prompt_content = raw_prompt
        rendered_prompt = self.prompt_renderer.render_prompt(
            prompt_content=raw_prompt,
            user_id=self.initial_user_id,
            request_id=self.data.get("request_id"),
            passthrough_data=self.data.get("passthrough"),
            docs=docs,
            docs_together=docs_together,
            tools_data=tools_data,
        )
        provider = (
            get_provider_from_model_id(self.model_id)
            if self.model_id
            else settings.LLM_PROVIDER
        )
        system_api_key = get_api_key_for_provider(provider or settings.LLM_PROVIDER)
        agent_type = self.agent_config["agent_type"]
        # Base agent kwargs
        agent_kwargs = {
            "endpoint": "stream",
            "llm_name": provider or settings.LLM_PROVIDER,
            "model_id": self.model_id,
            "api_key": system_api_key,
            "agent_id": self.agent_id,
            "user_api_key": self.agent_config["user_api_key"],
            "prompt": rendered_prompt,
            "chat_history": self.history,
            "retrieved_docs": self.retrieved_docs,
            "decoded_token": self.decoded_token,
            "attachments": self.attachments,
            "json_schema": self.agent_config.get("json_schema"),
            "compressed_summary": self.compressed_summary,
        }
        # Workflow-specific kwargs for workflow agents
        if agent_type == "workflow":
            workflow_config = self.agent_config.get("workflow")
            # A string is passed as workflow_id, a dict as an inline workflow.
            if isinstance(workflow_config, str):
                agent_kwargs["workflow_id"] = workflow_config
            elif isinstance(workflow_config, dict):
                agent_kwargs["workflow"] = workflow_config
            workflow_owner = self.agent_config.get("workflow_owner")
            if workflow_owner:
                agent_kwargs["workflow_owner"] = workflow_owner
        agent = AgentCreator.create_agent(agent_type, **agent_kwargs)
        agent.conversation_id = self.conversation_id
        agent.initial_user_id = self.initial_user_id
        return agent


================================================
FILE: application/api/connector/routes.py
================================================
import base64
import datetime
import html
import json
import uuid
from urllib.parse import urlencode
from
bson.objectid import ObjectId from flask import ( Blueprint, current_app, jsonify, make_response, request ) from flask_restx import fields, Namespace, Resource from application.api.user.tasks import ( ingest_connector_task, ) from application.core.mongo_db import MongoDB from application.core.settings import settings from application.api import api from application.parser.connectors.connector_creator import ConnectorCreator mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] sources_collection = db["sources"] sessions_collection = db["connector_sessions"] connector = Blueprint("connector", __name__) connectors_ns = Namespace("connectors", description="Connector operations", path="/") api.add_namespace(connectors_ns) # Fixed callback status path to prevent open redirect CALLBACK_STATUS_PATH = "/api/connectors/callback-status" def build_callback_redirect(params: dict) -> str: """Build a safe redirect URL to the callback status page. Uses a fixed path and properly URL-encodes all parameters to prevent URL injection and open redirect vulnerabilities. 
""" return f"{CALLBACK_STATUS_PATH}?{urlencode(params)}" @connectors_ns.route("/api/connectors/auth") class ConnectorAuth(Resource): @api.doc(description="Get connector OAuth authorization URL", params={"provider": "Connector provider (e.g., google_drive)"}) def get(self): try: provider = request.args.get('provider') or request.args.get('source') if not provider: return make_response(jsonify({"success": False, "error": "Missing provider"}), 400) if not ConnectorCreator.is_supported(provider): return make_response(jsonify({"success": False, "error": f"Unsupported provider: {provider}"}), 400) decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False, "error": "Unauthorized"}), 401) user_id = decoded_token.get('sub') now = datetime.datetime.now(datetime.timezone.utc) result = sessions_collection.insert_one({ "provider": provider, "user": user_id, "status": "pending", "created_at": now }) state_dict = { "provider": provider, "object_id": str(result.inserted_id) } state = base64.urlsafe_b64encode(json.dumps(state_dict).encode()).decode() auth = ConnectorCreator.create_auth(provider) authorization_url = auth.get_authorization_url(state=state) return make_response(jsonify({ "success": True, "authorization_url": authorization_url, "state": state }), 200) except Exception as e: current_app.logger.error(f"Error generating connector auth URL: {e}", exc_info=True) return make_response(jsonify({"success": False, "error": "Failed to generate authorization URL"}), 500) @connectors_ns.route("/api/connectors/callback") class ConnectorsCallback(Resource): @api.doc(description="Handle OAuth callback for external connectors") def get(self): """Handle OAuth callback for external connectors""" try: from application.parser.connectors.connector_creator import ConnectorCreator from flask import request, redirect authorization_code = request.args.get('code') state = request.args.get('state') error = request.args.get('error') state_dict = 
json.loads(base64.urlsafe_b64decode(state.encode()).decode()) provider = state_dict.get("provider") state_object_id = state_dict.get("object_id") # Validate provider if not provider or not isinstance(provider, str) or not ConnectorCreator.is_supported(provider): return redirect(build_callback_redirect({ "status": "error", "message": "Invalid provider" })) if error: if error == "access_denied": return redirect(build_callback_redirect({ "status": "cancelled", "message": "Authentication was cancelled. You can try again if you'd like to connect your account.", "provider": provider })) else: current_app.logger.warning(f"OAuth error in callback: {error}") return redirect(build_callback_redirect({ "status": "error", "message": "Authentication failed. Please try again and make sure to grant all requested permissions.", "provider": provider })) if not authorization_code: return redirect(build_callback_redirect({ "status": "error", "message": "Authentication failed. Please try again and make sure to grant all requested permissions.", "provider": provider })) try: auth = ConnectorCreator.create_auth(provider) token_info = auth.exchange_code_for_tokens(authorization_code) session_token = str(uuid.uuid4()) try: if provider == "google_drive": credentials = auth.create_credentials_from_token_info(token_info) service = auth.build_drive_service(credentials) user_info = service.about().get(fields="user").execute() user_email = user_info.get('user', {}).get('emailAddress', 'Connected User') else: user_email = token_info.get('user_info', {}).get('email', 'Connected User') except Exception as e: current_app.logger.warning(f"Could not get user info: {e}") user_email = 'Connected User' sanitized_token_info = auth.sanitize_token_info(token_info) sessions_collection.find_one_and_update( {"_id": ObjectId(state_object_id), "provider": provider}, { "$set": { "session_token": session_token, "token_info": sanitized_token_info, "user_email": user_email, "status": "authorized" } } ) # Redirect to 
success page with session token and user email return redirect(build_callback_redirect({ "status": "success", "message": "Authentication successful", "provider": provider, "session_token": session_token, "user_email": user_email })) except Exception as e: current_app.logger.error(f"Error exchanging code for tokens: {str(e)}", exc_info=True) return redirect(build_callback_redirect({ "status": "error", "message": "Authentication failed. Please try again and make sure to grant all requested permissions.", "provider": provider })) except Exception as e: current_app.logger.error(f"Error handling connector callback: {e}") return redirect(build_callback_redirect({ "status": "error", "message": "Authentication failed. Please try again and make sure to grant all requested permissions." })) @connectors_ns.route("/api/connectors/files") class ConnectorFiles(Resource): @api.expect(api.model("ConnectorFilesModel", { "provider": fields.String(required=True), "session_token": fields.String(required=True), "folder_id": fields.String(required=False), "limit": fields.Integer(required=False), "page_token": fields.String(required=False), "search_query": fields.String(required=False), })) @api.doc(description="List files from a connector provider (supports pagination and search)") def post(self): try: data = request.get_json() provider = data.get('provider') session_token = data.get('session_token') limit = data.get('limit', 10) if not provider or not session_token: return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False, "error": "Unauthorized"}), 401) user = decoded_token.get('sub') session = sessions_collection.find_one({"session_token": session_token, "user": user}) if not session: return make_response(jsonify({"success": False, "error": "Invalid or unauthorized session"}), 401) loader = 
ConnectorCreator.create_connector(provider, session_token) generic_keys = {'provider', 'session_token'} input_config = { k: v for k, v in data.items() if k not in generic_keys } input_config['list_only'] = True documents = loader.load_data(input_config) files = [] for doc in documents[:limit]: metadata = doc.extra_info modified_time = metadata.get('modified_time') if modified_time: date_part = modified_time.split('T')[0] time_part = modified_time.split('T')[1].split('.')[0].split('Z')[0] formatted_time = f"{date_part} {time_part}" else: formatted_time = None files.append({ 'id': doc.doc_id, 'name': metadata.get('file_name', 'Unknown File'), 'type': metadata.get('mime_type', 'unknown'), 'size': metadata.get('size', None), 'modifiedTime': formatted_time, 'isFolder': metadata.get('is_folder', False) }) next_token = getattr(loader, 'next_page_token', None) has_more = bool(next_token) return make_response(jsonify({ "success": True, "files": files, "total": len(files), "next_page_token": next_token, "has_more": has_more }), 200) except Exception as e: current_app.logger.error(f"Error loading connector files: {e}", exc_info=True) return make_response(jsonify({"success": False, "error": "Failed to load files"}), 500) @connectors_ns.route("/api/connectors/validate-session") class ConnectorValidateSession(Resource): @api.expect(api.model("ConnectorValidateSessionModel", {"provider": fields.String(required=True), "session_token": fields.String(required=True)})) @api.doc(description="Validate connector session token and return user info and access token") def post(self): try: data = request.get_json() provider = data.get('provider') session_token = data.get('session_token') if not provider or not session_token: return make_response(jsonify({"success": False, "error": "provider and session_token are required"}), 400) decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False, "error": "Unauthorized"}), 401) user = 
decoded_token.get('sub') session = sessions_collection.find_one({"session_token": session_token, "user": user}) if not session or "token_info" not in session: return make_response(jsonify({"success": False, "error": "Invalid or expired session"}), 401) token_info = session["token_info"] auth = ConnectorCreator.create_auth(provider) is_expired = auth.is_token_expired(token_info) if is_expired and token_info.get('refresh_token'): try: refreshed_token_info = auth.refresh_access_token(token_info.get('refresh_token')) sanitized_token_info = auth.sanitize_token_info(refreshed_token_info) sessions_collection.update_one( {"session_token": session_token}, {"$set": {"token_info": sanitized_token_info}} ) token_info = sanitized_token_info is_expired = False except Exception as refresh_error: current_app.logger.error(f"Failed to refresh token: {refresh_error}") if is_expired: return make_response(jsonify({ "success": False, "expired": True, "error": "Session token has expired. Please reconnect." }), 401) _base_fields = {"access_token", "refresh_token", "token_uri", "expiry"} provider_extras = {k: v for k, v in token_info.items() if k not in _base_fields} response_data = { "success": True, "expired": False, "user_email": session.get('user_email', 'Connected User'), "access_token": token_info.get('access_token'), **provider_extras, } return make_response(jsonify(response_data), 200) except Exception as e: current_app.logger.error(f"Error validating connector session: {e}", exc_info=True) return make_response(jsonify({"success": False, "error": "Failed to validate session"}), 500) @connectors_ns.route("/api/connectors/disconnect") class ConnectorDisconnect(Resource): @api.expect(api.model("ConnectorDisconnectModel", {"provider": fields.String(required=True), "session_token": fields.String(required=False)})) @api.doc(description="Disconnect a connector session") def post(self): try: data = request.get_json() provider = data.get('provider') session_token = data.get('session_token') 
if not provider: return make_response(jsonify({"success": False, "error": "provider is required"}), 400) if session_token: sessions_collection.delete_one({"session_token": session_token}) return make_response(jsonify({"success": True}), 200) except Exception as e: current_app.logger.error(f"Error disconnecting connector session: {e}", exc_info=True) return make_response(jsonify({"success": False, "error": "Failed to disconnect session"}), 500) @connectors_ns.route("/api/connectors/sync") class ConnectorSync(Resource): @api.expect( api.model( "ConnectorSyncModel", { "source_id": fields.String(required=True, description="Source ID to sync"), "session_token": fields.String(required=True, description="Authentication token") }, ) ) @api.doc(description="Sync connector source to check for modifications") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) try: data = request.get_json() source_id = data.get('source_id') session_token = data.get('session_token') if not all([source_id, session_token]): return make_response( jsonify({ "success": False, "error": "source_id and session_token are required" }), 400 ) source = sources_collection.find_one({"_id": ObjectId(source_id)}) if not source: return make_response( jsonify({ "success": False, "error": "Source not found" }), 404 ) if source.get('user') != decoded_token.get('sub'): return make_response( jsonify({ "success": False, "error": "Unauthorized access to source" }), 403 ) remote_data = {} try: if source.get('remote_data'): remote_data = json.loads(source.get('remote_data')) except json.JSONDecodeError: current_app.logger.error(f"Invalid remote_data format for source {source_id}") remote_data = {} source_type = remote_data.get('provider') if not source_type: return make_response( jsonify({ "success": False, "error": "Source provider not found in remote_data" }), 400 ) # Extract configuration from remote_data file_ids = 
remote_data.get('file_ids', []) folder_ids = remote_data.get('folder_ids', []) recursive = remote_data.get('recursive', True) # Start the sync task task = ingest_connector_task.delay( job_name=source.get('name'), user=decoded_token.get('sub'), source_type=source_type, session_token=session_token, file_ids=file_ids, folder_ids=folder_ids, recursive=recursive, retriever=source.get('retriever', 'classic'), operation_mode="sync", doc_id=source_id, sync_frequency=source.get('sync_frequency', 'never') ) return make_response( jsonify({ "success": True, "task_id": task.id }), 200 ) except Exception as err: current_app.logger.error( f"Error syncing connector source: {err}", exc_info=True ) return make_response( jsonify({ "success": False, "error": "Failed to sync connector source" }), 400 ) @connectors_ns.route("/api/connectors/callback-status") class ConnectorCallbackStatus(Resource): @api.doc(description="Return HTML page with connector authentication status") def get(self): """Return HTML page with connector authentication status""" try: # Validate and sanitize status to a known value status_raw = request.args.get('status', 'error') status = status_raw if status_raw in ('success', 'error', 'cancelled') else 'error' # Escape all user-controlled values for HTML context message = html.escape(request.args.get('message', '')) provider_raw = request.args.get('provider', 'connector') provider = html.escape(provider_raw.replace('_', ' ').title()) session_token = request.args.get('session_token', '') user_email = html.escape(request.args.get('user_email', '')) def safe_js_string(value: str) -> str: """Safely encode a string for embedding in inline JavaScript.""" js_encoded = json.dumps(value) return js_encoded.replace(' {provider} Authentication

{provider} Authentication

{message}

{f'

Connected as: {user_email}

' if status == 'success' else ''}

You can close this window. {f"Your {provider} is now connected and ready to use." if status == 'success' else "Feel free to close this window."}

@internal.before_request
def verify_internal_key():
    """Verify INTERNAL_KEY for all internal endpoint requests.

    When ``settings.INTERNAL_KEY`` is set, the ``X-Internal-Key`` request
    header must match it; otherwise a 401 JSON response is returned.
    When no key is configured the check is skipped (returns ``None``).
    """
    import hmac

    if not settings.INTERNAL_KEY:
        return None

    internal_key = request.headers.get("X-Internal-Key")
    # compare_digest avoids leaking how many leading characters matched via
    # response timing; bytes are used because it rejects non-ASCII str input.
    key_matches = bool(internal_key) and hmac.compare_digest(
        internal_key.encode("utf-8"),
        str(settings.INTERNAL_KEY).encode("utf-8"),
    )
    if not key_matches:
        logger.warning(f"Unauthorized internal API access attempt from {request.remote_addr}")
        return jsonify({"error": "Unauthorized", "message": "Invalid or missing internal key"}), 401
    return None
@internal.route("/api/upload_index", methods=["POST"])
def upload_index_files():
    """Upload two files(index.faiss, index.pkl) to the user's folder.

    Reads source metadata from the multipart form, stores the FAISS index
    files (only when ``settings.VECTOR_STORE == "faiss"``) under
    ``indexes/<id>/`` and upserts the matching document in ``sources``.

    Returns:
        ``{"status": "ok"}`` on success, or a ``{"status": ...}`` dict naming
        the missing/invalid part of the request.
    """
    if "user" not in request.form:
        return {"status": "no user"}
    user = request.form["user"]
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = request.form["name"]
    tokens = request.form["tokens"]
    retriever = request.form["retriever"]
    source_id = request.form["id"]
    source_type = request.form["type"]
    remote_data = request.form.get("remote_data")
    sync_frequency = request.form.get("sync_frequency")

    # The id is interpolated into a storage path and converted to an ObjectId;
    # validate it before anything is written.
    if not ObjectId.is_valid(source_id):
        logger.error("Invalid source id in upload_index request")
        return {"status": "invalid id"}, 400
    source_oid = ObjectId(source_id)

    file_path = request.form.get("file_path")
    directory_structure = request.form.get("directory_structure")
    file_name_map = request.form.get("file_name_map")

    if directory_structure:
        try:
            directory_structure = json.loads(directory_structure)
        except Exception:
            logger.error("Error parsing directory_structure")
            directory_structure = {}
    else:
        directory_structure = {}

    if file_name_map:
        try:
            file_name_map = json.loads(file_name_map)
        except Exception:
            logger.error("Error parsing file_name_map")
            file_name_map = None
    else:
        file_name_map = None

    storage = StorageCreator.get_storage()
    index_base_path = f"indexes/{source_id}"

    if settings.VECTOR_STORE == "faiss":
        if "file_faiss" not in request.files:
            logger.error("No file_faiss part")
            return {"status": "no file"}
        file_faiss = request.files["file_faiss"]
        if file_faiss.filename == "":
            return {"status": "no file name"}
        if "file_pkl" not in request.files:
            logger.error("No file_pkl part")
            return {"status": "no file"}
        file_pkl = request.files["file_pkl"]
        if file_pkl.filename == "":
            return {"status": "no file name"}

        # Save index files to storage
        faiss_storage_path = f"{index_base_path}/index.faiss"
        pkl_storage_path = f"{index_base_path}/index.pkl"

        storage.save_file(file_faiss, faiss_storage_path)
        storage.save_file(file_pkl, pkl_storage_path)

    source_fields = {
        "user": user,
        "name": job_name,
        "language": job_name,
        "date": datetime.datetime.now(),
        "model": settings.EMBEDDINGS_NAME,
        "type": source_type,
        "tokens": tokens,
        "retriever": retriever,
        "remote_data": remote_data,
        "sync_frequency": sync_frequency,
        "file_path": file_path,
        "directory_structure": directory_structure,
    }
    # file_name_map is only written when provided so an existing map is kept.
    if file_name_map is not None:
        source_fields["file_name_map"] = file_name_map

    # One atomic upsert replaces the find-then-update/insert pair: an existing
    # document gets the fields $set, a missing one is created with this _id.
    sources_collection.update_one(
        {"_id": source_oid},
        {"$set": source_fields},
        upsert=True,
    )
    return {"status": "ok"}
""" import datetime from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, request from flask_restx import Namespace, Resource, fields from application.api import api from application.api.user.base import ( agent_folders_collection, agents_collection, ) agents_folders_ns = Namespace( "agents_folders", description="Agent folder management", path="/api/agents/folders" ) def _folder_error_response(message: str, err: Exception): current_app.logger.error(f"{message}: {err}", exc_info=True) return make_response(jsonify({"success": False, "message": message}), 400) @agents_folders_ns.route("/") class AgentFolders(Resource): @api.doc(description="Get all folders for the user") def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") try: folders = list(agent_folders_collection.find({"user": user})) result = [ { "id": str(f["_id"]), "name": f["name"], "parent_id": f.get("parent_id"), "created_at": f.get("created_at", "").isoformat() if f.get("created_at") else None, "updated_at": f.get("updated_at", "").isoformat() if f.get("updated_at") else None, } for f in folders ] return make_response(jsonify({"folders": result}), 200) except Exception as err: return _folder_error_response("Failed to fetch folders", err) @api.doc(description="Create a new folder") @api.expect( api.model( "CreateFolder", { "name": fields.String(required=True, description="Folder name"), "parent_id": fields.String(required=False, description="Parent folder ID"), }, ) ) def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() if not data or not data.get("name"): return make_response(jsonify({"success": False, "message": "Folder name is required"}), 400) parent_id = data.get("parent_id") if parent_id: parent = 
# NOTE(review): the URL parameter is restored from the handler signatures
# (every method takes ``folder_id``) — confirm against the original route.
@agents_folders_ns.route("/<string:folder_id>")
class AgentFolder(Resource):
    """Read, update and delete a single agent folder owned by the caller."""

    @staticmethod
    def _would_create_cycle(user, folder_id, new_parent_id):
        """Return True if ``folder_id`` is ``new_parent_id`` or one of its ancestors.

        Walks the ``parent_id`` chain upwards from ``new_parent_id``; the
        ``visited`` set stops the walk if stored data already contains a loop.
        """
        visited = set()
        current = new_parent_id
        while current and current not in visited:
            if current == folder_id:
                return True
            visited.add(current)
            if not ObjectId.is_valid(current):
                return False
            node = agent_folders_collection.find_one(
                {"_id": ObjectId(current), "user": user}
            )
            current = node.get("parent_id") if node else None
        return False

    @api.doc(description="Get a specific folder with its agents")
    def get(self, folder_id):
        """Return the folder with the agents and direct subfolders it contains."""
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        if not ObjectId.is_valid(folder_id):
            return make_response(jsonify({"success": False, "message": "Invalid folder ID format"}), 400)
        try:
            folder = agent_folders_collection.find_one({"_id": ObjectId(folder_id), "user": user})
            if not folder:
                return make_response(jsonify({"success": False, "message": "Folder not found"}), 404)

            agents = list(agents_collection.find({"user": user, "folder_id": folder_id}))
            agents_list = [
                {"id": str(a["_id"]), "name": a["name"], "description": a.get("description", "")}
                for a in agents
            ]
            subfolders = list(agent_folders_collection.find({"user": user, "parent_id": folder_id}))
            subfolders_list = [{"id": str(sf["_id"]), "name": sf["name"]} for sf in subfolders]

            return make_response(
                jsonify({
                    "id": str(folder["_id"]),
                    "name": folder["name"],
                    "parent_id": folder.get("parent_id"),
                    "agents": agents_list,
                    "subfolders": subfolders_list,
                }),
                200,
            )
        except Exception as err:
            return _folder_error_response("Failed to fetch folder", err)

    @api.doc(description="Update a folder")
    def put(self, folder_id):
        """Rename the folder and/or move it under another folder.

        A non-empty ``parent_id`` must be a folder owned by the caller that is
        neither this folder nor one of its descendants; a null/empty
        ``parent_id`` moves the folder to the top level.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()
        if not data:
            return make_response(jsonify({"success": False, "message": "No data provided"}), 400)
        if not ObjectId.is_valid(folder_id):
            return make_response(jsonify({"success": False, "message": "Invalid folder ID format"}), 400)

        try:
            update_fields = {"updated_at": datetime.datetime.now(datetime.timezone.utc)}
            if "name" in data:
                # Creation requires a non-empty name; keep updates consistent.
                if not data["name"]:
                    return make_response(jsonify({"success": False, "message": "Folder name is required"}), 400)
                update_fields["name"] = data["name"]
            if "parent_id" in data:
                parent_id = data["parent_id"]
                if parent_id == folder_id:
                    return make_response(jsonify({"success": False, "message": "Cannot set folder as its own parent"}), 400)
                if parent_id:
                    if not ObjectId.is_valid(parent_id):
                        return make_response(jsonify({"success": False, "message": "Invalid parent folder ID format"}), 400)
                    parent = agent_folders_collection.find_one({"_id": ObjectId(parent_id), "user": user})
                    if not parent:
                        return make_response(jsonify({"success": False, "message": "Parent folder not found"}), 404)
                    if self._would_create_cycle(user, folder_id, parent_id):
                        return make_response(jsonify({"success": False, "message": "Cannot move folder into its own subfolder"}), 400)
                update_fields["parent_id"] = parent_id

            result = agent_folders_collection.update_one(
                {"_id": ObjectId(folder_id), "user": user}, {"$set": update_fields}
            )
            if result.matched_count == 0:
                return make_response(jsonify({"success": False, "message": "Folder not found"}), 404)
            return make_response(jsonify({"success": True}), 200)
        except Exception as err:
            return _folder_error_response("Failed to update folder", err)

    @api.doc(description="Delete a folder")
    def delete(self, folder_id):
        """Delete the folder after detaching its agents and direct subfolders."""
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        if not ObjectId.is_valid(folder_id):
            return make_response(jsonify({"success": False, "message": "Invalid folder ID format"}), 400)
        try:
            # Detach children first so nothing is left pointing at a folder
            # that no longer exists if the delete itself succeeds.
            agents_collection.update_many(
                {"user": user, "folder_id": folder_id}, {"$unset": {"folder_id": ""}}
            )
            agent_folders_collection.update_many(
                {"user": user, "parent_id": folder_id}, {"$unset": {"parent_id": ""}}
            )
            result = agent_folders_collection.delete_one({"_id": ObjectId(folder_id), "user": user})
            if result.deleted_count == 0:
                return make_response(jsonify({"success": False, "message": "Folder not found"}), 404)
            return make_response(jsonify({"success": True}), 200)
        except Exception as err:
            return _folder_error_response("Failed to delete folder", err)
@agents_folders_ns.route("/bulk_move")
class BulkMoveAgents(Resource):
    """Move several of the caller's agents into (or out of) a folder at once."""

    @api.doc(description="Move multiple agents to a folder")
    @api.expect(
        api.model(
            "BulkMoveAgents",
            {
                "agent_ids": fields.List(fields.String, required=True, description="List of agent IDs"),
                "folder_id": fields.String(required=False, description="Target folder ID"),
            },
        )
    )
    def post(self):
        """Set or clear ``folder_id`` on every listed agent owned by the caller.

        With a ``folder_id`` the agents are moved into that folder (which must
        belong to the caller); without one the ``folder_id`` field is removed.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()
        if not data or not data.get("agent_ids"):
            return make_response(jsonify({"success": False, "message": "Agent IDs are required"}), 400)

        agent_ids = data["agent_ids"]
        folder_id = data.get("folder_id")

        # A bare string would otherwise be iterated character by character,
        # and a malformed id would only surface as a generic logged failure.
        if not isinstance(agent_ids, list) or not all(
            isinstance(aid, str) and ObjectId.is_valid(aid) for aid in agent_ids
        ):
            return make_response(jsonify({"success": False, "message": "Invalid agent ID format"}), 400)
        if folder_id and not ObjectId.is_valid(folder_id):
            return make_response(jsonify({"success": False, "message": "Invalid folder ID format"}), 400)

        try:
            if folder_id:
                folder = agent_folders_collection.find_one({"_id": ObjectId(folder_id), "user": user})
                if not folder:
                    return make_response(jsonify({"success": False, "message": "Folder not found"}), 404)
                change = {"$set": {"folder_id": folder_id}}
            else:
                change = {"$unset": {"folder_id": ""}}

            object_ids = [ObjectId(aid) for aid in agent_ids]
            # The user filter keeps the update scoped to the caller's own agents.
            agents_collection.update_many(
                {"_id": {"$in": object_ids}, "user": user},
                change,
            )
            return make_response(jsonify({"success": True}), 200)
        except Exception as err:
            return _folder_error_response("Failed to move agents", err)
def _extract_workflow_id(mapping):
    """Pick the workflow id out of a dict, normalized to a stripped string."""
    ref = mapping.get("id") or mapping.get("_id") or mapping.get("workflow_id")
    if not ref:
        return ref
    return str(ref).strip()


def normalize_workflow_reference(workflow_value):
    """Normalize workflow references from form/json payloads.

    Accepts ``None``, a dict carrying ``id`` / ``_id`` / ``workflow_id``, a
    plain id string, or a JSON-encoded string/dict/null (as produced by form
    submissions).

    Returns:
        The workflow id as a whitespace-stripped string, ``""`` for a blank
        string, or ``None`` when no reference is present.
    """
    if workflow_value is None:
        return None
    if isinstance(workflow_value, dict):
        return _extract_workflow_id(workflow_value)
    if not isinstance(workflow_value, str):
        return str(workflow_value)

    value = workflow_value.strip()
    if not value:
        return ""
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        # Not JSON at all: treat the stripped text itself as the id.
        return value
    if parsed is None:
        # A form-encoded "null" means "no workflow", not an id named "null".
        return None
    if isinstance(parsed, str):
        return parsed.strip()
    if isinstance(parsed, dict):
        return _extract_workflow_id(parsed)
    # Numbers, lists, booleans: fall back to the raw stripped text.
    return value
@agents_ns.route("/get_agent")
class GetAgent(Resource):
    """Return a single agent owned by the caller."""

    @staticmethod
    def _source_id(source_ref):
        """Map a stored source reference to the value exposed by the API.

        Returns ``"default"`` for the default marker, the referenced source's
        id as a string for a resolvable DBRef, and ``None`` otherwise. Each
        DBRef is dereferenced exactly once.
        """
        if source_ref == "default":
            return "default"
        if isinstance(source_ref, DBRef):
            source_doc = db.dereference(source_ref)
            if source_doc:
                return str(source_doc["_id"])
        return None

    @api.doc(params={"id": "Agent ID"}, description="Get agent by ID")
    def get(self):
        """Look up the agent given by the ``id`` query parameter.

        Returns:
            200 with the agent's fields (API key masked to its first and last
            four characters); 401 without a decoded token; 400 for a missing or
            malformed id or on an unexpected error; 404 when no agent with that
            id belongs to the caller.
        """
        if not (decoded_token := request.decoded_token):
            return {"success": False}, 401
        if not (agent_id := request.args.get("id")):
            return {"success": False, "message": "ID required"}, 400
        if not ObjectId.is_valid(agent_id):
            return {"success": False, "message": "Invalid agent ID format"}, 400
        try:
            agent = agents_collection.find_one(
                {"_id": ObjectId(agent_id), "user": decoded_token["sub"]}
            )
            if not agent:
                return {"status": "Not found"}, 404

            # "default" is kept for the legacy single source as well, matching
            # what the agent list endpoint reports for the same document.
            source_id = self._source_id(agent.get("source")) or ""
            source_ids = [
                resolved
                for source_ref in agent.get("sources", [])
                if (resolved := self._source_id(source_ref))
            ]
            tools = agent.get("tools", [])

            data = {
                "id": str(agent["_id"]),
                "name": agent["name"],
                "description": agent.get("description", ""),
                "image": (
                    generate_image_url(agent["image"]) if agent.get("image") else ""
                ),
                "source": source_id,
                "sources": source_ids,
                "chunks": agent.get("chunks", "2"),
                "retriever": agent.get("retriever", ""),
                "prompt_id": agent.get("prompt_id", ""),
                "tools": tools,
                "tool_details": resolve_tool_details(tools),
                "agent_type": agent.get("agent_type", ""),
                "status": agent.get("status", ""),
                "json_schema": agent.get("json_schema"),
                "limited_token_mode": agent.get("limited_token_mode", False),
                "token_limit": agent.get(
                    "token_limit", settings.DEFAULT_AGENT_LIMITS["token_limit"]
                ),
                "limited_request_mode": agent.get("limited_request_mode", False),
                "request_limit": agent.get(
                    "request_limit", settings.DEFAULT_AGENT_LIMITS["request_limit"]
                ),
                "created_at": agent.get("createdAt", ""),
                "updated_at": agent.get("updatedAt", ""),
                "last_used_at": agent.get("lastUsedAt", ""),
                "key": (
                    f"{agent['key'][:4]}...{agent['key'][-4:]}"
                    if "key" in agent
                    else ""
                ),
                "pinned": agent.get("pinned", False),
                "shared": agent.get("shared_publicly", False),
                "shared_metadata": agent.get("shared_metadata", {}),
                "shared_token": agent.get("shared_token", ""),
                "models": agent.get("models", []),
                "default_model_id": agent.get("default_model_id", ""),
                "folder_id": agent.get("folder_id"),
                "workflow": agent.get("workflow"),
            }
            return make_response(jsonify(data), 200)
        except Exception as e:
            current_app.logger.error(f"Agent fetch error: {e}", exc_info=True)
            return {"success": False}, 400
@agents_ns.route("/create_agent")
class CreateAgent(Resource):
    """Create a draft or published agent for the caller."""

    create_agent_model = api.model(
        "CreateAgentModel",
        {
            "name": fields.String(required=True, description="Name of the agent"),
            "description": fields.String(
                required=True, description="Description of the agent"
            ),
            "image": fields.Raw(
                required=False, description="Image file upload", type="file"
            ),
            "source": fields.String(
                required=False, description="Source ID (legacy single source)"
            ),
            "sources": fields.List(
                fields.String,
                required=False,
                description="List of source identifiers for multiple sources",
            ),
            "chunks": fields.Integer(required=False, description="Chunks count"),
            "retriever": fields.String(required=False, description="Retriever ID"),
            "prompt_id": fields.String(required=False, description="Prompt ID"),
            "tools": fields.List(
                fields.String, required=False, description="List of tool identifiers"
            ),
            "agent_type": fields.String(
                required=False,
                description="Type of the agent (classic, react, workflow). Defaults to 'classic' for backwards compatibility.",
            ),
            "status": fields.String(
                required=True, description="Status of the agent (draft or published)"
            ),
            "workflow": fields.String(
                required=False, description="Workflow ID for workflow-type agents"
            ),
            "json_schema": fields.Raw(
                required=False,
                description="JSON schema for enforcing structured output format",
            ),
            "limited_token_mode": fields.Boolean(
                required=False, description="Whether the agent is in limited token mode"
            ),
            "token_limit": fields.Integer(
                required=False, description="Token limit for the agent in limited mode"
            ),
            "limited_request_mode": fields.Boolean(
                required=False,
                description="Whether the agent is in limited request mode",
            ),
            "request_limit": fields.Integer(
                required=False,
                description="Request limit for the agent in limited mode",
            ),
            "models": fields.List(
                fields.String,
                required=False,
                description="List of available model IDs for this agent",
            ),
            "default_model_id": fields.String(
                required=False, description="Default model ID for this agent"
            ),
            "folder_id": fields.String(
                required=False, description="Folder ID to organize the agent"
            ),
        },
    )

    @api.expect(create_agent_model)
    @api.doc(description="Create a new agent")
    def post(self):
        """Validate the payload (JSON or multipart form) and insert the agent.

        Returns:
            201 with the new agent ``id`` and its ``key`` (empty for drafts);
            401 without a decoded token; 400 for invalid input or a failed
            insert; 404 when the referenced workflow or folder is not found.
        """
        if not (decoded_token := request.decoded_token):
            return {"success": False}, 401
        user = decoded_token.get("sub")

        # Substring match so "application/json; charset=utf-8" is still JSON.
        if request.content_type and "application/json" in request.content_type:
            data = request.get_json(silent=True)
            if not isinstance(data, dict):
                return make_response(
                    jsonify({"success": False, "message": "Invalid request data"}), 400
                )
        else:
            data = request.form.to_dict()
            # Form fields arrive as strings; decode the JSON-encoded ones and
            # fall back to an empty value when a field is not valid JSON.
            json_field_fallbacks = (
                ("tools", []),
                ("sources", []),
                ("json_schema", None),
                ("models", []),
            )
            for field_name, fallback in json_field_fallbacks:
                if field_name in data:
                    try:
                        data[field_name] = json.loads(data[field_name])
                    except json.JSONDecodeError:
                        data[field_name] = fallback

        # Validate and normalize JSON schema if provided
        if "json_schema" in data:
            try:
                data["json_schema"] = normalize_json_schema_payload(
                    data.get("json_schema")
                )
            except JsonSchemaValidationError as exc:
                return make_response(
                    jsonify({"success": False, "message": f"JSON schema {exc}"}),
                    400,
                )
        if data.get("status") not in ["draft", "published"]:
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": "Status must be either 'draft' or 'published'",
                    }
                ),
                400,
            )
        agent_type = data.get("agent_type", "")
        # Default to classic schema for empty or unknown agent types
        if not agent_type or agent_type not in AGENT_TYPE_SCHEMAS:
            schema = AGENT_TYPE_SCHEMAS["classic"]
            # Set agent_type to classic if it was empty
            if not agent_type:
                agent_type = "classic"
        else:
            schema = AGENT_TYPE_SCHEMAS[agent_type]
        is_published = data.get("status") == "published"
        if agent_type == "workflow":
            workflow_id, workflow_error = validate_workflow_access(
                data.get("workflow"), user, required=is_published
            )
            if workflow_error:
                return workflow_error
            data["workflow"] = workflow_id
        if is_published:
            required_fields = schema["required_published"]
            validate_fields = schema["validate_published"]
            if (
                schema.get("require_source")
                and not data.get("source")
                and not data.get("sources")
            ):
                return make_response(
                    jsonify(
                        {
                            "success": False,
                            "message": "Either 'source' or 'sources' field is required for published agents",
                        }
                    ),
                    400,
                )
        else:
            required_fields = schema["required_draft"]
            validate_fields = schema["validate_draft"]
        missing_fields = check_required_fields(data, required_fields)
        invalid_fields = validate_required_fields(data, validate_fields)
        if missing_fields:
            return missing_fields
        if invalid_fields:
            return invalid_fields
        image_url, error = handle_image_upload(request, "", user, storage)
        if error:
            return make_response(
                jsonify({"success": False, "message": "Image upload failed"}), 400
            )
        folder_id = data.get("folder_id")
        if folder_id:
            if not ObjectId.is_valid(folder_id):
                return make_response(
                    jsonify({"success": False, "message": "Invalid folder ID format"}),
                    400,
                )
            folder = agent_folders_collection.find_one(
                {"_id": ObjectId(folder_id), "user": user}
            )
            if not folder:
                return make_response(
                    jsonify({"success": False, "message": "Folder not found"}), 404
                )
        try:
            # Only published agents get an API key.
            key = str(uuid.uuid4()) if is_published else ""

            sources_list = []
            source_field = ""
            if data.get("sources") and len(data.get("sources", [])) > 0:
                for source_id in data.get("sources", []):
                    if source_id == "default":
                        sources_list.append("default")
                    elif ObjectId.is_valid(source_id):
                        sources_list.append(DBRef("sources", ObjectId(source_id)))
            else:
                source_value = data.get("source", "")
                if source_value == "default":
                    source_field = "default"
                elif ObjectId.is_valid(source_value):
                    source_field = DBRef("sources", ObjectId(source_value))
            new_agent = build_agent_document(
                data, user, key, agent_type, image_url, source_field, sources_list
            )
            if agent_type != "workflow":
                if new_agent.get("chunks") == "":
                    new_agent["chunks"] = "2"
                if (
                    new_agent.get("source") == ""
                    and new_agent.get("retriever") == ""
                    and not new_agent.get("sources")
                ):
                    new_agent["retriever"] = "classic"
            resp = agents_collection.insert_one(new_agent)
            new_id = str(resp.inserted_id)
        except Exception as err:
            current_app.logger.error(f"Error creating agent: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"id": new_id, "key": key}), 201)
sources", ), "chunks": fields.Integer(required=False, description="Chunks count"), "retriever": fields.String(required=False, description="Retriever ID"), "prompt_id": fields.String(required=False, description="Prompt ID"), "tools": fields.List( fields.String, required=False, description="List of tool identifiers" ), "agent_type": fields.String( required=False, description="Type of the agent (classic, react, workflow). Defaults to 'classic' for backwards compatibility.", ), "status": fields.String( required=True, description="Status of the agent (draft or published)" ), "workflow": fields.String( required=False, description="Workflow ID for workflow-type agents" ), "json_schema": fields.Raw( required=False, description="JSON schema for enforcing structured output format", ), "limited_token_mode": fields.Boolean( required=False, description="Whether the agent is in limited token mode" ), "token_limit": fields.Integer( required=False, description="Token limit for the agent in limited mode" ), "limited_request_mode": fields.Boolean( require=False, description="Whether the agent is in limited request mode", ), "request_limit": fields.Integer( required=False, description="Request limit for the agent in limited mode", ), "models": fields.List( fields.String, required=False, description="List of available model IDs for this agent", ), "default_model_id": fields.String( required=False, description="Default model ID for this agent" ), "folder_id": fields.String( required=False, description="Folder ID to organize the agent" ), }, ) @api.expect(update_agent_model) @api.doc(description="Update an existing agent") def put(self, agent_id): if not (decoded_token := request.decoded_token): return make_response( jsonify({"success": False, "message": "Unauthorized"}), 401 ) user = decoded_token.get("sub") if not ObjectId.is_valid(agent_id): return make_response( jsonify({"success": False, "message": "Invalid agent ID format"}), 400 ) oid = ObjectId(agent_id) try: if 
request.content_type and "application/json" in request.content_type: data = request.get_json() else: data = request.form.to_dict() json_fields = ["tools", "sources", "json_schema", "models"] for field in json_fields: if field in data and data[field]: try: data[field] = json.loads(data[field]) except json.JSONDecodeError: return make_response( jsonify( { "success": False, "message": f"Invalid JSON format for field: {field}", } ), 400, ) if data.get("json_schema") == "": data["json_schema"] = None except Exception as err: current_app.logger.error( f"Error parsing request data: {err}", exc_info=True ) return make_response( jsonify({"success": False, "message": "Invalid request data"}), 400 ) try: existing_agent = agents_collection.find_one({"_id": oid, "user": user}) except Exception as err: current_app.logger.error( f"Error finding agent {agent_id}: {err}", exc_info=True ) return make_response( jsonify({"success": False, "message": "Database error finding agent"}), 500, ) if not existing_agent: return make_response( jsonify( {"success": False, "message": "Agent not found or not authorized"} ), 404, ) image_url, error = handle_image_upload( request, existing_agent.get("image", ""), user, storage ) if error: return error update_fields = {} allowed_fields = [ "name", "description", "image", "source", "sources", "chunks", "retriever", "prompt_id", "tools", "agent_type", "status", "json_schema", "limited_token_mode", "token_limit", "limited_request_mode", "request_limit", "models", "default_model_id", "folder_id", "workflow", ] for field in allowed_fields: if field not in data: continue if field == "status": new_status = data.get("status") if new_status not in ["draft", "published"]: return make_response( jsonify( { "success": False, "message": "Invalid status value. 
Must be 'draft' or 'published'", } ), 400, ) update_fields[field] = new_status elif field == "source": source_id = data.get("source") if source_id == "default": update_fields[field] = "default" elif source_id and ObjectId.is_valid(source_id): update_fields[field] = DBRef("sources", ObjectId(source_id)) elif source_id: return make_response( jsonify( { "success": False, "message": f"Invalid source ID format: {source_id}", } ), 400, ) else: update_fields[field] = "" elif field == "sources": sources_list = data.get("sources", []) if sources_list and isinstance(sources_list, list): valid_sources = [] for source_id in sources_list: if source_id == "default": valid_sources.append("default") elif ObjectId.is_valid(source_id): valid_sources.append(DBRef("sources", ObjectId(source_id))) else: return make_response( jsonify( { "success": False, "message": f"Invalid source ID in list: {source_id}", } ), 400, ) update_fields[field] = valid_sources else: update_fields[field] = [] elif field == "chunks": chunks_value = data.get("chunks") if chunks_value == "" or chunks_value is None: update_fields[field] = "2" else: try: chunks_int = int(chunks_value) if chunks_int < 0: return make_response( jsonify( { "success": False, "message": "Chunks value must be a non-negative integer", } ), 400, ) update_fields[field] = str(chunks_int) except (ValueError, TypeError): return make_response( jsonify( { "success": False, "message": f"Invalid chunks value: {chunks_value}", } ), 400, ) elif field == "tools": tools_list = data.get("tools", []) if isinstance(tools_list, list): update_fields[field] = tools_list else: return make_response( jsonify( { "success": False, "message": "Tools must be a list", } ), 400, ) elif field == "json_schema": json_schema = data.get("json_schema") if json_schema is not None: try: update_fields[field] = normalize_json_schema_payload( json_schema ) except JsonSchemaValidationError as exc: return make_response( jsonify({"success": False, "message": f"JSON schema 
{exc}"}), 400, ) else: update_fields[field] = None elif field == "limited_token_mode": raw_value = data.get("limited_token_mode", False) bool_value = ( raw_value == "True" if isinstance(raw_value, str) else bool(raw_value) ) update_fields[field] = bool_value if bool_value and data.get("token_limit") is None: return make_response( jsonify( { "success": False, "message": "Token limit must be provided when limited token mode is enabled", } ), 400, ) elif field == "limited_request_mode": raw_value = data.get("limited_request_mode", False) bool_value = ( raw_value == "True" if isinstance(raw_value, str) else bool(raw_value) ) update_fields[field] = bool_value if bool_value and data.get("request_limit") is None: return make_response( jsonify( { "success": False, "message": "Request limit must be provided when limited request mode is enabled", } ), 400, ) elif field == "token_limit": token_limit = data.get("token_limit") update_fields[field] = int(token_limit) if token_limit else 0 # Validate consistency with mode if update_fields[field] > 0 and not data.get("limited_token_mode"): return make_response( jsonify( { "success": False, "message": "Token limit cannot be set when limited token mode is disabled", } ), 400, ) elif field == "request_limit": request_limit = data.get("request_limit") update_fields[field] = int(request_limit) if request_limit else 0 if update_fields[field] > 0 and not data.get("limited_request_mode"): return make_response( jsonify( { "success": False, "message": "Request limit cannot be set when limited request mode is disabled", } ), 400, ) elif field == "folder_id": folder_id = data.get("folder_id") if folder_id: if not ObjectId.is_valid(folder_id): return make_response( jsonify( { "success": False, "message": "Invalid folder ID format", } ), 400, ) folder = agent_folders_collection.find_one( {"_id": ObjectId(folder_id), "user": user} ) if not folder: return make_response( jsonify({"success": False, "message": "Folder not found"}), 404, ) 
update_fields[field] = folder_id else: update_fields[field] = None elif field == "workflow": workflow_required = ( data.get("status", existing_agent.get("status")) == "published" and data.get("agent_type", existing_agent.get("agent_type")) == "workflow" ) workflow_id, workflow_error = validate_workflow_access( data.get("workflow"), user, required=workflow_required ) if workflow_error: return workflow_error update_fields[field] = workflow_id else: value = data[field] if field in ["name", "description", "prompt_id", "agent_type"]: if not value or not str(value).strip(): return make_response( jsonify( { "success": False, "message": f"Field '{field}' cannot be empty", } ), 400, ) update_fields[field] = value if image_url: update_fields["image"] = image_url if not update_fields: return make_response( jsonify( { "success": False, "message": "No valid update data provided", } ), 400, ) newly_generated_key = None final_status = update_fields.get("status", existing_agent.get("status")) agent_type = update_fields.get("agent_type", existing_agent.get("agent_type")) if final_status == "published": if agent_type == "workflow": required_published_fields = { "name": "Agent name", } missing_published_fields = [] for req_field, field_label in required_published_fields.items(): final_value = update_fields.get( req_field, existing_agent.get(req_field) ) if not final_value: missing_published_fields.append(field_label) workflow_id = update_fields.get("workflow", existing_agent.get("workflow")) if not workflow_id: missing_published_fields.append("Workflow") elif not ObjectId.is_valid(workflow_id): missing_published_fields.append("Valid workflow") else: workflow = workflows_collection.find_one( {"_id": ObjectId(workflow_id), "user": user} ) if not workflow: missing_published_fields.append("Workflow access") if missing_published_fields: return make_response( jsonify( { "success": False, "message": f"Cannot publish workflow agent. 
Missing required fields: {', '.join(missing_published_fields)}", } ), 400, ) else: required_published_fields = { "name": "Agent name", "description": "Agent description", "chunks": "Chunks count", "prompt_id": "Prompt", "agent_type": "Agent type", } missing_published_fields = [] for req_field, field_label in required_published_fields.items(): final_value = update_fields.get( req_field, existing_agent.get(req_field) ) if not final_value: missing_published_fields.append(field_label) source_val = update_fields.get("source", existing_agent.get("source")) sources_val = update_fields.get( "sources", existing_agent.get("sources", []) ) has_valid_source = ( isinstance(source_val, DBRef) or source_val == "default" or (isinstance(sources_val, list) and len(sources_val) > 0) ) if not has_valid_source: missing_published_fields.append("Source") if missing_published_fields: return make_response( jsonify( { "success": False, "message": f"Cannot publish agent. Missing or invalid required fields: {', '.join(missing_published_fields)}", } ), 400, ) if not existing_agent.get("key"): newly_generated_key = str(uuid.uuid4()) update_fields["key"] = newly_generated_key update_fields["updatedAt"] = datetime.datetime.now(datetime.timezone.utc) try: result = agents_collection.update_one( {"_id": oid, "user": user}, {"$set": update_fields} ) if result.matched_count == 0: return make_response( jsonify( { "success": False, "message": "Agent not found or update failed", } ), 404, ) if result.modified_count == 0 and result.matched_count == 1: return make_response( jsonify( { "success": True, "message": "No changes detected", "id": agent_id, } ), 200, ) except Exception as err: current_app.logger.error( f"Error updating agent {agent_id}: {err}", exc_info=True ) return make_response( jsonify({"success": False, "message": "Database error during update"}), 500, ) response_data = { "success": True, "id": agent_id, "message": "Agent updated successfully", } if newly_generated_key: response_data["key"] 
@agents_ns.route("/delete_agent")
class DeleteAgent(Resource):
    """Delete one of the caller's agents and, for workflow agents, the workflow it owns."""

    @staticmethod
    def _delete_owned_workflow(deleted_agent, user):
        """Remove the workflow (plus its nodes and edges) attached to a deleted workflow agent.

        Nothing is deleted unless the agent is of type ``workflow``, references a
        syntactically valid workflow ID, and that workflow belongs to ``user``.
        """
        if deleted_agent.get("agent_type") != "workflow" or not deleted_agent.get(
            "workflow"
        ):
            return
        workflow_id = normalize_workflow_reference(deleted_agent.get("workflow"))
        if not workflow_id:
            return
        if not ObjectId.is_valid(workflow_id):
            current_app.logger.warning(
                f"Skipping workflow cleanup for invalid workflow id {workflow_id}"
            )
            return
        workflow_oid = ObjectId(workflow_id)
        # Only touch workflows owned by the caller; the projection keeps the lookup cheap.
        owned_workflow = workflows_collection.find_one(
            {"_id": workflow_oid, "user": user}, {"_id": 1}
        )
        if not owned_workflow:
            current_app.logger.warning(
                f"Skipping workflow cleanup for non-owned workflow {workflow_id}"
            )
            return
        workflow_nodes_collection.delete_many({"workflow_id": workflow_id})
        workflow_edges_collection.delete_many({"workflow_id": workflow_id})
        workflows_collection.delete_one({"_id": workflow_oid, "user": user})

    @api.doc(params={"id": "ID of the agent"}, description="Delete an agent by ID")
    def delete(self):
        """Delete the agent named by the ``id`` query parameter.

        Responses:
            200 ``{"id": <agent id>}`` once the agent document has been removed.
            400 when ``id`` is missing or malformed, or the delete call fails.
            401 when the request carries no decoded token.
            404 when no agent with that ID belongs to the caller.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        agent_id = request.args.get("id")
        if not agent_id:
            return make_response(
                jsonify({"success": False, "message": "ID is required"}), 400
            )
        # Reject malformed IDs explicitly instead of letting ObjectId() raise into
        # the generic handler, which logged a stack trace for plain bad input.
        if not ObjectId.is_valid(agent_id):
            return make_response(
                jsonify({"success": False, "message": "Invalid agent ID format"}), 400
            )
        try:
            deleted_agent = agents_collection.find_one_and_delete(
                {"_id": ObjectId(agent_id), "user": user}
            )
        except Exception as err:
            current_app.logger.error(f"Error deleting agent: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        if not deleted_agent:
            return make_response(
                jsonify({"success": False, "message": "Agent not found"}), 404
            )
        deleted_id = str(deleted_agent["_id"])

        # The agent is already gone at this point, so a cleanup failure must not be
        # reported as a failed delete; log it and still confirm the deletion.
        try:
            self._delete_owned_workflow(deleted_agent, user)
        except Exception as err:
            current_app.logger.error(
                f"Error cleaning up workflow for deleted agent {deleted_id}: {err}",
                exc_info=True,
            )
        return make_response(jsonify({"id": deleted_id}), 200)
False}), 401) user_id = decoded_token.get("sub") try: user_doc = ensure_user_doc(user_id) pinned_ids = user_doc.get("agent_preferences", {}).get("pinned", []) if not pinned_ids: return make_response(jsonify([]), 200) pinned_object_ids = [ObjectId(agent_id) for agent_id in pinned_ids] pinned_agents_cursor = agents_collection.find( {"_id": {"$in": pinned_object_ids}} ) pinned_agents = list(pinned_agents_cursor) existing_ids = {str(agent["_id"]) for agent in pinned_agents} # Clean up any stale pinned IDs stale_ids = [ agent_id for agent_id in pinned_ids if agent_id not in existing_ids ] if stale_ids: users_collection.update_one( {"user_id": user_id}, {"$pullAll": {"agent_preferences.pinned": stale_ids}}, ) list_pinned_agents = [ { "id": str(agent["_id"]), "name": agent.get("name", ""), "description": agent.get("description", ""), "image": ( generate_image_url(agent["image"]) if agent.get("image") else "" ), "source": ( str(db.dereference(agent["source"])["_id"]) if "source" in agent and agent["source"] and isinstance(agent["source"], DBRef) and db.dereference(agent["source"]) is not None else "" ), "chunks": agent.get("chunks", ""), "retriever": agent.get("retriever", ""), "prompt_id": agent.get("prompt_id", ""), "tools": agent.get("tools", []), "tool_details": resolve_tool_details(agent.get("tools", [])), "agent_type": agent.get("agent_type", ""), "status": agent.get("status", ""), "created_at": agent.get("createdAt", ""), "updated_at": agent.get("updatedAt", ""), "last_used_at": agent.get("lastUsedAt", ""), "key": ( f"{agent['key'][:4]}...{agent['key'][-4:]}" if "key" in agent else "" ), "pinned": True, } for agent in pinned_agents if "source" in agent or "retriever" in agent ] except Exception as err: current_app.logger.error(f"Error retrieving pinned agents: {err}") return make_response(jsonify({"success": False}), 400) return make_response(jsonify(list_pinned_agents), 200) @agents_ns.route("/template_agents") class GetTemplateAgents(Resource): 
@agents_ns.route("/adopt_agent")
class AdoptAgent(Resource):
    """Copy a template agent (owned by the ``system`` user) into the caller's account."""

    @api.doc(params={"id": "Agent ID"}, description="Adopt an agent by ID")
    def post(self):
        """Clone the template agent named by the ``id`` query parameter.

        The copy is owned by the caller, marked ``published`` and given a fresh key.

        Responses:
            200 ``{"success": True, "agent": {...}}`` describing the new copy.
            400 when ``id`` is missing or malformed, or any step raises.
            401 when the request carries no decoded token.
            404 when no template agent with that ID exists.
        """
        if not (decoded_token := request.decoded_token):
            return make_response(jsonify({"success": False}), 401)
        if not (agent_id := request.args.get("id")):
            return make_response(
                jsonify({"success": False, "message": "ID required"}), 400
            )
        # Fail fast on malformed IDs rather than relying on the generic handler.
        if not ObjectId.is_valid(agent_id):
            return make_response(
                jsonify({"success": False, "message": "Invalid agent ID format"}), 400
            )
        try:
            agent = agents_collection.find_one(
                {"_id": ObjectId(agent_id), "user": "system"}
            )
            if not agent:
                return make_response(jsonify({"status": "Not found"}), 404)

            new_agent = agent.copy()
            new_agent.pop("_id", None)
            new_agent["user"] = decoded_token["sub"]
            new_agent["status"] = "published"
            new_agent["lastUsedAt"] = datetime.datetime.now(datetime.timezone.utc)
            new_agent["key"] = str(uuid.uuid4())
            insert_result = agents_collection.insert_one(new_agent)

            # Build the response from a copy so the stored document keeps its DBRefs.
            response_agent = new_agent.copy()
            response_agent.pop("_id", None)
            response_agent["id"] = str(insert_result.inserted_id)
            response_agent["tool_details"] = resolve_tool_details(
                response_agent.get("tools", [])
            )
            if isinstance(response_agent.get("source"), DBRef):
                response_agent["source"] = str(response_agent["source"].id)
            # Agents may also carry a multi-source list holding DBRefs; stringify
            # those too so serialising the response cannot fail after the insert.
            sources = response_agent.get("sources")
            if isinstance(sources, list):
                response_agent["sources"] = [
                    str(entry.id) if isinstance(entry, DBRef) else entry
                    for entry in sources
                ]
            return make_response(
                jsonify({"success": True, "agent": response_agent}), 200
            )
        except Exception as e:
            current_app.logger.error(f"Agent adopt error: {e}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
@agents_ns.route("/remove_shared_agent")
class RemoveSharedAgent(Resource):
    """Detach a publicly shared agent from the caller's own preference lists."""

    @api.doc(
        params={"id": "ID of the shared agent"},
        description="Remove a shared agent from the current user's shared list",
    )
    def delete(self):
        """Pull the agent ``id`` out of the caller's ``shared_with_me`` and ``pinned`` lists.

        Responses:
            200 ``{"success": True, "action": "removed"}`` after the update is issued.
            400 when the ``id`` query parameter is missing.
            401 when the request carries no decoded token.
            404 when no publicly shared agent has that ID.
            500 when anything in the lookup or update raises.
        """
        if not (token_claims := request.decoded_token):
            return make_response(jsonify({"success": False}), 401)

        user_id = token_claims.get("sub")
        agent_id = request.args.get("id")
        if not agent_id:
            missing_id = {"success": False, "message": "ID is required"}
            return make_response(jsonify(missing_id), 400)

        try:
            lookup = {"_id": ObjectId(agent_id), "shared_publicly": True}
            if agents_collection.find_one(lookup):
                # Make sure the user document exists before updating its lists.
                ensure_user_doc(user_id)
                pull_spec = {
                    "agent_preferences.shared_with_me": agent_id,
                    "agent_preferences.pinned": agent_id,
                }
                users_collection.update_one({"user_id": user_id}, {"$pull": pull_spec})
                body, status = {"success": True, "action": "removed"}, 200
            else:
                body = {"success": False, "message": "Shared agent not found"}
                status = 404
            return make_response(jsonify(body), status)
        except Exception as exc:
            current_app.logger.error(f"Error removing shared agent: {exc}")
            failure = {"success": False, "message": "Server error"}
            return make_response(jsonify(failure), 500)
isinstance(shared_agent.get("source"), DBRef) and (source_doc := db.dereference(shared_agent.get("source"))) else "" ), "chunks": shared_agent.get("chunks", "0"), "retriever": shared_agent.get("retriever", "classic"), "prompt_id": shared_agent.get("prompt_id", "default"), "tools": shared_agent.get("tools", []), "tool_details": resolve_tool_details(shared_agent.get("tools", [])), "agent_type": shared_agent.get("agent_type", ""), "status": shared_agent.get("status", ""), "json_schema": shared_agent.get("json_schema"), "limited_token_mode": shared_agent.get("limited_token_mode", False), "token_limit": shared_agent.get("token_limit", settings.DEFAULT_AGENT_LIMITS["token_limit"]), "limited_request_mode": shared_agent.get("limited_request_mode", False), "request_limit": shared_agent.get("request_limit", settings.DEFAULT_AGENT_LIMITS["request_limit"]), "created_at": shared_agent.get("createdAt", ""), "updated_at": shared_agent.get("updatedAt", ""), "shared": shared_agent.get("shared_publicly", False), "shared_token": shared_agent.get("shared_token", ""), "shared_metadata": shared_agent.get("shared_metadata", {}), } if data["tools"]: enriched_tools = [] for tool in data["tools"]: tool_data = user_tools_collection.find_one({"_id": ObjectId(tool)}) if tool_data: enriched_tools.append(tool_data.get("name", "")) data["tools"] = enriched_tools decoded_token = getattr(request, "decoded_token", None) if decoded_token: user_id = decoded_token.get("sub") owner_id = shared_agent.get("user") if user_id != owner_id: ensure_user_doc(user_id) users_collection.update_one( {"user_id": user_id}, {"$addToSet": {"agent_preferences.shared_with_me": agent_id}}, ) return make_response(jsonify(data), 200) except Exception as err: current_app.logger.error(f"Error retrieving shared agent: {err}") return make_response(jsonify({"success": False}), 400) @agents_sharing_ns.route("/shared_agents") class SharedAgents(Resource): @api.doc(description="Get shared agents explicitly shared with the user") 
def get(self): try: decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user_id = decoded_token.get("sub") user_doc = ensure_user_doc(user_id) shared_with_ids = user_doc.get("agent_preferences", {}).get( "shared_with_me", [] ) shared_object_ids = [ObjectId(id) for id in shared_with_ids] shared_agents_cursor = agents_collection.find( {"_id": {"$in": shared_object_ids}, "shared_publicly": True} ) shared_agents = list(shared_agents_cursor) found_ids_set = {str(agent["_id"]) for agent in shared_agents} stale_ids = [id for id in shared_with_ids if id not in found_ids_set] if stale_ids: users_collection.update_one( {"user_id": user_id}, {"$pullAll": {"agent_preferences.shared_with_me": stale_ids}}, ) pinned_ids = set(user_doc.get("agent_preferences", {}).get("pinned", [])) list_shared_agents = [ { "id": str(agent["_id"]), "name": agent.get("name", ""), "description": agent.get("description", ""), "image": ( generate_image_url(agent["image"]) if agent.get("image") else "" ), "tools": agent.get("tools", []), "tool_details": resolve_tool_details(agent.get("tools", [])), "agent_type": agent.get("agent_type", ""), "status": agent.get("status", ""), "json_schema": agent.get("json_schema"), "limited_token_mode": agent.get("limited_token_mode", False), "token_limit": agent.get("token_limit", settings.DEFAULT_AGENT_LIMITS["token_limit"]), "limited_request_mode": agent.get("limited_request_mode", False), "request_limit": agent.get("request_limit", settings.DEFAULT_AGENT_LIMITS["request_limit"]), "created_at": agent.get("createdAt", ""), "updated_at": agent.get("updatedAt", ""), "pinned": str(agent["_id"]) in pinned_ids, "shared": agent.get("shared_publicly", False), "shared_token": agent.get("shared_token", ""), "shared_metadata": agent.get("shared_metadata", {}), } for agent in shared_agents ] return make_response(jsonify(list_shared_agents), 200) except Exception as err: current_app.logger.error(f"Error retrieving 
@agents_sharing_ns.route("/share_agent")
class ShareAgent(Resource):
    """Turn public sharing of one of the caller's agents on or off."""

    @api.expect(
        api.model(
            "ShareAgentModel",
            {
                "id": fields.String(required=True, description="ID of the agent"),
                "shared": fields.Boolean(
                    required=True, description="Share or unshare the agent"
                ),
                "username": fields.String(
                    required=False, description="Name of the user"
                ),
            },
        )
    )
    @api.doc(description="Share or unshare an agent")
    def put(self):
        """Share or unshare the agent described by the JSON body.

        Body fields: ``id`` (agent ID), ``shared`` (boolean) and optional ``username``.

        Responses:
            200 ``{"success": True, "shared_token": <token or None>}``.
            400 for a missing body, missing/invalid ``id``, missing or non-boolean
                ``shared``, or when the database update raises.
            401 when the request carries no decoded token.
            404 when the agent does not exist or is not owned by the caller.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()
        if not data:
            return make_response(
                jsonify({"success": False, "message": "Missing JSON body"}), 400
            )
        agent_id = data.get("id")
        shared = data.get("shared")
        username = data.get("username", "")
        if not agent_id:
            return make_response(
                jsonify({"success": False, "message": "ID is required"}), 400
            )
        # The stored flag is later matched with `"shared_publicly": True`, so only a
        # real boolean is acceptable here (a truthy string would never match).
        if not isinstance(shared, bool):
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": "Shared parameter is required and must be true or false",
                    }
                ),
                400,
            )

        shared_token = None
        try:
            try:
                agent_oid = ObjectId(agent_id)
            except Exception:
                return make_response(
                    jsonify({"success": False, "message": "Invalid agent ID"}), 400
                )
            agent = agents_collection.find_one({"_id": agent_oid, "user": user})
            if not agent:
                return make_response(
                    jsonify({"success": False, "message": "Agent not found"}), 404
                )
            if shared:
                shared_metadata = {
                    "shared_by": username,
                    "shared_at": datetime.datetime.now(datetime.timezone.utc),
                }
                shared_token = secrets.token_urlsafe(32)
                update_doc = {
                    "$set": {
                        "shared_publicly": shared,
                        "shared_metadata": shared_metadata,
                        "shared_token": shared_token,
                    }
                }
            else:
                # Both operators must live in ONE update document. Passing the
                # `$unset` part as a third positional argument binds it to
                # update_one's `upsert` parameter instead of applying it.
                update_doc = {
                    "$set": {"shared_publicly": shared, "shared_token": None},
                    "$unset": {"shared_metadata": ""},
                }
            agents_collection.update_one({"_id": agent_oid, "user": user}, update_doc)
        except Exception as err:
            current_app.logger.error(
                f"Error sharing/unsharing agent: {err}", exc_info=True
            )
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "error": "Failed to update agent sharing status",
                    }
                ),
                400,
            )
        return make_response(
            jsonify({"success": True, "shared_token": shared_token}), 200
        )
@agents_webhooks_ns.route("/webhooks/agents/")
class AgentWebhookListener(Resource):
    """Receive incoming agent webhooks and hand them to the background task queue.

    Every handler is wrapped by ``require_agent`` through ``method_decorators``.
    NOTE(review): the handlers accept ``webhook_token``, ``agent`` and
    ``agent_id_str``; presumably the decorator and the URL rule supply them —
    confirm the route declares the token variable.
    """

    method_decorators = [require_agent]

    def _enqueue_webhook_task(self, agent_id_str, payload, source_method):
        """Queue ``process_agent_webhook`` for the agent and report the task ID.

        Returns a 200 response containing ``task_id`` on success, or a 500
        response when enqueuing (or the logging around it) raises.
        """
        log = current_app.logger
        if not payload:
            # An empty payload is logged but still enqueued.
            log.warning(
                f"Webhook ({source_method}) received for agent {agent_id_str} with empty payload."
            )
        log.info(
            f"Incoming {source_method} webhook for agent {agent_id_str}. Enqueuing task with payload: {payload}"
        )

        try:
            queued = process_agent_webhook.delay(agent_id=agent_id_str, payload=payload)
            log.info(
                f"Task {queued.id} enqueued for agent {agent_id_str} ({source_method})."
            )
            accepted = {"success": True, "task_id": queued.id}
            return make_response(jsonify(accepted), 200)
        except Exception as exc:
            log.error(
                f"Error enqueuing webhook task ({source_method}) for agent {agent_id_str}: {exc}",
                exc_info=True,
            )
            rejected = {"success": False, "message": "Error processing webhook"}
            return make_response(jsonify(rejected), 500)

    @api.doc(
        description="Webhook listener for agent events (POST). Expects JSON payload, which is used to trigger processing.",
    )
    def post(self, webhook_token, agent, agent_id_str):
        """Enqueue a task using the JSON request body as payload; 400 if the body parses to ``None``."""
        payload = request.get_json()
        if payload is not None:
            return self._enqueue_webhook_task(
                agent_id_str, payload, source_method="POST"
            )
        invalid_body = {
            "success": False,
            "message": "Invalid or missing JSON data in request body",
        }
        return make_response(jsonify(invalid_body), 400)

    @api.doc(
        description="Webhook listener for agent events (GET). Uses URL query parameters as payload to trigger processing.",
    )
    def get(self, webhook_token, agent, agent_id_str):
        """Enqueue a task using the URL query parameters (flattened to a dict) as payload."""
        query_payload = request.args.to_dict(flat=True)
        return self._enqueue_webhook_task(
            agent_id_str, query_payload, source_method="GET"
        )
agents_collection.find_one({"_id": ObjectId(api_key_id), "user": user})[ "key" ] if api_key_id else None ) except Exception as err: current_app.logger.error(f"Error getting API key: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) end_date = datetime.datetime.now(datetime.timezone.utc) if filter_option == "last_hour": start_date = end_date - datetime.timedelta(hours=1) group_format = "%Y-%m-%d %H:%M:00" elif filter_option == "last_24_hour": start_date = end_date - datetime.timedelta(hours=24) group_format = "%Y-%m-%d %H:00" else: if filter_option in ["last_7_days", "last_15_days", "last_30_days"]: filter_days = ( 6 if filter_option == "last_7_days" else 14 if filter_option == "last_15_days" else 29 ) else: return make_response( jsonify({"success": False, "message": "Invalid option"}), 400 ) start_date = end_date - datetime.timedelta(days=filter_days) start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0) end_date = end_date.replace( hour=23, minute=59, second=59, microsecond=999999 ) group_format = "%Y-%m-%d" try: match_stage = { "$match": { "user": user, } } if api_key: match_stage["$match"]["api_key"] = api_key pipeline = [ match_stage, {"$unwind": "$queries"}, { "$match": { "queries.timestamp": {"$gte": start_date, "$lte": end_date} } }, { "$group": { "_id": { "$dateToString": { "format": group_format, "date": "$queries.timestamp", } }, "count": {"$sum": 1}, } }, {"$sort": {"_id": 1}}, ] message_data = conversations_collection.aggregate(pipeline) if filter_option == "last_hour": intervals = generate_minute_range(start_date, end_date) elif filter_option == "last_24_hour": intervals = generate_hourly_range(start_date, end_date) else: intervals = generate_date_range(start_date, end_date) daily_messages = {interval: 0 for interval in intervals} for entry in message_data: daily_messages[entry["_id"]] = entry["count"] except Exception as err: current_app.logger.error( f"Error getting message analytics: {err}", 
exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response( jsonify({"success": True, "messages": daily_messages}), 200 ) @analytics_ns.route("/get_token_analytics") class GetTokenAnalytics(Resource): get_token_analytics_model = api.model( "GetTokenAnalyticsModel", { "api_key_id": fields.String(required=False, description="API Key ID"), "filter_option": fields.String( required=False, description="Filter option for analytics", default="last_30_days", enum=[ "last_hour", "last_24_hour", "last_7_days", "last_15_days", "last_30_days", ], ), }, ) @api.expect(get_token_analytics_model) @api.doc(description="Get token analytics data") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() api_key_id = data.get("api_key_id") filter_option = data.get("filter_option", "last_30_days") try: api_key = ( agents_collection.find_one({"_id": ObjectId(api_key_id), "user": user})[ "key" ] if api_key_id else None ) except Exception as err: current_app.logger.error(f"Error getting API key: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) end_date = datetime.datetime.now(datetime.timezone.utc) if filter_option == "last_hour": start_date = end_date - datetime.timedelta(hours=1) group_format = "%Y-%m-%d %H:%M:00" group_stage = { "$group": { "_id": { "minute": { "$dateToString": { "format": group_format, "date": "$timestamp", } } }, "total_tokens": { "$sum": {"$add": ["$prompt_tokens", "$generated_tokens"]} }, } } elif filter_option == "last_24_hour": start_date = end_date - datetime.timedelta(hours=24) group_format = "%Y-%m-%d %H:00" group_stage = { "$group": { "_id": { "hour": { "$dateToString": { "format": group_format, "date": "$timestamp", } } }, "total_tokens": { "$sum": {"$add": ["$prompt_tokens", "$generated_tokens"]} }, } } else: if filter_option in ["last_7_days", "last_15_days", 
"last_30_days"]: filter_days = ( 6 if filter_option == "last_7_days" else (14 if filter_option == "last_15_days" else 29) ) else: return make_response( jsonify({"success": False, "message": "Invalid option"}), 400 ) start_date = end_date - datetime.timedelta(days=filter_days) start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0) end_date = end_date.replace( hour=23, minute=59, second=59, microsecond=999999 ) group_format = "%Y-%m-%d" group_stage = { "$group": { "_id": { "day": { "$dateToString": { "format": group_format, "date": "$timestamp", } } }, "total_tokens": { "$sum": {"$add": ["$prompt_tokens", "$generated_tokens"]} }, } } try: match_stage = { "$match": { "user_id": user, "timestamp": {"$gte": start_date, "$lte": end_date}, } } if api_key: match_stage["$match"]["api_key"] = api_key token_usage_data = token_usage_collection.aggregate( [ match_stage, group_stage, {"$sort": {"_id": 1}}, ] ) if filter_option == "last_hour": intervals = generate_minute_range(start_date, end_date) elif filter_option == "last_24_hour": intervals = generate_hourly_range(start_date, end_date) else: intervals = generate_date_range(start_date, end_date) daily_token_usage = {interval: 0 for interval in intervals} for entry in token_usage_data: if filter_option == "last_hour": daily_token_usage[entry["_id"]["minute"]] = entry["total_tokens"] elif filter_option == "last_24_hour": daily_token_usage[entry["_id"]["hour"]] = entry["total_tokens"] else: daily_token_usage[entry["_id"]["day"]] = entry["total_tokens"] except Exception as err: current_app.logger.error( f"Error getting token analytics: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response( jsonify({"success": True, "token_usage": daily_token_usage}), 200 ) @analytics_ns.route("/get_feedback_analytics") class GetFeedbackAnalytics(Resource): get_feedback_analytics_model = api.model( "GetFeedbackAnalyticsModel", { "api_key_id": fields.String(required=False, 
description="API Key ID"), "filter_option": fields.String( required=False, description="Filter option for analytics", default="last_30_days", enum=[ "last_hour", "last_24_hour", "last_7_days", "last_15_days", "last_30_days", ], ), }, ) @api.expect(get_feedback_analytics_model) @api.doc(description="Get feedback analytics data") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() api_key_id = data.get("api_key_id") filter_option = data.get("filter_option", "last_30_days") try: api_key = ( agents_collection.find_one({"_id": ObjectId(api_key_id), "user": user})[ "key" ] if api_key_id else None ) except Exception as err: current_app.logger.error(f"Error getting API key: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) end_date = datetime.datetime.now(datetime.timezone.utc) if filter_option == "last_hour": start_date = end_date - datetime.timedelta(hours=1) group_format = "%Y-%m-%d %H:%M:00" date_field = { "$dateToString": { "format": group_format, "date": "$queries.feedback_timestamp", } } elif filter_option == "last_24_hour": start_date = end_date - datetime.timedelta(hours=24) group_format = "%Y-%m-%d %H:00" date_field = { "$dateToString": { "format": group_format, "date": "$queries.feedback_timestamp", } } else: if filter_option in ["last_7_days", "last_15_days", "last_30_days"]: filter_days = ( 6 if filter_option == "last_7_days" else (14 if filter_option == "last_15_days" else 29) ) else: return make_response( jsonify({"success": False, "message": "Invalid option"}), 400 ) start_date = end_date - datetime.timedelta(days=filter_days) start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0) end_date = end_date.replace( hour=23, minute=59, second=59, microsecond=999999 ) group_format = "%Y-%m-%d" date_field = { "$dateToString": { "format": group_format, "date": 
"$queries.feedback_timestamp", } } try: match_stage = { "$match": { "queries.feedback_timestamp": { "$gte": start_date, "$lte": end_date, }, "queries.feedback": {"$exists": True}, } } if api_key: match_stage["$match"]["api_key"] = api_key pipeline = [ match_stage, {"$unwind": "$queries"}, {"$match": {"queries.feedback": {"$exists": True}}}, { "$group": { "_id": {"time": date_field, "feedback": "$queries.feedback"}, "count": {"$sum": 1}, } }, { "$group": { "_id": "$_id.time", "positive": { "$sum": { "$cond": [ {"$eq": ["$_id.feedback", "LIKE"]}, "$count", 0, ] } }, "negative": { "$sum": { "$cond": [ {"$eq": ["$_id.feedback", "DISLIKE"]}, "$count", 0, ] } }, } }, {"$sort": {"_id": 1}}, ] feedback_data = conversations_collection.aggregate(pipeline) if filter_option == "last_hour": intervals = generate_minute_range(start_date, end_date) elif filter_option == "last_24_hour": intervals = generate_hourly_range(start_date, end_date) else: intervals = generate_date_range(start_date, end_date) daily_feedback = { interval: {"positive": 0, "negative": 0} for interval in intervals } for entry in feedback_data: daily_feedback[entry["_id"]] = { "positive": entry["positive"], "negative": entry["negative"], } except Exception as err: current_app.logger.error( f"Error getting feedback analytics: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response( jsonify({"success": True, "feedback": daily_feedback}), 200 ) @analytics_ns.route("/get_user_logs") class GetUserLogs(Resource): get_user_logs_model = api.model( "GetUserLogsModel", { "page": fields.Integer( required=False, description="Page number for pagination", default=1, ), "api_key_id": fields.String(required=False, description="API Key ID"), "page_size": fields.Integer( required=False, description="Number of logs per page", default=10, ), }, ) @api.expect(get_user_logs_model) @api.doc(description="Get user logs with pagination") def post(self): decoded_token = request.decoded_token if 
not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() page = int(data.get("page", 1)) api_key_id = data.get("api_key_id") page_size = int(data.get("page_size", 10)) skip = (page - 1) * page_size try: api_key = ( agents_collection.find_one({"_id": ObjectId(api_key_id)})["key"] if api_key_id else None ) except Exception as err: current_app.logger.error(f"Error getting API key: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) query = {"user": user} if api_key: query = {"api_key": api_key} items_cursor = ( user_logs_collection.find(query) .sort("timestamp", -1) .skip(skip) .limit(page_size + 1) ) items = list(items_cursor) results = [ { "id": str(item.get("_id")), "action": item.get("action"), "level": item.get("level"), "user": item.get("user"), "question": item.get("question"), "sources": item.get("sources"), "retriever_params": item.get("retriever_params"), "timestamp": item.get("timestamp"), } for item in items[:page_size] ] has_more = len(items) > page_size return make_response( jsonify( { "success": True, "logs": results, "page": page, "page_size": page_size, "has_more": has_more, } ), 200, ) ================================================ FILE: application/api/user/attachments/__init__.py ================================================ """Attachments module.""" from .routes import attachments_ns __all__ = ["attachments_ns"] ================================================ FILE: application/api/user/attachments/routes.py ================================================ """File attachments and media routes.""" import os import tempfile from pathlib import Path from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, request from flask_restx import fields, Namespace, Resource from application.api import api from application.cache import get_redis_instance from application.core.settings import settings from 
application.stt.constants import ( SUPPORTED_AUDIO_EXTENSIONS, SUPPORTED_AUDIO_MIME_TYPES, ) from application.stt.upload_limits import ( AudioFileTooLargeError, build_stt_file_size_limit_message, enforce_audio_file_size_limit, is_audio_filename, ) from application.stt.live_session import ( apply_live_stt_hypothesis, create_live_stt_session, delete_live_stt_session, finalize_live_stt_session, get_live_stt_transcript_text, load_live_stt_session, save_live_stt_session, ) from application.stt.stt_creator import STTCreator from application.tts.tts_creator import TTSCreator from application.utils import safe_filename attachments_ns = Namespace( "attachments", description="File attachments and media operations", path="/api" ) def _resolve_authenticated_user(): decoded_token = getattr(request, "decoded_token", None) api_key = request.form.get("api_key") or request.args.get("api_key") if decoded_token: return safe_filename(decoded_token.get("sub")) if api_key: from application.api.user.base import agents_collection agent = agents_collection.find_one({"key": api_key}) if not agent: return make_response( jsonify({"success": False, "message": "Invalid API key"}), 401 ) return safe_filename(agent.get("user")) return None def _get_uploaded_file_size(file) -> int: try: current_position = file.stream.tell() file.stream.seek(0, os.SEEK_END) size_bytes = file.stream.tell() file.stream.seek(current_position) return size_bytes except Exception: return 0 def _is_supported_audio_mimetype(mimetype: str) -> bool: if not mimetype: return True normalized = mimetype.split(";")[0].strip().lower() return normalized.startswith("audio/") or normalized in SUPPORTED_AUDIO_MIME_TYPES def _enforce_uploaded_audio_size_limit(file, filename: str) -> None: if not is_audio_filename(filename): return size_bytes = _get_uploaded_file_size(file) if size_bytes: enforce_audio_file_size_limit(size_bytes) def _get_store_attachment_user_error(exc: Exception) -> str: if isinstance(exc, AudioFileTooLargeError): 
return build_stt_file_size_limit_message() return "Failed to process file" def _require_live_stt_redis(): redis_client = get_redis_instance() if redis_client: return redis_client return make_response( jsonify({"success": False, "message": "Live transcription is unavailable"}), 503, ) def _parse_bool_form_value(value: str | None) -> bool: if value is None: return False return value.strip().lower() in {"1", "true", "yes", "on"} @attachments_ns.route("/store_attachment") class StoreAttachment(Resource): @api.expect( api.model( "AttachmentModel", { "file": fields.Raw(required=True, description="File(s) to upload"), "api_key": fields.String( required=False, description="API key (optional)" ), }, ) ) @api.doc( description="Stores one or multiple attachments without vectorization or training. Supports user or API key authentication." ) def post(self): auth_user = _resolve_authenticated_user() if hasattr(auth_user, "status_code"): return auth_user files = request.files.getlist("file") if not files: single_file = request.files.get("file") if single_file: files = [single_file] if not files or all(f.filename == "" for f in files): return make_response( jsonify({"status": "error", "message": "Missing file(s)"}), 400, ) user = auth_user if not user: return make_response( jsonify({"success": False, "message": "Authentication required"}), 401 ) try: from application.api.user.tasks import store_attachment from application.api.user.base import storage tasks = [] errors = [] original_file_count = len(files) for idx, file in enumerate(files): try: attachment_id = ObjectId() original_filename = safe_filename(os.path.basename(file.filename)) _enforce_uploaded_audio_size_limit(file, original_filename) relative_path = f"{settings.UPLOAD_FOLDER}/{user}/attachments/{str(attachment_id)}/{original_filename}" metadata = storage.save_file(file, relative_path) file_info = { "filename": original_filename, "attachment_id": str(attachment_id), "path": relative_path, "metadata": metadata, } task = 
store_attachment.delay(file_info, user) tasks.append({ "task_id": task.id, "filename": original_filename, "attachment_id": str(attachment_id), "upload_index": idx, }) except Exception as file_err: current_app.logger.error(f"Error processing file {idx} ({file.filename}): {file_err}", exc_info=True) errors.append({ "upload_index": idx, "filename": file.filename, "error": _get_store_attachment_user_error(file_err), }) if not tasks: if errors and all( error.get("error") == build_stt_file_size_limit_message() for error in errors ): return make_response( jsonify( { "success": False, "message": build_stt_file_size_limit_message(), "errors": errors, } ), 413, ) return make_response( jsonify({"status": "error", "message": "No valid files to upload"}), 400, ) if original_file_count == 1 and len(tasks) == 1: current_app.logger.info("Returning single task_id response") return make_response( jsonify( { "success": True, "task_id": tasks[0]["task_id"], "message": "File uploaded successfully. Processing started.", } ), 200, ) else: response_data = { "success": True, "tasks": tasks, "message": f"{len(tasks)} file(s) uploaded successfully. Processing started.", } if errors: response_data["errors"] = errors response_data["message"] += f" {len(errors)} file(s) failed." 
@attachments_ns.route("/stt")
class SpeechToText(Resource):
    """Transcribe one uploaded audio file with the configured STT provider."""

    @api.expect(
        api.model(
            "SpeechToTextModel",
            {
                "file": fields.Raw(required=True, description="Audio file"),
                "language": fields.String(
                    required=False, description="Optional transcription language hint"
                ),
            },
        )
    )
    @api.doc(description="Transcribe an uploaded audio file")
    def post(self):
        """Validate the upload, transcribe it and return the transcript.

        Returns:
            200 with ``success`` plus the provider's transcript fields; 401
            when unauthenticated; 400 for a missing or unsupported file or a
            transcription failure; 413 when the audio exceeds the size limit.
        """
        auth_user = _resolve_authenticated_user()
        if hasattr(auth_user, "status_code"):
            # The helper already built an error response (invalid API key).
            return auth_user
        if not auth_user:
            return make_response(
                jsonify({"success": False, "message": "Authentication required"}),
                401,
            )

        file = request.files.get("file")
        if not file or file.filename == "":
            return make_response(
                jsonify({"success": False, "message": "Missing file"}),
                400,
            )

        filename = safe_filename(os.path.basename(file.filename))
        suffix = Path(filename).suffix.lower()
        if suffix not in SUPPORTED_AUDIO_EXTENSIONS:
            return make_response(
                jsonify({"success": False, "message": "Unsupported audio format"}),
                400,
            )

        if not _is_supported_audio_mimetype(file.mimetype or ""):
            return make_response(
                jsonify({"success": False, "message": "Unsupported audio MIME type"}),
                400,
            )

        try:
            _enforce_uploaded_audio_size_limit(file, filename)
        except AudioFileTooLargeError:
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": build_stt_file_size_limit_message(),
                    }
                ),
                413,
            )

        temp_path = None
        try:
            # Reserve a unique path and record it *before* writing any bytes,
            # so the ``finally`` clause can remove the file even when saving
            # the upload fails. The handle is closed before the upload is
            # written by name, avoiding two open handles on the same file.
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                temp_path = Path(temp_file.name)
            file.save(str(temp_path))

            stt_instance = STTCreator.create_stt(settings.STT_PROVIDER)
            transcript = stt_instance.transcribe(
                temp_path,
                language=request.form.get("language") or settings.STT_LANGUAGE,
                timestamps=settings.STT_ENABLE_TIMESTAMPS,
                diarize=settings.STT_ENABLE_DIARIZATION,
            )
            return make_response(jsonify({"success": True, **transcript}), 200)
        except Exception as err:
            current_app.logger.error(f"Error transcribing audio: {err}", exc_info=True)
            return make_response(
                jsonify({"success": False, "message": "Failed to transcribe audio"}),
                400,
            )
        finally:
            if temp_path and temp_path.exists():
                temp_path.unlink()
auth_user if not auth_user: return make_response( jsonify({"success": False, "message": "Authentication required"}), 401, ) redis_client = _require_live_stt_redis() if hasattr(redis_client, "status_code"): return redis_client session_id = request.form.get("session_id", "").strip() if not session_id: return make_response( jsonify({"success": False, "message": "Missing session_id"}), 400, ) session_state = load_live_stt_session(redis_client, session_id) if not session_state: return make_response( jsonify( { "success": False, "message": "Live transcription session not found", } ), 404, ) if safe_filename(str(session_state.get("user", ""))) != auth_user: return make_response( jsonify({"success": False, "message": "Forbidden"}), 403, ) chunk_index_raw = request.form.get("chunk_index", "").strip() if chunk_index_raw == "": return make_response( jsonify({"success": False, "message": "Missing chunk_index"}), 400, ) try: chunk_index = int(chunk_index_raw) except ValueError: return make_response( jsonify({"success": False, "message": "Invalid chunk_index"}), 400, ) is_silence = _parse_bool_form_value(request.form.get("is_silence")) file = request.files.get("file") if not file or file.filename == "": return make_response( jsonify({"success": False, "message": "Missing file"}), 400, ) filename = safe_filename(os.path.basename(file.filename)) suffix = Path(filename).suffix.lower() if suffix not in SUPPORTED_AUDIO_EXTENSIONS: return make_response( jsonify({"success": False, "message": "Unsupported audio format"}), 400, ) if not _is_supported_audio_mimetype(file.mimetype or ""): return make_response( jsonify({"success": False, "message": "Unsupported audio MIME type"}), 400, ) try: _enforce_uploaded_audio_size_limit(file, filename) except AudioFileTooLargeError: return make_response( jsonify( { "success": False, "message": build_stt_file_size_limit_message(), } ), 413, ) temp_path = None try: with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file: 
file.save(temp_file.name) temp_path = Path(temp_file.name) session_language = session_state.get("language") or settings.STT_LANGUAGE stt_instance = STTCreator.create_stt(settings.STT_PROVIDER) transcript = stt_instance.transcribe( temp_path, language=session_language, timestamps=False, diarize=False, ) if not session_state.get("language") and transcript.get("language"): session_state["language"] = transcript["language"] try: apply_live_stt_hypothesis( session_state, str(transcript.get("text", "")), chunk_index, is_silence=is_silence, ) except ValueError: current_app.logger.warning( "Invalid live transcription chunk", exc_info=True, ) return make_response( jsonify( { "success": False, "message": "Invalid live transcription chunk", } ), 409, ) save_live_stt_session(redis_client, session_state) return make_response( jsonify( { "success": True, "session_id": session_id, "chunk_index": chunk_index, "chunk_text": transcript.get("text", ""), "is_silence": is_silence, "language": session_state.get("language"), "committed_text": session_state.get("committed_text", ""), "mutable_text": session_state.get("mutable_text", ""), "previous_hypothesis": session_state.get( "previous_hypothesis", "" ), "latest_hypothesis": session_state.get( "latest_hypothesis", "" ), "finalized_text": session_state.get("committed_text", ""), "pending_text": session_state.get("mutable_text", ""), "transcript_text": get_live_stt_transcript_text(session_state), } ), 200, ) except Exception as err: current_app.logger.error( f"Error transcribing live audio chunk: {err}", exc_info=True ) return make_response( jsonify({"success": False, "message": "Failed to transcribe audio"}), 400, ) finally: if temp_path and temp_path.exists(): temp_path.unlink() @attachments_ns.route("/stt/live/finish") class LiveSpeechToTextFinish(Resource): @api.doc(description="Finish a live speech-to-text session") def post(self): auth_user = _resolve_authenticated_user() if hasattr(auth_user, "status_code"): return auth_user if 
# The route must declare ``image_path`` as a URL variable: ``get`` takes it as
# a required argument. The ``path`` converter lets the value contain slashes.
@attachments_ns.route("/images/<path:image_path>")
class ServeImage(Resource):
    """Serve a stored image by its storage path."""

    # Extensions whose MIME subtype is not simply the extension itself.
    _CONTENT_TYPE_OVERRIDES = {"jpg": "image/jpeg", "svg": "image/svg+xml"}

    @api.doc(description="Serve an image from storage")
    def get(self, image_path):
        """Return the image bytes with a Content-Type derived from the extension.

        Args:
            image_path: Storage path of the image, taken from the URL.

        Returns:
            The raw image with a one-day ``Cache-Control``; 404 when storage
            raises ``FileNotFoundError``; 500 on any other failure.
        """
        try:
            from application.api.user.base import storage

            file_obj = storage.get_file(image_path)
            try:
                payload = file_obj.read()
            finally:
                # Release the handle once the bytes are in memory; tolerate
                # storage objects that expose no ``close``.
                close = getattr(file_obj, "close", None)
                if callable(close):
                    close()

            extension = image_path.split(".")[-1].lower()
            content_type = self._CONTENT_TYPE_OVERRIDES.get(
                extension, f"image/{extension}"
            )

            response = make_response(payload)
            response.headers.set("Content-Type", content_type)
            response.headers.set("Cache-Control", "max-age=86400")
            return response
        except FileNotFoundError:
            return make_response(
                jsonify({"success": False, "message": "Image not found"}), 404
            )
        except Exception as e:
            current_app.logger.error(f"Error serving image: {e}", exc_info=True)
            return make_response(
                jsonify({"success": False, "message": "Error retrieving image"}), 500
            )
description="Text to be synthesized as audio" ), }, ) @api.expect(tts_model) @api.doc(description="Synthesize audio speech from text") def post(self): data = request.get_json() text = data["text"] try: tts_instance = TTSCreator.create_tts(settings.TTS_PROVIDER) audio_base64, detected_language = tts_instance.text_to_speech(text) return make_response( jsonify( { "success": True, "audio_base64": audio_base64, "lang": detected_language, } ), 200, ) except Exception as err: current_app.logger.error(f"Error synthesizing audio: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) ================================================ FILE: application/api/user/base.py ================================================ """ Shared utilities, database connections, and helper functions for user API routes. """ import datetime import os import uuid from functools import wraps from typing import Optional, Tuple from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, Response from pymongo import ReturnDocument from werkzeug.utils import secure_filename from application.core.mongo_db import MongoDB from application.core.settings import settings from application.storage.storage_creator import StorageCreator from application.vectorstore.vector_creator import VectorCreator storage = StorageCreator.get_storage() mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] conversations_collection = db["conversations"] sources_collection = db["sources"] prompts_collection = db["prompts"] feedback_collection = db["feedback"] agents_collection = db["agents"] agent_folders_collection = db["agent_folders"] token_usage_collection = db["token_usage"] shared_conversations_collections = db["shared_conversations"] users_collection = db["users"] user_logs_collection = db["user_logs"] user_tools_collection = db["user_tools"] attachments_collection = db["attachments"] workflow_runs_collection = db["workflow_runs"] workflows_collection = 
def generate_minute_range(start_date, end_date):
    """Build a zero-filled mapping with one key per minute.

    Keys are ``"%Y-%m-%d %H:%M:00"`` strings from ``start_date`` onward, one
    for every whole minute that fits between the two datetimes, endpoints
    included. The result is empty when ``end_date`` is more than a minute
    before ``start_date``.
    """
    one_minute = datetime.timedelta(minutes=1)
    bucket_count = int((end_date - start_date).total_seconds() // 60) + 1
    buckets = {}
    for offset in range(bucket_count):
        label = (start_date + offset * one_minute).strftime("%Y-%m-%d %H:%M:00")
        buckets[label] = 0
    return buckets


def generate_hourly_range(start_date, end_date):
    """Build a zero-filled mapping with one ``"%Y-%m-%d %H:00"`` key per hour."""
    one_hour = datetime.timedelta(hours=1)
    bucket_count = int((end_date - start_date).total_seconds() // 3600) + 1
    labels = (
        (start_date + offset * one_hour).strftime("%Y-%m-%d %H:00")
        for offset in range(bucket_count)
    )
    return dict.fromkeys(labels, 0)


def generate_date_range(start_date, end_date):
    """Build a zero-filled mapping with one ``"%Y-%m-%d"`` key per day.

    The number of keys is the whole-day difference between the two datetimes
    plus one.
    """
    one_day = datetime.timedelta(days=1)
    day_count = (end_date - start_date).days + 1
    buckets = {}
    for offset in range(day_count):
        buckets[(start_date + offset * one_day).strftime("%Y-%m-%d")] = 0
    return buckets
def resolve_tool_details(tool_ids):
    """
    Look up the given tools and summarize each one that exists.

    Args:
        tool_ids: Iterable of tool IDs; each is converted with ``ObjectId``

    Returns:
        List of dicts with ``id``, ``name`` and ``display_name``. The display
        name falls back to the tool's name when ``displayName`` is absent.
    """
    object_ids = [ObjectId(tid) for tid in tool_ids]
    cursor = user_tools_collection.find({"_id": {"$in": object_ids}})

    details = []
    for tool in cursor:
        tool_id = str(tool["_id"])
        details.append(
            {
                "id": tool_id,
                "name": tool.get("name", ""),
                "display_name": tool.get("displayName", tool.get("name", "")),
            }
        )
    return details


def get_vector_store(source_id):
    """
    Create the vector store for a source using the configured backend.

    Args:
        source_id (str): source id of the document

    Returns:
        Vector store instance built from ``settings.VECTOR_STORE`` and the
        ``EMBEDDINGS_KEY`` environment variable
    """
    embeddings_key = os.getenv("EMBEDDINGS_KEY")
    return VectorCreator.create_vectorstore(
        settings.VECTOR_STORE,
        source_id=source_id,
        embeddings_key=embeddings_key,
    )
def require_agent(func):
    """
    Decorator that resolves the ``webhook_token`` keyword argument to an agent.

    The wrapped function is only called when the token matches an agent; it
    then receives two extra keyword arguments, ``agent`` (the matched
    document, ``_id`` only) and ``agent_id_str``.

    Args:
        func: Function to decorate

    Returns:
        Wrapped function, which responds 400 when the token is missing and
        404 when no agent matches it
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        webhook_token = kwargs.get("webhook_token")
        if not webhook_token:
            return make_response(
                jsonify({"success": False, "message": "Webhook token missing"}), 400
            )

        agent = agents_collection.find_one(
            {"incoming_webhook_token": webhook_token}, {"_id": 1}
        )
        if agent:
            kwargs["agent"] = agent
            kwargs["agent_id_str"] = str(agent["_id"])
            return func(*args, **kwargs)

        current_app.logger.warning(
            f"Webhook attempt with invalid token: {webhook_token}"
        )
        return make_response(
            jsonify({"success": False, "message": "Agent not found"}), 404
        )

    return wrapper
current_app, jsonify, make_response, request from flask_restx import fields, Namespace, Resource from application.api import api from application.api.user.base import attachments_collection, conversations_collection from application.utils import check_required_fields conversations_ns = Namespace( "conversations", description="Conversation management operations", path="/api" ) @conversations_ns.route("/delete_conversation") class DeleteConversation(Resource): @api.doc( description="Deletes a conversation by ID", params={"id": "The ID of the conversation to delete"}, ) def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) conversation_id = request.args.get("id") if not conversation_id: return make_response( jsonify({"success": False, "message": "ID is required"}), 400 ) try: conversations_collection.delete_one( {"_id": ObjectId(conversation_id), "user": decoded_token["sub"]} ) except Exception as err: current_app.logger.error( f"Error deleting conversation: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"success": True}), 200) @conversations_ns.route("/delete_all_conversations") class DeleteAllConversations(Resource): @api.doc( description="Deletes all conversations for a specific user", ) def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user_id = decoded_token.get("sub") try: conversations_collection.delete_many({"user": user_id}) except Exception as err: current_app.logger.error( f"Error deleting all conversations: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"success": True}), 200) @conversations_ns.route("/get_conversations") class GetConversations(Resource): @api.doc( description="Retrieve a list of the latest 30 conversations (excluding API key conversations)", ) def get(self): 
decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        try:
            # The caller's conversations that either have no api_key field or
            # carry an agent_id, newest first by "date", capped at 30.
            conversations = (
                conversations_collection.find(
                    {
                        "$or": [
                            {"api_key": {"$exists": False}},
                            {"agent_id": {"$exists": True}},
                        ],
                        "user": decoded_token.get("sub"),
                    }
                )
                .sort("date", -1)
                .limit(30)
            )

            # Reduce each document to the summary fields returned to the client.
            # A document without "name" raises KeyError, handled below as a 400.
            list_conversations = [
                {
                    "id": str(conversation["_id"]),
                    "name": conversation["name"],
                    "agent_id": conversation.get("agent_id", None),
                    "is_shared_usage": conversation.get("is_shared_usage", False),
                    "shared_token": conversation.get("shared_token", None),
                }
                for conversation in conversations
            ]
        except Exception as err:
            current_app.logger.error(
                f"Error retrieving conversations: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify(list_conversations), 200)


# Returns one conversation's queries, with attachment ids expanded to
# {"id", "fileName"} entries.
@conversations_ns.route("/get_single_conversation")
class GetSingleConversation(Resource):
    @api.doc(
        description="Retrieve a single conversation by ID",
        params={"id": "The conversation ID"},
    )
    def get(self):
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        conversation_id = request.args.get("id")
        if not conversation_id:
            return make_response(
                jsonify({"success": False, "message": "ID is required"}), 400
            )
        try:
            # The filter includes the caller's user id, so another user's
            # conversation is reported as "not found".
            conversation = conversations_collection.find_one(
                {"_id": ObjectId(conversation_id), "user": decoded_token.get("sub")}
            )
            if not conversation:
                return make_response(jsonify({"status": "not found"}), 404)

            # Process queries to include attachment names
            queries = conversation["queries"]
            for query in queries:
                if "attachments" in query and query["attachments"]:
                    attachment_details = []
                    for attachment_id in query["attachments"]:
                        # One lookup per attachment; ids that cannot be found
                        # are left out of the expanded list.
                        try:
                            attachment = attachments_collection.find_one(
                                {"_id": ObjectId(attachment_id)}
                            )
                            if attachment:
                                attachment_details.append(
                                    {
                                        "id": str(attachment["_id"]),
                                        "fileName": attachment.get(
                                            "filename", "Unknown file"
                                        ),
                                    }
                                )
                        except Exception as e:
current_app.logger.error( f"Error retrieving attachment {attachment_id}: {e}", exc_info=True, ) query["attachments"] = attachment_details except Exception as err: current_app.logger.error( f"Error retrieving conversation: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) data = { "queries": queries, "agent_id": conversation.get("agent_id"), "is_shared_usage": conversation.get("is_shared_usage", False), "shared_token": conversation.get("shared_token", None), } return make_response(jsonify(data), 200) @conversations_ns.route("/update_conversation_name") class UpdateConversationName(Resource): @api.expect( api.model( "UpdateConversationModel", { "id": fields.String(required=True, description="Conversation ID"), "name": fields.String( required=True, description="New name of the conversation" ), }, ) ) @api.doc( description="Updates the name of a conversation", ) def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) data = request.get_json() required_fields = ["id", "name"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields try: conversations_collection.update_one( {"_id": ObjectId(data["id"]), "user": decoded_token.get("sub")}, {"$set": {"name": data["name"]}}, ) except Exception as err: current_app.logger.error( f"Error updating conversation name: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"success": True}), 200) @conversations_ns.route("/feedback") class SubmitFeedback(Resource): @api.expect( api.model( "FeedbackModel", { "question": fields.String( required=False, description="The user question" ), "answer": fields.String(required=False, description="The AI answer"), "feedback": fields.String(required=True, description="User feedback"), "question_index": fields.Integer( required=True, description="The question number in that particular 
conversation", ), "conversation_id": fields.String( required=True, description="id of the particular conversation" ), "api_key": fields.String(description="Optional API key"), }, ) ) @api.doc( description="Submit feedback for a conversation", ) def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) data = request.get_json() required_fields = ["feedback", "conversation_id", "question_index"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields try: if data["feedback"] is None: # Remove feedback and feedback_timestamp if feedback is null conversations_collection.update_one( { "_id": ObjectId(data["conversation_id"]), "user": decoded_token.get("sub"), f"queries.{data['question_index']}": {"$exists": True}, }, { "$unset": { f"queries.{data['question_index']}.feedback": "", f"queries.{data['question_index']}.feedback_timestamp": "", } }, ) else: # Set feedback and feedback_timestamp if feedback has a value conversations_collection.update_one( { "_id": ObjectId(data["conversation_id"]), "user": decoded_token.get("sub"), f"queries.{data['question_index']}": {"$exists": True}, }, { "$set": { f"queries.{data['question_index']}.feedback": data[ "feedback" ], f"queries.{data['question_index']}.feedback_timestamp": datetime.datetime.now( datetime.timezone.utc ), } }, ) except Exception as err: current_app.logger.error(f"Error submitting feedback: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"success": True}), 200) ================================================ FILE: application/api/user/models/__init__.py ================================================ from .routes import models_ns __all__ = ["models_ns"] ================================================ FILE: application/api/user/models/routes.py ================================================ from flask import current_app, jsonify, 
make_response from flask_restx import Namespace, Resource from application.core.model_settings import ModelRegistry models_ns = Namespace("models", description="Available models", path="/api") @models_ns.route("/models") class ModelsListResource(Resource): def get(self): """Get list of available models with their capabilities.""" try: registry = ModelRegistry.get_instance() models = registry.get_enabled_models() response = { "models": [model.to_dict() for model in models], "default_model_id": registry.default_model_id, "count": len(models), } except Exception as err: current_app.logger.error(f"Error fetching models: {err}", exc_info=True) return make_response(jsonify({"success": False}), 500) return make_response(jsonify(response), 200) ================================================ FILE: application/api/user/prompts/__init__.py ================================================ """Prompts module.""" from .routes import prompts_ns __all__ = ["prompts_ns"] ================================================ FILE: application/api/user/prompts/routes.py ================================================ """Prompt management routes.""" import os from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, request from flask_restx import fields, Namespace, Resource from application.api import api from application.api.user.base import current_dir, prompts_collection from application.utils import check_required_fields prompts_ns = Namespace( "prompts", description="Prompt management operations", path="/api" ) @prompts_ns.route("/create_prompt") class CreatePrompt(Resource): create_prompt_model = api.model( "CreatePromptModel", { "content": fields.String( required=True, description="Content of the prompt" ), "name": fields.String(required=True, description="Name of the prompt"), }, ) @api.expect(create_prompt_model) @api.doc(description="Create a new prompt") def post(self): decoded_token = request.decoded_token if not decoded_token: return 
make_response(jsonify({"success": False}), 401) data = request.get_json() required_fields = ["content", "name"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields user = decoded_token.get("sub") try: resp = prompts_collection.insert_one( { "name": data["name"], "content": data["content"], "user": user, } ) new_id = str(resp.inserted_id) except Exception as err: current_app.logger.error(f"Error creating prompt: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"id": new_id}), 200) @prompts_ns.route("/get_prompts") class GetPrompts(Resource): @api.doc(description="Get all prompts for the user") def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") try: prompts = prompts_collection.find({"user": user}) list_prompts = [ {"id": "default", "name": "default", "type": "public"}, {"id": "creative", "name": "creative", "type": "public"}, {"id": "strict", "name": "strict", "type": "public"}, ] for prompt in prompts: list_prompts.append( { "id": str(prompt["_id"]), "name": prompt["name"], "type": "private", } ) except Exception as err: current_app.logger.error(f"Error retrieving prompts: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) return make_response(jsonify(list_prompts), 200) @prompts_ns.route("/get_single_prompt") class GetSinglePrompt(Resource): @api.doc(params={"id": "ID of the prompt"}, description="Get a single prompt by ID") def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") prompt_id = request.args.get("id") if not prompt_id: return make_response( jsonify({"success": False, "message": "ID is required"}), 400 ) try: if prompt_id == "default": with open( os.path.join(current_dir, "prompts", 
"chat_combine_default.txt"), "r", ) as f: chat_combine_template = f.read() return make_response(jsonify({"content": chat_combine_template}), 200) elif prompt_id == "creative": with open( os.path.join(current_dir, "prompts", "chat_combine_creative.txt"), "r", ) as f: chat_reduce_creative = f.read() return make_response(jsonify({"content": chat_reduce_creative}), 200) elif prompt_id == "strict": with open( os.path.join(current_dir, "prompts", "chat_combine_strict.txt"), "r" ) as f: chat_reduce_strict = f.read() return make_response(jsonify({"content": chat_reduce_strict}), 200) prompt = prompts_collection.find_one( {"_id": ObjectId(prompt_id), "user": user} ) except Exception as err: current_app.logger.error(f"Error retrieving prompt: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"content": prompt["content"]}), 200) @prompts_ns.route("/delete_prompt") class DeletePrompt(Resource): delete_prompt_model = api.model( "DeletePromptModel", {"id": fields.String(required=True, description="Prompt ID to delete")}, ) @api.expect(delete_prompt_model) @api.doc(description="Delete a prompt by ID") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() required_fields = ["id"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields try: prompts_collection.delete_one({"_id": ObjectId(data["id"]), "user": user}) except Exception as err: current_app.logger.error(f"Error deleting prompt: {err}", exc_info=True) return make_response(jsonify({"success": False}), 400) return make_response(jsonify({"success": True}), 200) @prompts_ns.route("/update_prompt") class UpdatePrompt(Resource): update_prompt_model = api.model( "UpdatePromptModel", { "id": fields.String(required=True, description="Prompt ID to update"), "name": fields.String(required=True, 
description="New name of the prompt"),
            "content": fields.String(
                required=True, description="New content of the prompt"
            ),
        },
    )

    @api.expect(update_prompt_model)
    @api.doc(description="Update an existing prompt")
    def post(self):
        # Overwrites name and content of one prompt owned by the caller.
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()
        required_fields = ["id", "name", "content"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields:
            return missing_fields
        try:
            # The filter includes the user id, so another user's prompt is
            # never matched. The update result is not inspected here.
            prompts_collection.update_one(
                {"_id": ObjectId(data["id"]), "user": user},
                {"$set": {"name": data["name"], "content": data["content"]}},
            )
        except Exception as err:
            current_app.logger.error(f"Error updating prompt: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True}), 200)


================================================
FILE: application/api/user/routes.py
================================================
"""
Main user API routes - registers all namespace modules.
"""

from flask import Blueprint

from application.api import api

from .agents import agents_ns, agents_sharing_ns, agents_webhooks_ns, agents_folders_ns
from .analytics import analytics_ns
from .attachments import attachments_ns
from .conversations import conversations_ns
from .models import models_ns
from .prompts import prompts_ns
from .sharing import sharing_ns
from .sources import sources_chunks_ns, sources_ns, sources_upload_ns
from .tools import tools_mcp_ns, tools_ns
from .workflows import workflows_ns

user = Blueprint("user", __name__)

# Every namespace below is attached to the shared `api` object when this
# module is imported.

# Analytics
api.add_namespace(analytics_ns)

# Attachments
api.add_namespace(attachments_ns)

# Conversations
api.add_namespace(conversations_ns)

# Models
api.add_namespace(models_ns)

# Agents (main, sharing, webhooks, folders)
api.add_namespace(agents_ns)
api.add_namespace(agents_sharing_ns)
api.add_namespace(agents_webhooks_ns)
api.add_namespace(agents_folders_ns)

# Prompts
api.add_namespace(prompts_ns)

# Sharing
api.add_namespace(sharing_ns)

# Sources (main, chunks, upload)
api.add_namespace(sources_ns)
api.add_namespace(sources_chunks_ns)
api.add_namespace(sources_upload_ns)

# Tools (main, MCP)
api.add_namespace(tools_ns)
api.add_namespace(tools_mcp_ns)

# Workflows
api.add_namespace(workflows_ns)


================================================
FILE: application/api/user/sharing/__init__.py
================================================
"""Sharing module."""

from .routes import sharing_ns

__all__ = ["sharing_ns"]


================================================
FILE: application/api/user/sharing/routes.py
================================================
"""Conversation sharing routes."""

import uuid

from bson.binary import Binary, UuidRepresentation
from bson.dbref import DBRef
from bson.objectid import ObjectId
from flask import current_app, jsonify, make_response, request
from flask_restx import fields, inputs, Namespace, Resource

from application.api import api
from application.api.user.base import (
    agents_collection,
attachments_collection, conversations_collection, shared_conversations_collections, ) from application.utils import check_required_fields sharing_ns = Namespace( "sharing", description="Conversation sharing operations", path="/api" ) @sharing_ns.route("/share") class ShareConversation(Resource): share_conversation_model = api.model( "ShareConversationModel", { "conversation_id": fields.String( required=True, description="Conversation ID" ), "user": fields.String(description="User ID (optional)"), "prompt_id": fields.String(description="Prompt ID (optional)"), "chunks": fields.Integer(description="Chunks count (optional)"), }, ) @api.expect(share_conversation_model) @api.doc(description="Share a conversation") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() required_fields = ["conversation_id"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields is_promptable = request.args.get("isPromptable", type=inputs.boolean) if is_promptable is None: return make_response( jsonify({"success": False, "message": "isPromptable is required"}), 400 ) conversation_id = data["conversation_id"] try: conversation = conversations_collection.find_one( {"_id": ObjectId(conversation_id)} ) if conversation is None: return make_response( jsonify( { "status": "error", "message": "Conversation does not exist", } ), 404, ) current_n_queries = len(conversation["queries"]) explicit_binary = Binary.from_uuid( uuid.uuid4(), UuidRepresentation.STANDARD ) if is_promptable: prompt_id = data.get("prompt_id", "default") chunks = data.get("chunks", "2") name = conversation["name"] + "(shared)" new_api_key_data = { "prompt_id": prompt_id, "chunks": chunks, "user": user, } if "source" in data and ObjectId.is_valid(data["source"]): new_api_key_data["source"] = DBRef( "sources", ObjectId(data["source"]) ) if "retriever" 
in data: new_api_key_data["retriever"] = data["retriever"] pre_existing_api_document = agents_collection.find_one(new_api_key_data) if pre_existing_api_document: api_uuid = pre_existing_api_document["key"] pre_existing = shared_conversations_collections.find_one( { "conversation_id": ObjectId(conversation_id), "isPromptable": is_promptable, "first_n_queries": current_n_queries, "user": user, "api_key": api_uuid, } ) if pre_existing is not None: return make_response( jsonify( { "success": True, "identifier": str(pre_existing["uuid"].as_uuid()), } ), 200, ) else: shared_conversations_collections.insert_one( { "uuid": explicit_binary, "conversation_id": ObjectId(conversation_id), "isPromptable": is_promptable, "first_n_queries": current_n_queries, "user": user, "api_key": api_uuid, } ) return make_response( jsonify( { "success": True, "identifier": str(explicit_binary.as_uuid()), } ), 201, ) else: api_uuid = str(uuid.uuid4()) new_api_key_data["key"] = api_uuid new_api_key_data["name"] = name if "source" in data and ObjectId.is_valid(data["source"]): new_api_key_data["source"] = DBRef( "sources", ObjectId(data["source"]) ) if "retriever" in data: new_api_key_data["retriever"] = data["retriever"] agents_collection.insert_one(new_api_key_data) shared_conversations_collections.insert_one( { "uuid": explicit_binary, "conversation_id": ObjectId(conversation_id), "isPromptable": is_promptable, "first_n_queries": current_n_queries, "user": user, "api_key": api_uuid, } ) return make_response( jsonify( { "success": True, "identifier": str(explicit_binary.as_uuid()), } ), 201, ) pre_existing = shared_conversations_collections.find_one( { "conversation_id": ObjectId(conversation_id), "isPromptable": is_promptable, "first_n_queries": current_n_queries, "user": user, } ) if pre_existing is not None: return make_response( jsonify( { "success": True, "identifier": str(pre_existing["uuid"].as_uuid()), } ), 200, ) else: shared_conversations_collections.insert_one( { "uuid": 
explicit_binary, "conversation_id": ObjectId(conversation_id), "isPromptable": is_promptable, "first_n_queries": current_n_queries, "user": user, } ) return make_response( jsonify( {"success": True, "identifier": str(explicit_binary.as_uuid())} ), 201, ) except Exception as err: current_app.logger.error( f"Error sharing conversation: {err}", exc_info=True ) return make_response(jsonify({"success": False}), 400) @sharing_ns.route("/shared_conversation/") class GetPubliclySharedConversations(Resource): @api.doc(description="Get publicly shared conversations by identifier") def get(self, identifier: str): try: query_uuid = Binary.from_uuid( uuid.UUID(identifier), UuidRepresentation.STANDARD ) shared = shared_conversations_collections.find_one({"uuid": query_uuid}) conversation_queries = [] if ( shared and "conversation_id" in shared ): # Handle DBRef (legacy), ObjectId, dict, and string formats for conversation_id conversation_id = shared["conversation_id"] if isinstance(conversation_id, DBRef): conversation_id = conversation_id.id elif isinstance(conversation_id, dict): # Handle dict representation of DBRef (e.g., {"$ref": "...", "$id": "..."}) if "$id" in conversation_id: conv_id = conversation_id["$id"] # $id might be a dict like {"$oid": "..."} or a string if isinstance(conv_id, dict) and "$oid" in conv_id: conversation_id = ObjectId(conv_id["$oid"]) else: conversation_id = ObjectId(conv_id) elif "_id" in conversation_id: conversation_id = ObjectId(conversation_id["_id"]) elif isinstance(conversation_id, str): conversation_id = ObjectId(conversation_id) conversation = conversations_collection.find_one( {"_id": conversation_id} ) if conversation is None: return make_response( jsonify( { "success": False, "error": "might have broken url or the conversation does not exist", } ), 404, ) conversation_queries = conversation["queries"][ : (shared["first_n_queries"]) ] for query in conversation_queries: if "attachments" in query and query["attachments"]: 
attachment_details = []
                        for attachment_id in query["attachments"]:
                            # One lookup per attachment; ids that cannot be
                            # found are left out, and lookup errors are logged
                            # without aborting the request.
                            try:
                                attachment = attachments_collection.find_one(
                                    {"_id": ObjectId(attachment_id)}
                                )
                                if attachment:
                                    attachment_details.append(
                                        {
                                            "id": str(attachment["_id"]),
                                            "fileName": attachment.get(
                                                "filename", "Unknown file"
                                            ),
                                        }
                                    )
                            except Exception as e:
                                current_app.logger.error(
                                    f"Error retrieving attachment {attachment_id}: {e}",
                                    exc_info=True,
                                )
                        # Replace the raw ids with the expanded entries.
                        query["attachments"] = attachment_details
            else:
                # No share record for this identifier, or it has no conversation_id.
                return make_response(
                    jsonify(
                        {
                            "success": False,
                            "error": "might have broken url or the conversation does not exist",
                        }
                    ),
                    404,
                )
            # The timestamp is taken from the creation time embedded in the
            # conversation's ObjectId.
            date = conversation["_id"].generation_time.isoformat()
            res = {
                "success": True,
                "queries": conversation_queries,
                "title": conversation["name"],
                "timestamp": date,
            }
            # The agent key is only returned for promptable shares that stored one.
            if shared["isPromptable"] and "api_key" in shared:
                res["api_key"] = shared["api_key"]
            return make_response(jsonify(res), 200)
        except Exception as err:
            current_app.logger.error(
                f"Error getting shared conversation: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)


================================================
FILE: application/api/user/sources/__init__.py
================================================
"""Sources module."""

from .chunks import sources_chunks_ns
from .routes import sources_ns
from .upload import sources_upload_ns

__all__ = ["sources_ns", "sources_chunks_ns", "sources_upload_ns"]


================================================
FILE: application/api/user/sources/chunks.py
================================================
"""Source document management chunk management."""

from bson.objectid import ObjectId
from flask import current_app, jsonify, make_response, request
from flask_restx import fields, Namespace, Resource

from application.api import api
from application.api.user.base import get_vector_store, sources_collection
from application.utils import check_required_fields, num_tokens_from_string

sources_chunks_ns = Namespace(
    "sources", description="Source document management
operations", path="/api" ) @sources_chunks_ns.route("/get_chunks") class GetChunks(Resource): @api.doc( description="Retrieves chunks from a document, optionally filtered by file path and search term", params={ "id": "The document ID", "page": "Page number for pagination", "per_page": "Number of chunks per page", "path": "Optional: Filter chunks by relative file path", "search": "Optional: Search term to filter chunks by title or content", }, ) def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") doc_id = request.args.get("id") page = int(request.args.get("page", 1)) per_page = int(request.args.get("per_page", 10)) path = request.args.get("path") search_term = request.args.get("search", "").strip().lower() if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) if not doc: return make_response( jsonify({"error": "Document not found or access denied"}), 404 ) try: store = get_vector_store(doc_id) chunks = store.get_chunks() filtered_chunks = [] for chunk in chunks: metadata = chunk.get("metadata", {}) # Filter by path if provided if path: chunk_source = metadata.get("source", "") chunk_file_path = metadata.get("file_path", "") # Check if the chunk matches the requested path # For file uploads: source ends with path (e.g., "inputs/.../file.pdf" ends with "file.pdf") # For crawlers: file_path ends with path (e.g., "guides/setup.md" ends with "setup.md") source_match = chunk_source and chunk_source.endswith(path) file_path_match = chunk_file_path and chunk_file_path.endswith(path) if not (source_match or file_path_match): continue # Filter by search term if provided if search_term: text_match = search_term in chunk.get("text", "").lower() title_match = search_term in metadata.get("title", "").lower() if not (text_match or title_match): continue 
filtered_chunks.append(chunk) chunks = filtered_chunks total_chunks = len(chunks) start = (page - 1) * per_page end = start + per_page paginated_chunks = chunks[start:end] return make_response( jsonify( { "page": page, "per_page": per_page, "total": total_chunks, "chunks": paginated_chunks, "path": path if path else None, "search": search_term if search_term else None, } ), 200, ) except Exception as e: current_app.logger.error(f"Error getting chunks: {e}", exc_info=True) return make_response(jsonify({"success": False}), 500) @sources_chunks_ns.route("/add_chunk") class AddChunk(Resource): @api.expect( api.model( "AddChunkModel", { "id": fields.String(required=True, description="Document ID"), "text": fields.String(required=True, description="Text of the chunk"), "metadata": fields.Raw( required=False, description="Metadata associated with the chunk", ), }, ) ) @api.doc( description="Adds a new chunk to the document", ) def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() required_fields = ["id", "text"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields doc_id = data.get("id") text = data.get("text") metadata = data.get("metadata", {}) token_count = num_tokens_from_string(text) metadata["token_count"] = token_count if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) if not doc: return make_response( jsonify({"error": "Document not found or access denied"}), 404 ) try: store = get_vector_store(doc_id) chunk_id = store.add_chunk(text, metadata) return make_response( jsonify({"message": "Chunk added successfully", "chunk_id": chunk_id}), 201, ) except Exception as e: current_app.logger.error(f"Error adding chunk: {e}", exc_info=True) return 
make_response(jsonify({"success": False}), 500) @sources_chunks_ns.route("/delete_chunk") class DeleteChunk(Resource): @api.doc( description="Deletes a specific chunk from the document.", params={"id": "The document ID", "chunk_id": "The ID of the chunk to delete"}, ) def delete(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") doc_id = request.args.get("id") chunk_id = request.args.get("chunk_id") if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) if not doc: return make_response( jsonify({"error": "Document not found or access denied"}), 404 ) try: store = get_vector_store(doc_id) deleted = store.delete_chunk(chunk_id) if deleted: return make_response( jsonify({"message": "Chunk deleted successfully"}), 200 ) else: return make_response( jsonify({"message": "Chunk not found or could not be deleted"}), 404, ) except Exception as e: current_app.logger.error(f"Error deleting chunk: {e}", exc_info=True) return make_response(jsonify({"success": False}), 500) @sources_chunks_ns.route("/update_chunk") class UpdateChunk(Resource): @api.expect( api.model( "UpdateChunkModel", { "id": fields.String(required=True, description="Document ID"), "chunk_id": fields.String( required=True, description="Chunk ID to update" ), "text": fields.String( required=False, description="New text of the chunk" ), "metadata": fields.Raw( required=False, description="Updated metadata associated with the chunk", ), }, ) ) @api.doc( description="Updates an existing chunk in the document.", ) def put(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() required_fields = ["id", "chunk_id"] missing_fields = check_required_fields(data, required_fields) if 
missing_fields: return missing_fields doc_id = data.get("id") chunk_id = data.get("chunk_id") text = data.get("text") metadata = data.get("metadata") if text is not None: token_count = num_tokens_from_string(text) if metadata is None: metadata = {} metadata["token_count"] = token_count if not ObjectId.is_valid(doc_id): return make_response(jsonify({"error": "Invalid doc_id"}), 400) doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user}) if not doc: return make_response( jsonify({"error": "Document not found or access denied"}), 404 ) try: store = get_vector_store(doc_id) chunks = store.get_chunks() existing_chunk = next((c for c in chunks if c["doc_id"] == chunk_id), None) if not existing_chunk: return make_response(jsonify({"error": "Chunk not found"}), 404) new_text = text if text is not None else existing_chunk["text"] if metadata is not None: new_metadata = existing_chunk["metadata"].copy() new_metadata.update(metadata) else: new_metadata = existing_chunk["metadata"].copy() if text is not None: new_metadata["token_count"] = num_tokens_from_string(new_text) try: new_chunk_id = store.add_chunk(new_text, new_metadata) deleted = store.delete_chunk(chunk_id) if not deleted: current_app.logger.warning( f"Failed to delete old chunk {chunk_id}, but new chunk {new_chunk_id} was created" ) return make_response( jsonify( { "message": "Chunk updated successfully", "chunk_id": new_chunk_id, "original_chunk_id": chunk_id, } ), 200, ) except Exception as add_error: current_app.logger.error(f"Failed to add updated chunk: {add_error}") return make_response( jsonify({"error": "Failed to update chunk - addition failed"}), 500 ) except Exception as e: current_app.logger.error(f"Error updating chunk: {e}", exc_info=True) return make_response(jsonify({"success": False}), 500) ================================================ FILE: application/api/user/sources/routes.py ================================================ """Source document management routes.""" import 
json import math from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, redirect, request from flask_restx import fields, Namespace, Resource from application.api import api from application.api.user.base import sources_collection from application.api.user.tasks import sync_source from application.core.settings import settings from application.storage.storage_creator import StorageCreator from application.utils import check_required_fields from application.vectorstore.vector_creator import VectorCreator sources_ns = Namespace( "sources", description="Source document management operations", path="/api" ) def _get_provider_from_remote_data(remote_data): if not remote_data: return None if isinstance(remote_data, dict): return remote_data.get("provider") if isinstance(remote_data, str): try: remote_data_obj = json.loads(remote_data) except Exception: return None if isinstance(remote_data_obj, dict): return remote_data_obj.get("provider") return None @sources_ns.route("/sources") class CombinedJson(Resource): @api.doc(description="Provide JSON file with combined available indexes") def get(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = [ { "name": "Default", "date": "default", "model": settings.EMBEDDINGS_NAME, "location": "remote", "tokens": "", "retriever": "classic", } ] try: for index in sources_collection.find({"user": user}).sort("date", -1): provider = _get_provider_from_remote_data(index.get("remote_data")) data.append( { "id": str(index["_id"]), "name": index.get("name"), "date": index.get("date"), "model": settings.EMBEDDINGS_NAME, "location": "local", "tokens": index.get("tokens", ""), "retriever": index.get("retriever", "classic"), "syncFrequency": index.get("sync_frequency", ""), "provider": provider, "is_nested": bool(index.get("directory_structure")), "type": index.get( "type", "file" ), # Add type field with 
@sources_ns.route("/sources/paginated")
class PaginatedSources(Resource):
    """Paginated, sortable and searchable listing of the caller's sources."""

    @api.doc(description="Get document with pagination, sorting and filtering")
    def get(self):
        """Return one page of the authenticated user's source documents.

        Query parameters:
            sort: Field to sort by (default ``date``).
            order: ``asc`` sorts ascending; any other value sorts descending.
            page: 1-based page number (default 1), clamped to the valid range.
            rows: Page size (default 10); values below 1 are raised to 1.
            search: Optional text matched case-insensitively, as a literal
                substring, against the source name.

        Returns:
            200 with ``total``, ``totalPages``, ``currentPage`` and
            ``paginated``; 400 when ``page``/``rows`` are not integers or the
            database query fails; 401 when no decoded token is present.
        """
        # Local import: only needed to escape the user-supplied search term.
        import re

        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")

        sort_field = request.args.get("sort", "date")  # Default to 'date'
        sort_order = request.args.get("order", "desc")  # Default to 'desc'

        # Malformed numbers are a client error, not an unhandled ValueError.
        try:
            page = int(request.args.get("page", 1))  # Default to 1
            rows_per_page = int(request.args.get("rows", 10))  # Default to 10
        except (TypeError, ValueError):
            return make_response(
                jsonify(
                    {"success": False, "message": "Invalid pagination parameters"}
                ),
                400,
            )
        # A page size below 1 would divide by zero when computing total_pages.
        rows_per_page = max(1, rows_per_page)

        search_term = request.args.get("search", "").strip()

        # Prepare query for filtering
        query = {"user": user}
        if search_term:
            query["name"] = {
                # Escape so the term is matched literally; raw user input in
                # $regex allows pattern injection and pathological patterns.
                "$regex": re.escape(search_term),
                "$options": "i",  # case-insensitive search
            }

        sort_direction = 1 if sort_order == "asc" else -1

        try:
            # Counting is inside the try so a database failure is logged and
            # reported like any other query error.
            total_documents = sources_collection.count_documents(query)
            total_pages = max(1, math.ceil(total_documents / rows_per_page))
            # Keep the requested page within [1, total_pages].
            page = min(max(1, page), total_pages)
            skip = (page - 1) * rows_per_page

            documents = (
                sources_collection.find(query)
                .sort(sort_field, sort_direction)
                .skip(skip)
                .limit(rows_per_page)
            )

            paginated_docs = []
            for doc in documents:
                provider = _get_provider_from_remote_data(doc.get("remote_data"))
                doc_data = {
                    "id": str(doc["_id"]),
                    "name": doc.get("name", ""),
                    "date": doc.get("date", ""),
                    "model": settings.EMBEDDINGS_NAME,
                    "location": "local",
                    "tokens": doc.get("tokens", ""),
                    "retriever": doc.get("retriever", "classic"),
                    "syncFrequency": doc.get("sync_frequency", ""),
                    "provider": provider,
                    "isNested": bool(doc.get("directory_structure")),
                    "type": doc.get("type", "file"),
                }
                paginated_docs.append(doc_data)
            response = {
                "total": total_documents,
                "totalPages": total_pages,
                "currentPage": page,
                "paginated": paginated_docs,
            }
            return make_response(jsonify(response), 200)
        except Exception as err:
            current_app.logger.error(
                f"Error retrieving paginated sources: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
@sources_ns.route("/manage_sync")
class ManageSync(Resource):
    """Endpoint for changing the sync frequency of one of the caller's sources."""

    manage_sync_model = api.model(
        "ManageSyncModel",
        {
            "source_id": fields.String(required=True, description="Source ID"),
            "sync_frequency": fields.String(
                required=True,
                description="Sync frequency (never, daily, weekly, monthly)",
            ),
        },
    )

    @api.expect(manage_sync_model)
    @api.doc(description="Manage sync frequency for sources")
    def post(self):
        """Set ``sync_frequency`` on a source owned by the authenticated user.

        Expects a JSON body with ``source_id`` and ``sync_frequency`` (one of
        ``never``, ``daily``, ``weekly``, ``monthly``).

        Returns:
            200 on success; 400 for missing fields, an invalid frequency, an
            invalid source ID or a database error; 401 without a decoded
            token; 404 when no source with that ID belongs to the user.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json() or {}
        required_fields = ["source_id", "sync_frequency"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields:
            return missing_fields
        source_id = data["source_id"]
        sync_frequency = data["sync_frequency"]

        if sync_frequency not in ["never", "daily", "weekly", "monthly"]:
            return make_response(
                jsonify({"success": False, "message": "Invalid frequency"}), 400
            )
        # Reject malformed IDs up front so a client mistake is not logged as a
        # server-side error with a stack trace.
        if not ObjectId.is_valid(source_id):
            return make_response(
                jsonify({"success": False, "message": "Invalid source ID"}), 400
            )
        update_data = {"$set": {"sync_frequency": sync_frequency}}
        try:
            result = sources_collection.update_one(
                {
                    "_id": ObjectId(source_id),
                    "user": user,
                },
                update_data,
            )
        except Exception as err:
            current_app.logger.error(
                f"Error updating sync frequency: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        # No match means the source does not exist or belongs to another user;
        # reporting success here would be misleading.
        if result.matched_count == 0:
            return make_response(
                jsonify({"success": False, "message": "Source not found"}), 404
            )
        return make_response(jsonify({"success": True}), 200)
@sources_ns.route("/directory_structure")
class DirectoryStructure(Resource):
    """Endpoint exposing the stored directory tree of a source document."""

    @api.doc(
        description="Get the directory structure for a document",
        params={"id": "The document ID"},
    )
    def get(self):
        """Return the directory structure, base path and provider of a source.

        The document is looked up by the ``id`` query parameter and must
        belong to the authenticated user.

        Returns:
            200 with ``directory_structure``, ``base_path`` and ``provider``;
            400 for a missing or invalid ID; 401 without a decoded token;
            404 when the document is not found for this user; 500 on any
            other failure.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        doc_id = request.args.get("id")

        if not doc_id:
            return make_response(jsonify({"error": "Document ID is required"}), 400)
        if not ObjectId.is_valid(doc_id):
            return make_response(jsonify({"error": "Invalid document ID"}), 400)
        try:
            doc = sources_collection.find_one({"_id": ObjectId(doc_id), "user": user})
            if not doc:
                return make_response(
                    jsonify({"error": "Document not found or access denied"}), 404
                )
            directory_structure = doc.get("directory_structure", {})
            base_path = doc.get("file_path", "")

            # Use the shared helper so this endpoint reports the same provider
            # as the source listings: it accepts both dict and JSON-string
            # remote_data, and quietly returns None for values that are not
            # JSON instead of logging a warning on every request.
            provider = _get_provider_from_remote_data(doc.get("remote_data"))

            return make_response(
                jsonify(
                    {
                        "success": True,
                        "directory_structure": directory_structure,
                        "base_path": base_path,
                        "provider": provider,
                    }
                ),
                200,
            )
        except Exception as e:
            current_app.logger.error(
                f"Error retrieving directory structure: {e}", exc_info=True
            )
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "error": "Failed to retrieve directory structure",
                    }
                ),
                500,
            )
@sources_upload_ns.route("/upload")
class UploadFile(Resource):
    """Endpoint that stores uploaded files and queues them for ingestion."""

    @api.expect(
        api.model(
            "UploadModel",
            {
                "user": fields.String(required=True, description="User ID"),
                "name": fields.String(required=True, description="Job name"),
                "file": fields.Raw(required=True, description="File(s) to upload"),
            },
        )
    )
    @api.doc(
        description="Uploads a file to be vectorized and indexed",
    )
    def post(self):
        """Save the uploaded file(s) to storage and start an ingest task.

        The form must contain ``user`` and ``name`` plus at least one ``file``
        part with a filename. The owning user is taken from the decoded token,
        not from the form. Real ``.zip`` archives are extracted and their
        members stored individually; Office/EPUB containers are stored as-is.

        Returns:
            200 with ``task_id`` once the ingest task is queued; 400 for
            missing fields/files or any other failure; 401 without a decoded
            token; 413 when an audio file, uploaded directly or contained in a
            zip, exceeds the size limit.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        data = request.form
        files = request.files.getlist("file")
        required_fields = ["user", "name"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields or not files or all(file.filename == "" for file in files):
            return make_response(
                jsonify(
                    {
                        "status": "error",
                        "message": "Missing required fields or files",
                    }
                ),
                400,
            )
        user = decoded_token.get("sub")
        job_name = request.form["name"]

        # Create safe versions for filesystem operations
        safe_user = safe_filename(user)
        dir_name = safe_filename(job_name)
        base_path = f"{settings.UPLOAD_FOLDER}/{safe_user}/{dir_name}"
        file_name_map = {}

        try:
            storage = StorageCreator.get_storage()

            for file in files:
                # Parts without a filename carry no upload; skip them instead
                # of trying to save to the temp directory path itself.
                if not file.filename:
                    continue
                original_filename = os.path.basename(file.filename)
                safe_file = safe_filename(original_filename)
                if original_filename:
                    file_name_map[safe_file] = original_filename

                with tempfile.TemporaryDirectory() as temp_dir:
                    temp_file_path = os.path.join(temp_dir, safe_file)
                    file.save(temp_file_path)
                    _enforce_audio_path_size_limit(temp_file_path, safe_file)

                    # Only extract actual .zip files, not Office formats (.docx, .xlsx, .pptx)
                    # which are technically zip archives but should be processed as-is
                    is_office_format = safe_file.lower().endswith(
                        (".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp", ".epub")
                    )
                    if zipfile.is_zipfile(temp_file_path) and not is_office_format:
                        try:
                            with zipfile.ZipFile(temp_file_path, "r") as zip_ref:
                                zip_ref.extractall(path=temp_dir)

                                # Walk through extracted files and upload them.
                                # The loop variable must not be called `files`:
                                # that would rebind the list of uploads.
                                for root, _, extracted_names in os.walk(temp_dir):
                                    for extracted_file in extracted_names:
                                        extracted_path = os.path.join(
                                            root, extracted_file
                                        )
                                        # The archive itself lives in temp_dir too.
                                        if extracted_path == temp_file_path:
                                            continue
                                        rel_path = os.path.relpath(
                                            extracted_path, temp_dir
                                        )
                                        storage_path = f"{base_path}/{rel_path}"
                                        _enforce_audio_path_size_limit(
                                            extracted_path,
                                            extracted_file,
                                        )

                                        with open(extracted_path, "rb") as f:
                                            storage.save_file(f, storage_path)
                        except AudioFileTooLargeError:
                            # Must reach the outer handler to produce a 413;
                            # the generic handler below would otherwise
                            # swallow it and store the whole archive.
                            raise
                        except Exception as e:
                            current_app.logger.error(
                                f"Error extracting zip: {e}", exc_info=True
                            )
                            # If zip extraction fails, save the original zip file
                            file_path = f"{base_path}/{safe_file}"
                            with open(temp_file_path, "rb") as f:
                                storage.save_file(f, file_path)
                    else:
                        # For non-zip files, save directly
                        file_path = f"{base_path}/{safe_file}"
                        with open(temp_file_path, "rb") as f:
                            storage.save_file(f, file_path)
            task = ingest.delay(
                settings.UPLOAD_FOLDER,
                list(SUPPORTED_SOURCE_EXTENSIONS),
                job_name,
                user,
                file_path=base_path,
                filename=dir_name,
                file_name_map=file_name_map,
            )
        except AudioFileTooLargeError:
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": build_stt_file_size_limit_message(),
                    }
                ),
                413,
            )
        except Exception as err:
            current_app.logger.error(f"Error uploading file: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True, "task_id": task.id}), 200)
@sources_upload_ns.route("/manage_source_files")
class ManageSourceFiles(Resource):
    """Endpoint for adding or removing files and directories of a source."""

    @staticmethod
    def _is_safe_relative_path(path):
        """Return True when ``path`` is a non-empty relative path without ``..`` segments."""
        if not isinstance(path, str) or not path:
            return False
        # Treat backslashes as separators too so "..\\x" cannot slip through.
        normalized = path.replace("\\", "/")
        if normalized.startswith("/"):
            return False
        return ".." not in normalized.split("/")

    @api.expect(
        api.model(
            "ManageSourceFilesModel",
            {
                "source_id": fields.String(
                    required=True, description="Source ID to modify"
                ),
                "operation": fields.String(
                    required=True,
                    description="Operation: 'add', 'remove', or 'remove_directory'",
                ),
                "file_paths": fields.List(
                    fields.String,
                    required=False,
                    description="File paths to remove (for remove operation)",
                ),
                "directory_path": fields.String(
                    required=False,
                    description="Directory path to remove (for remove_directory operation)",
                ),
                "file": fields.Raw(
                    required=False, description="Files to add (for add operation)"
                ),
                "parent_dir": fields.String(
                    required=False,
                    description="Parent directory path relative to source root",
                ),
            },
        )
    )
    @api.doc(
        description="Add files, remove files, or remove directories from an existing source",
    )
    def post(self):
        """Apply an ``add``, ``remove`` or ``remove_directory`` operation to a source.

        Form fields:
            source_id: ID of a source owned by the authenticated user.
            operation: ``add``, ``remove`` or ``remove_directory``.
            file / parent_dir: Files to add and an optional relative target
                directory (``add``).
            file_paths: JSON list of paths relative to the source root
                (``remove``).
            directory_path: Relative directory to delete
                (``remove_directory``).

        Every successful operation updates the stored ``file_name_map`` when
        needed and queues a re-ingestion task.

        Returns:
            200 with operation details and ``reingest_task_id``; 400 for
            invalid input (including paths that are absolute or contain
            ``..``); 401 without a decoded token; 404 when the source or
            directory is not found; 500 on storage/database failures.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(
                jsonify({"success": False, "message": "Unauthorized"}), 401
            )
        user = decoded_token.get("sub")
        source_id = request.form.get("source_id")
        operation = request.form.get("operation")

        if not source_id or not operation:
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": "source_id and operation are required",
                    }
                ),
                400,
            )
        if operation not in ["add", "remove", "remove_directory"]:
            return make_response(
                jsonify(
                    {
                        "success": False,
                        "message": "operation must be 'add', 'remove', or 'remove_directory'",
                    }
                ),
                400,
            )
        try:
            ObjectId(source_id)
        except Exception:
            return make_response(
                jsonify({"success": False, "message": "Invalid source ID format"}), 400
            )
        try:
            source = sources_collection.find_one(
                {"_id": ObjectId(source_id), "user": user}
            )
            if not source:
                return make_response(
                    jsonify(
                        {
                            "success": False,
                            "message": "Source not found or access denied",
                        }
                    ),
                    404,
                )
        except Exception as err:
            current_app.logger.error(f"Error finding source: {err}", exc_info=True)
            return make_response(
                jsonify({"success": False, "message": "Database error"}), 500
            )
        try:
            storage = StorageCreator.get_storage()
            source_file_path = source.get("file_path", "")
            parent_dir = request.form.get("parent_dir", "")

            # file_name_map may be stored as a dict or as a JSON string;
            # anything else is treated as empty.
            file_name_map = source.get("file_name_map") or {}
            if isinstance(file_name_map, str):
                try:
                    file_name_map = json.loads(file_name_map)
                except Exception:
                    file_name_map = {}
            if not isinstance(file_name_map, dict):
                file_name_map = {}

            if parent_dir and (parent_dir.startswith("/") or ".." in parent_dir):
                return make_response(
                    jsonify(
                        {"success": False, "message": "Invalid parent directory path"}
                    ),
                    400,
                )

            if operation == "add":
                files = request.files.getlist("file")
                if not files or all(file.filename == "" for file in files):
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "message": "No files provided for add operation",
                            }
                        ),
                        400,
                    )
                added_files = []
                map_updated = False

                target_dir = source_file_path
                if parent_dir:
                    target_dir = f"{source_file_path}/{parent_dir}"
                for file in files:
                    if file.filename:
                        original_filename = os.path.basename(file.filename)
                        safe_filename_str = safe_filename(original_filename)
                        file_path = f"{target_dir}/{safe_filename_str}"

                        # Save file to storage
                        storage.save_file(file, file_path)
                        added_files.append(safe_filename_str)
                        if original_filename:
                            # Map keys are paths relative to the source root.
                            relative_key = (
                                f"{parent_dir}/{safe_filename_str}"
                                if parent_dir
                                else safe_filename_str
                            )
                            file_name_map[relative_key] = original_filename
                            map_updated = True

                if map_updated:
                    sources_collection.update_one(
                        {"_id": ObjectId(source_id)},
                        {"$set": {"file_name_map": file_name_map}},
                    )
                # Trigger re-ingestion pipeline
                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(source_id=source_id, user=user)

                return make_response(
                    jsonify(
                        {
                            "success": True,
                            "message": f"Added {len(added_files)} files",
                            "added_files": added_files,
                            "parent_dir": parent_dir,
                            "reingest_task_id": task.id,
                        }
                    ),
                    200,
                )
            elif operation == "remove":
                file_paths_str = request.form.get("file_paths")
                if not file_paths_str:
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "message": "file_paths required for remove operation",
                            }
                        ),
                        400,
                    )
                try:
                    file_paths = (
                        json.loads(file_paths_str)
                        if isinstance(file_paths_str, str)
                        else file_paths_str
                    )
                except Exception:
                    return make_response(
                        jsonify(
                            {"success": False, "message": "Invalid file_paths format"}
                        ),
                        400,
                    )
                # A JSON string or object would otherwise be iterated
                # character by character / key by key.
                if not isinstance(file_paths, list):
                    return make_response(
                        jsonify(
                            {"success": False, "message": "Invalid file_paths format"}
                        ),
                        400,
                    )
                # Validate every path (prevent path traversal): entries are
                # joined onto the source root and then deleted, so an absolute
                # path or a ".." segment could delete files outside the source.
                if not all(self._is_safe_relative_path(p) for p in file_paths):
                    current_app.logger.warning(
                        f"Invalid file path attempted for removal. "
                        f"User: {user}, Source ID: {source_id}, File paths: {file_paths}"
                    )
                    return make_response(
                        jsonify({"success": False, "message": "Invalid file path"}),
                        400,
                    )
                # Remove files from storage and directory structure
                removed_files = []
                map_updated = False
                for file_path in file_paths:
                    full_path = f"{source_file_path}/{file_path}"

                    # Remove from storage
                    if storage.file_exists(full_path):
                        storage.delete_file(full_path)
                        removed_files.append(file_path)
                        if file_path in file_name_map:
                            file_name_map.pop(file_path, None)
                            map_updated = True

                if map_updated and isinstance(file_name_map, dict):
                    sources_collection.update_one(
                        {"_id": ObjectId(source_id)},
                        {"$set": {"file_name_map": file_name_map}},
                    )
                # Trigger re-ingestion pipeline
                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(source_id=source_id, user=user)

                return make_response(
                    jsonify(
                        {
                            "success": True,
                            "message": f"Removed {len(removed_files)} files",
                            "removed_files": removed_files,
                            "reingest_task_id": task.id,
                        }
                    ),
                    200,
                )
            elif operation == "remove_directory":
                directory_path = request.form.get("directory_path")
                if not directory_path:
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "message": "directory_path required for remove_directory operation",
                            }
                        ),
                        400,
                    )
                # Validate directory path (prevent path traversal)
                if directory_path.startswith("/") or ".." in directory_path:
                    current_app.logger.warning(
                        f"Invalid directory path attempted for removal. "
                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}"
                    )
                    return make_response(
                        jsonify(
                            {"success": False, "message": "Invalid directory path"}
                        ),
                        400,
                    )
                full_directory_path = (
                    f"{source_file_path}/{directory_path}"
                    if directory_path
                    else source_file_path
                )

                if not storage.is_directory(full_directory_path):
                    current_app.logger.warning(
                        f"Directory not found or is not a directory for removal. "
                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
                        f"Full path: {full_directory_path}"
                    )
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "message": "Directory not found or is not a directory",
                            }
                        ),
                        404,
                    )
                success = storage.remove_directory(full_directory_path)

                if not success:
                    current_app.logger.error(
                        f"Failed to remove directory from storage. "
                        f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
                        f"Full path: {full_directory_path}"
                    )
                    return make_response(
                        jsonify(
                            {"success": False, "message": "Failed to remove directory"}
                        ),
                        500,
                    )
                current_app.logger.info(
                    f"Successfully removed directory. "
                    f"User: {user}, Source ID: {source_id}, Directory path: {directory_path}, "
                    f"Full path: {full_directory_path}"
                )

                if directory_path and file_name_map:
                    # Drop map entries for the directory itself and everything
                    # below it.
                    prefix = f"{directory_path.rstrip('/')}/"
                    keys_to_remove = [
                        key
                        for key in file_name_map.keys()
                        if key == directory_path or key.startswith(prefix)
                    ]
                    if keys_to_remove:
                        for key in keys_to_remove:
                            file_name_map.pop(key, None)
                        sources_collection.update_one(
                            {"_id": ObjectId(source_id)},
                            {"$set": {"file_name_map": file_name_map}},
                        )

                # Trigger re-ingestion pipeline
                from application.api.user.tasks import reingest_source_task

                task = reingest_source_task.delay(source_id=source_id, user=user)

                return make_response(
                    jsonify(
                        {
                            "success": True,
                            "message": f"Successfully removed directory: {directory_path}",
                            "removed_directory": directory_path,
                            "reingest_task_id": task.id,
                        }
                    ),
                    200,
                )
        except Exception as err:
            # Re-read the request fields so the log shows what was attempted
            # even when the failure happened before they were parsed.
            error_context = f"operation={operation}, user={user}, source_id={source_id}"
            if operation == "remove_directory":
                directory_path = request.form.get("directory_path", "")
                error_context += f", directory_path={directory_path}"
            elif operation == "remove":
                file_paths_str = request.form.get("file_paths", "")
                error_context += f", file_paths={file_paths_str}"
            elif operation == "add":
                parent_dir = request.form.get("parent_dir", "")
                error_context += f", parent_dir={parent_dir}"
            current_app.logger.error(
                f"Error managing source files: {err} ({error_context})", exc_info=True
            )
            return make_response(
                jsonify({"success": False, "message": "Operation failed"}), 500
            )
@api.expect(task_status_model)
@api.doc(description="Get celery job status")
def get(self):
    # Report the Celery state and result metadata for the ``task_id`` query
    # parameter.
    #
    # Responses:
    #   400 - ``task_id`` missing, or any unexpected error while querying Celery
    #   503 - task is PENDING and no worker answered the ping
    #   200 - ``{"status": <state>, "result": <metadata>}``
    task_id = request.args.get("task_id")
    if not task_id:
        return make_response(
            jsonify({"success": False, "message": "Task ID is required"}), 400
        )
    try:
        from application.celery_init import celery

        task = celery.AsyncResult(task_id)
        task_meta = task.info
        # Read the state once so the worker check, the log line and the
        # response all describe the same snapshot of the task.
        task_status = task.status
        # Use the application logger instead of print() so the message
        # follows the configured log level and handlers.
        current_app.logger.debug(f"Task status: {task_status}")

        if task_status == "PENDING":
            # PENDING is also what Celery reports for unknown ids, so make
            # sure at least one worker is alive before telling the client
            # to keep waiting.
            inspect = celery.control.inspect()
            active_workers = inspect.ping()
            if not active_workers:
                raise ConnectionError("Service unavailable")

        if not isinstance(
            task_meta, (dict, list, str, int, float, bool, type(None))
        ):
            # Anything else (e.g. an exception instance) is returned as text
            # so that jsonify() can serialise it.
            task_meta = str(task_meta)
    except ConnectionError as err:
        current_app.logger.error(f"Connection error getting task status: {err}")
        return make_response(
            jsonify({"success": False, "message": "Service unavailable"}), 503
        )
    except Exception as err:
        current_app.logger.error(f"Error getting task status: {err}", exc_info=True)
        return make_response(jsonify({"success": False}), 400)
    return make_response(jsonify({"status": task_status, "result": task_meta}), 200)
@celery.on_after_configure.connect
def setup_periodic_tasks(sender, **kwargs):
    """Register the recurring ``schedule_syncs`` jobs on the Celery app.

    Connected to ``celery.on_after_configure``. Each entry schedules
    ``schedule_syncs`` with the matching frequency label; note that the
    "monthly" schedule is a fixed 30-day interval, not a calendar month.

    Args:
        sender: Object exposing ``add_periodic_task`` (supplied by the signal).
        **kwargs: Extra signal arguments; unused.
    """
    sync_schedules = (
        (timedelta(days=1), "daily"),
        (timedelta(weeks=1), "weekly"),
        (timedelta(days=30), "monthly"),
    )
    for interval, frequency in sync_schedules:
        sender.add_periodic_task(interval, schedule_syncs.s(frequency))
tools_ns __all__ = ["tools_ns", "tools_mcp_ns"] ================================================ FILE: application/api/user/tools/mcp.py ================================================ """Tool management MCP server integration.""" import json from urllib.parse import urlencode, urlparse from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, redirect, request from flask_restx import Namespace, Resource, fields from application.agents.tools.mcp_tool import MCPOAuthManager, MCPTool from application.api import api from application.api.user.base import user_tools_collection from application.api.user.tools.routes import transform_actions from application.cache import get_redis_instance from application.core.mongo_db import MongoDB from application.core.settings import settings from application.security.encryption import decrypt_credentials, encrypt_credentials from application.utils import check_required_fields tools_mcp_ns = Namespace("tools", description="Tool management operations", path="/api") _mongo = MongoDB.get_client() _db = _mongo[settings.MONGO_DB_NAME] _connector_sessions = _db["connector_sessions"] _ALLOWED_TRANSPORTS = {"auto", "sse", "http"} def _sanitize_mcp_transport(config): """Normalise and validate the transport_type field. Strips ``command`` / ``args`` keys that are only valid for local STDIO transports and returns the cleaned transport type string. 
""" transport_type = (config.get("transport_type") or "auto").lower() if transport_type not in _ALLOWED_TRANSPORTS: raise ValueError(f"Unsupported transport_type: {transport_type}") config.pop("command", None) config.pop("args", None) config["transport_type"] = transport_type return transport_type def _extract_auth_credentials(config): """Build an ``auth_credentials`` dict from the raw MCP config.""" auth_credentials = {} auth_type = config.get("auth_type", "none") if auth_type == "api_key": if config.get("api_key"): auth_credentials["api_key"] = config["api_key"] if config.get("api_key_header"): auth_credentials["api_key_header"] = config["api_key_header"] elif auth_type == "bearer": if config.get("bearer_token"): auth_credentials["bearer_token"] = config["bearer_token"] elif auth_type == "basic": if config.get("username"): auth_credentials["username"] = config["username"] if config.get("password"): auth_credentials["password"] = config["password"] return auth_credentials @tools_mcp_ns.route("/mcp_server/test") class TestMCPServerConfig(Resource): @api.expect( api.model( "MCPServerTestModel", { "config": fields.Raw( required=True, description="MCP server configuration to test" ), }, ) ) @api.doc(description="Test MCP server connection with provided configuration") def post(self): decoded_token = request.decoded_token if not decoded_token: return make_response(jsonify({"success": False}), 401) user = decoded_token.get("sub") data = request.get_json() required_fields = ["config"] missing_fields = check_required_fields(data, required_fields) if missing_fields: return missing_fields try: config = data["config"] try: _sanitize_mcp_transport(config) except ValueError: return make_response( jsonify({"success": False, "error": "Unsupported transport_type"}), 400, ) auth_credentials = _extract_auth_credentials(config) test_config = config.copy() test_config["auth_credentials"] = auth_credentials mcp_tool = MCPTool(config=test_config, user_id=user) result = 
@tools_mcp_ns.route("/mcp_server/save")
class MCPServerSave(Resource):
    @api.expect(
        api.model(
            "MCPServerSaveModel",
            {
                "id": fields.String(
                    required=False, description="Tool ID for updates (optional)"
                ),
                "displayName": fields.String(
                    required=True, description="Display name for the MCP server"
                ),
                "config": fields.Raw(
                    required=True, description="MCP server configuration"
                ),
                "status": fields.Boolean(
                    required=False, default=True, description="Tool status"
                ),
            },
        )
    )
    @api.doc(description="Create or update MCP server with automatic tool discovery")
    def post(self):
        # Create a new MCP tool document, or update the one named by ``id``.
        #
        # Flow: validate transport -> resolve the existing document (updates
        # only) -> obtain the action list (from the finished OAuth task or by
        # live discovery) -> encrypt credentials -> insert/update.
        #
        # Responses:
        #   401 - no decoded token on the request
        #   400 - unsupported transport, invalid tool id, OAuth not completed,
        #         or no usable credentials for the chosen auth type
        #   404 - ``id`` does not match an MCP tool owned by the user
        #   500 - any unexpected failure (details are only logged)
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()

        required_fields = ["displayName", "config"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields:
            return missing_fields
        try:
            config = data["config"]
            try:
                _sanitize_mcp_transport(config)
            except ValueError:
                return make_response(
                    jsonify({"success": False, "error": "Unsupported transport_type"}),
                    400,
                )
            auth_credentials = _extract_auth_credentials(config)
            auth_type = config.get("auth_type", "none")

            # Resolve the target document before any network call so that a
            # malformed or foreign id fails fast instead of after discovery.
            tool_id = data.get("id")
            existing_encrypted = None
            if tool_id:
                if not ObjectId.is_valid(tool_id):
                    return make_response(
                        jsonify({"success": False, "error": "Invalid tool ID"}),
                        400,
                    )
                existing_doc = user_tools_collection.find_one(
                    {"_id": ObjectId(tool_id), "user": user, "name": "mcp_tool"}
                )
                if not existing_doc:
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "error": "Tool not found or access denied",
                            }
                        ),
                        404,
                    )
                existing_encrypted = (existing_doc.get("config") or {}).get(
                    "encrypted_credentials"
                )

            # On updates the client may omit secrets it did not change. Overlay
            # whatever was sent on top of the stored secrets so discovery (and
            # re-encryption below) sees the complete credential set.
            merged_credentials = dict(auth_credentials)
            if existing_encrypted and auth_type in ("api_key", "bearer", "basic"):
                merged_credentials = {
                    **decrypt_credentials(existing_encrypted, user),
                    **auth_credentials,
                }

            mcp_config = config.copy()
            mcp_config["auth_credentials"] = merged_credentials

            if auth_type == "oauth":
                if not config.get("oauth_task_id"):
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "error": "Connection not authorized. Please complete the OAuth authorization first.",
                            }
                        ),
                        400,
                    )
                redis_client = get_redis_instance()
                manager = MCPOAuthManager(redis_client)
                result = manager.get_oauth_status(config["oauth_task_id"])
                if result.get("status") != "completed":
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "error": "OAuth failed or not completed. Please try authorizing again.",
                            }
                        ),
                        400,
                    )
                actions_metadata = result.get("tools", [])
            elif auth_type == "none" or merged_credentials:
                mcp_tool = MCPTool(config=mcp_config, user_id=user)
                mcp_tool.discover_tools()
                actions_metadata = mcp_tool.get_actions_metadata()
            else:
                # Missing credentials are a client error; report them as such
                # instead of letting a bare exception become a generic 500.
                return make_response(
                    jsonify(
                        {
                            "success": False,
                            "error": "No valid credentials provided for the selected authentication type",
                        }
                    ),
                    400,
                )

            storage_config = config.copy()
            if auth_credentials:
                storage_config["encrypted_credentials"] = encrypt_credentials(
                    merged_credentials, user
                )
            elif existing_encrypted:
                # Nothing new was sent: keep the stored blob untouched.
                storage_config["encrypted_credentials"] = existing_encrypted

            # Plain-text secrets (and the transient redirect URI) must never
            # be persisted alongside the encrypted blob.
            for field in [
                "api_key",
                "bearer_token",
                "username",
                "password",
                "api_key_header",
                "redirect_uri",
            ]:
                storage_config.pop(field, None)
            transformed_actions = transform_actions(actions_metadata)
            tool_data = {
                "name": "mcp_tool",
                "displayName": data["displayName"],
                "customName": data["displayName"],
                "description": f"MCP Server: {storage_config.get('server_url', 'Unknown')}",
                "config": storage_config,
                "actions": transformed_actions,
                "status": data.get("status", True),
                "user": user,
            }

            if tool_id:
                result = user_tools_collection.update_one(
                    {"_id": ObjectId(tool_id), "user": user, "name": "mcp_tool"},
                    {"$set": {k: v for k, v in tool_data.items() if k != "user"}},
                )
                # Still needed: the document may have been deleted between the
                # lookup above and this write.
                if result.matched_count == 0:
                    return make_response(
                        jsonify(
                            {
                                "success": False,
                                "error": "Tool not found or access denied",
                            }
                        ),
                        404,
                    )
                response_data = {
                    "success": True,
                    "id": tool_id,
                    "message": f"MCP server updated successfully! Discovered {len(transformed_actions)} tools.",
                    "tools_count": len(transformed_actions),
                }
            else:
                result = user_tools_collection.insert_one(tool_data)
                tool_id = str(result.inserted_id)
                response_data = {
                    "success": True,
                    "id": tool_id,
                    "message": f"MCP server created successfully! Discovered {len(transformed_actions)} tools.",
                    "tools_count": len(transformed_actions),
                }
            return make_response(jsonify(response_data), 200)
        except Exception as e:
            current_app.logger.error(f"Error saving MCP server: {e}", exc_info=True)
            return make_response(
                jsonify({"success": False, "error": "Failed to save MCP server"}),
                500,
            )
Please try again and make sure to grant all requested permissions, including offline access.", "provider": "mcp_tool", } return redirect(f"/api/connectors/callback-status?{urlencode(params)}") if not code or not state: return redirect( "/api/connectors/callback-status?status=error&message=Authorization+code+or+state+not+provided.+Please+complete+the+authorization+process+and+make+sure+to+grant+offline+access.&provider=mcp_tool" ) try: redis_client = get_redis_instance() if not redis_client: return redirect( "/api/connectors/callback-status?status=error&message=Internal+server+error:+Redis+not+available.&provider=mcp_tool" ) manager = MCPOAuthManager(redis_client) success = manager.handle_oauth_callback(state, code, error) if success: return redirect( "/api/connectors/callback-status?status=success&message=Authorization+code+received+successfully.+You+can+close+this+window.&provider=mcp_tool" ) else: return redirect( "/api/connectors/callback-status?status=error&message=OAuth+callback+failed.&provider=mcp_tool" ) except Exception as e: current_app.logger.error( f"Error handling MCP OAuth callback: {str(e)}", exc_info=True ) return redirect( "/api/connectors/callback-status?status=error&message=Internal+server+error.&provider=mcp_tool" ) @tools_mcp_ns.route("/mcp_server/oauth_status/") class MCPOAuthStatus(Resource): def get(self, task_id): try: redis_client = get_redis_instance() status_key = f"mcp_oauth_status:{task_id}" status_data = redis_client.get(status_key) if status_data: status = json.loads(status_data) if "tools" in status and isinstance(status["tools"], list): status["tools"] = [ { "name": t.get("name", "unknown"), "description": t.get("description", ""), } for t in status["tools"] ] return make_response( jsonify({"success": True, "task_id": task_id, **status}) ) else: return make_response( jsonify( { "success": True, "task_id": task_id, "status": "pending", "message": "Waiting for OAuth to start...", } ), 200, ) except Exception as e: 
@tools_mcp_ns.route("/mcp_server/auth_status")
class MCPAuthStatus(Resource):
    @api.doc(
        description="Batch check auth status for all MCP tools. "
        "Lightweight DB-only check — no network calls to MCP servers."
    )
    def get(self):
        # Build ``{tool_id: status}`` for every MCP tool owned by the caller.
        #
        # Status values:
        #   "configured" - tool does not use OAuth
        #   "connected"  - OAuth tool whose server has a session holding an
        #                  access token
        #   "needs_auth" - OAuth tool without server URL or without such a
        #                  session
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        try:
            mcp_tools = list(
                user_tools_collection.find(
                    {"user": user, "name": "mcp_tool"},
                    {"_id": 1, "config": 1},
                )
            )
            if not mcp_tools:
                return make_response(jsonify({"success": True, "statuses": {}}), 200)

            oauth_server_urls = {}
            statuses = {}
            for tool in mcp_tools:
                tool_id = str(tool["_id"])
                # ``or {}`` also covers documents that store ``config: null``.
                config = tool.get("config") or {}
                auth_type = config.get("auth_type", "none")
                if auth_type != "oauth":
                    statuses[tool_id] = "configured"
                    continue
                server_url = config.get("server_url", "")
                if not server_url:
                    statuses[tool_id] = "needs_auth"
                    continue
                # Sessions are looked up by scheme + host, so reduce the tool
                # URL to the same form.
                parsed = urlparse(server_url)
                oauth_server_urls[tool_id] = f"{parsed.scheme}://{parsed.netloc}"

            if oauth_server_urls:
                unique_urls = list(set(oauth_server_urls.values()))
                sessions = _connector_sessions.find(
                    {"user_id": user, "server_url": {"$in": unique_urls}},
                    {"server_url": 1, "tokens": 1},
                )
                # A URL counts as connected when ANY of its sessions holds an
                # access token. Collecting into a set keeps a token-less
                # session from overwriting an authorised one for the same URL.
                connected_urls = set()
                for session in sessions:
                    tokens = session.get("tokens") or {}
                    if tokens.get("access_token"):
                        connected_urls.add(session.get("server_url"))

                for tool_id, base_url in oauth_server_urls.items():
                    statuses[tool_id] = (
                        "connected" if base_url in connected_urls else "needs_auth"
                    )
            return make_response(jsonify({"success": True, "statuses": statuses}), 200)
        except Exception as e:
            current_app.logger.error(
                "Error checking MCP auth status: %s", e, exc_info=True
            )
            return make_response(
                jsonify({"success": False, "error": "Failed to check auth status"}),
                500,
            )
================================================ FILE: application/api/user/tools/routes.py ================================================ """Tool management routes.""" from bson.objectid import ObjectId from flask import current_app, jsonify, make_response, request from flask_restx import fields, Namespace, Resource from application.agents.tools.spec_parser import parse_spec from application.agents.tools.tool_manager import ToolManager from application.api import api from application.api.user.base import user_tools_collection from application.security.encryption import decrypt_credentials, encrypt_credentials from application.utils import check_required_fields, validate_function_name tool_config = {} tool_manager = ToolManager(config=tool_config) def _encrypt_secret_fields(config, config_requirements, user_id): secret_keys = [ key for key, spec in config_requirements.items() if spec.get("secret") and key in config and config[key] ] if not secret_keys: return config storage_config = config.copy() secret_values = {k: config[k] for k in secret_keys} storage_config["encrypted_credentials"] = encrypt_credentials(secret_values, user_id) for key in secret_keys: storage_config.pop(key, None) return storage_config def _validate_config(config, config_requirements, has_existing_secrets=False): errors = {} for key, spec in config_requirements.items(): depends_on = spec.get("depends_on") if depends_on: if not all(config.get(dk) == dv for dk, dv in depends_on.items()): continue if spec.get("required") and not config.get(key): if has_existing_secrets and spec.get("secret"): continue errors[key] = f"{spec.get('label', key)} is required" value = config.get(key) if value is not None and value != "": if spec.get("type") == "number": try: num = float(value) if key == "timeout" and (num < 1 or num > 300): errors[key] = "Timeout must be between 1 and 300" except (ValueError, TypeError): errors[key] = f"{spec.get('label', key)} must be a number" if spec.get("enum") and value not in 
def transform_actions(actions_metadata):
    """Set default flags on action metadata for storage.

    Marks each action as active, sets ``filled_by_llm`` and ``value`` on every
    parameter property. Used by both the generic create_tool and MCP save
    routes.

    The input is never modified: every action is deep-copied first, so
    metadata objects owned by the caller (for example a tool's own action
    list) do not pick up the storage-only flags.

    Args:
        actions_metadata: Iterable of action dicts; ``None`` is treated as an
            empty list.

    Returns:
        A new list with one transformed copy per input action.
    """
    import copy

    transformed = []
    for original in actions_metadata or []:
        action = copy.deepcopy(original)
        action["active"] = True
        # ``parameters`` / ``properties`` may be absent or explicitly null.
        props = (action.get("parameters") or {}).get("properties") or {}
        for param_details in props.values():
            param_details["filled_by_llm"] = True
            param_details["value"] = ""
        transformed.append(action)
    return transformed
@tools_ns.route("/create_tool")
class CreateTool(Resource):
    @api.expect(
        api.model(
            "CreateToolModel",
            {
                "name": fields.String(required=True, description="Name of the tool"),
                "displayName": fields.String(
                    required=True, description="Display name for the tool"
                ),
                "description": fields.String(
                    required=True, description="Tool description"
                ),
                "config": fields.Raw(
                    required=True, description="Configuration of the tool"
                ),
                "customName": fields.String(
                    required=False, description="Custom name for the tool"
                ),
                "status": fields.Boolean(
                    required=True, description="Status of the tool"
                ),
            },
        )
    )
    @api.doc(description="Create a new tool")
    def post(self):
        # Register one of the tools known to ``tool_manager`` for the caller.
        #
        # Responses:
        #   401 - no decoded token on the request
        #   404 - ``name`` is not a known tool
        #   400 - config validation failed, or any error while loading the
        #         tool's actions / persisting the document
        #   200 - ``{"id": <new document id>}``
        token = request.decoded_token
        if not token:
            return make_response(jsonify({"success": False}), 401)
        user = token.get("sub")
        data = request.get_json()

        missing_fields = check_required_fields(
            data, ["name", "displayName", "description", "config", "status"]
        )
        if missing_fields:
            return missing_fields

        # Step 1: resolve the tool and derive its default action list.
        try:
            tool_instance = tool_manager.tools.get(data["name"])
            if not tool_instance:
                return make_response(
                    jsonify({"success": False, "message": "Tool not found"}), 404
                )
            transformed_actions = transform_actions(
                tool_instance.get_actions_metadata()
            )
        except Exception as err:
            current_app.logger.error(
                f"Error getting tool actions: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)

        # Step 2: validate the config, encrypt its secrets and store the tool.
        try:
            config_requirements = tool_instance.get_config_requirements()
            # Validation is skipped entirely for tools without requirements.
            validation_errors = (
                _validate_config(data["config"], config_requirements)
                if config_requirements
                else None
            )
            if validation_errors:
                failure = {
                    "success": False,
                    "message": "Validation failed",
                    "errors": validation_errors,
                }
                return make_response(jsonify(failure), 400)

            inserted = user_tools_collection.insert_one(
                {
                    "user": user,
                    "name": data["name"],
                    "displayName": data["displayName"],
                    "description": data["description"],
                    "customName": data.get("customName", ""),
                    "actions": transformed_actions,
                    "config": _encrypt_secret_fields(
                        data["config"], config_requirements, user
                    ),
                    "configRequirements": config_requirements,
                    "status": data["status"],
                }
            )
            new_id = str(inserted.inserted_id)
        except Exception as err:
            current_app.logger.error(f"Error creating tool: {err}", exc_info=True)
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"id": new_id}), 200)
@tools_ns.route("/update_tool_config")
class UpdateToolConfig(Resource):
    @api.expect(
        api.model(
            "UpdateToolConfigModel",
            {
                "id": fields.String(required=True, description="Tool ID"),
                "config": fields.Raw(
                    required=True, description="Configuration of the tool"
                ),
            },
        )
    )
    @api.doc(description="Update the configuration of a tool")
    def post(self):
        # Replace the stored config of one of the caller's tools.
        #
        # Secrets omitted by the client are carried over from the stored
        # document by ``_merge_secrets_on_update``.
        #
        # Responses:
        #   401 - no decoded token on the request
        #   404 - no tool with this id belongs to the caller
        #   400 - validation failed, or any error while reading/writing
        #   200 - ``{"success": True}``
        token = request.decoded_token
        if not token:
            return make_response(jsonify({"success": False}), 401)
        user = token.get("sub")
        data = request.get_json()

        missing_fields = check_required_fields(data, ["id", "config"])
        if missing_fields:
            return missing_fields

        try:
            # The same owner-scoped filter is used for the read and the write.
            tool_filter = {"_id": ObjectId(data["id"]), "user": user}
            tool_doc = user_tools_collection.find_one(tool_filter)
            if not tool_doc:
                return make_response(jsonify({"success": False}), 404)

            tool_instance = tool_manager.tools.get(tool_doc.get("name"))
            if tool_instance:
                config_requirements = tool_instance.get_config_requirements()
            else:
                config_requirements = {}

            existing_config = tool_doc.get("config", {})
            if config_requirements:
                validation_errors = _validate_config(
                    data["config"],
                    config_requirements,
                    has_existing_secrets="encrypted_credentials" in existing_config,
                )
                if validation_errors:
                    failure = {
                        "success": False,
                        "message": "Validation failed",
                        "errors": validation_errors,
                    }
                    return make_response(jsonify(failure), 400)

            merged_config = _merge_secrets_on_update(
                data["config"], existing_config, config_requirements, user
            )
            user_tools_collection.update_one(
                tool_filter, {"$set": {"config": merged_config}}
            )
        except Exception as err:
            current_app.logger.error(
                f"Error updating tool config: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True}), 200)
@tools_ns.route("/update_tool_status")
class UpdateToolStatus(Resource):
    @api.expect(
        api.model(
            "UpdateToolStatusModel",
            {
                "id": fields.String(required=True, description="Tool ID"),
                "status": fields.Boolean(
                    required=True, description="Status of the tool"
                ),
            },
        )
    )
    @api.doc(description="Update the status of a tool")
    def post(self):
        # Set the ``status`` flag of one of the caller's tools.
        #
        # Responses:
        #   401 - no decoded token on the request
        #   404 - no tool with this id belongs to the caller
        #   400 - invalid id or any error while writing
        #   200 - ``{"success": True}``
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        user = decoded_token.get("sub")
        data = request.get_json()
        required_fields = ["id", "status"]
        missing_fields = check_required_fields(data, required_fields)
        if missing_fields:
            return missing_fields
        try:
            result = user_tools_collection.update_one(
                {"_id": ObjectId(data["id"]), "user": user},
                {"$set": {"status": data["status"]}},
            )
            # Without this check a missing tool (or one owned by another
            # user) would be reported as a successful update.
            if result.matched_count == 0:
                return make_response(
                    jsonify({"success": False, "message": "Tool not found"}), 404
                )
        except Exception as err:
            current_app.logger.error(
                f"Error updating tool status: {err}", exc_info=True
            )
            return make_response(jsonify({"success": False}), 400)
        return make_response(jsonify({"success": True}), 200)
@tools_ns.route("/parse_spec")
class ParseSpec(Resource):
    """Endpoint that turns an uploaded or inline API specification into actions."""

    @api.doc(
        description="Parse an API specification (OpenAPI 3.x or Swagger 2.0) and return actions"
    )
    def post(self):
        """
        Parse a specification supplied as a ``file`` upload or as JSON
        ``{"spec_content": "..."}``.

        Returns:
            401 when the request carries no decoded token,
            400 for a missing/empty/undecodable/non-string spec or when
            ``parse_spec`` raises ``ValueError``,
            500 for any other parsing failure,
            200 with ``metadata`` and ``actions`` on success.
        """
        decoded_token = request.decoded_token
        if not decoded_token:
            return make_response(jsonify({"success": False}), 401)
        if "file" in request.files:
            file = request.files["file"]
            if not file.filename:
                return make_response(
                    jsonify({"success": False, "message": "No file selected"}), 400
                )
            try:
                spec_content = file.read().decode("utf-8")
            except UnicodeDecodeError:
                return make_response(
                    jsonify({"success": False, "message": "Invalid file encoding"}), 400
                )
        elif request.is_json:
            data = request.get_json()
            # A JSON array/scalar has no ``.get``; treat it as "no spec" rather
            # than letting an AttributeError surface as an unhandled 500.
            if not isinstance(data, dict):
                return make_response(
                    jsonify({"success": False, "message": "No spec provided"}), 400
                )
            spec_content = data.get("spec_content", "")
        else:
            return make_response(
                jsonify({"success": False, "message": "No spec provided"}), 400
            )
        # ``.strip()`` below only exists on strings; reject other truthy JSON
        # values (numbers, objects, lists) explicitly.
        if spec_content and not isinstance(spec_content, str):
            return make_response(
                jsonify(
                    {"success": False, "message": "Spec content must be a string"}
                ),
                400,
            )
        if not spec_content or not spec_content.strip():
            return make_response(
                jsonify({"success": False, "message": "Empty spec content"}), 400
            )
        try:
            metadata, actions = parse_spec(spec_content)
            return make_response(
                jsonify(
                    {
                        "success": True,
                        "metadata": metadata,
                        "actions": actions,
                    }
                ),
                200,
            )
        except ValueError as e:
            current_app.logger.error(f"Spec validation error: {e}")
            return make_response(
                jsonify({"success": False, "error": "Invalid specification format"}),
                400,
            )
        except Exception as err:
            current_app.logger.error(f"Error parsing spec: {err}", exc_info=True)
            return make_response(
                jsonify({"success": False, "error": "Failed to parse specification"}),
                500,
            )
def success_response(
    data: Optional[Dict[str, Any]] = None, status: int = 200
) -> Response:
    """
    Build the standard JSON envelope for a successful request.

    The payload always starts as ``{"success": True}``; any entries in
    ``data`` are merged on top of it.

    Args:
        data: Extra key/value pairs to merge into the payload (optional)
        status: HTTP status code to send (default: 200)

    Returns:
        Flask Response object

    Example:
        return success_response({"users": [...], "total": 10})
    """
    payload = dict(success=True)
    payload.update(data or {})
    body = jsonify(payload)
    return make_response(body, status)
def validate_pagination(
    default_limit: int = 20, max_limit: int = 100
) -> Tuple[int, int, Optional[Response]]:
    """
    Read ``limit`` and ``skip`` from the query string and validate them.

    ``limit`` is capped at ``max_limit``. Non-integer values, a limit below 1
    or a negative skip all produce the same error response.

    Args:
        default_limit: Limit used when the query string has no ``limit``
        max_limit: Upper bound applied to the requested limit

    Returns:
        Tuple of (limit, skip, None) on success, or
        (0, 0, error_response) when the parameters are invalid

    Example:
        limit, skip, error = validate_pagination()
        if error:
            return error
    """
    query_args = request.args
    try:
        requested_limit = int(query_args.get("limit", default_limit))
        skip = int(query_args.get("skip", 0))
    except ValueError:
        return 0, 0, error_response("Invalid pagination parameters")

    limit = min(requested_limit, max_limit)
    if limit >= 1 and skip >= 0:
        return limit, skip, None
    return 0, 0, error_response("Invalid pagination parameters")
def serialize_object_id(
    obj: Dict[str, Any], id_field: str = "_id", new_field: str = "id"
) -> Dict[str, Any]:
    """
    Replace an ID entry of ``obj`` with its string form, in place.

    When ``id_field`` is present its value is stringified and stored under
    ``new_field``; the original key is removed unless both names are equal.
    A dictionary without ``id_field`` is returned untouched.

    Args:
        obj: Dictionary to modify (mutated, not copied)
        id_field: Key holding the value to stringify
        new_field: Key that receives the string value

    Returns:
        The same dictionary object that was passed in

    Example:
        user = serialize_object_id(user_doc)
        # user["id"] = "507f1f77bcf86cd799439011"
    """
    if id_field not in obj:
        return obj

    obj[new_field] = str(obj[id_field])
    if new_field != id_field:
        # The key is known to exist here, so a plain delete is sufficient.
        del obj[id_field]
    return obj
def require_fields(required: List[str]) -> Callable:
    """
    Decorator to validate required fields in request JSON.

    The wrapped handler only runs when the request body is a JSON object in
    which every name in ``required`` maps to a truthy value. Otherwise a
    standardized error response is returned and the handler is not called.

    Args:
        required: List of required field names

    Returns:
        Decorator function

    Example:
        @require_fields(["name", "description"])
        def post(self):
            data = request.get_json()
            ...
    """

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            # silent=True: a missing, non-JSON or malformed body yields None so
            # the standardized error below is returned instead of whatever the
            # framework raises on its own.
            data = request.get_json(silent=True)
            if not data:
                return error_response("Request body required")
            # Arrays and scalars are valid JSON but have no ``.get``.
            if not isinstance(data, dict):
                return error_response("Request body must be a JSON object")
            # Falsy values ("" / 0 / None / False) count as missing.
            missing = [field for field in required if not data.get(field)]
            if missing:
                return error_response(f"Missing required fields: {', '.join(missing)}")
            return func(*args, **kwargs)

        return wrapper

    return decorator
def validate_enum(
    value: Any, allowed: List[Any], field_name: str
) -> Optional[Response]:
    """
    Check that ``value`` is one of the ``allowed`` options.

    Args:
        value: Value to check
        allowed: Accepted values
        field_name: Name used in the error message

    Returns:
        None when the value is accepted, otherwise an error response that
        lists every allowed value in single quotes

    Example:
        error = validate_enum(status, ["draft", "published"], "status")
        if error:
            return error
    """
    if value in allowed:
        return None

    quoted_options = [f"'{option}'" for option in allowed]
    allowed_str = ", ".join(quoted_options)
    return error_response(f"Invalid {field_name}. Must be one of: {allowed_str}")
def _workflow_error_response(message: str, err: Exception):
    """Log ``err`` (with traceback) under ``message`` and return ``message`` as an error response."""
    app_logger = current_app.logger
    app_logger.error(f"{message}: {err}", exc_info=True)
    # Only the generic message goes to the client; exception details stay in the log.
    response = error_response(message)
    return response
def get_workflow_graph_version(workflow: Dict) -> int:
    """
    Get current graph version with legacy fallback.

    Args:
        workflow: Workflow document; ``current_graph_version`` is optional

    Returns:
        The stored version as a positive int, or 1 when the field is absent,
        not convertible to an integer (including NaN / infinity), or not
        greater than zero.
    """
    raw_version = workflow.get("current_graph_version", 1)
    try:
        version = int(raw_version)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) - treat like any other bad value.
        return 1
    return version if version > 0 else 1
normalized_nodes.append(node) continue normalized_node = dict(node) if normalized_node.get("type") != "agent": normalized_nodes.append(normalized_node) continue raw_config = normalized_node.get("data") if not isinstance(raw_config, dict) or "json_schema" not in raw_config: normalized_nodes.append(normalized_node) continue normalized_config = dict(raw_config) try: normalized_config["json_schema"] = normalize_json_schema_payload( raw_config.get("json_schema") ) except JsonSchemaValidationError: # Validation runs before normalization; keep original on unexpected shape. normalized_config["json_schema"] = raw_config.get("json_schema") normalized_node["data"] = normalized_config normalized_nodes.append(normalized_node) return normalized_nodes def validate_workflow_structure(nodes: List[Dict], edges: List[Dict]) -> List[str]: """Validate workflow graph structure.""" errors = [] if not nodes: errors.append("Workflow must have at least one node") return errors start_nodes = [n for n in nodes if n.get("type") == "start"] if len(start_nodes) != 1: errors.append("Workflow must have exactly one start node") end_nodes = [n for n in nodes if n.get("type") == "end"] if not end_nodes: errors.append("Workflow must have at least one end node") node_ids = {n.get("id") for n in nodes} node_map = {n.get("id"): n for n in nodes} end_ids = {n.get("id") for n in end_nodes} for edge in edges: source_id = edge.get("source") target_id = edge.get("target") if source_id not in node_ids: errors.append(f"Edge references non-existent source: {source_id}") if target_id not in node_ids: errors.append(f"Edge references non-existent target: {target_id}") if start_nodes: start_id = start_nodes[0].get("id") if not any(e.get("source") == start_id for e in edges): errors.append("Start node must have at least one outgoing edge") condition_nodes = [n for n in nodes if n.get("type") == "condition"] for cnode in condition_nodes: cnode_id = cnode.get("id") cnode_title = cnode.get("title", cnode_id) outgoing = 
[e for e in edges if e.get("source") == cnode_id] if len(outgoing) < 2: errors.append( f"Condition node '{cnode_title}' must have at least 2 outgoing edges" ) node_data = cnode.get("data", {}) or {} cases = node_data.get("cases", []) if not isinstance(cases, list): cases = [] if not cases or not any( isinstance(c, dict) and str(c.get("expression", "")).strip() for c in cases ): errors.append( f"Condition node '{cnode_title}' must have at least one case with an expression" ) case_handles: Set[str] = set() duplicate_case_handles: Set[str] = set() for case in cases: if not isinstance(case, dict): continue raw_handle = case.get("sourceHandle", "") handle = raw_handle.strip() if isinstance(raw_handle, str) else "" if not handle: errors.append( f"Condition node '{cnode_title}' has a case without a branch handle" ) continue if handle in case_handles: duplicate_case_handles.add(handle) case_handles.add(handle) for handle in duplicate_case_handles: errors.append( f"Condition node '{cnode_title}' has duplicate case handle '{handle}'" ) outgoing_by_handle: Dict[str, List[Dict]] = {} for out_edge in outgoing: raw_handle = out_edge.get("sourceHandle", "") handle = raw_handle.strip() if isinstance(raw_handle, str) else "" outgoing_by_handle.setdefault(handle, []).append(out_edge) for handle, handle_edges in outgoing_by_handle.items(): if not handle: errors.append( f"Condition node '{cnode_title}' has an outgoing edge without sourceHandle" ) continue if handle != "else" and handle not in case_handles: errors.append( f"Condition node '{cnode_title}' has a connection from unknown branch '{handle}'" ) if len(handle_edges) > 1: errors.append( f"Condition node '{cnode_title}' has multiple outgoing edges from branch '{handle}'" ) if "else" not in outgoing_by_handle: errors.append(f"Condition node '{cnode_title}' must have an 'else' branch") for case in cases: if not isinstance(case, dict): continue raw_handle = case.get("sourceHandle", "") handle = raw_handle.strip() if 
isinstance(raw_handle, str) else "" if not handle: continue raw_expression = case.get("expression", "") has_expression = isinstance(raw_expression, str) and bool( raw_expression.strip() ) has_outgoing = bool(outgoing_by_handle.get(handle)) if has_expression and not has_outgoing: errors.append( f"Condition node '{cnode_title}' case '{handle}' has an expression but no outgoing edge" ) if not has_expression and has_outgoing: errors.append( f"Condition node '{cnode_title}' case '{handle}' has an outgoing edge but no expression" ) for handle, handle_edges in outgoing_by_handle.items(): if not handle: continue for out_edge in handle_edges: target = out_edge.get("target") if target and not _can_reach_end(target, edges, node_map, end_ids): errors.append( f"Branch '{handle}' of condition '{cnode_title}' " f"must eventually reach an end node" ) agent_nodes = [n for n in nodes if n.get("type") == "agent"] for agent_node in agent_nodes: agent_title = agent_node.get("title", agent_node.get("id", "unknown")) raw_config = agent_node.get("data", {}) or {} if not isinstance(raw_config, dict): errors.append(f"Agent node '{agent_title}' has invalid configuration") continue normalized_schema, schema_error = validate_json_schema_payload( raw_config.get("json_schema") ) has_json_schema = normalized_schema is not None model_id = raw_config.get("model_id") if has_json_schema and isinstance(model_id, str) and model_id.strip(): capabilities = get_model_capabilities(model_id.strip()) if capabilities and not capabilities.get("supports_structured_output", False): errors.append( f"Agent node '{agent_title}' selected model does not support structured output" ) if schema_error: errors.append(f"Agent node '{agent_title}' JSON schema {schema_error}") for node in nodes: if not node.get("id"): errors.append("All nodes must have an id") if not node.get("type"): errors.append(f"Node {node.get('id', 'unknown')} must have a type") return errors def _can_reach_end( node_id: str, edges: List[Dict], 
node_map: Dict, end_ids: set, visited: set = None ) -> bool: if visited is None: visited = set() if node_id in end_ids: return True if node_id in visited or node_id not in node_map: return False visited.add(node_id) outgoing = [e.get("target") for e in edges if e.get("source") == node_id] return any(_can_reach_end(t, edges, node_map, end_ids, visited) for t in outgoing if t) def create_workflow_nodes( workflow_id: str, nodes_data: List[Dict], graph_version: int ) -> None: """Insert workflow nodes into database.""" if nodes_data: workflow_nodes_collection.insert_many( [ { "id": n["id"], "workflow_id": workflow_id, "graph_version": graph_version, "type": n["type"], "title": n.get("title", ""), "description": n.get("description", ""), "position": n.get("position", {"x": 0, "y": 0}), "config": n.get("data", {}), } for n in nodes_data ] ) def create_workflow_edges( workflow_id: str, edges_data: List[Dict], graph_version: int ) -> None: """Insert workflow edges into database.""" if edges_data: workflow_edges_collection.insert_many( [ { "id": e["id"], "workflow_id": workflow_id, "graph_version": graph_version, "source_id": e.get("source"), "target_id": e.get("target"), "source_handle": e.get("sourceHandle"), "target_handle": e.get("targetHandle"), } for e in edges_data ] ) @workflows_ns.route("/workflows") class WorkflowList(Resource): @require_auth @require_fields(["name"]) def post(self): """Create a new workflow with nodes and edges.""" user_id = get_user_id() data = request.get_json() name = data.get("name", "").strip() nodes_data = data.get("nodes", []) edges_data = data.get("edges", []) validation_errors = validate_workflow_structure(nodes_data, edges_data) if validation_errors: return error_response( "Workflow validation failed", errors=validation_errors ) nodes_data = normalize_agent_node_json_schemas(nodes_data) now = datetime.now(timezone.utc) workflow_doc = { "name": name, "description": data.get("description", ""), "user": user_id, "created_at": now, 
@workflows_ns.route("/workflows/<string:workflow_id>")
class WorkflowDetail(Resource):
    """Read, replace, or delete a single workflow owned by the authenticated user.

    Every handler takes ``workflow_id`` from the URL, so the route declares
    it as a path variable. Graph documents are keyed by the string form of
    the workflow's ObjectId, so after validation each handler switches to
    ``str(obj_id)``; an ID typed with different hex casing would otherwise
    pass validation and then query or write graph data under another key.
    """

    @require_auth
    def get(self, workflow_id: str):
        """Get workflow details with nodes and edges."""
        user_id = get_user_id()
        obj_id, error = validate_object_id(workflow_id, "Workflow")
        if error:
            return error
        # Canonical key used by the node/edge collections.
        workflow_id = str(obj_id)
        workflow, error = check_resource_ownership(
            workflows_collection, obj_id, user_id, "Workflow"
        )
        if error:
            return error
        graph_version = get_workflow_graph_version(workflow)
        nodes = fetch_graph_documents(
            workflow_nodes_collection, workflow_id, graph_version
        )
        edges = fetch_graph_documents(
            workflow_edges_collection, workflow_id, graph_version
        )
        return success_response(
            {
                "workflow": serialize_workflow(workflow),
                "nodes": [serialize_node(n) for n in nodes],
                "edges": [serialize_edge(e) for e in edges],
            }
        )

    @require_auth
    @require_fields(["name"])
    def put(self, workflow_id: str):
        """Update workflow and replace nodes/edges.

        The new graph is written under the next graph version first; the
        workflow document is switched to that version afterwards, and older
        versions are removed last on a best-effort basis.
        """
        user_id = get_user_id()
        obj_id, error = validate_object_id(workflow_id, "Workflow")
        if error:
            return error
        # Canonical key used by the node/edge collections.
        workflow_id = str(obj_id)
        workflow, error = check_resource_ownership(
            workflows_collection, obj_id, user_id, "Workflow"
        )
        if error:
            return error
        data = request.get_json()
        name = data.get("name", "").strip()
        nodes_data = data.get("nodes", [])
        edges_data = data.get("edges", [])
        validation_errors = validate_workflow_structure(nodes_data, edges_data)
        if validation_errors:
            return error_response(
                "Workflow validation failed", errors=validation_errors
            )
        nodes_data = normalize_agent_node_json_schemas(nodes_data)
        current_graph_version = get_workflow_graph_version(workflow)
        next_graph_version = current_graph_version + 1
        try:
            create_workflow_nodes(workflow_id, nodes_data, next_graph_version)
            create_workflow_edges(workflow_id, edges_data, next_graph_version)
        except Exception as err:
            # Roll back the partially written new version; the current one is untouched.
            workflow_nodes_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": next_graph_version}
            )
            workflow_edges_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": next_graph_version}
            )
            return _workflow_error_response("Failed to update workflow structure", err)
        now = datetime.now(timezone.utc)
        _, error = safe_db_operation(
            lambda: workflows_collection.update_one(
                {"_id": obj_id},
                {
                    "$set": {
                        "name": name,
                        "description": data.get("description", ""),
                        "updated_at": now,
                        "current_graph_version": next_graph_version,
                    }
                },
            ),
            "Failed to update workflow",
        )
        if error:
            # The workflow still points at the old version: drop the new graph.
            workflow_nodes_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": next_graph_version}
            )
            workflow_edges_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": next_graph_version}
            )
            return error
        try:
            workflow_nodes_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": {"$ne": next_graph_version}}
            )
            workflow_edges_collection.delete_many(
                {"workflow_id": workflow_id, "graph_version": {"$ne": next_graph_version}}
            )
        except Exception as cleanup_err:
            # Stale versions are harmless to readers, so only warn.
            current_app.logger.warning(
                f"Failed to clean old workflow graph versions for {workflow_id}: {cleanup_err}"
            )
        return success_response()

    @require_auth
    def delete(self, workflow_id: str):
        """Delete workflow and its graph."""
        user_id = get_user_id()
        obj_id, error = validate_object_id(workflow_id, "Workflow")
        if error:
            return error
        # Canonical key used by the node/edge collections.
        workflow_id = str(obj_id)
        workflow, error = check_resource_ownership(
            workflows_collection, obj_id, user_id, "Workflow"
        )
        if error:
            return error
        try:
            workflow_nodes_collection.delete_many({"workflow_id": workflow_id})
            workflow_edges_collection.delete_many({"workflow_id": workflow_id})
            workflows_collection.delete_one({"_id": workflow["_id"], "user": user_id})
        except Exception as err:
            return _workflow_error_response("Failed to delete workflow", err)
        return success_response()
@app.before_request
def authenticate_request():
    """
    Attach the result of ``handle_auth`` to the request before it is handled.

    OPTIONS requests are answered immediately with an empty 200. When
    ``handle_auth`` returns a mapping containing ``"error"`` it is sent back
    as a 401 JSON body. Otherwise ``request.decoded_token`` is set to the
    returned value, or to None when that value is empty.
    """
    if request.method == "OPTIONS":
        return "", 200

    auth_result = handle_auth(request)
    if auth_result and "error" in auth_result:
        return jsonify(auth_result), 401

    # Any falsy result (None or an empty mapping) is stored as None.
    request.decoded_token = auth_result or None
def handle_auth(request, data=None):
    """
    Resolve the identity claims for an incoming request.

    Args:
        request: Object exposing a ``headers`` mapping; only
            ``headers.get("Authorization")`` is read.
        data: Unused. Kept so existing callers that pass it keep working; the
            default is ``None`` rather than a shared mutable ``{}``.

    Returns:
        - ``{"sub": "local"}`` when ``settings.AUTH_TYPE`` is neither
          ``"simple_jwt"`` nor ``"session_jwt"``.
        - ``None`` when a JWT mode is active but no Authorization header is set.
        - The decoded claims when the token decodes successfully.
        - ``{"message": ..., "error": "invalid_token"}`` when decoding fails.
    """
    if settings.AUTH_TYPE not in ("simple_jwt", "session_jwt"):
        return {"sub": "local"}

    auth_header = request.headers.get("Authorization")
    if not auth_header:
        return None

    # Remove only a leading auth scheme. The scheme name is matched
    # case-insensitively, and a header that carries just the raw token is
    # passed through unchanged.
    token = auth_header.strip()
    scheme, _, remainder = token.partition(" ")
    if scheme.lower() == "bearer":
        token = remainder.strip()

    try:
        return jwt.decode(
            token,
            settings.JWT_SECRET_KEY,
            algorithms=["HS256"],
            # NOTE(review): expiry claims are not verified here - confirm intended.
            options={"verify_exp": False},
        )
    except Exception:
        return {
            "message": "Authentication error: invalid token",
            "error": "invalid_token",
        }
def gen_cache(func):
    """
    Decorator that caches string LLM responses in Redis for 30 minutes.

    The wrapped callable must accept ``(self, model, messages, stream,
    tools=None, *args, **kwargs)``. Caching is skipped entirely when ``tools``
    is not ``None``. Any failure to build the cache key or to talk to Redis is
    logged and the call falls through to ``func``; only ``str`` results are
    stored.

    Args:
        func: The generation method to wrap.

    Returns:
        The wrapping function, carrying ``func``'s name and docstring.
    """
    import functools  # local import: only this decorator needs it

    @functools.wraps(func)
    def wrapper(self, model, messages, stream, tools=None, *args, **kwargs):
        if tools is not None:
            return func(self, model, messages, stream, tools, *args, **kwargs)

        try:
            cache_key = gen_cache_key(messages, model, tools)
        except (ValueError, TypeError) as e:
            # TypeError covers messages that json.dumps cannot serialize (or
            # that are not iterable); caching is best-effort, so bypass it.
            logger.error(f"Cache key generation failed: {e}")
            return func(self, model, messages, stream, tools, *args, **kwargs)

        redis_client = get_redis_instance()
        if redis_client:
            try:
                cached_response = redis_client.get(cache_key)
                if cached_response:
                    return cached_response.decode("utf-8")
            except Exception as e:
                logger.error(f"Error getting cached response: {e}", exc_info=True)

        result = func(self, model, messages, stream, tools, *args, **kwargs)

        if redis_client and isinstance(result, str):
            try:
                redis_client.set(cache_key, result, ex=1800)
            except Exception as e:
                logger.error(f"Error setting cache: {e}", exc_info=True)
        return result

    return wrapper
def normalize_json_schema_payload(json_schema: Any) -> Optional[Dict[str, Any]]:
    """
    Reduce any accepted JSON schema payload shape to the bare schema object.

    Accepted inputs:
    - None (returned as None)
    - A wrapped payload whose top-level "schema" value is an object; that
      inner object is returned
    - A raw schema object carrying a top-level "type"; returned unchanged

    Raises:
        JsonSchemaValidationError: if the payload is not a dict, if "schema"
            is present but not a dict, or if neither "schema" nor "type" is
            provided.
    """
    if json_schema is None:
        return None

    if not isinstance(json_schema, dict):
        raise JsonSchemaValidationError("must be a valid JSON object")

    inner = json_schema.get("schema")

    # No usable wrapper: the payload itself must be a raw schema.
    if inner is None:
        if "type" in json_schema:
            return json_schema
        raise JsonSchemaValidationError(
            'must include either a "type" or "schema" field'
        )

    if isinstance(inner, dict):
        return inner

    raise JsonSchemaValidationError('field "schema" must be a valid JSON object')
# Models registered for the OpenAI provider. Both entries declare tool and
# structured-output support, accept the OPENAI_ATTACHMENTS MIME types, and
# advertise a 200000-token context window.
OPENAI_MODELS = [
    AvailableModel(
        id="gpt-5.1",
        provider=ModelProvider.OPENAI,
        display_name="GPT-5.1",
        description="Flagship model with enhanced reasoning, coding, and agentic capabilities",
        capabilities=ModelCapabilities(
            supports_tools=True,
            supports_structured_output=True,
            supported_attachment_types=OPENAI_ATTACHMENTS,
            context_window=200000,
        ),
    ),
    AvailableModel(
        id="gpt-5-mini",
        provider=ModelProvider.OPENAI,
        display_name="GPT-5 Mini",
        description="Faster, cost-effective variant of GPT-5.1",
        capabilities=ModelCapabilities(
            supports_tools=True,
            supports_structured_output=True,
            supported_attachment_types=OPENAI_ATTACHMENTS,
            context_window=200000,
        ),
    )
]
# Models registered for the Groq provider. No attachment types or
# structured-output flag are passed, so the ModelCapabilities defaults apply
# (empty attachment list, structured output disabled).
GROQ_MODELS = [
    AvailableModel(
        id="llama-3.3-70b-versatile",
        provider=ModelProvider.GROQ,
        display_name="Llama 3.3 70B",
        description="Latest Llama model with high-speed inference",
        capabilities=ModelCapabilities(
            supports_tools=True,
            context_window=128000,
        ),
    ),
    AvailableModel(
        id="openai/gpt-oss-120b",
        provider=ModelProvider.GROQ,
        display_name="GPT-OSS 120B",
        description="Open-source GPT model optimized for speed",
        capabilities=ModelCapabilities(
            supports_tools=True,
            context_window=128000,
        ),
    ),
]
def create_custom_openai_model(model_name: str, base_url: str) -> AvailableModel:
    """
    Build a registry entry for a model served by an OpenAI-compatible endpoint
    (e.g., LM Studio, Ollama).

    Args:
        model_name: Used as both the model id and its display name.
        base_url: Endpoint URL; stored on the entry and quoted in its description.

    Returns:
        An ``AvailableModel`` under the OPENAI provider with tools enabled and
        the OPENAI_ATTACHMENTS types accepted.
    """
    capabilities = ModelCapabilities(
        supports_tools=True,
        supported_attachment_types=OPENAI_ATTACHMENTS,
    )
    custom_model = AvailableModel(
        id=model_name,
        provider=ModelProvider.OPENAI,
        display_name=model_name,
        description=f"Custom OpenAI-compatible model at {base_url}",
        base_url=base_url,
        capabilities=capabilities,
    )
    return custom_model
class ModelRegistry:
    """
    Singleton catalogue of the models this deployment can use.

    ``ModelRegistry()`` and ``ModelRegistry.get_instance()`` always return the
    same object. On first construction the registry reads ``settings`` and
    registers models per provider, then picks ``default_model_id``. Models are
    kept in insertion order in ``self.models`` (id -> AvailableModel); that
    order matters because the last-resort default is the first registered id.
    """

    _instance = None
    _initialized = False

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        # __init__ runs on every ModelRegistry() call; load only once.
        if not ModelRegistry._initialized:
            self.models: Dict[str, "AvailableModel"] = {}
            self.default_model_id: Optional[str] = None
            self._load_models()
            ModelRegistry._initialized = True

    @classmethod
    def get_instance(cls) -> "ModelRegistry":
        """Return the shared registry instance."""
        return cls()

    def _load_models(self):
        """Rebuild ``self.models`` and ``self.default_model_id`` from settings."""
        from application.core.settings import settings

        def provider_enabled(provider_api_key, provider_name):
            # A provider is enabled by its own key, or by being the configured
            # LLM_PROVIDER together with the generic API_KEY.
            return bool(
                provider_api_key
                or (settings.LLM_PROVIDER == provider_name and settings.API_KEY)
            )

        self.models.clear()
        # Reset as well, so a reload cannot keep a default that no longer exists.
        self.default_model_id = None

        # Skip DocsGPT model if using custom OpenAI-compatible endpoint
        if not settings.OPENAI_BASE_URL:
            self._add_docsgpt_models(settings)
        if (
            provider_enabled(settings.OPENAI_API_KEY, "openai")
            or settings.OPENAI_BASE_URL
        ):
            self._add_openai_models(settings)
        if provider_enabled(settings.OPENAI_API_BASE, "azure_openai"):
            self._add_azure_openai_models(settings)
        if provider_enabled(settings.ANTHROPIC_API_KEY, "anthropic"):
            self._add_anthropic_models(settings)
        if provider_enabled(settings.GOOGLE_API_KEY, "google"):
            self._add_google_models(settings)
        if provider_enabled(settings.GROQ_API_KEY, "groq"):
            self._add_groq_models(settings)
        if provider_enabled(settings.OPEN_ROUTER_API_KEY, "openrouter"):
            self._add_openrouter_models(settings)
        if provider_enabled(settings.HUGGINGFACE_API_KEY, "huggingface"):
            self._add_huggingface_models(settings)

        # Default model selection
        if settings.LLM_NAME:
            # LLM_NAME may be comma-separated; the first registered name wins.
            for model_name in self._parse_model_names(settings.LLM_NAME):
                if model_name in self.models:
                    self.default_model_id = model_name
                    break
            # Backward compat: try exact match if no parsed model found
            if not self.default_model_id and settings.LLM_NAME in self.models:
                self.default_model_id = settings.LLM_NAME

        if not self.default_model_id:
            if settings.LLM_PROVIDER and settings.API_KEY:
                for model_id, model in self.models.items():
                    if model.provider.value == settings.LLM_PROVIDER:
                        self.default_model_id = model_id
                        break
            # Last resort: the first model that was registered.
            if not self.default_model_id and self.models:
                self.default_model_id = next(iter(self.models.keys()))

        logger.info(
            f"ModelRegistry loaded {len(self.models)} models, default: {self.default_model_id}"
        )

    def _register_provider_models(
        self, provider_models, provider_name, provider_api_key, settings
    ):
        """
        Register a provider's predefined models.

        When the provider has no dedicated key, is the configured LLM_PROVIDER
        and LLM_NAME exactly equals one of its model ids, only that model is
        registered. In every other case all of ``provider_models`` are added.
        """
        pin_to_llm_name = (
            not provider_api_key
            and settings.LLM_PROVIDER == provider_name
            and settings.LLM_NAME
        )
        if pin_to_llm_name:
            for model in provider_models:
                if model.id == settings.LLM_NAME:
                    self.models[model.id] = model
                    return
        for model in provider_models:
            self.models[model.id] = model

    def _add_openai_models(self, settings):
        """Register OpenAI models, or custom models for an OpenAI-compatible endpoint."""
        from application.core.model_configs import (
            OPENAI_MODELS,
            create_custom_openai_model,
        )

        # Check if using local OpenAI-compatible endpoint (Ollama, LM Studio, etc.)
        using_local_endpoint = bool(
            settings.OPENAI_BASE_URL and settings.OPENAI_BASE_URL.strip()
        )

        if using_local_endpoint:
            # When OPENAI_BASE_URL is set, ONLY register custom models from LLM_NAME
            # Do NOT add standard OpenAI models (gpt-5.1, etc.)
            if settings.LLM_NAME:
                for model_name in self._parse_model_names(settings.LLM_NAME):
                    self.models[model_name] = create_custom_openai_model(
                        model_name, settings.OPENAI_BASE_URL
                    )
                    logger.info(
                        f"Registered custom OpenAI model: {model_name} at {settings.OPENAI_BASE_URL}"
                    )
        else:
            # Standard OpenAI API usage - add standard models if API key is valid
            # NOTE(review): _load_models also calls this when LLM_PROVIDER is
            # "openai" with only API_KEY set, and nothing is registered in that
            # case - confirm that is intended.
            if settings.OPENAI_API_KEY:
                for model in OPENAI_MODELS:
                    self.models[model.id] = model

    def _add_azure_openai_models(self, settings):
        """Register Azure OpenAI models (no dedicated-key shortcut for Azure)."""
        from application.core.model_configs import AZURE_OPENAI_MODELS

        self._register_provider_models(
            AZURE_OPENAI_MODELS, "azure_openai", None, settings
        )

    def _add_anthropic_models(self, settings):
        """Register Anthropic models."""
        from application.core.model_configs import ANTHROPIC_MODELS

        self._register_provider_models(
            ANTHROPIC_MODELS, "anthropic", settings.ANTHROPIC_API_KEY, settings
        )

    def _add_google_models(self, settings):
        """Register Google models."""
        from application.core.model_configs import GOOGLE_MODELS

        self._register_provider_models(
            GOOGLE_MODELS, "google", settings.GOOGLE_API_KEY, settings
        )

    def _add_groq_models(self, settings):
        """Register Groq models."""
        from application.core.model_configs import GROQ_MODELS

        self._register_provider_models(
            GROQ_MODELS, "groq", settings.GROQ_API_KEY, settings
        )

    def _add_openrouter_models(self, settings):
        """Register OpenRouter models."""
        from application.core.model_configs import OPENROUTER_MODELS

        self._register_provider_models(
            OPENROUTER_MODELS, "openrouter", settings.OPEN_ROUTER_API_KEY, settings
        )

    def _add_docsgpt_models(self, settings):
        """Register the built-in ``docsgpt-local`` model."""
        model_id = "docsgpt-local"
        model = AvailableModel(
            id=model_id,
            provider=ModelProvider.DOCSGPT,
            display_name="DocsGPT Model",
            description="Local model",
            capabilities=ModelCapabilities(
                supports_tools=False,
                supported_attachment_types=[],
            ),
        )
        self.models[model_id] = model

    def _add_huggingface_models(self, settings):
        """Register the ``huggingface-local`` model."""
        model_id = "huggingface-local"
        model = AvailableModel(
            id=model_id,
            provider=ModelProvider.HUGGINGFACE,
            display_name="Hugging Face Model",
            description="Local Hugging Face model",
            capabilities=ModelCapabilities(
                supports_tools=False,
                supported_attachment_types=[],
            ),
        )
        self.models[model_id] = model

    def _parse_model_names(self, llm_name: str) -> List[str]:
        """
        Parse LLM_NAME which may contain comma-separated model names.
        E.g., 'deepseek-r1:1.5b,gemma:2b' -> ['deepseek-r1:1.5b', 'gemma:2b']
        Surrounding whitespace is stripped and empty segments are dropped.
        """
        if not llm_name:
            return []
        return [name.strip() for name in llm_name.split(",") if name.strip()]

    def get_model(self, model_id: str) -> Optional["AvailableModel"]:
        """Return the model registered under ``model_id``, or None."""
        return self.models.get(model_id)

    def get_all_models(self) -> List["AvailableModel"]:
        """Return every registered model in registration order."""
        return list(self.models.values())

    def get_enabled_models(self) -> List["AvailableModel"]:
        """Return the registered models whose ``enabled`` flag is truthy."""
        return [m for m in self.models.values() if m.enabled]

    def model_exists(self, model_id: str) -> bool:
        """Return True when ``model_id`` is registered."""
        return model_id in self.models
def get_token_limit(model_id: str) -> int:
    """
    Look up the context window (token limit) for a model.

    Returns the registered model's ``capabilities.context_window``; when the
    id is not in the registry, returns ``settings.DEFAULT_LLM_TOKEN_LIMIT``.
    """
    from application.core.settings import settings

    entry = ModelRegistry.get_instance().get_model(model_id)
    if not entry:
        return settings.DEFAULT_LLM_TOKEN_LIMIT
    return entry.capabilities.context_window
class Settings(BaseSettings):
    """
    Application configuration; unknown keys are ignored (``extra="ignore"``).

    Every key-like field listed in the ``normalize_api_key`` validator is
    normalized so that "None", "none", empty and whitespace-only values become
    a real ``None``. Other code tests these keys for truthiness, so a field
    left out of that list would treat the literal string "None" as a
    configured key.
    """

    model_config = SettingsConfigDict(extra="ignore")

    AUTH_TYPE: Optional[str] = None  # simple_jwt, session_jwt, or None
    LLM_PROVIDER: str = "docsgpt"
    LLM_NAME: Optional[str] = (
        None  # if LLM_PROVIDER is openai, LLM_NAME can be gpt-4 or gpt-3.5-turbo
    )
    EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2"
    EMBEDDINGS_BASE_URL: Optional[str] = None  # Remote embeddings API URL (OpenAI-compatible)
    EMBEDDINGS_KEY: Optional[str] = (
        None  # api key for embeddings (if using openai, just copy API_KEY)
    )

    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
    MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
    MONGO_DB_NAME: str = "docsgpt"
    LLM_PATH: str = os.path.join(current_dir, "models/docsgpt-7b-f16.gguf")
    DEFAULT_MAX_HISTORY: int = 150
    DEFAULT_LLM_TOKEN_LIMIT: int = 128000  # Fallback when model not found in registry
    RESERVED_TOKENS: dict = {
        "system_prompt": 500,
        "current_query": 500,
        "safety_buffer": 1000,
    }
    DEFAULT_AGENT_LIMITS: dict = {
        "token_limit": 50000,
        "request_limit": 500,
    }
    UPLOAD_FOLDER: str = "inputs"
    PARSE_PDF_AS_IMAGE: bool = False
    PARSE_IMAGE_REMOTE: bool = False
    DOCLING_OCR_ENABLED: bool = False  # Enable OCR for docling parsers (PDF, images)
    DOCLING_OCR_ATTACHMENTS_ENABLED: bool = False  # Enable OCR for docling when parsing attachments
    VECTOR_STORE: str = (
        "faiss"  # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb" or "pgvector"
    )
    RETRIEVERS_ENABLED: list = ["classic_rag"]
    AGENT_NAME: str = "classic"
    FALLBACK_LLM_PROVIDER: Optional[str] = None  # provider for fallback llm
    FALLBACK_LLM_NAME: Optional[str] = None  # model name for fallback llm
    FALLBACK_LLM_API_KEY: Optional[str] = None  # api key for fallback llm

    # Google Drive integration
    GOOGLE_CLIENT_ID: Optional[str] = (
        None  # Replace with your actual Google OAuth client ID
    )
    GOOGLE_CLIENT_SECRET: Optional[str] = (
        None  # Replace with your actual Google OAuth client secret
    )
    CONNECTOR_REDIRECT_BASE_URI: Optional[str] = (
        "http://127.0.0.1:7091/api/connectors/callback"  # add this redirect url as-is to your provider's console (gcp)
    )

    # Microsoft Entra ID (Azure AD) integration
    MICROSOFT_CLIENT_ID: Optional[str] = None  # Azure AD Application (client) ID
    MICROSOFT_CLIENT_SECRET: Optional[str] = None  # Azure AD Application client secret
    MICROSOFT_TENANT_ID: Optional[str] = "common"  # Azure AD Tenant ID (or 'common' for multi-tenant)
    MICROSOFT_AUTHORITY: Optional[str] = None  # e.g., "https://login.microsoftonline.com/{tenant_id}"

    # GitHub source
    GITHUB_ACCESS_TOKEN: Optional[str] = None  # PAT token with read repo access

    # LLM Cache
    CACHE_REDIS_URL: str = "redis://localhost:6379/2"

    API_URL: str = "http://localhost:7091"  # backend url for celery worker
    MCP_OAUTH_REDIRECT_URI: Optional[str] = None  # public callback URL for MCP OAuth
    INTERNAL_KEY: Optional[str] = None  # internal api key for worker-to-backend auth

    API_KEY: Optional[str] = None  # LLM api key (used by LLM_PROVIDER)

    # Provider-specific API keys (for multi-model support)
    OPENAI_API_KEY: Optional[str] = None
    ANTHROPIC_API_KEY: Optional[str] = None
    GOOGLE_API_KEY: Optional[str] = None
    GROQ_API_KEY: Optional[str] = None
    HUGGINGFACE_API_KEY: Optional[str] = None
    OPEN_ROUTER_API_KEY: Optional[str] = None

    OPENAI_API_BASE: Optional[str] = None  # azure openai api base url
    OPENAI_API_VERSION: Optional[str] = None  # azure openai api version
    AZURE_DEPLOYMENT_NAME: Optional[str] = None  # azure deployment name for answering
    AZURE_EMBEDDINGS_DEPLOYMENT_NAME: Optional[str] = (
        None  # azure deployment name for embeddings
    )
    OPENAI_BASE_URL: Optional[str] = (
        None  # openai base url for OpenAI-compatible models
    )

    # elasticsearch
    ELASTIC_CLOUD_ID: Optional[str] = None  # cloud id for elasticsearch
    ELASTIC_USERNAME: Optional[str] = None  # username for elasticsearch
    ELASTIC_PASSWORD: Optional[str] = None  # password for elasticsearch
    ELASTIC_URL: Optional[str] = None  # url for elasticsearch
    ELASTIC_INDEX: Optional[str] = "docsgpt"  # index name for elasticsearch

    # SageMaker config
    SAGEMAKER_ENDPOINT: Optional[str] = None  # SageMaker endpoint name
    SAGEMAKER_REGION: Optional[str] = None  # SageMaker region name
    SAGEMAKER_ACCESS_KEY: Optional[str] = None  # SageMaker access key
    SAGEMAKER_SECRET_KEY: Optional[str] = None  # SageMaker secret key

    # prem ai project id
    PREMAI_PROJECT_ID: Optional[str] = None

    # Qdrant vectorstore config
    QDRANT_COLLECTION_NAME: Optional[str] = "docsgpt"
    QDRANT_LOCATION: Optional[str] = None
    QDRANT_URL: Optional[str] = None
    QDRANT_PORT: Optional[int] = 6333
    QDRANT_GRPC_PORT: int = 6334
    QDRANT_PREFER_GRPC: bool = False
    QDRANT_HTTPS: Optional[bool] = None
    QDRANT_API_KEY: Optional[str] = None
    QDRANT_PREFIX: Optional[str] = None
    QDRANT_TIMEOUT: Optional[float] = None
    QDRANT_HOST: Optional[str] = None
    QDRANT_PATH: Optional[str] = None
    QDRANT_DISTANCE_FUNC: str = "Cosine"

    # PGVector vectorstore config
    PGVECTOR_CONNECTION_STRING: Optional[str] = None

    # Milvus vectorstore config
    MILVUS_COLLECTION_NAME: Optional[str] = "docsgpt"
    MILVUS_URI: Optional[str] = "./milvus_local.db"  # milvus lite version as default
    MILVUS_TOKEN: Optional[str] = ""

    # LanceDB vectorstore config
    LANCEDB_PATH: str = "./data/lancedb"  # Path where LanceDB stores its local data
    LANCEDB_TABLE_NAME: Optional[str] = (
        "docsgpts"  # Name of the table to use for storing vectors
    )

    FLASK_DEBUG_MODE: bool = False
    STORAGE_TYPE: str = "local"  # local or s3
    URL_STRATEGY: str = "backend"  # backend or s3

    JWT_SECRET_KEY: str = ""

    # Encryption settings
    # NOTE(review): this default is a fixed, publicly known value - confirm
    # deployments are expected to override it.
    ENCRYPTION_SECRET_KEY: str = "default-docsgpt-encryption-key"

    TTS_PROVIDER: str = "google_tts"  # google_tts or elevenlabs
    ELEVENLABS_API_KEY: Optional[str] = None
    STT_PROVIDER: str = "openai"  # openai or faster_whisper
    OPENAI_STT_MODEL: str = "gpt-4o-mini-transcribe"
    STT_LANGUAGE: Optional[str] = None
    STT_MAX_FILE_SIZE_MB: int = 50
    STT_ENABLE_TIMESTAMPS: bool = False
    STT_ENABLE_DIARIZATION: bool = False

    # Tool pre-fetch settings
    ENABLE_TOOL_PREFETCH: bool = True

    # Conversation Compression Settings
    ENABLE_CONVERSATION_COMPRESSION: bool = True
    COMPRESSION_THRESHOLD_PERCENTAGE: float = 0.8  # Trigger at 80% of context
    COMPRESSION_MODEL_OVERRIDE: Optional[str] = None  # Use different model for compression
    COMPRESSION_PROMPT_VERSION: str = "v1.0"  # Track prompt iterations
    COMPRESSION_MAX_HISTORY_POINTS: int = 3  # Keep only last N compression points to prevent DB bloat

    @field_validator(
        "API_KEY",
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
        "GOOGLE_API_KEY",
        "GROQ_API_KEY",
        "HUGGINGFACE_API_KEY",
        # Previously missing: without it OPEN_ROUTER_API_KEY=None in .env stayed
        # the truthy string "None".
        "OPEN_ROUTER_API_KEY",
        "EMBEDDINGS_KEY",
        "FALLBACK_LLM_API_KEY",
        "QDRANT_API_KEY",
        "ELEVENLABS_API_KEY",
        "INTERNAL_KEY",
        mode="before",
    )
    @classmethod
    def normalize_api_key(cls, v: Optional[str]) -> Optional[str]:
        """
        Normalize API keys: convert 'None', 'none', empty strings,
        and whitespace-only strings to actual None.
        Handles Pydantic loading 'None' from .env as string "None".
        Non-string values are returned unchanged; other strings are stripped.
        """
        if v is None:
            return None
        if not isinstance(v, str):
            return v
        stripped = v.strip()
        if stripped == "" or stripped.lower() == "none":
            return None
        return stripped
def is_private_ip(ip_str: str) -> bool:
    """
    Check if an IP address points at a non-public (internal) destination.

    An address is treated as internal when it is private, loopback,
    link-local, reserved, multicast, unspecified, or inside the RFC 6598
    shared address space (100.64.0.0/10). The last range is checked
    explicitly because ``ipaddress`` reports ``is_private == False`` for it,
    even though it is not publicly routable and is used for internal
    services by some providers.

    Args:
        ip_str: IP address as a string (IPv4 or IPv6).

    Returns:
        True if the IP is private/internal, False otherwise. Strings that do
        not parse as an IP address yield False.
    """
    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        # If we can't parse it as an IP, it is not a private IP.
        return False

    if (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_multicast
        or ip.is_unspecified
    ):
        return True

    # For IPv4-mapped IPv6 (::ffff:a.b.c.d) test the embedded IPv4 address so
    # the shared-address-space check cannot be sidestepped via IPv6 notation.
    ipv4 = ip if ip.version == 4 else ip.ipv4_mapped
    return ipv4 is not None and ipv4 in ipaddress.ip_network("100.64.0.0/10")
def validate_url_safe(url: str, allow_localhost: bool = False) -> tuple[bool, str, Optional[str]]:
    """
    Non-raising wrapper around validate_url.

    Use this when a failed SSRF check should be reported to the caller as
    data instead of interrupting control flow with an exception.

    Args:
        url: The URL to validate
        allow_localhost: Forwarded unchanged to validate_url

    Returns:
        (True, validated_url, None) when validation succeeds, or
        (False, original_url, error_message) when validate_url raises
        SSRFError. Any other exception propagates to the caller.
    """
    try:
        checked_url = validate_url(url, allow_localhost)
    except SSRFError as exc:
        # Report the rejection reason together with the untouched input.
        return (False, url, str(exc))
    else:
        return (True, checked_url, None)
return original ================================================ FILE: application/llm/__init__.py ================================================ ================================================ FILE: application/llm/anthropic.py ================================================ import base64 import logging from anthropic import AI_PROMPT, Anthropic, HUMAN_PROMPT from application.core.settings import settings from application.llm.base import BaseLLM from application.storage.storage_creator import StorageCreator logger = logging.getLogger(__name__) class AnthropicLLM(BaseLLM): def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs): super().__init__(*args, **kwargs) self.api_key = api_key or settings.ANTHROPIC_API_KEY or settings.API_KEY self.user_api_key = user_api_key # Use custom base_url if provided if base_url: self.anthropic = Anthropic(api_key=self.api_key, base_url=base_url) else: self.anthropic = Anthropic(api_key=self.api_key) self.HUMAN_PROMPT = HUMAN_PROMPT self.AI_PROMPT = AI_PROMPT self.storage = StorageCreator.get_storage() def _raw_gen( self, baseself, model, messages, stream=False, tools=None, max_tokens=300, **kwargs, ): context = messages[0]["content"] user_question = messages[-1]["content"] prompt = f"### Context \n {context} \n ### Question \n {user_question}" if stream: return self.gen_stream(model, prompt, stream, max_tokens, **kwargs) completion = self.anthropic.completions.create( model=model, max_tokens_to_sample=max_tokens, stream=stream, prompt=f"{self.HUMAN_PROMPT} {prompt}{self.AI_PROMPT}", ) return completion.completion def _raw_gen_stream( self, baseself, model, messages, stream=True, tools=None, max_tokens=300, **kwargs, ): context = messages[0]["content"] user_question = messages[-1]["content"] prompt = f"### Context \n {context} \n ### Question \n {user_question}" stream_response = self.anthropic.completions.create( model=model, prompt=f"{self.HUMAN_PROMPT} {prompt}{self.AI_PROMPT}", 
def prepare_messages_with_attachments(self, messages, attachments=None):
    """
    Process attachments for Anthropic Claude API.

    Image attachments are appended, in Claude's vision message format, to
    the last user message. The caller's ``messages`` list and the message
    dictionaries inside it are left untouched: the list is copied and the
    one message that receives attachments is copied (together with its
    content list) before being modified.

    Args:
        messages (list): List of message dictionaries.
        attachments (list): List of attachment dictionaries with content and metadata.
            Entries whose ``mime_type`` does not start with ``image/`` are ignored.
            An entry with a ``data`` key is taken as already base64-encoded;
            otherwise the image is loaded via ``_get_base64_image``.

    Returns:
        list: Messages formatted with image content for Claude API. When
        ``attachments`` is empty the original ``messages`` object is returned.
    """
    if not attachments:
        return messages

    prepared_messages = messages.copy()

    # Find the last user message to attach images to
    user_message_index = None
    for i in range(len(prepared_messages) - 1, -1, -1):
        if prepared_messages[i].get("role") == "user":
            user_message_index = i
            break

    if user_message_index is None:
        prepared_messages.append({"role": "user", "content": []})
        user_message_index = len(prepared_messages) - 1
    else:
        # list.copy() is shallow: without this, rewriting "content" below
        # would also rewrite the dictionary owned by the caller.
        prepared_messages[user_message_index] = dict(
            prepared_messages[user_message_index]
        )

    target_message = prepared_messages[user_message_index]
    existing_content = target_message.get("content")

    # Normalise content to a fresh list of content parts
    if isinstance(existing_content, str):
        target_message["content"] = [{"type": "text", "text": existing_content}]
    elif isinstance(existing_content, list):
        # Copy so that appending images does not grow the caller's list.
        target_message["content"] = list(existing_content)
    else:
        target_message["content"] = []

    for attachment in attachments:
        mime_type = attachment.get("mime_type")
        if not (mime_type and mime_type.startswith("image/")):
            continue
        try:
            # Pre-converted images (from PDF-to-image conversion) already
            # carry their base64 payload under the 'data' key
            if "data" in attachment:
                base64_image = attachment["data"]
            else:
                base64_image = self._get_base64_image(attachment)

            # Claude uses a specific format for images
            target_message["content"].append(
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": mime_type,
                        "data": base64_image,
                    },
                }
            )
        except Exception as e:
            logger.error(f"Error processing image attachment: {e}", exc_info=True)
            # Leave a textual marker so the model knows an image was dropped
            if "content" in attachment:
                target_message["content"].append(
                    {
                        "type": "text",
                        "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
                    }
                )

    return prepared_messages
FALLBACK_* settings.""" if self._fallback_llm is None and settings.FALLBACK_LLM_PROVIDER: try: from application.llm.llm_creator import LLMCreator self._fallback_llm = LLMCreator.create_llm( settings.FALLBACK_LLM_PROVIDER, api_key=settings.FALLBACK_LLM_API_KEY or settings.API_KEY, user_api_key=getattr(self, "user_api_key", None), decoded_token=self.decoded_token, model_id=settings.FALLBACK_LLM_NAME, agent_id=self.agent_id, ) logger.info( f"Fallback LLM initialized: {settings.FALLBACK_LLM_PROVIDER}/{settings.FALLBACK_LLM_NAME}" ) except Exception as e: logger.error( f"Failed to initialize fallback LLM: {str(e)}", exc_info=True ) return self._fallback_llm @staticmethod def _remove_null_values(args_dict): if not isinstance(args_dict, dict): return args_dict return {k: v for k, v in args_dict.items() if v is not None} def _execute_with_fallback( self, method_name: str, decorators: list, *args, **kwargs ): """ Execute method with fallback support. Args: method_name: Name of the raw method ('_raw_gen' or '_raw_gen_stream') decorators: List of decorators to apply *args: Positional arguments **kwargs: Keyword arguments """ def decorated_method(): method = getattr(self, method_name) for decorator in decorators: method = decorator(method) return method(self, *args, **kwargs) try: return decorated_method() except Exception as e: if not self.fallback_llm: logger.error(f"Primary LLM failed and no fallback configured: {str(e)}") raise logger.warning( f"Primary LLM failed. Falling back to {settings.FALLBACK_LLM_PROVIDER}/{settings.FALLBACK_LLM_NAME}. 
def supports_structured_output(self):
    """
    Check if the LLM supports structured output/JSON schema enforcement.

    The answer comes from calling the ``_supports_structured_output`` hook,
    not from merely checking that the hook exists. Testing only for
    existence reports support even when the hook returns False.

    Returns:
        bool: Truthiness of the hook's result; False when the object has no
        callable ``_supports_structured_output`` attribute.
    """
    hook = getattr(self, "_supports_structured_output", None)
    if not callable(hook):
        return False
    return bool(hook())
def get_supported_attachment_types(self):
    """
    Return a list of MIME types supported by Google Gemini for file uploads.

    Each MIME type appears exactly once.

    Returns:
        list: List of supported MIME types
    """
    return [
        "application/pdf",
        "image/png",
        "image/jpeg",
        "image/jpg",
        "image/webp",
        "image/gif",
    ]
prepared_messages[user_message_index]["content"].append( { "type": "text", "text": f"[File could not be processed: {attachment.get('path', 'unknown')}]", } ) if files: logging.info(f"GoogleLLM: Adding {len(files)} files to message") prepared_messages[user_message_index]["content"].append({"files": files}) return prepared_messages def _upload_file_to_google(self, attachment): """ Upload a file to Google AI and return the file URI. Args: attachment (dict): Attachment dictionary with path and metadata. Returns: str: Google AI file URI for the uploaded file. """ if "google_file_uri" in attachment: return attachment["google_file_uri"] file_path = attachment.get("path") if not file_path: raise ValueError("No file path provided in attachment") if not self.storage.file_exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") try: file_uri = self.storage.process_file( file_path, lambda local_path, **kwargs: self.client.files.upload( file=local_path ).uri, ) from application.core.mongo_db import MongoDB mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] attachments_collection = db["attachments"] if "_id" in attachment: attachments_collection.update_one( {"_id": attachment["_id"]}, {"$set": {"google_file_uri": file_uri}} ) return file_uri except Exception as e: logging.error(f"Error uploading file to Google AI: {e}", exc_info=True) raise def _clean_messages_google(self, messages): """ Convert OpenAI format messages to Google AI format and collect system prompts. Returns: tuple[list[types.Content], Optional[str]]: cleaned messages and optional combined system instruction. 
""" cleaned_messages = [] system_instructions = [] def _extract_system_text(content): if isinstance(content, str): return content if isinstance(content, list): parts = [] for item in content: if ( isinstance(item, dict) and "text" in item and item["text"] is not None ): parts.append(item["text"]) return "\n".join(parts) return "" for message in messages: role = message.get("role") content = message.get("content") # Gemini only accepts user/model in the contents list. if role == "system": sys_text = _extract_system_text(content) if sys_text: system_instructions.append(sys_text) continue if role == "assistant": role = "model" elif role == "tool": role = "model" parts = [] if role and content is not None: if isinstance(content, str): parts = [types.Part.from_text(text=content)] elif isinstance(content, list): for item in content: if "text" in item: parts.append(types.Part.from_text(text=item["text"])) elif "function_call" in item: # Remove null values from args to avoid API errors cleaned_args = self._remove_null_values( item["function_call"]["args"] ) # Create function call part with thought_signature if present # For Gemini 3 models, we need to include thought_signature if "thought_signature" in item: # Use Part constructor with functionCall and thoughtSignature parts.append( types.Part( functionCall=types.FunctionCall( name=item["function_call"]["name"], args=cleaned_args, ), thoughtSignature=item["thought_signature"], ) ) else: # Use helper method when no thought_signature parts.append( types.Part.from_function_call( name=item["function_call"]["name"], args=cleaned_args, ) ) elif "function_response" in item: parts.append( types.Part.from_function_response( name=item["function_response"]["name"], response=item["function_response"]["response"], ) ) elif "files" in item: for file_data in item["files"]: parts.append( types.Part.from_uri( file_uri=file_data["file_uri"], mime_type=file_data["mime_type"], ) ) else: raise ValueError( f"Unexpected content dictionary 
format:{item}" ) else: raise ValueError(f"Unexpected content type: {type(content)}") if parts: cleaned_messages.append(types.Content(role=role, parts=parts)) system_instruction = ( "\n\n".join(system_instructions) if system_instructions else None ) return cleaned_messages, system_instruction def _clean_schema(self, schema_obj): """ Recursively remove unsupported fields from schema objects and validate required properties. """ if not isinstance(schema_obj, dict): return schema_obj allowed_fields = { "type", "description", "items", "properties", "required", "enum", "pattern", "minimum", "maximum", "nullable", "default", } cleaned = {} for key, value in schema_obj.items(): if key not in allowed_fields: continue elif key == "type" and isinstance(value, str): cleaned[key] = value.upper() elif isinstance(value, dict): cleaned[key] = self._clean_schema(value) elif isinstance(value, list): cleaned[key] = [self._clean_schema(item) for item in value] else: cleaned[key] = value # Validate that required properties actually exist in properties if "required" in cleaned and "properties" in cleaned: valid_required = [] properties_keys = set(cleaned["properties"].keys()) for required_prop in cleaned["required"]: if required_prop in properties_keys: valid_required.append(required_prop) if valid_required: cleaned["required"] = valid_required else: cleaned.pop("required", None) elif "required" in cleaned and "properties" not in cleaned: cleaned.pop("required", None) return cleaned def _clean_tools_format(self, tools_list): """Convert OpenAI format tools to Google AI format.""" genai_tools = [] for tool_data in tools_list: if tool_data["type"] == "function": function = tool_data["function"] parameters = function["parameters"] properties = parameters.get("properties", {}) if properties: cleaned_properties = {} for k, v in properties.items(): cleaned_properties[k] = self._clean_schema(v) genai_function = dict( name=function["name"], description=function["description"], parameters={ 
"type": "OBJECT", "properties": cleaned_properties, "required": ( parameters["required"] if "required" in parameters else [] ), }, ) else: genai_function = dict( name=function["name"], description=function["description"], ) genai_tool = types.Tool(function_declarations=[genai_function]) genai_tools.append(genai_tool) return genai_tools def _extract_preview_from_message(self, message): """Get a short, human-readable preview from the last message.""" try: if hasattr(message, "parts"): for part in reversed(message.parts): if getattr(part, "text", None): return part.text function_call = getattr(part, "function_call", None) if function_call: name = getattr(function_call, "name", "") or "function_call" return f"function_call:{name}" function_response = getattr(part, "function_response", None) if function_response: name = ( getattr(function_response, "name", "") or "function_response" ) return f"function_response:{name}" if isinstance(message, dict): content = message.get("content") if isinstance(content, str): return content if isinstance(content, list): for item in reversed(content): if isinstance(item, str): return item if isinstance(item, dict): if item.get("text"): return item["text"] if item.get("function_call"): fn = item["function_call"] if isinstance(fn, dict): name = fn.get("name") or "function_call" return f"function_call:{name}" return "function_call" if item.get("function_response"): resp = item["function_response"] if isinstance(resp, dict): name = resp.get("name") or "function_response" return f"function_response:{name}" return "function_response" if "text" in message and isinstance(message["text"], str): return message["text"] except Exception: pass return str(message) def _summarize_messages_for_log(self, messages, preview_chars=20): """Return a compact summary for logging to avoid huge payloads.""" message_count = len(messages) if messages else 0 last_preview = "" if messages: last_preview = self._extract_preview_from_message(messages[-1]) or "" 
last_preview = str(last_preview).replace("\n", " ") if len(last_preview) > preview_chars: last_preview = f"{last_preview[:preview_chars]}..." return f"count={message_count}, last='{last_preview}'" @staticmethod def _get_text_value(part): """Get text from both SDK objects and dict-shaped test doubles.""" if isinstance(part, dict): value = part.get("text") return value if isinstance(value, str) else "" value = getattr(part, "text", None) return value if isinstance(value, str) else "" @staticmethod def _is_thought_part(part): """Detect Gemini thinking parts when available.""" if isinstance(part, dict): return bool(part.get("thought")) return bool(getattr(part, "thought", False)) def _raw_gen( self, baseself, model, messages, stream=False, tools=None, formatting="openai", response_schema=None, **kwargs, ): """Generate content using Google AI API without streaming.""" client = genai.Client(api_key=self.api_key) system_instruction = None if formatting == "openai": messages, system_instruction = self._clean_messages_google(messages) config = types.GenerateContentConfig() if system_instruction: config.system_instruction = system_instruction if tools: cleaned_tools = self._clean_tools_format(tools) config.tools = cleaned_tools # Add response schema for structured output if provided if response_schema: config.response_schema = response_schema config.response_mime_type = "application/json" response = client.models.generate_content( model=model, contents=messages, config=config, ) if tools: return response else: return response.text def _raw_gen_stream( self, baseself, model, messages, stream=True, tools=None, formatting="openai", response_schema=None, **kwargs, ): """Generate content using Google AI API with streaming.""" client = genai.Client(api_key=self.api_key) system_instruction = None if formatting == "openai": messages, system_instruction = self._clean_messages_google(messages) config = types.GenerateContentConfig() if system_instruction: config.system_instruction = 
def prepare_structured_output_format(self, json_schema):
    """
    Translate a JSON schema into the schema dialect sent to Google AI.

    Type names are mapped to their upper-case Gemini spelling, a fixed set
    of keywords is carried over, nested schemas (properties, items,
    anyOf/oneOf/allOf) are translated recursively, and OBJECT nodes with
    properties get a ``propertyOrdering`` derived from the property order
    unless one is already given.

    Args:
        json_schema: The source schema; falsy values short-circuit to None.

    Returns:
        dict | None: The translated schema, or None when no schema was given
        or the translation raised (the error is logged).
    """
    if not json_schema:
        return None

    gemini_types = {
        "object": "OBJECT",
        "array": "ARRAY",
        "string": "STRING",
        "integer": "INTEGER",
        "number": "NUMBER",
        "boolean": "BOOLEAN",
    }
    passthrough_keys = (
        "description",
        "nullable",
        "enum",
        "minItems",
        "maxItems",
        "required",
        "propertyOrdering",
    )
    combinators = ("anyOf", "oneOf", "allOf")

    def translate(node):
        # Leaves (strings, numbers, ...) are not schemas; keep them verbatim.
        if not isinstance(node, dict):
            return node

        out = {}
        declared_type = node.get("type")
        if declared_type:
            out["type"] = gemini_types.get(
                declared_type.lower(), declared_type.upper()
            )

        out.update((key, node[key]) for key in passthrough_keys if key in node)

        if "format" in node:
            fmt = node["format"]
            if declared_type != "string":
                out["format"] = fmt
            elif fmt == "date":
                # Plain dates are sent as date-time.
                out["format"] = "date-time"
            elif fmt in ("enum", "date-time"):
                out["format"] = fmt
            # Any other string format is intentionally left out.

        if "properties" in node:
            out["properties"] = {
                name: translate(sub_schema)
                for name, sub_schema in node["properties"].items()
            }
            if out.get("type") == "OBJECT" and "propertyOrdering" not in out:
                out["propertyOrdering"] = list(out["properties"].keys())

        if "items" in node:
            out["items"] = translate(node["items"])

        for combinator in combinators:
            if combinator in node:
                out[combinator] = [translate(option) for option in node[combinator]]

        return out

    try:
        return translate(json_schema)
    except Exception as e:
        logging.error(
            f"Error preparing structured output format for Google: {e}",
            exc_info=True,
        )
        return None
@dataclass
class ToolCall:
    """Represents a tool/function call from the LLM."""

    # Provider- or handler-assigned identifier of the call.
    id: str
    # Name of the function/tool the model asked for.
    name: str
    # Call arguments, either as a raw string or as an already-parsed dict.
    arguments: Union[str, Dict]
    # Position of the call within the response, when the provider reports one.
    index: Optional[int] = None
    # Optional signature carried alongside a function call (used for Google
    # Gemini responses elsewhere in this module).
    thought_signature: Optional[str] = None

    @classmethod
    def from_dict(cls, data: Dict) -> "ToolCall":
        """
        Create ToolCall from dictionary.

        Args:
            data: Mapping that may contain ``id``, ``name``, ``arguments``,
                ``index`` and ``thought_signature``. Missing keys fall back to
                ``""`` for id/name, ``{}`` for arguments and ``None`` for the
                optional fields.

        Returns:
            A ToolCall populated from ``data``.
        """
        return cls(
            id=data.get("id", ""),
            name=data.get("name", ""),
            arguments=data.get("arguments", {}),
            index=data.get("index"),
            # Previously omitted, which silently dropped the signature when a
            # call was rebuilt from its dict form.
            thought_signature=data.get("thought_signature"),
        )
Args: agent: The agent instance initial_response: Initial LLM response tools_dict: Dictionary of available tools messages: Conversation history attachments: Optional attachments stream: Whether to use streaming Returns: Final response or generator for streaming """ messages = self.prepare_messages(agent, messages, attachments) if stream: return self.handle_streaming(agent, initial_response, tools_dict, messages) else: return self.handle_non_streaming( agent, initial_response, tools_dict, messages ) def prepare_messages( self, agent, messages: List[Dict], attachments: Optional[List] = None ) -> List[Dict]: """ Prepare messages with attachments and provider-specific formatting. Args: agent: The agent instance messages: Original messages attachments: List of attachments Returns: Prepared messages list """ if not attachments: return messages logger.info(f"Preparing messages with {len(attachments)} attachments") supported_types = agent.llm.get_supported_attachment_types() # Check if provider supports images but not PDF (synthetic PDF support) supports_images = any(t.startswith("image/") for t in supported_types) supports_pdf = "application/pdf" in supported_types # Process attachments, converting PDFs to images if needed processed_attachments = [] for attachment in attachments: mime_type = attachment.get("mime_type") # Synthetic PDF support: convert PDF to images if LLM supports images but not PDF if mime_type == "application/pdf" and supports_images and not supports_pdf: logger.info( f"Converting PDF to images for synthetic PDF support: {attachment.get('path', 'unknown')}" ) try: converted_images = self._convert_pdf_to_images(attachment) processed_attachments.extend(converted_images) logger.info( f"Converted PDF to {len(converted_images)} images" ) except Exception as e: logger.error( f"Failed to convert PDF to images, falling back to text: {e}" ) # Fall back to treating as unsupported (text extraction) processed_attachments.append(attachment) else: 
def _append_unsupported_attachments(
    self, messages: List[Dict], attachments: List[Dict]
) -> List[Dict]:
    """
    Default method to append unsupported attachment content to system prompt.

    The text of every attachment that has a ``content`` key is joined and
    appended to the first message whose role is ``system``. When no system
    message exists, a new one is inserted at the front of the returned list.
    The caller's list and its message dicts are not modified.

    Args:
        messages: Current messages
        attachments: List of unsupported attachments

    Returns:
        Updated messages list (a new list; unchanged content when no
        attachment carries ``content``)
    """
    prepared_messages = messages.copy()

    attachment_texts = []
    for attachment in attachments:
        logger.info(f"Adding attachment {attachment.get('id')} to context")
        if "content" in attachment:
            attachment_texts.append(
                f"Attached file content:\n\n{attachment['content']}"
            )

    if not attachment_texts:
        return prepared_messages

    combined_text = "\n\n".join(attachment_texts)
    system_index = next(
        (
            idx
            for idx, msg in enumerate(prepared_messages)
            if msg.get("role") == "system"
        ),
        None,
    )

    if system_index is None:
        prepared_messages.insert(
            0, {"role": "system", "content": f"\n\n{combined_text}"}
        )
        return prepared_messages

    # list.copy() is shallow: editing the found dict in place would also
    # rewrite the caller's system message, so attachment text would pile up
    # every time the same history is prepared again. Work on a copy instead.
    system_msg = dict(prepared_messages[system_index])
    # Treat a missing or None content as empty text.
    # NOTE(review): assumes system content is a string — confirm no provider
    # stores a list of parts on the system message.
    existing_content = system_msg.get("content") or ""
    system_msg["content"] = existing_content + f"\n\n{combined_text}"
    prepared_messages[system_index] = system_msg
    return prepared_messages
def _build_conversation_from_messages(self, messages: List[Dict]) -> Optional[Dict]:
    """
    Build a conversation-like dict from current messages so we can compress
    even when the conversation isn't persisted yet. Includes tool calls/results.

    Messages are walked in order and folded into "queries" (one prompt, one
    response, and the tool calls seen in between):

    - ``user`` sets the pending prompt, replacing any uncommitted one.
    - ``assistant``/``model`` with list content records its ``function_call``
      and ``function_response`` items as pending tool calls and commits nothing.
    - ``assistant``/``model`` with any other content commits a query using the
      extracted text as the response.
    - ``tool`` fills in the result of a pending call with a matching
      ``call_id``; otherwise the output is appended to the last committed query
      (or dropped when there is none).

    Args:
        messages: Message dicts; only their ``role`` and ``content`` are read.

    Returns:
        ``{"queries": [...], "compression_metadata": {...}}`` with the metadata
        marked as not compressed, or None when no query could be built.
    """
    queries = []
    current_prompt = None
    current_tool_calls = {}

    def _commit_query(response_text: str):
        # Flush the pending prompt/tool calls into `queries` and reset state.
        nonlocal current_prompt, current_tool_calls
        # NOTE(review): this returns before looking at current_tool_calls, so
        # pending tool calls without a pending prompt are never committed by
        # the final flush below — confirm this is intended.
        if current_prompt is None and not response_text:
            return
        tool_calls_list = list(current_tool_calls.values())
        queries.append(
            {
                "prompt": current_prompt or "",
                "response": response_text,
                "tool_calls": tool_calls_list,
            }
        )
        current_prompt = None
        current_tool_calls = {}

    for message in messages:
        role = message.get("role")
        content = message.get("content")

        if role == "user":
            current_prompt = self._extract_text_from_content(content)

        elif role in {"assistant", "model"}:
            # If this assistant turn contains tool calls, collect them; otherwise commit a response.
            if isinstance(content, list):
                for item in content:
                    if "function_call" in item:
                        fc = item["function_call"]
                        # Calls without a call_id get a fresh UUID as their key.
                        call_id = fc.get("call_id") or str(uuid.uuid4())
                        current_tool_calls[call_id] = {
                            "tool_name": "unknown_tool",
                            "action_name": fc.get("name"),
                            "arguments": fc.get("args"),
                            "result": None,
                            "status": "called",
                            "call_id": call_id,
                        }
                    elif "function_response" in item:
                        fr = item["function_response"]
                        # A response reusing an existing call_id overwrites
                        # that pending entry (its arguments become None).
                        call_id = fr.get("call_id") or str(uuid.uuid4())
                        current_tool_calls[call_id] = {
                            "tool_name": "unknown_tool",
                            "action_name": fr.get("name"),
                            "arguments": None,
                            "result": fr.get("response", {}).get("result"),
                            "status": "completed",
                            "call_id": call_id,
                        }
                # No direct assistant text here; continue to next message
                continue

            response_text = self._extract_text_from_content(content)
            _commit_query(response_text)

        elif role == "tool":
            # Attach tool outputs to the latest pending tool call if possible
            tool_text = self._extract_text_from_content(content)
            # Attempt to parse function_response style
            call_id = None
            if isinstance(content, list):
                for item in content:
                    if "function_response" in item and item["function_response"].get("call_id"):
                        call_id = item["function_response"]["call_id"]
                        break
            if call_id and call_id in current_tool_calls:
                current_tool_calls[call_id]["result"] = tool_text
                current_tool_calls[call_id]["status"] = "completed"
            elif queries:
                # No matching pending call: record the output on the most
                # recently committed query as an anonymous completed call.
                queries[-1].setdefault("tool_calls", []).append(
                    {
                        "tool_name": "unknown_tool",
                        "action_name": "unknown_action",
                        "arguments": {},
                        "result": tool_text,
                        "status": "completed",
                    }
                )

    # If there's an unfinished prompt with tool_calls but no response yet, commit it
    if current_prompt is not None or current_tool_calls:
        _commit_query(response_text="")

    if not queries:
        return None

    return {
        "queries": queries,
        "compression_metadata": {
            "is_compressed": False,
            "compression_points": [],
        },
    }
False, include_tool_calls: bool = False, ) -> Optional[List[Dict]]: """ Rebuild the message list after compression so tool execution can continue. Delegates to MessageBuilder for the actual reconstruction. """ from application.api.answer.services.compression.message_builder import ( MessageBuilder, ) return MessageBuilder.rebuild_messages_after_compression( messages=messages, compressed_summary=compressed_summary, recent_queries=recent_queries, include_current_execution=include_current_execution, include_tool_calls=include_tool_calls, ) def _perform_mid_execution_compression( self, agent, messages: List[Dict] ) -> tuple[bool, Optional[List[Dict]]]: """ Perform compression during tool execution and rebuild messages. Uses the new orchestrator for simplified compression. Args: agent: The agent instance messages: Current conversation messages Returns: (success: bool, rebuilt_messages: Optional[List[Dict]]) """ try: from application.api.answer.services.compression import ( CompressionOrchestrator, ) from application.api.answer.services.conversation_service import ( ConversationService, ) conversation_service = ConversationService() orchestrator = CompressionOrchestrator(conversation_service) # Get conversation from database (may be None for new sessions) conversation = conversation_service.get_conversation( agent.conversation_id, agent.initial_user_id ) if conversation: # Merge current in-flight messages (including tool calls) conversation_from_msgs = self._build_conversation_from_messages(messages) if conversation_from_msgs: conversation = conversation_from_msgs else: logger.warning( "Could not load conversation for compression; attempting in-memory compression" ) return self._perform_in_memory_compression(agent, messages) # Use orchestrator to perform compression result = orchestrator.compress_mid_execution( conversation_id=agent.conversation_id, user_id=agent.initial_user_id, model_id=agent.model_id, decoded_token=getattr(agent, "decoded_token", {}), 
def _perform_in_memory_compression(
    self, agent, messages: List[Dict]
) -> tuple[bool, Optional[List[Dict]]]:
    """
    Fallback compression path when the conversation is not yet persisted.
    Uses CompressionService directly without DB persistence.

    A synthetic conversation is built from ``messages``, compressed through
    the last query, and the message list is rebuilt from the resulting
    summary. If compression does not lower the token count, the method falls
    back to ``_prune_messages_minimal``.

    On success (including the pruning fallback) the agent's
    ``context_limit_reached`` flag is cleared and ``current_token_count`` is
    reset to 0; on a real compression the agent's ``compressed_summary``,
    ``compression_metadata`` and ``compression_saved`` are updated as well.

    Args:
        agent: The agent instance; ``model_id`` is read directly, while
            ``user_api_key``, ``decoded_token`` and ``agent_id`` are read with
            a None default.
        messages: Current conversation messages.

    Returns:
        ``(True, new_messages)`` on success, ``(False, None)`` otherwise. Any
        exception is logged and reported as ``(False, None)``.
    """
    try:
        # Imported locally rather than at module level.
        # NOTE(review): presumably to avoid an import cycle — confirm.
        from application.api.answer.services.compression.service import (
            CompressionService,
        )
        from application.core.model_utils import (
            get_api_key_for_provider,
            get_provider_from_model_id,
        )
        from application.core.settings import settings
        from application.llm.llm_creator import LLMCreator

        conversation = self._build_conversation_from_messages(messages)
        if not conversation:
            logger.warning(
                "Cannot perform in-memory compression: no user/assistant turns found"
            )
            return False, None

        # Prefer the configured override model; otherwise compress with the
        # agent's own model.
        compression_model = (
            settings.COMPRESSION_MODEL_OVERRIDE
            if settings.COMPRESSION_MODEL_OVERRIDE
            else agent.model_id
        )
        provider = get_provider_from_model_id(compression_model)
        api_key = get_api_key_for_provider(provider)
        compression_llm = LLMCreator.create_llm(
            provider,
            api_key,
            getattr(agent, "user_api_key", None),
            getattr(agent, "decoded_token", None),
            model_id=compression_model,
            agent_id=getattr(agent, "agent_id", None),
        )

        # Create service without DB persistence capability
        compression_service = CompressionService(
            llm=compression_llm,
            model_id=compression_model,
            conversation_service=None,  # No DB updates for in-memory
        )

        # Compress everything up to and including the last query.
        queries_count = len(conversation.get("queries", []))
        compress_up_to = queries_count - 1
        if compress_up_to < 0 or queries_count == 0:
            logger.warning("Not enough queries to compress in-memory context")
            return False, None

        metadata = compression_service.compress_conversation(
            conversation,
            compress_up_to_index=compress_up_to,
        )

        # If compression doesn't reduce tokens, fall back to minimal pruning
        if (
            metadata.compressed_token_count
            >= metadata.original_token_count
        ):
            logger.warning(
                "In-memory compression did not reduce token count; falling back to minimal pruning"
            )
            pruned = self._prune_messages_minimal(messages)
            if pruned:
                agent.context_limit_reached = False
                agent.current_token_count = 0
                return True, pruned
            return False, None

        # Attach metadata to synthetic conversation
        conversation["compression_metadata"] = {
            "is_compressed": True,
            "compression_points": [metadata.to_dict()],
        }

        compressed_summary, recent_queries = (
            compression_service.get_compressed_context(conversation)
        )

        # Record the result on the agent and clear the limit flag.
        agent.compressed_summary = compressed_summary
        agent.compression_metadata = metadata.to_dict()
        agent.compression_saved = False
        agent.context_limit_reached = False
        agent.current_token_count = 0

        rebuilt_messages = self._rebuild_messages_after_compression(
            messages,
            compressed_summary,
            recent_queries,
            include_current_execution=False,
            include_tool_calls=False,
        )
        if rebuilt_messages is None:
            return False, None

        logger.info(
            f"In-memory compression successful - ratio: {metadata.compression_ratio:.1f}x, "
            f"saved {metadata.original_token_count - metadata.compressed_token_count} tokens"
        )
        return True, rebuilt_messages

    except Exception as e:
        logger.error(
            f"Error performing in-memory compression: {str(e)}", exc_info=True
        )
        return False, None
Args: agent: The agent instance tool_calls: List of tool calls to execute tools_dict: Available tools dictionary messages: Current conversation history Returns: Updated messages list """ updated_messages = messages.copy() for i, call in enumerate(tool_calls): # Check context limit before executing tool call if hasattr(agent, '_check_context_limit') and agent._check_context_limit(updated_messages): # Context limit reached - attempt mid-execution compression compression_attempted = False compression_successful = False try: from application.core.settings import settings compression_enabled = settings.ENABLE_CONVERSATION_COMPRESSION except Exception: compression_enabled = False if compression_enabled: compression_attempted = True try: logger.info( f"Context limit reached with {len(tool_calls) - i} remaining tool calls. " f"Attempting mid-execution compression..." ) # Trigger mid-execution compression (DB-backed if available, otherwise in-memory) compression_successful, rebuilt_messages = self._perform_mid_execution_compression( agent, updated_messages ) if compression_successful and rebuilt_messages is not None: # Update the messages list with rebuilt compressed version updated_messages = rebuilt_messages # Yield compression success message yield { "type": "info", "data": { "message": "Context window limit reached. Compressed conversation history to continue processing." } } logger.info( f"Mid-execution compression successful. Continuing with {len(tool_calls) - i} remaining tool calls." ) # Proceed to execute the current tool call with the reduced context else: logger.warning("Mid-execution compression attempted but failed. 
Skipping remaining tools.") except Exception as e: logger.error(f"Error during mid-execution compression: {str(e)}", exc_info=True) compression_attempted = True compression_successful = False # If compression wasn't attempted or failed, skip remaining tools if not compression_successful: if i == 0: # Special case: limit reached before executing any tools # This can happen when previous tool responses pushed context over limit if compression_attempted: logger.warning( f"Context limit reached before executing any tools. " f"Compression attempted but failed. " f"Skipping all {len(tool_calls)} pending tool call(s). " f"This typically occurs when previous tool responses contained large amounts of data." ) else: logger.warning( f"Context limit reached before executing any tools. " f"Skipping all {len(tool_calls)} pending tool call(s). " f"This typically occurs when previous tool responses contained large amounts of data. " f"Consider enabling compression or using a model with larger context window." ) else: # Normal case: executed some tools, now stopping tool_word = "tool call" if i == 1 else "tool calls" remaining = len(tool_calls) - i remaining_word = "tool call" if remaining == 1 else "tool calls" if compression_attempted: logger.warning( f"Context limit reached after executing {i} {tool_word}. " f"Compression attempted but failed. " f"Skipping remaining {remaining} {remaining_word}." ) else: logger.warning( f"Context limit reached after executing {i} {tool_word}. " f"Skipping remaining {remaining} {remaining_word}. " f"Consider enabling compression or using a model with larger context window." ) # Mark remaining tools as skipped for remaining_call in tool_calls[i:]: skip_message = { "type": "tool_call", "data": { "tool_name": "system", "call_id": remaining_call.id, "action_name": remaining_call.name, "arguments": {}, "result": "Skipped: Context limit reached. 
def handle_non_streaming(
    self, agent, response: Any, tools_dict: Dict, messages: List[Dict]
) -> Generator:
    """
    Handle non-streaming response flow.

    Repeatedly executes the tool calls requested by the model and asks the
    model again until a response arrives that does not require tool calls.
    When tool execution leaves ``agent.context_limit_reached`` set, the next
    request carries a wrap-up instruction and no tools, matching
    ``handle_streaming``. Without this the model is re-queried with the same
    history and tools and can keep requesting the skipped calls.

    Args:
        agent: The agent instance
        response: Current LLM response
        tools_dict: Available tools dictionary
        messages: Conversation history

    Yields:
        The events produced by ``handle_tool_calls`` while tools run.

    Returns:
        Final response after processing all tool calls (delivered as the
        generator's return value)
    """
    parsed = self.parse_response(response)
    self.llm_calls.append(build_stack_data(agent.llm))

    while parsed.requires_tool_call:
        # handle_tool_calls yields progress events and returns the updated
        # history; ``yield from`` forwards the former and captures the latter.
        messages = yield from self.handle_tool_calls(
            agent, parsed.tool_calls, tools_dict, messages
        )

        limit_reached = bool(getattr(agent, "context_limit_reached", False))
        if limit_reached:
            # Same wrap-up instruction the streaming path sends.
            messages.append(
                {
                    "role": "system",
                    "content": (
                        "WARNING: Context window limit has been reached. "
                        "Please provide a final response to the user without making additional tool calls. "
                        "Summarize the work completed so far."
                    ),
                }
            )
            logger.info("Context limit reached - instructing agent to wrap up")

        response = agent.llm.gen(
            model=agent.model_id,
            messages=messages,
            # Withhold tools once the limit is hit so the loop can end.
            tools=None if limit_reached else agent.tools,
        )
        parsed = self.parse_response(response)
        self.llm_calls.append(build_stack_data(agent.llm))

    return parsed.content
class GoogleLLMHandler(LLMHandler):
    """Handler for Google's GenAI API."""

    @staticmethod
    def _tool_call_from_part(part: Any, index: Any = None):
        """Return a ToolCall for ``part`` if it carries a function call, else None."""
        function_call = getattr(part, "function_call", None)
        if function_call is None:
            return None
        return ToolCall(
            id=str(uuid.uuid4()),  # each parsed call gets a locally generated id
            name=function_call.name,
            arguments=function_call.args,
            index=index,
            # Missing and None signatures are both stored as None.
            thought_signature=getattr(part, "thought_signature", None),
        )

    def parse_response(self, response: Any) -> LLMResponse:
        """
        Parse Google response into standardized format.

        Three input shapes are handled:
        - a plain string, returned as content with finish_reason ``"stop"``;
        - an object with ``candidates``, whose first candidate's parts are
          scanned for function calls and text;
        - any other object, treated as a single part from a streaming response.

        A response with no candidates, no content on the first candidate, or
        no parts yields empty content and no tool calls instead of raising.

        Args:
            response: Raw response, chunk or string from the Google client.

        Returns:
            LLMResponse whose finish_reason is ``"tool_calls"`` when at least
            one function call was found and ``"stop"`` otherwise.
        """
        if isinstance(response, str):
            return LLMResponse(
                content=response,
                tool_calls=[],
                finish_reason="stop",
                raw_response=response,
            )

        if hasattr(response, "candidates"):
            candidates = response.candidates
            candidate_content = (
                getattr(candidates[0], "content", None) if candidates else None
            )
            # Guard against a missing content object or a None parts list,
            # which previously crashed enumerate() / attribute access.
            parts = getattr(candidate_content, "parts", None) or []

            tool_calls = []
            for idx, part in enumerate(parts):
                call = self._tool_call_from_part(part, idx)
                if call is not None:
                    tool_calls.append(call)

            content = " ".join(
                part.text for part in parts if getattr(part, "text", None) is not None
            )
            return LLMResponse(
                content=content,
                tool_calls=tool_calls,
                finish_reason="tool_calls" if tool_calls else "stop",
                raw_response=response,
            )

        # This branch handles individual Part objects from streaming responses
        call = self._tool_call_from_part(response)
        tool_calls = [call] if call is not None else []
        return LLMResponse(
            content=getattr(response, "text", ""),
            tool_calls=tool_calls,
            finish_reason="tool_calls" if tool_calls else "stop",
            raw_response=response,
        )

    def create_tool_message(self, tool_call: ToolCall, result: Any) -> Dict:
        """
        Create Google-style tool message.

        Args:
            tool_call: The call whose result is being reported; only its name
                is used.
            result: Tool output, wrapped as ``{"result": result}``.

        Returns:
            A ``model``-role message holding one ``function_response`` item.
        """
        return {
            "role": "model",
            "content": [
                {
                    "function_response": {
                        "name": tool_call.name,
                        "response": {"result": result},
                    }
                }
            ],
        }

    def _iterate_stream(self, response: Any) -> Generator:
        """Iterate through Google streaming response."""
        yield from response
class OpenAILLMHandler(LLMHandler):
    """Handler for OpenAI API."""

    def parse_response(self, response: Any) -> LLMResponse:
        """
        Parse OpenAI response into standardized format.

        Args:
            response: Either a plain string or an object exposing ``message``
                or ``delta`` plus an optional ``finish_reason``.

        Returns:
            LLMResponse with the content, parsed tool calls and finish reason.
            A string input becomes content with finish_reason ``"stop"``.
        """
        if isinstance(response, str):
            return LLMResponse(
                content=response,
                tool_calls=[],
                finish_reason="stop",
                raw_response=response,
            )

        # Use ``message`` when present and truthy, otherwise fall back to ``delta``.
        message = getattr(response, "message", None)
        if not message:
            message = getattr(response, "delta", None)

        parsed_calls = []
        for raw_call in getattr(message, "tool_calls", None) or []:
            function = raw_call.function
            parsed_calls.append(
                ToolCall(
                    id=getattr(raw_call, "id", ""),
                    name=getattr(function, "name", ""),
                    arguments=getattr(function, "arguments", ""),
                    index=getattr(raw_call, "index", None),
                )
            )

        return LLMResponse(
            content=getattr(message, "content", ""),
            tool_calls=parsed_calls,
            finish_reason=getattr(response, "finish_reason", ""),
            raw_response=response,
        )

    def create_tool_message(self, tool_call: ToolCall, result: Any) -> Dict:
        """
        Create OpenAI-style tool message.

        Args:
            tool_call: The call being answered; its name and id are embedded.
            result: Tool output, wrapped as ``{"result": result}``.

        Returns:
            A ``tool``-role message holding one ``function_response`` item.
        """
        function_response = {
            "name": tool_call.name,
            "response": {"result": result},
            "call_id": tool_call.id,
        }
        return {
            "role": "tool",
            "content": [{"function_response": function_response}],
        }

    def _iterate_stream(self, response: Any) -> Generator:
        """Iterate through OpenAI streaming response."""
        yield from response
class LlamaCpp(BaseLLM):
    """LLM backend that runs prompts through a shared llama.cpp model instance."""

    def __init__(
        self,
        api_key=None,
        user_api_key=None,
        llm_name=settings.LLM_PATH,
        *args,
        **kwargs,
    ):
        """
        Store the keys and obtain the model instance for ``llm_name``.

        Args:
            api_key: Stored on the instance.
            user_api_key: Stored on the instance.
            llm_name: Passed to LlamaSingleton.get_instance; defaults to
                settings.LLM_PATH.
            *args: Forwarded to the parent initializer.
            **kwargs: Forwarded to the parent initializer.
        """
        super().__init__(*args, **kwargs)
        self.api_key = api_key
        self.user_api_key = user_api_key
        self.llama = LlamaSingleton.get_instance(llm_name)

    @staticmethod
    def _format_llama_prompt(messages):
        """Build the prompt from the first (context) and last (question) message."""
        context = messages[0]["content"]
        user_question = messages[-1]["content"]
        return f"### Instruction \n {user_question} \n ### Context \n {context} \n ### Answer \n"

    def _raw_gen(self, baseself, model, messages, stream=False, **kwargs):
        """Generate a completion and return the text after the answer marker."""
        completion = LlamaSingleton.query_model(
            self.llama,
            self._format_llama_prompt(messages),
            max_tokens=150,
            echo=False,
        )
        generated = completion["choices"][0]["text"]
        return generated.split("### Answer \n")[-1]

    def _raw_gen_stream(self, baseself, model, messages, stream=True, **kwargs):
        """Yield the text of every choice in every streamed item."""
        stream_items = LlamaSingleton.query_model(
            self.llama,
            self._format_llama_prompt(messages),
            max_tokens=150,
            echo=False,
            stream=stream,
        )
        for item in stream_items:
            yield from (choice["text"] for choice in item["choices"])
class NovitaLLM(OpenAILLM):
    """OpenAILLM subclass whose default base URL is ``NOVITA_BASE_URL``."""

    def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
        """Resolve key and endpoint defaults, then defer to ``OpenAILLM``.

        Args:
            api_key: Provider key; falls back to ``settings.API_KEY`` when falsy.
            user_api_key: Forwarded unchanged to the parent constructor.
            base_url: Endpoint override; falls back to ``NOVITA_BASE_URL`` when falsy.
        """
        resolved_key = api_key or settings.API_KEY
        resolved_url = base_url or NOVITA_BASE_URL
        super().__init__(
            *args,
            api_key=resolved_key,
            user_api_key=user_api_key,
            base_url=resolved_url,
            **kwargs,
        )
================================================ FILE: application/llm/openai.py ================================================ import base64 import json import logging from openai import OpenAI from application.core.settings import settings from application.llm.base import BaseLLM from application.storage.storage_creator import StorageCreator def _truncate_base64_for_logging(messages): """ Create a copy of messages with base64 data truncated for readable logging. Args: messages: List of message dicts Returns: Copy of messages with truncated base64 content """ import copy def truncate_content(content): if isinstance(content, str): # Check if it looks like a data URL with base64 if content.startswith("data:") and ";base64," in content: prefix_end = content.index(";base64,") + len(";base64,") prefix = content[:prefix_end] return f"{prefix}[BASE64_DATA_TRUNCATED, length={len(content) - prefix_end}]" return content elif isinstance(content, list): return [truncate_item(item) for item in content] elif isinstance(content, dict): return {k: truncate_content(v) for k, v in content.items()} return content def truncate_item(item): if isinstance(item, dict): result = {} for k, v in item.items(): if k == "url" and isinstance(v, str) and ";base64," in v: prefix_end = v.index(";base64,") + len(";base64,") prefix = v[:prefix_end] result[k] = f"{prefix}[BASE64_DATA_TRUNCATED, length={len(v) - prefix_end}]" elif k == "data" and isinstance(v, str) and len(v) > 100: result[k] = f"[BASE64_DATA_TRUNCATED, length={len(v)}]" else: result[k] = truncate_content(v) return result return truncate_content(item) truncated = [] for msg in messages: msg_copy = copy.copy(msg) if "content" in msg_copy: msg_copy["content"] = truncate_content(msg_copy["content"]) truncated.append(msg_copy) return truncated class OpenAILLM(BaseLLM): def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs): super().__init__(*args, **kwargs) self.api_key = api_key or settings.OPENAI_API_KEY 
or settings.API_KEY self.user_api_key = user_api_key # Priority: 1) Parameter base_url, 2) Settings OPENAI_BASE_URL, 3) Default effective_base_url = None if base_url and isinstance(base_url, str) and base_url.strip(): effective_base_url = base_url elif ( isinstance(settings.OPENAI_BASE_URL, str) and settings.OPENAI_BASE_URL.strip() ): effective_base_url = settings.OPENAI_BASE_URL else: effective_base_url = "https://api.openai.com/v1" self.client = OpenAI(api_key=self.api_key, base_url=effective_base_url) self.storage = StorageCreator.get_storage() def _clean_messages_openai(self, messages): cleaned_messages = [] for message in messages: role = message.get("role") content = message.get("content") if role == "model": role = "assistant" if role and content is not None: if isinstance(content, str): cleaned_messages.append({"role": role, "content": content}) elif isinstance(content, list): # Collect all content parts into a single message content_parts = [] for item in content: if "function_call" in item: # Function calls need their own message cleaned_args = self._remove_null_values( item["function_call"]["args"] ) tool_call = { "id": item["function_call"]["call_id"], "type": "function", "function": { "name": item["function_call"]["name"], "arguments": json.dumps(cleaned_args), }, } cleaned_messages.append( { "role": "assistant", "content": None, "tool_calls": [tool_call], } ) elif "function_response" in item: # Function responses need their own message cleaned_messages.append( { "role": "tool", "tool_call_id": item["function_response"][ "call_id" ], "content": json.dumps( item["function_response"]["response"]["result"] ), } ) elif isinstance(item, dict): # Collect content parts (text, images, files) into a single message if "type" in item and item["type"] == "text" and "text" in item: content_parts.append(item) elif "type" in item and item["type"] == "file" and "file" in item: content_parts.append(item) elif "type" in item and item["type"] == "image_url" and 
"image_url" in item: content_parts.append(item) elif "text" in item and "type" not in item: # Legacy format: {"text": "..."} without type content_parts.append({"type": "text", "text": item["text"]}) # Add the collected content parts as a single message if content_parts: cleaned_messages.append({"role": role, "content": content_parts}) else: raise ValueError(f"Unexpected content type: {type(content)}") return cleaned_messages @staticmethod def _normalize_reasoning_value(value): """Normalize reasoning payloads from OpenAI-compatible stream chunks.""" if value is None: return "" if isinstance(value, str): return value if isinstance(value, list): return "".join( OpenAILLM._normalize_reasoning_value(item) for item in value ) if isinstance(value, dict): for key in ("text", "content", "value", "reasoning_content", "reasoning"): normalized = OpenAILLM._normalize_reasoning_value(value.get(key)) if normalized: return normalized return "" for attr in ("text", "content", "value"): if hasattr(value, attr): normalized = OpenAILLM._normalize_reasoning_value(getattr(value, attr)) if normalized: return normalized return "" @classmethod def _extract_reasoning_text(cls, delta): """Extract reasoning/thinking tokens from OpenAI-compatible delta chunks.""" if delta is None: return "" for key in ( "reasoning_content", "reasoning", "thinking", "thinking_content", ): value = getattr(delta, key, None) if value is None and isinstance(delta, dict): value = delta.get(key) normalized = cls._normalize_reasoning_value(value) if normalized: return normalized return "" def _raw_gen( self, baseself, model, messages, stream=False, tools=None, engine=settings.AZURE_DEPLOYMENT_NAME, response_format=None, **kwargs, ): messages = self._clean_messages_openai(messages) logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}") # Convert max_tokens to max_completion_tokens for newer models if "max_tokens" in kwargs: kwargs["max_completion_tokens"] = kwargs.pop("max_tokens") request_params = 
{ "model": model, "messages": messages, "stream": stream, **kwargs, } if tools: request_params["tools"] = tools if response_format: request_params["response_format"] = response_format response = self.client.chat.completions.create(**request_params) logging.info(f"OpenAI response: {response}") if tools: return response.choices[0] else: return response.choices[0].message.content def _raw_gen_stream( self, baseself, model, messages, stream=True, tools=None, engine=settings.AZURE_DEPLOYMENT_NAME, response_format=None, **kwargs, ): messages = self._clean_messages_openai(messages) logging.info(f"Cleaned messages: {_truncate_base64_for_logging(messages)}") # Convert max_tokens to max_completion_tokens for newer models if "max_tokens" in kwargs: kwargs["max_completion_tokens"] = kwargs.pop("max_tokens") request_params = { "model": model, "messages": messages, "stream": stream, **kwargs, } if tools: request_params["tools"] = tools if response_format: request_params["response_format"] = response_format response = self.client.chat.completions.create(**request_params) try: for line in response: logging.debug(f"OpenAI stream line: {line}") if not getattr(line, "choices", None): continue choice = line.choices[0] delta = getattr(choice, "delta", None) reasoning_text = self._extract_reasoning_text(delta) if reasoning_text: yield {"type": "thought", "thought": reasoning_text} content = getattr(delta, "content", None) if isinstance(content, str) and content: yield content continue has_tool_calls = bool(getattr(delta, "tool_calls", None)) finish_reason = getattr(choice, "finish_reason", None) # Yield non-content chunks only when needed for tool-call handling. 
def prepare_structured_output_format(self, json_schema):
    """Wrap a JSON schema in a strict ``json_schema`` response_format dict.

    The schema is copied (the input is not modified) and every dict with
    ``"type": "object"`` gets ``additionalProperties: False``; when it also
    has ``properties``, ``required`` is set to all property names. The walk
    descends into ``properties``, ``items``, ``anyOf`` / ``oneOf`` / ``allOf``
    and into the ``$defs`` / ``definitions`` tables, so object schemas that
    are only reachable through ``$ref`` are processed as well.

    Args:
        json_schema: Schema dict; may carry optional ``name`` and
            ``description`` keys that are reused in the wrapper.

    Returns:
        dict | None: The response_format payload, or ``None`` when
        ``json_schema`` is falsy or processing fails (the error is logged).
    """
    if not json_schema:
        return None

    # Keys whose value maps a name to a sub-schema.
    schema_maps = ("properties", "$defs", "definitions")
    # Keys whose value is a list of sub-schemas.
    schema_lists = ("anyOf", "oneOf", "allOf")

    def add_additional_properties_false(schema_obj):
        if not isinstance(schema_obj, dict):
            return schema_obj

        schema_copy = schema_obj.copy()
        if schema_copy.get("type") == "object":
            schema_copy["additionalProperties"] = False
            # Ensure 'required' includes all properties for OpenAI strict mode
            if "properties" in schema_copy:
                schema_copy["required"] = list(schema_copy["properties"].keys())

        for key, value in list(schema_copy.items()):
            if key in schema_maps and isinstance(value, dict):
                schema_copy[key] = {
                    sub_name: add_additional_properties_false(sub_schema)
                    for sub_name, sub_schema in value.items()
                }
            elif key == "items" and isinstance(value, dict):
                schema_copy[key] = add_additional_properties_false(value)
            elif key in schema_lists and isinstance(value, list):
                schema_copy[key] = [
                    add_additional_properties_false(sub_schema)
                    for sub_schema in value
                ]
        return schema_copy

    try:
        processed_schema = add_additional_properties_false(json_schema)
        return {
            "type": "json_schema",
            "json_schema": {
                "name": processed_schema.get("name", "response"),
                "description": processed_schema.get(
                    "description", "Structured response"
                ),
                "schema": processed_schema,
                "strict": True,
            },
        }
    except Exception as e:
        logging.error(f"Error preparing structured output format: {e}")
        return None
def prepare_messages_with_attachments(self, messages, attachments=None):
    """
    Attach images and PDFs to the last user message.

    Images are embedded as ``image_url`` parts holding a base64 data URL
    (taken from ``attachment["data"]`` when present, otherwise read through
    ``self._get_base64_image``). PDFs are uploaded through
    ``self._upload_file_to_openai`` and referenced by ``file_id``; if the
    upload fails and the attachment has ``content``, that text is added
    instead. Other MIME types are logged and skipped.

    The caller's ``messages`` list and the message dicts inside it are not
    modified: the targeted user message and its content list are copied
    before parts are appended.

    Args:
        messages (list): List of message dictionaries.
        attachments (list): List of attachment dictionaries with content and metadata.

    Returns:
        list: ``messages`` itself when there are no attachments, otherwise a
        new list in which the last user message carries the attachment parts.
    """
    if not attachments:
        return messages

    prepared_messages = messages.copy()

    # Find the last user message; attachments are added to that one.
    user_message_index = None
    for i in range(len(prepared_messages) - 1, -1, -1):
        if prepared_messages[i].get("role") == "user":
            user_message_index = i
            break

    if user_message_index is None:
        prepared_messages.append({"role": "user", "content": []})
        user_message_index = len(prepared_messages) - 1

    # Work on copies: list.copy() above is shallow, so writing through
    # prepared_messages[i] would otherwise change the caller's message dict
    # (and appending would grow the caller's content list).
    user_message = dict(prepared_messages[user_message_index])
    existing_content = user_message.get("content")
    if isinstance(existing_content, str):
        parts = [{"type": "text", "text": existing_content}]
    elif isinstance(existing_content, list):
        parts = list(existing_content)
    else:
        parts = []
    user_message["content"] = parts
    prepared_messages[user_message_index] = user_message

    for attachment in attachments:
        mime_type = attachment.get("mime_type")
        logging.info(f"Processing attachment with mime_type: {mime_type}, has_data: {'data' in attachment}, has_path: {'path' in attachment}")

        if mime_type and mime_type.startswith("image/"):
            try:
                # Check if this is a pre-converted image (from PDF-to-image conversion)
                if "data" in attachment:
                    base64_image = attachment["data"]
                else:
                    base64_image = self._get_base64_image(attachment)
                parts.append(
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}"
                        },
                    }
                )
            except Exception as e:
                logging.error(
                    f"Error processing image attachment: {e}", exc_info=True
                )
                if "content" in attachment:
                    parts.append(
                        {
                            "type": "text",
                            "text": f"[Image could not be processed: {attachment.get('path', 'unknown')}]",
                        }
                    )
        # Handle PDFs using the file API
        elif mime_type == "application/pdf":
            logging.info(f"Attempting to upload PDF to OpenAI: {attachment.get('path', 'unknown')}")
            try:
                file_id = self._upload_file_to_openai(attachment)
                parts.append({"type": "file", "file": {"file_id": file_id}})
            except Exception as e:
                logging.error(f"Error uploading PDF to OpenAI: {e}", exc_info=True)
                if "content" in attachment:
                    parts.append(
                        {
                            "type": "text",
                            "text": f"File content:\n\n{attachment['content']}",
                        }
                    )
        else:
            logging.warning(f"Unsupported attachment type in OpenAI provider: {mime_type}")

    return prepared_messages
class AzureOpenAILLM(OpenAILLM):
    """OpenAILLM variant whose client is an ``AzureOpenAI`` instance.

    The parent constructor runs first (setting up keys, storage and a default
    client); ``self.client`` is then replaced with an Azure client configured
    from ``settings.OPENAI_API_VERSION`` and ``settings.OPENAI_API_BASE``.
    """

    def __init__(self, api_key, user_api_key, *args, **kwargs):
        """
        Args:
            api_key: Key handed to both the parent constructor and the Azure client.
            user_api_key: Forwarded to the parent constructor.
            *args, **kwargs: Forwarded to the parent constructor so that values
                supplied by the caller are not silently discarded.
        """
        # Previously only api_key was forwarded, which dropped user_api_key
        # and every keyword argument before they could reach the base classes.
        super().__init__(api_key, user_api_key, *args, **kwargs)

        # Plain values; the earlier trailing commas stored 1-tuples here.
        # NOTE(review): confirm nothing indexes these attributes as tuples.
        self.api_base = settings.OPENAI_API_BASE
        self.api_version = settings.OPENAI_API_VERSION
        self.deployment_name = settings.AZURE_DEPLOYMENT_NAME

        from openai import AzureOpenAI

        self.client = AzureOpenAI(
            api_key=api_key,
            api_version=settings.OPENAI_API_VERSION,
            azure_endpoint=settings.OPENAI_API_BASE,
        )
class LineIterator:
    r"""
    Iterator that turns a stream of ``PayloadPart`` events into byte lines.

    Each event is expected to look like ``{'PayloadPart': {'Bytes': b'...'}}``.
    A JSON line may arrive split across several events, for example::

        {'PayloadPart': {'Bytes': b'{"outputs": '}}
        {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}

    so incoming bytes are appended to an internal buffer and a line is only
    returned once its terminating ``\n`` has been seen. ``read_pos`` records
    how far the buffer has been consumed so no byte is returned twice.

    Lines are returned without the trailing newline. If the stream ends while
    unterminated bytes remain in the buffer, they are returned as one final
    line. Events without a ``PayloadPart`` key are reported via ``print`` and
    skipped.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                self.read_pos += len(line)
                return line[:-1]

            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # Stream is exhausted but the buffer still holds bytes with
                    # no trailing newline: hand them out once instead of
                    # retrying forever on an iterator that will never produce.
                    self.read_pos += len(line)
                    return line
                raise

            if "PayloadPart" not in chunk:
                # chunk is a mapping, so format it rather than concatenating.
                print(f"Unknown event type:{chunk}")
                continue

            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])
def build_stack_data(
    obj: Any,
    include_attributes: List[str] = None,
    exclude_attributes: List[str] = None,
    custom_data: Dict = None,
) -> Dict:
    """Snapshot selected attributes of ``obj`` into a plain dict.

    When ``include_attributes`` is omitted, every member reported by
    ``inspect.getmembers`` whose name has no leading underscore and which is
    neither a method nor a function is considered.

    Per attribute value:
      - ``None`` is skipped
      - int / float / str / bool are stored as-is
      - a list of dicts is stored as-is; a list of objects that all have
        ``__dict__`` is stored as their ``__dict__``s; any other list is
        stored as ``str`` of each item
      - a dict is stored with every value converted via ``str``
      - any other type is skipped

    An ``AttributeError`` raised while reading or converting an attribute is
    logged as a warning and that attribute is skipped.

    Args:
        obj: Object to inspect.
        include_attributes: Attribute names to read; discovered when ``None``.
        exclude_attributes: Attribute names to leave out.
        custom_data: Extra entries merged into the result last.

    Returns:
        Dict with the collected values.

    Raises:
        ValueError: If ``obj`` is ``None``.
    """
    if obj is None:
        raise ValueError("The 'obj' parameter cannot be None")

    if include_attributes is None:
        include_attributes = [
            member_name
            for member_name, member in inspect.getmembers(obj)
            if not (
                member_name.startswith("_")
                or inspect.ismethod(member)
                or inspect.isfunction(member)
            )
        ]

    skipped = exclude_attributes or ()
    collected = {}
    for attr_name in include_attributes:
        if attr_name in skipped:
            continue
        try:
            current = getattr(obj, attr_name)
            if current is None:
                continue
            if isinstance(current, (int, float, str, bool)):
                collected[attr_name] = current
            elif isinstance(current, list):
                if all(isinstance(entry, dict) for entry in current):
                    collected[attr_name] = current
                elif all(hasattr(entry, "__dict__") for entry in current):
                    collected[attr_name] = [entry.__dict__ for entry in current]
                else:
                    collected[attr_name] = [str(entry) for entry in current]
            elif isinstance(current, dict):
                collected[attr_name] = {
                    key: str(val) for key, val in current.items()
                }
        except AttributeError as e:
            logging.warning(f"AttributeError while accessing {attr_name}: {e}")

    if custom_data:
        collected.update(custom_data)
    return collected
class Chunker:
    """Splits documents that exceed a token budget into smaller documents.

    Only the ``classic_chunk`` strategy exists: documents whose token count
    is at most ``max_tokens`` are kept (annotated with ``token_count``), and
    larger ones are cut into consecutive token windows.
    """

    def __init__(
        self,
        chunking_strategy: str = "classic_chunk",
        max_tokens: int = 2000,
        min_tokens: int = 150,
        duplicate_headers: bool = False,
    ):
        """
        Args:
            chunking_strategy: Must be ``"classic_chunk"``.
            max_tokens: Upper bound on tokens per produced chunk.
            min_tokens: Stored on the instance; small documents are kept
                unchanged rather than merged.
            duplicate_headers: When true, every chunk of a split document is
                prefixed with the document header, not just the first one.

        Raises:
            ValueError: If ``chunking_strategy`` is not supported.
        """
        if chunking_strategy not in ["classic_chunk"]:
            raise ValueError(f"Unsupported chunking strategy: {chunking_strategy}")
        self.chunking_strategy = chunking_strategy
        self.max_tokens = max_tokens
        self.min_tokens = min_tokens
        self.duplicate_headers = duplicate_headers
        self.encoding = get_encoding()

    def separate_header_and_body(self, text: str) -> Tuple[str, str]:
        """Treat the first three newline-terminated lines as the header.

        Returns:
            ``(header, body)``; ``header`` is ``""`` when ``text`` has fewer
            than three newline-terminated lines.
        """
        header_pattern = r"^(.*?\n){3}"
        match = re.match(header_pattern, text)
        if match:
            header = match.group(0)
            body = text[len(header):]
        else:
            header, body = "", text  # No header, treat entire text as body
        return header, body

    def split_document(self, doc: Document) -> List[Document]:
        """Cut ``doc`` into chunks of at most ``max_tokens`` tokens.

        The first chunk always starts with the header; later chunks repeat it
        only when ``duplicate_headers`` is set. Each chunk becomes a new
        ``Document`` with id ``"<doc_id>-<part_index>"``, the original
        embedding, and ``extra_info`` extended with ``token_count``.
        """
        header, body = self.separate_header_and_body(doc.text)
        header_tokens = self.encoding.encode(header) if header else []
        body_tokens = self.encoding.encode(body)

        # A header that fills the whole budget (or a document that is header
        # only) cannot be used as a prefix: there would be no room left for
        # body tokens. Treat it as ordinary body text instead so nothing is
        # lost and the loop below always advances.
        if header_tokens and (
            len(header_tokens) >= self.max_tokens or not body_tokens
        ):
            body_tokens = header_tokens + body_tokens
            header_tokens = []

        split_docs = []
        current_position = 0
        part_index = 0
        while current_position < len(body_tokens):
            with_header = self.duplicate_headers or part_index == 0
            prefix_tokens = header_tokens if with_header else []
            # max(1, ...) guarantees forward progress even for a degenerate budget.
            body_budget = max(1, self.max_tokens - len(prefix_tokens))
            end_position = current_position + body_budget

            chunk_tokens = prefix_tokens + body_tokens[current_position:end_position]
            split_docs.append(
                Document(
                    text=self.encoding.decode(chunk_tokens),
                    doc_id=f"{doc.doc_id}-{part_index}",
                    embedding=doc.embedding,
                    extra_info={
                        **(doc.extra_info or {}),
                        "token_count": len(chunk_tokens),
                    },
                )
            )
            current_position = end_position
            part_index += 1
        return split_docs

    def classic_chunk(self, documents: List[Document]) -> List[Document]:
        """Keep documents within ``max_tokens`` and split the larger ones."""
        processed_docs = []
        for doc in documents:
            token_count = len(self.encoding.encode(doc.text))
            if token_count <= self.max_tokens:
                doc.extra_info = doc.extra_info or {}
                doc.extra_info["token_count"] = token_count
                processed_docs.append(doc)
            else:
                # Split large documents
                processed_docs.extend(self.split_document(doc))
        return processed_docs

    def chunk(self, documents: List[Document]) -> List[Document]:
        """Run the configured chunking strategy over ``documents``."""
        if self.chunking_strategy == "classic_chunk":
            return self.classic_chunk(documents)
        else:
            raise ValueError("Unsupported chunking strategy")
class BaseConnectorAuth(ABC):
    """
    Contract for connector authentication helpers.

    Subclasses must implement the four abstract methods below;
    ``sanitize_token_info`` is provided here.
    """

    @abstractmethod
    def get_authorization_url(self, state: Optional[str] = None) -> str:
        """
        Build the URL the user is sent to in order to authorize access.

        Args:
            state: Optional state parameter for CSRF protection

        Returns:
            Authorization URL
        """

    @abstractmethod
    def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]:
        """
        Trade an authorization code for tokens.

        Args:
            authorization_code: Authorization code from OAuth callback

        Returns:
            Dictionary containing token information
        """

    @abstractmethod
    def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]:
        """
        Obtain a new access token from a refresh token.

        Args:
            refresh_token: Refresh token

        Returns:
            Dictionary containing refreshed token information
        """

    @abstractmethod
    def is_token_expired(self, token_info: Dict[str, Any]) -> bool:
        """
        Report whether the given token has expired.

        Args:
            token_info: Token information dictionary

        Returns:
            True if token is expired, False otherwise
        """

    def sanitize_token_info(self, token_info: Dict[str, Any], **extra_fields) -> Dict[str, Any]:
        """Return only the token fields meant to be persisted in the session store.

        Keeps ``access_token``, ``refresh_token``, ``token_uri`` and ``expiry``
        (``None`` for any that are missing), then applies ``extra_fields`` on
        top; every other key of ``token_info`` is left out.
        """
        persisted_keys = ("access_token", "refresh_token", "token_uri", "expiry")
        sanitized = {field: token_info.get(field) for field in persisted_keys}
        sanitized.update(extra_fields)
        return sanitized
class ConnectorCreator:
    """
    Factory class for creating external knowledge base connectors and auth providers.

    These are different from remote loaders as they typically require
    authentication and connect to external document storage systems.
    """

    connectors = {
        "google_drive": GoogleDriveLoader,
        "share_point": SharePointLoader,
    }

    auth_providers = {
        "google_drive": GoogleDriveAuth,
        "share_point": SharePointAuth,
    }

    @staticmethod
    def _lookup_key(connector_type):
        """
        Return the registry key for ``connector_type``.

        Non-string values (including ``None``) yield ``None`` so callers can
        report an unsupported type instead of failing on ``.lower()``.
        """
        if not isinstance(connector_type, str):
            return None
        return connector_type.lower()

    @classmethod
    def create_connector(cls, connector_type, *args, **kwargs):
        """
        Create a connector instance for the specified type.

        Args:
            connector_type: Type of connector to create (e.g., 'google_drive');
                matched case-insensitively
            *args, **kwargs: Arguments to pass to the connector constructor

        Returns:
            Connector instance

        Raises:
            ValueError: If connector type is not supported (or is not a string)
        """
        connector_class = cls.connectors.get(cls._lookup_key(connector_type))
        if not connector_class:
            raise ValueError(f"No connector class found for type {connector_type}")
        return connector_class(*args, **kwargs)

    @classmethod
    def create_auth(cls, connector_type):
        """
        Create an auth provider instance for the specified connector type.

        Args:
            connector_type: Type of connector auth to create (e.g., 'google_drive');
                matched case-insensitively

        Returns:
            Auth provider instance

        Raises:
            ValueError: If connector type is not supported for auth (or is not a string)
        """
        auth_class = cls.auth_providers.get(cls._lookup_key(connector_type))
        if not auth_class:
            raise ValueError(f"No auth class found for type {connector_type}")
        return auth_class()

    @classmethod
    def get_supported_connectors(cls):
        """
        Get list of supported connector types.

        Returns:
            List of supported connector type strings
        """
        return list(cls.connectors)

    @classmethod
    def is_supported(cls, connector_type):
        """
        Check if a connector type is supported.

        Args:
            connector_type: Type of connector to check

        Returns:
            True if supported, False otherwise (including for non-string input)
        """
        key = cls._lookup_key(connector_type)
        return key is not None and key in cls.connectors
""" SCOPES = [ 'https://www.googleapis.com/auth/drive.file' ] def __init__(self): self.client_id = settings.GOOGLE_CLIENT_ID self.client_secret = settings.GOOGLE_CLIENT_SECRET self.redirect_uri = f"{settings.CONNECTOR_REDIRECT_BASE_URI}" if not self.client_id or not self.client_secret: raise ValueError("Google OAuth credentials not configured. Please set GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET in settings.") def get_authorization_url(self, state: Optional[str] = None) -> str: try: flow = Flow.from_client_config( { "web": { "client_id": self.client_id, "client_secret": self.client_secret, "auth_uri": "https://accounts.google.com/o/oauth2/auth", "token_uri": "https://oauth2.googleapis.com/token", "redirect_uris": [self.redirect_uri] } }, scopes=self.SCOPES ) flow.redirect_uri = self.redirect_uri authorization_url, _ = flow.authorization_url( access_type='offline', prompt='consent', include_granted_scopes='false', state=state ) return authorization_url except Exception as e: logging.error(f"Error generating authorization URL: {e}") raise def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]: try: if not authorization_code: raise ValueError("Authorization code is required") flow = Flow.from_client_config( { "web": { "client_id": self.client_id, "client_secret": self.client_secret, "auth_uri": "https://accounts.google.com/o/oauth2/auth", "token_uri": "https://oauth2.googleapis.com/token", "redirect_uris": [self.redirect_uri] } }, scopes=self.SCOPES ) flow.redirect_uri = self.redirect_uri flow.fetch_token(code=authorization_code) credentials = flow.credentials if not credentials.refresh_token: logging.warning("OAuth flow did not return a refresh_token.") if not credentials.token: raise ValueError("OAuth flow did not return an access token") if not credentials.token_uri: credentials.token_uri = "https://oauth2.googleapis.com/token" if not credentials.client_id: credentials.client_id = self.client_id if not credentials.client_secret: 
def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]:
    """
    Use a refresh token to obtain a new Google access token.

    Args:
        refresh_token: Refresh token previously issued for this client

    Returns:
        Dictionary with the new access token, the refresh token that was
        passed in, the token URI, client id/secret, scopes and the expiry as
        an ISO-format string (``None`` when the credentials carry no expiry)

    Raises:
        ValueError: If ``refresh_token`` is empty
        Exception: Any error raised while refreshing is logged and re-raised
    """
    try:
        if not refresh_token:
            raise ValueError("Refresh token is required")

        refreshed = Credentials(
            token=None,
            refresh_token=refresh_token,
            token_uri="https://oauth2.googleapis.com/token",
            client_id=self.client_id,
            client_secret=self.client_secret
        )

        from google.auth.transport.requests import Request
        refreshed.refresh(Request())

        token_payload = {
            'access_token': refreshed.token,
            'refresh_token': refresh_token,
            'token_uri': refreshed.token_uri,
            'client_id': refreshed.client_id,
            'client_secret': refreshed.client_secret,
            'scopes': refreshed.scopes,
        }
        if refreshed.expiry:
            token_payload['expiry'] = refreshed.expiry.isoformat()
        else:
            token_payload['expiry'] = None
        return token_payload
    except Exception as e:
        logging.error(f"Error refreshing access token: {e}", exc_info=True)
        raise
def is_token_expired(self, token_info):
    """
    Decide whether the stored Google token should be treated as expired.

    The ``expiry`` entry is an ISO-8601 string. A value without a UTC offset
    is interpreted as UTC; previously such a value made the comparison with
    the timezone-aware current time raise, and the token was always reported
    as expired.

    Args:
        token_info: Token information dictionary (may be empty or ``None``)

    Returns:
        True if the token is expired, expires within the next 60 seconds,
        has an unparseable expiry, or carries neither an expiry nor an
        access token; False otherwise
    """
    if not token_info:
        return True

    expiry = token_info.get('expiry')
    if expiry:
        try:
            try:
                expiry_dt = datetime.datetime.fromisoformat(expiry)
            except (TypeError, ValueError):
                # Formats the stdlib parser rejects (e.g. a trailing "Z" on
                # older Python versions) are handed to dateutil.
                from dateutil import parser
                expiry_dt = parser.parse(expiry)

            if expiry_dt.tzinfo is None:
                expiry_dt = expiry_dt.replace(tzinfo=datetime.timezone.utc)

            current_time = datetime.datetime.now(datetime.timezone.utc)
            # Treat tokens inside the 60-second safety window as expired.
            return current_time >= expiry_dt - datetime.timedelta(seconds=60)
        except Exception:
            # An expiry we cannot interpret is handled as "expired" so the
            # caller refreshes rather than using a possibly stale token.
            return True

    # No expiry recorded: rely on the mere presence of an access token.
    return not token_info.get('access_token')
def validate_credentials(self, credentials: Credentials) -> bool:
    """
    Check credentials by building a Drive service and requesting the
    ``about`` resource with ``fields="user"``.

    Args:
        credentials: Google credentials object

    Returns:
        True when the test call succeeds; False when building the service or
        the call itself raises (the error is logged)
    """
    try:
        drive = self.build_drive_service(credentials)
        drive.about().get(fields="user").execute()
    except HttpError as e:
        logging.error(f"HTTP error validating credentials: {e}")
        return False
    except Exception as e:
        logging.error(f"Error validating credentials: {e}")
        return False
    return True
""" import io import logging import os from typing import List, Dict, Any, Optional from googleapiclient.http import MediaIoBaseDownload from googleapiclient.errors import HttpError from application.parser.connectors.base import BaseConnectorLoader from application.parser.connectors.google_drive.auth import GoogleDriveAuth from application.parser.schema.base import Document class GoogleDriveLoader(BaseConnectorLoader): SUPPORTED_MIME_TYPES = { 'application/pdf': '.pdf', 'application/vnd.google-apps.document': '.docx', 'application/vnd.google-apps.presentation': '.pptx', 'application/vnd.google-apps.spreadsheet': '.xlsx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx', 'application/msword': '.doc', 'application/vnd.ms-powerpoint': '.ppt', 'application/vnd.ms-excel': '.xls', 'text/plain': '.txt', 'text/csv': '.csv', 'text/html': '.html', 'text/markdown': '.md', 'text/x-rst': '.rst', 'application/json': '.json', 'application/epub+zip': '.epub', 'application/rtf': '.rtf', 'image/jpeg': '.jpg', 'image/jpg': '.jpg', 'image/png': '.png', } EXPORT_FORMATS = { 'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.google-apps.presentation': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'application/vnd.google-apps.spreadsheet': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' } def __init__(self, session_token: str): self.auth = GoogleDriveAuth() self.session_token = session_token token_info = self.auth.get_token_info_from_session(session_token) self.credentials = self.auth.create_credentials_from_token_info(token_info) try: self.service = self.auth.build_drive_service(self.credentials) except Exception as e: logging.warning(f"Could not build Google 
Drive service: {e}") self.service = None self.next_page_token = None def _process_file(self, file_metadata: Dict[str, Any], load_content: bool = True) -> Optional[Document]: try: file_id = file_metadata.get('id') file_name = file_metadata.get('name', 'Unknown') mime_type = file_metadata.get('mimeType', 'application/octet-stream') if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): return None if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): logging.info(f"Skipping unsupported file type: {mime_type} for file {file_name}") return None # Google Drive provides timezone-aware ISO8601 dates doc_metadata = { 'file_name': file_name, 'mime_type': mime_type, 'size': file_metadata.get('size', None), 'created_time': file_metadata.get('createdTime'), 'modified_time': file_metadata.get('modifiedTime'), 'parents': file_metadata.get('parents', []), 'source': 'google_drive' } if not load_content: return Document( text="", doc_id=file_id, extra_info=doc_metadata ) content = self._download_file_content(file_id, mime_type) if content is None: logging.warning(f"Could not load content for file {file_name} ({file_id})") return None return Document( text=content, doc_id=file_id, extra_info=doc_metadata ) except Exception as e: logging.error(f"Error processing file: {e}") return None def load_data(self, inputs: Dict[str, Any]) -> List[Document]: session_token = inputs.get('session_token') if session_token and session_token != self.session_token: logging.warning("Session token in inputs differs from loader's session token. 
Using loader's session token.") self.config = inputs try: documents: List[Document] = [] folder_id = inputs.get('folder_id') file_ids = inputs.get('file_ids', []) limit = inputs.get('limit', 100) list_only = inputs.get('list_only', False) load_content = not list_only page_token = inputs.get('page_token') search_query = inputs.get('search_query') self.next_page_token = None if file_ids: # Specific files requested: load them for file_id in file_ids: try: doc = self._load_file_by_id(file_id, load_content=load_content) if doc: if not search_query or ( search_query.lower() in doc.extra_info.get('file_name', '').lower() ): documents.append(doc) elif hasattr(self, '_credential_refreshed') and self._credential_refreshed: self._credential_refreshed = False logging.info(f"Retrying load of file {file_id} after credential refresh") doc = self._load_file_by_id(file_id, load_content=load_content) if doc and ( not search_query or search_query.lower() in doc.extra_info.get('file_name', '').lower() ): documents.append(doc) except Exception as e: logging.error(f"Error loading file {file_id}: {e}") continue else: # Browsing mode: list immediate children of provided folder or root parent_id = folder_id if folder_id else 'root' documents = self._list_items_in_parent( parent_id, limit=limit, load_content=load_content, page_token=page_token, search_query=search_query ) logging.info(f"Loaded {len(documents)} documents from Google Drive") return documents except Exception as e: logging.error(f"Error loading data from Google Drive: {e}", exc_info=True) raise def _load_file_by_id(self, file_id: str, load_content: bool = True) -> Optional[Document]: self._ensure_service() try: file_metadata = self.service.files().get( fileId=file_id, fields='id,name,mimeType,size,createdTime,modifiedTime,parents' ).execute() return self._process_file(file_metadata, load_content=load_content) except HttpError as e: logging.error(f"HTTP error loading file {file_id}: {e.resp.status} - {e.content}") if 
e.resp.status in [401, 403]: if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: try: from google.auth.transport.requests import Request self.credentials.refresh(Request()) self._ensure_service() return None except Exception as refresh_error: raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}") else: raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") return None except Exception as e: logging.error(f"Error loading file {file_id}: {e}") return None def _list_items_in_parent(self, parent_id: str, limit: int = 100, load_content: bool = False, page_token: Optional[str] = None, search_query: Optional[str] = None) -> List[Document]: self._ensure_service() documents: List[Document] = [] try: query = f"'{parent_id}' in parents and trashed=false" if search_query: safe_search = search_query.replace("'", "\\'") query += f" and name contains '{safe_search}'" next_token_out: Optional[str] = None while True: page_size = 100 if limit: remaining = max(0, limit - len(documents)) if remaining == 0: break page_size = min(100, remaining) results = self.service.files().list( q=query, fields='nextPageToken,files(id,name,mimeType,size,createdTime,modifiedTime,parents)', pageToken=page_token, pageSize=page_size, orderBy='name' ).execute() items = results.get('files', []) for item in items: mime_type = item.get('mimeType') if mime_type == 'application/vnd.google-apps.folder': doc_metadata = { 'file_name': item.get('name', 'Unknown'), 'mime_type': mime_type, 'size': item.get('size', None), 'created_time': item.get('createdTime'), 'modified_time': item.get('modifiedTime'), 'parents': item.get('parents', []), 'source': 'google_drive', 'is_folder': True } documents.append(Document(text="", doc_id=item.get('id'), extra_info=doc_metadata)) else: doc = self._process_file(item, load_content=load_content) if doc: documents.append(doc) if limit and len(documents) >= limit: self.next_page_token = 
results.get('nextPageToken') return documents page_token = results.get('nextPageToken') next_token_out = page_token if not page_token: break self.next_page_token = next_token_out return documents except Exception as e: logging.error(f"Error listing items under parent {parent_id}: {e}") return documents def _download_file_content(self, file_id: str, mime_type: str) -> Optional[str]: if not self.credentials.token: logging.warning("No access token in credentials, attempting to refresh") if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: try: from google.auth.transport.requests import Request self.credentials.refresh(Request()) logging.info("Credentials refreshed successfully") self._ensure_service() except Exception as e: logging.error(f"Failed to refresh credentials: {e}") raise ValueError("Authentication failed and cannot be refreshed: missing or invalid refresh_token") else: logging.error("No access token and no refresh_token available") raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") if self.credentials.expired: logging.warning("Credentials are expired, attempting to refresh") if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: try: from google.auth.transport.requests import Request self.credentials.refresh(Request()) logging.info("Credentials refreshed successfully") self._ensure_service() except Exception as e: logging.error(f"Failed to refresh expired credentials: {e}") raise ValueError("Authentication failed and cannot be refreshed: expired credentials") else: logging.error("Credentials expired and no refresh_token available") raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") try: if mime_type in self.EXPORT_FORMATS: export_mime_type = self.EXPORT_FORMATS[mime_type] request = self.service.files().export_media( fileId=file_id, mimeType=export_mime_type ) else: request = self.service.files().get_media(fileId=file_id) 
file_io = io.BytesIO() downloader = MediaIoBaseDownload(file_io, request) done = False while done is False: try: _, done = downloader.next_chunk() except HttpError as e: logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}") return None except Exception as e: logging.error(f"Error during download of file {file_id}: {e}") return None content_bytes = file_io.getvalue() try: return content_bytes.decode('utf-8') except UnicodeDecodeError: logging.error(f"Could not decode file {file_id} as text") return None except HttpError as e: logging.error(f"HTTP error downloading file {file_id}: {e.resp.status} - {e.content}") if e.resp.status in [401, 403]: logging.error(f"Authentication error downloading file {file_id}") if hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token: logging.info(f"Attempting to refresh credentials for file {file_id}") try: from google.auth.transport.requests import Request self.credentials.refresh(Request()) logging.info("Credentials refreshed successfully") self._credential_refreshed = True self._ensure_service() return None except Exception as refresh_error: logging.error(f"Error refreshing credentials: {refresh_error}") raise ValueError(f"Authentication failed and could not be refreshed: {refresh_error}") else: logging.error("Cannot refresh credentials: missing refresh_token") raise ValueError("Authentication failed and cannot be refreshed: missing refresh_token") return None except Exception as e: logging.error(f"Error downloading file {file_id}: {e}") return None def _download_file_to_directory(self, file_id: str, local_dir: str) -> bool: try: self._ensure_service() return self._download_single_file(file_id, local_dir) except Exception as e: logging.error(f"Error downloading file {file_id}: {e}", exc_info=True) return False def _ensure_service(self): if not self.service: try: self.service = self.auth.build_drive_service(self.credentials) except Exception as e: raise ValueError(f"Cannot 
access Google Drive: {e}") def _download_single_file(self, file_id: str, local_dir: str) -> bool: file_metadata = self.service.files().get( fileId=file_id, fields='name,mimeType' ).execute() file_name = file_metadata['name'] mime_type = file_metadata['mimeType'] if mime_type not in self.SUPPORTED_MIME_TYPES and not mime_type.startswith('application/vnd.google-apps.'): return False os.makedirs(local_dir, exist_ok=True) full_path = os.path.join(local_dir, file_name) if mime_type in self.EXPORT_FORMATS: export_mime_type = self.EXPORT_FORMATS[mime_type] request = self.service.files().export_media( fileId=file_id, mimeType=export_mime_type ) extension = self._get_extension_for_mime_type(export_mime_type) if not full_path.endswith(extension): full_path += extension else: request = self.service.files().get_media(fileId=file_id) with open(full_path, 'wb') as f: downloader = MediaIoBaseDownload(f, request) done = False while not done: _, done = downloader.next_chunk() return True def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int: files_downloaded = 0 try: os.makedirs(local_dir, exist_ok=True) query = f"'{folder_id}' in parents and trashed=false" page_token = None while True: results = self.service.files().list( q=query, fields='nextPageToken, files(id, name, mimeType)', pageToken=page_token, pageSize=1000 ).execute() items = results.get('files', []) logging.info(f"Found {len(items)} items in folder {folder_id}") for item in items: item_name = item['name'] item_id = item['id'] mime_type = item['mimeType'] if mime_type == 'application/vnd.google-apps.folder': if recursive: # Create subfolder and recurse subfolder_path = os.path.join(local_dir, item_name) os.makedirs(subfolder_path, exist_ok=True) subfolder_files = self._download_folder_recursive( item_id, subfolder_path, recursive ) files_downloaded += subfolder_files logging.info(f"Downloaded {subfolder_files} files from subfolder {item_name}") else: # Download file success = 
self._download_single_file(item_id, local_dir) if success: files_downloaded += 1 logging.info(f"Downloaded file: {item_name}") else: logging.warning(f"Failed to download file: {item_name}") page_token = results.get('nextPageToken') if not page_token: break return files_downloaded except Exception as e: logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True) return files_downloaded def _get_extension_for_mime_type(self, mime_type: str) -> str: extensions = { 'application/pdf': '.pdf', 'text/plain': '.txt', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '.pptx', 'text/html': '.html', 'text/markdown': '.md', } return extensions.get(mime_type, '.bin') def _download_folder_contents(self, folder_id: str, local_dir: str, recursive: bool = True) -> int: try: self._ensure_service() return self._download_folder_recursive(folder_id, local_dir, recursive) except Exception as e: logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True) return 0 def download_to_directory(self, local_dir: str, source_config: dict = None) -> dict: if source_config is None: source_config = {} config = source_config if source_config else getattr(self, 'config', {}) files_downloaded = 0 try: folder_ids = config.get('folder_ids', []) file_ids = config.get('file_ids', []) recursive = config.get('recursive', True) self._ensure_service() if file_ids: if isinstance(file_ids, str): file_ids = [file_ids] for file_id in file_ids: if self._download_file_to_directory(file_id, local_dir): files_downloaded += 1 # Process folders if folder_ids: if isinstance(folder_ids, str): folder_ids = [folder_ids] for folder_id in folder_ids: try: folder_metadata = self.service.files().get( fileId=folder_id, fields='name' ).execute() folder_name = folder_metadata.get('name', '') 
folder_path = os.path.join(local_dir, folder_name) os.makedirs(folder_path, exist_ok=True) folder_files = self._download_folder_recursive( folder_id, folder_path, recursive ) files_downloaded += folder_files logging.info(f"Downloaded {folder_files} files from folder {folder_name}") except Exception as e: logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True) if not file_ids and not folder_ids: raise ValueError("No folder_ids or file_ids provided for download") return { "files_downloaded": files_downloaded, "directory_path": local_dir, "empty_result": files_downloaded == 0, "source_type": "google_drive", "config_used": config } except Exception as e: return { "files_downloaded": files_downloaded, "directory_path": local_dir, "empty_result": True, "source_type": "google_drive", "config_used": config, "error": str(e) } ================================================ FILE: application/parser/connectors/share_point/__init__.py ================================================ """ Share Point connector package for DocsGPT. This module provides authentication and document loading capabilities for Share Point. """ from .auth import SharePointAuth from .loader import SharePointLoader __all__ = ['SharePointAuth', 'SharePointLoader'] ================================================ FILE: application/parser/connectors/share_point/auth.py ================================================ import datetime import logging from typing import Optional, Dict, Any from msal import ConfidentialClientApplication from application.core.settings import settings from application.parser.connectors.base import BaseConnectorAuth logger = logging.getLogger(__name__) class SharePointAuth(BaseConnectorAuth): """ Handles Microsoft OAuth 2.0 authentication for SharePoint/OneDrive. Note: Files.Read scope allows access to files the user has granted access to, similar to Google Drive's drive.file scope. 
""" SCOPES = [ "Files.Read", "Sites.Read.All", "User.Read", ] def __init__(self): self.client_id = settings.MICROSOFT_CLIENT_ID self.client_secret = settings.MICROSOFT_CLIENT_SECRET if not self.client_id: raise ValueError( "Microsoft OAuth credentials not configured. Please set MICROSOFT_CLIENT_ID in settings." ) if not self.client_secret: raise ValueError( "Microsoft OAuth credentials not configured. Please set MICROSOFT_CLIENT_SECRET in settings." ) self.redirect_uri = settings.CONNECTOR_REDIRECT_BASE_URI self.tenant_id = settings.MICROSOFT_TENANT_ID self.authority = getattr(settings, "MICROSOFT_AUTHORITY", f"https://login.microsoftonline.com/{self.tenant_id}") self.auth_app = ConfidentialClientApplication( client_id=self.client_id, client_credential=self.client_secret, authority=self.authority ) def get_authorization_url(self, state: Optional[str] = None) -> str: return self.auth_app.get_authorization_request_url( scopes=self.SCOPES, state=state, redirect_uri=self.redirect_uri ) def exchange_code_for_tokens(self, authorization_code: str) -> Dict[str, Any]: result = self.auth_app.acquire_token_by_authorization_code( code=authorization_code, scopes=self.SCOPES, redirect_uri=self.redirect_uri ) if "error" in result: logger.error("Token exchange failed: %s", result.get("error_description")) raise ValueError(f"Error acquiring token: {result.get('error_description')}") return self.map_token_response(result) def refresh_access_token(self, refresh_token: str) -> Dict[str, Any]: result = self.auth_app.acquire_token_by_refresh_token(refresh_token=refresh_token, scopes=self.SCOPES) if "error" in result: logger.error("Token refresh failed: %s", result.get("error_description")) raise ValueError(f"Error refreshing token: {result.get('error_description')}") return self.map_token_response(result) def get_token_info_from_session(self, session_token: str) -> Dict[str, Any]: try: from application.core.mongo_db import MongoDB from application.core.settings import settings mongo = 
def map_token_response(self, result) -> Dict[str, Any]:
    """
    Convert a token-acquisition result into the connector's token dictionary.

    Args:
        result: Result dictionary from the token request; the keys read here
            are ``access_token``, ``refresh_token``, ``scope``,
            ``expires_in`` and ``id_token_claims``

    Returns:
        Dictionary with ``access_token``, ``refresh_token``, ``token_uri``
        (the ``iss`` claim), ``scopes``, ``expiry`` (epoch seconds),
        ``allows_shared_content`` and ``user_info`` (name and email)
    """
    # A response may omit the id-token claims or carry None for them.
    claims = result.get("id_token_claims") or {}

    # Prefer the access token's own lifetime; the id-token "exp" claim is
    # only a fallback, and relying on it alone left expiry unset whenever
    # the claims were missing.
    expiry = claims.get("exp")
    expires_in = result.get("expires_in")
    if expires_in is not None:
        try:
            expiry = int(datetime.datetime.now().timestamp()) + int(expires_in)
        except (TypeError, ValueError):
            pass  # unusable expires_in: keep the id-token value

    return {
        "access_token": result.get("access_token"),
        "refresh_token": result.get("refresh_token"),
        "token_uri": claims.get("iss"),
        "scopes": result.get("scope"),
        "expiry": expiry,
        "allows_shared_content": self._allows_shared_content(claims),
        "user_info": {
            "name": claims.get("name"),
            "email": claims.get("preferred_username"),
        },
    }
'application/msword': '.doc', 'application/vnd.ms-powerpoint': '.ppt', 'application/vnd.ms-excel': '.xls', 'text/plain': '.txt', 'text/csv': '.csv', 'text/html': '.html', 'text/markdown': '.md', 'text/x-rst': '.rst', 'application/json': '.json', 'application/epub+zip': '.epub', 'application/rtf': '.rtf', 'image/jpeg': '.jpg', 'image/png': '.png', } EXTENSION_TO_MIME = {v: k for k, v in SUPPORTED_MIME_TYPES.items()} GRAPH_API_BASE = "https://graph.microsoft.com/v1.0" def __init__(self, session_token: str): self.auth = SharePointAuth() self.session_token = session_token token_info = self.auth.get_token_info_from_session(session_token) self.access_token = token_info.get('access_token') self.refresh_token = token_info.get('refresh_token') self.allows_shared_content = token_info.get('allows_shared_content', False) if not self.access_token: raise ValueError("No access token found in session") self.next_page_token = None def _get_headers(self) -> Dict[str, str]: return { 'Authorization': f'Bearer {self.access_token}', 'Accept': 'application/json' } def _ensure_valid_token(self): if not self.access_token: raise ValueError("No access token available") token_info = {'access_token': self.access_token, 'expiry': None} if self.auth.is_token_expired(token_info): logging.info("Token expired, attempting refresh") try: new_token_info = self.auth.refresh_access_token(self.refresh_token) self.access_token = new_token_info.get('access_token') except Exception: raise ValueError("Failed to refresh access token") def _get_item_url(self, item_ref: str) -> str: if ':' in item_ref: drive_id, item_id = item_ref.split(':', 1) return f"{self.GRAPH_API_BASE}/drives/{drive_id}/items/{item_id}" return f"{self.GRAPH_API_BASE}/me/drive/items/{item_ref}" def _process_file(self, file_metadata: Dict[str, Any], load_content: bool = True) -> Optional[Document]: try: drive_item_id = file_metadata.get('id') file_name = file_metadata.get('name', 'Unknown') file_data = file_metadata.get('file', {}) 
mime_type = file_data.get('mimeType', 'application/octet-stream') if mime_type not in self.SUPPORTED_MIME_TYPES: logging.info(f"Skipping unsupported file type: {mime_type} for file {file_name}") return None doc_metadata = { 'file_name': file_name, 'mime_type': mime_type, 'size': file_metadata.get('size'), 'created_time': file_metadata.get('createdDateTime'), 'modified_time': file_metadata.get('lastModifiedDateTime'), 'source': 'share_point' } if not load_content: return Document( text="", doc_id=drive_item_id, extra_info=doc_metadata ) content = self._download_file_content(drive_item_id) if content is None: logging.warning(f"Could not load content for file {file_name} ({drive_item_id})") return None return Document( text=content, doc_id=drive_item_id, extra_info=doc_metadata ) except Exception as e: logging.error(f"Error processing file: {e}") return None def load_data(self, inputs: Dict[str, Any]) -> List[Document]: try: documents: List[Document] = [] folder_id = inputs.get('folder_id') file_ids = inputs.get('file_ids', []) limit = inputs.get('limit', 100) list_only = inputs.get('list_only', False) load_content = not list_only page_token = inputs.get('page_token') search_query = inputs.get('search_query') self.next_page_token = None shared = inputs.get('shared', False) if file_ids: for file_id in file_ids: try: doc = self._load_file_by_id(file_id, load_content=load_content) if doc: if not search_query or ( search_query.lower() in doc.extra_info.get('file_name', '').lower() ): documents.append(doc) except Exception as e: logging.error(f"Error loading file {file_id}: {e}") continue elif shared: if not self.allows_shared_content: logging.warning("Shared content is only available for work/school Microsoft accounts") return [] documents = self._list_shared_items( limit=limit, load_content=load_content, page_token=page_token, search_query=search_query ) else: parent_id = folder_id if folder_id else 'root' documents = self._list_items_in_parent( parent_id, limit=limit, 
@_retry_on_auth_failure
def _list_items_in_parent(self, parent_id: str, limit: int = 100, load_content: bool = False, page_token: Optional[str] = None, search_query: Optional[str] = None) -> List[Document]:
    """List the children of a drive item (or search its drive) as Documents.

    Args:
        parent_id: Item reference accepted by ``_get_item_url`` - either a
            plain item id or ``"<drive_id>:<item_id>"``.
        limit: Maximum number of documents to return; also caps ``$top`` at 100.
        load_content: Passed to ``_process_file`` for non-folder items.
        page_token: Value sent as ``$skipToken`` to continue a previous listing.
        search_query: When set, the drive search endpoint is queried instead of
            the parent's ``/children`` collection.

    Returns:
        The documents collected so far. Non-auth failures are logged and the
        partial (possibly empty) list is returned.

    Raises:
        requests.exceptions.HTTPError: On a 401/403 response, so that
            ``_retry_on_auth_failure`` can refresh the token and retry once.
            Previously the blanket ``except Exception`` swallowed these and the
            decorator could never trigger.
    """
    from urllib.parse import urlparse, parse_qs

    self._ensure_valid_token()

    documents: List[Document] = []

    try:
        params = {
            '$top': min(100, limit) if limit else 100,
            '$select': 'id,name,file,folder,createdDateTime,lastModifiedDateTime,size',
        }
        if page_token:
            params['$skipToken'] = page_token

        if search_query:
            # The query is embedded in a single-quoted OData literal; a literal
            # single quote must be doubled before URL-encoding.
            encoded_query = quote(search_query.replace("'", "''"), safe='')
            if ':' in parent_id:
                drive_id = parent_id.split(':', 1)[0]
                url = f"{self.GRAPH_API_BASE}/drives/{drive_id}/root/search(q='{encoded_query}')"
            else:
                url = f"{self.GRAPH_API_BASE}/me/drive/search(q='{encoded_query}')"
        else:
            url = f"{self._get_item_url(parent_id)}/children"

        response = requests.get(url, headers=self._get_headers(), params=params)
        response.raise_for_status()

        results = response.json()
        items = results.get('value', [])

        for item in items:
            if 'folder' in item:
                doc_metadata = {
                    'file_name': item.get('name', 'Unknown'),
                    'mime_type': 'folder',
                    'size': item.get('size'),
                    'created_time': item.get('createdDateTime'),
                    'modified_time': item.get('lastModifiedDateTime'),
                    'source': 'share_point',
                    'is_folder': True
                }
                documents.append(Document(text="", doc_id=item.get('id'), extra_info=doc_metadata))
            else:
                doc = self._process_file(item, load_content=load_content)
                if doc:
                    documents.append(doc)

            if limit and len(documents) >= limit:
                break

        # Remember the continuation token (if any) for the caller's next page.
        next_link = results.get('@odata.nextLink')
        if next_link:
            query_params = parse_qs(urlparse(next_link).query)
            skiptoken_list = query_params.get('$skiptoken')
            self.next_page_token = skiptoken_list[0] if skiptoken_list else None
        else:
            self.next_page_token = None

        return documents

    except requests.exceptions.HTTPError as e:
        # Auth failures must reach the retry decorator; everything else keeps
        # the original best-effort behaviour (log and return what we have).
        if e.response is not None and e.response.status_code in (401, 403):
            raise
        logging.error(f"Error listing items under parent {parent_id}: {e}")
        return documents
    except Exception as e:
        logging.error(f"Error listing items under parent {parent_id}: {e}")
        return documents
items.""" base_query = search_query if search_query else "*" if user_drive_url: return f'{base_query} AND -path:"{user_drive_url}"' return base_query def _list_shared_items(self, limit: int = 100, load_content: bool = False, page_token: Optional[str] = None, search_query: Optional[str] = None) -> List[Document]: """Fetch shared drive items using Microsoft Graph Search API with local offset paging. We always fetch up to a fixed maximum number of hits from Graph (single request), then page through that array locally using `page_token` as a simple integer offset. This avoids relying on buggy or inconsistent remote `from`/`size` semantics. """ self._ensure_valid_token() documents: List[Document] = [] try: user_drive_url = self._get_user_drive_web_url() query_text = self._build_shared_kql_query(search_query, user_drive_url) url = f"{self.GRAPH_API_BASE}/search/query" page_size = 500 # maximum number of hits we care about for selection body = { "requests": [ { "entityTypes": ["driveItem"], "query": {"queryString": query_text}, "from": 0, "size": page_size, } ] } headers = self._get_headers() headers["Content-Type"] = "application/json" response = requests.post(url, headers=headers, json=body) response.raise_for_status() results = response.json() search_response = results.get("value", []) if not search_response: logging.warning("Search API returned empty value array") self.next_page_token = None return documents hits_containers = search_response[0].get("hitsContainers", []) if not hits_containers: logging.warning("Search API returned no hitsContainers") self.next_page_token = None return documents container = hits_containers[0] total = container.get("total", 0) raw_hits = container.get("hits", []) # Deduplicate by effective item ID (driveId:itemId) to avoid the same # resource appearing multiple times across the result set. 
deduped_hits = [] seen_ids = set() for hit in raw_hits: resource = hit.get("resource", {}) item_id = resource.get("id") drive_id = resource.get("parentReference", {}).get("driveId") effective_id = f"{drive_id}:{item_id}" if drive_id and item_id else item_id if not effective_id or effective_id in seen_ids: continue seen_ids.add(effective_id) deduped_hits.append(hit) hits = deduped_hits logging.info( f"Search API returned {total} total results, {len(raw_hits)} raw hits, {len(hits)} unique hits in this batch" ) try: offset = int(page_token) if page_token is not None else 0 except (TypeError, ValueError): logging.warning( f"Invalid page_token '{page_token}' for shared items search, defaulting to 0" ) offset = 0 if offset < 0: offset = 0 if offset >= len(hits): self.next_page_token = None return documents end_index = offset + limit if limit else len(hits) end_index = min(end_index, len(hits)) for hit in hits[offset:end_index]: resource = hit.get("resource", {}) item_name = resource.get("name", "Unknown") item_id = resource.get("id") drive_id = resource.get("parentReference", {}).get("driveId") effective_id = f"{drive_id}:{item_id}" if drive_id and item_id else item_id is_folder = "folder" in resource if is_folder: doc_metadata = { "file_name": item_name, "mime_type": "folder", "size": resource.get("size"), "created_time": resource.get("createdDateTime"), "modified_time": resource.get("lastModifiedDateTime"), "source": "share_point", "is_folder": True, } documents.append( Document(text="", doc_id=effective_id, extra_info=doc_metadata) ) else: mime_type, supported = self._resolve_mime_type(resource) if not supported: logging.info( f"Skipping unsupported shared file: {item_name} (mime: {mime_type})" ) continue doc_metadata = { "file_name": item_name, "mime_type": mime_type, "size": resource.get("size"), "created_time": resource.get("createdDateTime"), "modified_time": resource.get("lastModifiedDateTime"), "source": "share_point", } content = "" if load_content: content = 
def _download_folder_recursive(self, folder_id: str, local_dir: str, recursive: bool = True) -> int:
    """Download every supported file under a folder into ``local_dir``.

    Args:
        folder_id: Item reference of the folder (see ``_get_item_url``).
        local_dir: Target directory; created if missing.
        recursive: When True, sub-folders are mirrored as sub-directories and
            downloaded as well; when False they are skipped.

    Returns:
        Number of files downloaded. On any error the error is logged and the
        count reached so far is returned.
    """
    files_downloaded = 0
    try:
        os.makedirs(local_dir, exist_ok=True)

        url = f"{self._get_item_url(folder_id)}/children"
        # Explicit query params belong to the first request only: an
        # ``@odata.nextLink`` URL already carries its own query string, and
        # re-sending ``$top`` would append a duplicate parameter to it.
        params = {'$top': 1000}

        while url:
            response = requests.get(url, headers=self._get_headers(), params=params)
            response.raise_for_status()

            results = response.json()
            items = results.get('value', [])
            logging.info(f"Found {len(items)} items in folder {folder_id}")

            for item in items:
                item_name = item.get('name', 'unknown')
                item_id = item.get('id')

                if 'folder' in item:
                    if recursive:
                        subfolder_path = os.path.join(local_dir, item_name)
                        os.makedirs(subfolder_path, exist_ok=True)
                        subfolder_files = self._download_folder_recursive(
                            item_id, subfolder_path, recursive
                        )
                        files_downloaded += subfolder_files
                        logging.info(f"Downloaded {subfolder_files} files from subfolder {item_name}")
                else:
                    success = self._download_single_file(item_id, local_dir)
                    if success:
                        files_downloaded += 1
                        logging.info(f"Downloaded file: {item_name}")
                    else:
                        logging.warning(f"Failed to download file: {item_name}")

            # Follow the server-provided continuation URL verbatim.
            url = results.get('@odata.nextLink')
            params = None

        return files_downloaded

    except Exception as e:
        logging.error(f"Error in _download_folder_recursive for folder {folder_id}: {e}", exc_info=True)
        return files_downloaded
files_downloaded = 0 try: folder_ids = config.get('folder_ids', []) file_ids = config.get('file_ids', []) recursive = config.get('recursive', True) if file_ids: if isinstance(file_ids, str): file_ids = [file_ids] for file_id in file_ids: if self._download_file_to_directory(file_id, local_dir): files_downloaded += 1 if folder_ids: if isinstance(folder_ids, str): folder_ids = [folder_ids] for folder_id in folder_ids: try: url = self._get_item_url(folder_id) params = {'$select': 'id,name'} response = requests.get(url, headers=self._get_headers(), params=params) response.raise_for_status() folder_metadata = response.json() folder_name = folder_metadata.get('name', '') folder_path = os.path.join(local_dir, folder_name) os.makedirs(folder_path, exist_ok=True) folder_files = self._download_folder_recursive( folder_id, folder_path, recursive ) files_downloaded += folder_files logging.info(f"Downloaded {folder_files} files from folder {folder_name}") except Exception as e: logging.error(f"Error downloading folder {folder_id}: {e}", exc_info=True) if not file_ids and not folder_ids: raise ValueError("No folder_ids or file_ids provided for download") return { "files_downloaded": files_downloaded, "directory_path": local_dir, "empty_result": files_downloaded == 0, "source_type": "share_point", "config_used": config } except Exception as e: return { "files_downloaded": files_downloaded, "directory_path": local_dir, "empty_result": True, "source_type": "share_point", "config_used": config, "error": str(e) } ================================================ FILE: application/parser/embedding_pipeline.py ================================================ import os import logging from typing import List, Any from retry import retry from tqdm import tqdm from application.core.settings import settings from application.vectorstore.vector_creator import VectorCreator def sanitize_content(content: str) -> str: """ Remove NUL characters that can cause vector store ingestion to fail. 
def embed_and_store_documents(docs: List[Any], folder_name: str, source_id: str, task_status: Any) -> None:
    """Embed documents and store them in the configured vector store.

    Args:
        docs: Documents to embed and store. The list itself is not modified.
        folder_name: Directory to save the vector store.
        source_id: Unique identifier for the source.
        task_status: Task state manager; ``update_state`` is called with the
            progress percentage before each document is added.

    Returns:
        None

    Raises:
        ValueError: If ``docs`` is empty.
        OSError: If the folder cannot be created or the final FAISS save fails.
        Exception: If vector store creation fails. A failure while embedding an
            individual document does NOT raise: it is logged, progress is
            saved, and embedding stops at that document.
    """
    # Ensure the folder exists (exist_ok avoids the check-then-create race).
    os.makedirs(folder_name, exist_ok=True)

    # Validate docs is not empty
    if not docs:
        raise ValueError("No documents to embed - check file format and extension")

    # Initialize vector store
    if settings.VECTOR_STORE == "faiss":
        # FAISS is seeded with the first document. Slice rather than pop so the
        # caller's list is left intact.
        docs_init = [docs[0]]
        remaining_docs = docs[1:]
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            docs_init=docs_init,
            source_id=source_id,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
    else:
        remaining_docs = list(docs)
        store = VectorCreator.create_vectorstore(
            settings.VECTOR_STORE,
            source_id=source_id,
            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
        )
        store.delete_index()

    total_docs = len(remaining_docs)
    embedding_failed = False

    # Process and embed documents
    for idx, doc in tqdm(
        enumerate(remaining_docs),
        desc="Embedding 🦖",
        unit="docs",
        total=total_docs,
        bar_format="{l_bar}{bar}| Time Left: {remaining}",
    ):
        try:
            # Update task status for progress tracking
            progress = int(((idx + 1) / total_docs) * 100)
            task_status.update_state(state="PROGRESS", meta={"current": progress})

            # Add document to vector store
            add_text_to_store_with_retry(store, doc, source_id)
        except Exception as e:
            logging.error(f"Error embedding document {idx}: {e}", exc_info=True)
            logging.info(f"Saving progress at document {idx} out of {total_docs}")
            try:
                store.save_local(folder_name)
                logging.info("Progress saved successfully")
            except Exception as save_error:
                logging.error(f"CRITICAL: Failed to save progress: {save_error}", exc_info=True)
            # Stop embedding here; the final save below is still attempted.
            embedding_failed = True
            break

    if embedding_failed:
        logging.warning(
            "Embedding stopped early; the vector store only contains documents embedded before the failure"
        )

    # Save the vector store
    if settings.VECTOR_STORE == "faiss":
        try:
            store.save_local(folder_name)
            logging.info("Vector store saved successfully.")
        except Exception as e:
            logging.error(f"CRITICAL: Failed to save final vector store: {e}", exc_info=True)
            raise OSError(f"Unable to save vector store to {folder_name}: {e}") from e
    else:
        logging.info("Vector store saved successfully.")
class AudioParser(BaseParser):
    """Parser that transcribes audio files through the configured STT provider."""

    def __init__(self, parser_config=None):
        """Initialise the base parser and an empty per-file metadata cache."""
        super().__init__(parser_config=parser_config)
        # Keyed by ``str(file)``; written by parse_file, read by get_file_metadata.
        self._transcript_metadata: Dict[str, Dict] = {}

    def _init_parser(self) -> Dict:
        """Return an empty parser config."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Transcribe ``file`` and return the transcript text.

        Metadata about the transcript (duration, language, provider and, when
        present, segments) is cached for ``get_file_metadata``; entries whose
        value is ``None``, ``[]`` or ``{}`` are left out.

        Args:
            file: Path of the audio file.
            errors: Unused; accepted for interface compatibility.

        Returns:
            The ``"text"`` entry of the transcription result, or ``""`` when absent.
        """
        _ = errors

        try:
            size_in_bytes = file.stat().st_size
            enforce_audio_file_size_limit(size_in_bytes)
        except OSError:
            # An OSError here (e.g. from stat()) skips the size check; the
            # transcription is still attempted.
            pass

        transcription = STTCreator.create_stt(settings.STT_PROVIDER).transcribe(
            file,
            language=settings.STT_LANGUAGE,
            timestamps=settings.STT_ENABLE_TIMESTAMPS,
            diarize=settings.STT_ENABLE_DIARIZATION,
        )

        metadata_fields = [
            ("transcript_duration_s", transcription.get("duration_s")),
            ("transcript_language", transcription.get("language")),
            ("transcript_provider", transcription.get("provider")),
        ]
        if transcription.get("segments"):
            metadata_fields.append(("transcript_segments", transcription["segments"]))

        kept_metadata: Dict = {}
        for key, value in metadata_fields:
            if value in (None, [], {}):
                continue
            kept_metadata[key] = value
        self._transcript_metadata[str(file)] = kept_metadata

        return transcription.get("text", "")

    def get_file_metadata(self, file: Path) -> Dict:
        """Return the cached transcript metadata for ``file`` (empty dict if none)."""
        cache_key = str(file)
        return self._transcript_metadata.get(cache_key, {})
class BaseParser:
    """Common base for file parsers: holds an optional config and the parse hooks."""

    def __init__(self, parser_config: Optional[Dict] = None):
        """Store the (possibly absent) parser config."""
        self._parser_config = parser_config

    def init_parser(self) -> None:
        """Build the config via ``_init_parser`` and keep it on the instance."""
        self._parser_config = self._init_parser()

    @property
    def parser_config_set(self) -> bool:
        """True once a config has been provided or initialised."""
        return not (self._parser_config is None)

    @property
    def parser_config(self) -> Dict:
        """Return the stored parser config.

        Raises:
            ValueError: If no config has been set yet.
        """
        config = self._parser_config
        if config is not None:
            return config
        raise ValueError("Parser config not set.")

    @abstractmethod
    def _init_parser(self) -> Dict:
        """Create and return the parser config; implemented by subclasses."""

    @abstractmethod
    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse ``file`` and return its text; implemented by subclasses."""

    def get_file_metadata(self, file: Path) -> Dict:
        """Return parser-specific metadata for ``file``; the base class has none."""
        del file  # unused in the base implementation
        return {}
def get_default_file_extractor(
    ocr_enabled: Optional[bool] = None,
) -> Dict[str, BaseParser]:
    """Build the default mapping from file extension to parser instance.

    The docling-based parsers are preferred. If importing them (or anything
    else while the docling mapping is being built) raises ``ImportError``, a
    warning is logged and a mapping of the standard parsers is returned.

    Args:
        ocr_enabled: Whether docling should OCR PDFs and images. ``None``
            means "use ``settings.DOCLING_OCR_ENABLED``".

    Returns:
        Dict[str, BaseParser]: lower-case extension (with leading dot) to a
        parser instance.
    """
    try:
        from application.parser.file.docling_parser import (
            DoclingPDFParser,
            DoclingDocxParser,
            DoclingPPTXParser,
            DoclingXLSXParser,
            DoclingHTMLParser,
            DoclingImageParser,
            DoclingCSVParser,
            DoclingAsciiDocParser,
            DoclingVTTParser,
            DoclingXMLParser,
        )

        use_ocr = settings.DOCLING_OCR_ENABLED if ocr_enabled is None else ocr_enabled

        extractors: Dict[str, BaseParser] = {
            # Documents
            ".pdf": DoclingPDFParser(ocr_enabled=use_ocr),
            ".docx": DoclingDocxParser(),
            ".pptx": DoclingPPTXParser(),
            ".xlsx": DoclingXLSXParser(),
            # Web formats
            ".html": DoclingHTMLParser(),
            ".xhtml": DoclingHTMLParser(),
            # Data formats (JSON keeps its specialised parser)
            ".csv": DoclingCSVParser(),
            ".json": JSONParser(),
            # Text/markup formats (markdown and rst keep their own parsers)
            ".md": MarkdownParser(),
            ".mdx": MarkdownParser(),
            ".rst": RstParser(),
            ".adoc": DoclingAsciiDocParser(),
            ".asciidoc": DoclingAsciiDocParser(),
        }

        # Images: docling is only used when OCR is on; every extension gets
        # its own parser instance.
        for image_ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".webp"):
            if use_ocr:
                extractors[image_ext] = DoclingImageParser(ocr_enabled=use_ocr)
            else:
                extractors[image_ext] = ImageParser()

        # Media/subtitles
        extractors[".vtt"] = DoclingVTTParser()
        extractors.update(_build_audio_parser_mapping())
        # Specialized XML formats
        extractors[".xml"] = DoclingXMLParser()
        # Formats docling doesn't support - use standard parsers
        extractors[".epub"] = EpubParser()
        return extractors

    except ImportError:
        logging.warning(
            "docling is not installed. Using standard parsers. "
            "For advanced document parsing, install with: pip install docling"
        )

    # Fallback to standard parsers
    fallback: Dict[str, BaseParser] = {
        ".pdf": PDFParser(),
        ".docx": DocxParser(),
        ".csv": PandasCSVParser(),
        ".xlsx": ExcelParser(),
        ".epub": EpubParser(),
        ".md": MarkdownParser(),
        ".rst": RstParser(),
        ".html": HTMLParser(),
        ".mdx": MarkdownParser(),
        ".json": JSONParser(),
        ".pptx": PPTXParser(),
    }
    for image_ext in (".png", ".jpg", ".jpeg"):
        fallback[image_ext] = ImageParser()
    fallback.update(_build_audio_parser_mapping())
    return fallback
def _add_files(self, input_dir: Path) -> List[Path]:
    """Collect the files under ``input_dir`` that this reader should load.

    Entries are visited in sorted order. Files of the current directory come
    first, followed by the files of each sub-directory (depth-first) when
    ``self.recursive`` is set.

    Hidden entries (names starting with ".") are skipped when
    ``self.exclude_hidden`` is set. This applies to directories as well as
    files, so dot-directories such as ``.git`` are not descended into; this
    matches what ``build_directory_structure`` reports.

    Args:
        input_dir: Directory to scan.

    Returns:
        List[Path]: matching files, truncated to ``self.num_files_limit``
        when that limit is a positive integer.
    """
    collected: List[Path] = []
    subdirs: List[Path] = []

    for entry in sorted(input_dir.iterdir()):
        # The hidden check must come first: checking is_dir() first would
        # queue hidden directories for recursion.
        if self.exclude_hidden and entry.name.startswith("."):
            continue
        if entry.is_dir():
            if self.recursive:
                subdirs.append(entry)
            continue
        if (
            self.required_exts is not None
            and entry.suffix.lower() not in self.required_exts
        ):
            continue
        collected.append(entry)

    for subdir in subdirs:
        collected.extend(self._add_files(subdir))

    if self.num_files_limit is not None and self.num_files_limit > 0:
        collected = collected[: self.num_files_limit]

    logging.debug(
        f"> [SimpleDirectoryReader] Total files added: {len(collected)}"
    )
    return collected
def build_directory_structure(self, base_path):
    """Describe the directory tree under ``base_path`` as nested dicts.

    Directories map to a dict of their children; directories that end up
    with no children are left out. Files map to
    ``{"type": <mime type>, "size_bytes": <size>}`` plus ``"token_count"``
    when ``self.file_token_counts`` has an entry for the file's resolved
    path. Hidden entries are skipped when ``self.exclude_hidden`` is set,
    and files are filtered by ``self.required_exts`` when it is not None.

    Args:
        base_path: The base path to start building the structure from.

    Returns:
        dict: A nested dictionary representing the directory structure.
    """
    import mimetypes

    def describe(entry):
        """Build the info dict for one file."""
        resolved = str(entry.resolve())
        size = entry.stat().st_size
        guessed = mimetypes.guess_type(entry.name)[0]
        info = {
            "type": guessed or "application/octet-stream",
            "size_bytes": size,
        }
        if hasattr(self, 'file_token_counts') and resolved in self.file_token_counts:
            info["token_count"] = self.file_token_counts[resolved]
        return info

    def walk(directory):
        """Recursively build the tree for one directory."""
        tree = {}
        for entry in directory.iterdir():
            name = entry.name
            if self.exclude_hidden and name.startswith('.'):
                continue
            if entry.is_dir():
                children = walk(entry)
                if children:
                    tree[name] = children
                continue
            if self.required_exts is not None and entry.suffix.lower() not in self.required_exts:
                continue
            tree[name] = describe(entry)
        return tree

    return walk(Path(base_path))
*SUPPORTED_SOURCE_DOCUMENT_EXTENSIONS, *SUPPORTED_SOURCE_IMAGE_EXTENSIONS, *SUPPORTED_AUDIO_EXTENSIONS, ) ================================================ FILE: application/parser/file/docling_parser.py ================================================ """Docling parser. Uses docling library for advanced document parsing with layout detection, table structure recognition, and unified document representation. Supports: PDF, DOCX, PPTX, XLSX, HTML, XHTML, CSV, Markdown, AsciiDoc, images (PNG, JPEG, TIFF, BMP, WEBP), WebVTT, and specialized XML formats. """ import importlib.util import logging from pathlib import Path from typing import Dict, List, Optional, Union from application.parser.file.base_parser import BaseParser logger = logging.getLogger(__name__) class DoclingParser(BaseParser): """Parser using docling for advanced document processing. Docling provides: - Advanced PDF layout analysis - Table structure recognition - Reading order detection - OCR for scanned documents (supports RapidOCR) - Unified DoclingDocument format - Export to Markdown Uses hybrid OCR approach by default: - Text regions: Direct PDF text extraction (fast) - Bitmap/image regions: OCR only these areas (smart) """ def __init__( self, ocr_enabled: bool = True, table_structure: bool = True, export_format: str = "markdown", use_rapidocr: bool = True, ocr_languages: Optional[List[str]] = None, force_full_page_ocr: bool = False, ): """Initialize DoclingParser. 
def _create_converter(self):
    """Build the docling ``DocumentConverter`` used by this parser.

    One ``PdfPipelineOptions`` object is created with ``do_ocr`` taken from
    ``self.ocr_enabled`` and ``do_table_structure`` from
    ``self.table_structure``. When OCR is enabled and
    ``self._get_ocr_options()`` returns something other than ``None``, that
    value is assigned to the pipeline's ``ocr_options``. The same pipeline
    options are then used for both the PDF and the image input formats.

    Returns:
        DocumentConverter instance
    """
    from docling.document_converter import (
        DocumentConverter,
        ImageFormatOption,
        InputFormat,
        PdfFormatOption,
    )
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    options = PdfPipelineOptions(
        do_ocr=self.ocr_enabled,
        do_table_structure=self.table_structure,
    )

    # Leave docling's own OCR defaults alone unless OCR is on and a custom
    # engine configuration is available.
    custom_ocr = self._get_ocr_options() if self.ocr_enabled else None
    if custom_ocr is not None:
        options.ocr_options = custom_ocr

    per_format = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=options),
        InputFormat.IMAGE: ImageFormatOption(pipeline_options=options),
    }
    return DocumentConverter(format_options=per_format)
def _get_ocr_options(self):
    """Return the OCR engine options to hand to docling, if any.

    Returns ``None`` when ``self.use_rapidocr`` is falsy. Otherwise tries to
    build ``RapidOcrOptions`` from ``self.ocr_languages`` and
    ``self.force_full_page_ocr``. A failed import is logged as a warning and
    any other failure as an error; in both cases ``None`` is returned so the
    caller keeps docling's defaults.
    """
    options = None
    if self.use_rapidocr:
        try:
            from docling.datamodel.pipeline_options import RapidOcrOptions

            options = RapidOcrOptions(
                lang=self.ocr_languages,
                force_full_page_ocr=self.force_full_page_ocr,
            )
        except ImportError as import_err:
            logger.warning(f"Failed to import RapidOcrOptions: {import_err}")
        except Exception as build_err:
            logger.error(f"Error creating RapidOcrOptions: {build_err}")
    return options
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
    """Convert ``file`` with docling and return the exported content.

    The converter is created lazily on first use. Initialisation goes through
    ``init_parser()`` (not ``_init_parser()`` directly) so the returned
    configuration is stored on the instance. ``parser_config_set`` then
    becomes True and callers that check it before parsing do not build a
    second converter.

    Args:
        file: Path to the file to parse.
        errors: When ``"ignore"``, a conversion failure is logged and
            reported as a bracketed error string instead of being raised.
            Any other value re-raises the original exception.

    Returns:
        The content produced by ``self._export_content`` for the converted
        document, or the error string described above.
    """
    logger.info(f"parse_file called for: {file}")

    if self._converter is None:
        self.init_parser()

    try:
        logger.info(f"Converting file with hybrid OCR: {file}")
        result = self._converter.convert(str(file))
        content = self._export_content(result.document)
        logger.info(f"Parse complete, content length: {len(content)} chars")
        return content
    except Exception as e:
        logger.error(f"Error parsing file with docling: {e}", exc_info=True)
        if errors == "ignore":
            return f"[Error parsing file with docling: {str(e)}]"
        raise
class DoclingImageParser(DoclingParser):
    """Docling-based parser for image files.

    A thin preset over ``DoclingParser``: it always exports markdown and,
    unlike the base class default, turns ``force_full_page_ocr`` on unless
    the caller says otherwise.
    """

    def __init__(
        self,
        ocr_enabled: bool = True,
        use_rapidocr: bool = True,
        ocr_languages: Optional[List[str]] = None,
        force_full_page_ocr: bool = True,
    ):
        """Forward the OCR settings to ``DoclingParser`` with markdown export.

        Args:
            ocr_enabled: Passed through as ``ocr_enabled``.
            use_rapidocr: Passed through as ``use_rapidocr``.
            ocr_languages: Passed through as ``ocr_languages``.
            force_full_page_ocr: Passed through as ``force_full_page_ocr``;
                defaults to True here.
        """
        base_kwargs = dict(
            ocr_enabled=ocr_enabled,
            export_format="markdown",
            use_rapidocr=use_rapidocr,
            ocr_languages=ocr_languages,
            force_full_page_ocr=force_full_page_ocr,
        )
        super().__init__(**base_kwargs)
class PDFParser(BaseParser):
    """PDF parser.

    Extracts text locally with ``pypdf``, or - when
    ``settings.PARSE_PDF_AS_IMAGE`` is set - uploads the file to the remote
    doc2md service and returns the markdown it sends back.
    """

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse a PDF file into text.

        Args:
            file: Path to the PDF file.
            errors: Accepted for interface compatibility; not used here.

        Returns:
            str: The ``"markdown"`` field of the service response in remote
            mode, otherwise the page texts joined with newlines.

        Raises:
            ValueError: if ``pypdf`` is not installed (local mode).
            requests.HTTPError: if the remote service answers with an error
                status (remote mode).
        """
        if settings.PARSE_PDF_AS_IMAGE:
            doc2md_service = "https://llm.arc53.com/doc2md"
            # alternatively you can use local vision capable LLM
            with open(file, "rb") as file_loaded:
                files = {'file': file_loaded}
                response = requests.post(doc2md_service, files=files)
            # Report HTTP failures as such instead of as a confusing
            # JSON/KeyError further down.
            response.raise_for_status()
            return response.json()["markdown"]

        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ValueError("pypdf is required to read PDF files.") from exc

        with open(file, "rb") as fp:
            pdf = PdfReader(fp)
            # Extraction reads from the open stream, so it stays inside the
            # ``with``. A page that yields no text contributes an empty string.
            text_list = [page.extract_text() or "" for page in pdf.pages]

        return "\n".join(text_list)
class HTMLParser(BaseParser):
    """HTML parser backed by LangChain's ``BSHTMLLoader``."""

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Parse an HTML file into a list of text strings.

        ``BSHTMLLoader.load()`` yields LangChain ``Document`` objects. Their
        ``page_content`` is returned so callers get plain text, as the return
        annotation promises, rather than objects whose ``str()`` is a
        ``page_content=... metadata=...`` repr.

        Args:
            file: Path to the HTML file.
            errors: Accepted for interface compatibility; not used here.

        Returns:
            list[str]: The text content of each loaded document.
        """
        from langchain_community.document_loaders import BSHTMLLoader

        loader = BSHTMLLoader(file)
        return [document.page_content for document in loader.load()]
class JSONParser(BaseParser):
    r"""JSON (.json) parser.

    Parses JSON files into a list of strings or a concatenated document.
    A top-level array contributes one row per element; any other top-level
    value (object, string, number, ...) is treated as a single row.

    Args:
        concat_rows (bool): Whether to concatenate all rows into one document.
            If set to False, one string is returned per row.
            True by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        json_config (dict): Extra keyword arguments passed to `json.load`.
            Defaults to no extra arguments.
    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        json_config: Union[dict, None] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        # A fresh dict per instance; a ``{}`` default in the signature would
        # be one object shared by every parser.
        self._json_config = json_config if json_config is not None else {}

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse JSON file.

        Args:
            file: Path to the JSON file (read as UTF-8).
            errors: Accepted for interface compatibility; not used here.

        Returns:
            The rows rendered with ``str()``: joined by ``row_joiner`` when
            ``concat_rows`` is True, otherwise as a list of strings.
        """
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f, **self._json_config)

        # Only a JSON array is a sequence of rows. Wrapping everything else
        # keeps a top-level string from being split into characters and a
        # number from raising on iteration.
        if not isinstance(data, list):
            data = [data]

        rows = [str(item) for item in data]
        if self._concat_rows:
            return self._row_joiner.join(rows)
        return rows
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
    """Split markdown text into ``(header, text)`` tuples.

    A line matching ``^#+\\s`` starts a new section. The text collected so
    far is handed to ``self.tups_chunk_append`` under the previous header,
    and the final section is always emitted.

    - A header with no text under it is not emitted, but the next header
      still takes over, so following text is filed under its own heading.
    - Non-blank text before the first header is kept with a ``None`` header.
    - In the returned tuples only the heading markers (leading ``#`` run and
      an optional closing ``#`` run) are stripped from headers, so a title
      such as "C# guide" keeps its ``#``.

    When the text contains at least one header, ``<...>`` tags are removed
    from the section texts. When it contains none, the single ``None``-keyed
    entry has its newlines removed instead.

    Args:
        markdown_text: Raw markdown content.

    Returns:
        List of ``(header or None, text)`` tuples in document order.
    """
    sections: List[Tuple[Optional[str], str]] = []
    current_header: Optional[str] = None
    current_text = ""

    for line in markdown_text.split("\n"):
        if re.match(r"^#+\s", line):
            if current_header is None:
                # Preamble: keep it only if it holds something visible.
                has_content = bool(current_text.strip())
            else:
                has_content = current_text != ""
            if has_content:
                sections = self.tups_chunk_append(
                    sections, current_header, current_text
                )
            # Always advance to the new header, even after an empty section.
            current_header = line
            current_text = ""
        else:
            current_text += line + "\n"
    sections = self.tups_chunk_append(sections, current_header, current_text)

    if current_header is None:
        return [(key, re.sub("\n", "", value)) for key, value in sections]

    def clean_header(raw: Optional[str]) -> Optional[str]:
        """Strip heading markers; leave ``None`` (preamble) untouched."""
        if raw is None:
            return None
        return re.sub(r"^#+\s+|\s+#+\s*$", "", raw).strip()

    return [
        (clean_header(key), re.sub(r"<.*?>", "", value))
        for key, value in sections
    ]
class OpenAPI3Parser(BaseParser):
    """Parser that summarises an OpenAPI 3 specification as plain text."""

    def init_parser(self) -> None:
        """Initialise the parser via the base class implementation."""
        return super().init_parser()

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    def get_base_urls(self, urls):
        """Return the distinct ``scheme://netloc`` prefixes of ``urls``.

        Order of first appearance is preserved.
        """
        prefixes = (
            parsed.scheme + "://" + parsed.netloc
            for parsed in map(urlparse, urls)
        )
        return list(dict.fromkeys(prefixes))

    def get_info_from_paths(self, path):
        """Describe the operations of one path as ``\\nMETHOD=description`` lines.

        The description is taken from the operation's first response. An
        operation without any responses gets an empty description instead of
        raising ``IndexError``.
        """
        info = ""
        if path.operations:
            for operation in path.operations:
                responses = operation.responses
                description = responses[0].description if responses else ""
                info += f"\n{operation.method.value}={description}"
        return info

    def parse_file(self, file_path, errors: str = "ignore"):
        """Parse an OpenAPI 3 document into a text summary.

        Args:
            file_path: Location of the specification; converted with
                ``str()`` before being handed to ``openapi_parser.parse``.
            errors: Accepted so the method can be called like
                ``BaseParser.parse_file``; not used here.

        Returns:
            str: A "Base URL:" line followed by one block per path with its
            URL, description, parameters and methods.
        """
        data = parse(str(file_path))
        base_urls = ",".join(self.get_base_urls(link.url for link in data.servers))

        parts = [f"Base URL:{base_urls}\n"]
        for index, path in enumerate(data.paths, start=1):
            info = self.get_info_from_paths(path)
            parts.append(
                f"Path{index}: {path.url}\n"
                f"description: {path.description}\n"
                f"parameters: {path.parameters}\nmethods: {info}\n"
            )
        # The summary is only returned; no copy is written to the working
        # directory.
        return "".join(parts)
class PPTXParser(BaseParser):
    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.

    Args:
        concat_slides (bool): Specifies whether to concatenate all slide text
            into one document.
            - If True, slide texts will be joined together as a single string.
            - If False, each slide's text will be stored as a separate entry
              in a list.
            Set to True by default.
        slide_separator (str): Separator used to join slides' text content.
            Only used when `concat_slides=True`. Default is "\n".

    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
    """

    def __init__(
        self,
        *args: Any,
        concat_slides: bool = True,
        slide_separator: str = "\n",
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_slides = concat_slides
        self._slide_separator = slide_separator

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        r"""Parse a .pptx file and extract text from each slide.

        Within a slide, the text of each shape that exposes a ``text``
        attribute is collected; non-empty shape texts are joined with a
        newline so a title does not run into the body text.

        Args:
            file (Path): Path to the .pptx file.
            errors (str): Accepted for interface compatibility; not used here.

        Returns:
            Union[str, List[str]]: Concatenated text if concat_slides is True,
            otherwise a list of slide texts.

        Raises:
            ImportError: if the ``pptx`` module is not installed.
        """
        try:
            from pptx import Presentation
        except ImportError as exc:
            raise ImportError("pptx module is required to read .PPTX files.") from exc

        presentation = Presentation(file)
        slide_texts = []
        for slide in presentation.slides:
            shape_texts = [
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            ]
            slide_text = "\n".join(text for text in shape_texts if text)
            slide_texts.append(slide_text.strip())

        if self._concat_slides:
            return self._slide_separator.join(slide_texts)
        return slide_texts
""" def __init__( self, *args: Any, remove_hyperlinks: bool = True, remove_images: bool = True, remove_table_excess: bool = True, remove_interpreters: bool = True, remove_directives: bool = True, remove_whitespaces_excess: bool = True, # Be careful with remove_characters_excess, might cause data loss remove_characters_excess: bool = True, **kwargs: Any, ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images self._remove_table_excess = remove_table_excess self._remove_interpreters = remove_interpreters self._remove_directives = remove_directives self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]: """Convert a reStructuredText file to a dictionary. The keys are the headers and the values are the text under each header. """ rst_tups: List[Tuple[Optional[str], str]] = [] lines = rst_text.split("\n") current_header = None current_text = "" for i, line in enumerate(lines): header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line) if header_match and i > 0 and ( len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): if current_header is not None: if current_text == "" or None: continue # removes the next heading from current Document if current_text.endswith(lines[i - 1] + "\n"): current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")] rst_tups.append((current_header, current_text)) current_header = lines[i - 1] current_text = "" else: current_text += line + "\n" rst_tups.append((current_header, current_text)) # TODO: Format for rst # # if current_header is not None: # # pass linting, assert keys are defined # rst_tups = [ # (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) # for key, value in rst_tups # ] # else: # rst_tups = [ # (key, re.sub("\n", "", value)) for 
def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
    """Split ``text`` into chunks of roughly ``max_tokens`` tokens.

    Tokens are approximated as 5 characters each, so a chunk holds at most
    ``max_tokens * 5`` characters. A chunk that is not the last one is cut
    back to its last space so words are not split; the next chunk resumes
    exactly at that cut, so no text is lost between chunks.

    Args:
        text: The text to split.
        max_tokens: Approximate token budget per chunk; must be positive.

    Returns:
        List[str]: The whitespace-stripped chunks, in order. Empty input
        yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    avg_token_length = 5
    chunk_size = max_tokens * avg_token_length
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            # Prefer a word boundary, but only one that still makes progress
            # (a space at `start` itself would produce an empty chunk).
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        chunks.append(text[start:end].strip())
        # Resume at the cut rather than at a fixed stride, so the words after
        # the last space are carried into the next chunk instead of dropped.
        start = end
    return chunks
config.""" return {} def parse_tups( self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000 ) -> List[Tuple[Optional[str], str]]: """Parse file into tuples.""" with open(filepath, "r") as f: content = f.read() if self._remove_hyperlinks: content = self.remove_hyperlinks(content) if self._remove_images: content = self.remove_images(content) if self._remove_table_excess: content = self.remove_table_excess(content) if self._remove_directives: content = self.remove_directives(content) if self._remove_interpreters: content = self.remove_interpreters(content) rst_tups = self.rst_to_tups(content) if self._remove_whitespaces_excess: rst_tups = self.remove_whitespaces_excess(rst_tups) if self._remove_characters_excess: rst_tups = self.remove_characters_excess(rst_tups) # Apply chunking if max_tokens is provided if max_tokens is not None: chunked_tups = [] for header, text in rst_tups: chunks = self.chunk_by_token_count(text, max_tokens) for idx, chunk in enumerate(chunks): chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk)) return chunked_tups return rst_tups def parse_file( self, filepath: Path, errors: str = "ignore" ) -> Union[str, List[str]]: """Parse file into string.""" tups = self.parse_tups(filepath, errors=errors) results = [] # TODO: don't include headers right now for header, value in tups: if header is None: results.append(value) else: results.append(f"\n\n{header}\n{value}") return results ================================================ FILE: application/parser/file/tabular_parser.py ================================================ """Tabular parser. Contains parsers for tabular data files. """ from pathlib import Path from typing import Any, Dict, List, Union from application.parser.file.base_parser import BaseParser class CSVParser(BaseParser): """CSV parser. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. 
""" def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: """Parse file. Returns: Union[str, List[str]]: a string or a List of strings. """ try: import csv except ImportError: raise ValueError("csv module is required to read CSV files.") text_list = [] with open(file, "r") as fp: csv_reader = csv.reader(fp) for row in csv_reader: text_list.append(", ".join(row)) if self._concat_rows: return "\n".join(text_list) else: return text_list class PandasCSVParser(BaseParser): r"""Pandas-based CSV parser. Parses CSVs using the separator detection from Pandas `read_csv`function. If special parameters are required, use the `pandas_config` dict. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. col_joiner (str): Separator to use for joining cols per row. Set to ", " by default. row_joiner (str): Separator to use for joining each row. Only used when `concat_rows=True`. Set to "\n" by default. pandas_config (dict): Options for the `pandas.read_csv` function call. Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html for more information. Set to empty dict by default, this means pandas will try to figure out the separators, table head, etc. on its own. header_period (int): Controls how headers are included in output: - 0: Headers only at the beginning - 1: Headers in every row - N > 1: Headers every N rows header_prefix (str): Prefix for header rows. Default is "HEADERS: ". 
""" def __init__( self, *args: Any, concat_rows: bool = True, col_joiner: str = ", ", row_joiner: str = "\n", pandas_config: dict = {}, header_period: int = 20, header_prefix: str = "HEADERS: ", **kwargs: Any ) -> None: """Init params.""" super().__init__(*args, **kwargs) self._concat_rows = concat_rows self._col_joiner = col_joiner self._row_joiner = row_joiner self._pandas_config = pandas_config self._header_period = header_period self._header_prefix = header_prefix def _init_parser(self) -> Dict: """Init parser.""" return {} def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: """Parse file.""" try: import pandas as pd except ImportError: raise ValueError("pandas module is required to read CSV files.") df = pd.read_csv(file, **self._pandas_config) headers = df.columns.tolist() header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}" if not self._concat_rows: return df.apply( lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 ).tolist() text_list = [] if self._header_period != 1: text_list.append(header_row) for i, row in df.iterrows(): if (self._header_period > 1 and i > 0 and i % self._header_period == 0): text_list.append(header_row) text_list.append(self._col_joiner.join(row.astype(str).tolist())) if self._header_period == 1 and i < len(df) - 1: text_list.append(header_row) return self._row_joiner.join(text_list) class ExcelParser(BaseParser): r"""Excel (.xlsx) parser. Parses Excel files using Pandas `read_excel` function. If special parameters are required, use the `pandas_config` dict. Args: concat_rows (bool): whether to concatenate all rows into one document. If set to False, a Document will be created for each row. True by default. col_joiner (str): Separator to use for joining cols per row. Set to ", " by default. row_joiner (str): Separator to use for joining each row. Only used when `concat_rows=True`. Set to "\n" by default. 
def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
    """Parse an Excel file with pandas and render it as text.

    Args:
        file: Path or buffer accepted by ``pandas.read_excel``.
        errors: Accepted for signature parity with the other parsers;
            not used here.

    Returns:
        Union[str, List[str]]: With ``concat_rows`` disabled, one string per
        data row. Otherwise a single string in which the header row is
        emitted before the first data row and then again every
        ``header_period`` rows (``1`` puts it before every row, ``0`` or a
        negative value only at the top). A sheet without data rows yields
        just the header row.

    Raises:
        ValueError: If pandas is not installed.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ValueError("pandas module is required to read Excel files.")

    df = pd.read_excel(file, **self._pandas_config)

    # Spreadsheet column labels are often numbers or dates, not strings.
    headers = [str(column) for column in df.columns]
    header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"

    rows = [
        self._col_joiner.join(row.astype(str).tolist()) for _, row in df.iterrows()
    ]
    if not self._concat_rows:
        return rows

    period = self._header_period
    text_list = []
    # Count positions ourselves: the labels yielded by iterrows() are index
    # values, which need not be consecutive integers.
    for position, row_text in enumerate(rows):
        if position == 0 or (period > 0 and position % period == 0):
            text_list.append(header_row)
        text_list.append(row_text)
    if not text_list:
        text_list.append(header_row)
    return self._row_joiner.join(text_list)
def load_data(self, inputs):
    """Crawl pages reachable from a start URL and load them as Documents.

    Args:
        inputs: The start URL, or a non-empty list whose first item is it.

    Returns:
        list: One ``Document`` per document produced by the loader for every
        successfully fetched page. Empty when the start URL is rejected by
        ``validate_url`` or has no host.

    Pages are visited breadth-first in discovery order. Only links with the
    same scheme and host as the start URL are followed, and at most
    ``self.limit`` URLs are attempted (``None`` means no limit).
    """
    # Function-scope import: the file only imports urlparse and urljoin.
    from urllib.parse import urldefrag

    url = inputs
    if isinstance(url, list) and url:
        url = url[0]

    # Validate URL to prevent SSRF attacks
    try:
        url = validate_url(url)
    except SSRFError as e:
        logging.error(f"URL validation failed: {e}")
        return []

    start = urlparse(url)
    if not start.hostname:
        logging.error(f"URL validation failed: no host in {url}")
        return []
    # Compare parsed components rather than searching for a substring, so
    # that e.g. "https://evil.test/?next=https://site" is not same-site.
    base_site = (start.scheme, start.hostname)

    visited_urls = set()
    urls_to_visit = [url]
    queued = {url}  # everything ever enqueued, for O(1) duplicate checks
    loaded_content = []

    while urls_to_visit:
        current_url = urls_to_visit.pop(0)
        visited_urls.add(current_url)
        page_html = None

        try:
            # Validate each URL before making requests
            validate_url(current_url)
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()

            loader = self.loader([current_url])
            docs = loader.load()
            # Convert the loaded documents to the project's Document schema
            for doc in docs:
                metadata = dict(doc.metadata or {})
                source_url = metadata.get("source") or current_url
                metadata["file_path"] = self._url_to_virtual_path(source_url)
                loaded_content.append(
                    Document(doc.page_content, extra_info=metadata)
                )
            page_html = response.text
        except SSRFError as e:
            logging.warning(
                f"Skipping URL due to validation failure: {current_url} - {e}"
            )
        except Exception as e:
            logging.error(f"Error processing URL {current_url}: {e}", exc_info=True)

        if page_html is not None:
            soup = BeautifulSoup(page_html, "html.parser")
            for anchor in soup.find_all("a", href=True):
                try:
                    # Drop "#fragment" so in-page anchors of one document do
                    # not each count as a separate page against the limit.
                    link = urldefrag(urljoin(current_url, anchor["href"])).url
                    target = urlparse(link)
                    same_site = (target.scheme, target.hostname) == base_site
                except ValueError:
                    # Malformed href (e.g. broken IPv6 literal): ignore it.
                    continue
                if same_site and link not in queued:
                    queued.add(link)
                    urls_to_visit.append(link)

        # Stop crawling once the page budget is used up; failed attempts
        # count too, so a site full of broken links cannot bypass the limit.
        if self.limit is not None and len(visited_urls) >= self.limit:
            break

    return loaded_content
Examples: https://docs.docsgpt.cloud/ -> index.md https://docs.docsgpt.cloud/guides/setup -> guides/setup.md https://docs.docsgpt.cloud/guides/setup/ -> guides/setup.md https://example.com/page.html -> page.md """ parsed = urlparse(url) path = parsed.path.strip("/") if not path: return "index.md" # Remove common file extensions and add .md base, ext = os.path.splitext(path) if ext.lower() in [".html", ".htm", ".php", ".asp", ".aspx", ".jsp"]: path = base if not path.endswith(".md"): path = f"{path}.md" return path ================================================ FILE: application/parser/remote/crawler_markdown.py ================================================ import requests from urllib.parse import urlparse, urljoin from bs4 import BeautifulSoup from application.parser.remote.base import BaseRemote from application.core.url_validation import validate_url, SSRFError import re from markdownify import markdownify from application.parser.schema.base import Document import tldextract import os class CrawlerLoader(BaseRemote): def __init__(self, limit=10, allow_subdomains=False): """ Given a URL crawl web pages up to `self.limit`, convert HTML content to Markdown, and returning a list of Document objects. :param limit: The maximum number of pages to crawl. :param allow_subdomains: If True, crawl pages on subdomains of the base domain. 
""" self.limit = limit self.allow_subdomains = allow_subdomains self.session = requests.Session() def load_data(self, inputs): url = inputs if isinstance(url, list) and url: url = url[0] # Validate URL to prevent SSRF attacks try: url = validate_url(url) except SSRFError as e: print(f"URL validation failed: {e}") return [] # Keep track of visited URLs to avoid revisiting the same page visited_urls = set() # Determine the base domain for link filtering using tldextract base_domain = self._get_base_domain(url) urls_to_visit = {url} documents = [] while urls_to_visit: current_url = urls_to_visit.pop() # Skip if already visited if current_url in visited_urls: continue visited_urls.add(current_url) # Fetch the page content html_content = self._fetch_page(current_url) if html_content is None: continue # Convert the HTML to Markdown for cleaner text formatting title, language, processed_markdown = self._process_html_to_markdown(html_content, current_url) if processed_markdown: # Generate virtual file path from URL for consistent file-like matching virtual_path = self._url_to_virtual_path(current_url) # Create a Document for each visited page documents.append( Document( processed_markdown, # content None, # doc_id None, # embedding { "source": current_url, "title": title, "language": language, "file_path": virtual_path, }, # extra_info ) ) # Extract links and filter them according to domain rules new_links = self._extract_links(html_content, current_url) filtered_links = self._filter_links(new_links, base_domain) # Add any new, not-yet-visited links to the queue urls_to_visit.update(link for link in filtered_links if link not in visited_urls) # If we've reached the limit, stop crawling if self.limit is not None and len(visited_urls) >= self.limit: break return documents def _fetch_page(self, url): try: # Validate URL before fetching to prevent SSRF validate_url(url) response = self.session.get(url, timeout=10) response.raise_for_status() return response.text except 
def _filter_links(self, links, base_domain):
    """Keep the links whose registered domain is accepted for crawling.

    Args:
        links: Iterable of ``(url, anchor_text)`` pairs.
        base_domain: Domain to compare against, in ``domain.suffix`` form.

    Returns:
        list: URLs whose ``domain.suffix`` equals ``base_domain`` or, when
        ``allow_subdomains`` is set, ends with ``"." + base_domain``. Links
        without a network location are dropped.

    NOTE(review): the value compared is ``domain.suffix`` of the link, i.e.
    the subdomain part is not included. With a ``base_domain`` built by
    ``_get_base_domain`` this presumably means links on any subdomain pass
    even when ``allow_subdomains`` is False - confirm whether that is
    intended.
    """
    subdomain_suffix = "." + base_domain
    accepted = []
    for url, _anchor_text in links:
        host = urlparse(url).netloc
        if not host:
            continue

        parts = tldextract.extract(host)
        registered = f"{parts.domain}.{parts.suffix}"

        if registered == base_domain:
            accepted.append(url)
        elif self.allow_subdomains and registered.endswith(subdomain_suffix):
            accepted.append(url)
    return accepted
def fetch_file_content(self, repo_url: str, file_path: str) -> Optional[str]:
    """Fetch one file of a repository through the GitHub contents API.

    Args:
        repo_url: Repository in ``owner/name`` form.
        file_path: Path of the file inside the repository.

    Returns:
        Optional[str]: The stripped text of the file, or ``None`` when the
        file should be skipped: non-text extension, undecodable or empty
        content, or a response that carries no file content at all.
    """
    url = f"https://api.github.com/repos/{repo_url}/contents/{file_path}"
    response = self._make_request(url)
    content = response.json()

    # A list payload describes a directory, not a file.
    if not isinstance(content, dict):
        return None

    # Some responses carry no "content" string; skip them rather than
    # failing the whole load with a KeyError.
    raw = content.get("content")
    if not isinstance(raw, str):
        return None

    if content.get("encoding") == "base64":
        if not self.is_text_file(file_path):
            # Skip binary files
            return None
        try:
            decoded_content = base64.b64decode(raw).decode("utf-8").strip()
        except ValueError:
            # Covers binascii.Error and UnicodeDecodeError: if decoding
            # fails, the file is probably binary.
            return None
        # Skip empty files
        return decoded_content or None

    # Skip empty files
    return raw.strip() or None
def load_data(self, repo_url: str) -> List[Document]:
    """Load every text file of a GitHub repository as a Document.

    Args:
        repo_url: Repository URL (``https://github.com/owner/name``) or the
            bare ``owner/name`` identifier. Surrounding whitespace, trailing
            slashes and a trailing ``.git`` are tolerated.

    Returns:
        List[Document]: One document per file for which
        ``fetch_file_content`` returned text; files it skips are omitted.
    """
    repo_name = repo_url.split("github.com/")[-1].strip().strip("/")
    # Clone URLs end in ".git", which is not part of the API repository name.
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]

    files = self.fetch_repo_files(repo_name)
    documents = []
    for file_path in files:
        content = self.fetch_file_content(repo_name, file_path)
        # Skip binary or empty files (content is None)
        if content is None:
            continue
        documents.append(
            Document(
                text=content,
                doc_id=file_path,
                extra_info={
                    "title": file_path,
                    # NOTE(review): the branch name is hard-coded; for
                    # repositories whose default branch is not "main" this
                    # link presumably does not resolve - confirm.
                    "source": f"https://github.com/{repo_name}/blob/main/{file_path}",
                },
            )
        )
    return documents
class RedditPostsLoaderRemote(BaseRemote):
    """Remote loader that fetches Reddit posts via ``RedditPostsLoader``."""

    def load_data(self, inputs):
        """Load Reddit posts described by a JSON configuration string.

        Args:
            inputs: JSON text with the required keys ``client_id``,
                ``client_secret``, ``user_agent`` and ``search_queries``, and
                the optional keys ``categories`` (default ``["new", "hot"]``),
                ``mode`` (default ``"subreddit"``) and ``number_posts``
                (default ``10``).

        Returns:
            The documents returned by the underlying loader.

        Raises:
            ValueError: If ``inputs`` is not valid JSON or a required key is
                missing.
        """
        try:
            data = json.loads(inputs)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON input: {e}")

        missing_fields = [
            field
            for field in ("client_id", "client_secret", "user_agent", "search_queries")
            if field not in data
        ]
        if missing_fields:
            raise ValueError(f"Missing required fields: {', '.join(missing_fields)}")

        # The loader is kept on the instance, as callers may inspect it.
        self.loader = RedditPostsLoader(
            client_id=data.get("client_id"),
            client_secret=data.get("client_secret"),
            user_agent=data.get("user_agent"),
            categories=data.get("categories", ["new", "hot"]),
            mode=data.get("mode", "subreddit"),
            search_queries=data.get("search_queries"),
            number_posts=data.get("number_posts", 10),
        )
        documents = self.loader.load()
        print(f"Loaded {len(documents)} documents from Reddit")
        return documents
""" loaders = { "url": WebLoader, "sitemap": SitemapLoader, "crawler": CrawlerLoader, "reddit": RedditPostsLoaderRemote, "github": GitHubLoader, "s3": S3Loader, } @classmethod def create_loader(cls, type, *args, **kwargs): loader_class = cls.loaders.get(type.lower()) if not loader_class: raise ValueError(f"No loader class found for type {type}") return loader_class(*args, **kwargs) ================================================ FILE: application/parser/remote/s3_loader.py ================================================ import json import logging import os import tempfile import mimetypes from typing import List, Optional from application.parser.remote.base import BaseRemote from application.parser.schema.base import Document try: import boto3 from botocore.exceptions import ClientError, NoCredentialsError except ImportError: boto3 = None logger = logging.getLogger(__name__) class S3Loader(BaseRemote): """Load documents from an AWS S3 bucket.""" def __init__(self): if boto3 is None: raise ImportError( "boto3 is required for S3Loader. Install it with: pip install boto3" ) self.s3_client = None def _normalize_endpoint_url(self, endpoint_url: str, bucket: str) -> tuple[str, str]: """ Normalize endpoint URL for S3-compatible services. Detects common mistakes like using bucket-prefixed URLs and extracts the correct endpoint and bucket name. 
Args: endpoint_url: The provided endpoint URL bucket: The provided bucket name Returns: Tuple of (normalized_endpoint_url, bucket_name) """ import re from urllib.parse import urlparse if not endpoint_url: return endpoint_url, bucket parsed = urlparse(endpoint_url) host = parsed.netloc or parsed.path # Check for DigitalOcean Spaces bucket-prefixed URL pattern # e.g., https://mybucket.nyc3.digitaloceanspaces.com do_match = re.match(r"^([^.]+)\.([a-z0-9]+)\.digitaloceanspaces\.com$", host) if do_match: extracted_bucket = do_match.group(1) region = do_match.group(2) correct_endpoint = f"https://{region}.digitaloceanspaces.com" logger.warning( f"Detected bucket-prefixed DigitalOcean Spaces URL. " f"Extracted bucket '{extracted_bucket}' from endpoint. " f"Using endpoint: {correct_endpoint}" ) # If bucket wasn't provided or differs, use extracted one if not bucket or bucket != extracted_bucket: logger.info(f"Using extracted bucket name: '{extracted_bucket}' (was: '{bucket}')") bucket = extracted_bucket return correct_endpoint, bucket # Check for just "digitaloceanspaces.com" without region if host == "digitaloceanspaces.com": logger.error( "Invalid DigitalOcean Spaces endpoint: missing region. " "Use format: https://.digitaloceanspaces.com (e.g., https://lon1.digitaloceanspaces.com)" ) return endpoint_url, bucket def _init_client( self, aws_access_key_id: str, aws_secret_access_key: str, region_name: str = "us-east-1", endpoint_url: Optional[str] = None, bucket: Optional[str] = None, ) -> Optional[str]: """ Initialize the S3 client with credentials. 
def is_text_file(self, file_path: str) -> bool:
    """Return True when the path looks like a plain-text file.

    A case-insensitive suffix match against a fixed list of text and
    source-code extensions is tried first; otherwise the MIME type guessed
    from the path decides (any ``text*`` type, JSON or XML).
    """
    known_suffixes = (
        ".txt", ".md", ".markdown", ".rst", ".json", ".xml", ".yaml", ".yml",
        ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".c", ".cpp", ".h",
        ".hpp", ".cs", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
        ".html", ".css", ".scss", ".sass", ".less", ".sh", ".bash", ".zsh",
        ".fish", ".sql", ".r", ".m", ".mat", ".ini", ".cfg", ".conf",
        ".config", ".env", ".gitignore", ".dockerignore", ".editorconfig",
        ".log", ".csv", ".tsv",
    )
    # str.endswith accepts a tuple, which replaces the explicit loop.
    if file_path.lower().endswith(known_suffixes):
        return True

    guessed_type, _encoding = mimetypes.guess_type(file_path)
    if not guessed_type:
        return False
    return guessed_type.startswith("text") or guessed_type in (
        "application/json",
        "application/xml",
    )
def list_objects(self, bucket: str, prefix: str = "") -> List[str]:
    """
    List all objects in the bucket with the given prefix.

    Keys ending in "/" (folder placeholders) are skipped.

    Args:
        bucket: S3 bucket name
        prefix: Optional path prefix to filter objects

    Returns:
        List of object keys

    Raises:
        Exception: On any S3 client error or missing credentials. The
            original botocore error is attached as ``__cause__``.
    """
    objects = []
    paginator = self.s3_client.get_paginator("list_objects_v2")

    logger.info(f"Listing objects in bucket: '{bucket}' with prefix: '{prefix}'")
    logger.debug(f"S3 client endpoint: {self.s3_client.meta.endpoint_url}")

    try:
        page_count = 0
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            page_count += 1
            logger.debug(f"Processing page {page_count}, keys in response: {list(page.keys())}")
            if "Contents" in page:
                for obj in page["Contents"]:
                    key = obj["Key"]
                    if not key.endswith("/"):
                        objects.append(key)
                        logger.debug(f"Found object: {key}")
            else:
                logger.info(f"Page {page_count} has no 'Contents' key - bucket may be empty or prefix not found")

        logger.info(f"Found {len(objects)} objects in bucket '{bucket}'")

    except ClientError as e:
        error_info = e.response.get("Error", {})
        error_code = error_info.get("Code", "")
        error_message = error_info.get("Message", "")
        logger.error(f"ClientError listing objects - Code: {error_code}, Message: {error_message}")
        logger.error(f"Full error response: {e.response}")
        logger.error(f"Bucket: '{bucket}', Prefix: '{prefix}', Endpoint: {self.s3_client.meta.endpoint_url}")

        # Re-raise with ``from e`` so the botocore error stays reachable as
        # the explicit cause instead of an incidental context.
        if error_code == "NoSuchBucket":
            raise Exception(f"S3 bucket '{bucket}' does not exist") from e
        elif error_code == "AccessDenied":
            raise Exception(
                f"Access denied to S3 bucket '{bucket}'. Check your credentials and permissions."
            ) from e
        elif error_code == "NoSuchKey":
            # This is unusual for ListObjectsV2 - may indicate endpoint/bucket configuration issue
            logger.error(
                "NoSuchKey error on ListObjectsV2 - this may indicate the bucket name "
                "is incorrect or the endpoint URL format is wrong. "
                "For DigitalOcean Spaces, the endpoint should be like: "
                "https://<region>.digitaloceanspaces.com and bucket should be just the space name."
            )
            raise Exception(
                f"S3 error: {e}. For S3-compatible services, verify: "
                f"1) Endpoint URL format (e.g., https://nyc3.digitaloceanspaces.com), "
                f"2) Bucket name is just the space/bucket name without region prefix"
            ) from e
        else:
            raise Exception(f"S3 error: {e}") from e
    except NoCredentialsError as e:
        raise Exception(
            "AWS credentials not found. Please provide valid credentials."
        ) from e

    return objects
def load_data(self, inputs) -> List[Document]:
    """
    Fetch every readable object from an S3 (or S3-compatible) bucket.

    Args:
        inputs: A dict, or a JSON-encoded string of one, with the keys:
            - aws_access_key_id: access key ID (required)
            - aws_secret_access_key: secret access key (required)
            - bucket: bucket name (required)
            - prefix: key prefix restricting the listing (default "")
            - region: region name (default "us-east-1")
            - endpoint_url: custom endpoint for S3-compatible services

    Returns:
        One Document per object whose content could be extracted; objects
        for which get_object_content returns None are left out.

    Raises:
        ValueError: If ``inputs`` is a malformed JSON string, or a required
            field is missing or empty.
    """
    data = inputs
    if isinstance(inputs, str):
        try:
            data = json.loads(inputs)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON input: {e}")

    missing_fields = [
        name
        for name in ("aws_access_key_id", "aws_secret_access_key", "bucket")
        if not data.get(name)
    ]
    if missing_fields:
        raise ValueError(f"Missing required fields: {', '.join(missing_fields)}")

    bucket = data["bucket"]
    prefix = data.get("prefix", "")
    region = data.get("region", "us-east-1")
    endpoint_url = data.get("endpoint_url", "")

    logger.info(f"Loading data from S3 - Bucket: '{bucket}', Prefix: '{prefix}', Region: '{region}'")
    if endpoint_url:
        logger.info(f"Custom endpoint URL provided: '{endpoint_url}'")

    # _init_client may hand back a different bucket name when it had to
    # pull the bucket out of a bucket-prefixed endpoint URL.
    corrected_bucket = self._init_client(
        data["aws_access_key_id"],
        data["aws_secret_access_key"],
        region,
        endpoint_url or None,
        bucket,
    )
    if corrected_bucket and corrected_bucket != bucket:
        logger.info(f"Using corrected bucket name: '{corrected_bucket}' (original: '{bucket}')")
        bucket = corrected_bucket

    object_keys = self.list_objects(bucket, prefix)
    documents = []
    for key in object_keys:
        content = self.get_object_content(bucket, key)
        if content is not None:
            metadata = {
                "title": os.path.basename(key),
                "source": f"s3://{bucket}/{key}",
                "bucket": bucket,
                "key": key,
            }
            documents.append(Document(text=content, doc_id=key, extra_info=metadata))

    logger.info(f"Loaded {len(documents)} documents from S3 bucket '{bucket}'")
    return documents
def _extract_urls(self, sitemap_url):
    """Fetch ``sitemap_url`` and return the list of page URLs to load.

    Args:
        sitemap_url: URL of a sitemap, or of a single regular page.

    Returns:
        The URLs parsed from the sitemap when the response is recognised as
        one, ``[sitemap_url]`` when it is an ordinary page, or an empty list
        when URL validation or the HTTP request fails.
    """
    try:
        # Validate URL before fetching to prevent SSRF
        validate_url(sitemap_url)
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except SSRFError as e:
        logging.error(f"URL validation failed for sitemap: {sitemap_url}. Error: {e}")
        return []
    except requests.exceptions.RequestException as e:
        # RequestException is the common base of HTTPError, ConnectionError
        # and the timeout errors that ``timeout=30`` can produce; catching
        # only the first two let read timeouts escape to the caller.
        logging.error(f"Failed to fetch sitemap: {sitemap_url}. Error: {e}")
        return []

    # NOTE(review): only the initial URL is validated; confirm whether
    # redirects followed by requests need the same SSRF check.
    if self._is_sitemap(response):
        # It's a sitemap, so parse it and extract URLs
        return self._parse_sitemap(response.content)
    # It's not a sitemap, return the URL itself
    return [sitemap_url]
""" # TODO: consolidate fields from Document/IndexStruct into base class text: Optional[str] = None doc_id: Optional[str] = None embedding: Optional[List[float]] = None # extra fields extra_info: Optional[Dict[str, Any]] = None @classmethod @abstractmethod def get_type(cls) -> str: """Get Document type.""" def get_text(self) -> str: """Get text.""" if self.text is None: raise ValueError("text field not set.") return self.text def get_doc_id(self) -> str: """Get doc_id.""" if self.doc_id is None: raise ValueError("doc_id not set.") return self.doc_id @property def is_doc_id_none(self) -> bool: """Check if doc_id is None.""" return self.doc_id is None def get_embedding(self) -> List[float]: """Get embedding. Errors if embedding is None. """ if self.embedding is None: raise ValueError("embedding not set.") return self.embedding @property def extra_info_str(self) -> Optional[str]: """Extra info string.""" if self.extra_info is None: return None return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()]) ================================================ FILE: application/prompts/chat_combine_creative.txt ================================================ You are a helpful AI assistant, DocsGPT. You are proactive and helpful. Try to use tools, if they are available to you, be proactive and fill in missing information. Users can Upload documents for your context as attachments or sources via UI using the Conversation input box. If appropriate, your answers can include code examples, formatted as follows: ```(language) (code) ``` Users are also able to see charts and diagrams if you use them with valid mermaid syntax in your responses. Try to respond with mermaid charts if visualization helps with users queries. You effectively utilize chat history, ensuring relevant and tailored responses. Try to use additional provided context if it's available, otherwise use your knowledge and tool capabilities. Allow yourself to be very creative and use your imagination. 
---------------- Possible additional context from uploaded sources: {summaries} ================================================ FILE: application/prompts/chat_combine_default.txt ================================================ You are a helpful AI assistant, DocsGPT. You are proactive and helpful. Try to use tools, if they are available to you, be proactive and fill in missing information. Users can Upload documents for your context as attachments or sources via UI using the Conversation input box. If appropriate, your answers can include code examples, formatted as follows: ```(language) (code) ``` Users are also able to see charts and diagrams if you use them with valid mermaid syntax in your responses. Try to respond with mermaid charts if visualization helps with users queries. You effectively utilize chat history, ensuring relevant and tailored responses. Try to use additional provided context if it's available, otherwise use your knowledge and tool capabilities. ---------------- Possible additional context from uploaded sources: {summaries} ================================================ FILE: application/prompts/chat_combine_strict.txt ================================================ You are a helpful AI assistant, DocsGPT. You are proactive and helpful. Try to use tools, if they are available to you, be proactive and fill in missing information. Users can Upload documents for your context as attachments or sources via UI using the Conversation input box. If appropriate, your answers can include code examples, formatted as follows: ```(language) (code) ``` Users are also able to see charts and diagrams if you use them with valid mermaid syntax in your responses. Try to respond with mermaid charts if visualization helps with users queries. You effectively utilize chat history, ensuring relevant and tailored responses. Use context provided below or use available tools tool capabilities to answer user queries. 
If you dont have enough information from the context or tools, answer "I don't know" or "I don't have enough information". Never make up information or provide false information! Allow yourself to be very creative and use your imagination. ---------------- Context from uploaded sources: {summaries} ================================================ FILE: application/prompts/chat_reduce_prompt.txt ================================================ Use the following pieces of context to help answer the users question. If its not relevant to the question, respond with "-" ---------------- {context} ================================================ FILE: application/prompts/compression/v1.0.txt ================================================ Your task is to create a detailed summary of the conversation so far, paying close attention to the user's explicit requests and your previous actions. This summary should be thorough in capturing technical details, code patterns, and architectural decisions that would be essential for continuing work without losing context. Before providing your final summary, wrap your analysis in tags to organize your thoughts and ensure you've covered all necessary points. In your analysis process: 1. Chronologically analyze each message, tool call and section of the conversation. For each section thoroughly identify: - The user's explicit requests and intents - Your approach to addressing the user's requests - Key decisions, concepts and patterns - Specific details like if applicable: - file names - full code snippets - function signatures - file edits - Errors that you ran into and how you fixed them - Pay special attention to specific user feedback that you received, especially if the user told you to do something differently. 2. Double-check for accuracy and completeness, addressing each required element thoroughly. Your summary should include the following sections: 1. 
Primary Request and Intent: Capture all of the user's explicit requests and intents in detail 2. Key Concepts: List all important concepts discussed. 3. Files and Code Sections: Enumerate specific files and code sections examined, modified, or created. Pay special attention to the most recent messages and include full code snippets where applicable and include a summary of why this file read or edit is important. 4. Errors and fixes: List all errors that you ran into, and how you fixed them. Pay special attention to specific user feedback that you received, especially if the user told you to do something differently. 5. Problem Solving: Document problems solved and any ongoing troubleshooting efforts. 6. All user messages: List ALL user messages that are not tool results. These are critical for understanding the users' feedback and changing intent. 7. Tool Calls: List ALL tool calls made, including their inputs relevant parts of the outputs. 8. Pending Tasks: Outline any pending tasks that you have explicitly been asked to work on. 9. Current Work: Describe in detail precisely what was being worked on immediately before this summary request, paying special attention to the most recent messages from both user and assistant. Include file names and code snippets where applicable. 10. Optional Next Step: List the next step that you will take that is related to the most recent work you were doing. IMPORTANT: ensure that this step is DIRECTLY in line with the user's most recent explicit requests, and the task you were working on immediately before this summary request. If your last task was concluded, then only list next steps if they are explicitly in line with the users request. Do not start on tangential requests or really old requests that were already completed without confirming with the user first. If there is a next step, include direct quotes from the most recent conversation showing exactly what task you were working on and where you left off. 
This should be verbatim to ensure there's no drift in task interpretation. Please provide your summary based on the conversation and tools used so far, following this structure and ensuring precision and thoroughness in your response. ================================================ FILE: application/prompts/react_final_prompt.txt ================================================ Query: {query} Observations: {observations} Now, using the insights from the observations, formulate a well-structured and precise final answer. ================================================ FILE: application/prompts/react_planning_prompt.txt ================================================ You are an AI assistant and talk like you're thinking out loud. Given the following query, outline a concise thought process that includes key steps and considerations necessary for effective analysis and response. Avoid pointwise formatting. The goal is to break down the query into manageable components without excessive detail, focusing on clarity and logical progression. Include the following elements in your thought and execution process: 1. Identify the main objective of the query. 2. Determine any relevant context or background information needed to understand the query. 3. List potential approaches or methods to address the query. 4. Highlight any critical factors or constraints that may influence the outcome. 5. Plan with available tools to help you with the analysis but don't execute them. Tools will be executed by another AI.
Query: {query} Summaries: {summaries} Prompt: {prompt} Observations(potentially previous tool calls): {observations} ================================================ FILE: application/requirements.txt ================================================ anthropic==0.75.0 boto3==1.42.17 beautifulsoup4==4.14.3 cel-python==0.5.0 celery==5.6.0 cryptography==46.0.3 dataclasses-json==0.6.7 defusedxml==0.7.1 docling>=2.16.0 rapidocr>=1.4.0 onnxruntime>=1.19.0 docx2txt==0.9 ddgs>=8.0.0 ebooklib==0.20 escodegen==1.0.11 esprima==4.0.1 esutils==1.0.1 elevenlabs==2.27.0 Flask==3.1.2 faiss-cpu==1.13.2 fastmcp==2.14.1 flask-restx==1.3.2 google-genai==1.54.0 google-api-python-client==2.187.0 google-auth-httplib2==0.3.0 google-auth-oauthlib==1.2.3 gTTS==2.5.4 gunicorn==23.0.0 html2text==2025.4.15 javalang==0.13.0 jinja2==3.1.6 jiter==0.12.0 jmespath==1.0.1 joblib==1.5.3 jsonpatch==1.33 jsonpointer==3.0.0 kombu==5.6.1 langchain==1.2.0 langchain-community==0.4.1 langchain-core==1.2.5 langchain-openai==1.1.6 langchain-text-splitters==1.1.0 langsmith==0.5.1 lazy-object-proxy==1.12.0 lxml==6.0.2 markupsafe==3.0.3 marshmallow>=3.18.0,<5.0.0 mpmath==1.3.0 multidict==6.7.0 msal==1.34.0 mypy-extensions==1.1.0 networkx==3.6.1 numpy==2.4.0 openai==2.14.0 openapi3-parser==1.1.22 orjson==3.11.5 packaging==24.2 pandas==2.3.3 openpyxl==3.1.5 pathable==0.4.4 pdf2image>=1.17.0 pillow portalocker>=2.7.0,<3.0.0 prance==25.4.8.0 prompt-toolkit==3.0.52 protobuf==6.33.2 psycopg2-binary==2.9.11 py==1.11.0 pydantic pydantic-core pydantic-settings pymongo==4.15.5 pypdf==6.5.0 python-dateutil==2.9.0.post0 python-dotenv python-jose==3.5.0 python-pptx==1.0.2 redis==7.1.0 referencing>=0.28.0,<0.38.0 regex==2025.11.3 requests==2.32.5 retry==0.9.2 sentence-transformers==5.2.0 tiktoken==0.12.0 tokenizers==0.22.1 torch==2.9.1 tqdm==4.67.1 transformers==4.57.3 typing-extensions==4.15.0 typing-inspect==0.9.0 tzdata==2025.3 urllib3==2.6.3 vine==5.1.0 wcwidth==0.2.14 werkzeug>=3.1.0 yarl==1.22.0 markdownify==1.2.2 
tldextract==5.3.0 websockets==15.0.1 ================================================ FILE: application/retriever/__init__.py ================================================ ================================================ FILE: application/retriever/base.py ================================================ from abc import ABC, abstractmethod class BaseRetriever(ABC): def __init__(self): pass @abstractmethod def search(self, *args, **kwargs): pass ================================================ FILE: application/retriever/classic_rag.py ================================================ import logging import os from application.core.settings import settings from application.llm.llm_creator import LLMCreator from application.retriever.base import BaseRetriever from application.utils import num_tokens_from_string from application.vectorstore.vector_creator import VectorCreator class ClassicRAG(BaseRetriever): def __init__( self, source, chat_history=None, prompt="", chunks=2, doc_token_limit=50000, model_id="docsgpt-local", user_api_key=None, agent_id=None, llm_name=settings.LLM_PROVIDER, api_key=settings.API_KEY, decoded_token=None, ): self.original_question = source.get("question", "") self.chat_history = chat_history if chat_history is not None else [] self.prompt = prompt if isinstance(chunks, str): try: self.chunks = int(chunks) except ValueError: logging.warning( f"Invalid chunks value '{chunks}', using default value 2" ) self.chunks = 2 else: self.chunks = chunks user_id = decoded_token.get("sub") if decoded_token else "default" logging.info( f"ClassicRAG initialized with chunks={self.chunks}, user_id={user_id}, " f"sources={'active_docs' in source and source['active_docs'] is not None}" ) self.model_id = model_id self.doc_token_limit = doc_token_limit self.user_api_key = user_api_key self.agent_id = agent_id self.llm_name = llm_name self.api_key = api_key self.llm = LLMCreator.create_llm( self.llm_name, api_key=self.api_key, user_api_key=self.user_api_key, 
def _rephrase_query(self):
    """Rephrase user query with chat history context for better retrieval.

    Returns:
        The LLM-rewritten standalone query. The original question is
        returned instead when there is no question, no chat history,
        ``chunks`` is 0, no vectorstores are configured, the LLM returns an
        empty result, or the LLM call raises.
    """
    # Nothing to rewrite, or retrieval is disabled anyway: skip the LLM call.
    if (
        not self.original_question
        or not self.chat_history
        or self.chunks == 0
        or not self.vectorstores
    ):
        return self.original_question

    prompt = (
        "Given the following conversation history:\n"
        f"{self.chat_history}\n\n"
        "Rephrase the following user question to be a standalone search query "
        "that captures all relevant context from the conversation:\n"
    )
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": self.original_question},
    ]

    try:
        rephrased_query = self.llm.gen(model=self.model_id, messages=messages)
        # Logged at debug level rather than printed, so user queries do not
        # end up on stdout.
        logging.debug(f"Rephrased query: {rephrased_query}")
        return rephrased_query if rephrased_query else self.original_question
    except Exception as e:
        logging.error(f"Error rephrasing query: {e}", exc_info=True)
        return self.original_question
chunks_per_source = max(1, self.chunks // len(self.vectorstores)) token_budget = max(int(self.doc_token_limit * 0.9), 100) cumulative_tokens = 0 for vectorstore_id in self.vectorstores: if vectorstore_id: try: docsearch = VectorCreator.create_vectorstore( settings.VECTOR_STORE, vectorstore_id, settings.EMBEDDINGS_KEY ) docs_temp = docsearch.search( self.question, k=max(chunks_per_source * 2, 20) ) for doc in docs_temp: if cumulative_tokens >= token_budget: break if hasattr(doc, "page_content") and hasattr(doc, "metadata"): page_content = doc.page_content metadata = doc.metadata else: page_content = doc.get("text", doc.get("page_content", "")) metadata = doc.get("metadata", {}) title = metadata.get( "title", metadata.get("post_title", page_content) ) if not isinstance(title, str): title = str(title) title = title.split("/")[-1] filename = ( metadata.get("filename") or metadata.get("file_name") or metadata.get("source") ) if isinstance(filename, str): filename = os.path.basename(filename) or filename else: filename = title if not filename: filename = title source_path = metadata.get("source") or vectorstore_id doc_text_with_header = f"{filename}\n{page_content}" doc_tokens = num_tokens_from_string(doc_text_with_header) if cumulative_tokens + doc_tokens < token_budget: all_docs.append( { "title": title, "text": page_content, "source": source_path, "filename": filename, } ) cumulative_tokens += doc_tokens if cumulative_tokens >= token_budget: break except Exception as e: logging.error( f"Error searching vectorstore {vectorstore_id}: {e}", exc_info=True, ) continue logging.info( f"ClassicRAG._get_data: Retrieval complete - retrieved {len(all_docs)} documents " f"(requested chunks={self.chunks}, chunks_per_source={chunks_per_source}, " f"cumulative_tokens={cumulative_tokens}/{token_budget})" ) return all_docs def search(self, query: str = ""): """Search for documents using optional query override""" if query: self.original_question = query self.question = 
class RetrieverCreator:
    """Factory that maps a retriever type name to its implementation class."""

    # Both names currently resolve to the same implementation.
    retrievers = {
        "classic": ClassicRAG,
        "default": ClassicRAG,
    }

    @classmethod
    def create_retriever(cls, type, *args, **kwargs):
        """Instantiate the retriever registered under ``type``.

        Args:
            type: Retriever type name, matched case-insensitively. A falsy
                value (None, "") selects "default".
            *args: Positional arguments forwarded to the retriever class.
            **kwargs: Keyword arguments forwarded to the retriever class.

        Returns:
            A new instance of the selected retriever class.

        Raises:
            ValueError: If no retriever is registered under ``type``.
        """
        retriever_type = (type or "default").lower()
        retriever_class = cls.retrievers.get(retriever_type)
        if not retriever_class:
            raise ValueError(f"No retriever class found for type {type}")
        return retriever_class(*args, **kwargs)
def _unpad_data(data: bytes) -> bytes:
    """Remove the padding added by ``_pad_data`` and return the payload.

    The last byte gives the padding length (1..16) and every padding byte
    must carry that same value.

    Args:
        data: Padded plaintext.

    Returns:
        ``data`` without its trailing padding bytes.

    Raises:
        ValueError: If ``data`` is empty or its padding is malformed.
    """
    block_size = 16
    if not data:
        raise ValueError("Cannot unpad empty data")

    padding_len = data[-1]
    # A length of 0 would make ``data[:-0]`` return b"" silently; a length
    # beyond the block size or the data itself can never be valid padding.
    if padding_len < 1 or padding_len > min(block_size, len(data)):
        raise ValueError("Invalid padding length")
    if data[-padding_len:] != bytes([padding_len]) * padding_len:
        raise ValueError("Invalid padding bytes")
    return data[:-padding_len]
file contains template agents that will be seeded into the database agents: # Basic Agent Template - name: "Agent Name" # Required: Unique name for the agent description: "What this agent does" # Required: Brief description of the agent's purpose image: "URL_TO_IMAGE" # Optional: URL to agent's avatar/image agent_type: "classic" # Required: Type of agent (e.g., classic, react, etc.) prompt_id: "default" # Optional: Reference to prompt template prompt: # Optional: Define new prompt name: "New Prompt" content: "You are new agent with cool new prompt." chunks: "0" # Optional: Chunking strategy for documents retriever: "" # Optional: Retriever type for document search # Source Configuration (where the agent gets its knowledge) source: # Optional: Select a source to link with agent name: "Source Display Name" # Human-readable name for the source url: "https://example.com/data-source" # URL or path to knowledge source loader: "url" # Type of loader (url, pdf, txt, etc.) # Tools Configuration (what capabilities the agent has) tools: # Optional: Remove if agent doesn't need tools - name: "tool_name" # Must match a supported tool name display_name: "Tool Display Name" # Optional: Human-readable name for the tool config: # Tool-specific configuration # Example for DuckDuckGo: # token: "${DDG_API_KEY}" # ${} denotes environment variable # Add more tools as needed # - name: "another_tool" # config: # param1: "value1" # param2: "${ENV_VAR}" ================================================ FILE: application/seed/config/premade_agents.yaml ================================================ # Configuration for Premade Agents agents: - name: "Assistant" description: "Your general-purpose AI assistant. Ready to help with a wide range of tasks." 
image: "https://d3dg1063dc54p9.cloudfront.net/imgs/agents/agent-logo.svg" agent_type: "classic" prompt_id: "default" chunks: "0" retriever: "" # Tools Configuration tools: - name: "tool_name" display_name: "read_webpage" config: - name: "Researcher" description: "A specialized research agent that performs deep dives into subjects." image: "https://d3dg1063dc54p9.cloudfront.net/imgs/agents/agent-researcher.svg" agent_type: "react" prompt: name: "Researcher-Agent" content: | You are a specialized AI research assistant, DocsGPT. Your primary function is to conduct in-depth research on a given subject or question. You are methodical, thorough, and analytical. You should perform multiple iterations of thinking to gather and synthesize information before providing a final, comprehensive answer. You have access to the 'Read Webpage' tool. Use this tool to explore sources, gather data, and deepen your understanding. Be proactive in using the tool to fill in knowledge gaps and validate information. Users can Upload documents for your context as attachments or sources via UI using the Conversation input box. If appropriate, your answers can include code examples, formatted as follows: ```(language) (code) ``` Users are also able to see charts and diagrams if you use them with valid mermaid syntax in your responses. Try to respond with mermaid charts if visualization helps with users queries. You effectively utilize chat history, ensuring relevant and tailored responses. Try to use additional provided context if it's available, otherwise use your knowledge and tool capabilities. ---------------- Possible additional context from uploaded sources: {summaries} chunks: "0" retriever: "" # Tools Configuration tools: - name: "tool_name" display_name: "read_webpage" config: - name: "Search Widget" description: "A powerful search widget agent. 
Ask it anything about DocsGPT" image: "https://d3dg1063dc54p9.cloudfront.net/imgs/agents/agent-search.svg" agent_type: "classic" prompt: name: "Search-Agent" content: | You are a website search assistant, DocsGPT. Your sole purpose is to help users find information within the provided context of the DocsGPT documentation. Act as a specialized search engine. Your answers must be based *only* on the provided context. Do not use any external knowledge. If the answer is not in the context, inform the user that you could not find the information within the documentation. Keep your responses concise and directly related to the user's query, pointing them to the most relevant information. ---------------- Possible additional context from uploaded sources: {summaries} chunks: "8" retriever: "" source: name: "DocsGPT-Docs" url: "https://d3dg1063dc54p9.cloudfront.net/agent-source/docsgpt-documentation.md" # URL to DocsGPT documentation loader: "url" - name: "Support Widget" description: "A friendly support widget agent to help you with any questions." image: "https://d3dg1063dc54p9.cloudfront.net/imgs/agents/agent-support.svg" agent_type: "classic" prompt: name: "Support-Agent" content: | You are a helpful AI support widget agent, DocsGPT. Your goal is to assist users by answering their questions about our website, product and its features. Provide friendly, clear, and direct support. Your knowledge is strictly limited to the provided context from the DocsGPT documentation. You must not answer questions outside of this scope. If a user asks something you cannot answer from the context, politely state that you can only help with questions about this website. Effectively utilize chat history to understand the user's issue fully. Guide users to the information they need in a helpful and conversational manner. 
---------------- Possible additional context from uploaded sources: {summaries} chunks: "8" retriever: "" source: name: "DocsGPT-Docs" url: "https://d3dg1063dc54p9.cloudfront.net/agent-source/docsgpt-documentation.md" # URL to DocsGPT documentation loader: "url" ================================================ FILE: application/seed/seeder.py ================================================ import logging import os from datetime import datetime, timezone from typing import Dict, List, Optional, Union import yaml from bson import ObjectId from bson.dbref import DBRef from dotenv import load_dotenv from pymongo import MongoClient from application.agents.tools.tool_manager import ToolManager from application.api.user.tasks import ingest_remote load_dotenv() tool_config = {} tool_manager = ToolManager(config=tool_config) class DatabaseSeeder: def __init__(self, db): self.db = db self.tools_collection = self.db["user_tools"] self.sources_collection = self.db["sources"] self.agents_collection = self.db["agents"] self.prompts_collection = self.db["prompts"] self.system_user_id = "system" self.logger = logging.getLogger(__name__) def seed_initial_data(self, config_path: str = None, force=False): """Main entry point for seeding all initial data""" if not force and self._is_already_seeded(): self.logger.info("Database already seeded. 
Use force=True to reseed.") return config_path = config_path or os.path.join( os.path.dirname(__file__), "config", "premade_agents.yaml" ) try: with open(config_path, "r") as f: config = yaml.safe_load(f) self._seed_from_config(config) except Exception as e: self.logger.error(f"Failed to load seeding config: {str(e)}") raise def _seed_from_config(self, config: Dict): """Seed all data from configuration""" self.logger.info("🌱 Starting seeding...") if not config.get("agents"): self.logger.warning("No agents found in config") return used_tool_ids = set() for agent_config in config["agents"]: try: self.logger.info(f"Processing agent: {agent_config['name']}") # 1. Handle Source source_result = self._handle_source(agent_config) if source_result is False: self.logger.error( f"Skipping agent {agent_config['name']} due to source ingestion failure" ) continue source_id = source_result # 2. Handle Tools tool_ids = self._handle_tools(agent_config) if len(tool_ids) == 0: self.logger.warning( f"No valid tools for agent {agent_config['name']}" ) used_tool_ids.update(tool_ids) # 3. Handle Prompt prompt_id = self._handle_prompt(agent_config) # 4. 
Create Agent agent_data = { "user": self.system_user_id, "name": agent_config["name"], "description": agent_config["description"], "image": agent_config.get("image", ""), "source": ( DBRef("sources", ObjectId(source_id)) if source_id else "" ), "tools": [str(tid) for tid in tool_ids], "agent_type": agent_config["agent_type"], "prompt_id": prompt_id or agent_config.get("prompt_id", "default"), "chunks": agent_config.get("chunks", "0"), "retriever": agent_config.get("retriever", ""), "status": "template", "createdAt": datetime.now(timezone.utc), "updatedAt": datetime.now(timezone.utc), } existing = self.agents_collection.find_one( {"user": self.system_user_id, "name": agent_config["name"]} ) if existing: self.logger.info(f"Updating existing agent: {agent_config['name']}") self.agents_collection.update_one( {"_id": existing["_id"]}, {"$set": agent_data} ) agent_id = existing["_id"] else: self.logger.info(f"Creating new agent: {agent_config['name']}") result = self.agents_collection.insert_one(agent_data) agent_id = result.inserted_id self.logger.info( f"Successfully processed agent: {agent_config['name']} (ID: {agent_id})" ) except Exception as e: self.logger.error( f"Error processing agent {agent_config['name']}: {str(e)}" ) continue self.logger.info("✅ Database seeding completed") def _handle_source(self, agent_config: Dict) -> Union[ObjectId, None, bool]: """Handle source ingestion and return source ID""" if not agent_config.get("source"): self.logger.info( "No source provided for agent - will create agent without source" ) return None source_config = agent_config["source"] self.logger.info(f"Ingesting source: {source_config['url']}") try: existing = self.sources_collection.find_one( {"user": self.system_user_id, "remote_data": source_config["url"]} ) if existing: self.logger.info(f"Source already exists: {existing['_id']}") return existing["_id"] # Ingest new source using worker task = ingest_remote.delay( source_data=source_config["url"], 
job_name=source_config["name"], user=self.system_user_id, loader=source_config.get("loader", "url"), ) result = task.get(timeout=300) if not task.successful(): raise Exception(f"Source ingestion failed: {result}") source_id = None if isinstance(result, dict) and "id" in result: source_id = result["id"] else: raise Exception(f"Source ingestion result missing 'id': {result}") self.logger.info(f"Source ingested successfully: {source_id}") return source_id except Exception as e: self.logger.error(f"Failed to ingest source: {str(e)}") return False def _handle_tools(self, agent_config: Dict) -> List[ObjectId]: """Handle tool creation and return list of tool IDs""" tool_ids = [] if not agent_config.get("tools"): return tool_ids for tool_config in agent_config["tools"]: try: tool_name = tool_config["name"] processed_config = self._process_config(tool_config.get("config", {})) self.logger.info(f"Processing tool: {tool_name}") existing = self.tools_collection.find_one( { "user": self.system_user_id, "name": tool_name, "config": processed_config, } ) if existing: self.logger.info(f"Tool already exists: {existing['_id']}") tool_ids.append(existing["_id"]) continue tool_data = { "user": self.system_user_id, "name": tool_name, "displayName": tool_config.get("display_name", tool_name), "description": tool_config.get("description", ""), "actions": tool_manager.tools[tool_name].get_actions_metadata(), "config": processed_config, "status": True, } result = self.tools_collection.insert_one(tool_data) tool_ids.append(result.inserted_id) self.logger.info(f"Created new tool: {result.inserted_id}") except Exception as e: self.logger.error(f"Failed to process tool {tool_name}: {str(e)}") continue return tool_ids def _handle_prompt(self, agent_config: Dict) -> Optional[str]: """Handle prompt creation and return prompt ID""" if not agent_config.get("prompt"): return None prompt_config = agent_config["prompt"] prompt_name = prompt_config.get("name", f"{agent_config['name']} Prompt") 
prompt_content = prompt_config.get("content", "") if not prompt_content: self.logger.warning( f"No prompt content provided for agent {agent_config['name']}" ) return None self.logger.info(f"Processing prompt: {prompt_name}") try: existing = self.prompts_collection.find_one( { "user": self.system_user_id, "name": prompt_name, "content": prompt_content, } ) if existing: self.logger.info(f"Prompt already exists: {existing['_id']}") return str(existing["_id"]) prompt_data = { "name": prompt_name, "content": prompt_content, "user": self.system_user_id, } result = self.prompts_collection.insert_one(prompt_data) prompt_id = str(result.inserted_id) self.logger.info(f"Created new prompt: {prompt_id}") return prompt_id except Exception as e: self.logger.error(f"Failed to process prompt {prompt_name}: {str(e)}") return None def _process_config(self, config: Dict) -> Dict: """Process config values to replace environment variables""" processed = {} for key, value in config.items(): if ( isinstance(value, str) and value.startswith("${") and value.endswith("}") ): env_var = value[2:-1] processed[key] = os.getenv(env_var, "") else: processed[key] = value return processed def _is_already_seeded(self) -> bool: """Check if premade agents already exist""" return self.agents_collection.count_documents({"user": self.system_user_id}) > 0 @classmethod def initialize_from_env(cls, worker=None): """Factory method to create seeder from environment""" mongo_uri = os.getenv("MONGO_URI", "mongodb://localhost:27017") db_name = os.getenv("MONGO_DB_NAME", "docsgpt") client = MongoClient(mongo_uri) db = client[db_name] return cls(db) ================================================ FILE: application/storage/__init__.py ================================================ ================================================ FILE: application/storage/base.py ================================================ """Base storage class for file system abstraction.""" from abc import ABC, abstractmethod from typing 
import BinaryIO, List, Callable class BaseStorage(ABC): """Abstract base class for storage implementations.""" @abstractmethod def save_file(self, file_data: BinaryIO, path: str, **kwargs) -> dict: """ Save a file to storage. Args: file_data: File-like object containing the data path: Path where the file should be stored Returns: dict: A dictionary containing metadata about the saved file, including: - 'path': The path where the file was saved - 'storage_type': The type of storage (e.g., 'local', 's3') - Other storage-specific metadata (e.g., 'uri', 'bucket_name', etc.) """ pass @abstractmethod def get_file(self, path: str) -> BinaryIO: """ Retrieve a file from storage. Args: path: Path to the file Returns: BinaryIO: File-like object containing the file data """ pass @abstractmethod def process_file(self, path: str, processor_func: Callable, **kwargs): """ Process a file using the provided processor function. This method handles the details of retrieving the file and providing it to the processor function in an appropriate way based on the storage type. Args: path: Path to the file processor_func: Function that processes the file **kwargs: Additional arguments to pass to the processor function Returns: The result of the processor function """ pass @abstractmethod def delete_file(self, path: str) -> bool: """ Delete a file from storage. Args: path: Path to the file Returns: bool: True if deletion was successful """ pass @abstractmethod def file_exists(self, path: str) -> bool: """ Check if a file exists. Args: path: Path to the file Returns: bool: True if the file exists """ pass @abstractmethod def list_files(self, directory: str) -> List[str]: """ List all files in a directory. Args: directory: Directory path to list Returns: List[str]: List of file paths """ pass @abstractmethod def is_directory(self, path: str) -> bool: """ Check if a path is a directory. 
Args: path: Path to check Returns: bool: True if the path is a directory """ pass @abstractmethod def remove_directory(self, directory: str) -> bool: """ Remove a directory and all its contents. For local storage, this removes the directory and all files/subdirectories within it. For S3 storage, this removes all objects with the directory path as a prefix. Args: directory: Directory path to remove Returns: bool: True if removal was successful, False otherwise """ pass ================================================ FILE: application/storage/local.py ================================================ """Local file system implementation.""" import os import shutil from typing import BinaryIO, List, Callable from application.storage.base import BaseStorage class LocalStorage(BaseStorage): """Local file system storage implementation.""" def __init__(self, base_dir: str = None): """ Initialize local storage. Args: base_dir: Base directory for all operations. If None, uses current directory. """ self.base_dir = base_dir or os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) def _get_full_path(self, path: str) -> str: """Get absolute path by combining base_dir and path.""" if os.path.isabs(path): return path return os.path.join(self.base_dir, path) def save_file(self, file_data: BinaryIO, path: str, **kwargs) -> dict: """Save a file to local storage.""" full_path = self._get_full_path(path) os.makedirs(os.path.dirname(full_path), exist_ok=True) if hasattr(file_data, 'save'): file_data.save(full_path) else: with open(full_path, 'wb') as f: shutil.copyfileobj(file_data, f) return { 'storage_type': 'local' } def get_file(self, path: str) -> BinaryIO: """Get a file from local storage.""" full_path = self._get_full_path(path) if not os.path.exists(full_path): raise FileNotFoundError(f"File not found: {full_path}") return open(full_path, 'rb') def delete_file(self, path: str) -> bool: """Delete a file from local storage.""" full_path = 
def list_files(self, directory: str) -> List[str]:
    """
    Recursively list the files below ``directory`` in local storage.

    Args:
        directory: Directory to walk, resolved through ``_get_full_path``.

    Returns:
        List[str]: File paths relative to ``base_dir``; an empty list when the
        directory does not exist.
    """
    walk_root = self._get_full_path(directory)
    if not os.path.exists(walk_root):
        return []
    return [
        os.path.relpath(os.path.join(current_dir, file_name), self.base_dir)
        for current_dir, _, file_names in os.walk(walk_root)
        for file_name in file_names
    ]
Args: directory: Directory path to remove Returns: bool: True if removal was successful, False otherwise """ full_path = self._get_full_path(directory) if not os.path.exists(full_path): return False if not os.path.isdir(full_path): return False try: shutil.rmtree(full_path) return True except (OSError, PermissionError): return False ================================================ FILE: application/storage/s3.py ================================================ """S3 storage implementation.""" import io import os from typing import BinaryIO, Callable, List import boto3 from application.core.settings import settings from application.storage.base import BaseStorage from botocore.exceptions import ClientError class S3Storage(BaseStorage): """AWS S3 storage implementation.""" def __init__(self, bucket_name=None): """ Initialize S3 storage. Args: bucket_name: S3 bucket name (optional, defaults to settings) """ self.bucket_name = bucket_name or getattr( settings, "S3_BUCKET_NAME", "docsgpt-test-bucket" ) # Get credentials from settings aws_access_key_id = getattr(settings, "SAGEMAKER_ACCESS_KEY", None) aws_secret_access_key = getattr(settings, "SAGEMAKER_SECRET_KEY", None) region_name = getattr(settings, "SAGEMAKER_REGION", None) self.s3 = boto3.client( "s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=region_name, ) def save_file( self, file_data: BinaryIO, path: str, storage_class: str = "INTELLIGENT_TIERING", **kwargs, ) -> dict: """Save a file to S3 storage.""" self.s3.upload_fileobj( file_data, self.bucket_name, path, ExtraArgs={"StorageClass": storage_class} ) region = getattr(settings, "SAGEMAKER_REGION", None) return { "storage_type": "s3", "bucket_name": self.bucket_name, "uri": f"s3://{self.bucket_name}/{path}", "region": region, } def get_file(self, path: str) -> BinaryIO: """Get a file from S3 storage.""" if not self.file_exists(path): raise FileNotFoundError(f"File not found: {path}") file_obj = io.BytesIO() 
def list_files(self, directory: str) -> List[str]:
    """
    List the keys of all objects stored under a directory prefix in S3.

    Args:
        directory: Prefix to list. A trailing "/" is added when a non-empty
            value lacks one; an empty value is used as an empty prefix.

    Returns:
        List[str]: Object keys, in the order the paginator yields them.
    """
    needs_slash = bool(directory) and not directory.endswith("/")
    prefix = directory + "/" if needs_slash else directory

    listing = self.s3.get_paginator("list_objects_v2").paginate(
        Bucket=self.bucket_name, Prefix=prefix
    )
    # Pages without a "Contents" entry contribute nothing
    return [
        entry["Key"]
        for page in listing
        if "Contents" in page
        for entry in page["Contents"]
    ]
def remove_directory(self, directory: str) -> bool:
    """
    Delete every object whose key starts with the directory prefix.

    Keys are gathered from the paginated listing first, then removed with
    ``delete_objects`` in batches of 1000.

    Args:
        directory: Directory prefix to remove; a trailing "/" is added when a
            non-empty value lacks one.

    Returns:
        bool: True when all batches were deleted without reported errors.
        False when nothing matched the prefix, when a batch response carries
        a non-empty "Errors" entry, or when a ClientError is raised.
    """
    prefix = directory
    if prefix and not prefix.endswith('/'):
        prefix = prefix + '/'

    try:
        listing = self.s3.get_paginator('list_objects_v2').paginate(
            Bucket=self.bucket_name, Prefix=prefix
        )
        doomed = [
            {'Key': entry['Key']}
            for page in listing
            if 'Contents' in page
            for entry in page['Contents']
        ]
        if not doomed:
            return False

        chunk = 1000
        for start in range(0, len(doomed), chunk):
            outcome = self.s3.delete_objects(
                Bucket=self.bucket_name,
                Delete={'Objects': doomed[start:start + chunk]},
            )
            # Stop at the first batch that reports per-object failures
            if 'Errors' in outcome and outcome['Errors']:
                return False
        return True
    except ClientError:
        return False
LocalStorage, "s3": S3Storage, } _instance = None @classmethod def get_storage(cls) -> BaseStorage: if cls._instance is None: storage_type = getattr(settings, "STORAGE_TYPE", "local") cls._instance = cls.create_storage(storage_type) return cls._instance @classmethod def create_storage(cls, type_name: str, *args, **kwargs) -> BaseStorage: storage_class = cls.storages.get(type_name.lower()) if not storage_class: raise ValueError(f"No storage implementation found for type {type_name}") return storage_class(*args, **kwargs) ================================================ FILE: application/stt/__init__.py ================================================ """Speech-to-text providers.""" ================================================ FILE: application/stt/base.py ================================================ from abc import ABC, abstractmethod from pathlib import Path from typing import Any, Dict, Optional class BaseSTT(ABC): @abstractmethod def transcribe( self, file_path: Path, language: Optional[str] = None, timestamps: bool = False, diarize: bool = False, ) -> Dict[str, Any]: pass ================================================ FILE: application/stt/constants.py ================================================ SUPPORTED_AUDIO_EXTENSIONS = (".wav", ".mp3", ".m4a", ".ogg", ".webm") SUPPORTED_AUDIO_MIME_TYPES = { "application/ogg", "audio/aac", "audio/mp3", "audio/mp4", "audio/mpeg", "audio/ogg", "audio/wav", "audio/webm", "audio/x-m4a", "audio/x-wav", "video/webm", } ================================================ FILE: application/stt/faster_whisper_stt.py ================================================ from pathlib import Path from typing import Dict, Optional from application.stt.base import BaseSTT class FasterWhisperSTT(BaseSTT): def __init__( self, model_size: str = "base", device: str = "auto", compute_type: str = "int8", ): self.model_size = model_size self.device = device self.compute_type = compute_type self._model = None def _get_model(self): if 
self._model is None: try: from faster_whisper import WhisperModel except ImportError as exc: raise ImportError( "faster-whisper is required to use the faster_whisper STT provider." ) from exc self._model = WhisperModel( self.model_size, device=self.device, compute_type=self.compute_type, ) return self._model def transcribe( self, file_path: Path, language: Optional[str] = None, timestamps: bool = False, diarize: bool = False, ) -> Dict[str, object]: _ = diarize model = self._get_model() segments_iter, info = model.transcribe( str(file_path), language=language, word_timestamps=timestamps, ) segments = [] text_parts = [] for segment in segments_iter: segment_text = getattr(segment, "text", "").strip() if segment_text: text_parts.append(segment_text) segments.append( { "start": getattr(segment, "start", None), "end": getattr(segment, "end", None), "text": segment_text, } ) return { "text": " ".join(text_parts).strip(), "language": getattr(info, "language", language), "duration_s": getattr(info, "duration", None), "segments": segments if timestamps else [], "provider": "faster_whisper", } ================================================ FILE: application/stt/live_session.py ================================================ import json import re import uuid from typing import Dict, Optional LIVE_STT_SESSION_PREFIX = "stt_live_session:" LIVE_STT_SESSION_TTL_SECONDS = 15 * 60 LIVE_STT_MUTABLE_TAIL_WORDS = 8 LIVE_STT_SILENCE_MUTABLE_TAIL_WORDS = 2 LIVE_STT_MIN_COMMITTED_OVERLAP_WORDS = 2 def normalize_transcript_text(text: str) -> str: return " ".join((text or "").split()).strip() def join_transcript_parts(*parts: str) -> str: return " ".join(part for part in map(normalize_transcript_text, parts) if part) def _normalize_word(word: str) -> str: normalized = re.sub(r"[^\w]+", "", word.casefold(), flags=re.UNICODE) return normalized or word.casefold() def _split_words(text: str) -> list[str]: normalized = normalize_transcript_text(text) return normalized.split() if normalized 
def _find_suffix_prefix_overlap(
    left_words: list[str], right_words: list[str], min_overlap: int
) -> int:
    """
    Find the longest run of words that ends ``left_words`` and starts ``right_words``.

    Words are compared through their ``_normalize_word`` keys. Candidate sizes
    are tried from the largest possible down to ``min_overlap``.

    Returns:
        int: Size of the longest matching overlap, or 0 when the shorter list
        has fewer than ``min_overlap`` words or no candidate size matches.
    """
    longest_possible = min(len(left_words), len(right_words))
    if longest_possible < min_overlap:
        return 0

    left_keys = list(map(_normalize_word, left_words))
    right_keys = list(map(_normalize_word, right_words))

    candidate_sizes = range(longest_possible, min_overlap - 1, -1)
    return next(
        (
            size
            for size in candidate_sizes
            if left_keys[-size:] == right_keys[:size]
        ),
        0,
    )
else LIVE_STT_MUTABLE_TAIL_WORDS ) max_committable_by_tail = max(0, len(current_words) - mutable_tail_words) return min(stable_prefix_length, max_committable_by_tail) def create_live_stt_session( user: str, language: Optional[str] = None ) -> Dict[str, object]: return { "session_id": str(uuid.uuid4()), "user": user, "language": language, "committed_text": "", "mutable_text": "", "previous_hypothesis": "", "latest_hypothesis": "", "last_chunk_index": -1, } def get_live_stt_session_key(session_id: str) -> str: return f"{LIVE_STT_SESSION_PREFIX}{session_id}" def save_live_stt_session(redis_client, session_state: Dict[str, object]) -> None: redis_client.setex( get_live_stt_session_key(str(session_state["session_id"])), LIVE_STT_SESSION_TTL_SECONDS, json.dumps(session_state), ) def load_live_stt_session(redis_client, session_id: str) -> Optional[Dict[str, object]]: raw_session = redis_client.get(get_live_stt_session_key(session_id)) if not raw_session: return None if isinstance(raw_session, bytes): raw_session = raw_session.decode("utf-8") return json.loads(raw_session) def delete_live_stt_session(redis_client, session_id: str) -> None: redis_client.delete(get_live_stt_session_key(session_id)) def apply_live_stt_hypothesis( session_state: Dict[str, object], hypothesis_text: str, chunk_index: int, is_silence: bool = False, ) -> Dict[str, object]: last_chunk_index = int(session_state.get("last_chunk_index", -1)) if chunk_index < 0: raise ValueError("chunk_index must be non-negative") if chunk_index < last_chunk_index: raise ValueError("chunk_index is older than the last processed chunk") if chunk_index == last_chunk_index: return session_state committed_text = normalize_transcript_text(str(session_state.get("committed_text", ""))) previous_hypothesis = normalize_transcript_text( str(session_state.get("latest_hypothesis", "")) ) current_hypothesis = strip_committed_prefix(committed_text, hypothesis_text) if not current_hypothesis and is_silence and previous_hypothesis: 
def finalize_live_stt_session(session_state: Dict[str, object]) -> str:
    """
    Produce the final transcript for a session.

    The committed text is joined with the latest hypothesis via
    ``join_transcript_parts``; missing fields default to empty strings.

    Args:
        session_state: Session dictionary to read from.

    Returns:
        The joined transcript text.
    """
    committed_part = str(session_state.get("committed_text", ""))
    pending_part = str(session_state.get("latest_hypothesis", ""))
    return join_transcript_parts(committed_part, pending_part)
False, ) -> Dict[str, Any]: _ = diarize request: Dict[str, Any] = { "file": file_path, "model": self.model, "response_format": "verbose_json", } if language: request["language"] = language if timestamps: request["timestamp_granularities"] = ["segment"] with open(file_path, "rb") as audio_file: request["file"] = audio_file response = self.client.audio.transcriptions.create(**request) response_dict = self._to_dict(response) segments = response_dict.get("segments") or [] return { "text": response_dict.get("text", ""), "language": response_dict.get("language") or language, "duration_s": response_dict.get("duration"), "segments": [self._to_dict(segment) for segment in segments], "provider": "openai", } @staticmethod def _to_dict(value: Any) -> Dict[str, Any]: if hasattr(value, "model_dump"): return value.model_dump() if isinstance(value, dict): return value return {} ================================================ FILE: application/stt/stt_creator.py ================================================ from application.stt.base import BaseSTT from application.stt.faster_whisper_stt import FasterWhisperSTT from application.stt.openai_stt import OpenAISTT class STTCreator: stt_providers = { "openai": OpenAISTT, "faster_whisper": FasterWhisperSTT, } @classmethod def create_stt(cls, stt_type, *args, **kwargs) -> BaseSTT: stt_class = cls.stt_providers.get(stt_type.lower()) if not stt_class: raise ValueError(f"No stt class found for type {stt_type}") return stt_class(*args, **kwargs) ================================================ FILE: application/stt/upload_limits.py ================================================ from pathlib import Path from application.core.settings import settings from application.stt.constants import SUPPORTED_AUDIO_EXTENSIONS from application.utils import safe_filename STT_REQUEST_SIZE_OVERHEAD_BYTES = 1024 * 1024 STT_SIZE_LIMITED_PATHS = frozenset(("/api/stt", "/api/stt/live/chunk")) class AudioFileTooLargeError(ValueError): pass def 
get_stt_max_file_size_bytes() -> int: return max(0, settings.STT_MAX_FILE_SIZE_MB) * 1024 * 1024 def build_stt_file_size_limit_message() -> str: return f"Audio file exceeds {settings.STT_MAX_FILE_SIZE_MB}MB limit" def is_audio_filename(filename: str | Path | None) -> bool: if not filename: return False safe_name = safe_filename(Path(str(filename)).name) return Path(safe_name).suffix.lower() in SUPPORTED_AUDIO_EXTENSIONS def enforce_audio_file_size_limit(size_bytes: int) -> None: max_size_bytes = get_stt_max_file_size_bytes() if max_size_bytes and size_bytes > max_size_bytes: raise AudioFileTooLargeError(build_stt_file_size_limit_message()) def should_reject_stt_request(path: str, content_length: int | None) -> bool: if path not in STT_SIZE_LIMITED_PATHS or content_length is None: return False max_request_size_bytes = ( get_stt_max_file_size_bytes() + STT_REQUEST_SIZE_OVERHEAD_BYTES ) return content_length > max_request_size_bytes ================================================ FILE: application/templates/__init__.py ================================================ ================================================ FILE: application/templates/namespaces.py ================================================ import logging import uuid from abc import ABC, abstractmethod from datetime import datetime, timezone from typing import Any, Dict, Optional logger = logging.getLogger(__name__) class NamespaceBuilder(ABC): """Base class for building template context namespaces""" @abstractmethod def build(self, **kwargs) -> Dict[str, Any]: """Build namespace context dictionary""" pass @property @abstractmethod def namespace_name(self) -> str: """Name of this namespace for template access""" pass class SystemNamespace(NamespaceBuilder): """System metadata namespace: {{ system.* }}""" @property def namespace_name(self) -> str: return "system" def build( self, request_id: Optional[str] = None, user_id: Optional[str] = None, **kwargs ) -> Dict[str, Any]: """ Build system context with 
def build(
    self, passthrough_data: Optional[Dict[str, Any]] = None, **kwargs
) -> Dict[str, Any]:
    """
    Build the passthrough namespace from request parameters.

    Only values that are str, int, float, bool or None are kept; any other
    value is skipped and a warning is logged for its key.

    Args:
        passthrough_data: Dictionary of parameters from the web request.

    Returns:
        Dictionary containing the accepted passthrough values (empty when no
        data was supplied).
    """
    allowed_types = (str, int, float, bool, type(None))
    filtered: Dict[str, Any] = {}
    for name, item in (passthrough_data or {}).items():
        if not isinstance(item, allowed_types):
            logger.warning(
                f"Skipping non-serializable passthrough value for key '{name}': {type(item)}"
            )
            continue
        filtered[name] = item
    return filtered
def build_context(self, **kwargs) -> Dict[str, Any]:
    """
    Assemble the full template context by running every registered builder.

    Each namespace is always present in the result: a builder that returns a
    falsy value or raises contributes an empty dict, so templates never hit
    an undefined namespace. Builder failures are logged, not propagated.

    Args:
        **kwargs: Parameters forwarded unchanged to every namespace builder.

    Returns:
        Mapping of namespace name to that namespace's context dictionary.
    """
    assembled: Dict[str, Any] = {}
    for name, namespace_builder in self._builders.items():
        try:
            built = namespace_builder.build(**kwargs) or {}
        except Exception as exc:
            logger.error(f"Failed to build {name} namespace: {str(exc)}")
            built = {}
        assembled[name] = built
    return assembled
def extract_variables(self, template_content: str) -> Set[str]:
    """
    Extract the undeclared variable names referenced by a template.

    The template is parsed (not rendered) and analysed with
    ``jinja2.meta.find_undeclared_variables``, which reports the top-level
    names the template expects from its context (for example ``source`` for
    ``{{ source.content }}``). Names defined inside the template itself, such
    as loop variables, are not reported.

    The previous implementation called ``Environment.get_template_module``,
    which does not exist; the resulting error was swallowed and an empty set
    was returned for every template.

    Args:
        template_content: Template string to analyze.

    Returns:
        Set of variable names found in the template; an empty set for empty
        input or when the template cannot be parsed.
    """
    if not template_content:
        return set()

    # Local import: only `meta` is needed here and it is not part of the
    # names this module imports from jinja2 at the top of the file.
    from jinja2 import meta

    try:
        ast = self._env.parse(template_content)
        return set(meta.find_undeclared_variables(ast))
    except TemplateSyntaxError as e:
        logger.debug(f"Cannot extract variables - syntax error at line {e.lineno}")
        return set()
    except Exception as e:
        logger.debug(f"Cannot extract variables: {type(e).__name__}")
        return set()
def text_to_speech(self, text):
    """
    Convert text to speech with the ElevenLabs client.

    Args:
        text: Text to synthesize.

    Returns:
        Tuple of (base64-encoded audio as a UTF-8 string, language code).
        The language code is always "en".
    """
    request_options = {
        "voice_id": "nPczCjzI2devNBz1zQrb",
        "model_id": "eleven_multilingual_v2",
        "text": text,
        "output_format": "mp3_44100_128",
    }
    # The client yields the audio in chunks; stitch them into one payload.
    audio_chunks = self.client.text_to_speech.convert(**request_options)
    audio_bytes = b"".join(audio_chunks)
    return base64.b64encode(audio_bytes).decode("utf-8"), "en"
return tts_class(*args, **kwargs) ================================================ FILE: application/usage.py ================================================ import sys import logging from datetime import datetime from application.core.mongo_db import MongoDB from application.core.settings import settings from application.utils import num_tokens_from_object_or_list, num_tokens_from_string logger = logging.getLogger(__name__) mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] usage_collection = db["token_usage"] def _serialize_for_token_count(value): """Normalize payloads into token-countable primitives.""" if isinstance(value, str): # Avoid counting large binary payloads in data URLs as text tokens. if value.startswith("data:") and ";base64," in value: return "" return value if value is None: return "" if isinstance(value, list): return [_serialize_for_token_count(item) for item in value] if isinstance(value, dict): serialized = {} for key, raw in value.items(): key_lower = str(key).lower() # Skip raw binary-like fields; keep textual tool-call fields. 
if key_lower in {"data", "base64", "image_data"} and isinstance(raw, str): continue if key_lower == "url" and isinstance(raw, str) and ";base64," in raw: continue serialized[key] = _serialize_for_token_count(raw) return serialized if hasattr(value, "model_dump") and callable(getattr(value, "model_dump")): return _serialize_for_token_count(value.model_dump()) if hasattr(value, "to_dict") and callable(getattr(value, "to_dict")): return _serialize_for_token_count(value.to_dict()) if hasattr(value, "__dict__"): return _serialize_for_token_count(vars(value)) return str(value) def _count_tokens(value): serialized = _serialize_for_token_count(value) if isinstance(serialized, str): return num_tokens_from_string(serialized) return num_tokens_from_object_or_list(serialized) def _count_prompt_tokens(messages, tools=None, usage_attachments=None, **kwargs): prompt_tokens = 0 for message in messages or []: if not isinstance(message, dict): prompt_tokens += _count_tokens(message) continue prompt_tokens += _count_tokens(message.get("content")) # Include tool-related message fields for providers that use OpenAI-native format. prompt_tokens += _count_tokens(message.get("tool_calls")) prompt_tokens += _count_tokens(message.get("tool_call_id")) prompt_tokens += _count_tokens(message.get("function_call")) prompt_tokens += _count_tokens(message.get("function_response")) # Count tool schema payload passed to the model. prompt_tokens += _count_tokens(tools) # Count structured-output/schema payloads when provided. prompt_tokens += _count_tokens(kwargs.get("response_format")) prompt_tokens += _count_tokens(kwargs.get("response_schema")) # Optional usage-only attachment context (not forwarded to provider). 
def stream_token_usage(func):
    """
    Decorate a streaming LLM method so its token usage is counted and recorded.

    Prompt tokens are counted from the messages, tools, schema kwargs and the
    accounting-only ``_usage_attachments`` kwarg, which is removed before the
    wrapped method is called. Generated tokens are counted from the chunks
    that were actually yielded.

    Usage is recorded in a ``finally`` block, so it is also recorded when the
    consumer stops iterating early or the wrapped generator raises part-way
    through. Previously nothing was recorded in those cases.

    Args:
        func: Generator method with signature
            ``(self, model, messages, stream, tools, **kwargs)``.

    Returns:
        A generator function with the same signature that yields the same
        chunks as ``func``.
    """
    import functools

    @functools.wraps(func)
    def wrapper(self, model, messages, stream, tools, **kwargs):
        usage_attachments = kwargs.pop("_usage_attachments", None)
        call_usage = {"prompt_tokens": 0, "generated_tokens": 0}
        call_usage["prompt_tokens"] += _count_prompt_tokens(
            messages,
            tools=tools,
            usage_attachments=usage_attachments,
            **kwargs,
        )
        emitted = []
        try:
            for chunk in func(self, model, messages, stream, tools, **kwargs):
                emitted.append(chunk)
                yield chunk
        finally:
            # Count after streaming so a counting failure cannot interrupt
            # delivery of chunks to the consumer.
            for chunk in emitted:
                call_usage["generated_tokens"] += _count_tokens(chunk)
            self.token_usage["prompt_tokens"] += call_usage["prompt_tokens"]
            self.token_usage["generated_tokens"] += call_usage["generated_tokens"]
            update_token_usage(
                self.decoded_token,
                self.user_api_key,
                call_usage,
                getattr(self, "agent_id", None),
            )

    return wrapper
def get_field_validation_errors(data, required_fields):
    """
    Classify required fields that are absent from ``data`` or hold a falsy value.

    Args:
        data: Mapping to validate.
        required_fields: Iterable of field names that must be present and truthy.

    Returns:
        ``{"missing_fields": [...], "empty_fields": [...]}`` when at least one
        problem is found, otherwise None.
    """
    report = {"missing_fields": [], "empty_fields": []}
    for name in required_fields:
        if name in data:
            if not data[name]:
                report["empty_fields"].append(name)
        else:
            report["missing_fields"].append(name)
    has_problems = any(report.values())
    return report if has_problems else None
def validate_function_name(function_name):
    """
    Check that a function name contains only letters, digits, underscores
    and hyphens.

    ``re.fullmatch`` is used instead of ``re.match`` with ``$``, because ``$``
    also matches just before a trailing newline and would accept ``"name\\n"``.

    Args:
        function_name: Candidate function name.

    Returns:
        True when the whole name matches the allowed pattern; False otherwise,
        including for empty or non-string input.
    """
    if not isinstance(function_name, str):
        return False
    return re.fullmatch(r"[a-zA-Z0-9_-]+", function_name) is not None
def clean_text_for_tts(text: str) -> str:
    """
    Clean markdown-flavoured text for Text-to-Speech processing.

    Fenced code blocks are replaced by short spoken placeholders, images are
    removed, links are reduced to their label, markdown structure and inline
    formatting are stripped, tags and non-ASCII characters are removed, arrow
    and scope sequences are replaced, and whitespace is collapsed.

    Step order matters:
    - Images are removed before links. The link pattern also matches the
      ``[alt](url)`` part of an image, so the old order left ``!alt`` behind.
    - Line-level markers (headers, quotes, list bullets, rules) are stripped
      before inline emphasis. Otherwise a leading ``*`` bullet is paired with
      an emphasis ``*`` later on the same line and the text is mangled.

    Args:
        text: Raw text, possibly containing markdown.

    Returns:
        The cleaned, single-line text.
    """
    # Handle code blocks, images and links
    text = re.sub(r"```mermaid[\s\S]*?```", " flowchart, ", text)  ## ```mermaid...```
    text = re.sub(r"```[\s\S]*?```", " code block, ", text)  ## ```code```
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", "", text)  ## ![alt](url)
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)  ## [text](url)

    # Remove line-level markdown structure
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)  ## # headers
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)  ## > blockquotes
    text = re.sub(r"^[\s]*[-\*\+]\s+", "", text, flags=re.MULTILINE)  ## - * + lists
    text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)  ## 1. numbered lists
    text = re.sub(
        r"^[\*\-_]{3,}\s*$", "", text, flags=re.MULTILINE
    )  ## --- *** ___ rules

    # Remove inline markdown formatting
    text = re.sub(r"`([^`]+)`", r"\1", text)  ## `code`
    text = re.sub(r"\{([^}]*)\}", r" \1 ", text)  ## {text}
    text = re.sub(r"[{}]", " ", text)  ## unmatched {}
    text = re.sub(r"\[([^\]]+)\]", r" \1 ", text)  ## [text]
    text = re.sub(r"[\[\]]", " ", text)  ## unmatched []
    text = re.sub(r"(\*\*|__)(.*?)\1", r"\2", text)  ## **bold** __bold__
    text = re.sub(r"(\*|_)(.*?)\1", r"\2", text)  ## *italic* _italic_
    text = re.sub(r"<[^>]*>", "", text)  ## tags

    # Remove non-ASCII (emojis, special Unicode)
    text = re.sub(r"[^\x20-\x7E\n\r\t]", "", text)

    # Replace special sequences
    text = re.sub(r"-->", ", ", text)  ## -->
    text = re.sub(r"<--", ", ", text)  ## <--
    text = re.sub(r"=>", ", ", text)  ## =>
    text = re.sub(r"::", " ", text)  ## ::

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    text = text.strip()

    return text
def embed_query(self, query: str):
    """
    Embed a single query string.

    The query is sent through ``self._embed`` and exactly one embedding vector
    is expected back. ``self.dimension`` is updated to the length of the
    returned vector. It starts at a fixed default rather than None, so the
    former ``is None`` check never fired and the attribute could disagree
    with the vectors actually produced.

    Args:
        query: Text to embed.

    Returns:
        The embedding vector as a list.

    Raises:
        ValueError: If the embedding result is not a one-element list
            containing a list.
    """
    embeddings_list = self._embed(query)
    is_single_vector = (
        isinstance(embeddings_list, list)
        and len(embeddings_list) == 1
        and isinstance(embeddings_list[0], list)
    )
    if not is_single_vector:
        raise ValueError(
            f"Unexpected result structure after embedding query: {embeddings_list}"
        )

    vector = embeddings_list[0]
    if vector:
        # Keep the advertised dimension in sync with what the API returns.
        self.dimension = len(vector)
    return vector
list of strings") def _get_embeddings_wrapper(): """Lazy import of EmbeddingsWrapper to avoid loading SentenceTransformer when using remote embeddings.""" from application.vectorstore.embeddings_local import EmbeddingsWrapper return EmbeddingsWrapper class EmbeddingsSingleton: _instances = {} @staticmethod def get_instance(embeddings_name, *args, **kwargs): if embeddings_name not in EmbeddingsSingleton._instances: EmbeddingsSingleton._instances[embeddings_name] = ( EmbeddingsSingleton._create_instance(embeddings_name, *args, **kwargs) ) return EmbeddingsSingleton._instances[embeddings_name] @staticmethod def _create_instance(embeddings_name, *args, **kwargs): if embeddings_name == "openai_text-embedding-ada-002": return OpenAIEmbeddings(*args, **kwargs) # Lazy import EmbeddingsWrapper only when needed (avoids loading SentenceTransformer) EmbeddingsWrapper = _get_embeddings_wrapper() embeddings_factory = { "huggingface_sentence-transformers/all-mpnet-base-v2": lambda: EmbeddingsWrapper( "sentence-transformers/all-mpnet-base-v2" ), "huggingface_sentence-transformers-all-mpnet-base-v2": lambda: EmbeddingsWrapper( "sentence-transformers/all-mpnet-base-v2" ), "huggingface_hkunlp/instructor-large": lambda: EmbeddingsWrapper( "hkunlp/instructor-large" ), } if embeddings_name in embeddings_factory: return embeddings_factory[embeddings_name](*args, **kwargs) else: return EmbeddingsWrapper(embeddings_name, *args, **kwargs) class BaseVectorStore(ABC): def __init__(self): pass @abstractmethod def search(self, *args, **kwargs): """Search for similar documents/chunks in the vectorstore""" pass @abstractmethod def add_texts(self, texts, metadatas=None, *args, **kwargs): """Add texts with their embeddings to the vectorstore""" pass def delete_index(self, *args, **kwargs): """Delete the entire index/collection""" pass def save_local(self, *args, **kwargs): """Save vectorstore to local storage""" pass def get_chunks(self, *args, **kwargs): """Get all chunks from the vectorstore""" 
pass def add_chunk(self, text, metadata=None, *args, **kwargs): """Add a single chunk to the vectorstore""" pass def delete_chunk(self, chunk_id, *args, **kwargs): """Delete a specific chunk from the vectorstore""" pass def is_azure_configured(self): return ( settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME ) def _get_embeddings(self, embeddings_name, embeddings_key=None): # Check for remote embeddings first if settings.EMBEDDINGS_BASE_URL: logging.info( f"Using remote embeddings API at: {settings.EMBEDDINGS_BASE_URL}" ) cache_key = f"remote_{settings.EMBEDDINGS_BASE_URL}_{embeddings_name}" if cache_key not in EmbeddingsSingleton._instances: EmbeddingsSingleton._instances[cache_key] = RemoteEmbeddings( api_url=settings.EMBEDDINGS_BASE_URL, model_name=embeddings_name, api_key=embeddings_key, ) return EmbeddingsSingleton._instances[cache_key] if embeddings_name == "openai_text-embedding-ada-002": if self.is_azure_configured(): os.environ["OPENAI_API_TYPE"] = "azure" embedding_instance = EmbeddingsSingleton.get_instance( embeddings_name, model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME ) else: embedding_instance = EmbeddingsSingleton.get_instance( embeddings_name, openai_api_key=embeddings_key ) elif embeddings_name == "huggingface_sentence-transformers/all-mpnet-base-v2": possible_paths = [ "/app/models/all-mpnet-base-v2", # Docker absolute path "./models/all-mpnet-base-v2", # Relative path ] local_model_path = None for path in possible_paths: if os.path.exists(path): local_model_path = path logging.info(f"Found local model at path: {path}") break else: logging.info(f"Path does not exist: {path}") if local_model_path: embedding_instance = EmbeddingsSingleton.get_instance( local_model_path, ) else: logging.warning( f"Local model not found in any of the paths: {possible_paths}. Falling back to HuggingFace download." 
class Document(str):
    """A string that also carries the chunk text and its metadata as attributes.

    The instance itself is the ``page_content`` string, so it can be used
    anywhere a plain ``str`` is expected, while ``page_content`` and
    ``metadata`` remain available as attributes.
    """

    def __new__(cls, page_content: str, metadata: dict):
        doc = str.__new__(cls, page_content)
        doc.page_content, doc.metadata = page_content, metadata
        return doc
def search(self, question, k=2, index_name=settings.ELASTIC_INDEX, *args, **kwargs):
    """Run a combined kNN + full-text search restricted to this source.

    The question is embedded and sent as a kNN clause on the ``vector``
    field together with a ``match`` query on ``text``; both clauses are
    filtered on ``metadata.source_id.keyword``. The query is issued against
    ``self.index_name`` (the ``index_name`` argument is accepted but not
    consulted).

    Returns:
        A list of ``Document`` objects built from each hit's ``_source``.
    """

    def source_filter():
        # A fresh list per clause, exactly as the two clauses need.
        return [{"match": {"metadata.source_id.keyword": self.source_id}}]

    embedder = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
    knn_clause = {
        "filter": source_filter(),
        "field": "vector",
        "k": k,
        "num_candidates": 100,
        "query_vector": embedder.embed_query(question),
    }
    text_clause = {
        "bool": {
            "must": [{"match": {"text": {"query": question}}}],
            "filter": source_filter(),
        }
    }

    response = self.docsearch.search(
        index=self.index_name, query=text_clause, size=k, knn=knn_clause
    )
    return [
        Document(
            page_content=hit["_source"]["text"],
            metadata=hit["_source"]["metadata"],
        )
        for hit in response["hits"]["hits"]
    ]
def add_texts(
    self,
    texts,
    metadatas=None,
    ids=None,
    refresh_indices=True,
    create_index_if_not_exists=True,
    bulk_kwargs=None,
    **kwargs,
):
    """Embed ``texts`` and bulk-index them into ``self.index_name``.

    Args:
        texts: Iterable of strings to index (materialised once up front).
        metadatas: Optional list of metadata dicts, parallel to ``texts``.
        ids: Optional list of document ids; random UUIDs are used otherwise.
        refresh_indices: Forwarded to the bulk helper as ``refresh``.
        create_index_if_not_exists: Create the index (sized from the first
            vector) before indexing when it does not exist yet.
        bulk_kwargs: Extra keyword arguments for ``elasticsearch.helpers.bulk``.

    Returns:
        The list of ids that were indexed; ``[]`` when ``texts`` is empty.

    Raises:
        BulkIndexError: Re-raised after logging when bulk indexing fails.
    """
    import logging
    import uuid

    # Materialise once: texts is iterated for ids, embeddings and actions.
    texts = list(texts)
    if not texts:
        # Nothing to embed; avoids indexing vectors[0] on an empty result.
        return []

    bulk_kwargs = bulk_kwargs or {}
    ids = ids or [str(uuid.uuid4()) for _ in texts]

    embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
    vectors = embeddings.embed_documents(texts)

    if create_index_if_not_exists:
        self._create_index_if_not_exists(
            index_name=self.index_name, dims_length=len(vectors[0])
        )

    actions = [
        {
            "_op_type": "index",
            "_index": self.index_name,
            "text": text,
            "vector": vector,
            "metadata": metadatas[i] if metadatas else {},
            "_id": ids[i],
        }
        for i, (text, vector) in enumerate(zip(texts, vectors))
    ]
    if not actions:
        return []

    from elasticsearch.helpers import BulkIndexError, bulk

    try:
        bulk(
            self._es_connection,
            actions,
            stats_only=True,
            refresh=refresh_indices,
            **bulk_kwargs,
        )
    except BulkIndexError as e:
        logging.error(f"Error adding texts: {e}")
        first_error = (e.errors[0] if e.errors else {}).get("index", {}).get("error", {})
        logging.error(f"First error reason: {first_error.get('reason')}")
        raise
    return ids
class EmbeddingsWrapper:
    """Adapter exposing ``embed_query`` / ``embed_documents`` on top of a
    locally loaded ``SentenceTransformer`` model.

    Attributes:
        model: The loaded ``SentenceTransformer`` instance.
        dimension: Value of ``model.get_sentence_embedding_dimension()``.
    """

    def __init__(self, model_name, *args, **kwargs):
        """Load ``model_name``; extra arguments go to ``SentenceTransformer``.

        ``trust_remote_code`` defaults to True unless the caller sets it.
        Any failure is logged with a traceback and re-raised.
        """
        logging.info(f"Initializing EmbeddingsWrapper with model: {model_name}")
        kwargs.setdefault("trust_remote_code", True)
        try:
            self.model = SentenceTransformer(
                model_name,
                *args,
                config_kwargs={"allow_dangerous_deserialization": True},
                **kwargs,
            )
            loaded = self.model is not None and self.model._first_module() is not None
            if not loaded:
                raise ValueError(
                    f"SentenceTransformer model failed to load properly for: {model_name}"
                )
            self.dimension = self.model.get_sentence_embedding_dimension()
            logging.info(f"Successfully loaded model with dimension: {self.dimension}")
        except Exception as e:
            logging.error(
                f"Failed to initialize SentenceTransformer with model {model_name}: {str(e)}",
                exc_info=True,
            )
            raise

    def embed_query(self, query: str):
        """Encode one string and return the result of ``.tolist()``."""
        encoded = self.model.encode(query)
        return encoded.tolist()

    def embed_documents(self, documents: list):
        """Encode a list of strings and return the result of ``.tolist()``."""
        encoded = self.model.encode(documents)
        return encoded.tolist()

    def __call__(self, text):
        """Route a str to ``embed_query`` and a list to ``embed_documents``."""
        if isinstance(text, str):
            return self.embed_query(text)
        if isinstance(text, list):
            return self.embed_documents(text)
        raise ValueError("Input must be a string or a list of strings")
self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) self.storage = StorageCreator.get_storage() try: if docs_init: self.docsearch = FAISS.from_documents(docs_init, self.embeddings) else: with tempfile.TemporaryDirectory() as temp_dir: faiss_path = f"{self.path}/index.faiss" pkl_path = f"{self.path}/index.pkl" if not self.storage.file_exists( faiss_path ) or not self.storage.file_exists(pkl_path): raise FileNotFoundError( f"Index files not found in storage at {self.path}" ) faiss_file = self.storage.get_file(faiss_path) pkl_file = self.storage.get_file(pkl_path) local_faiss_path = os.path.join(temp_dir, "index.faiss") local_pkl_path = os.path.join(temp_dir, "index.pkl") with open(local_faiss_path, "wb") as f: f.write(faiss_file.read()) with open(local_pkl_path, "wb") as f: f.write(pkl_file.read()) self.docsearch = FAISS.load_local( temp_dir, self.embeddings, allow_dangerous_deserialization=True ) except Exception as e: raise Exception(f"Error loading FAISS index: {str(e)}") self.assert_embedding_dimensions(self.embeddings) def search(self, *args, **kwargs): return self.docsearch.similarity_search(*args, **kwargs) def add_texts(self, *args, **kwargs): return self.docsearch.add_texts(*args, **kwargs) def _save_to_storage(self): """ Save the FAISS index to storage using temporary directory pattern. Works consistently for both local and S3 storage. 
""" with tempfile.TemporaryDirectory() as temp_dir: self.docsearch.save_local(temp_dir) faiss_path = os.path.join(temp_dir, "index.faiss") pkl_path = os.path.join(temp_dir, "index.pkl") with open(faiss_path, "rb") as f_faiss: faiss_data = f_faiss.read() with open(pkl_path, "rb") as f_pkl: pkl_data = f_pkl.read() storage_path = get_vectorstore(self.source_id) self.storage.save_file(io.BytesIO(faiss_data), f"{storage_path}/index.faiss") self.storage.save_file(io.BytesIO(pkl_data), f"{storage_path}/index.pkl") return True def save_local(self, path=None): if path: os.makedirs(path, exist_ok=True) self.docsearch.save_local(path) self._save_to_storage() return True def delete_index(self, *args, **kwargs): return self.docsearch.delete(*args, **kwargs) def assert_embedding_dimensions(self, embeddings): """Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used.""" if ( settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2" ): word_embedding_dimension = getattr(embeddings, "dimension", None) if word_embedding_dimension is None: raise AttributeError( "'dimension' attribute not found in embeddings instance." 
class LanceDBVectorStore(BaseVectorStore):
    """Vector store backed by a LanceDB table.

    ``pyarrow`` and ``lancedb`` are imported lazily on first use. Each row
    holds a fixed-size ``vector``, the chunk ``text`` and ``metadata`` stored
    as a list of ``{"key", "value"}`` string pairs.
    """

    def __init__(self, path: str = settings.LANCEDB_PATH,
                 table_name_prefix: str = settings.LANCEDB_TABLE_NAME,
                 source_id: str = None,
                 embeddings_key: str = "embeddings"):
        """Initialize the LanceDB vector store.

        Args:
            path: Location passed to ``lancedb.connect``.
            table_name_prefix: Table name, or its prefix when ``source_id`` is set.
            source_id: Optional suffix appended as ``{prefix}_{source_id}``.
            embeddings_key: Key forwarded to ``_get_embeddings``.
        """
        super().__init__()
        self.path = path
        self.table_name = f"{table_name_prefix}_{source_id}" if source_id else table_name_prefix
        self.embeddings_key = embeddings_key
        self._lance_db = None
        self.docsearch = None
        self._pa = None  # PyArrow (pa) will be lazy loaded
        self._lancedb_module = None  # lancedb will be lazy loaded

    @property
    def pa(self):
        """Lazy load pyarrow module."""
        if self._pa is None:
            self._pa = importlib.import_module("pyarrow")
        return self._pa

    @property
    def lancedb(self):
        """Lazy load lancedb module."""
        if getattr(self, "_lancedb_module", None) is None:
            self._lancedb_module = importlib.import_module("lancedb")
        return self._lancedb_module

    @property
    def lance_db(self):
        """Lazy load the LanceDB connection."""
        if self._lance_db is None:
            self._lance_db = self.lancedb.connect(self.path)
        return self._lance_db

    @property
    def table(self):
        """Return the opened table, or None when it does not exist yet."""
        if self.docsearch is None and self.table_name in self.lance_db.table_names():
            self.docsearch = self.lance_db.open_table(self.table_name)
        return self.docsearch

    def ensure_table_exists(self):
        """Create the table with the expected schema when it is missing."""
        # Compare with None explicitly: truthiness of a table object may
        # depend on its length, and an empty table must not be re-created.
        if self.table is not None:
            return
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
        pair = self.pa.struct([
            self.pa.field("key", self.pa.string()),
            self.pa.field("value", self.pa.string()),
        ])
        schema = self.pa.schema([
            self.pa.field("vector", self.pa.list_(self.pa.float32(), list_size=embeddings.dimension)),
            self.pa.field("text", self.pa.string()),
            # add_texts writes a LIST of key/value pairs per row, so the
            # column must be a list of structs rather than a single struct.
            self.pa.field("metadata", self.pa.list_(pair)),
        ])
        self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)

    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, source_id: str = None):
        """Add texts with metadata and their embeddings to the LanceDB table.

        Args:
            texts: Strings to embed and store.
            metadatas: Optional per-text metadata dicts (not modified).
            source_id: When given, stored under the ``source_id`` metadata key.
        """
        texts = list(texts)
        if not texts:
            return
        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts)
        per_text_metadata = metadatas or [{} for _ in texts]

        rows = []
        for embedding, text, metadata in zip(embeddings, texts, per_text_metadata):
            # Copy so neither the caller's dicts nor a shared default is mutated.
            metadata = dict(metadata or {})
            if source_id:
                metadata["source_id"] = source_id
            rows.append({
                "vector": embedding,
                "text": text,
                "metadata": [{"key": k, "value": str(v)} for k, v in metadata.items()],
            })

        self.ensure_table_exists()
        self.docsearch.add(rows)

    def search(self, query: str, k: int = 2, *args, **kwargs):
        """Search LanceDB for the top k most similar vectors.

        Returns:
            A list of ``(_distance, text, metadata)`` tuples.
        """
        self.ensure_table_exists()
        query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
        results = self.docsearch.search(query_embedding).limit(k).to_list()
        return [(result["_distance"], result["text"], result["metadata"]) for result in results]

    def delete_index(self):
        """Delete the entire LanceDB index (table)."""
        if self.table is not None:
            self.lance_db.drop_table(self.table_name)
            # Forget the handle so later calls do not use a dropped table.
            self.docsearch = None

    def assert_embedding_dimensions(self, embeddings):
        """Ensure that embedding dimensions match the table's vector size.

        Raises:
            ValueError: When ``embeddings.dimension`` differs from the
                fixed list size of the table's ``vector`` column.
        """
        word_embedding_dimension = embeddings.dimension
        if self.table is not None:
            # The vector column is a fixed-size list; its length is list_size.
            table_index_dimension = self.docsearch.schema.field("vector").type.list_size
            if word_embedding_dimension != table_index_dimension:
                raise ValueError(
                    f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) "
                    f"!= table index dimension ({table_index_dimension})"
                )

    def filter_documents(self, filter_condition: dict) -> List[dict]:
        """Return the rows whose metadata carries the requested ``source_id``.

        Args:
            filter_condition: Must contain the key ``source_id``.

        Raises:
            ValueError: If ``source_id`` is missing from ``filter_condition``.
        """
        # Ensure source_id exists in the filter condition
        if 'source_id' not in filter_condition:
            raise ValueError("filter_condition must contain 'source_id'")
        self.ensure_table_exists()

        # Metadata values are stored as strings (see add_texts).
        wanted = str(filter_condition["source_id"])

        def matches(row):
            return any(
                entry.get("key") == "source_id" and entry.get("value") == wanted
                for entry in (row.get("metadata") or [])
            )

        # Manual filtering: the whole table is loaded and scanned in Python.
        return [row for row in self.docsearch.to_arrow().to_pylist() if matches(row)]
def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, *args, **kwargs):
    """Add texts to the Milvus collection and return the backend's result.

    Args:
        texts: Strings to embed and store (materialised into a list).
        metadatas: Optional per-text metadata dicts; defaults to None so the
            method can be called like ``BaseVectorStore.add_texts``.
        **kwargs: Forwarded to the underlying store. An ``ids`` entry, when
            given and non-empty, is used instead of generated UUIDs.

    Returns:
        Whatever the wrapped ``add_texts`` returns.
    """
    texts = list(texts)
    # Pop ids so a caller-supplied value is not passed twice.
    ids = kwargs.pop("ids", None) or [str(uuid4()) for _ in texts]
    return self._docsearch.add_texts(texts=texts, metadatas=metadatas, ids=ids, *args, **kwargs)
) self._client = pymongo.MongoClient(self._mongo_uri) self._database = self._client[database] self._collection = self._database[collection] def search(self, question, k=2, *args, **kwargs): query_vector = self._embedding.embed_query(question) pipeline = [ { "$vectorSearch": { "queryVector": query_vector, "path": self._embedding_key, "limit": k, "numCandidates": k * 10, "index": self._index_name, "filter": {"source_id": {"$eq": self._source_id}}, } } ] cursor = self._collection.aggregate(pipeline) results = [] for doc in cursor: text = doc[self._text_key] doc.pop("_id") doc.pop(self._text_key) doc.pop(self._embedding_key) metadata = doc results.append(Document(text, metadata)) return results def _insert_texts(self, texts, metadatas): if not texts: return [] embeddings = self._embedding.embed_documents(texts) to_insert = [ {self._text_key: t, self._embedding_key: embedding, **m} for t, m, embedding in zip(texts, metadatas, embeddings) ] insert_result = self._collection.insert_many(to_insert) return insert_result.inserted_ids def add_texts( self, texts, metadatas=None, ids=None, refresh_indices=True, create_index_if_not_exists=True, bulk_kwargs=None, **kwargs, ): # dims = self._embedding.client[1].word_embedding_dimension # # check if index exists # if create_index_if_not_exists: # # check if index exists # info = self._collection.index_information() # if self._index_name not in info: # index_mongo = { # "fields": [{ # "type": "vector", # "path": self._embedding_key, # "numDimensions": dims, # "similarity": "cosine", # }, # { # "type": "filter", # "path": "store" # }] # } # self._collection.create_index(self._index_name, index_mongo) batch_size = 100 _metadatas = metadatas or ({} for _ in texts) texts_batch = [] metadatas_batch = [] result_ids = [] for i, (text, metadata) in enumerate(zip(texts, _metadatas)): texts_batch.append(text) metadatas_batch.append(metadata) if (i + 1) % batch_size == 0: result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) 
texts_batch = [] metadatas_batch = [] if texts_batch: result_ids.extend(self._insert_texts(texts_batch, metadatas_batch)) return result_ids def delete_index(self, *args, **kwargs): self._collection.delete_many({"source_id": self._source_id}) def get_chunks(self): try: chunks = [] cursor = self._collection.find({"source_id": self._source_id}) for doc in cursor: doc_id = str(doc.get("_id")) text = doc.get(self._text_key) metadata = { k: v for k, v in doc.items() if k not in ["_id", self._text_key, self._embedding_key, "source_id"] } if text: chunks.append( {"doc_id": doc_id, "text": text, "metadata": metadata} ) return chunks except Exception as e: logging.error(f"Error getting chunks: {e}", exc_info=True) return [] def add_chunk(self, text, metadata=None): metadata = metadata or {} embeddings = self._embedding.embed_documents([text]) if not embeddings: raise ValueError("Could not generate embedding for chunk") chunk_data = { self._text_key: text, self._embedding_key: embeddings[0], "source_id": self._source_id, **metadata, } result = self._collection.insert_one(chunk_data) return str(result.inserted_id) def delete_chunk(self, chunk_id): try: from bson.objectid import ObjectId object_id = ObjectId(chunk_id) result = self._collection.delete_one({"_id": object_id}) return result.deleted_count > 0 except Exception as e: logging.error(f"Error deleting chunk: {e}", exc_info=True) return False ================================================ FILE: application/vectorstore/pgvector.py ================================================ import logging from typing import List, Optional, Any, Dict from application.core.settings import settings from application.vectorstore.base import BaseVectorStore from application.vectorstore.document_class import Document class PGVectorStore(BaseVectorStore): def __init__( self, source_id: str = "", embeddings_key: str = "embeddings", table_name: str = "documents", decoded_token: Optional[str] = None, vector_column: str = "embedding", 
text_column: str = "text", metadata_column: str = "metadata", connection_string: str = None, ): super().__init__() # Store the source_id for use in add_chunk self._source_id = str(source_id).replace("application/indexes/", "").rstrip("/") self._embeddings_key = embeddings_key self._table_name = table_name self._vector_column = vector_column self._text_column = text_column self._metadata_column = metadata_column self._embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) # Use provided connection string or fall back to settings self._connection_string = connection_string or getattr(settings, 'PGVECTOR_CONNECTION_STRING', None) if not self._connection_string: raise ValueError( "PostgreSQL connection string is required. " "Set PGVECTOR_CONNECTION_STRING in settings or pass connection_string parameter." ) try: import psycopg2 from psycopg2.extras import Json import pgvector.psycopg2 except ImportError: raise ImportError( "Could not import required packages. " "Please install with `pip install psycopg2-binary pgvector`." 
def search(self, question: str, k: int = 2, *args, **kwargs) -> List[Document]:
    """Search for similar documents using vector similarity.

    The question is embedded and rows for this ``source_id`` are ordered by
    cosine distance (``<=>``) to that vector.

    Args:
        question: Text to embed and search for.
        k: Maximum number of rows to return.

    Returns:
        Up to ``k`` ``Document`` objects, or ``[]`` if the query fails (the
        error is logged, not raised).
    """
    query_vector = self._embedding.embed_query(question)

    conn = self._get_connection()
    cursor = conn.cursor()
    try:
        # Use cosine distance for similarity search with proper vector formatting
        search_query = f"""
            SELECT {self._text_column}, {self._metadata_column},
                   ({self._vector_column} <=> %s::vector) as distance
            FROM {self._table_name}
            WHERE source_id = %s
            ORDER BY {self._vector_column} <=> %s::vector
            LIMIT %s;
        """
        cursor.execute(search_query, (query_vector, self._source_id, query_vector, k))
        results = cursor.fetchall()
        # End the read transaction so the cached connection is not left open in one.
        conn.commit()
        return [
            Document(page_content=text, metadata=metadata or {})
            for text, metadata, _distance in results
        ]
    except Exception as e:
        logging.error(f"Error searching documents: {e}", exc_info=True)
        # Without a rollback the cached connection stays in an aborted
        # transaction and every later statement on it fails.
        try:
            conn.rollback()
        except Exception as rollback_error:
            logging.error(f"Error rolling back after failed search: {rollback_error}")
        return []
    finally:
        cursor.close()
def get_chunks(self) -> List[Dict[str, Any]]:
    """Get all chunks for this source_id.

    Returns:
        A list of ``{"doc_id", "text", "metadata"}`` dicts, with ``doc_id``
        converted to ``str`` and missing metadata replaced by ``{}``.
        Returns ``[]`` if the query fails (the error is logged, not raised).
    """
    conn = self._get_connection()
    cursor = conn.cursor()
    try:
        select_query = f"""
            SELECT id, {self._text_column}, {self._metadata_column}
            FROM {self._table_name}
            WHERE source_id = %s;
        """
        cursor.execute(select_query, (self._source_id,))
        results = cursor.fetchall()
        # End the read transaction so the cached connection is not left open in one.
        conn.commit()
        return [
            {"doc_id": str(doc_id), "text": text, "metadata": metadata or {}}
            for doc_id, text, metadata in results
        ]
    except Exception as e:
        logging.error(f"Error getting chunks: {e}")
        # Without a rollback the cached connection stays in an aborted
        # transaction and every later statement on it fails.
        try:
            conn.rollback()
        except Exception as rollback_error:
            logging.error(f"Error rolling back after failed get_chunks: {rollback_error}")
        return []
    finally:
        cursor.close()
class QdrantStore(BaseVectorStore):
    """Vector store backed by the Qdrant collection named in settings.

    Every instance talks to ``settings.QDRANT_COLLECTION_NAME``; reads and
    deletes are scoped to one source through a filter on the
    ``metadata.source_id`` payload field.
    """

    def __init__(self, source_id: str = "", embeddings_key: str = "embeddings"):
        """Connect to Qdrant and make sure the collection and payload index exist.

        Args:
            source_id: Identifier of the source; a leading
                ``application/indexes/`` and trailing ``/`` are stripped.
            embeddings_key: Key passed on to ``_get_embeddings``.
        """
        from qdrant_client import models
        from langchain_community.vectorstores.qdrant import Qdrant

        # Store the source_id for use in add_chunk
        self._source_id = (
            str(source_id).replace("application/indexes/", "").rstrip("/")
        )

        self._filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="metadata.source_id",
                    match=models.MatchValue(value=self._source_id),
                )
            ]
        )

        embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key)
        self._docsearch = Qdrant.construct_instance(
            ["TEXT_TO_OBTAIN_EMBEDDINGS_DIMENSION"],
            embedding=embedding,
            collection_name=settings.QDRANT_COLLECTION_NAME,
            location=settings.QDRANT_LOCATION,
            url=settings.QDRANT_URL,
            port=settings.QDRANT_PORT,
            grpc_port=settings.QDRANT_GRPC_PORT,
            https=settings.QDRANT_HTTPS,
            prefer_grpc=settings.QDRANT_PREFER_GRPC,
            api_key=settings.QDRANT_API_KEY,
            prefix=settings.QDRANT_PREFIX,
            timeout=settings.QDRANT_TIMEOUT,
            path=settings.QDRANT_PATH,
            distance_func=settings.QDRANT_DISTANCE_FUNC,
        )
        try:
            client = self._docsearch.client
            existing_names = [
                collection.name
                for collection in client.get_collections().collections
            ]

            if settings.QDRANT_COLLECTION_NAME not in existing_names:
                # create_collection instead of recreate_collection: if another
                # worker created (and filled) the collection after the check
                # above, recreate_collection would drop it. create_collection
                # fails instead, and that failure is logged below.
                client.create_collection(
                    collection_name=settings.QDRANT_COLLECTION_NAME,
                    vectors_config=models.VectorParams(
                        size=embedding.client[1].word_embedding_dimension,
                        distance=models.Distance.COSINE,
                    ),
                )

            # Ensure the required index exists for metadata.source_id
            try:
                client.create_payload_index(
                    collection_name=settings.QDRANT_COLLECTION_NAME,
                    field_name="metadata.source_id",
                    field_schema=models.PayloadSchemaType.KEYWORD,
                )
            except Exception as index_error:
                # Index might already exist, which is fine
                if "already exists" not in str(index_error).lower():
                    logging.warning(
                        f"Could not create index for metadata.source_id: {index_error}"
                    )
        except Exception as e:
            logging.warning(f"Could not check for collection: {e}")

    def search(self, *args, **kwargs):
        """Run a similarity search restricted to this source's points."""
        return self._docsearch.similarity_search(filter=self._filter, *args, **kwargs)

    def add_texts(self, *args, **kwargs):
        """Forward to the underlying store's ``add_texts`` unchanged."""
        return self._docsearch.add_texts(*args, **kwargs)

    def save_local(self, *args, **kwargs):
        """No-op; nothing is written locally by this store."""
        pass

    def delete_index(self, *args, **kwargs):
        """Delete every point matching this source's filter."""
        return self._docsearch.client.delete(
            collection_name=settings.QDRANT_COLLECTION_NAME,
            points_selector=self._filter,
        )

    def get_chunks(self):
        """Scroll through all points of this source.

        Returns:
            A list of ``{"doc_id", "text", "metadata"}`` dicts, or ``[]`` if
            scrolling fails (the error is logged).
        """
        try:
            chunks = []
            offset = None
            while True:
                records, offset = self._docsearch.client.scroll(
                    collection_name=settings.QDRANT_COLLECTION_NAME,
                    scroll_filter=self._filter,
                    limit=10,
                    with_payload=True,
                    with_vectors=False,
                    offset=offset,
                )
                for record in records:
                    chunks.append(
                        {
                            "doc_id": record.id,
                            "text": record.payload.get("page_content"),
                            "metadata": record.payload.get("metadata"),
                        }
                    )
                # scroll() hands back None as the next offset on the last page.
                if offset is None:
                    break
            return chunks
        except Exception as e:
            logging.error(f"Error getting chunks: {e}", exc_info=True)
            return []

    def add_chunk(self, text, metadata=None):
        """Store a single chunk and return its id.

        The caller's ``metadata`` is copied, and ``source_id`` is set on the
        copy so the chunk is matched by this store's filter.
        """
        import uuid

        # Create a copy to avoid modifying the original metadata
        final_metadata = (metadata or {}).copy()

        # Ensure the source_id is in the metadata so the chunk can be found by filters
        final_metadata["source_id"] = self._source_id

        doc = Document(page_content=text, metadata=final_metadata)
        # Generate a unique ID for the document
        doc_id = str(uuid.uuid4())
        doc.id = doc_id
        doc_ids = self._docsearch.add_documents([doc])
        return doc_ids[0] if doc_ids else doc_id

    def delete_chunk(self, chunk_id):
        """Delete one point by id; returns False only if the client call raises."""
        try:
            self._docsearch.client.delete(
                collection_name=settings.QDRANT_COLLECTION_NAME,
                points_selector=[chunk_id],
            )
            return True
        except Exception as e:
            logging.error(f"Error deleting chunk: {e}", exc_info=True)
            return False
application.parser.connectors.connector_creator import ConnectorCreator from application.parser.embedding_pipeline import embed_and_store_documents from application.parser.file.bulk import SimpleDirectoryReader, get_default_file_extractor from application.parser.file.constants import SUPPORTED_SOURCE_EXTENSIONS from application.parser.remote.remote_creator import RemoteCreator from application.parser.schema.base import Document from application.retriever.retriever_creator import RetrieverCreator from application.storage.storage_creator import StorageCreator from application.utils import count_tokens_docs, num_tokens_from_string mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] sources_collection = db["sources"] # Constants MIN_TOKENS = 150 MAX_TOKENS = 1250 RECURSION_DEPTH = 2 # Define a function to extract metadata from a given filename. def metadata_from_filename(title): return {"title": title} def _normalize_file_name_map(file_name_map): if not file_name_map: return {} if isinstance(file_name_map, str): try: file_name_map = json.loads(file_name_map) except Exception: return {} return file_name_map if isinstance(file_name_map, dict) else {} def _get_display_name(file_name_map, rel_path): if not file_name_map or not rel_path: return None if rel_path in file_name_map: return file_name_map[rel_path] base_name = os.path.basename(rel_path) return file_name_map.get(base_name) def _apply_display_names_to_structure(structure, file_name_map, prefix=""): if not isinstance(structure, dict) or not file_name_map: return structure for name, node in structure.items(): if isinstance(node, dict) and "type" in node and "size_bytes" in node: rel_path = f"{prefix}/{name}" if prefix else name display_name = _get_display_name(file_name_map, rel_path) if display_name: node["display_name"] = display_name elif isinstance(node, dict): next_prefix = f"{prefix}/{name}" if prefix else name _apply_display_names_to_structure(node, file_name_map, next_prefix) return structure # 
Define a function to generate a random string of a given length. def generate_random_string(length): return "".join([string.ascii_letters[i % 52] for i in range(length)]) current_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) # Zip extraction security limits MAX_UNCOMPRESSED_SIZE = 500 * 1024 * 1024 # 500 MB max uncompressed size MAX_FILE_COUNT = 10000 # Maximum number of files to extract MAX_COMPRESSION_RATIO = 100 # Maximum compression ratio (to detect zip bombs) class ZipExtractionError(Exception): """Raised when zip extraction fails due to security constraints.""" pass def _is_path_safe(base_path: str, target_path: str) -> bool: """ Check if target_path is safely within base_path (prevents zip slip attacks). Args: base_path: The base directory where extraction should occur. target_path: The full path where a file would be extracted. Returns: True if the path is safe, False otherwise. """ # Resolve to absolute paths and check containment base_resolved = os.path.realpath(base_path) target_resolved = os.path.realpath(target_path) return target_resolved.startswith(base_resolved + os.sep) or target_resolved == base_resolved def _validate_zip_safety(zip_path: str, extract_to: str) -> None: """ Validate a zip file for security issues before extraction. Checks for: - Zip bombs (excessive compression ratio or uncompressed size) - Too many files - Path traversal attacks (zip slip) Args: zip_path: Path to the zip file. extract_to: Destination directory. Raises: ZipExtractionError: If the zip file fails security validation. """ try: with zipfile.ZipFile(zip_path, "r") as zip_ref: # Get compressed size compressed_size = os.path.getsize(zip_path) # Calculate total uncompressed size and file count total_uncompressed = 0 file_count = 0 for info in zip_ref.infolist(): file_count += 1 # Check file count limit if file_count > MAX_FILE_COUNT: raise ZipExtractionError( f"Zip file contains too many files (>{MAX_FILE_COUNT}). 
" "This may be a zip bomb attack." ) # Accumulate uncompressed size total_uncompressed += info.file_size # Check total uncompressed size if total_uncompressed > MAX_UNCOMPRESSED_SIZE: raise ZipExtractionError( f"Zip file uncompressed size exceeds limit " f"({total_uncompressed / (1024*1024):.1f} MB > " f"{MAX_UNCOMPRESSED_SIZE / (1024*1024):.1f} MB). " "This may be a zip bomb attack." ) # Check for path traversal (zip slip) target_path = os.path.join(extract_to, info.filename) if not _is_path_safe(extract_to, target_path): raise ZipExtractionError( f"Zip file contains path traversal attempt: {info.filename}" ) # Check compression ratio (only if compressed size is meaningful) if compressed_size > 0 and total_uncompressed > 0: compression_ratio = total_uncompressed / compressed_size if compression_ratio > MAX_COMPRESSION_RATIO: raise ZipExtractionError( f"Zip file has suspicious compression ratio ({compression_ratio:.1f}:1 > " f"{MAX_COMPRESSION_RATIO}:1). This may be a zip bomb attack." ) except zipfile.BadZipFile as e: raise ZipExtractionError(f"Invalid or corrupted zip file: {e}") def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5): """ Recursively extract zip files with security protections. Security measures: - Limits recursion depth to prevent infinite loops - Validates uncompressed size to prevent zip bombs - Limits number of files to prevent resource exhaustion - Checks compression ratio to detect zip bombs - Validates paths to prevent zip slip attacks Args: zip_path (str): Path to the zip file to be extracted. extract_to (str): Destination path for extracted files. current_depth (int): Current depth of recursion. max_depth (int): Maximum allowed depth of recursion to prevent infinite loops. 
""" if current_depth > max_depth: logging.warning(f"Reached maximum recursion depth of {max_depth}") return try: # Validate zip file safety before extraction _validate_zip_safety(zip_path, extract_to) # Safe to extract with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(extract_to) os.remove(zip_path) # Remove the zip file after extracting except ZipExtractionError as e: logging.error(f"Zip security validation failed for {zip_path}: {e}") # Remove the potentially malicious zip file try: os.remove(zip_path) except OSError: pass return except Exception as e: logging.error(f"Error extracting zip file {zip_path}: {e}", exc_info=True) return # Check for nested zip files and extract them for root, dirs, files in os.walk(extract_to): for file in files: if file.endswith(".zip"): # If a nested zip file is found, extract it recursively file_path = os.path.join(root, file) extract_zip_recursive(file_path, root, current_depth + 1, max_depth) def download_file(url, params, dest_path): try: response = requests.get(url, params=params) response.raise_for_status() with open(dest_path, "wb") as f: f.write(response.content) except requests.RequestException as e: logging.error(f"Error downloading file: {e}") raise def upload_index(full_path, file_data): files = None try: headers = {} if settings.INTERNAL_KEY: headers["X-Internal-Key"] = settings.INTERNAL_KEY if settings.VECTOR_STORE == "faiss": faiss_path = full_path + "/index.faiss" pkl_path = full_path + "/index.pkl" if not os.path.exists(faiss_path): logging.error(f"FAISS index file not found: {faiss_path}") raise FileNotFoundError(f"FAISS index file not found: {faiss_path}") if not os.path.exists(pkl_path): logging.error(f"FAISS pickle file not found: {pkl_path}") raise FileNotFoundError(f"FAISS pickle file not found: {pkl_path}") files = { "file_faiss": open(faiss_path, "rb"), "file_pkl": open(pkl_path, "rb"), } response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data, 
headers=headers, ) else: response = requests.post( urljoin(settings.API_URL, "/api/upload_index"), data=file_data, headers=headers, ) response.raise_for_status() except (requests.RequestException, FileNotFoundError) as e: logging.error(f"Error uploading index: {e}") raise finally: if settings.VECTOR_STORE == "faiss" and files is not None: for file in files.values(): file.close() def run_agent_logic(agent_config, input_data): try: from application.core.model_utils import ( get_api_key_for_provider, get_default_model_id, get_provider_from_model_id, validate_model_id, ) from application.utils import calculate_doc_token_budget source = agent_config.get("source") retriever = agent_config.get("retriever", "classic") if isinstance(source, DBRef): source_doc = db.dereference(source) source = str(source_doc["_id"]) retriever = source_doc.get("retriever", agent_config.get("retriever")) else: source = {} source = {"active_docs": source} chunks = int(agent_config.get("chunks", 2)) prompt_id = agent_config.get("prompt_id", "default") user_api_key = agent_config["key"] agent_id = str(agent_config.get("_id")) if agent_config.get("_id") else None agent_type = agent_config.get("agent_type", "classic") decoded_token = {"sub": agent_config.get("user")} json_schema = agent_config.get("json_schema") prompt = get_prompt(prompt_id, db["prompts"]) # Determine model_id: check agent's default_model_id, fallback to system default agent_default_model = agent_config.get("default_model_id", "") if agent_default_model and validate_model_id(agent_default_model): model_id = agent_default_model else: model_id = get_default_model_id() # Get provider and API key for the selected model provider = get_provider_from_model_id(model_id) if model_id else settings.LLM_PROVIDER system_api_key = get_api_key_for_provider(provider or settings.LLM_PROVIDER) # Calculate proper doc_token_limit based on model's context window doc_token_limit = calculate_doc_token_budget( model_id=model_id ) retriever = 
# Define the main function for ingesting and processing documents.
def ingest_worker(
    self,
    directory,
    formats,
    job_name,
    file_path,
    filename,
    user,
    retriever="classic",
    file_name_map=None,
):
    """
    Ingest and process documents.

    The source at ``file_path`` (a single file or a directory in storage) is
    copied into a temporary directory, parsed, chunked, embedded into a
    vector store under that temporary directory, and then reported to the API
    through ``upload_index``.

    Args:
        self: Reference to the instance of the task.
        directory (str): Specifies the directory for ingesting ('inputs' or 'temp').
            Only echoed back in the result here.
        formats (list of str): List of file extensions to consider for ingestion (e.g., [".rst", ".md"]).
        job_name (str): Name of the job for this ingestion task (original, unsanitized).
        file_path (str): Complete file path to use consistently throughout the pipeline.
        filename (str): Original unsanitized filename provided by the user.
        user (str): Identifier for the user initiating the ingestion (original, unsanitized).
        retriever (str): Type of retriever to use for processing the documents.
        file_name_map (dict|str|None): Optional mapping of safe relative paths to original filenames.

    Returns:
        dict: Information about the completed ingestion task, including input parameters and a "limited" flag.

    Raises:
        Exception: Any error raised inside the pipeline is logged and re-raised.
    """
    # Fixed reader options. ``sample`` is always False in this function, so
    # the sample-mode logging branches below never run.
    input_files = None
    recursive = True
    limit = None
    exclude = True
    sample = False

    storage = StorageCreator.get_storage()

    logging.info(f"Ingest path: {file_path}", extra={"user": user, "job": job_name})

    # Create temporary working directory
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            os.makedirs(temp_dir, exist_ok=True)

            if storage.is_directory(file_path):
                # Handle directory case
                logging.info(f"Processing directory: {file_path}")
                files_list = storage.list_files(file_path)

                for storage_file_path in files_list:
                    if storage.is_directory(storage_file_path):
                        continue

                    # Create relative path structure in temp directory
                    rel_path = os.path.relpath(storage_file_path, file_path)
                    local_file_path = os.path.join(temp_dir, rel_path)

                    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

                    # Download file; a failure skips this file and the
                    # remaining files are still processed.
                    try:
                        file_data = storage.get_file(storage_file_path)
                        with open(local_file_path, "wb") as f:
                            f.write(file_data.read())
                    except Exception as e:
                        logging.error(
                            f"Error downloading file {storage_file_path}: {e}"
                        )
                        continue
            else:
                # Handle single file case
                temp_filename = os.path.basename(file_path)
                temp_file_path = os.path.join(temp_dir, temp_filename)

                file_data = storage.get_file(file_path)
                with open(temp_file_path, "wb") as f:
                    f.write(file_data.read())

                # Handle zip files: unpack in place so the reader sees the
                # archive's contents instead of the archive.
                if temp_filename.endswith(".zip"):
                    logging.info(f"Extracting zip file: {temp_filename}")
                    extract_zip_recursive(
                        temp_file_path,
                        temp_dir,
                        current_depth=0,
                        max_depth=RECURSION_DEPTH,
                    )

            self.update_state(state="PROGRESS", meta={"current": 1})
            if sample:
                logging.info(f"Sample mode enabled. Using {limit} documents.")

            reader = SimpleDirectoryReader(
                input_dir=temp_dir,
                input_files=input_files,
                recursive=recursive,
                required_exts=formats,
                exclude_hidden=exclude,
                file_metadata=metadata_from_filename,
            )
            raw_docs = reader.load_data()

            # The reader may not expose a structure; fall back to an empty one.
            directory_structure = getattr(reader, "directory_structure", {})
            logging.info(f"Directory structure from reader: {directory_structure}")

            file_name_map = _normalize_file_name_map(file_name_map)
            if file_name_map:
                # Replace the stored names with the display names from the
                # map, both on each document and in the structure.
                for doc in raw_docs:
                    extra_info = getattr(doc, "extra_info", None)
                    if not isinstance(extra_info, dict):
                        continue
                    rel_path = extra_info.get("source") or extra_info.get("file_path")
                    display_name = _get_display_name(file_name_map, rel_path)
                    if display_name:
                        display_name = str(display_name)
                        extra_info["filename"] = display_name
                        extra_info["file_name"] = display_name
                        extra_info["title"] = display_name
                directory_structure = _apply_display_names_to_structure(
                    directory_structure, file_name_map
                )

            chunker = Chunker(
                chunking_strategy="classic_chunk",
                max_tokens=MAX_TOKENS,
                min_tokens=MIN_TOKENS,
                duplicate_headers=False,
            )
            raw_docs = chunker.chunk(documents=raw_docs)

            docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]

            # Fresh identifier for the new source; sent to the API as "id".
            id = ObjectId()

            vector_store_path = os.path.join(temp_dir, "vector_store")
            os.makedirs(vector_store_path, exist_ok=True)

            embed_and_store_documents(docs, vector_store_path, id, self)

            tokens = count_tokens_docs(docs)

            self.update_state(state="PROGRESS", meta={"current": 100})

            if sample:
                for i in range(min(5, len(raw_docs))):
                    logging.info(f"Sample document {i}: {raw_docs[i]}")
            file_data = {
                "name": job_name,
                "file": filename,
                "user": user,
                "tokens": tokens,
                "retriever": retriever,
                "id": str(id),
                "type": "local",
                "file_path": file_path,
                "directory_structure": json.dumps(directory_structure),
            }
            if file_name_map:
                file_data["file_name_map"] = json.dumps(file_name_map)

            # Must run inside the ``with`` block: the vector store files live
            # in the temporary directory.
            upload_index(vector_store_path, file_data)
        except Exception as e:
            logging.error(f"Error in ingest_worker: {e}", exc_info=True)
            raise
    return {
        "directory": directory,
        "formats": formats,
        "name_job": job_name,  # Use original job_name
        "filename": filename,
        "user": user,  # Use original user
        "limited": False,
    }
required_exts=list(SUPPORTED_SOURCE_EXTENSIONS), exclude_hidden=True, file_metadata=metadata_from_filename, ) reader.load_data() directory_structure = reader.directory_structure logging.info( f"Directory structure built with token counts: {directory_structure}" ) try: old_directory_structure = source.get("directory_structure") or {} if isinstance(old_directory_structure, str): try: old_directory_structure = json.loads(old_directory_structure) except Exception: old_directory_structure = {} def _flatten_directory_structure(struct, prefix=""): files = set() if isinstance(struct, dict): for name, meta in struct.items(): current_path = ( os.path.join(prefix, name) if prefix else name ) if isinstance(meta, dict) and ( "type" in meta and "size_bytes" in meta ): files.add(current_path) elif isinstance(meta, dict): files |= _flatten_directory_structure( meta, current_path ) return files old_files = _flatten_directory_structure(old_directory_structure) new_files = _flatten_directory_structure(directory_structure) added_files = sorted(new_files - old_files) removed_files = sorted(old_files - new_files) if added_files: logging.info(f"Files added since last ingest: {added_files}") else: logging.info("No files added since last ingest.") if removed_files: logging.info(f"Files removed since last ingest: {removed_files}") else: logging.info("No files removed since last ingest.") except Exception as e: logging.error( f"Error comparing directory structures: {e}", exc_info=True ) added_files = [] removed_files = [] try: if not added_files and not removed_files: logging.info("No changes detected.") return { "source_id": source_id, "user": user, "status": "no_changes", "added_files": [], "removed_files": [], } vector_store = VectorCreator.create_vectorstore( settings.VECTOR_STORE, source_id, settings.EMBEDDINGS_KEY, ) self.update_state( state="PROGRESS", meta={"current": 40, "status": "Processing file changes"}, ) # 1) Delete chunks from removed files deleted = 0 if removed_files: try: 
for ch in vector_store.get_chunks() or []: metadata = ( ch.get("metadata", {}) if isinstance(ch, dict) else getattr(ch, "metadata", {}) ) raw_source = metadata.get("source") source_file = str(raw_source) if raw_source else "" if source_file in removed_files: cid = ch.get("doc_id") if cid: try: vector_store.delete_chunk(cid) deleted += 1 except Exception as de: logging.error( f"Failed deleting chunk {cid}: {de}" ) logging.info( f"Deleted {deleted} chunks from {len(removed_files)} removed files" ) except Exception as e: logging.error( f"Error during deletion of removed file chunks: {e}", exc_info=True, ) # 2) Add chunks from new files added = 0 if added_files: try: # Build list of local files for added files only added_local_files = [] for rel_path in added_files: local_path = os.path.join(temp_dir, rel_path) if os.path.isfile(local_path): added_local_files.append(local_path) if added_local_files: reader_new = SimpleDirectoryReader( input_files=added_local_files, exclude_hidden=True, errors="ignore", file_metadata=metadata_from_filename, ) raw_docs_new = reader_new.load_data() chunker_new = Chunker( chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, min_tokens=MIN_TOKENS, duplicate_headers=False, ) chunked_new = chunker_new.chunk(documents=raw_docs_new) for ( file_path, token_count, ) in reader_new.file_token_counts.items(): try: rel_path = os.path.relpath( file_path, start=temp_dir ) path_parts = rel_path.split(os.sep) current_dir = directory_structure for part in path_parts[:-1]: if part in current_dir and isinstance( current_dir[part], dict ): current_dir = current_dir[part] else: break filename = path_parts[-1] if filename in current_dir and isinstance( current_dir[filename], dict ): current_dir[filename][ "token_count" ] = token_count logging.info( f"Updated token count for {rel_path}: {token_count}" ) except Exception as e: logging.warning( f"Could not update token count for {file_path}: {e}" ) for d in chunked_new: meta = dict(d.extra_info or {}) try: 
def remote_worker(
    self,
    source_data,
    name_job,
    user,
    loader,
    directory="temp",
    retriever="classic",
    sync_frequency="never",
    operation_mode="upload",
    doc_id=None,
):
    """Load a remote source, chunk and embed it, and upload the index.

    Args:
        self: Task instance; used for progress updates and passed on to
            ``embed_and_store_documents``.
        source_data: Loader-specific description of the remote source.
        name_job: Name of the job; also part of the working directory path.
        user: User identifier; also part of the working directory path.
        loader: Loader type passed to ``RemoteCreator.create_loader``; stored
            as the source ``type``.
        directory: Root of the working directory.
        retriever: Retriever name stored with the source.
        sync_frequency: Sync frequency stored with the source.
        operation_mode: ``"upload"`` creates a new source id; ``"sync"``
            reuses ``doc_id``.
        doc_id: Existing source id, required and validated for ``"sync"``.

    Returns:
        dict: ``id``, ``urls`` (the original ``source_data``), ``name_job``,
        ``user`` and a ``limited`` flag.

    Raises:
        ValueError: For an unsupported ``operation_mode`` or a missing or
            invalid ``doc_id`` in sync mode.
        Exception: Any other pipeline error, re-raised after logging. The
            working directory is removed in every case.
    """
    full_path = os.path.join(directory, user, name_job)
    os.makedirs(full_path, exist_ok=True)

    self.update_state(state="PROGRESS", meta={"current": 1})

    # Extension -> MIME type table for the directory structure entries;
    # built once rather than once per document.
    mime_types = {
        ".txt": "text/plain",
        ".md": "text/markdown",
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".doc": "application/msword",
        ".html": "text/html",
        ".json": "application/json",
        ".csv": "text/csv",
        ".xml": "application/xml",
        ".py": "text/x-python",
        ".js": "text/javascript",
        ".ts": "text/typescript",
        ".jsx": "text/jsx",
        ".tsx": "text/tsx",
    }

    try:
        logging.info("Initializing remote loader with type: %s", loader)
        remote_loader = RemoteCreator.create_loader(loader)
        raw_docs = remote_loader.load_data(source_data)

        chunker = Chunker(
            chunking_strategy="classic_chunk",
            max_tokens=MAX_TOKENS,
            min_tokens=MIN_TOKENS,
            duplicate_headers=False,
        )
        # Embed the chunked documents. The chunker's result used to be
        # overwritten by a list built from ``raw_docs``, so the unchunked
        # documents were embedded instead.
        chunked_docs = chunker.chunk(documents=raw_docs)
        docs = [Document.to_langchain_format(chunk) for chunk in chunked_docs]
        tokens = count_tokens_docs(docs)
        logging.info("Total tokens calculated: %d", tokens)

        # Build directory structure from loaded documents
        # Format matches local file uploads: nested structure with type, size_bytes, token_count
        # The per-file entries are derived from the unchunked documents.
        directory_structure = {}
        for doc in raw_docs:
            # Get the file path from extra_info
            # For crawlers: file_path is a virtual path like "guides/setup.md"
            # For other remotes: use key or title as fallback
            file_path = ""
            if doc.extra_info:
                file_path = (
                    doc.extra_info.get("file_path", "")
                    or doc.extra_info.get("key", "")
                    or doc.extra_info.get("title", "")
                )
            if not file_path:
                file_path = doc.doc_id or ""

            if file_path:
                # Calculate token count
                token_count = num_tokens_from_string(doc.text) if doc.text else 0

                # Estimate size in bytes from text content
                size_bytes = len(doc.text.encode("utf-8")) if doc.text else 0

                # Guess mime type from extension
                file_name = (
                    file_path.split("/")[-1] if "/" in file_path else file_path
                )
                ext = os.path.splitext(file_name)[1].lower()
                file_type = mime_types.get(ext, "application/octet-stream")

                # Build nested directory structure from path
                # e.g., "guides/setup.md" -> {"guides": {"setup.md": {...}}}
                path_parts = file_path.split("/")
                current_level = directory_structure
                for i, part in enumerate(path_parts):
                    if i == len(path_parts) - 1:
                        # Last part is the file
                        current_level[part] = {
                            "type": file_type,
                            "size_bytes": size_bytes,
                            "token_count": token_count,
                        }
                    else:
                        # Intermediate parts are directories
                        if part not in current_level:
                            current_level[part] = {}
                        current_level = current_level[part]

        logging.info(
            f"Built directory structure with {len(directory_structure)} files: "
            f"{list(directory_structure.keys())}"
        )

        if operation_mode == "upload":
            source_oid = ObjectId()
        elif operation_mode == "sync":
            if not doc_id or not ObjectId.is_valid(doc_id):
                logging.error("Invalid doc_id provided for sync operation: %s", doc_id)
                raise ValueError("doc_id must be provided for sync operation.")
            source_oid = ObjectId(doc_id)
        else:
            # Without this branch the id stayed unassigned and the function
            # failed later with an UnboundLocalError.
            raise ValueError(f"Unsupported operation_mode: {operation_mode}")

        embed_and_store_documents(docs, full_path, source_oid, self)

        self.update_state(state="PROGRESS", meta={"current": 100})

        # Serialize remote_data as JSON if it's a dict (for S3, Reddit, etc.)
        remote_data_serialized = (
            json.dumps(source_data) if isinstance(source_data, dict) else source_data
        )

        file_data = {
            "name": name_job,
            "user": user,
            "tokens": tokens,
            "retriever": retriever,
            "id": str(source_oid),
            "type": loader,
            "remote_data": remote_data_serialized,
            "sync_frequency": sync_frequency,
            "directory_structure": json.dumps(directory_structure),
        }

        if operation_mode == "sync":
            file_data["last_sync"] = datetime.datetime.now()
        upload_index(full_path, file_data)
    except Exception as e:
        logging.error("Error in remote_worker task: %s", str(e), exc_info=True)
        raise
    finally:
        # Remove the working directory on success and on failure.
        if os.path.exists(full_path):
            shutil.rmtree(full_path)

    logging.info("remote_worker task completed successfully")
    return {
        "id": str(source_oid),
        "urls": source_data,
        "name_job": name_job,
        "user": user,
        "limited": False,
    }
logging.error(f"Error during sync: {e}", exc_info=True) return {"status": "error", "error": str(e)} return {"status": "success"} def sync_worker(self, frequency): sync_counts = Counter() sources = sources_collection.find() for doc in sources: if doc.get("sync_frequency") == frequency: name = doc.get("name") user = doc.get("user") source_type = doc.get("type") source_data = doc.get("remote_data") retriever = doc.get("retriever") doc_id = str(doc.get("_id")) resp = sync( self, source_data, name, user, source_type, frequency, retriever, doc_id ) sync_counts["total_sync_count"] += 1 sync_counts[ "sync_success" if resp["status"] == "success" else "sync_failure" ] += 1 return { key: sync_counts[key] for key in ["total_sync_count", "sync_success", "sync_failure"] } def attachment_worker(self, file_info, user): """ Process and store a single attachment without vectorization. """ mongo = MongoDB.get_client() db = mongo[settings.MONGO_DB_NAME] attachments_collection = db["attachments"] filename = file_info["filename"] attachment_id = file_info["attachment_id"] relative_path = file_info["path"] metadata = file_info.get("metadata", {}) try: self.update_state(state="PROGRESS", meta={"current": 10}) storage = StorageCreator.get_storage() self.update_state( state="PROGRESS", meta={"current": 30, "status": "Processing content"} ) file_extractor = get_default_file_extractor( ocr_enabled=settings.DOCLING_OCR_ATTACHMENTS_ENABLED ) attachment_document = storage.process_file( relative_path, lambda local_path, **kwargs: SimpleDirectoryReader( input_files=[local_path], exclude_hidden=True, errors="ignore", file_extractor=file_extractor, file_metadata=metadata_from_filename, ) .load_data()[0], ) content = attachment_document.text parser_metadata = { key: value for key, value in (attachment_document.extra_info or {}).items() if key.startswith("transcript_") } if parser_metadata: metadata = {**metadata, **parser_metadata} token_count = num_tokens_from_string(content) if token_count > 
100000: content = content[:250000] token_count = num_tokens_from_string(content) self.update_state( state="PROGRESS", meta={"current": 80, "status": "Storing in database"} ) mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream" doc_id = ObjectId(attachment_id) attachments_collection.insert_one( { "_id": doc_id, "user": user, "path": relative_path, "filename": filename, "content": content, "token_count": token_count, "mime_type": mime_type, "date": datetime.datetime.now(), "metadata": metadata, } ) logging.info( f"Stored attachment with ID: {attachment_id}", extra={"user": user} ) self.update_state(state="PROGRESS", meta={"current": 100, "status": "Complete"}) return { "filename": filename, "path": relative_path, "token_count": token_count, "attachment_id": attachment_id, "mime_type": mime_type, "metadata": metadata, } except Exception as e: logging.error( f"Error processing file {filename}: {e}", extra={"user": user}, exc_info=True, ) raise def agent_webhook_worker(self, agent_id, payload): """ Process the webhook payload for an agent. Args: self: Reference to the instance of the task. agent_id (str): Unique identifier for the agent. payload (dict): The payload data from the webhook. Returns: dict: Information about the processed webhook. 
""" mongo = MongoDB.get_client() db = mongo["docsgpt"] agents_collection = db["agents"] self.update_state(state="PROGRESS", meta={"current": 1}) try: agent_oid = ObjectId(agent_id) agent_config = agents_collection.find_one({"_id": agent_oid}) if not agent_config: raise ValueError(f"Agent with ID {agent_id} not found.") input_data = json.dumps(payload) except Exception as e: logging.error(f"Error processing agent webhook: {e}", exc_info=True) return {"status": "error", "error": str(e)} self.update_state(state="PROGRESS", meta={"current": 50}) try: result = run_agent_logic(agent_config, input_data) except Exception as e: logging.error(f"Error running agent logic: {e}", exc_info=True) return {"status": "error"} else: logging.info( f"Webhook processed for agent {agent_id}", extra={"agent_id": agent_id} ) return {"status": "success", "result": result} finally: self.update_state(state="PROGRESS", meta={"current": 100}) def ingest_connector( self, job_name: str, user: str, source_type: str, session_token=None, file_ids=None, folder_ids=None, recursive=True, retriever: str = "classic", operation_mode: str = "upload", doc_id=None, sync_frequency: str = "never", ) -> Dict[str, Any]: """ Ingestion for internal knowledge bases (GoogleDrive, etc.). Args: job_name: Name of the ingestion job user: User identifier source_type: Type of remote source ("google_drive", "dropbox", etc.) 
session_token: Authentication token for the service file_ids: List of file IDs to download folder_ids: List of folder IDs to download recursive: Whether to recursively download folders retriever: Type of retriever to use operation_mode: "upload" for initial ingestion, "sync" for incremental sync doc_id: Document ID for sync operations (required when operation_mode="sync") sync_frequency: How often to sync ("never", "daily", "weekly", "monthly") """ logging.info( f"Starting remote ingestion from {source_type} for user: {user}, job: {job_name}" ) self.update_state(state="PROGRESS", meta={"current": 1}) with tempfile.TemporaryDirectory() as temp_dir: try: # Step 1: Initialize the appropriate loader self.update_state( state="PROGRESS", meta={"current": 10, "status": "Initializing connector"}, ) if not session_token: raise ValueError(f"{source_type} connector requires session_token") if not ConnectorCreator.is_supported(source_type): raise ValueError( f"Unsupported connector type: {source_type}. 
Supported types: {ConnectorCreator.get_supported_connectors()}" ) remote_loader = ConnectorCreator.create_connector( source_type, session_token ) # Create a clean config for storage api_source_config = { "file_ids": file_ids or [], "folder_ids": folder_ids or [], "recursive": recursive, } # Step 2: Download files to temp directory self.update_state( state="PROGRESS", meta={"current": 20, "status": "Downloading files"} ) download_info = remote_loader.download_to_directory( temp_dir, api_source_config ) if download_info.get("empty_result", False) or not download_info.get( "files_downloaded", 0 ): logging.warning(f"No files were downloaded from {source_type}") # Create empty result directly instead of calling a separate method return { "name": job_name, "user": user, "tokens": 0, "type": source_type, "source_config": api_source_config, "directory_structure": "{}", } # Step 3: Use SimpleDirectoryReader to process downloaded files self.update_state( state="PROGRESS", meta={"current": 40, "status": "Processing files"} ) reader = SimpleDirectoryReader( input_dir=temp_dir, recursive=True, required_exts=list(SUPPORTED_SOURCE_EXTENSIONS), exclude_hidden=True, file_metadata=metadata_from_filename, ) raw_docs = reader.load_data() directory_structure = getattr(reader, "directory_structure", {}) # Step 4: Process documents (chunking, embedding, etc.) 
self.update_state( state="PROGRESS", meta={"current": 60, "status": "Processing documents"} ) chunker = Chunker( chunking_strategy="classic_chunk", max_tokens=MAX_TOKENS, min_tokens=MIN_TOKENS, duplicate_headers=False, ) raw_docs = chunker.chunk(documents=raw_docs) # Preserve source information in document metadata for doc in raw_docs: if hasattr(doc, "extra_info") and doc.extra_info: source = doc.extra_info.get("source") if source and os.path.isabs(source): # Convert absolute path to relative path doc.extra_info["source"] = os.path.relpath( source, start=temp_dir ) docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] if operation_mode == "upload": id = ObjectId() elif operation_mode == "sync": if not doc_id or not ObjectId.is_valid(doc_id): logging.error( "Invalid doc_id provided for sync operation: %s", doc_id ) raise ValueError("doc_id must be provided for sync operation.") id = ObjectId(doc_id) else: raise ValueError(f"Invalid operation_mode: {operation_mode}") vector_store_path = os.path.join(temp_dir, "vector_store") os.makedirs(vector_store_path, exist_ok=True) self.update_state( state="PROGRESS", meta={"current": 80, "status": "Storing documents"} ) embed_and_store_documents(docs, vector_store_path, id, self) tokens = count_tokens_docs(docs) # Step 6: Upload index files file_data = { "user": user, "name": job_name, "tokens": tokens, "retriever": retriever, "id": str(id), "type": "connector:file", "remote_data": json.dumps( {"provider": source_type, **api_source_config} ), "directory_structure": json.dumps(directory_structure), "sync_frequency": sync_frequency, } if operation_mode == "sync": file_data["last_sync"] = datetime.datetime.now() else: file_data["last_sync"] = datetime.datetime.now() upload_index(vector_store_path, file_data) # Ensure we mark the task as complete self.update_state( state="PROGRESS", meta={"current": 100, "status": "Complete"} ) logging.info(f"Remote ingestion completed: {job_name}") return { "user": user, "name": 
job_name, "tokens": tokens, "type": source_type, "id": str(id), "status": "complete", } except Exception as e: logging.error(f"Error during remote ingestion: {e}", exc_info=True) raise def mcp_oauth(self, config: Dict[str, Any], user_id: str = None) -> Dict[str, Any]: """Worker to handle MCP OAuth flow asynchronously.""" try: import asyncio from application.agents.tools.mcp_tool import MCPTool task_id = self.request.id redis_client = get_redis_instance() def update_status(status_data: Dict[str, Any]): status_key = f"mcp_oauth_status:{task_id}" redis_client.setex(status_key, 600, json.dumps(status_data)) update_status( { "status": "in_progress", "message": "Starting OAuth...", "task_id": task_id, } ) tool_config = config.copy() tool_config["oauth_task_id"] = task_id mcp_tool = MCPTool(tool_config, user_id) async def run_oauth_discovery(): if not mcp_tool._client: mcp_tool._setup_client() return await mcp_tool._execute_with_client("list_tools") update_status( { "status": "awaiting_redirect", "message": "Awaiting OAuth redirect...", "task_id": task_id, } ) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: loop.run_until_complete(run_oauth_discovery()) tools = mcp_tool.get_actions_metadata() update_status( { "status": "completed", "message": f"Connected \u2014 found {len(tools)} tool{'s' if len(tools) != 1 else ''}.", "tools": tools, "tools_count": len(tools), "task_id": task_id, } ) return {"success": True, "tools": tools, "tools_count": len(tools)} except Exception as e: error_msg = f"OAuth failed: {str(e)}" logging.error("MCP OAuth discovery failed: %s", error_msg, exc_info=True) update_status( { "status": "error", "message": error_msg, "task_id": task_id, } ) return {"success": False, "error": error_msg} finally: loop.close() except Exception as e: error_msg = f"OAuth init failed: {str(e)}" logging.error("MCP OAuth init failed: %s", error_msg, exc_info=True) update_status( { "status": "error", "message": error_msg, "task_id": task_id, } ) return 
{"success": False, "error": error_msg} def mcp_oauth_status(self, task_id: str) -> Dict[str, Any]: """Check the status of an MCP OAuth flow.""" redis_client = get_redis_instance() status_key = f"mcp_oauth_status:{task_id}" status_data = redis_client.get(status_key) if status_data: return json.loads(status_data) return {"status": "not_found", "message": "Status not found"} ================================================ FILE: application/wsgi.py ================================================ from application.app import app from application.core.settings import settings if __name__ == "__main__": app.run(debug=settings.FLASK_DEBUG_MODE, port=7091) ================================================ FILE: codecov.yml ================================================ ignore: - "*/tests/*" ================================================ FILE: deployment/docker-compose-azure.yaml ================================================ services: frontend: build: ../frontend environment: - VITE_API_HOST=http://localhost:7091 - VITE_API_STREAMING=$VITE_API_STREAMING ports: - "5173:5173" depends_on: - backend backend: build: ../application env_file: - ../.env environment: # Override URLs to use docker service names - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt ports: - "7091:7091" volumes: - ../application/indexes:/app/application/indexes - ../application/inputs:/app/application/inputs - ../application/vectors:/app/application/vectors depends_on: - redis - mongo worker: build: ../application command: celery -A application.app.celery worker -l INFO env_file: - ../.env environment: # Override URLs to use docker service names - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - API_URL=http://backend:7091 depends_on: - redis - mongo redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 
volumes: - mongodb_data_container:/data/db volumes: mongodb_data_container: ================================================ FILE: deployment/docker-compose-dev.yaml ================================================ name: docsgpt-oss services: redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 volumes: - mongodb_data_container:/data/db volumes: mongodb_data_container: ================================================ FILE: deployment/docker-compose-hub.yaml ================================================ name: docsgpt-oss services: frontend: image: arc53/docsgpt-fe:develop environment: - VITE_API_HOST=http://localhost:7091 - VITE_API_STREAMING=${VITE_API_STREAMING:-true} - VITE_GOOGLE_CLIENT_ID=${VITE_GOOGLE_CLIENT_ID:-} ports: - "5173:5173" depends_on: - backend backend: user: root image: arc53/docsgpt:develop env_file: - ../.env environment: - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - CACHE_REDIS_URL=redis://redis:6379/2 ports: - "7091:7091" volumes: - ../application/indexes:/app/indexes - ../application/inputs:/app/inputs - ../application/vectors:/app/vectors depends_on: - redis - mongo worker: user: root image: arc53/docsgpt:develop command: celery -A application.app.celery worker -l INFO -B env_file: - ../.env environment: - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - API_URL=http://backend:7091 - CACHE_REDIS_URL=redis://redis:6379/2 volumes: - ../application/indexes:/app/indexes - ../application/inputs:/app/inputs - ../application/vectors:/app/vectors depends_on: - redis - mongo redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 volumes: - mongodb_data_container:/data/db volumes: mongodb_data_container: ================================================ FILE: deployment/docker-compose-local.yaml 
================================================ services: frontend: build: ../frontend volumes: - ../frontend/src:/app/src environment: - VITE_API_HOST=http://localhost:7091 - VITE_API_STREAMING=$VITE_API_STREAMING - VITE_EMBEDDINGS_NAME=$EMBEDDINGS_NAME ports: - "5173:5173" redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 volumes: - mongodb_data_container:/data/db volumes: mongodb_data_container: ================================================ FILE: deployment/docker-compose.yaml ================================================ name: docsgpt-oss services: frontend: build: ../frontend volumes: - ../frontend/src:/app/src environment: - VITE_API_HOST=http://localhost:7091 - VITE_API_STREAMING=$VITE_API_STREAMING - VITE_GOOGLE_CLIENT_ID=$VITE_GOOGLE_CLIENT_ID ports: - "5173:5173" depends_on: - backend backend: user: root build: ../application env_file: - ../.env environment: # Override URLs to use docker service names - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - CACHE_REDIS_URL=redis://redis:6379/2 ports: - "7091:7091" volumes: - ../application/indexes:/app/indexes - ../application/inputs:/app/inputs - ../application/vectors:/app/vectors depends_on: - redis - mongo worker: user: root build: ../application command: celery -A application.app.celery worker -l INFO -B env_file: - ../.env environment: # Override URLs to use docker service names - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/1 - MONGO_URI=mongodb://mongo:27017/docsgpt - API_URL=http://backend:7091 - CACHE_REDIS_URL=redis://redis:6379/2 volumes: - ../application/indexes:/app/indexes - ../application/inputs:/app/inputs - ../application/vectors:/app/vectors depends_on: - redis - mongo redis: image: redis:6-alpine ports: - 6379:6379 mongo: image: mongo:6 ports: - 27017:27017 volumes: - mongodb_data_container:/data/db volumes: 
mongodb_data_container: ================================================ FILE: deployment/k8s/deployments/docsgpt-deploy.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: name: docsgpt-api spec: replicas: 1 selector: matchLabels: app: docsgpt-api template: metadata: labels: app: docsgpt-api spec: containers: - name: docsgpt-api image: arc53/docsgpt ports: - containerPort: 7091 resources: limits: memory: "4Gi" cpu: "2" requests: memory: "2Gi" cpu: "1" envFrom: - secretRef: name: docsgpt-secrets env: - name: FLASK_APP value: "application/app.py" - name: DEPLOYMENT_TYPE value: "cloud" --- apiVersion: apps/v1 kind: Deployment metadata: name: docsgpt-worker spec: replicas: 1 selector: matchLabels: app: docsgpt-worker template: metadata: labels: app: docsgpt-worker spec: containers: - name: docsgpt-worker image: arc53/docsgpt command: ["celery", "-A", "application.app.celery", "worker", "-l", "INFO", "-n", "worker.%h"] resources: limits: memory: "4Gi" cpu: "2" requests: memory: "2Gi" cpu: "1" envFrom: - secretRef: name: docsgpt-secrets env: - name: API_URL value: "http://" --- apiVersion: apps/v1 kind: Deployment metadata: name: docsgpt-frontend spec: replicas: 1 selector: matchLabels: app: docsgpt-frontend template: metadata: labels: app: docsgpt-frontend spec: containers: - name: docsgpt-frontend image: arc53/docsgpt-fe ports: - containerPort: 5173 resources: limits: memory: "1Gi" cpu: "1" requests: memory: "256Mi" cpu: "100m" env: - name: VITE_API_HOST value: "http://" - name: VITE_API_STREAMING value: "true" ================================================ FILE: deployment/k8s/deployments/mongo-deploy.yaml ================================================ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: mongodb-pvc spec: accessModes: - ReadWriteOnce resources: requests: storage: 10Gi # Adjust size as needed --- apiVersion: apps/v1 kind: Deployment metadata: name: mongodb spec: replicas: 1 selector: 
matchLabels: app: mongodb template: metadata: labels: app: mongodb spec: containers: - name: mongodb image: mongo:latest ports: - containerPort: 27017 resources: limits: memory: "1Gi" cpu: "0.5" requests: memory: "512Mi" cpu: "250m" volumeMounts: - name: mongodb-data mountPath: /data/db volumes: - name: mongodb-data persistentVolumeClaim: claimName: mongodb-pvc ================================================ FILE: deployment/k8s/deployments/qdrant-deploy.yaml ================================================ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: qdrant-pvc spec: accessModes: - ReadWriteOnce resources: requests: storage: 10Gi --- apiVersion: apps/v1 kind: Deployment metadata: name: qdrant spec: replicas: 1 selector: matchLabels: app: qdrant template: metadata: labels: app: qdrant spec: containers: - name: qdrant image: qdrant/qdrant:latest ports: - containerPort: 6333 resources: limits: memory: "2Gi" # Adjust based on your needs cpu: "1" # Adjust based on your needs requests: memory: "1Gi" # Adjust based on your needs cpu: "500m" # Adjust based on your needs volumeMounts: - name: qdrant-data mountPath: /qdrant/storage volumes: - name: qdrant-data persistentVolumeClaim: claimName: qdrant-pvc ================================================ FILE: deployment/k8s/deployments/redis-deploy.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: name: redis spec: replicas: 1 selector: matchLabels: app: redis template: metadata: labels: app: redis spec: containers: - name: redis image: redis:latest ports: - containerPort: 6379 resources: limits: memory: "1Gi" cpu: "0.5" requests: memory: "512Mi" cpu: "250m" ================================================ FILE: deployment/k8s/docsgpt-secrets.yaml ================================================ apiVersion: v1 kind: Secret metadata: name: docsgpt-secrets type: Opaque data: LLM_PROVIDER: ZG9jc2dwdA== INTERNAL_KEY: aW50ZXJuYWw= CELERY_BROKER_URL: 
cmVkaXM6Ly9yZWRpcy1zZXJ2aWNlOjYzNzkvMA== CELERY_RESULT_BACKEND: cmVkaXM6Ly9yZWRpcy1zZXJ2aWNlOjYzNzkvMA== QDRANT_URL: cmVkaXM6Ly9yZWRpcy1zZXJ2aWNlOjYzNzkvMA== QDRANT_PORT: NjM3OQ== MONGO_URI: bW9uZ29kYjovL21vbmdvZGItc2VydmljZToyNzAxNy9kb2NzZ3B0P3JldHJ5V3JpdGVzPXRydWUmdz1tYWpvcml0eQ== mongo-user: bW9uZ28tdXNlcg== mongo-password: bW9uZ28tcGFzc3dvcmQ= ================================================ FILE: deployment/k8s/services/docsgpt-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: docsgpt-api-service spec: selector: app: docsgpt-api ports: - protocol: TCP port: 80 targetPort: 7091 type: LoadBalancer --- apiVersion: v1 kind: Service metadata: name: docsgpt-frontend-service spec: selector: app: docsgpt-frontend ports: - protocol: TCP port: 80 targetPort: 5173 type: LoadBalancer ================================================ FILE: deployment/k8s/services/mongo-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: mongodb-service spec: selector: app: mongodb ports: - protocol: TCP port: 27017 targetPort: 27017 type: ClusterIP ================================================ FILE: deployment/k8s/services/qdrant-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: qdrant spec: selector: app: qdrant ports: - protocol: TCP port: 6333 targetPort: 6333 type: ClusterIP ================================================ FILE: deployment/k8s/services/redis-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: redis-service spec: selector: app: redis ports: - protocol: TCP port: 6379 targetPort: 6379 type: ClusterIP ================================================ FILE: deployment/optional/docker-compose.optional.ollama-cpu.yaml ================================================ version: "3.8" services: ollama: image: ollama/ollama ports: - "11434:11434" volumes: - 
ollama_data:/root/.ollama volumes: ollama_data: ================================================ FILE: deployment/optional/docker-compose.optional.ollama-gpu.yaml ================================================ version: "3.8" services: ollama: image: ollama/ollama ports: - "11434:11434" volumes: - ollama_data:/root/.ollama deploy: resources: reservations: devices: - capabilities: [gpu] volumes: ollama_data: ================================================ FILE: docs/README.md ================================================ # nextra-docsgpt ## Setting Up Docs Folder of DocsGPT Locally ### 1. Clone the DocsGPT repository: ```bash git clone https://github.com/arc53/DocsGPT.git ``` ### 2. Navigate to the docs folder: ```bash cd DocsGPT/docs ``` The docs folder contains the markdown files that make up the documentation. The majority of the files are in the pages directory. Some notable files in this folder include: `index.mdx`: The main documentation file. `_app.js`: This file is used to customize the default Next.js application shell. `theme.config.jsx`: This file is for configuring the Nextra theme for the documentation. ### 3. Verify that you have Node.js and npm installed in your system. You can check by running: ```bash node --version npm --version ``` ### 4. If not installed, download Node.js and npm from the respective official websites. ### 5. Once you have Node.js and npm running, proceed to install yarn - another package manager that helps to manage project dependencies: ```bash npm install --global yarn ``` ### 6. Install the project dependencies using yarn: ```bash yarn install ``` ### 7. After the successful installation of the project dependencies, start the local server: ```bash yarn dev ``` - Now, you should be able to view the docs on your local environment by visiting `http://localhost:3000`. You can explore the different markdown files and make changes as you see fit. - **Footnotes:** This guide assumes you have Node.js and npm installed. 
The guide involves running a local server using yarn, and viewing the documentation offline. If you encounter any issues, it may be worth verifying your Node.js and npm installations and whether you have installed yarn correctly. ================================================ FILE: docs/app/[[...mdxPath]]/page.jsx ================================================ import { generateStaticParamsFor, importPage } from 'nextra/pages'; import { useMDXComponents } from '../../mdx-components'; export const generateStaticParams = generateStaticParamsFor('mdxPath'); export async function generateMetadata(props) { const params = await props.params; const { metadata } = await importPage(params?.mdxPath); return metadata; } const Wrapper = useMDXComponents().wrapper; export default async function Page(props) { const params = await props.params; const result = await importPage(params?.mdxPath); const { default: MDXContent, metadata, sourceCode, toc } = result; return ( ); } ================================================ FILE: docs/app/layout.jsx ================================================ import Image from 'next/image'; import { Analytics } from '@vercel/analytics/react'; import { Banner, Head } from 'nextra/components'; import { getPageMap } from 'nextra/page-map'; import { Footer, Layout, Navbar } from 'nextra-theme-docs'; import 'nextra-theme-docs/style.css'; import CuteLogo from '../public/cute-docsgpt.png'; import themeConfig from '../theme.config'; const github = 'https://github.com/arc53/DocsGPT'; export const metadata = { title: { default: 'DocsGPT Documentation', template: '%s - DocsGPT Documentation', }, description: 'Use DocsGPT to chat with your data. DocsGPT is a GPT-powered chatbot that can answer questions about your data.', }; const navbar = ( DocsGPT logo DocsGPT Docs } projectLink={github} chatLink="https://discord.com/invite/n5BX8dh8rU" /> ); const footer = ( ); export default async function RootLayout({ children }) { return (
Welcome to the new DocsGPT docs!
} navbar={navbar} footer={footer} pageMap={await getPageMap()} {...themeConfig} > {children}
); } ================================================ FILE: docs/components/DeploymentCards.jsx ================================================ 'use client'; import Image from 'next/image'; const iconMap = { 'Amazon Lightsail': '/lightsail.png', 'Railway': '/railway.png', 'Civo Compute Cloud': '/civo.png', 'DigitalOcean Droplet': '/digitalocean.png', 'Kamatera Cloud': '/kamatera.png', }; export function DeploymentCards({ items }) { return ( <>
{items.map(({ title, link, description }) => { const isExternal = link.startsWith('https://'); const iconSrc = iconMap[title] || '/default-icon.png'; // Default icon if not found return ( ); })}
); } ================================================ FILE: docs/components/ToolCards.jsx ================================================ 'use client'; import Image from 'next/image'; const iconMap = { 'API Tool': '/toolIcons/tool_api_tool.svg', 'Brave Search Tool': '/toolIcons/tool_brave.svg', 'Cryptoprice Tool': '/toolIcons/tool_cryptoprice.svg', 'Ntfy Tool': '/toolIcons/tool_ntfy.svg', 'PostgreSQL Tool': '/toolIcons/tool_postgres.svg', 'Read Webpage Tool': '/toolIcons/tool_read_webpage.svg', 'Telegram Tool': '/toolIcons/tool_telegram.svg' }; export function ToolCards({ items }) { return ( <>
{items.map(({ title, link, description }) => { const isExternal = link.startsWith('https://'); const iconSrc = iconMap[title] || '/default-icon.png'; // Default icon if not found return ( ); })}
); } ================================================ FILE: docs/content/Agents/_meta.js ================================================ export default { "basics": { "title": "🤖 Agent Basics", "href": "/Agents/basics" }, "api": { "title": "🔌 Agent API", "href": "/Agents/api" }, "webhooks": { "title": "🪝 Agent Webhooks", "href": "/Agents/webhooks" }, "nodes": { "title": "🧩 Workflow Nodes", "href": "/Agents/nodes" } } ================================================ FILE: docs/content/Agents/api.mdx ================================================ --- title: Interacting with Agents via API description: Learn how to programmatically interact with DocsGPT Agents using the streaming and non-streaming API endpoints. --- import { Callout, Tabs } from 'nextra/components'; # Interacting with Agents via API DocsGPT Agents can be accessed programmatically through API endpoints. This page covers: - Non-streaming answers (`/api/answer`) - Streaming answers over SSE (`/stream`) - File/image attachments (`/api/store_attachment` + `/api/task_status` + `/stream`) When you use an agent `api_key`, DocsGPT loads that agent's configuration automatically (prompt, tools, sources, default model). You usually only need to send `question` and `api_key`. ## Base URL For DocsGPT Cloud, use `https://gptcloud.arc53.com` as the base URL. - Local: `http://localhost:7091` - Cloud: `https://gptcloud.arc53.com` ## How Request Resolution Works DocsGPT resolves your request in this order: 1. If `api_key` is provided, DocsGPT loads the mapped agent and executes with that config. 2. If `agent_id` is provided (typically with JWT auth), DocsGPT loads that agent if allowed. 3. If neither is provided, DocsGPT uses request-level fields (`prompt_id`, `active_docs`, `retriever`, etc.). Authentication: - Agent API-key flow: include `api_key` in JSON/form payload. - JWT flow (if auth enabled): include `Authorization: Bearer <token>`.
## Endpoints - `POST /api/answer` (non-streaming) - `POST /stream` (SSE streaming) - `POST /api/store_attachment` (multipart upload) - `GET /api/task_status?task_id=...` (Celery task polling) ## Request Parameters Common request body fields: | Field | Type | Required | Applies to | Notes | | --- | --- | --- | --- | --- | | `question` | `string` | Yes | `/api/answer`, `/stream` | User query. | | `api_key` | `string` | Usually | `/api/answer`, `/stream` | Recommended for agent API use. Loads agent config from key. | | `conversation_id` | `string` | No | `/api/answer`, `/stream` | Continue an existing conversation. | | `history` | `string` (JSON-encoded array) | No | `/api/answer`, `/stream` | Used for new conversations. Format: `[{\"prompt\":\"...\",\"response\":\"...\"}]`. | | `model_id` | `string` | No | `/api/answer`, `/stream` | Override model for this request. | | `save_conversation` | `boolean` | No | `/api/answer`, `/stream` | Default `true`. If `false`, no conversation is persisted. | | `passthrough` | `object` | No | `/api/answer`, `/stream` | Dynamic values injected into prompt templates. | | `prompt_id` | `string` | No | `/api/answer`, `/stream` | Ignored when `api_key` already defines prompt. | | `active_docs` | `string` or `string[]` | No | `/api/answer`, `/stream` | Overrides active docs when not using key-owned source config. | | `retriever` | `string` | No | `/api/answer`, `/stream` | Retriever type (for example `classic`). | | `chunks` | `number` | No | `/api/answer`, `/stream` | Retrieval chunk count, default `2`. | | `isNoneDoc` | `boolean` | No | `/api/answer`, `/stream` | Skip document retrieval. | | `agent_id` | `string` | No | `/api/answer`, `/stream` | Alternative to `api_key` when using authenticated user context. | Streaming-only fields: | Field | Type | Required | Notes | | --- | --- | --- | --- | | `attachments` | `string[]` | No | List of attachment IDs from `/api/task_status` success result. 
| | `index` | `number` | No | Update an existing query index. If provided, `conversation_id` is required. | ## Non-Streaming API (`/api/answer`) `/api/answer` waits for completion and returns one JSON response. `attachments` are currently handled through `/stream`. For file/image-attached queries, use the streaming endpoint. Response fields: - `conversation_id` - `answer` - `sources` - `tool_calls` - `thought` - Optional structured output metadata (`structured`, `schema`) when enabled ### Examples ```bash curl -X POST http://localhost:7091/api/answer \ -H "Content-Type: application/json" \ -d '{"question":"your question here","api_key":"your_agent_api_key"}' ``` ```python import requests API_URL = "http://localhost:7091/api/answer" API_KEY = "your_agent_api_key" QUESTION = "your question here" response = requests.post( API_URL, json={"question": QUESTION, "api_key": API_KEY} ) if response.status_code == 200: print(response.json()) else: print(f"Error: {response.status_code}") print(response.text) ``` ```javascript const apiUrl = 'http://localhost:7091/api/answer'; const apiKey = 'your_agent_api_key'; const question = 'your question here'; async function getAnswer() { try { const response = await fetch(apiUrl, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify({ question, api_key: apiKey }), }); if (!response.ok) { throw new Error(`HTTP error! Status: ${response.status}`); } const data = await response.json(); console.log(data); } catch (error) { console.error("Failed to fetch answer:", error); } } getAnswer(); ``` --- ## Streaming API (`/stream`) `/stream` returns a Server-Sent Events (SSE) stream so you can render output token-by-token. 
### SSE Event Types Each `data:` frame is JSON with `type`: - `answer`: incremental answer chunk - `source`: source list/chunks - `tool_calls`: tool invocation results/metadata - `thought`: reasoning/thought chunk (agent dependent) - `structured_answer`: final structured payload (when schema mode is active) - `id`: final conversation ID - `error`: error message - `end`: stream is complete ### Examples ```bash curl -X POST http://localhost:7091/stream \ -H "Content-Type: application/json" \ -H "Accept: text/event-stream" \ -d '{"question":"your question here","api_key":"your_agent_api_key"}' ``` ```python import requests import json API_URL = "http://localhost:7091/stream" payload = { "question": "your question here", "api_key": "your_agent_api_key" } with requests.post(API_URL, json=payload, stream=True) as r: for line in r.iter_lines(): if line: decoded_line = line.decode('utf-8') if decoded_line.startswith('data: '): try: data = json.loads(decoded_line[6:]) print(data) except json.JSONDecodeError: pass ``` ```javascript const apiUrl = 'http://localhost:7091/stream'; const apiKey = 'your_agent_api_key'; const question = 'your question here'; async function getStream() { try { const response = await fetch(apiUrl, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Accept': 'text/event-stream' }, body: JSON.stringify({ question, api_key: apiKey }), }); if (!response.ok) { throw new Error(`HTTP error! Status: ${response.status}`); } const reader = response.body.getReader(); const decoder = new TextDecoder(); while (true) { const { done, value } = await reader.read(); if (done) break; const chunk = decoder.decode(value, { stream: true }); // Note: This parsing method assumes each chunk contains whole lines. // For a more robust production implementation, buffer the chunks // and process them line by line. 
const lines = chunk.split('\n'); for (const line of lines) { if (line.startsWith('data: ')) { try { const data = JSON.parse(line.substring(6)); console.log(data); } catch (e) { console.error("Failed to parse JSON from SSE event:", e); } } } } } catch (error) { console.error("Failed to fetch stream:", error); } } getStream(); ``` --- ## Attachments API (Including Images) To attach an image (or other file) to a query: 1. Upload file(s) to `/api/store_attachment` (multipart/form-data). 2. Poll `/api/task_status` until `status=SUCCESS`. 3. Read `result.attachment_id` from task result. 4. Send that ID in `/stream` as `attachments: ["..."]`. Attachments are processed asynchronously. Do not call `/stream` with an attachment until its task has finished with `SUCCESS`. ### Step 1: Upload Attachment `POST /api/store_attachment` - Content type: `multipart/form-data` - Form fields: - `file` (required, can be repeated for multi-file upload) - `api_key` (optional if JWT is present; useful for API-key-only flows) Example upload (single image): ```bash curl -X POST http://localhost:7091/api/store_attachment \ -F "file=@/absolute/path/to/image.png" \ -F "api_key=your_agent_api_key" ``` Possible response (single-file upload): ```json { "success": true, "task_id": "34f1cb56-7c7f-4d5f-a973-4ea7e65f7a10", "message": "File uploaded successfully. Processing started." } ``` ### Step 2: Poll Task Status ```bash curl "http://localhost:7091/api/task_status?task_id=34f1cb56-7c7f-4d5f-a973-4ea7e65f7a10" ``` When complete: ```json { "status": "SUCCESS", "result": { "attachment_id": "67b4f8f2618dc9f19384a9e1", "filename": "image.png", "mime_type": "image/png" } } ``` ### Step 3: Attach to `/stream` Request Use the `attachment_id` in `attachments`. 
```bash curl -X POST http://localhost:7091/stream \ -H "Content-Type: application/json" \ -H "Accept: text/event-stream" \ -d '{ "question": "Describe this image", "api_key": "your_agent_api_key", "attachments": ["67b4f8f2618dc9f19384a9e1"] }' ``` ### Image/Attachment Behavior Notes - Typical image MIME types supported for native vision flows: `image/png`, `image/jpeg`, `image/jpg`, `image/webp`, `image/gif`. - If the selected model/provider does not support a file type natively, DocsGPT falls back to parsed text content. - For providers that support images but not native PDF file attachments, DocsGPT can convert PDF pages to images (synthetic PDF support). - Attachments are user-scoped. Upload and query must be done under the same user context (same API key owner or same JWT user). ================================================ FILE: docs/content/Agents/basics.mdx ================================================ --- title: Understanding DocsGPT Agents description: Learn about DocsGPT Agents, their types, how to create and manage them, and how they can enhance your interaction with documents and tools. --- import { Callout } from 'nextra/components'; import Image from 'next/image'; // Assuming you might want to embed images later, like the ones you uploaded. # Understanding DocsGPT Agents 🤖 DocsGPT Agents are advanced, configurable AI entities designed to go beyond simple question-answering. They act as specialized assistants or workers that combine instructions (prompts), knowledge (document sources), and capabilities (tools) to perform a wide range of tasks, automate workflows, and provide tailored interactions. Think of an Agent as a pre-configured version of DocsGPT, fine-tuned for a specific purpose, such as classifying documents, responding to new form submissions, or validating emails. ## Why Use Agents? * **Personalization:** Create AI assistants that behave and respond according to specific roles or personas. 
* **Task Specialization:** Design agents focused on particular tasks, like customer support, data extraction, or content generation. * **Knowledge Integration:** Equip agents with specific document sources, making them experts in particular domains. * **Tool Utilization:** Grant agents access to various tools, allowing them to interact with external services, fetch live data, or perform actions. * **Automation:** Automate repetitive tasks by defining an agent's behavior and integrating it via webhooks or other means. * **Shareability:** Share your custom-configured agents with others or use agents shared with you. Agents provide a more structured and powerful way to leverage LLMs compared to a standard chat interface, as they come with a pre-defined context, instruction set, and set of capabilities. ## Core Components of an Agent When you create or configure an agent, you'll work with these key components: **Meta:** * **Agent Name:** A user-friendly name to identify the agent (e.g., "Support Ticket Classifier," "Product Spec Expert"). * **Describe your agent:** A brief description for you or users to understand the agent's purpose. **Source:** * **Select source:** The knowledge base for the agent. You can select from previously uploaded documents or data sources. This is what the agent will "know." * **Chunks per query:** A numerical value determining how many relevant text chunks from the selected source are sent to the LLM with each query. This helps manage context length and relevance. **Prompt:** The main set of instructions or system [prompt](/Guides/Customising-prompts) that defines the agent's persona, objectives, constraints, and how it should behave or respond. **Tools:** A selection of available [DocsGPT Tools](/Tools/basics) that the agent can use to perform actions or access external information. **Agent type:** The underlying operational logic or architecture the agent uses. 
DocsGPT supports different types of agents, each suited for different kinds of tasks. ## Understanding Agent Types DocsGPT allows for different "types" of agents, each with a distinct way of processing information and generating responses. The code for these agent types can be found in the `application/agents/` directory. ### 1. Classic Agent (`classic_agent.py`) **How it works:** The Classic Agent follows a traditional Retrieval Augmented Generation (RAG) approach. 1. **Retrieve:** When a query is made, it first searches the selected Source documents for relevant information. 2. **Augment:** This retrieved data is then added to the context, along with the main Prompt and the user's query. 3. **Generate:** The LLM generates a response based on this augmented context. It can also utilize any configured tools if the LLM decides they are necessary. **Best for:** * Direct question-answering over a specific set of documents. * Tasks where the primary goal is to extract and synthesize information from the provided sources. * Simpler tool integrations where the decision to use a tool is straightforward. ### 2. ReAct Agent (`react_agent.py`) **How it works:** The ReAct Agent employs a more sophisticated "Reason and Act" framework. This involves a multi-step process: 1. **Plan (Thought):** Based on the query, its prompt, and available tools/sources, the LLM first generates a plan or a sequence of thoughts on how to approach the problem. You might see this output as a "thought" process during generation. 2. **Act:** The agent then executes actions based on this plan. This might involve querying its sources, using a tool, or performing internal reasoning. 3. **Observe:** It gathers observations from the results of its actions (e.g., data from a tool, snippets from documents). 4. **Repeat (if necessary):** Steps 2 and 3 can be repeated as the agent refines its approach or gathers more information. 5. 
**Conclude:** Finally, it generates the final answer based on the initial query and all accumulated observations. **Best for:** * More complex tasks that require multi-step reasoning or problem-solving. * Scenarios where the agent needs to dynamically decide which tools to use and in what order, based on intermediate results. * Interactive tasks where the agent needs to "think" through a problem. Developers looking to introduce new agent architectures can explore the `application/agents/` directory. `classic_agent.py` and `react_agent.py` serve as excellent starting points, demonstrating how to inherit from `BaseAgent` and structure agent logic. ## Navigating and Managing Agents in DocsGPT You can easily access and manage your agents through the DocsGPT user interface. Recently used agents appear at the top of the left sidebar for quick access. Below these, the "Manage Agents" button will take you to the main Agents page. ### Creating a New Agent 1. Navigate to the "Agents" page. 2. Click the **"New Agent"** button. 3. You will be presented with the "New Agent" configuration screen: API Tool configuration example for phone validation 4. Fill in the fields as described in the "Core Components of an Agent" section. 5. Once configured, you can **"Save Draft"** to continue editing later or **"Publish"** to make the agent active. ## Interacting with and Editing Agents Once an agent is created, you can: * **Chat with it:** Select the agent to start an interaction. * **View Logs:** Access usage statistics, monitor token consumption per interaction, and review user message feedbacks. This is crucial for understanding how your agent is being used and performing. * **Edit an Agent:** * Modify any of its configuration settings (name, description, source, prompt, tools, type). * **Generate a Public Link:** From the edit screen, you can create a shareable public link that allows others to import and use your agent. 
* **Get a Webhook URL:** You can also obtain a Webhook URL for the agent. This allows external applications or services to trigger the agent and receive responses programmatically, enabling powerful integrations and automations. ## Seeding Premade Agents from YAML You can bootstrap a fresh DocsGPT deployment with a curated set of agents by seeding them directly into MongoDB. 1. **Customize the configuration** – edit `application/seed/config/premade_agents.yaml` (or copy from `application/seed/config/agents_template.yaml`) to describe the agents you want to provision. Each entry lets you define prompts, tools, and optional data sources. 2. **Ensure dependencies are running** – MongoDB must be reachable using the credentials in `.env`, and a Celery worker should be available if any agent sources need to be ingested via `ingest_remote`. 3. **Execute the seeder** – run `python -m application.seed.commands init`. Add `--force` when you need to reseed an existing environment. The seeder keeps templates under the `system` user so they appear in the UI for anyone to clone or customize. Environment variable placeholders such as `${MY_TOKEN}` inside tool configs are resolved during the seeding process. ================================================ FILE: docs/content/Agents/nodes.mdx ================================================ # Workflow Nodes DocsGPT workflows are composed of **Nodes** that are connected to form a processing graph. These nodes interact with a **Shared State**—a global dictionary of variables that persists throughout the execution of the workflow. ## The Shared State Every workflow run maintains a state object (a JSON-like dictionary). - **Initial State**: Contains the user's input query (`{{query}}`) and chat history (`{{chat_history}}`). - **Accessing Variables**: You can access any variable in the state using the double-curly braces syntax: `{{variable_name}}`. - **Modifying State**: Nodes read from this state and write their outputs back to it. 
--- ## AI Agent Node The **AI Agent Node** is the core processing unit. It uses a Large Language Model (LLM) to generate text, answer questions, or perform tasks using tools. ### Inputs (Template Variables) The primary input is the **Prompt Template**. This field supports variable substitution. - **Prompt Template**: The text sent to the model. - *Example*: `"Summarize the following text: {{user_input_text}}"` - If left empty, it defaults to the initial user query (`{{query}}`). - **System Prompt**: Instructions that define the agent's persona and constraints. - **Tools**: A list of tools the agent can use (e.g., search, calculator). - **LLM Settings**: Specific provider, model name, and parameters. ### Outputs (Emissions) When the agent completes its task, it stores the result in the shared state. - **Output Variable**: The name of the variable where the result will be saved. - *Default*: If not specified, it is saved as `node_{node_id}_output`. - *Custom*: You can set this to something meaningful, like `summary` or `translated_text`. - **Streaming**: If "Stream to user" is enabled, the output is sent to the user in real-time as it is generated, in addition to being saved to the state. --- ## Set State Node The **Set State Node** allows you to manipulate variables within the shared state directly without calling an LLM. This is useful for initialization, formatting, or control flow logic. ### Operations You can define multiple operations in a single node. Each operation targets a specific **Key** (variable name). 1. **Set**: Assigns a specific value to a variable. - *Value*: Can be a static string or a template using variables. - *Example*: Set `current_step` to `1`. - *Example*: Set `formatted_response` to `Analysis: {{analysis_result}}`. 2. **Increment**: Increases the value of a numeric variable. - *Value*: The amount to add (default is 1). - *Example*: Increment `retry_count` by `1`. 3. **Append**: Adds a value to a list variable. 
- *Value*: The item to add to the list. - *Example*: Append `{{last_result}}` to `history_list`. ### Usage Examples - **Loop Counters**: Use a *Set State* node to initialize a counter (`i = 0`) before a loop, and another to increment it inside the loop. - **Accumulators**: Use *Append* to collect results from multiple parallel branches into a single list. - **Renaming**: Copy the output of a previous node to a more generic name (e.g., set `context` to `{{search_results}}`) so subsequent nodes can use a standard variable name. ================================================ FILE: docs/content/Agents/webhooks.mdx ================================================ --- title: Triggering Agents with Webhooks description: Learn how to automate and integrate DocsGPT Agents using webhooks for asynchronous task execution. --- import { Callout, Tabs } from 'nextra/components'; # Triggering Agents with Webhooks Agent Webhooks provide a powerful mechanism to trigger an agent's execution from external systems. Unlike the direct API which provides an immediate response, webhooks are designed for **asynchronous** operations. When you call a webhook, DocsGPT enqueues the agent's task for background processing and immediately returns a `task_id`. You then use this ID to poll for the result. This workflow is ideal for integrating with services that expect a quick initial response (e.g., form submissions) or for triggering long-running tasks without tying up a client connection. Each agent has its own unique webhook URL, which can be generated from the agent's edit page in the DocsGPT UI. This URL includes a secure token for authentication. ### API Endpoints - **Webhook URL:** `http://localhost:7091/api/webhooks/agents/{AGENT_WEBHOOK_TOKEN}` - **Task Status URL:** `http://localhost:7091/api/task_status` For DocsGPT Cloud, use `https://gptcloud.arc53.com/` as the base URL. 
For more technical details, you can explore the API swagger documentation available for the cloud version or your local instance. --- ## The Webhook Workflow The process involves two main steps: triggering the task and polling for the result. ### Step 1: Trigger the Webhook Send an HTTP `POST` request to the agent's unique webhook URL with the required payload. The structure of this payload should match what the agent's prompt and tools are designed to handle. - **Method:** `POST` - **Response:** A JSON object with a `task_id`. `{"task_id": "a1b2c3d4-e5f6-..."}` ```bash curl -X POST \ http://localhost:7091/api/webhooks/agents/your_webhook_token \ -H "Content-Type: application/json" \ -d '{"question": "Your message to agent"}' ``` ```python import requests WEBHOOK_URL = "http://localhost:7091/api/webhooks/agents/your_webhook_token" payload = {"question": "Your message to agent"} try: response = requests.post(WEBHOOK_URL, json=payload) response.raise_for_status() task_id = response.json().get("task_id") print(f"Task successfully created with ID: {task_id}") except requests.exceptions.RequestException as e: print(f"Error triggering webhook: {e}") ``` ```javascript const webhookUrl = 'http://localhost:7091/api/webhooks/agents/your_webhook_token'; const payload = { question: 'Your message to agent' }; async function triggerWebhook() { try { const response = await fetch(webhookUrl, { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(payload) }); if (!response.ok) throw new Error(`HTTP error! ${response.status}`); const data = await response.json(); console.log(`Task successfully created with ID: ${data.task_id}`); return data.task_id; } catch (error) { console.error('Error triggering webhook:', error); } } triggerWebhook(); ``` ### Step 2: Poll for the Result Once you have the `task_id`, periodically send a `GET` request to the `/api/task_status` endpoint until the task `status` is `SUCCESS` or `FAILURE`. 
- **`status`**: The current state of the task (`PENDING`, `STARTED`, `SUCCESS`, `FAILURE`). - **`result`**: The final output from the agent, available when the status is `SUCCESS` or `FAILURE`. ```bash # Replace the task_id with the one you received curl "http://localhost:7091/api/task_status?task_id=YOUR_TASK_ID" ``` ```python import requests import time STATUS_URL = "http://localhost:7091/api/task_status" task_id = "YOUR_TASK_ID" while True: response = requests.get(STATUS_URL, params={"task_id": task_id}) data = response.json() status = data.get("status") print(f"Current task status: {status}") if status in ["SUCCESS", "FAILURE"]: print("Final Result:") print(data.get("result")) break time.sleep(2) ``` ```javascript const statusUrl = 'http://localhost:7091/api/task_status'; const taskId = 'YOUR_TASK_ID'; const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); async function pollForResult() { while (true) { const response = await fetch(`${statusUrl}?task_id=${taskId}`); const data = await response.json(); const status = data.status; console.log(`Current task status: ${status}`); if (status === 'SUCCESS' || status === 'FAILURE') { console.log('Final Result:', data.result); break; } await sleep(2000); } } pollForResult(); ``` ================================================ FILE: docs/content/Deploying/Amazon-Lightsail.mdx ================================================ --- title: Hosting DocsGPT on Amazon Lightsail description: display: hidden --- # Self-hosting DocsGPT on Amazon Lightsail Here's a step-by-step guide on how to set up an Amazon Lightsail instance to host DocsGPT. ## Configuring your instance (If you know how to create a Lightsail instance, you can skip to the recommended configuration part by clicking [here](#connecting-to-your-newly-created-instance)). ### 1. Create an AWS Account: If you haven't already, create or log in to your AWS account at https://lightsail.aws.amazon.com. ### 2. Create an Instance: a. Click "Create Instance." b.
Select the "Instance location." In most cases, the default location works fine. c. Choose "Linux/Unix" as the image and "Ubuntu 20.04 LTS" as the Operating System. d. Configure the instance plan based on your requirements. A "1 GB, 1vCPU, 40GB SSD, and 2TB transfer" setup is recommended for most scenarios. e. Give your instance a unique name and click "Create Instance." PS: It may take a few minutes for the instance setup to complete. ### Connecting to Your newly created Instance Your instance will be ready a few minutes after creation. To access it, open the instance and click "Connect using SSH." #### Clone the DocsGPT Repository A terminal window will pop up, and the first step will be to clone the DocsGPT Git repository: `git clone https://github.com/arc53/DocsGPT.git` #### Download the package information Once it has finished cloning the repository, it is time to download the package information from all sources. To do so, simply enter the following command: `sudo apt update` #### Install Docker and Docker Compose The DocsGPT backend and worker use Python, the frontend is written in React, and the whole application is containerized using Docker. To install Docker and Docker Compose, enter the following commands: `sudo apt install docker.io` And now install docker-compose: `sudo apt install docker-compose` #### Access the DocsGPT Folder Enter the following command to access the folder in which the DocsGPT docker-compose file is present. `cd DocsGPT/` #### Prepare the Environment Inside the DocsGPT folder create a `.env` file and copy the contents of `.env-template` into it. `nano .env` Make sure your `.env` file looks like this: ``` OPENAI_API_KEY=(Your OpenAI API key) VITE_API_STREAMING=true SELF_HOSTED_MODEL=false ``` To save the file, press CTRL+X, then Y, and then ENTER.
Next, set the correct IP for the backend by opening the `docker-compose.yaml` file: `nano deployment/docker-compose.yaml` Then change line 7 from `VITE_API_HOST=http://localhost:7091` to `VITE_API_HOST=http://<Your instance's public IP>:7091` This will allow the frontend to connect to the backend. #### Running the Application You're almost there! Now that all the necessary bits and pieces have been installed, it is time to run the application. To do so, use the following command: `sudo docker compose -f deployment/docker-compose.yaml up -d` Launching it for the first time will take a few minutes to download all the necessary dependencies and build. Once this is done you can go ahead and close the terminal window. #### Enabling Ports a. Before you are able to access your live instance, you must first enable the port that it is using. b. Open your Lightsail instance and head to "Networking". c. Then click on "Add rule" under "IPv4 Firewall", enter `5173` as your port, and hit "Create". Repeat the process for port `7091`. #### Access your instance Your instance is now available at your Public IP Address on port 5173. Enjoy using DocsGPT! ================================================ FILE: docs/content/Deploying/Development-Environment.mdx ================================================ --- title: Setting Up a Development Environment description: Guide to setting up a development environment for DocsGPT, including backend and frontend setup. --- # Setting Up a Development Environment This guide will walk you through setting up a development environment for DocsGPT. This setup allows you to modify and test the application's backend and frontend components. ## 1. Spin Up MongoDB and Redis For development purposes, you can quickly start MongoDB and Redis containers, which are the primary database and caching systems used by DocsGPT. We provide a dedicated Docker Compose file, `docker-compose-dev.yaml`, located in the `deployment` directory, that includes only these essential services.
You can find the `docker-compose-dev.yaml` file [here](https://github.com/arc53/DocsGPT/blob/main/deployment/docker-compose-dev.yaml). **Steps to start MongoDB and Redis:** 1. Navigate to the root directory of your DocsGPT repository in your terminal. 2. Run the following commands to build and start the containers defined in `docker-compose-dev.yaml`: ```bash docker compose -f deployment/docker-compose-dev.yaml build docker compose -f deployment/docker-compose-dev.yaml up -d ``` These commands will start MongoDB and Redis in detached mode, running in the background. ## 2. Run the Backend To run the DocsGPT backend locally, you'll need to set up a Python environment and install the necessary dependencies. **Prerequisites:** * **Python 3.12:** Ensure you have Python 3.12 installed on your system. You can check your Python version by running `python --version` or `python3 --version` in your terminal. **Steps to run the backend:** 1. **Configure Environment Variables:** DocsGPT backend settings are configured using environment variables. You can set these either in a `.env` file or directly in the `settings.py` file. For a comprehensive overview of all settings, please refer to the [DocsGPT Settings Guide](/Deploying/DocsGPT-Settings). * **Option 1: Using a `.env` file (Recommended):** * If you haven't already, create a file named `.env` in the **root directory** of your DocsGPT project. * Modify the `.env` file to adjust settings as needed. You can find a comprehensive list of configurable options in [`application/core/settings.py`](https://github.com/arc53/DocsGPT/blob/main/application/core/settings.py). * **Option 2: Exporting Environment Variables:** * Alternatively, you can export environment variables directly in your terminal. However, using a `.env` file is generally more organized for development. 2. 
**Create a Python Virtual Environment (Optional but Recommended):** Using a virtual environment isolates project dependencies and avoids conflicts with system-wide Python packages. * **macOS and Linux:** ```bash python -m venv venv . venv/bin/activate ``` * **Windows:** ```bash python -m venv venv venv/Scripts/activate ``` 3. **Download Embedding Model:** The backend requires an embedding model. Download the `mpnet-base-v2` model and place it in the `models/` directory within the project root. You can use the following script: ```bash wget https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip unzip mpnet-base-v2.zip -d models rm mpnet-base-v2.zip ``` Alternatively, you can manually download the zip file from [here](https://d3dg1063dc54p9.cloudfront.net/models/embeddings/mpnet-base-v2.zip), unzip it, and place the extracted folder in `models/`. 4. **Install Backend Dependencies:** Navigate to the root of your DocsGPT repository and install the required Python packages: ```bash pip install -r application/requirements.txt ``` 5. **Run the Flask App:** Start the Flask backend application: ```bash flask --app application/app.py run --host=0.0.0.0 --port=7091 ``` This command will launch the backend server, making it accessible on `http://localhost:7091`. 6. **Start the Celery Worker:** Open a new terminal window (and activate your virtual environment if you used one). Start the Celery worker to handle background tasks: ```bash celery -A application.app.celery worker -l INFO ``` This command will start the Celery worker, which processes tasks such as document parsing and vector embedding. **macOS note:** Due to a threading issue, start Celery with the solo pool: ```bash python -m celery -A application.app.celery worker -l INFO --pool=solo ``` **Running in Debugger (VSCode):** For easier debugging, you can launch the Flask app and Celery worker directly from VSCode's debugger.
* Press Shift + Cmd + D (macOS) or Ctrl + Shift + D (Windows/Linux) to open the Run and Debug view. * You should see configurations named "Flask" and "Celery". Select the desired configuration and click the "Start Debugging" button (green play icon). ## 3. Start the Frontend To run the DocsGPT frontend locally, you'll need Node.js and npm (Node Package Manager). **Prerequisites:** * **Node.js version 16 or higher:** Ensure you have Node.js version 16 or greater installed. You can check your Node.js version by running `node -v` in your terminal. npm is usually bundled with Node.js. **Steps to start the frontend:** 1. **Navigate to the Frontend Directory:** In your terminal, change the current directory to the `frontend` folder within your DocsGPT repository: ```bash cd frontend ``` 2. **Install Global Packages (If Needed):** If you don't have `husky` and `vite` installed globally, you can install them: ```bash npm install husky -g npm install vite -g ``` You can skip this step if you already have these packages installed or prefer to use local installations (though global installation simplifies running the commands in this guide). 3. **Install Frontend Dependencies:** Install the project's frontend dependencies using npm: ```bash npm install --include=dev ``` This command reads the `package.json` file in the `frontend` directory and installs all listed dependencies, including development dependencies. 4. **Run the Frontend App:** Start the frontend development server: ```bash npm run dev ``` This command will start the Vite development server. The frontend application will typically be accessible at [http://localhost:5173/](http://localhost:5173/). The terminal will display the exact URL where the frontend is running. With both the backend and frontend running, you should now have a fully functional DocsGPT development environment. You can access the application in your browser at [http://localhost:5173/](http://localhost:5173/) and start developing!
================================================ FILE: docs/content/Deploying/Docker-Deploying.mdx ================================================ --- title: Docker Deployment of DocsGPT description: Deploy DocsGPT using Docker and Docker Compose for easy setup and management. --- # Docker Deployment of DocsGPT Docker is the recommended method for deploying DocsGPT, providing a consistent and isolated environment for the application to run. This guide will walk you through deploying DocsGPT using Docker and Docker Compose. ## Prerequisites * **Docker Engine:** You need to have Docker Engine installed on your system. * **macOS:** [Docker Desktop for Mac](https://docs.docker.com/desktop/install/mac-install/) * **Linux:** [Docker Engine Installation Guide](https://docs.docker.com/engine/install/) (follow instructions for your specific distribution) * **Windows:** [Docker Desktop for Windows](https://docs.docker.com/desktop/install/windows-install/) (requires WSL 2 backend, see notes below) * **Docker Compose:** Docker Compose is usually included with Docker Desktop. If you are using Docker Engine separately, ensure you have Docker Compose V2 installed. **Important Note for Windows Users:** Docker Desktop on Windows generally requires the WSL 2 backend to function correctly, especially when using features like host networking which are utilized in DocsGPT's Docker Compose setup. Ensure WSL 2 is enabled and configured in Docker Desktop settings. ## Quickest Setup: Using DocsGPT Public API The fastest way to try out DocsGPT is by using the public API endpoint. This requires minimal configuration and no local LLM setup. 1. **Clone the DocsGPT Repository (if you haven't already):** ```bash git clone https://github.com/arc53/DocsGPT.git cd DocsGPT ``` 2. **Create a `.env` file:** In the root directory of your DocsGPT repository, create a file named `.env`. 3. 
**Add Public API Configuration to `.env`:** Open the `.env` file and add the following lines: ``` LLM_PROVIDER=docsgpt VITE_API_STREAMING=true ``` This minimal configuration tells DocsGPT to use the public API. For more advanced settings and other LLM options, refer to the [DocsGPT Settings Guide](/Deploying/DocsGPT-Settings). 4. **Launch DocsGPT with Docker Compose:** Navigate to the root directory of the DocsGPT repository in your terminal and run: ```bash docker compose -f deployment/docker-compose.yaml up -d ``` The `-d` flag runs Docker Compose in detached mode (in the background). 5. **Access DocsGPT in your browser:** Once the containers are running, open your web browser and go to [http://localhost:5173/](http://localhost:5173/). 6. **Stopping DocsGPT:** To stop the application, navigate to the same directory in your terminal and run: ```bash docker compose -f deployment/docker-compose.yaml down ``` ## Optional Ollama Setup (Local Models) DocsGPT provides optional Docker Compose files to easily integrate with [Ollama](https://ollama.com/) for running local models. These files add an official Ollama container to your Docker Compose setup. These files are located in the `deployment/optional/` directory. There are two Ollama optional files: * **`docker-compose.optional.ollama-cpu.yaml`**: For running Ollama on CPU. * **`docker-compose.optional.ollama-gpu.yaml`**: For running Ollama on GPU (requires Docker to be configured for GPU usage). ### Launching with Ollama and Pulling a Model 1. **Clone the DocsGPT Repository and Create `.env` (as described above).** 2. 
**Launch DocsGPT with Ollama Docker Compose:** Choose the appropriate Ollama Compose file (CPU or GPU) and launch DocsGPT: **CPU:** ```bash docker compose --env-file .env -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-cpu.yaml up -d ``` **GPU:** ```bash docker compose --env-file .env -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-gpu.yaml up -d ``` 3. **Pull the Ollama Model:** **Crucially, after launching with Ollama, you need to pull the desired model into the Ollama container.** Find the `LLM_NAME` you configured in your `.env` file (e.g., `llama3.2:1b`). Then execute the following command to pull the model *inside* the running Ollama container: ```bash docker compose -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-cpu.yaml exec -it ollama ollama pull <MODEL_NAME> ``` or (for GPU): ```bash docker compose -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-gpu.yaml exec -it ollama ollama pull <MODEL_NAME> ``` Replace `<MODEL_NAME>` with the actual model name from your `.env` file. 4. **Access DocsGPT in your browser:** Once the model is pulled and containers are running, open your web browser and go to [http://localhost:5173/](http://localhost:5173/). 5. 
**Stopping Ollama Setup:** To stop a DocsGPT setup launched with Ollama optional files, use `docker compose down` and include all the compose files used during the `up` command: ```bash docker compose -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-cpu.yaml down ``` or ```bash docker compose -f deployment/docker-compose.yaml -f deployment/optional/docker-compose.optional.ollama-gpu.yaml down ``` **Important for GPU Usage:** * **NVIDIA Container Toolkit (for NVIDIA GPUs):** If you are using NVIDIA GPUs, you need to have the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed and configured on your system for Docker to access your GPU. * **Docker GPU Configuration:** Ensure Docker is configured to utilize your GPU. Refer to the [Ollama Docker Hub page](https://hub.docker.com/r/ollama/ollama) and Docker documentation for GPU setup instructions specific to your GPU type (NVIDIA, AMD, Intel). ## Restarting After Configuration Changes Whenever you modify the `.env` file or any Docker Compose files, you need to restart the Docker containers for the changes to be applied. Use the same `docker compose down` and `docker compose up -d` commands you used to launch DocsGPT, ensuring you include all relevant `-f` flags for optional files if you are using them. ## Further Configuration This guide covers the basic Docker deployment of DocsGPT. For detailed information on configuring various aspects of DocsGPT, such as LLM providers, models, vector stores, and more, please refer to the comprehensive [DocsGPT Settings Guide](/Deploying/DocsGPT-Settings). ================================================ FILE: docs/content/Deploying/DocsGPT-Settings.mdx ================================================ --- title: DocsGPT Settings description: Configure your DocsGPT application by understanding the basic settings. 
--- # DocsGPT Settings DocsGPT is highly configurable, allowing you to tailor it to your specific needs and preferences. You can control various aspects of the application, from choosing the Large Language Model (LLM) provider to selecting embedding models and vector stores. This document will guide you through the basic settings you can configure in DocsGPT. These settings determine how DocsGPT interacts with LLMs and processes your data. ## Configuration Methods There are two primary ways to configure DocsGPT settings: ### 1. Configuration via `.env` file (Recommended) The easiest and recommended way to configure basic settings is by using a `.env` file. This file should be located in the **root directory** of your DocsGPT project (the same directory where `setup.sh` is located). **Example `.env` file structure:** ``` LLM_PROVIDER=openai API_KEY=YOUR_OPENAI_API_KEY LLM_NAME=gpt-4o ``` ### 2. Configuration via `settings.py` file (Advanced) For more advanced configurations or if you prefer to manage settings directly in code, you can modify the `settings.py` file. This file is located in the `application/core` directory of your DocsGPT project. While modifying `settings.py` offers more flexibility, it's generally recommended to use the `.env` file for basic settings and reserve `settings.py` for more complex adjustments or when you need to configure settings programmatically. **Location of `settings.py`:** `application/core/settings.py` ## Basic Settings Explained Here are some of the most fundamental settings you'll likely want to configure: - **`LLM_PROVIDER`**: This setting determines which Large Language Model (LLM) provider DocsGPT will use. It tells DocsGPT which API to interact with. - **Common values:** - `docsgpt`: Use the DocsGPT Public API Endpoint (simple and free, as offered in `setup.sh` option 1). - `openai`: Use OpenAI's API (requires an API key). - `google`: Use Google's Vertex AI or Gemini models. - `anthropic`: Use Anthropic's Claude models. 
- `groq`: Use Groq's models. - `huggingface`: Use HuggingFace Inference API. - `azure_openai`: Use Azure OpenAI Service. - `openai` (when using local inference engines like Ollama, Llama.cpp, TGI, etc.): This signals DocsGPT to use an OpenAI-compatible API format, even if the actual LLM is running locally. - **`LLM_NAME`**: Specifies the specific model to use from the chosen LLM provider. The available models depend on the `LLM_PROVIDER` you've selected. - **Examples:** - For `LLM_PROVIDER=openai`: `gpt-4o` - For `LLM_PROVIDER=google`: `gemini-2.0-flash` - For local models (e.g., Ollama): `llama3.2:1b` (or any model name available in your setup). - **`EMBEDDINGS_NAME`**: This setting defines which embedding model DocsGPT will use to generate vector embeddings for your documents. Embeddings are numerical representations of text that allow DocsGPT to understand the semantic meaning of your documents for efficient search and retrieval. - **Default value:** `huggingface_sentence-transformers/all-mpnet-base-v2` (a good general-purpose embedding model). - **Other options:** You can explore other embedding models from Hugging Face Sentence Transformers or other providers if needed. - **`API_KEY`**: Required for most cloud-based LLM providers. This is your authentication key to access the LLM provider's API. You'll need to obtain this key from your chosen provider's platform. - **`OPENAI_BASE_URL`**: Specifically used when `LLM_PROVIDER` is set to `openai` but you are connecting to a local inference engine (like Ollama, Llama.cpp, etc.) that exposes an OpenAI-compatible API. This setting tells DocsGPT where to find your local LLM server. - **`STT_PROVIDER`**: Selects the speech-to-text provider used for microphone transcription in chat and for audio file ingestion through the parser pipeline. ## Configuration Examples Let's look at some concrete examples of how to configure these settings in your `.env` file. 
### Example for Cloud API Provider (OpenAI) To use OpenAI's `gpt-4o` model, you would configure your `.env` file like this: ``` LLM_PROVIDER=openai API_KEY=YOUR_OPENAI_API_KEY # Replace with your actual OpenAI API key LLM_NAME=gpt-4o ``` Make sure to replace `YOUR_OPENAI_API_KEY` with your actual OpenAI API key. ### Example for Local Deployment To use a local Ollama server with the `llama3.2:1b` model, you would configure your `.env` file like this: ``` LLM_PROVIDER=openai # Using OpenAI compatible API format for local models API_KEY=None # API Key is not needed for local Ollama LLM_NAME=llama3.2:1b OPENAI_BASE_URL=http://host.docker.internal:11434/v1 # Default Ollama API URL within Docker EMBEDDINGS_NAME=huggingface_sentence-transformers/all-mpnet-base-v2 # You can also run embeddings locally if needed ``` In this case, even though you are using Ollama locally, `LLM_PROVIDER` is set to `openai` because Ollama (and many other local inference engines) are designed to be API-compatible with OpenAI. `OPENAI_BASE_URL` points DocsGPT to the local Ollama server. ## Speech-to-Text Settings DocsGPT can transcribe audio in two places: - Voice input in the chat. - Audio file ingestion. Uploaded `.wav`, `.mp3`, `.m4a`, `.ogg`, and `.webm` files are transcribed first and then passed through the normal parser, chunking, embedding, and indexing pipeline. For an end-to-end walkthrough, see the [Speech and Audio Guide](/Guides/speech-and-audio). | Setting | Purpose | Typical values | | --- | --- | --- | | `STT_PROVIDER` | Speech-to-text backend provider. | `openai`, `faster_whisper` | | `OPENAI_STT_MODEL` | OpenAI transcription model used when `STT_PROVIDER=openai`. | `gpt-4o-mini-transcribe` | | `STT_LANGUAGE` | Optional language hint passed to the provider. Leave unset for auto-detection when supported. | `en`, `es`, unset | | `STT_MAX_FILE_SIZE_MB` | Maximum file size accepted by the synchronous `/api/stt` endpoint. 
| `50` | | `STT_ENABLE_TIMESTAMPS` | Include timestamp segments in the normalized transcript response and stored parser metadata. | `true`, `false` | | `STT_ENABLE_DIARIZATION` | Reserved provider option for speaker diarization. Some providers may ignore it. | `true`, `false` | ### Example: OpenAI Speech-to-Text ```env STT_PROVIDER=openai OPENAI_API_KEY=YOUR_OPENAI_API_KEY OPENAI_STT_MODEL=gpt-4o-mini-transcribe STT_LANGUAGE= STT_MAX_FILE_SIZE_MB=50 STT_ENABLE_TIMESTAMPS=false STT_ENABLE_DIARIZATION=false ``` If you already use `API_KEY` for OpenAI, DocsGPT can reuse that key for transcription. Set `OPENAI_API_KEY` only when you want a dedicated key. ### Example: Local `faster_whisper` ```env STT_PROVIDER=faster_whisper STT_LANGUAGE=en STT_ENABLE_TIMESTAMPS=true STT_ENABLE_DIARIZATION=false ``` `faster_whisper` is an optional backend dependency. Install it in the Python environment used by the DocsGPT API and worker before selecting this provider. ## Authentication Settings DocsGPT includes a JWT (JSON Web Token) based authentication feature for managing sessions or securing local deployments while allowing access. ### `AUTH_TYPE` Overview The `AUTH_TYPE` setting in your `.env` file or `settings.py` determines the authentication method used by DocsGPT. This allows you to control how users authenticate with your DocsGPT instance. | Value | Description | | ------------- | ------------------------------------------------------------------------------------------- | | `None` | No authentication is used. Anyone can access the app. | | `simple_jwt` | A single, long-lived JWT token is generated at startup. All requests use this shared token. | | `session_jwt` | Unique JWT tokens are generated for each session/user. 
| #### How to Configure Add the following to your `.env` file (or set in `settings.py`): ```env # No authentication (default) AUTH_TYPE=None # OR: Simple JWT (shared token) AUTH_TYPE=simple_jwt JWT_SECRET_KEY=your_secret_key_here # OR: Session JWT (per-user/session tokens) AUTH_TYPE=session_jwt JWT_SECRET_KEY=your_secret_key_here ``` - If `AUTH_TYPE` is set to `simple_jwt` or `session_jwt`, a `JWT_SECRET_KEY` is used to sign the tokens. - If `JWT_SECRET_KEY` is not set, DocsGPT will generate one and store it in `.jwt_secret_key` in the project root. #### How Each Method Works - **None**: No authentication. All API and UI access is open. - **simple_jwt**: - A single JWT token is generated at startup and printed to the console. - Use this token in the `Authorization` header for all API requests: ```http Authorization: Bearer <SIMPLE_JWT_TOKEN> ``` - The frontend will prompt for this token if not already set. - **session_jwt**: - Clients can request a new token from `/api/generate_token`. - Use the received token in the `Authorization` header for subsequent requests. - Each user/session gets a unique token. #### Security Notes - Always keep your `JWT_SECRET_KEY` secure and private. - If you set it manually, use a strong, random string. - If not set, DocsGPT will generate a secure key and persist it in `.jwt_secret_key`. #### Checking Current Auth Type - Use the `/api/config` endpoint to check the current `auth_type` and whether authentication is required. #### Frontend Token Input for `simple_jwt` If you have configured `AUTH_TYPE=simple_jwt`, the DocsGPT frontend will prompt you to enter the JWT token if it's not already set or is invalid. Paste the `SIMPLE_JWT_TOKEN` (printed to your console when the backend starts) into this field to access the application. Frontend prompt for JWT Token ## Exploring More Settings These are just the basic settings to get you started. 
The `settings.py` file contains many more advanced options that you can explore to further customize DocsGPT, such as: - Vector store configuration (`VECTOR_STORE`, Qdrant, Milvus, LanceDB settings) If you're looking for an easy way to set up a vector store with pgvector, try [Neon](https://get.neon.com/docsgpt). - Retriever settings (`RETRIEVERS_ENABLED`) - Cache settings (`CACHE_REDIS_URL`) - And many more! For a complete list of available settings and their descriptions, refer to the `settings.py` file in `application/core`. Remember to restart your Docker containers after making changes to your `.env` file or `settings.py` for the changes to take effect. ================================================ FILE: docs/content/Deploying/Hosting-the-app.mdx ================================================ import { DeploymentCards } from '../../components/DeploymentCards'; # Deployment Guides ================================================ FILE: docs/content/Deploying/Kubernetes-Deploying.mdx ================================================ --- title: Deploying DocsGPT on Kubernetes description: Learn how to self-host DocsGPT on a Kubernetes cluster for scalable and robust deployments. --- # Self-hosting DocsGPT on Kubernetes This guide will walk you through deploying DocsGPT on Kubernetes. ## Prerequisites Ensure you have the following installed before proceeding: - [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) - Access to a Kubernetes cluster. - [Neon](https://get.neon.com/docsgpt) (optional) for a quick and easy vector store setup with pgvector. ## Folder Structure The `deployment/k8s` folder contains the necessary deployment and service configuration files: - `deployments/` - `services/` - `docsgpt-secrets.yaml` ## Deployment Instructions 1. **Clone the Repository** ```sh git clone https://github.com/arc53/DocsGPT.git cd DocsGPT/deployment/k8s ``` 2. 
**Configure Secrets (optional)** Ensure that you have all the necessary secrets in `docsgpt-secrets.yaml`. Update it with your secrets before applying if you want. By default, the deployment uses Qdrant as the vector store and the public DocsGPT LLM for inference. Alternatively, you can use [Neon](https://get.neon.com/docsgpt) as an easy way to set up your vector store with pgvector, which is highly recommended for quick deployments. 3. **Apply Kubernetes Deployments** Deploy your DocsGPT resources using the following commands: ```sh kubectl apply -f deployments/ ``` 4. **Apply Kubernetes Services** Set up your services using the following commands: ```sh kubectl apply -f services/ ``` 5. **Apply Secrets** Apply the secret configurations: ```sh kubectl apply -f docsgpt-secrets.yaml ``` 6. **Substitute API URL** After deploying the services, you need to update the environment variable `VITE_API_HOST` in your deployment file `deployments/docsgpt-deploy.yaml` with the actual endpoint URL created by your `docsgpt-api-service`. ```sh kubectl get services/docsgpt-api-service -o jsonpath='{.status.loadBalancer.ingress[0].ip}' | xargs -I {} sed -i "s|<your-api-endpoint>|{}|g" deployments/docsgpt-deploy.yaml ``` 7. **Rerun Deployment** After making the changes, reapply the deployment configuration to update the environment variables: ```sh kubectl apply -f deployments/ ``` ## Verifying the Deployment To verify if everything is set up correctly, you can run the following: ```sh kubectl get pods kubectl get services ``` Ensure that the pods are running and the services are available. ## Accessing DocsGPT To access DocsGPT, you need to find the external IP address of the frontend service. You can do this by running: ```sh kubectl get services/docsgpt-frontend-service | awk 'NR>1 {print "http://" $4}' ``` ## Troubleshooting If you encounter any issues, you can check the logs of the pods for more details: ```sh kubectl logs <pod-name> ``` Replace `<pod-name>` with the actual name of your DocsGPT pod. 
================================================ FILE: docs/content/Deploying/Railway.mdx ================================================ --- title: Hosting DocsGPT on Railway description: Learn how to deploy your own DocsGPT instance on Railway with this step-by-step tutorial --- # Self-hosting DocsGPT on Railway Here's a step-by-step guide on how to host DocsGPT on Railway App. First, clone and set up the project locally so you can run, test, and modify it. ### 1. Clone and GitHub Setup a. Open Terminal (Windows Shell or Git Bash (recommended)). b. Type `git clone https://github.com/arc53/DocsGPT.git` #### Download the package information Once it has finished cloning the repository, it is time to download the package information from all sources. To do so, simply enter the following command: `sudo apt update` #### Install Docker and Docker Compose The DocsGPT backend and worker use Python, the frontend is written in React, and the whole application is containerized using Docker. To install Docker and Docker Compose, enter the following commands: `sudo apt install docker.io` And now install docker-compose: `sudo apt install docker-compose` #### Access the DocsGPT Folder Enter the following command to access the folder in which the DocsGPT docker-compose file is present. `cd DocsGPT/` #### Prepare the Environment Inside the DocsGPT folder create a `.env` file and copy the contents of `.env-template` into it. `nano .env` Make sure your `.env` file looks like this: ``` OPENAI_API_KEY=(Your OpenAI API key) VITE_API_STREAMING=true SELF_HOSTED_MODEL=false ``` To save the file, press CTRL+X, then Y, and then ENTER. Next, set the correct IP for the Backend by opening the docker-compose.yaml file: `nano deployment/docker-compose.yaml` Then change line 7 from `VITE_API_HOST=http://localhost:7091` to `VITE_API_HOST=http://<your-server-public-ip>:7091`, replacing `<your-server-public-ip>` with the public IP address of your machine. This will allow the frontend to connect to the backend. #### Running the Application You're almost there! 
Now that all the necessary bits and pieces have been installed, it is time to run the application. To do so, use the following command: `sudo docker compose -f deployment/docker-compose.yaml up -d` Launching it for the first time will take a few minutes to download all the necessary dependencies and build. Once this is done you can go ahead and close the terminal window. ### 2. Pushing it to your own Repository a. Create a Repository on your GitHub. b. Open Terminal in the same directory of the Cloned project. c. Type `git init` d. `git add .` e. `git commit -m "first-commit"` f. `git remote add origin <your-repository-url>` g. `git push --set-upstream origin master` Your local files will now be pushed to your GitHub Account. :) ### 3. Create a Railway Account: If you haven't already, create or log in to your Railway account by visiting [Railway](https://railway.app/). Sign up via **GitHub** (recommended). ### 4. Start New Project: a. Open Railway app and Click on "Start New Project." b. Choose any from the list of options available (Recommended "**Deploy from GitHub Repo**") c. Choose the required Repository from your GitHub. d. Configure and allow access to modify your GitHub content from the pop-up window. e. Agree to all the terms and conditions. PS: It may take a few minutes for the account setup to complete. #### You will get a free trial of $5 (use it for trial and then purchase if satisfied and needed) ### 5. Connecting Your New Railway App with GitHub a. Choose the DocsGPT repo that you want to deploy from the list of your GitHub repositories. b. Click on Deploy now. ![Three Tabs will be there](/Railway-selection.png) c. Select Variables Tab. d. Upload the env file here that you used for local setup. e. Go to Settings Tab now. f. Go to "Networking" and click on Generate Domain Name, to get the URL of your hosted project. g. You can update the Root directory, build command, installation command as per need. 
*[However, it is recommended not to change these options; leave them at their defaults unless you need otherwise.]* Your own DocsGPT is now available at the generated domain URL. :) ================================================ FILE: docs/content/Deploying/_meta.js ================================================ export default { "DocsGPT-Settings": { "title": "⚙️ App Configuration", "href": "/Deploying/DocsGPT-Settings" }, "Docker-Deploying": { "title": "🛳️ Docker Setup", "href": "/Deploying/Docker-Deploying" }, "Development-Environment": { "title": "🛠️Development Environment", "href": "/Deploying/Development-Environment" }, "Kubernetes-Deploying": { "title": "☸️ Deploying on Kubernetes", "href": "/Deploying/Kubernetes-Deploying" }, "Hosting-the-app": { "title": "☁️ Hosting DocsGPT", "href": "/Deploying/Hosting-the-app" }, "Amazon-Lightsail": { "title": "Hosting DocsGPT on Amazon Lightsail", "href": "/Deploying/Amazon-Lightsail", "display": "hidden" }, "Railway": { "title": "Hosting DocsGPT on Railway", "href": "/Deploying/Railway", "display": "hidden" } } ================================================ FILE: docs/content/Extensions/Chatwoot-extension.mdx ================================================ --- title: Comprehensive Guide to Setting Up the Chatwoot Extension with DocsGPT description: This step-by-step guide walks you through the process of setting up the Chatwoot extension with DocsGPT, enabling seamless integration for automated responses and enhanced customer support. Learn how to launch DocsGPT, retrieve your Chatwoot access token, configure the .env file, and start the extension. --- ## Chatwoot Extension Setup Guide ### Step 1: Prepare and Start DocsGPT - **Launch DocsGPT**: Follow the instructions in our [Quickstart](/quickstart) to start DocsGPT. Make sure to load your documentation. ### Step 2: Get Access Token from Chatwoot - Go to Chatwoot. - In your profile settings (located at the bottom left), scroll down and copy the **Access Token**. 
### Step 3: Set Up Chatwoot Extension - Navigate to `/extensions/chatwoot`. - Copy the `.env_sample` file and create a new file named `.env`. - Fill in the values in the `.env` file as follows: ```env docsgpt_url= chatwoot_url= docsgpt_key= chatwoot_token= ``` ### Step 4: Start the Extension - Use the command `flask run` to start the extension. ### Step 5: Optional - Extra Validation - In `app.py`, uncomment lines 12-13 and 71-75. - Add the following lines to your `.env` file: ```env account_id=(optional) 1 assignee_id=(optional) 1 ``` These Chatwoot values help ensure you respond to the correct widget and handle questions assigned to a specific user. ### Stopping Bot Responses for Specific User or Session - If you want the bot to stop responding to questions for a specific user or session, add a label `human-requested` in your conversation. ### Additional Notes - For further details on training on other documentation, refer to our [wiki](https://github.com/arc53/DocsGPT/wiki/How-to-train-on-other-documentation). ================================================ FILE: docs/content/Extensions/Chrome-extension.mdx ================================================ --- title: Add DocsGPT Chrome Extension to Your Browser description: Install the DocsGPT Chrome extension to access AI-powered document assistance directly from your browser for enhanced productivity. --- import {Steps} from 'nextra/components' import { Callout } from 'nextra/components' ## Chrome Extension Setup Guide To enhance your DocsGPT experience, you can install the DocsGPT Chrome extension. Here's how: ### Step 1 In the DocsGPT GitHub repository, click on the **Code** button and select **Download ZIP**. ### Step 2 Unzip the downloaded file to a location you can easily access. ### Step 3 Open the Google Chrome browser and click on the three dots menu (upper right corner). ### Step 4 Select **More Tools** and then **Extensions**. 
### Step 5 Turn on the **Developer mode** switch in the top right corner of the **Extensions page**. ### Step 6 Click on the **Load unpacked** button. ### Step 7 Select the **Chrome** folder where the DocsGPT files have been unzipped (DocsGPT-main > extensions > chrome). ### Step 8 The extension should now be added to Google Chrome and can be managed on the Extensions page. ### Step 9 To disable or remove the extension, simply turn off the toggle switch on the extension card or click the **Remove** button. ================================================ FILE: docs/content/Extensions/_meta.js ================================================ export default { "api-key-guide": { "title": "🔑 Getting API key", "href": "/Extensions/api-key-guide" }, "chat-widget": { "title": "💬️ Chat Widget", "href": "/Extensions/chat-widget" }, "search-widget": { "title": "🔎 Search Widget", "href": "/Extensions/search-widget" }, "Chrome-extension": { "title": "🌐 Chrome Extension", "href": "/Extensions/Chrome-extension" }, "Chatwoot-extension": { "title": "🗣️ Chatwoot Extension", "href": "/Extensions/Chatwoot-extension" } } ================================================ FILE: docs/content/Extensions/api-key-guide.mdx ================================================ --- title: API Keys for DocsGPT Integrations description: Learn how to obtain, understand, and use DocsGPT API keys to integrate DocsGPT into your external applications and widgets. --- # Guide to DocsGPT API Keys DocsGPT API keys are essential for developers and users who wish to integrate the DocsGPT models into external applications, such as [our widget](/Extensions/chat-widget). This guide will walk you through the steps of obtaining an API key, starting from uploading your document to understanding the key variables associated with API keys. 
## Obtaining Your API Key After uploading your document, you can obtain an API key either through the graphical user interface or via an API call: - **Graphical User Interface:** Navigate to the Settings section of the DocsGPT web app, find the API Keys option, and press 'Create New' to generate your key. - **API Call:** Alternatively, you can use the `/api/create_api_key` endpoint to create a new API key. For detailed instructions, visit [DocsGPT API Documentation](https://gptcloud.arc53.com/). ## Understanding Key Variables Upon creating your API key, you will encounter several key variables. Each serves a specific purpose: - **Name:** Assign a name to your API key for easy identification. - **Source:** Indicates the source document(s) linked to your API key, which DocsGPT will use to generate responses. - **ID:** A unique identifier for your API key. You can view this by making a call to `/api/get_api_keys`. - **Key:** The API key itself, which will be used in your application to authenticate API requests. With your API key ready, you can now integrate DocsGPT into your application, such as the DocsGPT Widget or any other software, via `/api/answer` or `/stream` endpoints. The source document is preset with the API key, allowing you to bypass fields like `selectDocs` and `active_docs` during implementation. Congratulations on taking the first step towards enhancing your applications with DocsGPT! ================================================ FILE: docs/content/Extensions/chat-widget.mdx ================================================ --- title: Integrate DocsGPT Chat Widget into Your Web Application description: Embed the DocsGPT Widget in your React, HTML, or Nextra projects to provide AI-powered chat functionality to your users. --- import { Tabs } from 'nextra/components' # Integrating DocsGPT Chat Widget ## Introduction The DocsGPT Widget is a powerful tool that allows you to integrate AI-driven document assistance directly into your web applications. 
This guide will walk you through embedding the DocsGPT Widget into your projects, whether you're using React, plain HTML, or Nextra. Enhance your user experience by providing seamless access to intelligent document search and chatbot capabilities. Try out the interactive widget showcase and customize its parameters at the [DocsGPT Widget Demo](https://widget.docsgpt.cloud/). ## Setup ### Installation Make sure you have Node.js and npm (or yarn, pnpm) installed in your project. Navigate to your project directory in the terminal and install the `docsgpt` package: ```bash npm npm install docsgpt ``` ### Usage In your React component file, import the `DocsGPTWidget` component: ```js import { DocsGPTWidget } from "docsgpt"; ``` Now, you can embed the widget within your React component's JSX: ```jsx ``` ### Installation To use the DocsGPT Widget directly in HTML, include the widget script from a CDN in your HTML file: ```html filename="html" ``` ### Usage In your HTML ``, add a `
` element where you want to render the widget. Set an `id` for easy targeting. ```html filename="html"
``` Then, in a ` ``` ### Installation Make sure you have Node.js and npm (or yarn, pnpm) installed in your project. Navigate to your project directory in the terminal and install the `docsgpt` package: ```bash npm npm install docsgpt ``` ### Usage with Nextra (Next.js + MDX) To integrate the DocsGPT Widget into a [Nextra](https://nextra.site/) documentation site (built with Next.js and MDX), create or modify your `pages/_app.js` file as follows: ```js filename="pages/_app.js" import { DocsGPTWidget } from "docsgpt"; export default function MyApp({ Component, pageProps }) { return ( <> ) } ``` --- ## Properties Table The DocsGPT Widget offers a range of customizable properties that allow you to tailor its appearance and behavior to perfectly match your web application. These parameters can be modified directly when embedding the widget in your React components or HTML code. Below is a detailed overview of each available prop: | **Prop** | **Type** | **Default Value** | **Description** | |--------------------|------------------|-------------------------------------------------------------|-----------------------------------------------------------------------------------------------------| | **`apiHost`** | `string` | `"https://gptcloud.arc53.com"` | **Required.** The URL of your DocsGPT API backend. This endpoint handles vector search and chatbot queries. | | **`apiKey`** | `string` | `"your-api-key"` | API key for authentication with your DocsGPT API. Leave empty if no authentication is required. | | **`avatar`** | `string` | [`dino-icon-link`](https://d3dg1063dc54p9.cloudfront.net/cute-docsgpt.png) | URL for the avatar image displayed in the chatbot interface. | | **`title`** | `string` | `"Get AI assistance"` | Title text shown in the chatbot header. | | **`description`** | `string` | `"DocsGPT's AI Chatbot is here to help"` | Sub-title or descriptive text displayed below the title in the chatbot header. 
| | **`heroTitle`** | `string` | `"Welcome to DocsGPT !"` | Welcome message displayed when the chatbot is initially opened. | | **`heroDescription`** | `string` | `"This chatbot is built with DocsGPT and utilises GenAI, please review important information using sources."` | Introductory text providing context or disclaimers about the chatbot. | | **`theme`** | `"dark" \| "light"` | `"dark"` | Color theme of the widget interface. Options: `"dark"` or `"light"`. Defaults to `"dark"`. | | **`buttonIcon`** | `string` | `"https://your-icon"` | URL for the icon image used in the widget's launch button. | | **`buttonBg`** | `string` | `"#222327"` | Background color of the widget's launch button. | | **`size`** | `"small" \| "medium"` | `"medium"` | Size of the widget. Options: `"small"` or `"medium"`. Defaults to `"medium"`. | | **`showSources`** | `boolean` | `false` | Enables displaying source URLs for data fetched within the widget. When set to `true`, the widget will show the original sources of the fetched data. | --- ## Notes on Widget Properties * **Full Customization:** Every property listed in the table can be customized. Override the defaults to create a widget that perfectly matches your branding and application context. From avatars and titles to color schemes, you have fine-grained control over the widget's presentation. * **API Key Handling:** The `apiKey` prop is optional. Only include it if your DocsGPT backend API is configured to require API key authentication. `apiHost` for DocsGPT Cloud is `https://gptcloud.arc53.com/` ## Explore and Customize Further The DocsGPT Widget is fully open-source, allowing for deep customization and extension beyond the readily available props. The complete source code for the React-based widget is available in the `extensions/react-widget` directory within the main [DocsGPT GitHub Repository](https://github.com/arc53/DocsGPT). 
Feel free to explore the code, fork the repository, and tailor the widget to your exact requirements. ================================================ FILE: docs/content/Extensions/search-widget.mdx ================================================ --- title: Integrate DocsGPT Search Bar into Your Web Application description: Embed the DocsGPT Search Bar Widget in your React or HTML projects to provide AI-powered document search functionality to your users. --- import { Tabs } from 'nextra/components' # Integrating DocsGPT Search Bar Widget ## Introduction The DocsGPT Search Bar Widget offers a simple yet powerful way to embed AI-powered document search directly into your web applications. This widget allows users to perform searches across your documents or pages, enabling them to quickly find the information they need. This guide will walk you through embedding the Search Bar Widget into your projects, whether you're using React or plain HTML. Try out the interactive widget showcase and customize its parameters at the [DocsGPT Widget Demo](https://widget.docsgpt.cloud/). ## Setup ## React Setup ### Installation Make sure you have Node.js and npm (or yarn, pnpm) installed in your project. Navigate to your project directory in the terminal and install the `docsgpt` package: ```bash npm npm install docsgpt ``` ### Usage In your React component file, import the `SearchBar` component: ```js import { SearchBar } from "docsgpt"; ``` Now, you can embed the widget within your React component's JSX: ```jsx ``` ### Installation To use the DocsGPT Search Bar Widget directly in HTML, include the widget script from a CDN in your HTML file: ```html filename="html" ``` ### Usage In your HTML ``, add a `
` element where you want to render the Search Bar Widget. Set an `id` for easy targeting. ```html filename="html"
``` Then, in a ` ``` --- ## Properties Table The DocsGPT Search Bar Widget offers a range of customizable properties that allow you to tailor its appearance and behavior to perfectly match your web application. These parameters can be modified directly when embedding the widget in your React components or HTML code. Below is a detailed overview of each available prop: | **Prop** | **Type** | **Default Value** | **Description** | |-----------------|-----------|-------------------------------------|--------------------------------------------------------------------------------------------------| | **`apiKey`** | `string` | `"your-api-key"` | API key for authentication with your DocsGPT API. Leave empty if no authentication is required. | | **`apiHost`** | `string` | `"https://gptcloud.arc53.com"` | **Required.** The URL of your DocsGPT API backend. This endpoint handles vector similarity search queries. | | **`theme`** | `"dark" \| "light"` | `"dark"` | Color theme of the search bar. Options: `"dark"` or `"light"`. Defaults to `"dark"`. | | **`placeholder`** | `string` | `"Search or Ask AI..."` | Placeholder text displayed in the search input field. | | **`width`** | `string` | `"256px"` | Width of the search bar. Accepts any valid CSS width value (e.g., `"300px"`, `"100%"`, `"20rem"`). | --- ## Notes on Widget Properties * **Full Customization:** Every property listed in the table can be customized. Override the defaults to create a Search Bar Widget that perfectly matches your branding and application context. * **API Key Handling:** The `apiKey` prop is optional. Only include it if your DocsGPT backend API is configured to require API key authentication. `apiHost` for DocsGPT Cloud is `https://gptcloud.arc53.com/` ## Explore and Customize Further The DocsGPT Search Bar Widget is fully open-source, allowing for deep customization and extension beyond the readily available props. 
The complete source code for the React-based widget is available in the `extensions/react-widget` directory within the main [DocsGPT GitHub Repository](https://github.com/arc53/DocsGPT). Feel free to explore the code, fork the repository, and tailor the widget to your exact requirements. ================================================ FILE: docs/content/Guides/Architecture.mdx ================================================ --- title: Architecture description: High-level architecture of DocsGPT --- ## Introduction DocsGPT is designed as a modular and scalable application for knowledge-based GenAI systems. This document outlines the high-level architecture of DocsGPT, highlighting its key components. ## High-Level Architecture This diagram provides a bird's-eye view of the DocsGPT architecture, illustrating the main components and their interactions. ```mermaid flowchart LR User["User"] --> Frontend["Frontend (React/Vite)"] Frontend --> Backend["Backend API (Flask)"] Backend --> LLM["LLM Integration Layer"] & VectorStore["Vector Stores"] & TaskQueue["Task Queue (Celery)"] & Databases["Databases (MongoDB, Redis)"] LLM -- Cloud APIs / Local Engines --> InferenceEngine["Inference Engine"] VectorStore -- Document Embeddings --> Indexes[("Indexes")] TaskQueue -- Asynchronous Tasks --> DocumentIngestion["Document Ingestion"] style Frontend fill:#AA00FF,color:#FFFFFF style Backend fill:#AA00FF,color:#FFFFFF style LLM fill:#AA00FF,color:#FFFFFF style TaskQueue fill:#AA00FF,color:#FFFFFF,stroke:#AA00FF style DocumentIngestion fill:#AA00FF,color:#FFFFFF,stroke:none ``` ## Component Descriptions ### 1. Frontend (React/Vite) * **Technology:** Built using React and Vite. * **Responsibility:** This is the user interface of DocsGPT, providing users with a UI to ask questions and receive answers, configure prompts, tools and other settings. It handles user input, displays conversation history, shows sources, and manages settings. * **Key Features:** * Clean and responsive UI.
* Simple static client-side rendering. * Manages conversation state and settings. * Communicates with the Backend API for data retrieval and processing. ### 2. Backend API (Flask) * **Technology:** Implemented using Flask (Python). * **Responsibility:** The Backend API serves as the core logic and orchestration layer of DocsGPT. It receives requests from the Frontend, Extensions or API clients, processes them, and coordinates interactions between different components. * **Key Features:** * API endpoints for handling user queries, document uploads, and settings configurations. * Manages the overall application flow and logic. * Integrates with the LLM Integration Layer, Vector Stores, Task Queue, Tools, Agents and Databases. * Provides Swagger documentation for API endpoints. ### 3. LLM Integration Layer (Part of backend) * **Technology:** Supports multiple LLM APIs and local engines. * **Responsibility:** This layer provides an abstraction for interacting with Large Language Models (LLMs). * **Key Features:** * Supports LLMs from OpenAI, Google, Anthropic, Groq, HuggingFace Inference API, Azure OpenAI; also compatible with local models like Ollama, LLaMa.cpp, Text Generation Inference (TGI), SGLang, vLLM, Aphrodite, FriendliAI, and LMDeploy. * Manages API key handling, request formatting, and tool formatting. * Offers caching mechanisms to improve response times and reduce API usage. * Handles streaming responses for a more interactive user experience. ### 4. Vector Stores (Part of backend) * **Technology:** Supports multiple vector databases. * **Responsibility:** Vector Stores are used to store and retrieve vector embeddings of document chunks. This enables semantic search and retrieval of relevant document snippets in response to user queries. * **Key Features:** * Supports vector databases including FAISS, Elasticsearch, Qdrant, Milvus, and LanceDB. * Provides storage and indexing of high-dimensional vector embeddings.
* Enables editing and updating of vector indexes including specific chunks. ### 5. Parser Integration Layer (Part of backend) * **Technology:** Supports multiple formats for file processing and remote source uploading. * **Responsibility:** Parser Integration Layer handles uploading, parsing, chunking, embedding, and indexing documents. * **Key Features:** * Supports various document formats (PDF, DOCX, TXT, etc.) and remote sources (web URLs, sitemaps). * Handles document parsing, text chunking, and embedding generation. * Utilizes Celery for asynchronous processing, ensuring efficient handling of large documents. ### 6. Task Queue (Celery) * **Technology:** Celery with Redis as broker and backend. * **Responsibility:** Celery handles asynchronous task processing, for long-running operations such as document ingestion and indexing. This ensures that the main application remains responsive and efficient. * **Key Features:** * Manages background tasks for document processing and indexing. * Improves application responsiveness by offloading heavy tasks. * Enhances scalability and reliability through distributed task processing. ### 7. Databases (MongoDB, Redis) * **Technology:** MongoDB and Redis. * **Responsibility:** Databases are used for persistent data storage and caching. MongoDB stores structured data such as conversations, documents, user settings, and API keys. Redis is used as a cache, as well as a message broker for Celery. ## Request Flow Diagram This diagram illustrates the sequence of steps involved when a user submits a question to DocsGPT. 
```mermaid sequenceDiagram participant User participant Frontend participant BackendAPI participant LLMIntegrationLayer participant VectorStores participant InferenceEngine User->>Frontend: User asks a question Frontend->>BackendAPI: API Request (Question) BackendAPI->>VectorStores: Fetch relevant document chunks (Similarity Search) VectorStores-->>BackendAPI: Return document chunks BackendAPI->>LLMIntegrationLayer: Send question and document chunks LLMIntegrationLayer->>InferenceEngine: LLM API Request (Prompt + Context) InferenceEngine-->>LLMIntegrationLayer: LLM API Response (Answer) LLMIntegrationLayer-->>BackendAPI: Return Answer BackendAPI->>Frontend: API Response (Answer) Frontend->>User: Display Answer Note over Frontend,BackendAPI: Data flow is simplified for clarity ``` ## Deployment Architecture DocsGPT is designed to be deployed using Docker and Kubernetes. Here is a quick overview of a simple k8s deployment. ```mermaid graph LR subgraph Kubernetes Cluster subgraph Nodes subgraph Node 1 FrontendPod[Frontend Pod] BackendAPIPod[Backend API Pod] end subgraph Node 2 CeleryWorkerPod[Celery Worker Pod] RedisPod[Redis Pod] end subgraph Node 3 MongoDBPod[MongoDB Pod] VectorStorePod[Vector Store Pod] end end LoadBalancer[Load Balancer] --> docsgpt-frontend-service[docsgpt-frontend-service] LoadBalancer --> docsgpt-api-service[docsgpt-api-service] docsgpt-frontend-service --> FrontendPod docsgpt-api-service --> BackendAPIPod BackendAPIPod --> CeleryWorkerPod BackendAPIPod --> RedisPod BackendAPIPod --> MongoDBPod BackendAPIPod --> VectorStorePod CeleryWorkerPod --> RedisPod BackendAPIPod --> InferenceEngine[(Inference Engine)] VectorStorePod --> Indexes[(Indexes)] MongoDBPod --> Data[(Data)] RedisPod --> Cache[(Cache)] end User[User] --> LoadBalancer ``` ================================================ FILE: docs/content/Guides/Customising-prompts.mdx ================================================ --- title: Customizing Prompts description: This guide explains
how to customize prompts in DocsGPT using the new template-based system with dynamic variable injection. --- import Image from 'next/image' # Customizing Prompts in DocsGPT Customizing prompts for DocsGPT gives you powerful control over the AI's behavior and responses. With the new template-based system, you can inject dynamic context through organized namespaces, making prompts flexible and maintainable without hardcoding values. ## Quick Start 1. Navigate to `SideBar -> Settings`. 2. In Settings, select the `Active Prompt` to see various prompt styles. 3. Click on the `edit icon` on your chosen prompt to customize it. ### Video Demo prompts --- ## Template-Based Prompt System DocsGPT now uses **Jinja2 templating** with four organized namespaces for dynamic variable injection: ### Available Namespaces #### 1. **`system`** - System Metadata Access system-level information: ```jinja {{ system.date }} # Current date (YYYY-MM-DD) {{ system.time }} # Current time (HH:MM:SS) {{ system.timestamp }} # ISO 8601 timestamp {{ system.request_id }} # Unique request identifier {{ system.user_id }} # Current user ID ``` #### 2. **`source`** - Retrieved Documents Access RAG (Retrieval-Augmented Generation) document context: ```jinja {{ source.content }} # Concatenated document content {{ source.summaries }} # Alias for content (backward compatible) {{ source.documents }} # List of document objects {{ source.count }} # Number of retrieved documents ``` #### 3. **`passthrough`** - Request Parameters Access custom parameters passed in the API request: ```jinja {{ passthrough.company }} # Custom field from request {{ passthrough.user_name }} # User-provided data {{ passthrough.context }} # Any custom parameter ``` To use passthrough data, send it in your API request: ```json { "question": "What is the pricing?", "passthrough": { "company": "Acme Corp", "user_name": "Alice", "plan_type": "enterprise" } } ``` #### 4. 
**`tools`** - Pre-fetched Tool Data Access results from tools that run before the agent (like memory tool): ```jinja {{ tools.memory.root }} # Memory tool directory listing {{ tools.memory.available }} # Boolean: is memory available ``` --- ## Example Prompts ### Basic Prompt with Documents ```jinja You are a helpful AI assistant for DocsGPT. Current date: {{ system.date }} Use the following documents to answer the question: {{ source.content }} Provide accurate, helpful answers with code examples when relevant. ``` ### Advanced Prompt with All Namespaces ```jinja You are an AI assistant for {{ passthrough.company }}. **System Info:** - Date: {{ system.date }} - Request ID: {{ system.request_id }} **User Context:** - User: {{ passthrough.user_name }} - Role: {{ passthrough.role }} **Available Documents ({{ source.count }}):** {{ source.content }} **Memory Context:** {% if tools.memory.available %} {{ tools.memory.root }} {% else %} No saved context available. {% endif %} Please provide detailed, accurate answers based on the documents above. ``` ### Conditional Logic Example ```jinja You are a DocsGPT assistant. {% if source.count > 0 %} I found {{ source.count }} relevant document(s): {{ source.content }} Base your answer on these documents. {% else %} No documents were found. Please answer based on your general knowledge. {% endif %} ``` --- ## Migration Guide ### Legacy Format (Still Supported) The old `{summaries}` format continues to work for backward compatibility: ```markdown You are a helpful assistant. Documents: {summaries} ``` This will automatically substitute `{summaries}` with document content. ### New Template Format (Recommended) Migrate to the new template syntax for more flexibility: ```jinja You are a helpful assistant. Documents: {{ source.content }} ``` **Migration mapping:** - `{summaries}` → `{{ source.content }}` or `{{ source.summaries }}` --- ## Best Practices ### 1. 
**Use Descriptive Context** ```jinja **Retrieved Documents:** {{ source.content }} **User Query Context:** - Company: {{ passthrough.company }} - Department: {{ passthrough.department }} ``` ### 2. **Handle Missing Data Gracefully** ```jinja {% if passthrough.user_name %} Hello {{ passthrough.user_name }}! {% endif %} ``` ### 3. **Leverage Memory for Continuity** ```jinja {% if tools.memory.available %} **Previous Context:** {{ tools.memory.root }} {% endif %} **Current Question:** Please consider the above context when answering. ``` ### 4. **Add Clear Instructions** ```jinja You are a technical support assistant. **Guidelines:** 1. Always reference the documents below 2. Provide step-by-step instructions 3. Include code examples when relevant **Reference Documents:** {{ source.content }} ``` --- ## Advanced Features ### Looping Over Documents ```jinja {% for doc in source.documents %} **Source {{ loop.index }}:** {{ doc.filename }} {{ doc.text }} {% endfor %} ``` ### Date-Based Behavior ```jinja {% if system.date > "2025-01-01" %} Note: This is information from 2025 or later. 
{% endif %} ``` ### Custom Formatting ```jinja **Request Information** ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ • Request ID: {{ system.request_id }} • User: {{ passthrough.user_name | default("Guest") }} • Time: {{ system.time }} ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ``` --- ## Tool Pre-Fetching ### Memory Tool Configuration Enable memory tool pre-fetching to inject saved context into prompts: ```python # In your tool configuration { "name": "memory", "config": { "pre_fetch_enabled": true # Default: true } } ``` Control pre-fetching globally: ```bash # .env file ENABLE_TOOL_PREFETCH=true ``` Or per-request: ```json { "question": "What are the requirements?", "disable_tool_prefetch": false } ``` --- ## Debugging Prompts ### View Rendered Prompts in Logs Set log level to `INFO` to see the final rendered prompt sent to the LLM: ```bash export LOG_LEVEL=INFO ``` You'll see output like: ``` INFO - Rendered system prompt for agent (length: 1234 chars): ================================================================================ You are a helpful assistant for Acme Corp. Current date: 2025-10-30 Request ID: req_abc123 Documents: Technical documentation about... ================================================================================ ``` ### Template Validation Test your template syntax before saving: ```python from application.api.answer.services.prompt_renderer import PromptRenderer renderer = PromptRenderer() is_valid = renderer.validate_template("Your prompt with {{ variables }}") ``` --- ## Common Use Cases ### 1. Customer Support Bot ```jinja You are a customer support assistant for {{ passthrough.company }}. **Customer:** {{ passthrough.customer_name }} **Ticket ID:** {{ system.request_id }} **Date:** {{ system.date }} **Knowledge Base:** {{ source.content }} **Previous Interactions:** {{ tools.memory.root }} Please provide helpful, friendly support based on the knowledge base above. ``` ### 2. 
Technical Documentation Assistant ```jinja You are a technical documentation expert. **Available Documentation ({{ source.count }} documents):** {{ source.content }} **Requirements:** - Provide code examples in {{ passthrough.language }} - Focus on {{ passthrough.framework }} best practices - Include relevant links when possible ``` ### 3. Internal Knowledge Base ```jinja You are an internal AI assistant for {{ passthrough.department }}. **Employee:** {{ passthrough.employee_name }} **Access Level:** {{ passthrough.access_level }} **Relevant Documents:** {{ source.content }} Provide detailed answers appropriate for {{ passthrough.access_level }} access level. ``` --- ## Template Syntax Reference ### Variables ```jinja {{ variable_name }} # Output variable {{ namespace.field }} # Access nested field {{ variable | default("N/A") }} # Default value ``` ### Conditionals ```jinja {% if condition %} Content {% elif other_condition %} Other content {% else %} Default content {% endif %} ``` ### Loops ```jinja {% for item in list %} {{ item.field }} {% endfor %} ``` ### Comments ```jinja {# This is a comment and won't appear in output #} ``` --- ## Security Considerations 1. **Input Sanitization**: Passthrough data is automatically sanitized to prevent injection attacks 2. **Type Filtering**: Only primitive types (string, int, float, bool, None) are allowed in passthrough 3. **Autoescaping**: Jinja2 autoescaping is enabled by default 4. **Size Limits**: Consider the token budget when including large documents --- ## Troubleshooting ### Problem: Variables Not Rendering **Solution:** Ensure you're using the correct namespace: ```jinja ❌ {{ company }} ✅ {{ passthrough.company }} ``` ### Problem: Empty Output for Tool Data **Solution:** Check that tool pre-fetching is enabled and the tool is configured correctly. ### Problem: Syntax Errors **Solution:** Validate template syntax. 
Common issues: ```jinja ❌ {{ variable } # Missing closing brace ❌ {% if x % # Missing closing %} ✅ {{ variable }} ✅ {% if x %}...{% endif %} ``` ### Problem: Legacy Prompts Not Working **Solution:** The system auto-detects template syntax. If your prompt uses `{summaries}`, it will work in legacy mode. To use new features, add `{{ }}` syntax. --- ## API Reference ### Render Prompt via API ```python from application.api.answer.services.prompt_renderer import PromptRenderer renderer = PromptRenderer() rendered = renderer.render_prompt( prompt_content="Your template with {{ passthrough.name }}", user_id="user_123", request_id="req_456", passthrough_data={"name": "Alice"}, docs_together="Document content here", tools_data={"memory": {"root": "Files: notes.txt"}} ) ``` --- ## Conclusion The new template-based prompt system provides powerful flexibility while maintaining backward compatibility. By leveraging namespaces, you can create dynamic, context-aware prompts that adapt to your specific use case. **Key Benefits:** - ✅ Dynamic variable injection - ✅ Organized namespaces - ✅ Backward compatible - ✅ Security built-in - ✅ Easy to debug Start with simple templates and gradually add complexity as needed. Happy prompting! 🚀 ================================================ FILE: docs/content/Guides/How-to-train-on-other-documentation.mdx ================================================ --- title: How to Train on Other Documentation description: A step-by-step guide on how to effectively train DocsGPT on additional documentation sources. --- import { Callout } from 'nextra/components' import Image from 'next/image' import { Steps } from 'nextra/components' ## How to train on other documentation Training on other documentation sources can greatly enhance the versatility and depth of DocsGPT's knowledge. By incorporating diverse materials, you can broaden the AI's understanding and improve its ability to generate insightful responses across a range of topics. 
Here's a step-by-step guide on how to effectively train DocsGPT on additional documentation sources: **Get your document ready**: Make sure the document you want to train on is available on the device you are using. You can also use links to the documentation to train on. Note: The document should be in one of the following file formats: .pdf, .txt, .rst, .docx, .md, .zip, and limited to 25 MB. You can also train using the link to the documentation. ### Video Demo prompts ### Step 1 Navigate to the sidebar, where you will find the `Source Docs` option. Here you will find 3 built-in options: default, Web Search and None. ### Step 2 Click on the `Upload icon` just beside the source docs options. Now browse and upload the document you want to train on, or select the `remote` option if you need to insert a link to the documentation. ### Step 3 Now you will be able to see the name of the uploaded file under Uploaded Files. Click on `Train`; once you click on Train, it might take some time to train on the document. You will be able to see the `Training progress`, and once the training is completed you can click the `finish` button, and there you go: your document is uploaded. ### Step 4 Go to `New chat` and, from the sidebar, select the document you uploaded under `Source Docs` and go ahead with your chat. Now you can ask questions regarding the document you uploaded, and you will get answers based on it.
================================================ FILE: docs/content/Guides/How-to-use-different-LLM.mdx ================================================ --- title: description: --- import { Callout } from 'nextra/components' import Image from 'next/image' import { Steps } from 'nextra/components' # Setting Up Local Language Models for Your App Setting up local language models for your app can significantly enhance its capabilities, enabling it to understand and generate text in multiple languages without relying on external APIs. By integrating local language models, you can improve privacy, reduce latency, and ensure continuous functionality even in offline environments. Here's a comprehensive guide on how to set up local language models for your application: ## Steps: ### For cloud version LLM change: ### Step 1 Visit the chat screen and you will be able to see the default LLM selected. ### Step 2 Click on it and you will get a drop-down of the various LLMs available to choose from. ### Step 3 Choose the LLM of your choice. ### Video Demo prompts ### For Open source LLM change: ### Step 1 For the open source version, please edit `LLM_PROVIDER`, `LLM_NAME` and others in the .env file. Refer to [⚙️ App Configuration](/Deploying/DocsGPT-Settings) for more information. ### Step 2 Visit [☁️ Cloud Providers](/Models/cloud-providers) for the updated list of online models. Make sure you have the right API_KEY and correct LLM_PROVIDER. For self-hosted models, please visit [🖥️ Local Inference](/Models/local-inference).
================================================ FILE: docs/content/Guides/Integrations/_meta.js ================================================ export default { "google-drive-connector": { "title": "🔗 Google Drive", "href": "/Guides/Integrations/google-drive-connector" } } ================================================ FILE: docs/content/Guides/Integrations/google-drive-connector.mdx ================================================ --- title: Google Drive Connector description: Connect your Google Drive as an external knowledge base to upload and process files directly from your Google Drive account. --- import { Callout } from 'nextra/components' import { Steps } from 'nextra/components' # Google Drive Connector The Google Drive Connector allows you to seamlessly connect your Google Drive account as an external knowledge base. This integration enables you to upload and process files directly from your Google Drive without manually downloading and uploading them to DocsGPT. ## Features - **Direct File Access**: Browse and select files directly from your Google Drive - **Comprehensive File Support**: Supports all major document formats including: - Google Workspace files (Docs, Sheets, Slides) - Microsoft Office files (.docx, .xlsx, .pptx, .doc, .ppt, .xls) - PDF documents - Text files (.txt, .md, .rst, .html, .rtf) - Data files (.csv, .json) - Image files (.png, .jpg, .jpeg) - E-books (.epub) - **Secure Authentication**: Uses OAuth 2.0 for secure access to your Google Drive - **Real-time Sync**: Process files directly from Google Drive without local downloads The Google Drive Connector requires proper configuration of Google API credentials. Follow the setup instructions below to enable this feature. ## Prerequisites Before setting up the Google Drive Connector, you'll need: 1. A Google Cloud Platform (GCP) project 2. Google Drive API enabled 3. OAuth 2.0 credentials configured 4. 
DocsGPT instance with proper environment variables ## Setup Instructions ### Step 1: Create a Google Cloud Project 1. Go to the [Google Cloud Console](https://console.cloud.google.com/) 2. Create a new project or select an existing one 3. Note down your Project ID for later use ### Step 2: Enable Google Drive API 1. In the Google Cloud Console, navigate to **APIs & Services** > **Library** 2. Search for "Google Drive API" 3. Click on "Google Drive API" and click **Enable** ### Step 3: Create OAuth 2.0 Credentials 1. Go to **APIs & Services** > **Credentials** 2. Click **Create Credentials** > **OAuth client ID** 3. If prompted, configure the OAuth consent screen: - Choose **External** user type (unless you're using Google Workspace) - Fill in the required fields (App name, User support email, Developer contact) - Add your domain to **Authorized domains** if deploying publicly 4. For Application type, select **Web application** 5. Add your DocsGPT frontend URL to **Authorized JavaScript origins**: - For local development: `http://localhost:3000` - For production: `https://yourdomain.com` 6. Add your DocsGPT callback URL to **Authorized redirect URIs**: - For local development: `http://localhost:7091/api/connectors/callback?provider=google_drive` - For production: `https://yourdomain.com/api/connectors/callback?provider=google_drive` 7. Click **Create** and note down the **Client ID** and **Client Secret** ### Step 4: Configure Backend Environment Variables Add the following environment variables to your backend configuration: **For Docker deployment**, add to your `.env` file in the root directory: ```env # Google Drive Connector Configuration GOOGLE_CLIENT_ID=your_google_client_id_here GOOGLE_CLIENT_SECRET=your_google_client_secret_here ``` **For manual deployment**, set these environment variables in your system or application configuration. 
### Step 5: Configure Frontend Environment Variables Add the following environment variables to your frontend `.env` file: ```env # Google Drive Frontend Configuration VITE_GOOGLE_CLIENT_ID=your_google_client_id_here ``` Make sure to use the same Google Client ID in both backend and frontend configurations. ### Step 6: Restart Your Application After configuring the environment variables: 1. **For Docker**: Restart your Docker containers ```bash docker-compose down docker-compose up -d ``` 2. **For manual deployment**: Restart both backend and frontend services ## Using the Google Drive Connector Once configured, you can use the Google Drive Connector to upload files: ### Step 1: Access the Upload Interface 1. Navigate to the DocsGPT interface 2. Go to the upload/training section 3. You should now see "Google Drive" as an available upload option ### Step 2: Connect Your Google Account 1. Select "Google Drive" as your upload method 2. Click "Connect to Google Drive" 3. You'll be redirected to Google's OAuth consent screen 4. Grant the necessary permissions to DocsGPT 5. You'll be redirected back to DocsGPT with a successful connection ### Step 3: Select Files 1. Once connected, click "Select Files" 2. The Google Drive picker will open 3. Browse your Google Drive and select the files you want to process 4. Click "Select" to confirm your choices ### Step 4: Process Files 1. Review your selected files 2. Click "Train" or "Upload" to process the files 3. DocsGPT will download and process the files from your Google Drive 4. 
Once processing is complete, the files will be available in your knowledge base ## Supported File Types The Google Drive Connector supports the following file types: | File Type | Extensions | Description | |-----------|------------|-------------| | **Google Workspace** | - | Google Docs, Sheets, Slides (automatically converted) | | **Microsoft Office** | .docx, .xlsx, .pptx | Modern Office formats | | **Legacy Office** | .doc, .ppt, .xls | Older Office formats | | **PDF Documents** | .pdf | Portable Document Format | | **Text Files** | .txt, .md, .rst, .html, .rtf | Various text formats | | **Data Files** | .csv, .json | Structured data formats | | **Images** | .png, .jpg, .jpeg | Image files (with OCR if enabled) | | **E-books** | .epub | Electronic publication format | ## Troubleshooting ### Common Issues **"Google Drive option not appearing"** - Verify that `VITE_GOOGLE_CLIENT_ID` is set in frontend environment - Check that `VITE_GOOGLE_CLIENT_ID` environment variable is present in your frontend configuration - Check browser console for any JavaScript errors - Ensure the frontend has been restarted after adding environment variables **"Authentication failed"** - Verify that your OAuth 2.0 credentials are correctly configured - Check that the redirect URI `http:///api/connectors/callback?provider=google_drive` is correctly added in GCP console - Ensure the Google Drive API is enabled in your GCP project **"Permission denied" errors** - Verify that the OAuth consent screen is properly configured - Check that your Google account has access to the files you're trying to select - Ensure the required scopes are granted during authentication **"Files not processing"** - Check that the backend environment variables are correctly set - Verify that the OAuth credentials have the necessary permissions - Check the backend logs for any error messages ### Environment Variable Checklist **Backend (.env in root directory):** - ✅ `GOOGLE_CLIENT_ID` - ✅ `GOOGLE_CLIENT_SECRET` 
**Frontend (.env in frontend directory):** - ✅ `VITE_GOOGLE_CLIENT_ID` ### Security Considerations - Keep your Google Client Secret secure and never expose it in frontend code - Regularly rotate your OAuth credentials - Use HTTPS in production to protect authentication tokens - Ensure proper OAuth consent screen configuration for production use For production deployments, make sure to add your actual domain to the OAuth consent screen and authorized origins/redirect URIs. ================================================ FILE: docs/content/Guides/My-AI-answers-questions-using-external-knowledge.mdx ================================================ --- title: description: --- # Avoiding hallucinations If your AI uses external knowledge and is not explicit enough, it is ok, because we try to make DocsGPT friendly. But if you want to adjust it, here is a simple way:- - Got to `application/prompts/chat_combine_prompt.txt` - And change it to ``` You are a DocsGPT, friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples, if possible. Write an answer for the question below based on the provided context. If the context provides insufficient information, reply "I cannot answer". You have access to chat history and can use it to help answer the question. 
---------------- {summaries} ``` ================================================ FILE: docs/content/Guides/_meta.js ================================================ export default { "Customising-prompts": { "title": "️💻 Customising Prompts", "href": "/Guides/Customising-prompts" }, "How-to-train-on-other-documentation": { "title": "📥 Training on docs", "href": "/Guides/How-to-train-on-other-documentation" }, "How-to-use-different-LLM": { "title": "️🤖 How to use different LLM's", "href": "/Guides/How-to-use-different-LLM", "display": "hidden" }, "My-AI-answers-questions-using-external-knowledge": { "title": "💭️ Avoiding hallucinations", "href": "/Guides/My-AI-answers-questions-using-external-knowledge", "display": "hidden" }, "Architecture": { "title": "🏗️ Architecture", "href": "/Guides/Architecture" }, "compression": { "title": "🗜️ Context Compression", "href": "/Guides/compression" }, "ocr": { "title": "OCR", "href": "/Guides/ocr" }, "Integrations": { "title": "🔗 Integrations" } } ================================================ FILE: docs/content/Guides/compression.md ================================================ # Context Compression DocsGPT implements a smart context compression system to manage long conversations effectively. This feature prevents conversations from hitting the LLM's context window limit while preserving critical information and continuity. ## How It Works The compression system operates on a "summarize and truncate" principle: 1. **Threshold Check**: Before each request, the system calculates the total token count of the conversation history. 2. **Trigger**: If the token count exceeds a configured threshold (default: 80% of the model's context limit), compression is triggered. 3. **Summarization**: An LLM (potentially a different, cheaper/faster one) processes the older part of the conversation—including previous summaries, user messages, agent responses, and tool outputs. 4. 
**Context Replacement**: The system generates a comprehensive summary of the older history. For subsequent requests, the LLM receives this **Summary + Recent Messages** instead of the full raw history. ### Key Features * **Recursive Summarization**: New summaries incorporate previous summaries, ensuring that information from the very beginning of a long chat is not lost. * **Tool Call Support**: The compression logic explicitly handles tool calls and their outputs (e.g., file readings, search results), summarizing their results so the agent retains knowledge of what it has already done. * **"Needle in a Haystack" Preservation**: The prompts are designed to identify and preserve specific, critical details (like passwords, keys, or specific user instructions) even when compressing large amounts of text. ## Configuration You can configure the compression behavior in your `.env` file or `application/core/settings.py`: | Setting | Default | Description | | :--- | :--- | :--- | | `ENABLE_CONVERSATION_COMPRESSION` | `True` | Master switch to enable/disable the feature. | | `COMPRESSION_THRESHOLD_PERCENTAGE` | `0.8` | The fraction of the context window (0.0 to 1.0) that triggers compression. | | `COMPRESSION_MODEL_OVERRIDE` | `None` | (Optional) Specify a different model ID to use specifically for the summarization task (e.g., using `gpt-3.5-turbo` to compress for `gpt-4`). | | `COMPRESSION_MAX_HISTORY_POINTS` | `3` | The number of past compression points to keep in the database (older ones are discarded as they are incorporated into newer summaries). | ## Architecture The system is modularized into several components: * **`CompressionThresholdChecker`**: Calculates token usage and decides when to compress. * **`CompressionService`**: Orchestrates the compression process, manages DB updates, and reconstructs the context (Summary + Recent Messages) for the LLM. 
* **`CompressionPromptBuilder`**: Constructs the specific prompts used to instruct the LLM to summarize the conversation effectively. ================================================ FILE: docs/content/Guides/ocr.mdx ================================================ --- title: OCR for Sources and Attachments description: How OCR works in DocsGPT, how to configure it, and what changes for source ingestion vs chat attachments. --- import { Callout } from 'nextra/components' # Docling OCR for Sources and Attachments DocsGPT uses Docling as the default parser layer for many document formats. OCR is optional and controlled by two settings: ```env DOCLING_OCR_ENABLED=false DOCLING_OCR_ATTACHMENTS_ENABLED=false ``` - `DOCLING_OCR_ENABLED`: OCR behavior for Source Docs ingestion. - `DOCLING_OCR_ATTACHMENTS_ENABLED`: OCR behavior for chat attachments uploaded from the message box. ## Processing Flow ### Source Docs flow (Upload and Train) 1. Files are uploaded through `/api/upload`. 2. Ingestion runs asynchronously in Celery (`ingest_worker`). 3. `SimpleDirectoryReader` parses files with `get_default_file_extractor`. 4. For PDFs and image formats, Docling parsers are used. OCR in this path is controlled by `DOCLING_OCR_ENABLED`. 5. Parsed text is chunked, embedded, and stored in the vector store. 6. Retrieval during chat uses this indexed text and returns source citations. ### Attachment flow (Chat-only file context) 1. Files are uploaded through `/api/store_attachment`. 2. Celery task `attachment_worker` parses and stores the attachment in MongoDB (`attachments` collection). 3. OCR in this path is controlled by `DOCLING_OCR_ATTACHMENTS_ENABLED`. 4. Attachments are not vectorized and are not added to the source index. 5. During answer generation, selected attachment IDs are loaded and passed directly to the LLM pipeline. 
## How Docling OCR Works Docling OCR behavior is different for PDFs vs images: - PDF parser defaults to hybrid OCR: - text regions: extracted directly - bitmap/image regions: OCR only where needed - Image parser defaults to full-page OCR (the whole image is visual content). By default, Docling parser classes use RapidOCR options (language default: `english`). Parser internals like OCR language and force-full-page OCR are currently set by code defaults, not separate `.env` settings. ## Attachment Behavior by Model Support When attachments are used in chat, behavior depends on the selected model/provider: - If a MIME type is supported, DocsGPT sends files/images through provider-native attachment APIs. - If unsupported, DocsGPT falls back to the parsed text content stored for the attachment. - For providers that support images but not native PDF attachments, PDF files are converted to images (synthetic PDF support). This means OCR quality is especially important for text fallback paths and for models without native attachment support. ## Recommended Configuration For most OCR-enabled use cases, enable both flags: ```env DOCLING_OCR_ENABLED=true DOCLING_OCR_ATTACHMENTS_ENABLED=true ``` After changing these settings, restart the API and Celery worker. ## Legacy Fallback Notes - If Docling is unavailable, DocsGPT falls back to legacy parsers. - With OCR disabled, text-based PDFs can still parse, but scanned/image-heavy content may produce little text. - For image parsing without Docling OCR, the legacy image parser only extracts text when `PARSE_IMAGE_REMOTE=true`. 
================================================ FILE: docs/content/Models/_meta.js ================================================ export default { "cloud-providers": { "title": "☁️ Cloud Providers", "href": "/Models/cloud-providers" }, "local-inference": { "title": "🖥️ Local Inference", "href": "/Models/local-inference" }, "embeddings": { "title": "📝 Embeddings", "href": "/Models/embeddings" } } ================================================ FILE: docs/content/Models/cloud-providers.mdx ================================================ --- title: Connecting DocsGPT to Cloud LLM Providers description: Connect DocsGPT to various Cloud Large Language Model (LLM) providers to power your document Q&A. --- # Connecting DocsGPT to Cloud LLM Providers DocsGPT is designed to seamlessly integrate with a variety of Cloud Large Language Model (LLM) providers, giving you access to state-of-the-art AI models for document question answering. ## Configuration via `.env` file The primary method for configuring your LLM provider in DocsGPT is through the `.env` file. For a comprehensive understanding of all available settings, please refer to the detailed [DocsGPT Settings Guide](/Deploying/DocsGPT-Settings). To connect to a cloud LLM provider, you will typically need to configure the following basic settings in your `.env` file: * **`LLM_PROVIDER`**: This setting is essential and identifies the specific cloud provider you wish to use (e.g., `openai`, `google`, `anthropic`). * **`LLM_NAME`**: Specifies the exact model you want to utilize from your chosen provider (e.g., `gpt-5.1`, `gemini-flash-latest`, `claude-3-5-sonnet-20241022`). Refer to your provider's documentation for a list of available models. * **`API_KEY`**: Almost all cloud LLM providers require an API key for authentication. Obtain your API key from your chosen provider's platform and securely store it in your `.env` file. 
## Explicitly Supported Cloud Providers DocsGPT offers direct, streamlined support for the following cloud LLM providers, making configuration straightforward. The table below outlines the `LLM_PROVIDER` and example `LLM_NAME` values to use for each provider in your `.env` file. | Provider | `LLM_PROVIDER` | Example `LLM_NAME` | | :--------------------------- | :------------- | :-------------------------- | | DocsGPT Public API | `docsgpt` | `None` | | OpenAI | `openai` | `gpt-5.1` | | Google (Vertex AI, Gemini) | `google` | `gemini-flash-latest` | | Anthropic (Claude) | `anthropic` | `claude-3-5-sonnet-20241022`| | Groq | `groq` | `llama-3.3-70b-versatile` | | HuggingFace Inference API | `huggingface` | `meta-llama/Llama-3.1-8B-Instruct` | | Azure OpenAI | `azure_openai` | `azure-gpt-4` | | Prem AI | `premai` | (See Prem AI docs) | | AWS SageMaker | `sagemaker` | (See SageMaker docs) | | Novita AI | `novita` | (See Novita docs) | ## Connecting to OpenAI-Compatible Cloud APIs DocsGPT's flexible architecture allows you to connect to any cloud provider that offers an API compatible with the OpenAI API standard. This opens up a vast ecosystem of LLM services. To connect to an OpenAI-compatible cloud provider, you will still use `LLM_PROVIDER=openai` in your `.env` file. However, you will also need to specify the API endpoint of your chosen provider using the `OPENAI_BASE_URL` setting. You will also likely need to provide an `API_KEY` and `LLM_NAME` as required by that provider. 
**Example for DeepSeek (OpenAI-Compatible API):** To connect to DeepSeek, which offers an OpenAI-compatible API, your `.env` file could be configured as follows: ``` LLM_PROVIDER=openai API_KEY=YOUR_API_KEY # Your DeepSeek API key LLM_NAME=deepseek-chat # Or your desired DeepSeek model name OPENAI_BASE_URL=https://api.deepseek.com/v1 # DeepSeek's OpenAI API URL ``` Remember to consult the documentation of your chosen OpenAI-compatible cloud provider for their specific API endpoint, required model names, and authentication methods. ## Adding Support for Other Cloud Providers If you wish to connect to a cloud provider that is not explicitly listed above or doesn't offer OpenAI API compatibility, you can extend DocsGPT to support it. Within the DocsGPT repository, navigate to the `application/llm` directory. Here, you will find Python files defining the existing LLM integrations. You can use these files as examples to create a new module for your desired cloud provider. After creating your new LLM module, you will need to register it within the `llm_creator.py` file. This process involves some coding, but it allows for virtually unlimited extensibility to connect to any cloud-based LLM service with an accessible API. ================================================ FILE: docs/content/Models/embeddings.md ================================================ --- title: Understanding and Configuring Embedding Models in DocsGPT description: Learn about embedding models, their importance in DocsGPT, and how to configure them for optimal performance. --- # Understanding and Configuring Embedding Models in DocsGPT Embedding models are a crucial component of DocsGPT, enabling its powerful document understanding and question-answering capabilities. This guide will explain what embedding models are, why they are essential for DocsGPT, and how to configure them. ## What are Embedding Models? 
In simple terms, an embedding model is a type of language model that converts text into numerical vectors. These vectors, known as embeddings, capture the semantic meaning of the text. Think of it as translating words and sentences into a language that computers can understand mathematically, where similar meanings are represented by vectors that are close to each other in vector space. **Why are embedding models important for DocsGPT?** DocsGPT uses embedding models for several key tasks: * **Semantic Search:** When you upload documents to DocsGPT, the application uses an embedding model to generate embeddings for each document chunk. These embeddings are stored in a vector store. When you ask a question, your query is also converted into an embedding. DocsGPT then performs a semantic search in the vector store, finding document chunks whose embeddings are most similar to your query embedding. This allows DocsGPT to retrieve relevant information based on the *meaning* of your question and documents, not just keyword matching. * **Document Understanding:** Embeddings help DocsGPT understand the underlying meaning of your documents, enabling it to answer questions accurately and contextually, even if the exact keywords from your question are not present in the retrieved document chunks. In essence, embedding models are the bridge that allows DocsGPT to understand the nuances of human language and connect your questions to the relevant information within your documents. ## Out-of-the-Box Embedding Model Support in DocsGPT DocsGPT is designed to be flexible and supports a wide range of embedding models right out of the box. Currently, DocsGPT provides native support for models from two major sources: * **Sentence Transformers:** DocsGPT supports all models available through the [Sentence Transformers library](https://www.sbert.net/). This library offers a vast selection of pre-trained embedding models, known for their quality and efficiency in various semantic tasks. 
* **OpenAI Embeddings:** DocsGPT also supports using embedding models from OpenAI, specifically the `text-embedding-ada-002` model, which is a powerful and widely used embedding model from OpenAI's API. ## Configuring Sentence Transformer Models To utilize Sentence Transformer models within DocsGPT, you need to follow these steps: 1. **Download the Model:** Sentence Transformer models are typically hosted on Hugging Face Model Hub. You need to download your chosen model and place it in the `model/` folder in the root directory of your DocsGPT project. For example, to use the `all-mpnet-base-v2` model, you would set `EMBEDDINGS_NAME` as described below, and ensure that the model files are available locally (DocsGPT will attempt to download it if it's not found, but local download is recommended for development and offline use). 2. **Set `EMBEDDINGS_NAME` in `.env` (or `settings.py`):** You need to configure the `EMBEDDINGS_NAME` setting in your `.env` file (or `settings.py`) to point to the desired Sentence Transformer model. * **Using a pre-downloaded model from `model/` folder:** You can specify a path to the downloaded model within the `model/` directory. 
For instance, if you downloaded `all-mpnet-base-v2` and it's in `model/all-mpnet-base-v2`, you could use a value like the following (though referring to the model by its name is usually sufficient):
================================================ FILE: docs/content/Models/local-inference.mdx ================================================ --- title: Connecting DocsGPT to Local Inference Engines description: Connect DocsGPT to local inference engines for running LLMs directly on your hardware. --- # Connecting DocsGPT to Local Inference Engines DocsGPT can be configured to leverage local inference engines, allowing you to run Large Language Models directly on your own infrastructure. This approach offers enhanced privacy and control over your LLM processing. Currently, DocsGPT primarily supports local inference engines that are compatible with the OpenAI API format. This means you can connect DocsGPT to various local LLM servers that mimic the OpenAI API structure. ## Configuration via `.env` file Setting up a local inference engine with DocsGPT is configured through environment variables in the `.env` file. For a detailed explanation of all settings, please consult the [DocsGPT Settings Guide](/Deploying/DocsGPT-Settings). To connect to a local inference engine, you will generally need to configure these settings in your `.env` file: * **`LLM_PROVIDER`**: Crucially set this to `openai`. This tells DocsGPT to use the OpenAI-compatible API format for communication, even though the LLM is local. * **`LLM_NAME`**: Specify the model name as recognized by your local inference engine. This might be a model identifier or left as `None` if the engine doesn't require explicit model naming in the API request. * **`OPENAI_BASE_URL`**: This is essential. Set this to the base URL of your local inference engine's API endpoint. This tells DocsGPT where to find your local LLM server. * **`API_KEY`**: Generally, for local inference engines, you can set `API_KEY=None` as authentication is usually not required in local setups. ## Native llama.cpp Support DocsGPT includes native support for llama.cpp without requiring an OpenAI-compatible server. 
To use this: ``` LLM_PROVIDER=llama.cpp LLM_NAME=your-model-name ``` This provider integrates directly with llama.cpp Python bindings. ## Supported Local Inference Engines (OpenAI API Compatible) DocsGPT is also readily configurable to work with the following local inference engines, all communicating via the OpenAI API format. Here are example `OPENAI_BASE_URL` values for each, based on default setups: | Inference Engine | `LLM_PROVIDER` | `OPENAI_BASE_URL` | | :---------------------------- | :------------- | :------------------------- | | LLaMa.cpp (server mode) | `openai` | `http://localhost:8000/v1` | | Ollama | `openai` | `http://localhost:11434/v1` | | Text Generation Inference (TGI)| `openai` | `http://localhost:8080/v1` | | SGLang | `openai` | `http://localhost:30000/v1` | | vLLM | `openai` | `http://localhost:8000/v1` | | Aphrodite | `openai` | `http://localhost:2242/v1` | | FriendliAI | `openai` | `http://localhost:8997/v1` | | LMDeploy | `openai` | `http://localhost:23333/v1` | **Important Note on `localhost` vs `host.docker.internal`:** The `OPENAI_BASE_URL` examples above use `http://localhost`. If you are running DocsGPT within Docker and your local inference engine is running on your host machine (outside of Docker), you will likely need to replace `localhost` with `http://host.docker.internal` to ensure Docker can correctly access your host's services. For example, `http://host.docker.internal:11434/v1` for Ollama. ## How the Model Registry Works DocsGPT uses a **Model Registry** to automatically detect and register available models based on your environment configuration. Understanding this system helps you configure models correctly. 
### Automatic Model Detection When DocsGPT starts, the Model Registry scans your environment variables and automatically registers models from providers that have valid API keys configured: | Environment Variable | Provider Models Registered | | :--------------------- | :------------------------- | | `OPENAI_API_KEY` | OpenAI models (gpt-5.1, gpt-5-mini, etc.) | | `ANTHROPIC_API_KEY` | Anthropic models (Claude family) | | `GOOGLE_API_KEY` | Google models (Gemini family) | | `GROQ_API_KEY` | Groq models (Llama, Mixtral) | | `HUGGINGFACE_API_KEY` | HuggingFace models | You can also use the generic `API_KEY` variable with `LLM_PROVIDER` to configure a single provider. ### Custom OpenAI-Compatible Models When you set `OPENAI_BASE_URL` along with `LLM_PROVIDER=openai` and `LLM_NAME`, the registry automatically creates a custom model entry pointing to your local inference server. This is how local engines like Ollama, vLLM, and others get registered. ### Default Model Selection The registry determines the default model in this priority order: 1. If `LLM_NAME` is set and matches a registered model, that model becomes the default 2. Otherwise, the first model from the configured `LLM_PROVIDER` is selected 3. If neither is set, the first available model in the registry is used ### Multiple Providers You can configure multiple API keys simultaneously (e.g., both `OPENAI_API_KEY` and `ANTHROPIC_API_KEY`). The registry will load models from all configured providers, giving users the ability to switch between them in the UI. ## Adding Support for Other Local Engines While DocsGPT currently focuses on OpenAI API compatible local engines, you can extend its capabilities to support other local inference solutions. To do this, navigate to the `application/llm` directory in the DocsGPT repository. Examine the existing Python files for examples of LLM integrations. 
You can create a new module for your desired local engine, and then register it in the `llm_creator.py` file within the same directory. This allows for custom integration with a wide range of local LLM servers beyond those listed above. ================================================ FILE: docs/content/Tools/_meta.js ================================================ export default { "basics": { "title": "🔧 Tools Basics", "href": "/Tools/basics" }, "api-tool": { "title": "🗝️ API Tool", "href": "/Tools/api-tool" }, "creating-a-tool": { "title": "🛠️ Creating a Custom Tool", "href": "/Tools/creating-a-tool" } } ================================================ FILE: docs/content/Tools/api-tool.mdx ================================================ --- title: 🗝️ Generic API Tool description: Learn how to configure and use the API Tool in DocsGPT to connect with any RESTful API without writing custom code. --- import { Callout } from 'nextra/components'; import Image from 'next/image'; # Using the Generic API Tool The API Tool provides a no-code/low-code solution to make DocsGPT interact with third-party or internal RESTful APIs. It acts as a bridge, allowing the Large Language Model (LLM) to leverage external services based on your chat interactions. This guide will walk you through its capabilities, configuration, and best practices. ## Introduction to the Generic API Tool **When to Use It:** * Ideal for quickly integrating existing APIs where the interaction involves standard HTTP requests (GET, POST, PUT, DELETE). * Suitable for fetching data to enrich answers (e.g., current weather, stock prices, product details). * Useful for triggering simple actions in other systems (e.g., sending a notification, creating a basic task). **Contrast with Custom Python Tools:** * **API Tool:** Best for straightforward API calls. Configuration is done through the DocsGPT UI. 
* **Custom Python Tools:** Preferable when you need complex logic before or after the API call, handle non-standard authentication (like complex OAuth flows), manage multi-step API interactions, or require intricate data processing not easily managed by the LLM alone. See [Creating a Custom Tool](/Tools/creating-a-tool) for more. ## Capabilities of the API Tool **Supported HTTP Methods:** You can configure actions using standard HTTP methods such as: * `GET`: To retrieve data. * `POST`: To submit data to create a new resource. * `PUT`: To update an existing resource. * `DELETE`: To remove a resource. **Request Configuration:** * **Headers:** Define static or dynamic HTTP headers for authentication (e.g., API keys), content type specification, etc. * **Query Parameters:** Specify URL query parameters, which can be static or dynamically filled by the LLM based on user input. * **Request Body:** Define the structure of the request body (e.g., JSON), with fields that can be static or dynamically populated by the LLM. **Response Handling:** * The API Tool executes the request and receives the raw response from the API (typically JSON or plain text). * This raw response is then passed back to the LLM. * The LLM uses this response, along with the context of your query and the description of the API tool action, to formulate an answer or decide on follow-up actions. The API tool itself doesn't deeply parse or transform the response beyond basic content type detection (e.g., loading JSON into a parsable object). ## Configuring an API as a Tool You can configure the API Tool through the DocsGPT user interface, found in **Settings -> Tools**. When you add or modify an API Tool, you'll define specific actions that DocsGPT can perform. The configuration involves defining how DocsGPT should call an API endpoint. Each configured API call essentially becomes a distinct "action" the LLM can choose to use. 
Below is an example of how you might configure an API action, inspired by setting up a phone number validation service: API Tool configuration example for phone validation _Figure 1: Example configuration for an API Tool action to validate phone numbers._ **Defining an API Endpoint/Action:** When you configure a new API action, you'll fill in the following fields: - **`Name`:** A user-friendly name for this specific API action (e.g., "Phone-check" as in the image, or more specific like "ValidateUSPhoneNumber"). This helps in managing your tools. - **`Description`:** This is a **critical field**. Provide a clear and concise description of what the API action does, what kind of input it expects (implicitly), and what kind of output it provides. The LLM uses this description to understand when and how to use this action. - **`URL`:** The full endpoint URL for the API request. - **`HTTP Method`:** Select the appropriate HTTP method (e.g., GET, POST) from a dropdown. - **`Headers`:** You can add custom HTTP headers as key-value pairs (Name, Value). Indicate if the value should be `Filled by LLM` or is static. If filled by LLM, provide a `Description` for the LLM. - **`Query Parameters`:** For `GET` requests or when parameters are sent in the URL. * **`Name`:** The name of the query parameter (e.g., `api_key`, `phone`). * **`Type`:** The data type of the parameter (e.g., `string`). * **`Filled by LLM` (Checkbox):** - **Unchecked (Static):** The `Value` you provide will be used for every call (e.g., for an `api_key` that doesn't change). - **Checked (Dynamic):** The LLM will extract the appropriate value from the user's chat query based on the `Description` you provide for this parameter. The `Value` field is typically left empty or contains a placeholder if `Filled by LLM` is checked. * `Description`: Context for the LLM if the parameter is to be filled dynamically, or for your own reference if static. * `Value`: The static value if not filled by LLM. 
- **`Request Body`:** Used to send data (commonly JSON) to the API. Similar to Query Parameters, you define fields with `Name`, `Type`, whether it's `Filled by LLM`, a `Description` for dynamic fields, and a static `Value` if applicable. **Response Handling Guidance for the LLM:** While the API Tool configuration UI doesn't have explicit fields for defining response parsing rules (like JSONPath extractors), you significantly influence how the LLM handles the response through: * **Tool Action `Description`:** Clearly state what kind of information the API returns (e.g., "This API returns a JSON object with 'status' and 'location' fields for the phone number."). This helps the LLM know what to look for in the API's output. * **Prompt Engineering:** For more complex scenarios, you might need to adjust your global or agent-specific prompts to guide DocsGPT on how to interpret and present information from API tool responses. See [Customising Prompts](/Guides/Customising-prompts). ## Using the Configured API Tool in Chat Once an API action is configured and enabled, DocsGPT's LLM can decide to use it based on your natural language queries. **Example (based on the phone validation tool in Figure 1):** 1. **User Query:** "Hey DocsGPT, can you check if +14155555555 is a valid phone number?" 2. **DocsGPT (LLM Orchestration):** * The LLM analyzes the query. * It matches the intent ("check if ... is a valid phone number") with the description of the "Phone-check" API action. * It identifies `+14155555555` as the value for the `phone` parameter (which was marked as `Filled by LLM` with the description "Phone number to check"). * DocsGPT constructs the GET API request. 3. **API Tool Execution:** * The API Tool makes the HTTP GET request. 
* The external API (AbstractAPI) processes the request and returns a JSON response, e.g.: ```json { "phone": "+14155555555", "valid": true, "format": { "international": "+1 415-555-5555", "national": "(415) 555-5555" }, "country": { "code": "US", "name": "United States", "prefix": "+1" }, "location": "California", "type": "Landline" } ``` 4. **DocsGPT Response Formulation:** * The API Tool passes this JSON response back to the LLM. * The LLM, guided by the tool's description and the user's original query, extracts relevant information and formulates a user-friendly answer. * **DocsGPT Chat Response:** "Yes, +14155555555 appears to be a valid landline phone number in California, United States." ## Advanced Tips and Best Practices **Clear Description is the Key:** The LLM relies heavily on the `Description` field of the API action and its parameters. Make them unambiguous and action-oriented. Clearly state what the tool does and what kind of input it expects (even if implicitly through parameter descriptions). **Iterative Testing:** After configuring an API tool, test it with various phrasings of user queries to ensure the LLM triggers it correctly and interprets the response as expected. **Error Handling:** * If an API call fails, the API Tool will return an error message and status code from the `requests` library or the API itself. The LLM may relay this error or try to explain it. * Check DocsGPT's backend logs for more detailed error information if you encounter issues. **Security Considerations:** * **API Keys:** Be mindful of API keys and other sensitive credentials. The example image shows an API key directly in the configuration. For production or shared environments avoid exposing configurations with sensitive keys. * **Rate Limits:** Be aware of the rate limits of the APIs you are integrating. Frequent calls from DocsGPT could exceed these limits. * **Data Privacy:** Consider the data privacy implications of sending user query data to third-party APIs. 
- **Idempotency:** For tools that modify data (POST, PUT, DELETE), be aware of whether the API operations are idempotent to avoid unintended consequences from repeated calls if the LLM retries an action. ## Limitations While powerful, the Generic API Tool has some limitations: - **Complex Authentication:** Advanced authentication flows like OAuth 2.0 (especially 3-legged OAuth requiring user redirection) or custom signature-based authentication often require custom Python tools. - **Multi-Step API Interactions:** If a task requires multiple API calls that depend on each other (e.g., fetch a list, then for each item, fetch details), this kind of complex chaining and logic is better handled by a custom Python tool. - **Complex Data Transformations:** If the API response needs significant transformation or processing before being useful to the LLM, a custom Python tool offers more flexibility. - **Real-time Streaming (SSE, WebSockets):** The tool is designed for request-response interactions, not for maintaining persistent streaming connections. For scenarios that exceed these limitations, developing a [Custom Python Tool](/Tools/creating-a-tool) is the recommended approach. ================================================ FILE: docs/content/Tools/basics.mdx ================================================ --- title: Tools Basics - Enhancing DocsGPT Capabilities description: Understand what DocsGPT Tools are, how they work, and explore the built-in tools available to extend DocsGPT's functionality. --- import { Callout } from 'nextra/components'; import Image from 'next/image'; import { ToolCards } from '../../components/ToolCards'; # Understanding DocsGPT Tools DocsGPT Tools are powerful extensions that significantly enhance the capabilities of your DocsGPT application. 
They allow DocsGPT to move beyond its core function of retrieving information from your documents and enable it to perform actions, interact with external data sources, and integrate with other services. You can find and configure available tools within the "Tools" section of the DocsGPT application settings in the user interface. ## What are Tools? - **Purpose:** The primary purpose of Tools is to bridge the gap between understanding a user's request (natural language processing by the LLM) and executing a tangible action. This could involve fetching live data from the web, sending notifications, running code snippets, querying databases, or interacting with third-party APIs. - **LLM as an Orchestrator:** The Large Language Model (LLM) at the heart of DocsGPT is designed to act as an intelligent orchestrator. Based on your query and the declared capabilities of the available tools (defined in their metadata), the LLM decides if a tool is needed, which tool to use, and what parameters to pass to it. - **Action-Oriented Interactions:** Tools enable more dynamic and action-oriented interactions. For example: * *"What's the latest news on renewable energy?"* - This might trigger a web search tool to fetch current articles. * *"Fetch the order status for customer ID 12345 from our database."* - This could use a database tool. * *"Summarize the content of this webpage and send the summary to the #general channel on Telegram."* - This might involve a web scraping tool followed by a Telegram notification tool. ## Overview of Built-in Tools DocsGPT includes a suite of pre-built tools designed to expand its capabilities out-of-the-box. Below is an overview of the currently available tools. ## Using Tools in DocsGPT (User Perspective) Interacting with tools in DocsGPT is designed to be intuitive: 1. **Natural Language Interaction:** As a user, you typically interact with DocsGPT using natural language queries or commands. 
The LLM within DocsGPT analyzes your input to determine if a specific task can or should be handled by one of the available and configured tools. 2. **Configuration in UI:** * Tools are generally managed and configured within the DocsGPT application's settings, found under a "Tools" section in the GUI. * For tools that interact with external services (like Brave Search, Telegram, or any service via the API Tool), you might need to provide authentication credentials (e.g., API keys, tokens) or specific endpoint information during the tool's setup in the UI. 3. **Prompt Engineering for Tools:** While the LLM aims to intelligently use tools, for more complex or reliable agent-like behaviors, you might need to customize the system prompts. Modifying the prompt can guide the LLM on when and how to prioritize or chain tools to achieve specific outcomes, especially if you're building an agent designed to perform a certain sequence of actions every time. For more on this, see [Customising Prompts](/Guides/Customising-prompts). ## Advancing with Tools Understanding the basics of DocsGPT Tools opens up many possibilities: * **Leverage the API Tool:** For quick integrations with numerous external services, explore the [API Tool Detailed Guide](/Tools/api-tool). * **Develop Custom Tools:** If you have specific needs not covered by built-in tools or the generic API tool, you can develop your own. See our guide on [Creating a Custom Tool](/Tools/creating-a-tool). * **Build AI Agents:** Tools are the fundamental building blocks for creating sophisticated AI agents within DocsGPT. Explore how these can be combined in the [Agents documentation](/Agents/basics). By harnessing the power of Tools, you can transform DocsGPT into a more versatile and proactive assistant tailored to your unique workflows.
================================================ FILE: docs/content/Tools/creating-a-tool.mdx ================================================ --- title: 🛠️ Creating a Custom Tool description: Learn how to create custom Python tools to extend DocsGPT's functionality and integrate with various services or perform specific actions. --- import { Callout } from 'nextra/components'; import { Steps } from 'nextra/components'; # 🛠️ Creating a Custom Python Tool This guide provides developers with a comprehensive, step-by-step approach to creating their own custom tools for DocsGPT. By developing custom tools, you can significantly extend DocsGPT's capabilities, enabling it to interact with new data sources, services, and perform specialized actions tailored to your unique needs. ## Introduction to Custom Tool Development ### Why Create Custom Tools? While DocsGPT offers a range of built-in tools and a versatile API Tool, there are many scenarios where a custom Python tool is the best solution: * **Integrating with Proprietary Systems:** Connect to internal APIs, databases, or services that are not publicly accessible or require complex authentication. * **Adding Domain-Specific Functionalities:** Implement logic specific to your industry or use case that isn't covered by general-purpose tools. * **Automating Unique Workflows:** Create tools that orchestrate multiple steps or interact with systems in a way unique to your operational needs. * **Connecting to Any System with an Accessible Interface:** If you can interact with a system programmatically using Python (e.g., through libraries, SDKs, or direct HTTP requests), you can likely build a DocsGPT tool for it. * **Complex Logic or Data Transformation:** When API interactions require intricate logic before sending a request or after receiving a response, or when data needs significant transformation that is difficult for an LLM to handle directly. 
### Prerequisites Before you begin, ensure you have: * A solid understanding of Python programming. * Familiarity with the DocsGPT project structure, particularly the `application/agents/tools/` directory where custom tools reside. * Basic knowledge of how APIs work, as many tools involve interacting with external or internal APIs. * Your DocsGPT development environment set up. If not, please refer to the [Setting Up a Development Environment](/Deploying/Development-Environment) guide. ## The Anatomy of a DocsGPT Tool Custom tools in DocsGPT are Python classes that inherit from a base `Tool` class and implement specific methods to define their behavior, capabilities, and configuration needs. The **foundation** for all custom tools is the abstract base class, located in `application/agents/tools/base.py`. Your custom tool class **must** inherit from this class. ### Essential Methods to Implement Your custom tool class needs to implement the following methods: 1. **`__init__(self, config: dict)`** - **Purpose:** The constructor for your tool. It's called when DocsGPT initializes the tool. - **Usage:** This method is typically used to receive and store tool-specific configurations passed via the `config` dictionary. This dictionary is populated based on the tool's settings, often configured through the DocsGPT UI or environment variables. For example, you would store API keys, base URLs, or database connection strings here. - **Example** (`brave.py`)**:** ``` python class BraveSearchTool(Tool): def __init__(self, config): self.config = config self.token = config.get("token", "") # API Key for Brave Search self.base_url = "https://api.search.brave.com/res/v1" ``` 2. **`execute_action(self, action_name: str, **kwargs) -> dict`** - **Purpose:** This is the workhorse of your tool. The LLM, acting as an agent, calls this method when it decides to use one of the actions your tool provides. 
- **Parameters:** - `action_name` (str): A string specifying which of the tool's actions to run (e.g., "brave_web_search"). - `**kwargs` (dict): A dictionary containing the parameters for that specific action. These parameters are defined in the tool's metadata (`get_actions_metadata()`) and are extracted or inferred by the LLM from the user's query. - **Return Value:** A dictionary containing the result of the action. It's good practice to include keys like: - `status_code` (int): An HTTP-like status code (e.g., 200 for success, 500 for error). - `message` (str): A human-readable message describing the outcome. - `data` (any): The actual data payload returned by the action (if applicable). - `error` (str): An error message if the action failed. - **Example (`read_webpage.py`):** ``` python def execute_action(self, action_name: str, **kwargs) -> str: if action_name != "read_webpage": return f"Error: Unknown action '{action_name}'. This tool only supports 'read_webpage'." url = kwargs.get("url") if not url: return "Error: URL parameter is missing." # ... (logic to fetch and parse webpage) ... try: # ... return markdown_content except Exception as e: return f"Error processing URL {url}: {e}" ``` A more structured return: ``` python # ... inside execute_action try: # ... logic ... return {"status_code": 200, "message": "Webpage read successfully", "data": markdown_content} except Exception as e: return {"status_code": 500, "message": f"Error processing URL {url}", "error": str(e)} ``` 3. **`get_actions_metadata(self) -> list`** - **Purpose:** This method is **critical** for the LLM to understand what your tool can do, when to use it, and what parameters it needs. It effectively advertises your tool's capabilities. - **Return Value:** A list of dictionaries. Each dictionary describes one distinct action the tool can perform and must follow a specific JSON schema structure. - `name` (str): A unique and descriptive name for the action (e.g., `mytool_get_user_details`). 
It's a common convention to prefix with the tool name to avoid collisions. - `description` (str): A clear, concise, and unambiguous description of what the action does. **Write this for the LLM.** The LLM uses this description to decide if this action is appropriate for a given user query. - `parameters` (dict): A JSON Schema object defining the parameters that the action expects. This schema tells the LLM what arguments are needed, their types, and which are required. - `type`: Should always be `"object"`. - `properties`: A dictionary where each key is a parameter name, and the value is an object defining its `type` (e.g., "string", "integer", "boolean") and `description`. - `required`: A list of strings, where each string is the name of a parameter that is mandatory for the action. - **Example (`postgres.py` - partial):** ``` python def get_actions_metadata(self): return [ { "name": "postgres_execute_sql", "description": "Execute an SQL query against the PostgreSQL database...", "parameters": { "type": "object", "properties": { "sql_query": { "type": "string", "description": "The SQL query to execute.", }, }, "required": ["sql_query"], "additionalProperties": False, # Good practice to prevent unexpected params }, }, # ... other actions like postgres_get_schema ] ``` 4. **`get_config_requirements(self) -> dict`** - **Purpose:** Defines the configuration parameters that your tool needs to function (e.g., API keys, specific base URLs, connection strings, default settings). This information can be used by the DocsGPT UI to dynamically render configuration fields for your tool or for validation. - **Return Value:** A dictionary where keys are the configuration item names (which will be keys in the `config` dict passed to `__init__`) and values are dictionaries describing each requirement: - `type` (str): The expected data type of the config value (e.g., "string", "boolean", "integer"). 
- `description` (str): A human-readable description of what this configuration item is for. - `secret` (bool, optional): Set to `True` if the value is sensitive (e.g., an API key) and should be masked or handled specially in UIs. Defaults to `False`. - **Example (`brave.py`):** ``` python def get_config_requirements(self): return { "token": { # This 'token' will be a key in the config dict for __init__ "type": "string", "description": "Brave Search API key for authentication", "secret": True }, } ``` ## Tool Registration and Discovery DocsGPT's `ToolManager` (located in `application/agents/tools/tool_manager.py`) automatically discovers and loads tools. As long as your custom tool: 1. Is placed in a Python file within the `application/agents/tools/` directory (the filename must not be `base.py` and must not start with `__`). 2. Correctly inherits from the `Tool` base class. 3. Implements all the abstract methods (`execute_action`, `get_actions_metadata`, `get_config_requirements`). The `ToolManager` should be able to load it when DocsGPT starts. ## Configuration & Secrets Management - **Configuration Source:** The `config` dictionary passed to your tool's `__init__` method is typically populated from settings defined in the DocsGPT UI (if available for the tool) or from environment variables/configuration files that DocsGPT loads (see [⚙️ App Configuration](/Deploying/DocsGPT-Settings)). The keys in this dictionary should match the names you define in `get_config_requirements()`. - **Secrets:** Never hardcode secrets (like API keys or passwords) directly into your tool's Python code. Instead, define them as configuration requirements (using `secret: True` in `get_config_requirements()`) and let DocsGPT's configuration system inject them via the `config` dictionary at runtime. This ensures that secrets are managed securely and are not exposed in your codebase.
## Best Practices for Tool Development - **Atomicity:** Design tool actions to be as atomic (single, well-defined purpose) as possible. This makes them easier for the LLM to understand and combine. - **Clarity in Metadata:** Ensure action names and descriptions in `get_actions_metadata()` are extremely clear, specific, and unambiguous. This is the primary way the LLM understands your tool. - **Robust Error Handling:** Implement comprehensive error handling within your `execute_action` logic (and the private methods it calls). Return informative error messages in the result dictionary so the LLM or user can understand what went wrong. - **Security:** - Be mindful of the security implications of your tool, especially if it interacts with sensitive systems or can execute arbitrary code/queries. - Validate and sanitize any inputs, especially if they are used to construct database queries or shell commands, to prevent injection attacks. - **Performance:** Consider the performance implications of your tool's actions. If an action is slow, it will impact the user experience. Optimize where possible. ## (Optional) Contributing Your Tool If you develop a custom tool that you believe could be valuable to the broader DocsGPT community and is general-purpose: 1. Ensure it's well-documented (both in code and with clear metadata). 2. Make sure it adheres to the best practices outlined above. 3. Consider opening a Pull Request to the [DocsGPT GitHub repository](https://github.com/arc53/DocsGPT) with your new tool, including any necessary documentation updates. By following this guide, you can create powerful custom tools that extend DocsGPT's capabilities to your specific operational environment. 
================================================ FILE: docs/content/_meta.js ================================================ export default { "index": "Home", "quickstart": "Quickstart", "Deploying": "Deploying", "Models": "Models", "Tools": "Tools", "Agents": "Agents", "Extensions": "Extensions", "https://gptcloud.arc53.com/": { "title": "API", "href": "https://gptcloud.arc53.com/" }, "Guides": "Guides", "changelog": { "title": "Changelog", "display": "hidden" } } ================================================ FILE: docs/content/changelog.mdx ================================================ --- title: 'Changelog' --- ================================================ FILE: docs/content/index.mdx ================================================ --- title: 'Home' description: Documentation of DocsGPT - quickstart, deployment guides, model configuration, and widget integration documentation. --- import { Cards } from 'nextra/components' import Image from 'next/image' export const allGuides = { "quickstart": { "title": "⚡️ Quickstart", "href": "/quickstart" }, "DocsGPT-Settings": { "title": "⚙️ App Configuration", "href": "/Deploying/DocsGPT-Settings" }, "Docker-Deploying": { "title": "🛳️ Docker Setup", "href": "/Deploying/Docker-Deploying" }, "Development-Environment": { "title": "🛠️Development Environment", "href": "/Deploying/Development-Environment" }, "https://gptcloud.arc53.com/": { "title": "🧑‍💻️ API", "href": "https://gptcloud.arc53.com/", "newWindow": true }, "cloud-providers": { "title": "☁️ Cloud Providers", "href": "/Models/cloud-providers" }, "local-inference": { "title": "🖥️ Local Inference", "href": "/Models/local-inference" }, "embeddings": { "title": "📝 Embeddings", "href": "/Models/embeddings" }, "api-key-guide": { "title": "🔑 Getting API key", "href": "/Extensions/api-key-guide" }, "chat-widget": { "title": "💬️ Chat Widget", "href": "/Extensions/chat-widget" }, "search-widget": { "title": "🔎 Search Widget", "href": "/Extensions/search-widget" }, 
"Customising-prompts": { "title": "️💻 Customising Prompts", "href": "/Guides/Customising-prompts" } }; # **DocsGPT 🦖** DocsGPT is an open-source genAI tool that helps users get reliable answers from any knowledge source, while avoiding hallucinations. It enables quick and reliable information retrieval, with tooling and agentic system capability built in, including speech-to-text workflows for chat and audio knowledge ingestion. Try it yourself: [https://www.docsgpt.cloud/](https://www.docsgpt.cloud/) ### Features: - **🗂️ Wide Format Support:** Reads PDF, DOCX, CSV, XLSX, EPUB, MD, RST, HTML, MDX, JSON, PPTX, images, and audio files such as MP3, WAV, M4A, OGG, and WebM. - **🎙️ Speech Workflows:** Record voice input into chat, transcribe on the backend, and index uploaded audio files as searchable source material. - **🌐 Web & Data Integration:** Ingests from URLs, sitemaps, Reddit, GitHub and web crawlers. - **✅ Reliable Answers:** Get accurate, hallucination-free responses with source citations viewable in a clean UI. - **🔑 Streamlined API Keys:** Generate keys linked to your settings, documents, and models, simplifying chatbot and integration setup. - **🔗 Actionable Tooling:** Connect to APIs, tools, and other services to enable LLM actions. - **🧩 Pre-built Integrations:** Use readily available HTML/React chat widgets, search tools, Discord/Telegram bots, and more. - **🔌 Flexible Deployment:** Works with major LLMs (OpenAI, Google, Anthropic) and local models (Ollama, llama_cpp). - **🏢 Secure & Scalable:** Run privately and securely with Kubernetes support, designed for enterprise-grade reliability. **Contribute and Extend:** As an open-source project, community contributions are highly encouraged! If you develop valuable customizations or enhancements, consider contributing them back to the main repository to benefit other DocsGPT users. 
( ))} /> ================================================ FILE: docs/content/quickstart.mdx ================================================ --- title: Quickstart - Launching DocsGPT Web App description: Get started with DocsGPT quickly by launching the web application using the setup script. --- # Quickstart **Prerequisites:** * **Docker:** Ensure you have Docker installed and running on your system. ## Launching DocsGPT (macOS and Linux) The easiest way to launch DocsGPT is using the provided `setup.sh` script. This script automates the configuration process and offers several setup options. **Steps:** 1. **Download the DocsGPT Repository:** First, you need to download the DocsGPT repository to your local machine. You can do this using Git: ```bash git clone https://github.com/arc53/DocsGPT.git cd DocsGPT ``` 2. **Run the `setup.sh` script:** Navigate to the DocsGPT directory in your terminal and execute the `setup.sh` script: ```bash ./setup.sh ``` 3. **Follow the interactive setup:** The `setup.sh` script will guide you through an interactive menu with the following options: ``` Welcome to DocsGPT Setup! How would you like to proceed? 1) Use DocsGPT Public API Endpoint (simple and free) 2) Serve Local (with Ollama) 3) Connect Local Inference Engine 4) Connect Cloud API Provider 5) Advanced: Build images locally (for developers) Choose option (1-5): ``` Let's break down each option: * **1) Use DocsGPT Public API Endpoint (simple and free):** This is the simplest option to get started. It utilizes the DocsGPT public API, requiring no API keys or local model downloads. Choose this for a quick and easy setup. * **2) Serve Local (with Ollama):** This option allows you to run a Large Language Model locally using [Ollama](https://ollama.com/). You'll be prompted to choose between CPU or GPU for Ollama and select a model to download. This is a good option for local processing and experimentation. 
* **3) Connect Local Inference Engine:** If you are already running a local inference engine like Llama.cpp, Text Generation Inference (TGI), vLLM, or others, choose this option. You'll be asked to select your engine and provide the necessary connection details. This is for users with existing local LLM infrastructure. * **4) Connect Cloud API Provider:** This option lets you connect DocsGPT to a commercial Cloud API provider such as OpenAI, Google (Vertex AI/Gemini), Anthropic (Claude), Groq, HuggingFace Inference API, or Azure OpenAI. You will need an API key from your chosen provider. Select this if you prefer to use a powerful cloud-based LLM. * **5) Modify DocsGPT's source code and rebuild the Docker images locally.** Instead of pulling prebuilt images from Docker Hub or using the hosted/public API, you build the entire backend and frontend from source, customizing how DocsGPT works internally, or run it in an environment without internet access. After selecting an option and providing any required information (like API keys or model names), the script will configure your `.env` file and start DocsGPT using Docker Compose. 4. **Access DocsGPT in your browser:** Once the setup is complete and Docker containers are running, navigate to [http://localhost:5173/](http://localhost:5173/) in your web browser to access the DocsGPT web application. 5. **Stopping DocsGPT:** To stop DocsGPT, simply open a new terminal in the `DocsGPT` directory and run: ```bash docker compose -f deployment/docker-compose.yaml down ``` (or the specific `docker compose` command shown at the end of the `setup.sh` execution, which may include optional compose files depending on your choices). ## Launching DocsGPT (Windows) For Windows users, we provide a PowerShell script that offers the same functionality as the macOS/Linux setup script. **Steps:** 1. **Download the DocsGPT Repository:** First, you need to download the DocsGPT repository to your local machine. 
You can do this using Git: ```powershell git clone https://github.com/arc53/DocsGPT.git cd DocsGPT ``` 2. **Run the `setup.ps1` script:** Execute the PowerShell setup script: ```powershell PowerShell -ExecutionPolicy Bypass -File .\setup.ps1 ``` 3. **Follow the interactive setup:** Just like the Linux/macOS script, the PowerShell script will guide you through setting up DocsGPT. The script will handle environment configuration and start DocsGPT based on your selections. 4. **Access DocsGPT in your browser:** Once the setup is complete and Docker containers are running, navigate to [http://localhost:5173/](http://localhost:5173/) in your web browser to access the DocsGPT web application. 5. **Stopping DocsGPT:** To stop DocsGPT, run the Docker Compose down command displayed at the end of the setup script's execution. **Important for Windows:** Ensure Docker Desktop is installed and running correctly on your Windows system before proceeding. The script will attempt to start Docker if it's not running, but you may need to start it manually if there are issues. **Alternative Method:** If you prefer a more manual approach, you can follow our [Docker Deployment documentation](/Deploying/Docker-Deploying) for detailed instructions on setting up DocsGPT on Windows using Docker commands directly. ## Advanced Configuration For more advanced customization of DocsGPT settings, such as configuring vector stores, embedding models, and other parameters, please refer to the [DocsGPT Settings documentation](/Deploying/DocsGPT-Settings). This guide explains how to modify the `.env` file or `settings.py` for deeper configuration. Enjoy using DocsGPT!
================================================ FILE: docs/mdx-components.jsx ================================================ import { useMDXComponents as getThemeComponents } from 'nextra-theme-docs'; export function useMDXComponents(components) { return { ...getThemeComponents(), ...components, }; } ================================================ FILE: docs/next.config.js ================================================ const nextra = require('nextra').default; const withNextra = nextra({ defaultShowCopyCode: true, }); module.exports = withNextra({ reactStrictMode: true, }); ================================================ FILE: docs/package.json ================================================ { "scripts": { "dev": "next dev", "build": "next build", "postbuild": "pagefind --site .next/server/app --output-path public/_pagefind", "start": "next start" }, "license": "MIT", "dependencies": { "@vercel/analytics": "^1.1.1", "docsgpt-react": "^0.5.1", "next": "^15.5.9", "nextra": "^4.6.1", "nextra-theme-docs": "^4.6.1", "react": "^18.2.0", "react-dom": "^18.2.0" }, "devDependencies": { "pagefind": "^1.3.0", "typescript": "^5.9.3" } } ================================================ FILE: docs/public/favicons/site.webmanifest ================================================ { "name": "", "short_name": "", "icons": [ { "src": "/android-chrome-192x192.png", "sizes": "192x192", "type": "image/png" }, { "src": "/android-chrome-512x512.png", "sizes": "512x512", "type": "image/png" } ], "theme_color": "#ffffff", "background_color": "#ffffff", "display": "standalone" } ================================================ FILE: docs/public/llms.txt ================================================ # DocsGPT > DocsGPT is an open-source platform for building AI agents and assistants with document retrieval, tools, and multi-model support. This file is a curated map of DocsGPT documentation for LLM and agent use. Prioritize Core, Deploying, and Agents for implementation tasks. 
## Core - [Docs Home](https://docs.docsgpt.cloud/): Main documentation landing page. - [Quickstart](https://docs.docsgpt.cloud/quickstart): Fastest path to run DocsGPT locally. - [Architecture](https://docs.docsgpt.cloud/Guides/Architecture): High-level system architecture. - [Development Environment](https://docs.docsgpt.cloud/Deploying/Development-Environment): Backend and frontend local setup. - [DocsGPT Settings](https://docs.docsgpt.cloud/Deploying/DocsGPT-Settings): Environment variables and core app configuration. ## Deploying - [Docker Deployment](https://docs.docsgpt.cloud/Deploying/Docker-Deploying): Run DocsGPT with Docker and Docker Compose. - [Kubernetes Deployment](https://docs.docsgpt.cloud/Deploying/Kubernetes-Deploying): Deploy DocsGPT on Kubernetes clusters. - [Hosting DocsGPT](https://docs.docsgpt.cloud/Deploying/Hosting-the-app): Hosting overview with cloud options. ## Agents - [Agent Basics](https://docs.docsgpt.cloud/Agents/basics): Core concepts for building and managing agents. - [Workflow Nodes](https://docs.docsgpt.cloud/Agents/nodes): Node types and behavior in agent workflows. - [Agent API](https://docs.docsgpt.cloud/Agents/api): Programmatic agent interaction (streaming and non-streaming). - [Agent Webhooks](https://docs.docsgpt.cloud/Agents/webhooks): Trigger and automate agents with webhooks. ## Tools - [Tools Basics](https://docs.docsgpt.cloud/Tools/basics): How tools extend agent capabilities. - [Generic API Tool](https://docs.docsgpt.cloud/Tools/api-tool): Configure API calls without custom code. - [Creating a Custom Tool](https://docs.docsgpt.cloud/Tools/creating-a-tool): Build custom Python tools for DocsGPT. ## Models - [Cloud LLM Providers](https://docs.docsgpt.cloud/Models/cloud-providers): Configure hosted model providers. - [Local Inference](https://docs.docsgpt.cloud/Models/local-inference): Connect DocsGPT to local inference backends. 
- [Embeddings](https://docs.docsgpt.cloud/Models/embeddings): Select and configure embedding models. ## Extensions - [API Keys for Integrations](https://docs.docsgpt.cloud/Extensions/api-key-guide): Generate and use DocsGPT API keys. - [Chat Widget](https://docs.docsgpt.cloud/Extensions/chat-widget): Embed the DocsGPT chat widget. - [Search Widget](https://docs.docsgpt.cloud/Extensions/search-widget): Embed the DocsGPT search widget. - [Chrome Extension](https://docs.docsgpt.cloud/Extensions/Chrome-extension): Install and use the browser extension. - [Chatwoot Extension](https://docs.docsgpt.cloud/Extensions/Chatwoot-extension): Integrate DocsGPT with Chatwoot. ## Integrations - [Google Drive Connector](https://docs.docsgpt.cloud/Guides/Integrations/google-drive-connector): Ingest and sync files from Google Drive. ## Optional - [Customizing Prompts](https://docs.docsgpt.cloud/Guides/Customising-prompts): Template-based prompt customization. - [How to Train on Other Documentation](https://docs.docsgpt.cloud/Guides/How-to-train-on-other-documentation): Add additional documentation sources. - [Context Compression](https://docs.docsgpt.cloud/Guides/compression): Reduce context while preserving key information. - [OCR for Sources and Attachments](https://docs.docsgpt.cloud/Guides/ocr): OCR behavior for ingestion and chat uploads. - [How to Use Different LLMs](https://docs.docsgpt.cloud/Guides/How-to-use-different-LLM): Additional model-selection guidance. - [Avoiding Hallucinations](https://docs.docsgpt.cloud/Guides/My-AI-answers-questions-using-external-knowledge): Improve answer grounding with external knowledge. - [Amazon Lightsail Deployment](https://docs.docsgpt.cloud/Deploying/Amazon-Lightsail): Deploy DocsGPT on AWS Lightsail. - [Railway Deployment](https://docs.docsgpt.cloud/Deploying/Railway): Deploy DocsGPT on Railway. - [Changelog](https://docs.docsgpt.cloud/changelog): Project release history. 
"""Chatwoot -> DocsGPT webhook bridge.

Exposes a single Flask endpoint (``POST /docsgpt``) meant to be registered as a
Chatwoot webhook. Incoming messages are forwarded to the DocsGPT
``/api/answer`` endpoint and the answer is posted back into the same Chatwoot
conversation.

Configuration is read from the environment (optionally via a ``.env`` file):
``docsgpt_url``, ``chatwoot_url``, ``docsgpt_key``, ``chatwoot_token``.
"""
import os
import pprint

import dotenv
import requests
from flask import Flask, request

dotenv.load_dotenv()
docsgpt_url = os.getenv("docsgpt_url")
chatwoot_url = os.getenv("chatwoot_url")
docsgpt_key = os.getenv("docsgpt_key")
chatwoot_token = os.getenv("chatwoot_token")
# account_id = os.getenv("account_id")
# assignee_id = os.getenv("assignee_id")

# Conversations carrying this label are left to a human agent.
label_stop = "human-requested"

# Seconds to wait on outbound HTTP calls. ``requests`` applies no timeout by
# default, so without this a stalled upstream would block the worker forever.
# NOTE(review): 60s is a guess that leaves room for slow LLM answers - tune as needed.
REQUEST_TIMEOUT = 60


def send_to_bot(sender, message):
    """Ask DocsGPT to answer ``message`` and return the answer text.

    Args:
        sender: Identifier of the Chatwoot contact, forwarded as ``sender``.
        message: The question text, forwarded as ``question``.

    Returns:
        The value of the ``answer`` field of the DocsGPT JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response from DocsGPT.
        KeyError: If the response JSON has no ``answer`` field.
    """
    data = {
        'sender': sender,
        'question': message,
        'api_key': docsgpt_key,
        'embeddings_key': docsgpt_key,
        'history': ''
    }
    headers = {"Content-Type": "application/json",
               "Accept": "application/json"}

    r = requests.post(f'{docsgpt_url}/api/answer',
                      json=data, headers=headers, timeout=REQUEST_TIMEOUT)
    # Surface HTTP failures as such instead of an opaque KeyError on 'answer'.
    r.raise_for_status()
    return r.json()['answer']


def send_to_chatwoot(account, conversation, message):
    """Post ``message`` into a Chatwoot conversation.

    Args:
        account: Chatwoot account id.
        conversation: Chatwoot conversation id.
        message: Text to post as the message content.

    Returns:
        The decoded JSON body of Chatwoot's response (returned as-is, also for
        error responses, matching the previous behaviour).

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    data = {
        'content': message
    }
    url = f"{chatwoot_url}/api/v1/accounts/{account}/conversations/{conversation}/messages"
    headers = {"Content-Type": "application/json",
               "Accept": "application/json",
               "api_access_token": f"{chatwoot_token}"}
    r = requests.post(url, json=data, headers=headers, timeout=REQUEST_TIMEOUT)
    return r.json()


app = Flask(__name__)


@app.route('/docsgpt', methods=['POST'])
def docsgpt():
    """Handle a Chatwoot webhook event.

    Returns a short status string when the event is ignored ("Not a message",
    "Label stop", "Not an incoming message"); otherwise returns Chatwoot's JSON
    response to the message that was posted with the bot's answer.
    """
    data = request.get_json()
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(data)

    # A body that is not a JSON object (null, list, ...) or has no
    # message_type is not a message event; previously a non-dict body raised
    # TypeError, which the KeyError handler did not catch.
    if not isinstance(data, dict) or 'message_type' not in data:
        return "Not a message"
    message_type = data['message_type']

    conversation = data['conversation']
    labels = conversation.get('labels') or []
    account = data['account']['id']
    # meta.assignee may be null or absent (presumably for unassigned
    # conversations); it is only used for the debug output below, so it must
    # not be allowed to crash the handler.
    assignee = ((conversation.get('meta') or {}).get('assignee') or {}).get('id')

    print(account)
    print(label_stop)
    print(labels)
    print(assignee)

    if label_stop in labels:
        return "Label stop"
    # elif str(account) != str(account_id):
    #     return "Not the right account"
    # elif str(assignee) != str(assignee_id):
    #     return "Not the right assignee"

    if message_type != "incoming":
        return "Not an incoming message"

    bot_response = send_to_bot(data['sender']['id'], data['content'])
    return send_to_chatwoot(account, conversation['id'], bot_response)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=80)
,"description":"introduce the basic idea." } ,"l10nProd": { "message":"You are planning to allow localization, right? You have no idea who will be using your extension! You have no idea who will be translating it! At least support the basics, it's not hard, and having the framework in place will let you transition much more easily later on." ,"description":"drive the point home. It's good for you." } ,"l10nFirstParagraph": { "message":"When the options page loads, elements decorated with data-l10n will automatically be localized!" ,"description":"inform that elements will be localized on load" } ,"l10nSecondParagraph": { "message":"If you need more complex localization, you can also define data-l10n-args. This should contain $containerType$ filled with $dataType$, which will be passed into Chrome's i18n API as $functionArgs$. In fact, this paragraph does just that, and wraps the args in mono-space font. Easy!" ,"description":"introduce the data-l10n-args attribute. End on a lame note." ,"placeholders": { "containerType": { "content":"$1" ,"example":"'array', 'list', or something similar" ,"description":"type of the args container" } ,"dataType": { "content":"$2" ,"example":"string" ,"description":"type of data in each array index" } ,"functionArgs": { "content":"$3" ,"example":"arguments" ,"description":"whatever you call what you pass into a function/method. args, params, etc." } } } ,"l10nThirdParagraph": { "message":"Message contents are passed right into innerHTML without processing - include any tags (or even scripts) that you feel like. If you have an input field, the placeholder will be set instead, and buttons will have the value attribute set." ,"description":"inform that we handle placeholders, buttons, and direct HTML input" } ,"l10nButtonsBefore": { "message":"Different types of buttons are handled as well. 
<button> elements have their html set:" } ,"l10nButton": { "message":"in a button" } ,"l10nButtonsBetween": { "message":"while <input type='submit'> and <input type='button'> get their 'value' set (note: no HTML):" } ,"l10nSubmit": { "message":"a submit value" } ,"l10nButtonsAfter": { "message":"Awesome, no?" } ,"l10nExtras": { "message":"You can even set data-l10n on things like the <title> tag, which lets you have translatable page titles, or fieldset <legend> tags, or anywhere else - the default Boil.localize() behavior will check every tag in the document, not just the body." ,"description":"inform about places which may not be obvious, like , etc" } } ================================================ FILE: extensions/chrome/dist/output.css ================================================ /* ! tailwindcss v3.2.7 | MIT License | https://tailwindcss.com */ /* 1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4) 2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116) */ *, ::before, ::after { box-sizing: border-box; /* 1 */ border-width: 0; /* 2 */ border-style: solid; /* 2 */ border-color: #e5e7eb; /* 2 */ } ::before, ::after { --tw-content: ''; } /* 1. Use a consistent sensible line-height in all browsers. 2. Prevent adjustments of font size after orientation changes in iOS. 3. Use a more readable tab size. 4. Use the user's configured `sans` font-family by default. 5. Use the user's configured `sans` font-feature-settings by default. */ html { line-height: 1.5; /* 1 */ -webkit-text-size-adjust: 100%; /* 2 */ -moz-tab-size: 4; /* 3 */ -o-tab-size: 4; tab-size: 4; /* 3 */ font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; /* 4 */ font-feature-settings: normal; /* 5 */ } /* 1. 
Remove the margin in all browsers. 2. Inherit line-height from `html` so users can set them as a class directly on the `html` element. */ body { margin: 0; /* 1 */ line-height: inherit; /* 2 */ } /* 1. Add the correct height in Firefox. 2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655) 3. Ensure horizontal rules are visible by default. */ hr { height: 0; /* 1 */ color: inherit; /* 2 */ border-top-width: 1px; /* 3 */ } /* Add the correct text decoration in Chrome, Edge, and Safari. */ abbr:where([title]) { -webkit-text-decoration: underline dotted; text-decoration: underline dotted; } /* Remove the default font size and weight for headings. */ h1, h2, h3, h4, h5, h6 { font-size: inherit; font-weight: inherit; } /* Reset links to optimize for opt-in styling instead of opt-out. */ a { color: inherit; text-decoration: inherit; } /* Add the correct font weight in Edge and Safari. */ b, strong { font-weight: bolder; } /* 1. Use the user's configured `mono` font family by default. 2. Correct the odd `em` font sizing in all browsers. */ code, kbd, samp, pre { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; /* 1 */ font-size: 1em; /* 2 */ } /* Add the correct font size in all browsers. */ small { font-size: 80%; } /* Prevent `sub` and `sup` elements from affecting the line height in all browsers. */ sub, sup { font-size: 75%; line-height: 0; position: relative; vertical-align: baseline; } sub { bottom: -0.25em; } sup { top: -0.5em; } /* 1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297) 2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016) 3. Remove gaps between table borders by default. 
*/ table { text-indent: 0; /* 1 */ border-color: inherit; /* 2 */ border-collapse: collapse; /* 3 */ } /* 1. Change the font styles in all browsers. 2. Remove the margin in Firefox and Safari. 3. Remove default padding in all browsers. */ button, input, optgroup, select, textarea { font-family: inherit; /* 1 */ font-size: 100%; /* 1 */ font-weight: inherit; /* 1 */ line-height: inherit; /* 1 */ color: inherit; /* 1 */ margin: 0; /* 2 */ padding: 0; /* 3 */ } /* Remove the inheritance of text transform in Edge and Firefox. */ button, select { text-transform: none; } /* 1. Correct the inability to style clickable types in iOS and Safari. 2. Remove default button styles. */ button, [type='button'], [type='reset'], [type='submit'] { -webkit-appearance: button; /* 1 */ background-color: transparent; /* 2 */ background-image: none; /* 2 */ } /* Use the modern Firefox focus style for all focusable elements. */ :-moz-focusring { outline: auto; } /* Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737) */ :-moz-ui-invalid { box-shadow: none; } /* Add the correct vertical alignment in Chrome and Firefox. */ progress { vertical-align: baseline; } /* Correct the cursor style of increment and decrement buttons in Safari. */ ::-webkit-inner-spin-button, ::-webkit-outer-spin-button { height: auto; } /* 1. Correct the odd appearance in Chrome and Safari. 2. Correct the outline style in Safari. */ [type='search'] { -webkit-appearance: textfield; /* 1 */ outline-offset: -2px; /* 2 */ } /* Remove the inner padding in Chrome and Safari on macOS. */ ::-webkit-search-decoration { -webkit-appearance: none; } /* 1. Correct the inability to style clickable types in iOS and Safari. 2. Change font properties to `inherit` in Safari. */ ::-webkit-file-upload-button { -webkit-appearance: button; /* 1 */ font: inherit; /* 2 */ } /* Add the correct display in Chrome and Safari. 
*/ summary { display: list-item; } /* Removes the default spacing and border for appropriate elements. */ blockquote, dl, dd, h1, h2, h3, h4, h5, h6, hr, figure, p, pre { margin: 0; } fieldset { margin: 0; padding: 0; } legend { padding: 0; } ol, ul, menu { list-style: none; margin: 0; padding: 0; } /* Prevent resizing textareas horizontally by default. */ textarea { resize: vertical; } /* 1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300) 2. Set the default placeholder color to the user's configured gray 400 color. */ input::-moz-placeholder, textarea::-moz-placeholder { opacity: 1; /* 1 */ color: #9ca3af; /* 2 */ } input::placeholder, textarea::placeholder { opacity: 1; /* 1 */ color: #9ca3af; /* 2 */ } /* Set the default cursor for buttons. */ button, [role="button"] { cursor: pointer; } /* Make sure disabled buttons don't get the pointer cursor. */ :disabled { cursor: default; } /* 1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14) 2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210) This can trigger a poorly considered lint error in some tools but is included by design. */ img, svg, video, canvas, audio, iframe, embed, object { display: block; /* 1 */ vertical-align: middle; /* 2 */ } /* Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. 
(https://github.com/mozdevs/cssremedy/issues/14) */ img, video { max-width: 100%; height: auto; } /* Make elements with the HTML hidden attribute stay hidden by default */ [hidden] { display: none; } *, ::before, ::after { --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgb(59 130 246 / 0.5); --tw-ring-offset-shadow: 0 0 #0000; --tw-ring-shadow: 0 0 #0000; --tw-shadow: 0 0 #0000; --tw-shadow-colored: 0 0 #0000; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; --tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; } ::backdrop { --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgb(59 130 246 / 0.5); --tw-ring-offset-shadow: 0 0 #0000; --tw-ring-shadow: 0 0 #0000; --tw-shadow: 0 0 #0000; --tw-shadow-colored: 0 0 #0000; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; 
--tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; } .mb-2 { margin-bottom: 0.5rem; } .ml-2 { margin-left: 0.5rem; } .mr-2 { margin-right: 0.5rem; } .mt-4 { margin-top: 1rem; } .flex { display: flex; } .w-\[26rem\] { width: 26rem; } .w-full { width: 100%; } .flex-col { flex-direction: column; } .items-center { align-items: center; } .justify-between { justify-content: space-between; } .self-start { align-self: flex-start; } .self-end { align-self: flex-end; } .rounded-lg { border-radius: 0.5rem; } .bg-blue-500 { --tw-bg-opacity: 1; background-color: rgb(59 130 246 / var(--tw-bg-opacity)); } .bg-gray-200 { --tw-bg-opacity: 1; background-color: rgb(229 231 235 / var(--tw-bg-opacity)); } .bg-gray-900 { --tw-bg-opacity: 1; background-color: rgb(17 24 39 / var(--tw-bg-opacity)); } .bg-indigo-500 { --tw-bg-opacity: 1; background-color: rgb(99 102 241 / var(--tw-bg-opacity)); } .bg-white { --tw-bg-opacity: 1; background-color: rgb(255 255 255 / var(--tw-bg-opacity)); } .p-2 { padding: 0.5rem; } .p-4 { padding: 1rem; } .text-lg { font-size: 1.125rem; line-height: 1.75rem; } .text-sm { font-size: 0.875rem; line-height: 1.25rem; } .font-medium { font-weight: 500; } .text-blue-500 { --tw-text-opacity: 1; color: rgb(59 130 246 / var(--tw-text-opacity)); } .text-gray-700 { --tw-text-opacity: 1; color: rgb(55 65 81 / var(--tw-text-opacity)); } .text-white { --tw-text-opacity: 1; color: rgb(255 255 255 / var(--tw-text-opacity)); } .shadow { --tw-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1); --tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color); box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow); } #chat-container { width: 500px; height: 450px; background-color: white; padding: 10px; 
overflow: auto; } .bg-gray-200 { background-color: #edf2f7; } .bg-gray-900 { background-color: #1a202c; } .rounded-lg { border-radius: 0.5rem; } .shadow { box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24); } .text-gray-700 { color: #4a5568; } .text-sm { font-size: 0.875rem; } .p-4 { padding: 1.5rem; } .hover\:text-blue-800:hover { --tw-text-opacity: 1; color: rgb(30 64 175 / var(--tw-text-opacity)); } ================================================ FILE: extensions/chrome/js/jquery/.gitignore ================================================ build jquery-migrate.js jquery-migrate.min.js ================================================ FILE: extensions/chrome/js/jquery/README.md ================================================ jQuery Component ================ Shim repository for jQuery. ================================================ FILE: extensions/chrome/js/jquery/bower.json ================================================ { "name": "jquery", "version": "2.0.0", "description": "jQuery component", "keywords": [ "jquery", "component" ], "scripts": [ "jquery.js" ], "license": "MIT" } ================================================ FILE: extensions/chrome/js/jquery/component.json ================================================ { "name": "jquery", "version": "2.0.0", "description": "jQuery component", "keywords": [ "jquery", "component" ], "scripts": [ "jquery.js" ], "license": "MIT", "gitHead": "46f8412bd1bb9b1b30b5b0eb88560e2d4196509c", "readme": "jQuery Component\n================\n\nShim repository for jQuery.\n", "readmeFilename": "README.md", "_id": "jquery@2.0.0", "repository": { "type": "git", "url": "git://github.com/components/jquery.git" } } ================================================ FILE: extensions/chrome/js/jquery/composer.json ================================================ { "name": "components/jquery", "description": "jQuery JavaScript Library", "type": "component", "homepage": "http://jquery.com", "license": "MIT", 
"support": { "irc": "irc://irc.freenode.org/jquery", "issues": "http://bugs.jquery.com", "forum": "http://forum.jquery.com", "wiki": "http://docs.jquery.com/", "source": "https://github.com/jquery/jquery" }, "authors": [ { "name": "John Resig", "email": "jeresig@gmail.com" } ], "require": { "robloach/component-installer": "*" }, "extra": { "component": { "scripts": [ "jquery.js" ] } } } ================================================ FILE: extensions/chrome/js/jquery/jquery.js ================================================ /*! * jQuery JavaScript Library v2.0.0 * http://jquery.com/ * * Includes Sizzle.js * http://sizzlejs.com/ * * Copyright 2005, 2013 jQuery Foundation, Inc. and other contributors * Released under the MIT license * http://jquery.org/license * * Date: 2013-04-18 */ (function( window, undefined ) { // Can't do this because several apps including ASP.NET trace // the stack via arguments.caller.callee and Firefox dies if // you try to trace through "use strict" call chains. 
(#13335) // Support: Firefox 18+ //"use strict"; var // A central reference to the root jQuery(document) rootjQuery, // The deferred used on DOM ready readyList, // Support: IE9 // For `typeof xmlNode.method` instead of `xmlNode.method !== undefined` core_strundefined = typeof undefined, // Use the correct document accordingly with window argument (sandbox) location = window.location, document = window.document, docElem = document.documentElement, // Map over jQuery in case of overwrite _jQuery = window.jQuery, // Map over the $ in case of overwrite _$ = window.$, // [[Class]] -> type pairs class2type = {}, // List of deleted data cache ids, so we can reuse them core_deletedIds = [], core_version = "2.0.0", // Save a reference to some core methods core_concat = core_deletedIds.concat, core_push = core_deletedIds.push, core_slice = core_deletedIds.slice, core_indexOf = core_deletedIds.indexOf, core_toString = class2type.toString, core_hasOwn = class2type.hasOwnProperty, core_trim = core_version.trim, // Define a local copy of jQuery jQuery = function( selector, context ) { // The jQuery object is actually just the init constructor 'enhanced' return new jQuery.fn.init( selector, context, rootjQuery ); }, // Used for matching numbers core_pnum = /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/.source, // Used for splitting on whitespace core_rnotwhite = /\S+/g, // A simple way to check for HTML strings // Prioritize #id over <tag> to avoid XSS via location.hash (#9521) // Strict HTML recognition (#11290: must start with <) rquickExpr = /^(?:(<[\w\W]+>)[^>]*|#([\w-]*))$/, // Match a standalone tag rsingleTag = /^<(\w+)\s*\/?>(?:<\/\1>|)$/, // Matches dashed string for camelizing rmsPrefix = /^-ms-/, rdashAlpha = /-([\da-z])/gi, // Used by jQuery.camelCase as callback to replace() fcamelCase = function( all, letter ) { return letter.toUpperCase(); }, // The ready event handler and self cleanup method completed = function() { document.removeEventListener( "DOMContentLoaded", 
// Constructor body behind jQuery( selector, context ); dispatches on the
// kind of selector it receives:
//   - falsy value        -> returns this unchanged (empty set)
//   - HTML string        -> parsed with jQuery.parseHTML (scripts kept) and
//                           merged into this; for a single tag with a
//                           plain-object context, each key is invoked as a
//                           method when one exists, else set via attr()
//   - "#id", no context  -> looked up with document.getElementById
//   - any other string   -> delegated to .find() on the context (or rootjQuery)
//   - DOM node           -> wrapped as a one-element set
//   - function           -> registered through rootjQuery.ready()
//   - anything else      -> passed through jQuery.makeArray( selector, this )
// rootjQuery is the set used as the default search root.
init: function( selector, context, rootjQuery ) {
	var match, elem;

	// HANDLE: $(""), $(null), $(undefined), $(false)
	if ( !selector ) {
		return this;
	}

	// Handle HTML strings
	if ( typeof selector === "string" ) {
		if ( selector.charAt(0) === "<" && selector.charAt( selector.length - 1 ) === ">" && selector.length >= 3 ) {
			// Assume that strings that start and end with <> are HTML and skip the regex check
			// (same [ whole, html, id ] shape that rquickExpr.exec would produce)
			match = [ null, selector, null ];

		} else {
			match = rquickExpr.exec( selector );
		}

		// Match html or make sure no context is specified for #id
		if ( match && (match[1] || !context) ) {

			// HANDLE: $(html) -> $(array)
			if ( match[1] ) {
				// Unwrap a jQuery context to its first element
				context = context instanceof jQuery ? context[0] : context;

				// scripts is true for back-compat
				jQuery.merge( this, jQuery.parseHTML(
					match[1],
					context && context.nodeType ? context.ownerDocument || context : document,
					true
				) );

				// HANDLE: $(html, props)
				if ( rsingleTag.test( match[1] ) && jQuery.isPlainObject( context ) ) {
					// NOTE: match is reused here as the loop key; its regex result
					// is not read again after this point
					for ( match in context ) {
						// Properties of context are called as methods if possible
						if ( jQuery.isFunction( this[ match ] ) ) {
							this[ match ]( context[ match ] );

						// ...and otherwise set as attributes
						} else {
							this.attr( match, context[ match ] );
						}
					}
				}

				return this;

			// HANDLE: $(#id)
			} else {
				elem = document.getElementById( match[2] );

				// Check parentNode to catch when Blackberry 4.6 returns
				// nodes that are no longer in the document #6963
				if ( elem && elem.parentNode ) {
					// Inject the element directly into the jQuery object
					this.length = 1;
					this[0] = elem;
				}

				this.context = document;
				this.selector = selector;
				return this;
			}

		// HANDLE: $(expr, $(...))
		} else if ( !context || context.jquery ) {
			return ( context || rootjQuery ).find( selector );

		// HANDLE: $(expr, context)
		// (which is just equivalent to: $(context).find(expr)
		} else {
			return this.constructor( context ).find( selector );
		}

	// HANDLE: $(DOMElement)
	} else if ( selector.nodeType ) {
		this.context = this[0] = selector;
		this.length = 1;
		return this;

	// HANDLE: $(function)
	// Shortcut for document ready
	} else if ( jQuery.isFunction( selector ) ) {
		return rootjQuery.ready( selector );
	}

	// Carry over selector/context when the input exposes a selector property
	if ( selector.selector !== undefined ) {
		this.selector = selector.selector;
		this.context = selector.context;
	}

	return jQuery.makeArray( selector, this );
},
this[ this.length + num ] : this[ num ] ); }, // Take an array of elements and push it onto the stack // (returning the new matched element set) pushStack: function( elems ) { // Build a new jQuery matched element set var ret = jQuery.merge( this.constructor(), elems ); // Add the old object onto the stack (as a reference) ret.prevObject = this; ret.context = this.context; // Return the newly-formed element set return ret; }, // Execute a callback for every element in the matched set. // (You can seed the arguments with an array of args, but this is // only used internally.) each: function( callback, args ) { return jQuery.each( this, callback, args ); }, ready: function( fn ) { // Add the callback jQuery.ready.promise().done( fn ); return this; }, slice: function() { return this.pushStack( core_slice.apply( this, arguments ) ); }, first: function() { return this.eq( 0 ); }, last: function() { return this.eq( -1 ); }, eq: function( i ) { var len = this.length, j = +i + ( i < 0 ? len : 0 ); return this.pushStack( j >= 0 && j < len ? [ this[j] ] : [] ); }, map: function( callback ) { return this.pushStack( jQuery.map(this, function( elem, i ) { return callback.call( elem, i, elem ); })); }, end: function() { return this.prevObject || this.constructor(null); }, // For internal use only. // Behaves like an Array's method, not like a jQuery method. 
// Merge the contents of one or more objects into a target object.
//
//   jQuery.extend( target, obj1, ..., objN )        shallow merge into target
//   jQuery.extend( true, target, obj1, ..., objN )  deep (recursive) merge
//   jQuery.extend( obj ) / jQuery.fn.extend( obj )  extend jQuery / jQuery.fn itself
//
// Later sources win; properties whose value is undefined are not copied.
// Returns the (modified) target.
jQuery.extend = jQuery.fn.extend = function() {
	var options, name, src, copy, copyIsArray, clone,
		target = arguments[0] || {},
		i = 1,
		length = arguments.length,
		deep = false;

	// Handle a deep copy situation
	if ( typeof target === "boolean" ) {
		deep = target;
		target = arguments[1] || {};
		// skip the boolean and the target
		i = 2;
	}

	// Handle case when target is a string or something (possible in deep copy)
	if ( typeof target !== "object" && !jQuery.isFunction(target) ) {
		target = {};
	}

	// extend jQuery itself if only one argument is passed
	if ( length === i ) {
		target = this;
		--i;
	}

	for ( ; i < length; i++ ) {
		// Only deal with non-null/undefined values
		if ( (options = arguments[ i ]) != null ) {
			// Extend the base object
			for ( name in options ) {

				// Prevent Object.prototype pollution: an own "__proto__" key
				// (e.g. produced by JSON.parse on untrusted input) must never be
				// merged, otherwise a deep extend writes into the shared prototype
				if ( name === "__proto__" ) {
					continue;
				}

				src = target[ name ];
				copy = options[ name ];

				// Prevent never-ending loop
				if ( target === copy ) {
					continue;
				}

				// Recurse if we're merging plain objects or arrays
				// (copyIsArray is assigned inside the condition on purpose)
				if ( deep && copy && ( jQuery.isPlainObject(copy) || (copyIsArray = jQuery.isArray(copy)) ) ) {
					if ( copyIsArray ) {
						copyIsArray = false;
						clone = src && jQuery.isArray(src) ? src : [];

					} else {
						clone = src && jQuery.isPlainObject(src) ? src : {};
					}

					// Never move original objects, clone them
					target[ name ] = jQuery.extend( deep, clone, copy );

				// Don't bring in undefined values
				} else if ( copy !== undefined ) {
					target[ name ] = copy;
				}
			}
		}
	}

	// Return the modified object
	return target;
};
isReady: false, // A counter to track how many items to wait for before // the ready event fires. See #6781 readyWait: 1, // Hold (or release) the ready event holdReady: function( hold ) { if ( hold ) { jQuery.readyWait++; } else { jQuery.ready( true ); } }, // Handle when the DOM is ready ready: function( wait ) { // Abort if there are pending holds or we're already ready if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { return; } // Remember that the DOM is ready jQuery.isReady = true; // If a normal DOM Ready event fired, decrement, and wait if need be if ( wait !== true && --jQuery.readyWait > 0 ) { return; } // If there are functions bound, to execute readyList.resolveWith( document, [ jQuery ] ); // Trigger any bound ready events if ( jQuery.fn.trigger ) { jQuery( document ).trigger("ready").off("ready"); } }, // See test/unit/core.js for details concerning isFunction. // Since version 1.3, DOM methods and functions like alert // aren't supported. They return false on IE (#2968). isFunction: function( obj ) { return jQuery.type(obj) === "function"; }, isArray: Array.isArray, isWindow: function( obj ) { return obj != null && obj === obj.window; }, isNumeric: function( obj ) { return !isNaN( parseFloat(obj) ) && isFinite( obj ); }, type: function( obj ) { if ( obj == null ) { return String( obj ); } // Support: Safari <= 5.1 (functionish RegExp) return typeof obj === "object" || typeof obj === "function" ? class2type[ core_toString.call(obj) ] || "object" : typeof obj; }, isPlainObject: function( obj ) { // Not plain objects: // - Any object or value whose internal [[Class]] property is not "[object Object]" // - DOM nodes // - window if ( jQuery.type( obj ) !== "object" || obj.nodeType || jQuery.isWindow( obj ) ) { return false; } // Support: Firefox <20 // The try/catch suppresses exceptions thrown when attempting to access // the "constructor" property of certain host objects, ie. 
|window.location| // https://bugzilla.mozilla.org/show_bug.cgi?id=814622 try { if ( obj.constructor && !core_hasOwn.call( obj.constructor.prototype, "isPrototypeOf" ) ) { return false; } } catch ( e ) { return false; } // If the function hasn't returned already, we're confident that // |obj| is a plain object, created by {} or constructed with new Object return true; }, isEmptyObject: function( obj ) { var name; for ( name in obj ) { return false; } return true; }, error: function( msg ) { throw new Error( msg ); }, // data: string of html // context (optional): If specified, the fragment will be created in this context, defaults to document // keepScripts (optional): If true, will include scripts passed in the html string parseHTML: function( data, context, keepScripts ) { if ( !data || typeof data !== "string" ) { return null; } if ( typeof context === "boolean" ) { keepScripts = context; context = false; } context = context || document; var parsed = rsingleTag.exec( data ), scripts = !keepScripts && []; // Single tag if ( parsed ) { return [ context.createElement( parsed[1] ) ]; } parsed = jQuery.buildFragment( [ data ], context, scripts ); if ( scripts ) { jQuery( scripts ).remove(); } return jQuery.merge( [], parsed.childNodes ); }, parseJSON: JSON.parse, // Cross-browser xml parsing parseXML: function( data ) { var xml, tmp; if ( !data || typeof data !== "string" ) { return null; } // Support: IE9 try { tmp = new DOMParser(); xml = tmp.parseFromString( data , "text/xml" ); } catch ( e ) { xml = undefined; } if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { jQuery.error( "Invalid XML: " + data ); } return xml; }, noop: function() {}, // Evaluates a script in a global context globalEval: function( code ) { var script, indirect = eval; code = jQuery.trim( code ); if ( code ) { // If the code includes a valid, prologue position // strict mode pragma, execute code by injecting a // script tag into the document. 
if ( code.indexOf("use strict") === 1 ) { script = document.createElement("script"); script.text = code; document.head.appendChild( script ).parentNode.removeChild( script ); } else { // Otherwise, avoid the DOM node creation, insertion // and removal by using an indirect global eval indirect( code ); } } }, // Convert dashed to camelCase; used by the css and data modules // Microsoft forgot to hump their vendor prefix (#9572) camelCase: function( string ) { return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); }, nodeName: function( elem, name ) { return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); }, // args is for internal usage only each: function( obj, callback, args ) { var value, i = 0, length = obj.length, isArray = isArraylike( obj ); if ( args ) { if ( isArray ) { for ( ; i < length; i++ ) { value = callback.apply( obj[ i ], args ); if ( value === false ) { break; } } } else { for ( i in obj ) { value = callback.apply( obj[ i ], args ); if ( value === false ) { break; } } } // A special, fast, case for the most common use of each } else { if ( isArray ) { for ( ; i < length; i++ ) { value = callback.call( obj[ i ], i, obj[ i ] ); if ( value === false ) { break; } } } else { for ( i in obj ) { value = callback.call( obj[ i ], i, obj[ i ] ); if ( value === false ) { break; } } } } return obj; }, trim: function( text ) { return text == null ? "" : core_trim.call( text ); }, // results is for internal usage only makeArray: function( arr, results ) { var ret = results || []; if ( arr != null ) { if ( isArraylike( Object(arr) ) ) { jQuery.merge( ret, typeof arr === "string" ? [ arr ] : arr ); } else { core_push.call( ret, arr ); } } return ret; }, inArray: function( elem, arr, i ) { return arr == null ? 
-1 : core_indexOf.call( arr, elem, i ); }, merge: function( first, second ) { var l = second.length, i = first.length, j = 0; if ( typeof l === "number" ) { for ( ; j < l; j++ ) { first[ i++ ] = second[ j ]; } } else { while ( second[j] !== undefined ) { first[ i++ ] = second[ j++ ]; } } first.length = i; return first; }, grep: function( elems, callback, inv ) { var retVal, ret = [], i = 0, length = elems.length; inv = !!inv; // Go through the array, only saving the items // that pass the validator function for ( ; i < length; i++ ) { retVal = !!callback( elems[ i ], i ); if ( inv !== retVal ) { ret.push( elems[ i ] ); } } return ret; }, // arg is for internal usage only map: function( elems, callback, arg ) { var value, i = 0, length = elems.length, isArray = isArraylike( elems ), ret = []; // Go through the array, translating each of the items to their if ( isArray ) { for ( ; i < length; i++ ) { value = callback( elems[ i ], i, arg ); if ( value != null ) { ret[ ret.length ] = value; } } // Go through every key on the object, } else { for ( i in elems ) { value = callback( elems[ i ], i, arg ); if ( value != null ) { ret[ ret.length ] = value; } } } // Flatten any nested arrays return core_concat.apply( [], ret ); }, // A global GUID counter for objects guid: 1, // Bind a function to a context, optionally partially applying any // arguments. proxy: function( fn, context ) { var tmp, args, proxy; if ( typeof context === "string" ) { tmp = fn[ context ]; context = fn; fn = tmp; } // Quick check to determine if target is callable, in the spec // this throws a TypeError, but we will just return undefined. 
if ( !jQuery.isFunction( fn ) ) { return undefined; } // Simulated bind args = core_slice.call( arguments, 2 ); proxy = function() { return fn.apply( context || this, args.concat( core_slice.call( arguments ) ) ); }; // Set the guid of unique handler to the same of original handler, so it can be removed proxy.guid = fn.guid = fn.guid || jQuery.guid++; return proxy; }, // Multifunctional method to get and set values of a collection // The value/s can optionally be executed if it's a function access: function( elems, fn, key, value, chainable, emptyGet, raw ) { var i = 0, length = elems.length, bulk = key == null; // Sets many values if ( jQuery.type( key ) === "object" ) { chainable = true; for ( i in key ) { jQuery.access( elems, fn, i, key[i], true, emptyGet, raw ); } // Sets one value } else if ( value !== undefined ) { chainable = true; if ( !jQuery.isFunction( value ) ) { raw = true; } if ( bulk ) { // Bulk operations run against the entire set if ( raw ) { fn.call( elems, value ); fn = null; // ...except when executing function values } else { bulk = fn; fn = function( elem, key, value ) { return bulk.call( jQuery( elem ), value ); }; } } if ( fn ) { for ( ; i < length; i++ ) { fn( elems[i], key, raw ? value : value.call( elems[i], i, fn( elems[i], key ) ) ); } } } return chainable ? elems : // Gets bulk ? fn.call( elems ) : length ? fn( elems[0], key ) : emptyGet; }, now: Date.now, // A method for quickly swapping in/out CSS properties to get correct calculations. // Note: this method belongs to the css module but it's needed here for the support module. // If support gets modularized, this method should be moved back to the css module. 
swap: function( elem, options, callback, args ) { var ret, name, old = {}; // Remember the old values, and insert the new ones for ( name in options ) { old[ name ] = elem.style[ name ]; elem.style[ name ] = options[ name ]; } ret = callback.apply( elem, args || [] ); // Revert the old values for ( name in options ) { elem.style[ name ] = old[ name ]; } return ret; } }); jQuery.ready.promise = function( obj ) { if ( !readyList ) { readyList = jQuery.Deferred(); // Catch cases where $(document).ready() is called after the browser event has already occurred. // we once tried to use readyState "interactive" here, but it caused issues like the one // discovered by ChrisS here: http://bugs.jquery.com/ticket/12282#comment:15 if ( document.readyState === "complete" ) { // Handle it asynchronously to allow scripts the opportunity to delay ready setTimeout( jQuery.ready ); } else { // Use the handy event callback document.addEventListener( "DOMContentLoaded", completed, false ); // A fallback to window.onload, that will always work window.addEventListener( "load", completed, false ); } } return readyList.promise( obj ); }; // Populate the class2type map jQuery.each("Boolean Number String Function Array Date RegExp Object Error".split(" "), function(i, name) { class2type[ "[object " + name + "]" ] = name.toLowerCase(); }); function isArraylike( obj ) { var length = obj.length, type = jQuery.type( obj ); if ( jQuery.isWindow( obj ) ) { return false; } if ( obj.nodeType === 1 && length ) { return true; } return type === "array" || type !== "function" && ( length === 0 || typeof length === "number" && length > 0 && ( length - 1 ) in obj ); } // All jQuery objects should point back to these rootjQuery = jQuery(document); /*! * Sizzle CSS Selector Engine v1.9.2-pre * http://sizzlejs.com/ * * Copyright 2013 jQuery Foundation, Inc. 
and other contributors
 * Released under the MIT license
 * http://jquery.org/license
 *
 * Date: 2013-04-16
 */
(function( window, undefined ) {

var i,
	cachedruns,
	Expr,
	getText,
	isXML,
	compile,
	outermostContext,
	sortInput,

	// Local document vars
	setDocument,
	document,
	docElem,
	documentIsHTML,
	rbuggyQSA,
	rbuggyMatches,
	matches,
	contains,

	// Instance-specific data
	// (unique per load: "sizzle" followed by the negated current timestamp)
	expando = "sizzle" + -(new Date()),
	preferredDoc = window.document,
	support = {},
	dirruns = 0,
	done = 0,
	classCache = createCache(),
	tokenCache = createCache(),
	compilerCache = createCache(),
	hasDuplicate = false,
	// Placeholder comparator; setDocument installs the real one
	sortOrder = function() { return 0; },

	// General-purpose constants
	strundefined = typeof undefined,
	MAX_NEGATIVE = 1 << 31,

	// Array methods
	arr = [],
	pop = arr.pop,
	push_native = arr.push,
	push = arr.push,
	slice = arr.slice,
	// Use a stripped-down indexOf if we can't use a native one
	indexOf = arr.indexOf || function( elem ) {
		var i = 0,
			len = this.length;
		for ( ; i < len; i++ ) {
			if ( this[i] === elem ) {
				return i;
			}
		}
		return -1;
	},

	// Attribute names handled as booleans ("|"-joined for use in regexes)
	booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped",

	// Regular expressions

	// Whitespace characters http://www.w3.org/TR/css3-selectors/#whitespace
	whitespace = "[\\x20\\t\\r\\n\\f]",
	// http://www.w3.org/TR/css3-syntax/#characters
	characterEncoding = "(?:\\\\.|[\\w-]|[^\\x00-\\xa0])+",

	// Loosely modeled on CSS identifier characters
	// An unquoted value should be a CSS identifier http://www.w3.org/TR/css3-selectors/#attribute-selectors
	// Proper syntax: http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
	// (string replace: only the first "w" of the source becomes "w#")
	identifier = characterEncoding.replace( "w", "w#" ),

	// Acceptable operators http://www.w3.org/TR/selectors/#attribute-selectors
	attributes = "\\[" + whitespace + "*(" + characterEncoding + ")" + whitespace +
		"*(?:([*^$|!~]?=)" + whitespace + "*(?:(['\"])((?:\\\\.|[^\\\\])*?)\\3|(" + identifier + ")|)|)" + whitespace + "*\\]",

	// Prefer arguments quoted,
	//   then not containing pseudos/brackets,
	//   then attribute selectors/non-parenthetical expressions,
	//   then anything else
	// These preferences are here to reduce the number of selectors
	//   needing tokenize in the PSEUDO preFilter
	// (attributes.replace( 3, 8 ) renumbers the "\\3" backreference to "\\8"
	// because the attribute pattern is embedded after seven earlier groups)
	pseudos = ":(" + characterEncoding + ")(?:\\(((['\"])((?:\\\\.|[^\\\\])*?)\\3|((?:\\\\.|[^\\\\()[\\]]|" + attributes.replace( 3, 8 ) + ")*)|.*)\\)|)",

	// Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter
	rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + whitespace + "+$", "g" ),

	rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ),
	rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + "*" ),

	rsibling = new RegExp( whitespace + "*[+~]" ),
	rattributeQuotes = new RegExp( "=" + whitespace + "*([^\\]'\"]*)" + whitespace + "*\\]", "g" ),

	rpseudo = new RegExp( pseudos ),
	ridentifier = new RegExp( "^" + identifier + "$" ),

	matchExpr = {
		"ID": new RegExp( "^#(" + characterEncoding + ")" ),
		"CLASS": new RegExp( "^\\.(" + characterEncoding + ")" ),
		"TAG": new RegExp( "^(" + characterEncoding.replace( "w", "w*" ) + ")" ),
		"ATTR": new RegExp( "^" + attributes ),
		"PSEUDO": new RegExp( "^" + pseudos ),
		"CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + whitespace +
			"*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + whitespace +
			"*(\\d+)|))" + whitespace + "*\\)|)", "i" ),
		"boolean": new RegExp( "^(?:" + booleans + ")$", "i" ),
		// For use in libraries implementing .is()
		// We use this for POS matching in `select`
		"needsContext": new RegExp( "^" + whitespace + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" +
			whitespace + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" )
	},

	// Matches the string form of a native function (see isNative)
	rnative = /^[^{]+\{\s*\[native \w/,

	// Easily-parseable/retrievable ID or TAG or CLASS selectors
	rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,

	rinputs = /^(?:input|select|textarea|button)$/i,
	rheader = /^h\d$/i,

	// Single quote or backslash
	rescape = /'|\\/g,

	// CSS escapes http://www.w3.org/TR/CSS21/syndata.html#escaped-characters
	runescape = /\\([\da-fA-F]{1,6}[\x20\t\r\n\f]?|.)/g,
	// Replacement callback for runescape: `escaped` is the text after the backslash
	funescape = function( _, escaped ) {
		var high = "0x" + escaped - 0x10000;
		// NaN means non-codepoint
		return high !== high ?
			escaped :
			// BMP codepoint
			high < 0 ?
				String.fromCharCode( high + 0x10000 ) :
				// Supplemental Plane codepoint (surrogate pair)
				String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 );
	};

// Optimize for push.apply( _, NodeList )
try {
	push.apply(
		(arr = slice.call( preferredDoc.childNodes )),
		preferredDoc.childNodes
	);
	// Support: Android<4.0
	// Detect silently failing push.apply
	// (reading .nodeType throws when the pushed entry is missing)
	arr[ preferredDoc.childNodes.length ].nodeType;
} catch ( e ) {
	// Replace `push` with an object exposing only a compatible apply()
	push = { apply: arr.length ?

		// Leverage slice if possible
		function( target, els ) {
			push_native.apply( target, slice.call(els) );
		} :

		// Support: IE<9
		// Otherwise append directly
		function( target, els ) {
			var j = target.length,
				i = 0;
			// Can't trust NodeList.length
			while ( (target[j++] = els[i++]) ) {}
			target.length = j - 1;
		}
	};
}

/**
 * For feature detection
 * @param {Function} fn The function to test for native support
 * @returns {Boolean} Whether the string form of fn matches rnative
 */
function isNative( fn ) {
	return rnative.test( fn + "" );
}

/**
 * Create key-value caches of limited size
 * @returns {Function(string, Object)} Returns the Object data after storing it on itself with
 *	property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength)
 *	deleting the oldest entry
 */
function createCache() {
	var cache,
		// Insertion-ordered list of the stored (space-suffixed) keys
		keys = [];

	return (cache = function( key, value ) {
		// Use (key + " ") to avoid collision with native prototype properties (see Issue #157)
		if ( keys.push( key += " " ) > Expr.cacheLength ) {
			// Only keep the most recent entries
			delete cache[ keys.shift() ];
		}
		return (cache[ key ] = value);
	});
}

/**
 * Mark a function for special use by Sizzle
 * @param {Function} fn The function to mark
 * @returns {Function} The same function, with fn[ expando ] set to true
 */
function markFunction( fn ) {
	fn[ expando ] = true;
	return fn;
}

/**
 * Support testing using an
element
 * @param {Function} fn Passed the created div and expects a boolean result
 */
function assert( fn ) {
	var div = document.createElement("div");

	try {
		return !!fn( div );
	} catch (e) {
		// A throwing test counts as "not supported"
		return false;
	} finally {
		// Detach the div if the test attached it somewhere
		if ( div.parentNode ) {
			div.parentNode.removeChild( div );
		}
		// release memory in IE
		div = null;
	}
}

/**
 * Main entry point: appends the elements matching a selector to `results`
 * @param {String} selector Selector text; anything falsy or non-string returns `results` untouched
 * @param {Element|Object} [context] Root to search from; defaults to the current document
 * @param {Array} [results] Collection to append to; a new array when omitted
 * @param {Array} [seed] Passed through to select(); when given, the fast paths are skipped
 * @returns {Array} `results`, or a new empty array when context is neither an element nor a document
 */
function Sizzle( selector, context, results, seed ) {
	var match, elem, m, nodeType,
		// QSA vars
		i, groups, old, nid, newContext, newSelector;

	if ( ( context ? context.ownerDocument || context : preferredDoc ) !== document ) {
		setDocument( context );
	}

	context = context || document;
	results = results || [];

	if ( !selector || typeof selector !== "string" ) {
		return results;
	}

	if ( (nodeType = context.nodeType) !== 1 && nodeType !== 9 ) {
		return [];
	}

	if ( documentIsHTML && !seed ) {

		// Shortcuts
		if ( (match = rquickExpr.exec( selector )) ) {
			// Speed-up: Sizzle("#ID")
			if ( (m = match[1]) ) {
				if ( nodeType === 9 ) {
					elem = context.getElementById( m );
					// Check parentNode to catch when Blackberry 4.6 returns
					// nodes that are no longer in the document #6963
					if ( elem && elem.parentNode ) {
						// Handle the case where IE, Opera, and Webkit return items
						// by name instead of ID
						// (a mismatch falls through to the slower paths below)
						if ( elem.id === m ) {
							results.push( elem );
							return results;
						}
					} else {
						return results;
					}
				} else {
					// Context is not a document
					if ( context.ownerDocument && (elem = context.ownerDocument.getElementById( m )) &&
						contains( context, elem ) && elem.id === m ) {
						results.push( elem );
						return results;
					}
				}

			// Speed-up: Sizzle("TAG")
			} else if ( match[2] ) {
				push.apply( results, context.getElementsByTagName( selector ) );
				return results;

			// Speed-up: Sizzle(".CLASS")
			} else if ( (m = match[3]) && support.getElementsByClassName && context.getElementsByClassName ) {
				push.apply( results, context.getElementsByClassName( m ) );
				return results;
			}
		}

		// QSA path
		if ( support.qsa && (!rbuggyQSA || !rbuggyQSA.test( selector )) ) {
			nid = old = expando;
			newContext = context;
			// For a document context the selector is used as-is;
			// otherwise this is false until rewritten below
			newSelector = nodeType === 9 && selector;

			// qSA works strangely on Element-rooted queries
			// We can work around this by specifying an extra ID on the root
			// and working up from there (Thanks to Andrew Dupont for the technique)
			// IE 8 doesn't work on object elements
			if ( nodeType === 1 && context.nodeName.toLowerCase() !== "object" ) {
				groups = tokenize( selector );

				// Reuse an existing id (escaped), or assign a temporary one
				if ( (old = context.getAttribute("id")) ) {
					nid = old.replace( rescape, "\\$&" );
				} else {
					context.setAttribute( "id", nid );
				}
				nid = "[id='" + nid + "'] ";

				// Prefix every comma-separated group with the id selector
				i = groups.length;
				while ( i-- ) {
					groups[i] = nid + toSelector( groups[i] );
				}
				newContext = rsibling.test( selector ) && context.parentNode || context;
				newSelector = groups.join(",");
			}

			if ( newSelector ) {
				try {
					push.apply( results,
						newContext.querySelectorAll( newSelector )
					);
					return results;
				} catch(qsaError) {
					// Deliberately ignored: fall through to select() below
				} finally {
					// Remove the temporary id if the context had none
					if ( !old ) {
						context.removeAttribute("id");
					}
				}
			}
		}
	}

	// All others
	return select( selector.replace( rtrim, "$1" ), context, results, seed );
}

/**
 * Detect xml
 * @param {Element|Object} elem An element or a document
 * @returns {Boolean} True when a documentElement exists and its nodeName is not "HTML"
 */
isXML = Sizzle.isXML = function( elem ) {
	// documentElement is verified for cases where it doesn't yet exist
	// (such as loading iframes in IE - #4833)
	var documentElement = elem && (elem.ownerDocument || elem).documentElement;
	return documentElement ? documentElement.nodeName !== "HTML" : false;
};

/**
 * Sets document-related variables once based on the current document
 * @param {Element|Object} [node] An element or document object to use to set the document
 * @returns {Object} Returns the current document
 */
setDocument = Sizzle.setDocument = function( node ) {
	var doc = node ? node.ownerDocument || node : preferredDoc;

	// If no document and documentElement is available, return
	// (also returns early when doc is already the current document)
	if ( doc === document || doc.nodeType !== 9 || !doc.documentElement ) {
		return document;
	}

	// Set our document
	document = doc;
	docElem = doc.documentElement;

	// Support tests
	documentIsHTML = !isXML( doc );

	// Check if getElementsByTagName("*") returns only elements
	support.getElementsByTagName = assert(function( div ) {
		div.appendChild( doc.createComment("") );
		return !div.getElementsByTagName("*").length;
	});

	// Support: IE<8
	// Verify that getAttribute really returns attributes and not properties (excepting IE8 booleans)
	support.attributes = assert(function( div ) {
		div.className = "i";
		return !div.getAttribute("className");
	});

	// Check if getElementsByClassName can be trusted
	support.getElementsByClassName = assert(function( div ) {
		div.innerHTML = "<div class='a'></div><div class='a i'></div>";

		// Support: Safari<4
		// Catch class over-caching
		div.firstChild.className = "i";
		// Support: Opera<10
		// Catch gEBCN failure to find non-leading classes
		return div.getElementsByClassName("i").length === 2;
	});

	// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27)
	// Detached nodes confoundingly follow *each other*
	support.sortDetached = assert(function( div1 ) {
		// Should return 1, but returns 4 (following)
		return div1.compareDocumentPosition( document.createElement("div") ) & 1;
	});

	// Support: IE<10
	// Check if getElementById returns elements by name
	// Support: Windows 8 Native Apps
	// Assigning innerHTML with "name" attributes throws uncatchable exceptions
	// (http://msdn.microsoft.com/en-us/library/ie/hh465388.aspx)
	// and the broken getElementById methods don't pick up programmatically-set names,
	// so use a roundabout getElementsByName test
	support.getById = assert(function( div ) {
		docElem.appendChild( div ).id = expando;
		return !doc.getElementsByName || !doc.getElementsByName( expando ).length;
	});

	// ID find and filter
	if ( support.getById ) {
		Expr.find["ID"] = function( id, context ) {
			if ( typeof context.getElementById !== strundefined && documentIsHTML ) {
				var m = context.getElementById( id );
				// Check parentNode to catch when Blackberry 4.6 returns
				// nodes that are no longer in the document #6963
				return m && m.parentNode ? [m] : [];
			}
		};
		Expr.filter["ID"] = function( id ) {
			var attrId = id.replace( runescape, funescape );
			return function( elem ) {
				return elem.getAttribute("id") === attrId;
			};
		};
	} else {
		// getElementById is not trusted here: verify the id on the returned node
		Expr.find["ID"] = function( id, context ) {
			if ( typeof context.getElementById !== strundefined && documentIsHTML ) {
				var m = context.getElementById( id );

				return m ?
					m.id === id || typeof m.getAttributeNode !== strundefined && m.getAttributeNode("id").value === id ?
						[m] :
						undefined :
					[];
			}
		};
		Expr.filter["ID"] =  function( id ) {
			var attrId = id.replace( runescape, funescape );
			return function( elem ) {
				var node = typeof elem.getAttributeNode !== strundefined && elem.getAttributeNode("id");
				return node && node.value === attrId;
			};
		};
	}

	// Tag
	Expr.find["TAG"] = support.getElementsByTagName ?
		function( tag, context ) {
			if ( typeof context.getElementsByTagName !== strundefined ) {
				return context.getElementsByTagName( tag );
			}
		} :
		function( tag, context ) {
			var elem,
				tmp = [],
				i = 0,
				results = context.getElementsByTagName( tag );

			// Filter out possible comments
			if ( tag === "*" ) {
				while ( (elem = results[i++]) ) {
					if ( elem.nodeType === 1 ) {
						tmp.push( elem );
					}
				}

				return tmp;
			}
			return results;
		};

	// Class
	// (left as the falsy support flag when getElementsByClassName is not trusted)
	Expr.find["CLASS"] = support.getElementsByClassName && function( className, context ) {
		if ( typeof context.getElementsByClassName !== strundefined && documentIsHTML ) {
			return context.getElementsByClassName( className );
		}
	};

	// QSA and matchesSelector support

	// matchesSelector(:active) reports false when true (IE9/Opera 11.5)
	rbuggyMatches = [];

	// qSa(:focus) reports false when true (Chrome 21)
	// We allow this because of a bug in IE8/9 that throws an error
	// whenever `document.activeElement` is accessed on an iframe
	// So, we allow :focus to pass through QSA all the time to avoid the IE error
	// See http://bugs.jquery.com/ticket/13378
	rbuggyQSA = [];

	if ( (support.qsa = isNative(doc.querySelectorAll)) ) {
		// Build QSA regex
		// Regex strategy adopted from Diego Perini
		assert(function( div ) {
			// Select is set to empty string on purpose
			// This is to test IE's treatment of not explicitly
			// setting a boolean content attribute,
			// since its presence should be enough
			// http://bugs.jquery.com/ticket/12359
			div.innerHTML = "<select><option selected=''></option></select>";

			// Support: IE8
			// Boolean attributes and "value" are not treated correctly
			if ( !div.querySelectorAll("[selected]").length ) {
				rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" );
			}

			// Webkit/Opera - :checked should return selected option elements
			// http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked
			// IE8 throws error here and will not see later tests
			if ( !div.querySelectorAll(":checked").length ) {
				rbuggyQSA.push(":checked");
			}
		});

		assert(function( div ) {

			// Support: Opera 10-12/IE8
			// ^= $= *= and empty values
			// Should not select anything
			// Support: Windows 8 Native Apps
			// The type attribute is restricted during .innerHTML assignment
			var input = document.createElement("input");
			input.setAttribute( "type", "hidden" );
			div.appendChild( input ).setAttribute( "t", "" );

			if ( div.querySelectorAll("[t^='']").length ) {
				rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" );
			}

			// FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled)
			// IE8 throws error here and will not see later tests
			if ( !div.querySelectorAll(":enabled").length ) {
				rbuggyQSA.push( ":enabled", ":disabled" );
			}

			// Opera 10-11 does not throw on post-comma invalid pseudos
			// (when the call below throws, assert() swallows it and the push is skipped)
			div.querySelectorAll("*,:x");
			rbuggyQSA.push(",.*:");
		});
	}

	if ( (support.matchesSelector = isNative( (matches = docElem.webkitMatchesSelector ||
		docElem.mozMatchesSelector ||
		docElem.oMatchesSelector ||
		docElem.msMatchesSelector) )) ) {

		assert(function( div ) {
			// Check to see if it's possible to do matchesSelector
			// on a disconnected node (IE 9)
			support.disconnectedMatch = matches.call( div, "div" );

			// This should fail with an exception
			// Gecko does not error, returns false instead
			matches.call( div, "[s!='']:x" );
			rbuggyMatches.push( "!=", pseudos );
		});
	}

	// Collapse each list into a single RegExp, or a falsy 0 when the list is empty
	rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join("|") );
	rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join("|") );

	// Element contains another
	// Purposefully does not implement inclusive descendent
	// As in, an element does not contain itself
	contains = isNative(docElem.contains) || docElem.compareDocumentPosition ?
		function( a, b ) {
			var adown = a.nodeType === 9 ? a.documentElement : a,
				bup = b && b.parentNode;
			return a === bup || !!( bup && bup.nodeType === 1 && (
				adown.contains ?
					adown.contains( bup ) :
					a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16
			));
		} :
		function( a, b ) {
			// Fallback: walk up b's ancestors looking for a
			if ( b ) {
				while ( (b = b.parentNode) ) {
					if ( b === a ) {
						return true;
					}
				}
			}
			return false;
		};

	// Document order sorting
	sortOrder = docElem.compareDocumentPosition ?
	function( a, b ) {

		// Flag for duplicate removal
		if ( a === b ) {
			hasDuplicate = true;
			return 0;
		}

		var compare = b.compareDocumentPosition && a.compareDocumentPosition && a.compareDocumentPosition( b );

		if ( compare ) {
			// Disconnected nodes
			if ( compare & 1 ||
				(!support.sortDetached && b.compareDocumentPosition( a ) === compare) ) {

				// Choose the first element that is related to our preferred document
				if ( a === doc || contains(preferredDoc, a) ) {
					return -1;
				}
				if ( b === doc || contains(preferredDoc, b) ) {
					return 1;
				}

				// Maintain original order
				return sortInput ?
					( indexOf.call( sortInput, a ) - indexOf.call( sortInput, b ) ) :
					0;
			}

			return compare & 4 ? -1 : 1;
		}

		// Not directly comparable, sort on existence of method
		return a.compareDocumentPosition ? -1 : 1;
	} :
	function( a, b ) {
		var cur,
			i = 0,
			aup = a.parentNode,
			bup = b.parentNode,
			ap = [ a ],
			bp = [ b ];

		// Exit early if the nodes are identical
		if ( a === b ) {
			hasDuplicate = true;
			return 0;

		// Parentless nodes are either documents or disconnected
		} else if ( !aup || !bup ) {
			return a === doc ? -1 :
				b === doc ? 1 :
				aup ? -1 :
				bup ? 1 :
				sortInput ?
				( indexOf.call( sortInput, a ) - indexOf.call( sortInput, b ) ) :
				0;

		// If the nodes are siblings, we can do a quick check
		} else if ( aup === bup ) {
			return siblingCheck( a, b );
		}

		// Otherwise we need full lists of their ancestors for comparison
		cur = a;
		while ( (cur = cur.parentNode) ) {
			ap.unshift( cur );
		}
		cur = b;
		while ( (cur = cur.parentNode) ) {
			bp.unshift( cur );
		}

		// Walk down the tree looking for a discrepancy
		while ( ap[i] === bp[i] ) {
			i++;
		}

		return i ?
			// Do a sibling check if the nodes have a common ancestor
			siblingCheck( ap[i], bp[i] ) :

			// Otherwise nodes in our document sort first
			ap[i] === preferredDoc ? -1 :
			bp[i] === preferredDoc ? 1 :
			0;
	};

	return document;
};

// Returns the members of `elements` that match `expr` (elements is used as the seed)
Sizzle.matches = function( expr, elements ) {
	return Sizzle( expr, null, null, elements );
};

// Tests a single element against a selector; returns a boolean
Sizzle.matchesSelector = function( elem, expr ) {
	// Set document vars if needed
	if ( ( elem.ownerDocument || elem ) !== document ) {
		setDocument( elem );
	}

	// Make sure that attribute selectors are quoted
	expr = expr.replace( rattributeQuotes, "='$1']" );

	// rbuggyMatches and rbuggyQSA are falsy when no bugs were detected,
	// hence the existence checks before calling .test()
	if ( support.matchesSelector && documentIsHTML &&
		(!rbuggyMatches || !rbuggyMatches.test(expr)) &&
		(!rbuggyQSA || !rbuggyQSA.test(expr)) ) {

		try {
			var ret = matches.call( elem, expr );

			// IE 9's matchesSelector returns false on disconnected nodes
			if ( ret || support.disconnectedMatch ||
					// As well, disconnected nodes are said to be in a document
					// fragment in IE 9
					elem.document && elem.document.nodeType !== 11 ) {
				return ret;
			}
		} catch(e) {}
	}

	// Fall back to a full Sizzle run seeded with just this element
	return Sizzle( expr, document, null, [elem] ).length > 0;
};

// Public wrapper around the internal contains(), syncing document state first
Sizzle.contains = function( context, elem ) {
	// Set document vars if needed
	if ( ( context.ownerDocument || context ) !== document ) {
		setDocument( context );
	}
	return contains( context, elem );
};

// Reads an attribute, preferring a handler registered in Expr.attrHandle
// under the lowercased name when it returns something other than undefined
Sizzle.attr = function( elem, name ) {
	// Set document vars if needed
	if ( ( elem.ownerDocument || elem ) !== document ) {
		setDocument( elem );
	}

	var fn = Expr.attrHandle[ name.toLowerCase() ],
		val = fn && fn( elem, name, !documentIsHTML );

	return val === undefined ?
		support.attributes || !documentIsHTML ?
			elem.getAttribute( name ) :
			(val = elem.getAttributeNode(name)) && val.specified ?
				val.value :
				null :
		val;
};

// Always throws, prefixing `msg` with a fixed syntax-error text
Sizzle.error = function( msg ) {
	throw new Error( "Syntax error, unrecognized expression: " + msg );
};

// Document sorting and removing duplicates
// Sorts `results` in place with sortOrder and returns it
Sizzle.uniqueSort = function( results ) {
	var elem,
		duplicates = [],
		j = 0,
		i = 0;

	// Unless we *know* we can detect duplicates, assume their presence
	hasDuplicate = !support.detectDuplicates;
	// Snapshot of the input order, used by sortOrder as a tie-breaker
	sortInput = !support.sortStable && results.slice( 0 );
	results.sort( sortOrder );

	if ( hasDuplicate ) {
		// After sorting, equal nodes are adjacent: record the index of each repeat
		while ( (elem = results[i++]) ) {
			if ( elem === results[ i ] ) {
				j = duplicates.push( i );
			}
		}
		// Splice from the highest index down so earlier indexes stay valid
		while ( j-- ) {
			results.splice( duplicates[ j ], 1 );
		}
	}

	return results;
};

/**
 * Checks document order of two siblings
 * @param {Element} a
 * @param {Element} b
 * @returns Returns -1 if a precedes b, 1 if a follows b
 */
function siblingCheck( a, b ) {
	var cur = b && a,
		diff = cur && ( ~b.sourceIndex || MAX_NEGATIVE ) - ( ~a.sourceIndex || MAX_NEGATIVE );

	// Use IE sourceIndex if available on both nodes
	if ( diff ) {
		return diff;
	}

	// Check if b follows a
	if ( cur ) {
		while ( (cur = cur.nextSibling) ) {
			if ( cur === b ) {
				return -1;
			}
		}
	}

	return a ? 1 : -1;
}

// Fetches boolean attributes by node
// Yields undefined when isXML is truthy, the attribute node's value when it is
// specified, the lowercased name when the property is exactly true, else null
function boolHandler( elem, name, isXML ) {
	var val;
	return isXML ?
		undefined :
		(val = elem.getAttributeNode( name )) && val.specified ?
			val.value :
			elem[ name ] === true ? name.toLowerCase() : null;
}

// Fetches attributes without interpolation
// http://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx
function interpolationHandler( elem, name, isXML ) {
	var val;
	return isXML ?
		undefined :
		(val = elem.getAttribute( name, name.toLowerCase() === "type" ?
1 : 2 )); } // Returns a function to use in pseudos for input types function createInputPseudo( type ) { return function( elem ) { var name = elem.nodeName.toLowerCase(); return name === "input" && elem.type === type; }; } // Returns a function to use in pseudos for buttons function createButtonPseudo( type ) { return function( elem ) { var name = elem.nodeName.toLowerCase(); return (name === "input" || name === "button") && elem.type === type; }; } // Returns a function to use in pseudos for positionals function createPositionalPseudo( fn ) { return markFunction(function( argument ) { argument = +argument; return markFunction(function( seed, matches ) { var j, matchIndexes = fn( [], seed.length, argument ), i = matchIndexes.length; // Match elements found at the specified indexes while ( i-- ) { if ( seed[ (j = matchIndexes[i]) ] ) { seed[j] = !(matches[j] = seed[j]); } } }); }); } /** * Utility function for retrieving the text value of an array of DOM nodes * @param {Array|Element} elem */ getText = Sizzle.getText = function( elem ) { var node, ret = "", i = 0, nodeType = elem.nodeType; if ( !nodeType ) { // If no nodeType, this is expected to be an array for ( ; (node = elem[i]); i++ ) { // Do not traverse comment nodes ret += getText( node ); } } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { // Use textContent for elements // innerText usage removed for consistency of new lines (see #11153) if ( typeof elem.textContent === "string" ) { return elem.textContent; } else { // Traverse its children for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { ret += getText( elem ); } } } else if ( nodeType === 3 || nodeType === 4 ) { return elem.nodeValue; } // Do not include comment or processing instruction nodes return ret; }; Expr = Sizzle.selectors = { // Can be adjusted by the user cacheLength: 50, createPseudo: markFunction, match: matchExpr, attrHandle: {}, find: {}, relative: { ">": { dir: "parentNode", first: true }, " ": { dir: 
"parentNode" }, "+": { dir: "previousSibling", first: true }, "~": { dir: "previousSibling" } }, preFilter: { "ATTR": function( match ) { match[1] = match[1].replace( runescape, funescape ); // Move the given value to match[3] whether quoted or unquoted match[3] = ( match[4] || match[5] || "" ).replace( runescape, funescape ); if ( match[2] === "~=" ) { match[3] = " " + match[3] + " "; } return match.slice( 0, 4 ); }, "CHILD": function( match ) { /* matches from matchExpr["CHILD"] 1 type (only|nth|...) 2 what (child|of-type) 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) 4 xn-component of xn+y argument ([+-]?\d*n|) 5 sign of xn-component 6 x of xn-component 7 sign of y-component 8 y of y-component */ match[1] = match[1].toLowerCase(); if ( match[1].slice( 0, 3 ) === "nth" ) { // nth-* requires argument if ( !match[3] ) { Sizzle.error( match[0] ); } // numeric x and y parameters for Expr.filter.CHILD // remember that false/true cast respectively to 0/1 match[4] = +( match[4] ? match[5] + (match[6] || 1) : 2 * ( match[3] === "even" || match[3] === "odd" ) ); match[5] = +( ( match[7] + match[8] ) || match[3] === "odd" ); // other types prohibit arguments } else if ( match[3] ) { Sizzle.error( match[0] ); } return match; }, "PSEUDO": function( match ) { var excess, unquoted = !match[5] && match[2]; if ( matchExpr["CHILD"].test( match[0] ) ) { return null; } // Accept quoted arguments as-is if ( match[4] ) { match[2] = match[4]; // Strip excess characters from unquoted arguments } else if ( unquoted && rpseudo.test( unquoted ) && // Get excess from tokenize (recursively) (excess = tokenize( unquoted, true )) && // advance to the next closing parenthesis (excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length) ) { // excess is a negative index match[0] = match[0].slice( 0, excess ); match[2] = unquoted.slice( 0, excess ); } // Return only captures needed by the pseudo filter method (type and argument) return match.slice( 0, 3 ); } }, filter: { "TAG": 
function( nodeNameSelector ) { var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); return nodeNameSelector === "*" ? function() { return true; } : function( elem ) { return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; }; }, "CLASS": function( className ) { var pattern = classCache[ className + " " ]; return pattern || (pattern = new RegExp( "(^|" + whitespace + ")" + className + "(" + whitespace + "|$)" )) && classCache( className, function( elem ) { return pattern.test( typeof elem.className === "string" && elem.className || typeof elem.getAttribute !== strundefined && elem.getAttribute("class") || "" ); }); }, "ATTR": function( name, operator, check ) { return function( elem ) { var result = Sizzle.attr( elem, name ); if ( result == null ) { return operator === "!="; } if ( !operator ) { return true; } result += ""; return operator === "=" ? result === check : operator === "!=" ? result !== check : operator === "^=" ? check && result.indexOf( check ) === 0 : operator === "*=" ? check && result.indexOf( check ) > -1 : operator === "$=" ? check && result.slice( -check.length ) === check : operator === "~=" ? ( " " + result + " " ).indexOf( check ) > -1 : operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : false; }; }, "CHILD": function( type, what, argument, first, last ) { var simple = type.slice( 0, 3 ) !== "nth", forward = type.slice( -4 ) !== "last", ofType = what === "of-type"; return first === 1 && last === 0 ? // Shortcut for :nth-*(n) function( elem ) { return !!elem.parentNode; } : function( elem, context, xml ) { var cache, outerCache, node, diff, nodeIndex, start, dir = simple !== forward ? "nextSibling" : "previousSibling", parent = elem.parentNode, name = ofType && elem.nodeName.toLowerCase(), useCache = !xml && !ofType; if ( parent ) { // :(first|last|only)-(child|of-type) if ( simple ) { while ( dir ) { node = elem; while ( (node = node[ dir ]) ) { if ( ofType ? 
node.nodeName.toLowerCase() === name : node.nodeType === 1 ) { return false; } } // Reverse direction for :only-* (if we haven't yet done so) start = dir = type === "only" && !start && "nextSibling"; } return true; } start = [ forward ? parent.firstChild : parent.lastChild ]; // non-xml :nth-child(...) stores cache data on `parent` if ( forward && useCache ) { // Seek `elem` from a previously-cached index outerCache = parent[ expando ] || (parent[ expando ] = {}); cache = outerCache[ type ] || []; nodeIndex = cache[0] === dirruns && cache[1]; diff = cache[0] === dirruns && cache[2]; node = nodeIndex && parent.childNodes[ nodeIndex ]; while ( (node = ++nodeIndex && node && node[ dir ] || // Fallback to seeking `elem` from the start (diff = nodeIndex = 0) || start.pop()) ) { // When found, cache indexes on `parent` and break if ( node.nodeType === 1 && ++diff && node === elem ) { outerCache[ type ] = [ dirruns, nodeIndex, diff ]; break; } } // Use previously-cached element index if available } else if ( useCache && (cache = (elem[ expando ] || (elem[ expando ] = {}))[ type ]) && cache[0] === dirruns ) { diff = cache[1]; // xml :nth-child(...) or :nth-last-child(...) or :nth(-last)?-of-type(...) } else { // Use the same loop as above to seek `elem` from the start while ( (node = ++nodeIndex && node && node[ dir ] || (diff = nodeIndex = 0) || start.pop()) ) { if ( ( ofType ? 
node.nodeName.toLowerCase() === name : node.nodeType === 1 ) && ++diff ) { // Cache the index of each encountered element if ( useCache ) { (node[ expando ] || (node[ expando ] = {}))[ type ] = [ dirruns, diff ]; } if ( node === elem ) { break; } } } } // Incorporate the offset, then check against cycle size diff -= last; return diff === first || ( diff % first === 0 && diff / first >= 0 ); } }; }, "PSEUDO": function( pseudo, argument ) { // pseudo-class names are case-insensitive // http://www.w3.org/TR/selectors/#pseudo-classes // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters // Remember that setFilters inherits from pseudos var args, fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || Sizzle.error( "unsupported pseudo: " + pseudo ); // The user may use createPseudo to indicate that // arguments are needed to create the filter function // just as Sizzle does if ( fn[ expando ] ) { return fn( argument ); } // But maintain support for old signatures if ( fn.length > 1 ) { args = [ pseudo, pseudo, "", argument ]; return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? markFunction(function( seed, matches ) { var idx, matched = fn( seed, argument ), i = matched.length; while ( i-- ) { idx = indexOf.call( seed, matched[i] ); seed[ idx ] = !( matches[ idx ] = matched[i] ); } }) : function( elem ) { return fn( elem, 0, args ); }; } return fn; } }, pseudos: { // Potentially complex pseudos "not": markFunction(function( selector ) { // Trim the selector passed to compile // to avoid treating leading and trailing // spaces as combinators var input = [], results = [], matcher = compile( selector.replace( rtrim, "$1" ) ); return matcher[ expando ] ? 
markFunction(function( seed, matches, context, xml ) { var elem, unmatched = matcher( seed, null, xml, [] ), i = seed.length; // Match elements unmatched by `matcher` while ( i-- ) { if ( (elem = unmatched[i]) ) { seed[i] = !(matches[i] = elem); } } }) : function( elem, context, xml ) { input[0] = elem; matcher( input, null, xml, results ); return !results.pop(); }; }), "has": markFunction(function( selector ) { return function( elem ) { return Sizzle( selector, elem ).length > 0; }; }), "contains": markFunction(function( text ) { return function( elem ) { return ( elem.textContent || elem.innerText || getText( elem ) ).indexOf( text ) > -1; }; }), // "Whether an element is represented by a :lang() selector // is based solely on the element's language value // being equal to the identifier C, // or beginning with the identifier C immediately followed by "-". // The matching of C against the element's language value is performed case-insensitively. // The identifier C does not have to be a valid language name." // http://www.w3.org/TR/selectors/#lang-pseudo "lang": markFunction( function( lang ) { // lang value must be a valid identifier if ( !ridentifier.test(lang || "") ) { Sizzle.error( "unsupported lang: " + lang ); } lang = lang.replace( runescape, funescape ).toLowerCase(); return function( elem ) { var elemLang; do { if ( (elemLang = documentIsHTML ? 
elem.lang : elem.getAttribute("xml:lang") || elem.getAttribute("lang")) ) { elemLang = elemLang.toLowerCase(); return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; } } while ( (elem = elem.parentNode) && elem.nodeType === 1 ); return false; }; }), // Miscellaneous "target": function( elem ) { var hash = window.location && window.location.hash; return hash && hash.slice( 1 ) === elem.id; }, "root": function( elem ) { return elem === docElem; }, "focus": function( elem ) { return elem === document.activeElement && (!document.hasFocus || document.hasFocus()) && !!(elem.type || elem.href || ~elem.tabIndex); }, // Boolean properties "enabled": function( elem ) { return elem.disabled === false; }, "disabled": function( elem ) { return elem.disabled === true; }, "checked": function( elem ) { // In CSS3, :checked should return both checked and selected elements // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked var nodeName = elem.nodeName.toLowerCase(); return (nodeName === "input" && !!elem.checked) || (nodeName === "option" && !!elem.selected); }, "selected": function( elem ) { // Accessing this property makes selected-by-default // options in Safari work properly if ( elem.parentNode ) { elem.parentNode.selectedIndex; } return elem.selected === true; }, // Contents "empty": function( elem ) { // http://www.w3.org/TR/selectors/#empty-pseudo // :empty is only affected by element nodes and content nodes(including text(3), cdata(4)), // not comment, processing instructions, or others // Thanks to Diego Perini for the nodeName shortcut // Greater than "@" means alpha characters (specifically not starting with "#" or "?") for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { if ( elem.nodeName > "@" || elem.nodeType === 3 || elem.nodeType === 4 ) { return false; } } return true; }, "parent": function( elem ) { return !Expr.pseudos["empty"]( elem ); }, // Element/input types "header": function( elem ) { return rheader.test( elem.nodeName 
); }, "input": function( elem ) { return rinputs.test( elem.nodeName ); }, "button": function( elem ) { var name = elem.nodeName.toLowerCase(); return name === "input" && elem.type === "button" || name === "button"; }, "text": function( elem ) { var attr; // IE6 and 7 will map elem.type to 'text' for new HTML5 types (search, etc) // use getAttribute instead to test this case return elem.nodeName.toLowerCase() === "input" && elem.type === "text" && ( (attr = elem.getAttribute("type")) == null || attr.toLowerCase() === elem.type ); }, // Position-in-collection "first": createPositionalPseudo(function() { return [ 0 ]; }), "last": createPositionalPseudo(function( matchIndexes, length ) { return [ length - 1 ]; }), "eq": createPositionalPseudo(function( matchIndexes, length, argument ) { return [ argument < 0 ? argument + length : argument ]; }), "even": createPositionalPseudo(function( matchIndexes, length ) { var i = 0; for ( ; i < length; i += 2 ) { matchIndexes.push( i ); } return matchIndexes; }), "odd": createPositionalPseudo(function( matchIndexes, length ) { var i = 1; for ( ; i < length; i += 2 ) { matchIndexes.push( i ); } return matchIndexes; }), "lt": createPositionalPseudo(function( matchIndexes, length, argument ) { var i = argument < 0 ? argument + length : argument; for ( ; --i >= 0; ) { matchIndexes.push( i ); } return matchIndexes; }), "gt": createPositionalPseudo(function( matchIndexes, length, argument ) { var i = argument < 0 ? 
argument + length : argument;
			for ( ; ++i < length; ) {
				matchIndexes.push( i );
			}
			return matchIndexes;
		})
	}
};

// Add button/input type pseudos
for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) {
	Expr.pseudos[ i ] = createInputPseudo( i );
}
for ( i in { submit: true, reset: true } ) {
	Expr.pseudos[ i ] = createButtonPseudo( i );
}

/**
 * Splits a selector string into comma-separated groups of tokens.
 * Each token is { value, type } for a combinator or { value, type, matches } for a filter.
 * @param {String} selector
 * @param {Boolean} [parseOnly] When truthy, only measure how much could not be parsed
 * @returns With parseOnly: the length of the unparsed remainder (0 on a cache hit).
 *	Otherwise a copy of the token groups, stored in tokenCache.
 *	Calls Sizzle.error( selector ) when unparsed input remains.
 */
function tokenize( selector, parseOnly ) {
	var matched, match, tokens, type,
		soFar, groups, preFilters,
		cached = tokenCache[ selector + " " ];

	if ( cached ) {
		return parseOnly ? 0 : cached.slice( 0 );
	}

	soFar = selector;
	groups = [];
	preFilters = Expr.preFilter;

	while ( soFar ) {

		// Comma and first run
		// (`matched` is still undefined on the first pass, which opens the first group)
		if ( !matched || (match = rcomma.exec( soFar )) ) {
			if ( match ) {
				// Don't consume trailing commas as valid
				soFar = soFar.slice( match[0].length ) || soFar;
			}
			groups.push( tokens = [] );
		}

		matched = false;

		// Combinators
		if ( (match = rcombinators.exec( soFar )) ) {
			matched = match.shift();
			tokens.push( {
				value: matched,
				// Cast descendant combinators to space
				type: match[0].replace( rtrim, " " )
			} );
			soFar = soFar.slice( matched.length );
		}

		// Filters
		// A preFilter may return a falsy value to reject the match
		for ( type in Expr.filter ) {
			if ( (match = matchExpr[ type ].exec( soFar )) && (!preFilters[ type ] ||
				(match = preFilters[ type ]( match ))) ) {
				matched = match.shift();
				tokens.push( {
					value: matched,
					type: type,
					matches: match
				} );
				soFar = soFar.slice( matched.length );
			}
		}

		// Nothing consumed in this pass: stop and let the return below decide
		if ( !matched ) {
			break;
		}
	}

	// Return the length of the invalid excess
	// if we're just parsing
	// Otherwise, throw an error or return tokens
	return parseOnly ?
		soFar.length :
		soFar ?
			Sizzle.error( selector ) :
			// Cache the tokens
			tokenCache( selector, groups ).slice( 0 );
}

/**
 * Rebuilds selector text by concatenating the `value` of every token.
 * @param {Array} tokens
 * @returns {String}
 */
function toSelector( tokens ) {
	var i = 0,
		len = tokens.length,
		selector = "";
	for ( ; i < len; i++ ) {
		selector += tokens[i].value;
	}
	return selector;
}

/**
 * Wraps a matcher so it is applied to nodes reached by walking `combinator.dir`
 * from the element instead of to the element itself.
 * @param {Function} matcher Called as matcher( elem, context, xml )
 * @param {Object} combinator Entry from Expr.relative: { dir, first }
 * @param {Boolean} [base] When truthy and dir is "parentNode", non-element nodes are tested too
 * @returns {Function} Matcher taking ( elem, context, xml )
 */
function addCombinator( matcher, combinator, base ) {
	var dir = combinator.dir,
		checkNonElements = base && dir === "parentNode",
		// Each combinator matcher takes the next value of the `done` counter for its cache key
		doneName = done++;

	return combinator.first ?
		// Check against closest ancestor/preceding element
		function( elem, context, xml ) {
			while ( (elem = elem[ dir ]) ) {
				if ( elem.nodeType === 1 || checkNonElements ) {
					return matcher( elem, context, xml );
				}
			}
		} :

		// Check against all ancestor/preceding elements
		function( elem, context, xml ) {
			var data, cache, outerCache,
				dirkey = dirruns + " " + doneName;

			// We can't set arbitrary data on XML nodes, so they don't benefit from dir caching
			if ( xml ) {
				while ( (elem = elem[ dir ]) ) {
					if ( elem.nodeType === 1 || checkNonElements ) {
						if ( matcher( elem, context, xml ) ) {
							return true;
						}
					}
				}
			} else {
				while ( (elem = elem[ dir ]) ) {
					if ( elem.nodeType === 1 || checkNonElements ) {
						outerCache = elem[ expando ] || (elem[ expando ] = {});
						// Reuse a result stored under the same dirkey: `true` is a match,
						// the current `cachedruns` value is a recorded failure
						if ( (cache = outerCache[ dir ]) && cache[0] === dirkey ) {
							if ( (data = cache[1]) === true || data === cachedruns ) {
								return data === true;
							}
						} else {
							cache = outerCache[ dir ] = [ dirkey ];
							cache[1] = matcher( elem, context, xml ) || cachedruns;
							if ( cache[1] === true ) {
								return true;
							}
						}
					}
				}
			}
		};
}

/**
 * Combines matchers into one that passes only when all of them pass.
 * The matchers are evaluated from last to first; a single matcher is returned as-is.
 * @param {Array} matchers
 * @returns {Function}
 */
function elementMatcher( matchers ) {
	return matchers.length > 1 ?
		function( elem, context, xml ) {
			var i = matchers.length;
			while ( i-- ) {
				if ( !matchers[i]( elem, context, xml ) ) {
					return false;
				}
			}
			return true;
		} :
		matchers[0];
}

/**
 * Copies the truthy entries of `unmatched` that pass `filter` into a new array.
 * @param {Array} unmatched Possibly sparse list of elements
 * @param {Array} [map] When non-null, receives the original index of each kept entry
 * @param {Function} [filter] Called as filter( elem, context, xml ); omitted means keep all
 * @param context Passed through to filter
 * @param xml Passed through to filter
 * @returns {Array} The kept elements
 */
function condense( unmatched, map, filter, context, xml ) {
	var elem,
		newUnmatched = [],
		i = 0,
		len = unmatched.length,
		mapped = map != null;

	for ( ; i < len; i++ ) {
		if ( (elem = unmatched[i]) ) {
			if ( !filter || filter( elem, context, xml ) ) {
				newUnmatched.push( elem );
				if ( mapped ) {
					map.push( i );
				}
			}
		}
	}

	return newUnmatched;
}

// Builds a marked matcher that works on whole sets (seed/results) rather than single elements.
// postFilter and postFinder are themselves wrapped as set matchers unless already marked.
function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) {
	if ( postFilter && !postFilter[ expando ] ) {
		postFilter = setMatcher( postFilter );
	}
	if ( postFinder && !postFinder[ expando ] ) {
		postFinder = setMatcher( postFinder, postSelector );
	}
	return markFunction(function( seed, results, context, xml ) {
		var temp, i, elem,
			preMap = [],
			postMap = [],
			preexisting = results.length,

			// Get initial elements from seed or context
			elems = seed || multipleContexts( selector || "*", context.nodeType ? [ context ] : context, [] ),

			// Prefilter to get matcher input, preserving a map for seed-results synchronization
			matcherIn = preFilter && ( seed || !selector ) ?
				condense( elems, preMap, preFilter, context, xml ) :
				elems,

			matcherOut = matcher ?
				// If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results,
				postFinder || ( seed ? preFilter : preexisting || postFilter ) ?
// ...intermediate processing is necessary [] : // ...otherwise use results directly results : matcherIn; // Find primary matches if ( matcher ) { matcher( matcherIn, matcherOut, context, xml ); } // Apply postFilter if ( postFilter ) { temp = condense( matcherOut, postMap ); postFilter( temp, [], context, xml ); // Un-match failing elements by moving them back to matcherIn i = temp.length; while ( i-- ) { if ( (elem = temp[i]) ) { matcherOut[ postMap[i] ] = !(matcherIn[ postMap[i] ] = elem); } } } if ( seed ) { if ( postFinder || preFilter ) { if ( postFinder ) { // Get the final matcherOut by condensing this intermediate into postFinder contexts temp = []; i = matcherOut.length; while ( i-- ) { if ( (elem = matcherOut[i]) ) { // Restore matcherIn since elem is not yet a final match temp.push( (matcherIn[i] = elem) ); } } postFinder( null, (matcherOut = []), temp, xml ); } // Move matched elements from seed to results to keep them synchronized i = matcherOut.length; while ( i-- ) { if ( (elem = matcherOut[i]) && (temp = postFinder ? indexOf.call( seed, elem ) : preMap[i]) > -1 ) { seed[temp] = !(results[temp] = elem); } } } // Add elements to results, through postFinder if defined } else { matcherOut = condense( matcherOut === results ? matcherOut.splice( preexisting, matcherOut.length ) : matcherOut ); if ( postFinder ) { postFinder( null, results, matcherOut, xml ); } else { push.apply( results, matcherOut ); } } }); } function matcherFromTokens( tokens ) { var checkContext, matcher, j, len = tokens.length, leadingRelative = Expr.relative[ tokens[0].type ], implicitRelative = leadingRelative || Expr.relative[" "], i = leadingRelative ? 
1 : 0, // The foundational matcher ensures that elements are reachable from top-level context(s) matchContext = addCombinator( function( elem ) { return elem === checkContext; }, implicitRelative, true ), matchAnyContext = addCombinator( function( elem ) { return indexOf.call( checkContext, elem ) > -1; }, implicitRelative, true ), matchers = [ function( elem, context, xml ) { return ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( (checkContext = context).nodeType ? matchContext( elem, context, xml ) : matchAnyContext( elem, context, xml ) ); } ]; for ( ; i < len; i++ ) { if ( (matcher = Expr.relative[ tokens[i].type ]) ) { matchers = [ addCombinator(elementMatcher( matchers ), matcher) ]; } else { matcher = Expr.filter[ tokens[i].type ].apply( null, tokens[i].matches ); // Return special upon seeing a positional matcher if ( matcher[ expando ] ) { // Find the next relative operator (if any) for proper handling j = ++i; for ( ; j < len; j++ ) { if ( Expr.relative[ tokens[j].type ] ) { break; } } return setMatcher( i > 1 && elementMatcher( matchers ), i > 1 && toSelector( tokens.slice( 0, i - 1 ) ).replace( rtrim, "$1" ), matcher, i < j && matcherFromTokens( tokens.slice( i, j ) ), j < len && matcherFromTokens( (tokens = tokens.slice( j )) ), j < len && toSelector( tokens ) ); } matchers.push( matcher ); } } return elementMatcher( matchers ); } function matcherFromGroupMatchers( elementMatchers, setMatchers ) { // A counter to specify which element is currently being matched var matcherCachedRuns = 0, bySet = setMatchers.length > 0, byElement = elementMatchers.length > 0, superMatcher = function( seed, context, xml, results, expandContext ) { var elem, j, matcher, setMatched = [], matchedCount = 0, i = "0", unmatched = seed && [], outermost = expandContext != null, contextBackup = outermostContext, // We must always have either seed elements or context elems = seed || byElement && Expr.find["TAG"]( "*", expandContext && context.parentNode || 
context ), // Use integer dirruns iff this is the outermost matcher dirrunsUnique = (dirruns += contextBackup == null ? 1 : Math.random() || 0.1); if ( outermost ) { outermostContext = context !== document && context; cachedruns = matcherCachedRuns; } // Add elements passing elementMatchers directly to results // Keep `i` a string if there are no elements so `matchedCount` will be "00" below for ( ; (elem = elems[i]) != null; i++ ) { if ( byElement && elem ) { j = 0; while ( (matcher = elementMatchers[j++]) ) { if ( matcher( elem, context, xml ) ) { results.push( elem ); break; } } if ( outermost ) { dirruns = dirrunsUnique; cachedruns = ++matcherCachedRuns; } } // Track unmatched elements for set filters if ( bySet ) { // They will have gone through all possible matchers if ( (elem = !matcher && elem) ) { matchedCount--; } // Lengthen the array for every element, matched or not if ( seed ) { unmatched.push( elem ); } } } // Apply set filters to unmatched elements matchedCount += i; if ( bySet && i !== matchedCount ) { j = 0; while ( (matcher = setMatchers[j++]) ) { matcher( unmatched, setMatched, context, xml ); } if ( seed ) { // Reintegrate element matches to eliminate the need for sorting if ( matchedCount > 0 ) { while ( i-- ) { if ( !(unmatched[i] || setMatched[i]) ) { setMatched[i] = pop.call( results ); } } } // Discard index placeholder values to get only actual matches setMatched = condense( setMatched ); } // Add matches to results push.apply( results, setMatched ); // Seedless set matches succeeding multiple successful matchers stipulate sorting if ( outermost && !seed && setMatched.length > 0 && ( matchedCount + setMatchers.length ) > 1 ) { Sizzle.uniqueSort( results ); } } // Override manipulation of globals by nested matchers if ( outermost ) { dirruns = dirrunsUnique; outermostContext = contextBackup; } return unmatched; }; return bySet ? 
markFunction( superMatcher ) :
		superMatcher;
}

/**
 * Compiles a selector into a matcher function, memoised in compilerCache.
 * @param {String} selector
 * @param {Array} [group] Pre-tokenized groups for `selector`; tokenized here when omitted
 * @returns {Function} The function built by matcherFromGroupMatchers
 */
compile = Sizzle.compile = function( selector, group /* Internal Use Only */ ) {
	var i,
		setMatchers = [],
		elementMatchers = [],
		cached = compilerCache[ selector + " " ];

	if ( !cached ) {
		// Generate a function of recursive functions that can be used to check each element
		if ( !group ) {
			group = tokenize( selector );
		}
		i = group.length;
		while ( i-- ) {
			cached = matcherFromTokens( group[i] );
			// Matchers carrying the expando mark operate on sets; the rest test one element
			if ( cached[ expando ] ) {
				setMatchers.push( cached );
			} else {
				elementMatchers.push( cached );
			}
		}

		// Cache the compiled function
		cached = compilerCache( selector, matcherFromGroupMatchers( elementMatchers, setMatchers ) );
	}
	return cached;
};

/**
 * Runs one selector against each of several contexts, accumulating into `results`.
 * @param {String} selector
 * @param {Array} contexts
 * @param {Array} results
 * @returns {Array} `results`
 */
function multipleContexts( selector, contexts, results ) {
	var i = 0,
		len = contexts.length;
	for ( ; i < len; i++ ) {
		Sizzle( selector, contexts[i], results );
	}
	return results;
}

/**
 * Finds the elements matching `selector` under `context` by compiling and running a matcher.
 * Without a seed and with a single selector group it first tries to narrow the work:
 * a leading ID token becomes the context, and the rightmost token that has an
 * Expr.find method supplies the seed set.
 * @param {String} selector
 * @param {Element|Object} context
 * @param {Array} results Receives the matches
 * @param {Array} [seed] Elements to match against
 * @returns {Array} `results`
 */
function select( selector, context, results, seed ) {
	var i, tokens, token, type, find,
		match = tokenize( selector );

	if ( !seed ) {
		// Try to minimize operations if there is only one group
		if ( match.length === 1 ) {

			// Take a shortcut and set the context if the root selector is an ID
			// (work on a copy so the tokens shifted/spliced below do not alter the first)
			tokens = match[0] = match[0].slice( 0 );
			if ( tokens.length > 2 && (token = tokens[0]).type === "ID" &&
					context.nodeType === 9 && documentIsHTML &&
					Expr.relative[ tokens[1].type ] ) {

				context = ( Expr.find["ID"]( token.matches[0].replace(runescape, funescape), context ) || [] )[0];
				if ( !context ) {
					return results;
				}

				selector = selector.slice( tokens.shift().value.length );
			}

			// Fetch a seed set for right-to-left matching
			i = matchExpr["needsContext"].test( selector ) ? 0 : tokens.length;
			while ( i-- ) {
				token = tokens[i];

				// Abort if we hit a combinator
				if ( Expr.relative[ (type = token.type) ] ) {
					break;
				}
				if ( (find = Expr.find[ type ]) ) {
					// Search, expanding context for leading sibling combinators
					if ( (seed = find(
						token.matches[0].replace( runescape, funescape ),
						rsibling.test( tokens[0].type ) && context.parentNode || context
					)) ) {

						// If seed is empty or no tokens remain, we can return early
						tokens.splice( i, 1 );
						selector = seed.length && toSelector( tokens );
						if ( !selector ) {
							push.apply( results, seed );
							return results;
						}

						break;
					}
				}
			}
		}
	}

	// Compile and execute a filtering function
	// Provide `match` to avoid retokenization if we modified the selector above
	compile( selector, match )(
		seed,
		context,
		!documentIsHTML,
		results,
		rsibling.test( selector )
	);
	return results;
}

// Deprecated
Expr.pseudos["nth"] = Expr.pseudos["eq"];

// Easy API for creating new setFilters
// Expr.setFilters is an instance whose prototype is Expr.pseudos (also exposed as Expr.filters)
function setFilters() {}
setFilters.prototype = Expr.filters = Expr.pseudos;
Expr.setFilters = new setFilters();

// One-time assignments

// Sort stability
support.sortStable = expando.split("").sort( sortOrder ).join("") === expando;

// Initialize against the default document
setDocument();

// Support: Chrome<14
// Always assume duplicates if they aren't passed to the comparison function
[0, 0].sort( sortOrder );
support.detectDuplicates = hasDuplicate;

// Support: IE<8
// Prevent attribute/property "interpolation"
assert(function( div ) {
	div.innerHTML = "<a href='#'></a>";
	if ( div.firstChild.getAttribute("href") !== "#" ) {
		var attrs = "type|href|height|width".split("|"),
			i = attrs.length;
		while ( i-- ) {
			Expr.attrHandle[ attrs[i] ] = interpolationHandler;
		}
	}
});

// Support: IE<9
// Use getAttributeNode to fetch booleans when getAttribute lies
assert(function( div ) {
	if ( div.getAttribute("disabled") != null ) {
		var attrs = booleans.split("|"),
			i = attrs.length;
		while ( i-- ) {
			Expr.attrHandle[ attrs[i] ] = boolHandler;
		}
	}
});

jQuery.find = Sizzle;
/*
 * Create a callback list.
 *
 * options: an optional space-separated string of flags, or an object whose
 * truthy keys are the flags. String forms are converted once and cached in
 * optionsCache; object forms are shallow-copied so the caller's object is
 * never mutated.
 *
 * Recognized flags:
 *
 *	once:			the list can only be fired once
 *	memory:			remember the last fired values and immediately call any
 *					callback added afterwards with them
 *	unique:			a callback can only be present once in the list
 *	stopOnFalse:	stop calling further callbacks when one returns false
 *
 * Returns the Callbacks object (add/remove/has/empty/disable/disabled/
 * lock/locked/fireWith/fire/fired).
 */
jQuery.Callbacks = function( options ) {

	// Convert options from String-formatted to Object-formatted if needed
	// (we check in cache first)
	options = typeof options === "string" ?
		( optionsCache[ options ] || createOptions( options ) ) :
		jQuery.extend( {}, options );

	var // Last fire value (for non-forgettable lists)
		memory,
		// Flag to know if list was already fired
		fired,
		// Flag to know if list is currently firing
		firing,
		// First callback to fire (used internally by add and fireWith)
		firingStart,
		// End of the loop when firing
		firingLength,
		// Index of currently firing callback (modified by remove if needed)
		firingIndex,
		// Actual callback list; undefined once the list has been disabled
		list = [],
		// Stack of fire calls for repeatable lists (false for "once" lists)
		stack = !options.once && [],
		// Fire callbacks; data is [ context, args ]
		fire = function( data ) {
			memory = options.memory && data;
			fired = true;
			firingIndex = firingStart || 0;
			firingStart = 0;
			firingLength = list.length;
			firing = true;
			// `list` is re-checked every turn because a callback may disable the list
			for ( ; list && firingIndex < firingLength; firingIndex++ ) {
				if ( list[ firingIndex ].apply( data[ 0 ], data[ 1 ] ) === false && options.stopOnFalse ) {
					memory = false; // To prevent further calls using add
					break;
				}
			}
			firing = false;
			if ( list ) {
				if ( stack ) {
					// Replay a fire call that was requested while we were firing
					if ( stack.length ) {
						fire( stack.shift() );
					}
				} else if ( memory ) {
					// "once memory": keep the values, drop the callbacks
					list = [];
				} else {
					self.disable();
				}
			}
		},
		// Actual Callbacks object
		self = {
			// Add a callback or a collection of callbacks to the list
			add: function() {
				if ( list ) {
					// First, we save the current length
					var start = list.length;
					(function add( args ) {
						jQuery.each( args, function( _, arg ) {
							var type = jQuery.type( arg );
							if ( type === "function" ) {
								if ( !options.unique || !self.has( arg ) ) {
									list.push( arg );
								}
							} else if ( arg && arg.length && type !== "string" ) {
								// Inspect recursively
								add( arg );
							}
						});
					})( arguments );
					// Do we need to add the callbacks to the
					// current firing batch?
					if ( firing ) {
						firingLength = list.length;
					// With memory, if we're not firing then
					// we should call right away
					} else if ( memory ) {
						firingStart = start;
						fire( memory );
					}
				}
				return this;
			},
			// Remove a callback from the list
			remove: function() {
				if ( list ) {
					jQuery.each( arguments, function( _, arg ) {
						var index;
						while( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) {
							list.splice( index, 1 );
							// Handle firing indexes
							if ( firing ) {
								if ( index <= firingLength ) {
									firingLength--;
								}
								if ( index <= firingIndex ) {
									firingIndex--;
								}
							}
						}
					});
				}
				return this;
			},
			// Check if a given callback is in the list.
			// If no argument is given, return whether or not list has callbacks attached.
			has: function( fn ) {
				return fn ? jQuery.inArray( fn, list ) > -1 : !!( list && list.length );
			},
			// Remove all callbacks from the list.
			// A disabled list stays disabled: resetting `list` unconditionally
			// would bring it back to life and make disabled() report false.
			empty: function() {
				if ( list ) {
					list = [];
					firingLength = 0;
				}
				return this;
			},
			// Have the list do nothing anymore
			disable: function() {
				list = stack = memory = undefined;
				return this;
			},
			// Is it disabled?
			disabled: function() {
				return !list;
			},
			// Lock the list in its current state
			lock: function() {
				stack = undefined;
				if ( !memory ) {
					self.disable();
				}
				return this;
			},
			// Is it locked?
			locked: function() {
				return !stack;
			},
			// Call all callbacks with the given context and arguments
			fireWith: function( context, args ) {
				args = args || [];
				// Copy array arguments so later mutation by the caller is not observed
				args = [ context, args.slice ? args.slice() : args ];
				if ( list && ( !fired || stack ) ) {
					if ( firing ) {
						stack.push( args );
					} else {
						fire( args );
					}
				}
				return this;
			},
			// Call all the callbacks with the given arguments
			fire: function() {
				self.fireWith( this, arguments );
				return this;
			},
			// To know if the callbacks have already been called at least once
			fired: function() {
				return !!fired;
			}
		};

	return self;
};
jQuery.extend( obj, promise ) : promise; } }, deferred = {}; // Keep pipe for back-compat promise.pipe = promise.then; // Add list-specific methods jQuery.each( tuples, function( i, tuple ) { var list = tuple[ 2 ], stateString = tuple[ 3 ]; // promise[ done | fail | progress ] = list.add promise[ tuple[1] ] = list.add; // Handle state if ( stateString ) { list.add(function() { // state = [ resolved | rejected ] state = stateString; // [ reject_list | resolve_list ].disable; progress_list.lock }, tuples[ i ^ 1 ][ 2 ].disable, tuples[ 2 ][ 2 ].lock ); } // deferred[ resolve | reject | notify ] deferred[ tuple[0] ] = function() { deferred[ tuple[0] + "With" ]( this === deferred ? promise : this, arguments ); return this; }; deferred[ tuple[0] + "With" ] = list.fireWith; }); // Make the deferred a promise promise.promise( deferred ); // Call given func if any if ( func ) { func.call( deferred, deferred ); } // All done! return deferred; }, // Deferred helper when: function( subordinate /* , ..., subordinateN */ ) { var i = 0, resolveValues = core_slice.call( arguments ), length = resolveValues.length, // the count of uncompleted subordinates remaining = length !== 1 || ( subordinate && jQuery.isFunction( subordinate.promise ) ) ? length : 0, // the master Deferred. If resolveValues consist of only a single Deferred, just use that. deferred = remaining === 1 ? subordinate : jQuery.Deferred(), // Update function for both resolve and progress values updateFunc = function( i, contexts, values ) { return function( value ) { contexts[ i ] = this; values[ i ] = arguments.length > 1 ? 
// Feature-detection results, computed once at load time.
// The IIFE receives an empty object, fills it with boolean flags derived from
// probing freshly created (detached) elements, and returns it. Tests that need
// an attached element are deferred to document ready and write their flags onto
// the same object later, so some flags hold provisional defaults until then.
jQuery.support = (function( support ) {
	var input = document.createElement("input"),
		fragment = document.createDocumentFragment(),
		div = document.createElement("div"),
		select = document.createElement("select"),
		opt = select.appendChild( document.createElement("option") );

	// Finish early in limited environments
	// (the flags below are then never set and `support` stays as passed in)
	if ( !input.type ) {
		return support;
	}

	input.type = "checkbox";

	// Support: Safari 5.1, iOS 5.1, Android 4.x, Android 2.3
	// Check the default checkbox/radio value ("" on old WebKit; "on" elsewhere)
	support.checkOn = input.value !== "";

	// Must access the parent to make an option select properly
	// Support: IE9, IE10
	support.optSelected = opt.selected;

	// Will be defined later
	// (provisional values; overwritten by the document-ready tests below)
	support.reliableMarginRight = true;
	support.boxSizingReliable = true;
	support.pixelPosition = false;

	// Make sure checked status is properly cloned
	// Support: IE9, IE10
	input.checked = true;
	support.noCloneChecked = input.cloneNode( true ).checked;

	// Make sure that the options inside disabled selects aren't marked as disabled
	// (WebKit marks them as disabled)
	select.disabled = true;
	support.optDisabled = !opt.disabled;

	// Check if an input maintains its value after becoming a radio
	// Support: IE9, IE10
	// (a new input is created so the value is assigned before the type changes)
	input = document.createElement("input");
	input.value = "t";
	input.type = "radio";
	support.radioValue = input.value === "t";

	// #11217 - WebKit loses check when the name is after the checked attribute
	input.setAttribute( "checked", "t" );
	input.setAttribute( "name", "t" );

	fragment.appendChild( input );

	// Support: Safari 5.1, Android 4.x, Android 2.3
	// old WebKit doesn't clone checked state correctly in fragments
	support.checkClone = fragment.cloneNode( true ).cloneNode( true ).lastChild.checked;

	// Support: Firefox, Chrome, Safari
	// Beware of CSP restrictions (https://developer.mozilla.org/en/Security/CSP)
	support.focusinBubbles = "onfocusin" in window;

	// True when clearing a style on a clone leaves the source element's style intact
	div.style.backgroundClip = "content-box";
	div.cloneNode( true ).style.backgroundClip = "";
	support.clearCloneStyle = div.style.backgroundClip === "content-box";

	// Run tests that need a body at doc ready
	jQuery(function() {
		var container, marginDiv,
			// Support: Firefox, Android 2.3 (Prefixed box-sizing versions).
			divReset = "padding:0;margin:0;border:0;display:block;-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box",
			body = document.getElementsByTagName("body")[ 0 ];

		if ( !body ) {
			// Return for frameset docs that don't have a body
			return;
		}

		// Off-screen, zero-sized wrapper that hosts the probe div while it is attached
		container = document.createElement("div");
		container.style.cssText = "border:0;width:0;height:0;position:absolute;top:0;left:-9999px;margin-top:1px";

		// Check box-sizing and margin behavior.
		body.appendChild( container ).appendChild( div );
		div.innerHTML = "";
		// Support: Firefox, Android 2.3 (Prefixed box-sizing versions).
		div.style.cssText = "-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;padding:1px;border:1px;display:block;width:4px;margin-top:1%;position:absolute;top:1%";

		// Workaround failing boxSizing test due to offsetWidth returning wrong value
		// with some non-1 values of body zoom, ticket #13543
		jQuery.swap( body, body.style.zoom != null ? { zoom: 1 } : {}, function() {
			support.boxSizing = div.offsetWidth === 4;
		});

		// Use window.getComputedStyle because jsdom on node.js will break without it.
		if ( window.getComputedStyle ) {
			// pixelPosition: the computed `top` is reported as something other than the authored "1%"
			support.pixelPosition = ( window.getComputedStyle( div, null ) || {} ).top !== "1%";
			// boxSizingReliable: the computed width matches the authored 4px
			// (a missing computed style falls back to the passing value)
			support.boxSizingReliable = ( window.getComputedStyle( div, null ) || { width: "4px" } ).width === "4px";

			// Support: Android 2.3
			// Check if div with explicit width and no margin-right incorrectly
			// gets computed margin-right based on width of container. (#3333)
			// WebKit Bug 13343 - getComputedStyle returns wrong value for margin-right
			marginDiv = div.appendChild( document.createElement("div") );
			marginDiv.style.cssText = div.style.cssText = divReset;
			marginDiv.style.marginRight = marginDiv.style.width = "0";
			div.style.width = "1px";

			support.reliableMarginRight =
				!parseFloat( ( window.getComputedStyle( marginDiv, null ) || {} ).marginRight );
		}

		// Detach the probe elements again
		body.removeChild( container );
	});

	return support;
})( {} );
/**
 * Per-instance data store.
 *
 * Each instance owns a `cache` object keyed by a numeric "unlock" id, and a
 * unique `expando` property name under which that id is stored on the owner.
 * Slot 0 of the cache is a getter that returns a fresh empty object on every
 * read; it is the slot handed out for owners that do not accept data.
 */
function Data() {
	// Support: Android < 4,
	// Old WebKit does not have Object.preventExtensions/freeze method,
	// return new empty object instead with no [[set]] accessor
	Object.defineProperty( this.cache = {}, 0, {
		get: function() {
			return {};
		}
	});

	this.expando = jQuery.expando + Math.random();
}

// Next unlock id to hand out; shared by all Data instances
Data.uid = 1;

/**
 * Tell whether `owner` may carry data.
 * Objects with a nodeType are accepted only for nodeType 1 or 9; anything
 * without a (truthy) nodeType is accepted.
 */
Data.accepts = function( owner ) {
	// Accepts only:
	//  - Node
	//    - Node.ELEMENT_NODE
	//    - Node.DOCUMENT_NODE
	//  - Object
	//    - Any
	return owner.nodeType ?
		owner.nodeType === 1 || owner.nodeType === 9 : true;
};

Data.prototype = {
	/**
	 * Return the cache key for `owner`, creating the key (and an empty cache
	 * entry) on first use. Returns 0 for owners that do not accept data.
	 */
	key: function( owner ) {
		// We can accept data for non-element nodes in modern browsers,
		// but we should not, see #8335.
		// Always return the key for a frozen object.
		if ( !Data.accepts( owner ) ) {
			return 0;
		}

		var descriptor = {},
			// Check if the owner object already has a cache key
			unlock = owner[ this.expando ];

		// If not, create one
		if ( !unlock ) {
			unlock = Data.uid++;

			// Secure it in a non-enumerable, non-writable property
			try {
				descriptor[ this.expando ] = { value: unlock };
				Object.defineProperties( owner, descriptor );

			// Support: Android < 4
			// Fallback to a less secure definition
			} catch ( e ) {
				descriptor[ this.expando ] = unlock;
				jQuery.extend( owner, descriptor );
			}
		}

		// Ensure the cache object
		if ( !this.cache[ unlock ] ) {
			this.cache[ unlock ] = {};
		}

		return unlock;
	},

	/**
	 * Store data for `owner`.
	 * Signatures: ( owner, key, value ) or ( owner, { properties } ).
	 */
	set: function( owner, data, value ) {
		var prop,
			// There may be an unlock assigned to this node,
			// if there is no entry for this "owner", create one inline
			// and set the unlock as though an owner entry had always existed
			unlock = this.key( owner ),
			cache = this.cache[ unlock ];

		// Handle: [ owner, key, value ] args
		if ( typeof data === "string" ) {
			cache[ data ] = value;

		// Handle: [ owner, { properties } ] args
		} else {
			// Support an expectation from the old data system where plain
			// objects used to initialize would be set to the cache by
			// reference, instead of having properties and values copied.
			// Note, this will kill the connection between
			// "this.cache[ unlock ]" and "cache"
			if ( jQuery.isEmptyObject( cache ) ) {
				this.cache[ unlock ] = data;
			// Otherwise, copy the properties one-by-one to the cache object
			} else {
				for ( prop in data ) {
					cache[ prop ] = data[ prop ];
				}
			}
		}
	},

	/**
	 * Return the whole cache object for `owner` when `key` is undefined,
	 * otherwise the value stored under `key`.
	 */
	get: function( owner, key ) {
		// Either a valid cache is found, or will be created.
		// New caches will be created and the unlock returned,
		// allowing direct access to the newly created
		// empty data object. A valid owner object must be provided.
		var cache = this.cache[ this.key( owner ) ];

		return key === undefined ?
			cache : cache[ key ];
	},

	/**
	 * Combined getter/setter.
	 * Reads when no key is given, or when a non-empty string key is given
	 * without a value; otherwise writes and returns the value (or the key
	 * object when no value was passed).
	 */
	access: function( owner, key, value ) {
		// In cases where either:
		//
		//   1. No key was specified
		//   2. A string key was specified, but no value provided
		//
		// Take the "read" path and allow the get method to determine
		// which value to return, respectively either:
		//
		//   1. The entire cache object
		//   2. The data stored at the key
		//
		if ( key === undefined ||
				((key && typeof key === "string") && value === undefined) ) {
			return this.get( owner, key );
		}

		// [*]When the key is not a string, or both a key and value
		// are specified, set or extend (existing objects) with either:
		//
		//   1. An object of properties
		//   2. A key and value
		//
		this.set( owner, key, value );

		// Since the "set" path can have two possible entry points
		// return the expected data based on which path was taken[*]
		return value !== undefined ? value : key;
	},

	/**
	 * Remove one key, several keys (array or space-separated string), or,
	 * when `key` is undefined, everything stored for `owner`.
	 */
	remove: function( owner, key ) {
		var i, name,
			unlock = this.key( owner ),
			cache = this.cache[ unlock ];

		if ( key === undefined ) {
			this.cache[ unlock ] = {};

		} else {
			// Support array or space separated string of keys
			if ( jQuery.isArray( key ) ) {
				// If "name" is an array of keys...
				// When data is initially created, via ("key", "val") signature,
				// keys will be converted to camelCase.
				// Since there is no way to tell _how_ a key was added, remove
				// both plain key and camelCase key. #12786
				// This will only penalize the array argument path.
				name = key.concat( key.map( jQuery.camelCase ) );
			} else {
				// Try the string as a key before any manipulation
				if ( key in cache ) {
					name = [ key ];
				} else {
					// If a key with the spaces exists, use it.
					// Otherwise, create an array by matching non-whitespace
					name = jQuery.camelCase( key );
					name = name in cache ?
						[ name ] : ( name.match( core_rnotwhite ) || [] );
				}
			}

			i = name.length;
			while ( i-- ) {
				delete cache[ name[ i ] ];
			}
		}
	},

	/**
	 * True when `owner` already has a non-empty cache entry.
	 * Never creates a key for the owner.
	 */
	hasData: function( owner ) {
		return !jQuery.isEmptyObject(
			this.cache[ owner[ this.expando ] ] || {}
		);
	},

	/**
	 * Drop the cache entry of `owner`, if it has one.
	 * The expando is read directly instead of going through key(): key()
	 * would allocate a uid and stamp an expando on an owner that never had
	 * data, and for rejected owners would target the non-configurable
	 * slot 0, which cannot be deleted.
	 */
	discard: function( owner ) {
		var unlock = owner[ this.expando ];

		if ( unlock ) {
			delete this.cache[ unlock ];
		}
	}
};
jQuery.fn.extend({
	/**
	 * User-data accessor for the matched set.
	 *
	 *   .data()             -> data object of the first element (null for an
	 *                          empty set); data-* attributes of an element are
	 *                          folded in the first time this is called on it
	 *   .data( object )     -> merge the object into every element's data
	 *   .data( key )        -> value for the first element, looked up as-is,
	 *                          then camelCased, then from the data-* attribute
	 *   .data( key, value ) -> store the value on every element
	 */
	data: function( key, value ) {
		var attributes, attrName, idx,
			first = this[ 0 ],
			all = null;

		// No key: hand back everything stored on the first element
		if ( key === undefined ) {
			if ( !this.length ) {
				return all;
			}

			all = data_user.get( first );

			// Import the data-* attributes once per element
			if ( first.nodeType === 1 && !data_priv.get( first, "hasDataAttrs" ) ) {
				attributes = first.attributes;
				for ( idx = 0; idx < attributes.length; idx++ ) {
					attrName = attributes[ idx ].name;

					if ( attrName.indexOf( "data-" ) === 0 ) {
						attrName = jQuery.camelCase( attrName.substring(5) );
						dataAttr( first, attrName, all[ attrName ] );
					}
				}
				data_priv.set( first, "hasDataAttrs", true );
			}

			return all;
		}

		// Object key: bulk assignment on every element
		if ( typeof key === "object" ) {
			return this.each(function() {
				data_user.set( this, key );
			});
		}

		return jQuery.access( this, function( value ) {
			var found,
				camelKey = jQuery.camelCase( key );

			// Read path. `first` is undefined for an empty set, and touching
			// the cache with it would throw, hence the guard.
			if ( first && value === undefined ) {

				// 1. the key exactly as given
				found = data_user.get( first, key );

				// 2. the camelCased key
				if ( found === undefined ) {
					found = data_user.get( first, camelKey );
				}

				// 3. the HTML5 data-* attribute
				if ( found === undefined ) {
					found = dataAttr( first, camelKey, undefined );
				}

				// Still undefined here means the data does not exist
				return found;
			}

			// Write path
			this.each(function() {
				// Remember whether something was already stored under the
				// camelCased key before it gets overwritten
				var previous = data_user.get( this, camelKey );

				// data-* interop: names with dashes are stored camelCased
				data_user.set( this, camelKey, value );

				// A key that really contains dashes also keeps an
				// unchanged copy
				if ( key.indexOf("-") !== -1 && previous !== undefined ) {
					data_user.set( this, key, value );
				}
			});
		}, null, value, arguments.length > 1, null, true );
	},

	/**
	 * Remove user data (one key, a list of keys, or everything) from every
	 * matched element.
	 */
	removeData: function( key ) {
		return this.each(function() {
			data_user.remove( this, key );
		});
	}
});
jQuery.extend({
	/**
	 * Get the queue named `type` (default "fx") for `elem`, optionally
	 * appending `data` to it or, when `data` is an array or no queue exists
	 * yet, replacing it. Returns undefined when `elem` is falsy.
	 */
	queue: function( elem, type, data ) {
		var current;

		if ( !elem ) {
			return;
		}

		type = ( type || "fx" ) + "queue";
		current = data_priv.get( elem, type );

		// A plain lookup skips all of this
		if ( data ) {
			if ( current && !jQuery.isArray( data ) ) {
				current.push( data );
			} else {
				current = data_priv.access( elem, type, jQuery.makeArray(data) );
			}
		}

		return current || [];
	},

	/**
	 * Run the next function of the queue named `type` (default "fx") on
	 * `elem`, passing it a `next` callback and the queue's hooks object.
	 * Fires the hooks' `empty` list when the queue was already empty.
	 */
	dequeue: function( elem, type ) {
		type = type || "fx";

		var pending = jQuery.queue( elem, type ),
			lengthAtStart = pending.length,
			step = pending.shift(),
			hooks = jQuery._queueHooks( elem, type ),
			advance = function() {
				jQuery.dequeue( elem, type );
			};

		// The progress sentinel is never a real entry: skip over it
		if ( step === "inprogress" ) {
			step = pending.shift();
			lengthAtStart--;
		}

		hooks.cur = step;

		if ( step ) {

			// Mark the fx queue as running so it is not dequeued automatically
			if ( type === "fx" ) {
				pending.unshift( "inprogress" );
			}

			// Forget the stop function of the previous step
			delete hooks.stop;
			step.call( elem, advance, hooks );
		}

		if ( !lengthAtStart && hooks ) {
			hooks.empty.fire();
		}
	},

	// not intended for public consumption - returns the queueHooks object
	// for this element/type, creating it on first use. The `empty` list of a
	// new hooks object removes both the queue and the hooks when fired.
	_queueHooks: function( elem, type ) {
		var hooksKey = type + "queueHooks",
			existing = data_priv.get( elem, hooksKey );

		if ( existing ) {
			return existing;
		}

		return data_priv.access( elem, hooksKey, {
			empty: jQuery.Callbacks("once memory").add(function() {
				data_priv.remove( elem, [ type + "queue", hooksKey ] );
			})
		});
	}
});
jQuery.fn.extend({
	// Get or set an attribute; the work is delegated to jQuery.attr
	attr: function( name, value ) {
		var isSetter = arguments.length > 1;

		return jQuery.access( this, jQuery.attr, name, value, isSetter );
	},

	// Remove an attribute from every matched element
	removeAttr: function( name ) {
		return this.each(function() {
			jQuery.removeAttr( this, name );
		});
	},

	// Get or set a property; the work is delegated to jQuery.prop
	prop: function( name, value ) {
		var isSetter = arguments.length > 1;

		return jQuery.access( this, jQuery.prop, name, value, isSetter );
	},

	// Delete a property (after propFix name mapping) from every matched element
	removeProp: function( name ) {
		return this.each(function() {
			var realName = jQuery.propFix[ name ] || name;

			delete this[ realName ];
		});
	},

	/**
	 * Add one or more space-separated class names to every matched element.
	 * A function argument is called per element with ( index, className )
	 * and its result is added instead.
	 */
	addClass: function( value ) {
		var names, node, padded, name, n,
			idx = 0,
			count = this.length,
			proceed = typeof value === "string" && value;

		if ( jQuery.isFunction( value ) ) {
			return this.each(function( j ) {
				jQuery( this ).addClass( value.call( this, j, this.className ) );
			});
		}

		if ( !proceed ) {
			return this;
		}

		// The disjunction here is for better compressibility (see removeClass)
		names = ( value || "" ).match( core_rnotwhite ) || [];

		for ( ; idx < count; idx++ ) {
			node = this[ idx ];

			// Space-padded class string; false for anything but elements
			padded = node.nodeType === 1 && ( node.className ?
				( " " + node.className + " " ).replace( rclass, " " ) :
				" "
			);

			if ( padded ) {
				for ( n = 0; ( name = names[ n ] ); n++ ) {
					if ( padded.indexOf( " " + name + " " ) < 0 ) {
						padded += name + " ";
					}
				}
				node.className = jQuery.trim( padded );
			}
		}

		return this;
	},

	/**
	 * Remove one or more space-separated class names from every matched
	 * element; with no argument at all, clear the class attribute value.
	 * A function argument is called per element with ( index, className )
	 * and its result is removed instead.
	 */
	removeClass: function( value ) {
		var names, node, padded, name, n,
			idx = 0,
			count = this.length,
			proceed = arguments.length === 0 || typeof value === "string" && value;

		if ( jQuery.isFunction( value ) ) {
			return this.each(function( j ) {
				jQuery( this ).removeClass( value.call( this, j, this.className ) );
			});
		}

		if ( !proceed ) {
			return this;
		}

		names = ( value || "" ).match( core_rnotwhite ) || [];

		for ( ; idx < count; idx++ ) {
			node = this[ idx ];

			// This expression is here for better compressibility (see addClass)
			padded = node.nodeType === 1 && ( node.className ?
				( " " + node.className + " " ).replace( rclass, " " ) :
				""
			);

			if ( padded ) {
				for ( n = 0; ( name = names[ n ] ); n++ ) {
					// Remove *all* instances
					while ( padded.indexOf( " " + name + " " ) >= 0 ) {
						padded = padded.replace( " " + name + " ", " " );
					}
				}

				if ( value ) {
					node.className = jQuery.trim( padded );
				} else {
					node.className = "";
				}
			}
		}

		return this;
	},

	/**
	 * Toggle class names on every matched element.
	 *   ( string )           flip each listed class
	 *   ( string, boolean )  add (true) or remove (false) each listed class
	 *   ( ) / ( boolean )    toggle the whole className, stashing the old
	 *                        value so it can be restored later
	 *   ( function, state )  call the function per element for the value
	 */
	toggleClass: function( value, stateVal ) {
		var type = typeof value,
			isBool = typeof stateVal === "boolean";

		if ( jQuery.isFunction( value ) ) {
			return this.each(function( i ) {
				jQuery( this ).toggleClass( value.call(this, i, this.className, stateVal), stateVal );
			});
		}

		return this.each(function() {
			var name, n, wrapped, list, shouldAdd, wipe;

			if ( type === "string" ) {
				// toggle individual class names
				wrapped = jQuery( this );
				list = value.match( core_rnotwhite ) || [];

				for ( n = 0; ( name = list[ n ] ); n++ ) {
					// An explicit boolean wins; otherwise flip the current state
					shouldAdd = isBool ? stateVal : !wrapped.hasClass( name );
					if ( shouldAdd ) {
						wrapped.addClass( name );
					} else {
						wrapped.removeClass( name );
					}
				}

			// Toggle whole class name
			} else if ( type === core_strundefined || type === "boolean" ) {
				if ( this.className ) {
					// store className if set
					data_priv.set( this, "__className__", this.className );
				}

				// If the element has a class name or if we're passed "false",
				// then remove the whole classname (if there was one, the above saved it).
				// Otherwise bring back whatever was previously saved (if anything),
				// falling back to the empty string if nothing was stored.
				wipe = this.className || value === false;
				if ( wipe ) {
					this.className = "";
				} else {
					this.className = data_priv.get( this, "__className__" ) || "";
				}
			}
		});
	},

	// True when at least one matched element carries the given class
	hasClass: function( selector ) {
		var node,
			needle = " " + selector + " ",
			idx = 0,
			count = this.length;

		for ( ; idx < count; idx++ ) {
			node = this[ idx ];
			if ( node.nodeType === 1 &&
				(" " + node.className + " ").replace(rclass, " ").indexOf( needle ) >= 0 ) {

				return true;
			}
		}

		return false;
	},

	/**
	 * Without arguments: value of the first matched element (undefined for
	 * an empty set), going through valHooks first and stripping carriage
	 * returns from strings.
	 * With an argument: set the value of every matched element node; a
	 * function argument is called with ( index, currentValue ).
	 */
	val: function( value ) {
		var getHooks, result, valueIsFunction,
			first = this[0];

		if ( !arguments.length ) {
			if ( !first ) {
				return;
			}

			getHooks = jQuery.valHooks[ first.type ] || jQuery.valHooks[ first.nodeName.toLowerCase() ];

			if ( getHooks && "get" in getHooks && (result = getHooks.get( first, "value" )) !== undefined ) {
				return result;
			}

			result = first.value;

			// handle most common string cases
			if ( typeof result === "string" ) {
				return result.replace(rreturn, "");
			}

			// handle cases where value is null/undef or number
			return result == null ? "" : result;
		}

		valueIsFunction = jQuery.isFunction( value );

		return this.each(function( i ) {
			var next, setHooks,
				wrapped = jQuery(this);

			if ( this.nodeType !== 1 ) {
				return;
			}

			next = valueIsFunction ? value.call( this, i, wrapped.val() ) : value;

			// Treat null/undefined as ""; convert numbers to string
			if ( next == null ) {
				next = "";
			} else if ( typeof next === "number" ) {
				next += "";
			} else if ( jQuery.isArray( next ) ) {
				next = jQuery.map(next, function ( value ) {
					return value == null ? "" : value + "";
				});
			}

			setHooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ];

			// If set returns undefined, fall back to normal setting
			if ( !setHooks || !("set" in setHooks) || setHooks.set( this, next, "value" ) === undefined ) {
				this.value = next;
			}
		});
	}
});
!option.disabled : option.getAttribute("disabled") === null ) && ( !option.parentNode.disabled || !jQuery.nodeName( option.parentNode, "optgroup" ) ) ) { // Get the specific value for the option value = jQuery( option ).val(); // We don't need an array for one selects if ( one ) { return value; } // Multi-Selects return an array values.push( value ); } } return values; }, set: function( elem, value ) { var optionSet, option, options = elem.options, values = jQuery.makeArray( value ), i = options.length; while ( i-- ) { option = options[ i ]; if ( (option.selected = jQuery.inArray( jQuery(option).val(), values ) >= 0) ) { optionSet = true; } } // force browsers to behave consistently when non-matching value is set if ( !optionSet ) { elem.selectedIndex = -1; } return values; } } }, attr: function( elem, name, value ) { var hooks, ret, nType = elem.nodeType; // don't get/set attributes on text, comment and attribute nodes if ( !elem || nType === 3 || nType === 8 || nType === 2 ) { return; } // Fallback to prop when attributes are not supported if ( typeof elem.getAttribute === core_strundefined ) { return jQuery.prop( elem, name, value ); } // All attributes are lowercase // Grab necessary hook if one is defined if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { name = name.toLowerCase(); hooks = jQuery.attrHooks[ name ] || ( jQuery.expr.match.boolean.test( name ) ? boolHook : nodeHook ); } if ( value !== undefined ) { if ( value === null ) { jQuery.removeAttr( elem, name ); } else if ( hooks && "set" in hooks && (ret = hooks.set( elem, value, name )) !== undefined ) { return ret; } else { elem.setAttribute( name, value + "" ); return value; } } else if ( hooks && "get" in hooks && (ret = hooks.get( elem, name )) !== null ) { return ret; } else { ret = jQuery.find.attr( elem, name ); // Non-existent attributes return null, we normalize to undefined return ret == null ? 
undefined : ret; } }, removeAttr: function( elem, value ) { var name, propName, i = 0, attrNames = value && value.match( core_rnotwhite ); if ( attrNames && elem.nodeType === 1 ) { while ( (name = attrNames[i++]) ) { propName = jQuery.propFix[ name ] || name; // Boolean attributes get special treatment (#10870) if ( jQuery.expr.match.boolean.test( name ) ) { // Set corresponding property to false elem[ propName ] = false; } elem.removeAttribute( name ); } } }, attrHooks: { type: { set: function( elem, value ) { if ( !jQuery.support.radioValue && value === "radio" && jQuery.nodeName(elem, "input") ) { // Setting the type on a radio button after the value resets the value in IE6-9 // Reset value to default in case type is set after value during creation var val = elem.value; elem.setAttribute( "type", value ); if ( val ) { elem.value = val; } return value; } } } }, propFix: { "for": "htmlFor", "class": "className" }, prop: function( elem, name, value ) { var ret, hooks, notxml, nType = elem.nodeType; // don't get/set properties on text, comment and attribute nodes if ( !elem || nType === 3 || nType === 8 || nType === 2 ) { return; } notxml = nType !== 1 || !jQuery.isXMLDoc( elem ); if ( notxml ) { // Fix name and attach hooks name = jQuery.propFix[ name ] || name; hooks = jQuery.propHooks[ name ]; } if ( value !== undefined ) { return hooks && "set" in hooks && (ret = hooks.set( elem, value, name )) !== undefined ? ret : ( elem[ name ] = value ); } else { return hooks && "get" in hooks && (ret = hooks.get( elem, name )) !== null ? ret : elem[ name ]; } }, propHooks: { tabIndex: { get: function( elem ) { return elem.hasAttribute( "tabindex" ) || rfocusable.test( elem.nodeName ) || elem.href ? 
elem.tabIndex :
                    -1;
            }
        }
    }
});

// Hooks for boolean attributes
boolHook = {
    // Writes name="name" for any value other than false; false removes the
    // attribute. Always returns the attribute name.
    set: function( elem, value, name ) {
        if ( value === false ) {
            // Remove boolean attributes when set to false
            jQuery.removeAttr( elem, name );
        } else {
            elem.setAttribute( name, name );
        }
        return name;
    }
};

// Wrap the attribute handler of every word found in the source of the
// jQuery.expr.match.boolean pattern
jQuery.each( jQuery.expr.match.boolean.source.match( /\w+/g ), function( i, name ) {
    // Previous handler for this name, or the generic attribute lookup
    var getter = jQuery.expr.attrHandle[ name ] || jQuery.find.attr;

    jQuery.expr.attrHandle[ name ] = function( elem, name, isXML ) {
        var fn = jQuery.expr.attrHandle[ name ],
            // XML documents: undefined. Otherwise the lowercased name when
            // the getter returns something other than null/undefined, else null.
            ret = isXML ?
                undefined :
                /* jshint eqeqeq: false */
                // Temporarily disable this handler to check existence
                (jQuery.expr.attrHandle[ name ] = undefined) !=
                    getter( elem, name, isXML ) ?

                    name.toLowerCase() :
                    null;

        // Restore handler
        jQuery.expr.attrHandle[ name ] = fn;

        return ret;
    };
});

// Support: IE9+
// Selectedness for an option in an optgroup can be inaccurate
if ( !jQuery.support.optSelected ) {
    jQuery.propHooks.selected = {
        get: function( elem ) {
            var parent = elem.parentNode;
            if ( parent && parent.parentNode ) {
                // The property read itself is the point; its value is discarded
                parent.parentNode.selectedIndex;
            }
            // null makes jQuery.prop fall back to reading elem[ name ]
            return null;
        }
    };
}

// Register lowercase aliases for the camelCased property names
jQuery.each([
    "tabIndex",
    "readOnly",
    "maxLength",
    "cellSpacing",
    "cellPadding",
    "rowSpan",
    "colSpan",
    "useMap",
    "frameBorder",
    "contentEditable"
], function() {
    jQuery.propFix[ this.toLowerCase() ] = this;
});

// Radios and checkboxes getter/setter
jQuery.each([ "radio", "checkbox" ], function() {
    jQuery.valHooks[ this ] = {
        // An array value checks the element when its own value is in the
        // array; any other value returns undefined so .val() assigns normally
        set: function( elem, value ) {
            if ( jQuery.isArray( value ) ) {
                return ( elem.checked = jQuery.inArray( jQuery(elem).val(), value ) >= 0 );
            }
        }
    };
    if ( !jQuery.support.checkOn ) {
        jQuery.valHooks[ this ].get = function( elem ) {
            // Support: Webkit
            // "" is returned instead of "on" if a value isn't specified
            return elem.getAttribute("value") === null ? "on" : elem.value;
        };
    }
});
var rkeyEvent = /^key/,
    rmouseEvent = /^(?:mouse|contextmenu)|click/,
    rfocusMorph = /^(?:focusinfocus|focusoutblur)$/,
    // Captures the event type (group 1) and the dotted namespace list (group 2)
    rtypenamespace = /^([^.]*)(?:\.(.+)|)$/;

// Constant-returning helpers used as the is*() state functions on jQuery.Event
function returnTrue() {
    return true;
}

function returnFalse() {
    return false;
}

// Reads document.activeElement; if the read throws, the error is swallowed
// and undefined is returned
function safeActiveElement() {
    try {
        return document.activeElement;
    } catch ( err ) { }
}

/*
 * Helper functions for managing events -- not part of the public interface.
 * Props to Dean Edwards' addEvent library for many of the ideas.
 */
jQuery.event = {

    // Event types that have been bound at least once (set in add)
    global: {},

    /**
     * Attach a handler for one or more event types to an element.
     *
     * @param {Object} elem Target; ignored when data_priv.get returns nothing for it.
     * @param {String} types Whitespace-separated "type.namespace" entries.
     * @param {Function|Object} handler Handler function, or an object
     *   carrying `handler` and `selector` properties.
     * @param {*} data Stored on the handleObj as `data`.
     * @param {String} [selector] Delegation selector.
     */
    add: function( elem, types, handler, data, selector ) {

        var handleObjIn, eventHandle, tmp,
            events, t, handleObj,
            special, handlers, type, namespaces, origType,
            elemData = data_priv.get( elem );

        // Don't attach events to noData or text/comment nodes (but allow plain objects)
        if ( !elemData ) {
            return;
        }

        // Caller can pass in an object of custom data in lieu of the handler
        if ( handler.handler ) {
            handleObjIn = handler;
            handler = handleObjIn.handler;
            selector = handleObjIn.selector;
        }

        // Make sure that the handler has a unique ID, used to find/remove it later
        if ( !handler.guid ) {
            handler.guid = jQuery.guid++;
        }

        // Init the element's event structure and main handler, if this is the first
        if ( !(events = elemData.events) ) {
            events = elemData.events = {};
        }
        if ( !(eventHandle = elemData.handle) ) {
            eventHandle = elemData.handle = function( e ) {
                // Discard the second event of a jQuery.event.trigger() and
                // when an event is called after a page has unloaded
                return typeof jQuery !== core_strundefined && (!e || jQuery.event.triggered !== e.type) ?
jQuery.event.dispatch.apply( eventHandle.elem, arguments ) :
                    undefined;
            };
            // Add elem as a property of the handle fn to prevent a memory leak with IE non-native events
            eventHandle.elem = elem;
        }

        // Handle multiple events separated by a space
        types = ( types || "" ).match( core_rnotwhite ) || [""];
        t = types.length;
        while ( t-- ) {
            // Split "type.ns1.ns2" into the type and a sorted namespace list
            tmp = rtypenamespace.exec( types[t] ) || [];
            type = origType = tmp[1];
            namespaces = ( tmp[2] || "" ).split( "." ).sort();

            // There *must* be a type, no attaching namespace-only handlers
            if ( !type ) {
                continue;
            }

            // If event changes its type, use the special event handlers for the changed type
            special = jQuery.event.special[ type ] || {};

            // If selector defined, determine special event api type, otherwise given type
            type = ( selector ? special.delegateType : special.bindType ) || type;

            // Update special based on newly reset type
            special = jQuery.event.special[ type ] || {};

            // handleObj is passed to all event handlers
            // (properties of handleObjIn, when given, override the defaults)
            handleObj = jQuery.extend({
                type: type,
                origType: origType,
                data: data,
                handler: handler,
                guid: handler.guid,
                selector: selector,
                needsContext: selector && jQuery.expr.match.needsContext.test( selector ),
                namespace: namespaces.join(".")
            }, handleObjIn );

            // Init the event handler queue if we're the first
            if ( !(handlers = events[ type ]) ) {
                handlers = events[ type ] = [];
                // Number of delegated handlers kept at the front of the list
                handlers.delegateCount = 0;

                // Only use addEventListener if the special events handler returns false
                if ( !special.setup || special.setup.call( elem, data, namespaces, eventHandle ) === false ) {
                    if ( elem.addEventListener ) {
                        elem.addEventListener( type, eventHandle, false );
                    }
                }
            }

            if ( special.add ) {
                special.add.call( elem, handleObj );

                // special.add may have replaced the handler; keep the guid on it
                if ( !handleObj.handler.guid ) {
                    handleObj.handler.guid = handler.guid;
                }
            }

            // Add to the element's handler list, delegates in front
            if ( selector ) {
                handlers.splice( handlers.delegateCount++, 0, handleObj );
            } else {
                handlers.push( handleObj );
            }

            // Keep track of which events have ever been used, for event optimization
            jQuery.event.global[ type ] = true;
        }

        // Nullify elem to prevent memory leaks in IE
        elem = null;
    },

    // Detach an event or set of events from an element
    /**
     * @param {Object} elem Target; nothing happens without stored events.
     * @param {String} [types] Whitespace-separated "type.namespace" entries;
     *   the type part may be omitted to match every bound type.
     * @param {Function} [handler] Only remove handlers sharing this guid.
     * @param {String} [selector] Only remove handlers with this selector;
     *   "**" matches any handler that has a selector.
     * @param {Boolean} [mappedTypes] When truthy, skip the origType comparison.
     */
    remove: function( elem, types, handler, selector, mappedTypes ) {

        var j, origCount, tmp,
            events, t, handleObj,
            special, handlers, type, namespaces, origType,
            elemData = data_priv.hasData( elem ) && data_priv.get( elem );

        if ( !elemData || !(events = elemData.events) ) {
            return;
        }

        // Once for each type.namespace in types; type may be omitted
        types = ( types || "" ).match( core_rnotwhite ) || [""];
        t = types.length;
        while ( t-- ) {
            tmp = rtypenamespace.exec( types[t] ) || [];
            type = origType = tmp[1];
            namespaces = ( tmp[2] || "" ).split( "." ).sort();

            // Unbind all events (on this namespace, if provided) for the element
            if ( !type ) {
                for ( type in events ) {
                    jQuery.event.remove( elem, type + types[ t ], handler, selector, true );
                }
                continue;
            }

            special = jQuery.event.special[ type ] || {};
            type = ( selector ? special.delegateType : special.bindType ) || type;
            handlers = events[ type ] || [];
            // tmp becomes a namespace-matching RegExp, or stays falsy without namespaces
            tmp = tmp[2] && new RegExp( "(^|\\.)" + namespaces.join("\\.(?:.*\\.|)") + "(\\.|$)" );

            // Remove matching events
            // (iterating backwards so splice does not disturb unvisited entries)
            origCount = j = handlers.length;
            while ( j-- ) {
                handleObj = handlers[ j ];

                if ( ( mappedTypes || origType === handleObj.origType ) &&
                    ( !handler || handler.guid === handleObj.guid ) &&
                    ( !tmp || tmp.test( handleObj.namespace ) ) &&
                    ( !selector || selector === handleObj.selector || selector === "**" && handleObj.selector ) ) {
                    handlers.splice( j, 1 );

                    if ( handleObj.selector ) {
                        handlers.delegateCount--;
                    }
                    if ( special.remove ) {
                        special.remove.call( elem, handleObj );
                    }
                }
            }

            // Remove generic event handler if we removed something and no more handlers exist
            // (avoids potential for endless recursion during removal of special event handlers)
            if ( origCount && !handlers.length ) {
                if ( !special.teardown || special.teardown.call( elem, namespaces, elemData.handle ) === false ) {
                    jQuery.removeEvent( elem, type,
elemData.handle );
                }

                delete events[ type ];
            }
        }

        // Remove the expando if it's no longer used
        if ( jQuery.isEmptyObject( events ) ) {
            delete elemData.handle;
            data_priv.remove( elem, "events" );
        }
    },

    /**
     * Fire an event along its propagation path and optionally run the
     * matching native method as the default action.
     *
     * @param {jQuery.Event|Object|String} event A jQuery.Event, an object
     *   with a `type` property, or an event type string (may carry
     *   ".namespace" suffixes).
     * @param {*} [data] Extra handler arguments, placed after the event.
     * @param {Object} [elem] Target; defaults to `document`.
     * @param {Boolean} [onlyHandlers] When truthy: no special.trigger hook,
     *   no bubbling path and no default action.
     * @return {*} event.result, or undefined on the early exits.
     */
    trigger: function( event, data, elem, onlyHandlers ) {

        var i, cur, tmp, bubbleType, ontype, handle, special,
            eventPath = [ elem || document ],
            type = core_hasOwn.call( event, "type" ) ? event.type : event,
            namespaces = core_hasOwn.call( event, "namespace" ) ? event.namespace.split(".") : [];

        cur = tmp = elem = elem || document;

        // Don't do events on text and comment nodes
        if ( elem.nodeType === 3 || elem.nodeType === 8 ) {
            return;
        }

        // focus/blur morphs to focusin/out; ensure we're not firing them right now
        if ( rfocusMorph.test( type + jQuery.event.triggered ) ) {
            return;
        }

        if ( type.indexOf(".") >= 0 ) {
            // Namespaced trigger; create a regexp to match event type in handle()
            namespaces = type.split(".");
            type = namespaces.shift();
            namespaces.sort();
        }
        // Name of the inline handler property ("onclick", ...); false when
        // the type contains a ":"
        ontype = type.indexOf(":") < 0 && "on" + type;

        // Caller can pass in a jQuery.Event object, Object, or just an event type string
        event = event[ jQuery.expando ] ?
            event :
            new jQuery.Event( type, typeof event === "object" && event );

        // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true)
        event.isTrigger = onlyHandlers ? 2 : 3;
        event.namespace = namespaces.join(".");
        event.namespace_re = event.namespace ?
            new RegExp( "(^|\\.)" + namespaces.join("\\.(?:.*\\.|)") + "(\\.|$)" ) :
            null;

        // Clean up the event in case it is being reused
        event.result = undefined;
        if ( !event.target ) {
            event.target = elem;
        }

        // Clone any incoming data and prepend the event, creating the handler arg list
        data = data == null ?
            [ event ] :
            jQuery.makeArray( data, [ event ] );

        // Allow special events to draw outside the lines
        special = jQuery.event.special[ type ] || {};
        if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) {
            return;
        }

        // Determine event propagation path in advance, per W3C events spec (#9951)
        // Bubble up to document, then to window; watch for a global ownerDocument var (#9724)
        if ( !onlyHandlers && !special.noBubble && !jQuery.isWindow( elem ) ) {

            bubbleType = special.delegateType || type;
            if ( !rfocusMorph.test( bubbleType + type ) ) {
                cur = cur.parentNode;
            }
            for ( ; cur; cur = cur.parentNode ) {
                eventPath.push( cur );
                tmp = cur;
            }

            // Only add window if we got to document (e.g., not plain obj or detached DOM)
            if ( tmp === (elem.ownerDocument || document) ) {
                eventPath.push( tmp.defaultView || tmp.parentWindow || window );
            }
        }

        // Fire handlers on the event path
        i = 0;
        while ( (cur = eventPath[i++]) && !event.isPropagationStopped() ) {

            // The first path entry uses the bind type; later entries use bubbleType
            event.type = i > 1 ?
                bubbleType :
                special.bindType || type;

            // jQuery handler
            handle = ( data_priv.get( cur, "events" ) || {} )[ event.type ] && data_priv.get( cur, "handle" );
            if ( handle ) {
                handle.apply( cur, data );
            }

            // Native handler
            handle = ontype && cur[ ontype ];
            if ( handle && jQuery.acceptData( cur ) && handle.apply && handle.apply( cur, data ) === false ) {
                event.preventDefault();
            }
        }
        event.type = type;

        // If nobody prevented the default action, do it now
        if ( !onlyHandlers && !event.isDefaultPrevented() ) {

            if ( (!special._default || special._default.apply( eventPath.pop(), data ) === false) &&
                jQuery.acceptData( elem ) ) {

                // Call a native DOM method on the target with the same name as the event.
                // Don't do default actions on window, that's where global variables be (#6170)
                if ( ontype && jQuery.isFunction( elem[ type ] ) && !jQuery.isWindow( elem ) ) {

                    // Don't re-trigger an onFOO event when we call its FOO() method
                    tmp = elem[ ontype ];

                    if ( tmp ) {
                        elem[ ontype ] = null;
                    }

                    // Prevent re-triggering of the same event, since we already bubbled it above
                    jQuery.event.triggered = type;
                    elem[ type ]();
                    jQuery.event.triggered = undefined;

                    // Put the inline handler back
                    if ( tmp ) {
                        elem[ ontype ] = tmp;
                    }
                }
            }
        }

        return event.result;
    },

    /**
     * Run the handlers stored for event.type on `this`.
     * Called with the element as `this`; every argument after the event is
     * forwarded to the handlers.
     *
     * @param {Event|jQuery.Event} event Native or already-fixed event.
     * @return {*} event.result, or undefined when preDispatch returns false.
     */
    dispatch: function( event ) {

        // Make a writable jQuery.Event from the native event object
        event = jQuery.event.fix( event );

        var i, j, ret, matched, handleObj,
            handlerQueue = [],
            args = core_slice.call( arguments ),
            handlers = ( data_priv.get( this, "events" ) || {} )[ event.type ] || [],
            special = jQuery.event.special[ event.type ] || {};

        // Use the fix-ed jQuery.Event rather than the (read-only) native event
        args[0] = event;
        event.delegateTarget = this;

        // Call the preDispatch hook for the mapped type, and let it bail if desired
        if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) {
            return;
        }

        // Determine handlers
        handlerQueue = jQuery.event.handlers.call( this, event, handlers );

        // Run delegates first; they may want to stop propagation beneath us
        i = 0;
        while ( (matched = handlerQueue[ i++ ]) && !event.isPropagationStopped() ) {
            event.currentTarget = matched.elem;

            j = 0;
            while ( (handleObj = matched.handlers[ j++ ]) && !event.isImmediatePropagationStopped() ) {

                // Triggered event must either 1) have no namespace, or
                // 2) have namespace(s) a subset or equal to those in the bound event (both can have no namespace).
if ( !event.namespace_re || event.namespace_re.test( handleObj.namespace ) ) {

                    event.handleObj = handleObj;
                    event.data = handleObj.data;

                    // A special `handle` registered for the original type
                    // takes precedence over the bound handler
                    ret = ( (jQuery.event.special[ handleObj.origType ] || {}).handle || handleObj.handler )
                            .apply( matched.elem, args );

                    if ( ret !== undefined ) {
                        // Returning false both prevents the default and stops propagation
                        if ( (event.result = ret) === false ) {
                            event.preventDefault();
                            event.stopPropagation();
                        }
                    }
                }
            }
        }

        // Call the postDispatch hook for the mapped type
        if ( special.postDispatch ) {
            special.postDispatch.call( this, event );
        }

        return event.result;
    },

    /**
     * Build the ordered queue of handler groups to run for an event.
     * Called with the element the handlers are bound to as `this`.
     *
     * @param {jQuery.Event} event The event being dispatched.
     * @param {Array} handlers Handler list for event.type; its first
     *   `delegateCount` entries are the delegated handlers.
     * @return {Array} Entries of the form { elem, handlers }: matching
     *   delegates from event.target upwards, then the directly-bound
     *   handlers for `this`.
     */
    handlers: function( event, handlers ) {
        var i, matches, sel, handleObj,
            handlerQueue = [],
            delegateCount = handlers.delegateCount,
            cur = event.target;

        // Find delegate handlers
        // Black-hole SVG <use> instance trees (#13180)
        // Avoid non-left-click bubbling in Firefox (#3861)
        if ( delegateCount && cur.nodeType && (!event.button || event.type !== "click") ) {

            // Walk from the target up to (but not including) `this`
            for ( ; cur !== this; cur = cur.parentNode || this ) {

                // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764)
                if ( cur.disabled !== true || event.type !== "click" ) {
                    // `matches` doubles as a per-selector result cache (string
                    // keys) and as the list of matched handleObjs (pushed entries)
                    matches = [];
                    for ( i = 0; i < delegateCount; i++ ) {
                        handleObj = handlers[ i ];

                        // Don't conflict with Object.prototype properties (#13203)
                        sel = handleObj.selector + " ";

                        if ( matches[ sel ] === undefined ) {
                            matches[ sel ] = handleObj.needsContext ?
                                jQuery( sel, this ).index( cur ) >= 0 :
                                jQuery.find( sel, this, null, [ cur ] ).length;
                        }
                        if ( matches[ sel ] ) {
                            matches.push( handleObj );
                        }
                    }
                    if ( matches.length ) {
                        handlerQueue.push({ elem: cur, handlers: matches });
                    }
                }
            }
        }

        // Add the remaining (directly-bound) handlers
        if ( delegateCount < handlers.length ) {
            handlerQueue.push({ elem: this, handlers: handlers.slice( delegateCount ) });
        }

        return handlerQueue;
    },

    // Includes some event props shared by KeyEvent and MouseEvent
    props: "altKey bubbles cancelable ctrlKey currentTarget eventPhase metaKey relatedTarget shiftKey target timeStamp view which".split(" "),

    // Per-event-type cache of the hook chosen by fix()
    fixHooks: {},

    // Extra props and normalization applied to event types matching rkeyEvent
    keyHooks: {
        props: "char charCode key keyCode".split(" "),
        filter: function( event, original ) {

            // Add which for key events
            if ( event.which == null ) {
                event.which = original.charCode != null ? original.charCode : original.keyCode;
            }

            return event;
        }
    },

    // Extra props and normalization applied to event types matching rmouseEvent
    mouseHooks: {
        props: "button buttons clientX clientY offsetX offsetY pageX pageY screenX screenY toElement".split(" "),
        filter: function( event, original ) {
            var eventDoc, doc, body,
                button = original.button;

            // Calculate pageX/Y if missing and clientX/Y available
            if ( event.pageX == null && original.clientX != null ) {
                eventDoc = event.target.ownerDocument || document;
                doc = eventDoc.documentElement;
                body = eventDoc.body;

                // page = client + scroll offset - client border offset
                event.pageX = original.clientX + ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) - ( doc && doc.clientLeft || body && body.clientLeft || 0 );
                event.pageY = original.clientY + ( doc && doc.scrollTop  || body && body.scrollTop  || 0 ) - ( doc && doc.clientTop  || body && body.clientTop  || 0 );
            }

            // Add which for click: 1 === left; 2 === middle; 3 === right
            // Note: button is not normalized, so don't use it
            if ( !event.which && button !== undefined ) {
                event.which = ( button & 1 ? 1 : ( button & 2 ? 3 : ( button & 4 ?
2 : 0 ) ) ); } return event; } }, fix: function( event ) { if ( event[ jQuery.expando ] ) { return event; } // Create a writable copy of the event object and normalize some properties var i, prop, copy, type = event.type, originalEvent = event, fixHook = this.fixHooks[ type ]; if ( !fixHook ) { this.fixHooks[ type ] = fixHook = rmouseEvent.test( type ) ? this.mouseHooks : rkeyEvent.test( type ) ? this.keyHooks : {}; } copy = fixHook.props ? this.props.concat( fixHook.props ) : this.props; event = new jQuery.Event( originalEvent ); i = copy.length; while ( i-- ) { prop = copy[ i ]; event[ prop ] = originalEvent[ prop ]; } // Support: Safari 6.0+, Chrome < 28 // Target should not be a text node (#504, #13143) if ( event.target.nodeType === 3 ) { event.target = event.target.parentNode; } return fixHook.filter? fixHook.filter( event, originalEvent ) : event; }, special: { load: { // Prevent triggered image.load events from bubbling to window.load noBubble: true }, focus: { // Fire native event if possible so blur/focus sequence is correct trigger: function() { if ( this !== safeActiveElement() && this.focus ) { this.focus(); return false; } }, delegateType: "focusin" }, blur: { trigger: function() { if ( this === safeActiveElement() && this.blur ) { this.blur(); return false; } }, delegateType: "focusout" }, click: { // For checkbox, fire native event so checked state will be right trigger: function() { if ( this.type === "checkbox" && this.click && jQuery.nodeName( this, "input" ) ) { this.click(); return false; } }, // For cross-browser consistency, don't fire native .click() on links _default: function( event ) { return jQuery.nodeName( event.target, "a" ); } }, beforeunload: { postDispatch: function( event ) { // Support: Firefox 20+ // Firefox doesn't alert if the returnValue field is not set. 
if ( event.result !== undefined ) { event.originalEvent.returnValue = event.result; } } } }, simulate: function( type, elem, event, bubble ) { // Piggyback on a donor event to simulate a different one. // Fake originalEvent to avoid donor's stopPropagation, but if the // simulated event prevents default then we do the same on the donor. var e = jQuery.extend( new jQuery.Event(), event, { type: type, isSimulated: true, originalEvent: {} } ); if ( bubble ) { jQuery.event.trigger( e, null, elem ); } else { jQuery.event.dispatch.call( elem, e ); } if ( e.isDefaultPrevented() ) { event.preventDefault(); } } }; jQuery.removeEvent = function( elem, type, handle ) { if ( elem.removeEventListener ) { elem.removeEventListener( type, handle, false ); } }; jQuery.Event = function( src, props ) { // Allow instantiation without the 'new' keyword if ( !(this instanceof jQuery.Event) ) { return new jQuery.Event( src, props ); } // Event object if ( src && src.type ) { this.originalEvent = src; this.type = src.type; // Events bubbling up the document may have been marked as prevented // by a handler lower down the tree; reflect the correct value. this.isDefaultPrevented = ( src.defaultPrevented || src.getPreventDefault && src.getPreventDefault() ) ? 
returnTrue : returnFalse; // Event type } else { this.type = src; } // Put explicitly provided properties onto the event object if ( props ) { jQuery.extend( this, props ); } // Create a timestamp if incoming event doesn't have one this.timeStamp = src && src.timeStamp || jQuery.now(); // Mark it as fixed this[ jQuery.expando ] = true; }; // jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding // http://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html jQuery.Event.prototype = { isDefaultPrevented: returnFalse, isPropagationStopped: returnFalse, isImmediatePropagationStopped: returnFalse, preventDefault: function() { var e = this.originalEvent; this.isDefaultPrevented = returnTrue; if ( e && e.preventDefault ) { e.preventDefault(); } }, stopPropagation: function() { var e = this.originalEvent; this.isPropagationStopped = returnTrue; if ( e && e.stopPropagation ) { e.stopPropagation(); } }, stopImmediatePropagation: function() { this.isImmediatePropagationStopped = returnTrue; this.stopPropagation(); } }; // Create mouseenter/leave events using mouseover/out and event-time checks // Support: Chrome 15+ jQuery.each({ mouseenter: "mouseover", mouseleave: "mouseout" }, function( orig, fix ) { jQuery.event.special[ orig ] = { delegateType: fix, bindType: fix, handle: function( event ) { var ret, target = this, related = event.relatedTarget, handleObj = event.handleObj; // For mousenter/leave call the handler if related is outside the target. 
// NB: No relatedTarget if the mouse left/entered the browser window if ( !related || (related !== target && !jQuery.contains( target, related )) ) { event.type = handleObj.origType; ret = handleObj.handler.apply( this, arguments ); event.type = fix; } return ret; } }; }); // Create "bubbling" focus and blur events // Support: Firefox, Chrome, Safari if ( !jQuery.support.focusinBubbles ) { jQuery.each({ focus: "focusin", blur: "focusout" }, function( orig, fix ) { // Attach a single capturing handler while someone wants focusin/focusout var attaches = 0, handler = function( event ) { jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ), true ); }; jQuery.event.special[ fix ] = { setup: function() { if ( attaches++ === 0 ) { document.addEventListener( orig, handler, true ); } }, teardown: function() { if ( --attaches === 0 ) { document.removeEventListener( orig, handler, true ); } } }; }); } jQuery.fn.extend({ on: function( types, selector, data, fn, /*INTERNAL*/ one ) { var origFn, type; // Types can be a map of types/handlers if ( typeof types === "object" ) { // ( types-Object, selector, data ) if ( typeof selector !== "string" ) { // ( types-Object, data ) data = data || selector; selector = undefined; } for ( type in types ) { this.on( type, selector, data, types[ type ], one ); } return this; } if ( data == null && fn == null ) { // ( types, fn ) fn = selector; data = selector = undefined; } else if ( fn == null ) { if ( typeof selector === "string" ) { // ( types, selector, fn ) fn = data; data = undefined; } else { // ( types, data, fn ) fn = data; data = selector; selector = undefined; } } if ( fn === false ) { fn = returnFalse; } else if ( !fn ) { return this; } if ( one === 1 ) { origFn = fn; fn = function( event ) { // Can use an empty set, since event contains the info jQuery().off( event ); return origFn.apply( this, arguments ); }; // Use same guid so caller can remove using origFn fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ 
);
		}
		return this.each( function() {
			jQuery.event.add( this, types, fn, data, selector );
		});
	},
	// Same as .on(), passing 1 as the internal `one` argument.
	one: function( types, selector, data, fn ) {
		return this.on( types, selector, data, fn, 1 );
	},
	// Remove handlers. Accepted call shapes:
	//   ( event )                      - a dispatched jQuery.Event
	//   ( types-object [, selector] )
	//   ( types [, selector] [, fn] )  - fn === false stands for returnFalse
	off: function( types, selector, fn ) {
		var handleObj, type;
		if ( types && types.preventDefault && types.handleObj ) {
			// ( event )  dispatched jQuery.Event
			handleObj = types.handleObj;
			jQuery( types.delegateTarget ).off(
				handleObj.namespace ? handleObj.origType + "." + handleObj.namespace : handleObj.origType,
				handleObj.selector,
				handleObj.handler
			);
			return this;
		}
		if ( typeof types === "object" ) {
			// ( types-object [, selector] )
			for ( type in types ) {
				this.off( type, selector, types[ type ] );
			}
			return this;
		}
		if ( selector === false || typeof selector === "function" ) {
			// ( types [, fn] )
			fn = selector;
			selector = undefined;
		}
		if ( fn === false ) {
			fn = returnFalse;
		}
		return this.each(function() {
			jQuery.event.remove( this, types, fn, selector );
		});
	},

	// Call jQuery.event.trigger for every element in the set.
	trigger: function( type, data ) {
		return this.each(function() {
			jQuery.event.trigger( type, data, this );
		});
	},
	// Call jQuery.event.trigger for the first element only, passing `true` as
	// its fourth argument, and return its result (undefined for an empty set).
	triggerHandler: function( type, data ) {
		var elem = this[0];
		if ( elem ) {
			return jQuery.event.trigger( type, data, elem, true );
		}
	}
});
var isSimple = /^.[^:#\[\.,]*$/,
	rneedsContext = jQuery.expr.match.needsContext,
	// methods guaranteed to produce a unique set when starting from a unique set
	guaranteedUnique = {
		children: true,
		contents: true,
		next: true,
		prev: true
	};

jQuery.fn.extend({
	// Return the descendants matching `selector`.
	// A non-string selector is wrapped with jQuery() and filtered down to the
	// members contained in at least one element of this set.
	find: function( selector ) {
		var self, matched, i,
			l = this.length;

		if ( typeof selector !== "string" ) {
			self = this;
			return this.pushStack( jQuery( selector ).filter(function() {
				for ( i = 0; i < l; i++ ) {
					if ( jQuery.contains( self[ i ], this ) ) {
						return true;
					}
				}
			}) );
		}

		matched = [];
		for ( i = 0; i < l; i++ ) {
			jQuery.find( selector, this[ i ], matched );
		}

		// Needed because $( selector, context ) becomes $( context ).find( selector )
		matched = this.pushStack( l > 1 ?
jQuery.unique( matched ) : matched );
		matched.selector = ( this.selector ? this.selector + " " : "" ) + selector;
		return matched;
	},

	// Keep the elements of this set that contain at least one of the
	// elements matched by `target` (looked up with this set as context).
	has: function( target ) {
		var targets = jQuery( target, this ),
			l = targets.length;

		return this.filter(function() {
			var i = 0;
			for ( ; i < l; i++ ) {
				if ( jQuery.contains( this, targets[i] ) ) {
					return true;
				}
			}
		});
	},

	// Elements NOT matching the qualifier (winnow with not = true).
	not: function( selector ) {
		return this.pushStack( winnow(this, selector || [], true) );
	},

	// Elements matching the qualifier (winnow with not = false).
	filter: function( selector ) {
		return this.pushStack( winnow(this, selector || [], false) );
	},

	// Boolean: does the set match `selector`? A falsy selector yields false.
	is: function( selector ) {
		return !!selector && (
			typeof selector === "string" ?
				// If this is a positional/relative selector, check membership in the returned set
				// so $("p:first").is("p:last") won't return true for a doc with two "p".
				rneedsContext.test( selector ) ?
					jQuery( selector, this.context ).index( this[ 0 ] ) >= 0 :
					jQuery.filter( selector, this ).length > 0 :
				this.filter( selector ).length > 0 );
	},

	// For each element, walk up through parentNode (starting at the element
	// itself, stopping at `context`) and collect the first node that matches.
	closest: function( selectors, context ) {
		var cur,
			i = 0,
			l = this.length,
			matched = [],
			// Non-string or context-dependent selectors are resolved once up
			// front into a jQuery set; otherwise pos stays 0 (falsy).
			pos = ( rneedsContext.test( selectors ) || typeof selectors !== "string" ) ?
				jQuery( selectors, context || this.context ) :
				0;

		for ( ; i < l; i++ ) {
			for ( cur = this[i]; cur && cur !== context; cur = cur.parentNode ) {
				// Always skip document fragments
				if ( cur.nodeType < 11 && (pos ?
					pos.index(cur) > -1 :

					// Don't pass non-elements to Sizzle
					cur.nodeType === 1 &&
						jQuery.find.matchesSelector(cur, selectors)) ) {

					// The value assigned to cur is not used: the loop is
					// left by the break on the next line.
					cur = matched.push( cur );
					break;
				}
			}
		}

		return this.pushStack( matched.length > 1 ? jQuery.unique( matched ) : matched );
	},

	// Determine the position of an element within
	// the matched set of elements
	index: function( elem ) {

		// No argument, return index in parent
		if ( !elem ) {
			return ( this[ 0 ] && this[ 0 ].parentNode ) ?
this.first().prevAll().length : -1;
		}

		// index in selector
		if ( typeof elem === "string" ) {
			return core_indexOf.call( jQuery( elem ), this[ 0 ] );
		}

		// Locate the position of the desired element
		return core_indexOf.call( this,

			// If it receives a jQuery object, the first element is used
			elem.jquery ? elem[ 0 ] : elem
		);
	},

	// New set made of this set's elements plus those described by `selector`
	// (a selector string resolved against `context`, a single node, or an
	// array-like), passed through jQuery.unique.
	add: function( selector, context ) {
		var set = typeof selector === "string" ?
				jQuery( selector, context ) :
				jQuery.makeArray( selector && selector.nodeType ? [ selector ] : selector ),
			all = jQuery.merge( this.get(), set );

		return this.pushStack( jQuery.unique(all) );
	},

	// Add the previous set on the stack (prevObject), optionally filtered by
	// `selector`, to the current set.
	addBack: function( selector ) {
		return this.add( selector == null ?
			this.prevObject : this.prevObject.filter(selector)
		);
	}
});

// Follow cur[dir] until a node with nodeType 1 is found; returns that node,
// or the falsy value that ended the walk.
function sibling( cur, dir ) {
	while ( (cur = cur[dir]) && cur.nodeType !== 1 ) {}

	return cur;
}

// Per-element traversal functions; each one is turned into a jQuery.fn method
// of the same name by the callback passed as the second argument to jQuery.each.
jQuery.each({
	parent: function( elem ) {
		var parent = elem.parentNode;
		// A nodeType 11 (document fragment) parent is reported as null
		return parent && parent.nodeType !== 11 ? parent : null;
	},
	parents: function( elem ) {
		return jQuery.dir( elem, "parentNode" );
	},
	parentsUntil: function( elem, i, until ) {
		return jQuery.dir( elem, "parentNode", until );
	},
	next: function( elem ) {
		return sibling( elem, "nextSibling" );
	},
	prev: function( elem ) {
		return sibling( elem, "previousSibling" );
	},
	nextAll: function( elem ) {
		return jQuery.dir( elem, "nextSibling" );
	},
	prevAll: function( elem ) {
		return jQuery.dir( elem, "previousSibling" );
	},
	nextUntil: function( elem, i, until ) {
		return jQuery.dir( elem, "nextSibling", until );
	},
	prevUntil: function( elem, i, until ) {
		return jQuery.dir( elem, "previousSibling", until );
	},
	siblings: function( elem ) {
		// `|| {}` lets a parentless element produce an empty result
		return jQuery.sibling( ( elem.parentNode || {} ).firstChild, elem );
	},
	children: function( elem ) {
		return jQuery.sibling( elem.firstChild );
	},
	// For an iframe return its document, otherwise a copy of childNodes
	contents: function( elem ) {
		return jQuery.nodeName( elem, "iframe" ) ?
elem.contentDocument || elem.contentWindow.document :
			jQuery.merge( [], elem.childNodes );
	}
}, function( name, fn ) {
	// Build jQuery.fn[ name ] from the per-element function `fn`.
	// For "*Until" methods the signature is ( until, selector ); for all
	// others the first argument is the optional filtering selector.
	jQuery.fn[ name ] = function( until, selector ) {
		var matched = jQuery.map( this, fn, until );

		if ( name.slice( -5 ) !== "Until" ) {
			selector = until;
		}

		if ( selector && typeof selector === "string" ) {
			matched = jQuery.filter( selector, matched );
		}

		if ( this.length > 1 ) {
			// Remove duplicates
			if ( !guaranteedUnique[ name ] ) {
				jQuery.unique( matched );
			}

			// Reverse order for parents* and prev*
			// (the test is simply "name starts with p")
			if ( name[ 0 ] === "p" ) {
				matched.reverse();
			}
		}

		return this.pushStack( matched );
	};
});

jQuery.extend({
	// Return the members of `elems` that match `expr`; with `not` truthy the
	// expression is wrapped in ":not(...)" first. Only nodeType 1 members
	// are tested.
	filter: function( expr, elems, not ) {
		var elem = elems[ 0 ];

		if ( not ) {
			expr = ":not(" + expr + ")";
		}

		// Single-element sets are checked directly with matchesSelector
		return elems.length === 1 && elem.nodeType === 1 ?
			jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : [] :
			jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) {
				return elem.nodeType === 1;
			}));
	},

	// Collect the nodeType 1 nodes reached by repeatedly following
	// elem[ dir ]. The walk ends at a falsy link or a nodeType 9 node, and,
	// when `until` is given, just before the first element matching it.
	dir: function( elem, dir, until ) {
		var matched = [],
			truncate = until !== undefined;

		while ( (elem = elem[ dir ]) && elem.nodeType !== 9 ) {
			if ( elem.nodeType === 1 ) {
				if ( truncate && jQuery( elem ).is( until ) ) {
					break;
				}
				matched.push( elem );
			}
		}
		return matched;
	},

	// Collect `n` and its following siblings that are nodeType 1,
	// leaving out `elem`.
	sibling: function( n, elem ) {
		var matched = [];

		for ( ; n; n = n.nextSibling ) {
			if ( n.nodeType === 1 && n !== elem ) {
				matched.push( n );
			}
		}

		return matched;
	}
});

// Implement the identical functionality for filter and not
// `qualifier` may be a function, a DOM node, a selector string or an
// array-like of elements; `not` inverts the test.
function winnow( elements, qualifier, not ) {
	if ( jQuery.isFunction( qualifier ) ) {
		return jQuery.grep( elements, function( elem, i ) {
			/* jshint -W018 */
			return !!qualifier.call( elem, i, elem ) !== not;
		});

	}

	if ( qualifier.nodeType ) {
		return jQuery.grep( elements, function( elem ) {
			return ( elem === qualifier ) !== not;
		});

	}

	if ( typeof qualifier === "string" ) {
		// Simple selectors go straight to jQuery.filter; anything else is
		// first resolved to the matching elements and then handled by the
		// membership test below.
		if ( isSimple.test( qualifier ) ) {
			return jQuery.filter( qualifier, elements, not );
		}

		qualifier = jQuery.filter( qualifier, elements );
	}

	return jQuery.grep( elements, function(
elem ) {
		return ( core_indexOf.call( qualifier, elem ) >= 0 ) !== not;
	});
}
// Regular expressions and lookup tables shared by the manipulation methods.
var rxhtmlTag = /<(?!area|br|col|embed|hr|img|input|link|meta|param)(([\w:]+)[^>]*)\/>/gi,
	rtagName = /<([\w:]+)/,
	rhtml = /<|&#?\w+;/,
	rnoInnerhtml = /<(?:script|style|link)/i,
	manipulation_rcheckableType = /^(?:checkbox|radio)$/i,
	// checked="checked" or checked
	rchecked = /checked\s*(?:[^=]|=\s*.checked.)/i,
	rscriptType = /^$|\/(?:java|ecma)script/i,
	rscriptTypeMasked = /^true\/(.*)/,
	rcleanScript = /^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g,

	// We have to close these tags to support XHTML (#13200)
	// Each entry is [ wrapper depth, opening markup, closing markup ]; the
	// depth is the number of firstChild steps buildFragment descends.
	wrapMap = {

		// Support: IE 9
		option: [ 1, "<select multiple='multiple'>", "</select>" ],

		thead: [ 1, "<table>", "</table>" ],
		tr: [ 2, "<table><tbody>", "</tbody></table>" ],
		td: [ 3, "<table><tbody><tr>", "</tr></tbody></table>" ],

		_default: [ 0, "", "" ]
	};

// Support: IE 9
wrapMap.optgroup = wrapMap.option;

wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.col = wrapMap.thead;
wrapMap.th = wrapMap.td;

jQuery.fn.extend({
	// Getter (no argument): jQuery.text of the set.
	// Setter: empty the set and append a single text node holding `value`.
	text: function( value ) {
		return jQuery.access( this, function( value ) {
			return value === undefined ?
jQuery.text( this ) :
				this.empty().append( ( this[ 0 ] && this[ 0 ].ownerDocument || document ).createTextNode( value ) );
		}, null, value, arguments.length );
	},

	// Insert content as the last child of each target. Only element (1),
	// fragment (11) and document (9) nodes are accepted as targets.
	append: function() {
		return this.domManip( arguments, function( elem ) {
			if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) {
				var target = manipulationTarget( this, elem );
				target.appendChild( elem );
			}
		});
	},

	// Insert content as the first child of each target (same node-type
	// restriction as append).
	prepend: function() {
		return this.domManip( arguments, function( elem ) {
			if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) {
				var target = manipulationTarget( this, elem );
				target.insertBefore( elem, target.firstChild );
			}
		});
	},

	// Insert content before each element that has a parent.
	before: function() {
		return this.domManip( arguments, function( elem ) {
			if ( this.parentNode ) {
				this.parentNode.insertBefore( elem, this );
			}
		});
	},

	// Insert content after each element that has a parent.
	after: function() {
		return this.domManip( arguments, function( elem ) {
			if ( this.parentNode ) {
				this.parentNode.insertBefore( elem, this.nextSibling );
			}
		});
	},

	// keepData is for internal use only--do not document
	// Remove the elements (optionally only those matching `selector`) from
	// their parents. Unless keepData is set, jQuery data of each element and
	// its descendants is cleaned first.
	remove: function( selector, keepData ) {
		var elem,
			elems = selector ? jQuery.filter( selector, this ) : this,
			i = 0;

		for ( ; (elem = elems[i]) != null; i++ ) {
			if ( !keepData && elem.nodeType === 1 ) {
				jQuery.cleanData( getAll( elem ) );
			}

			if ( elem.parentNode ) {
				// When data is kept and the element is currently in its
				// document, mark its scripts as already evaluated.
				if ( keepData && jQuery.contains( elem.ownerDocument, elem ) ) {
					setGlobalEval( getAll( elem, "script" ) );
				}
				elem.parentNode.removeChild( elem );
			}
		}

		return this;
	},

	// Remove all content of each element node in the set.
	empty: function() {
		var elem,
			i = 0;

		for ( ; (elem = this[i]) != null; i++ ) {
			if ( elem.nodeType === 1 ) {

				// Prevent memory leaks
				jQuery.cleanData( getAll( elem, false ) );

				// Remove any remaining nodes
				elem.textContent = "";
			}
		}

		return this;
	},

	// Clone every element of the set. dataAndEvents defaults to false and
	// deepDataAndEvents defaults to the value of dataAndEvents.
	clone: function( dataAndEvents, deepDataAndEvents ) {
		dataAndEvents = dataAndEvents == null ? false : dataAndEvents;
		deepDataAndEvents = deepDataAndEvents == null ?
dataAndEvents : deepDataAndEvents;

		return this.map( function () {
			return jQuery.clone( this, dataAndEvents, deepDataAndEvents );
		});
	},

	// Getter (no argument): innerHTML of the first element.
	// Setter: replace the content of every element with `value`.
	html: function( value ) {
		return jQuery.access( this, function( value ) {
			var elem = this[ 0 ] || {},
				i = 0,
				l = this.length;

			if ( value === undefined && elem.nodeType === 1 ) {
				return elem.innerHTML;
			}

			// See if we can take a shortcut and just use innerHTML
			if ( typeof value === "string" && !rnoInnerhtml.test( value ) &&
				!wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) {

				// Expand self-closed tags (other than the excluded void
				// elements) into explicit open/close pairs
				value = value.replace( rxhtmlTag, "<$1></$2>" );

				try {
					for ( ; i < l; i++ ) {
						elem = this[ i ] || {};

						// Remove element nodes and prevent memory leaks
						if ( elem.nodeType === 1 ) {
							jQuery.cleanData( getAll( elem, false ) );
							elem.innerHTML = value;
						}
					}

					// Signals "shortcut succeeded" to the fallback check below
					elem = 0;

				// If using innerHTML throws an exception, use the fallback method
				} catch( e ) {}
			}

			if ( elem ) {
				this.empty().append( value );
			}
		}, null, value, arguments.length );
	},

	// Replace each element of the set with the given content.
	replaceWith: function() {
		var
			// Snapshot the DOM in case .domManip sweeps something relevant into its fragment
			// (flat list: nextSibling, parentNode, nextSibling, parentNode, ...)
			args = jQuery.map( this, function( elem ) {
				return [ elem.nextSibling, elem.parentNode ];
			}),
			i = 0;

		// Make the changes, replacing each context element with the new content
		this.domManip( arguments, function( elem ) {
			var next = args[ i++ ],
				parent = args[ i++ ];

			if ( parent ) {
				jQuery( this ).remove();
				parent.insertBefore( elem, next );
			}
		// Allow new content to include elements from the context set
		}, true );

		// Force removal if there was no new content (e.g., from empty arguments)
		return i ?
this : this.remove();
	},

	// Like .remove(), but passes keepData = true.
	detach: function( selector ) {
		return this.remove( selector, true );
	},

	// Shared engine of the insertion methods.
	//   args              - content arguments (nested arrays are flattened)
	//   callback          - called as callback.call( target, node, index )
	//   allowIntersection - when falsy, this set is handed to buildFragment
	//                       as its `selection` argument
	domManip: function( args, callback, allowIntersection ) {

		// Flatten any nested arrays
		args = core_concat.apply( [], args );

		var fragment, first, scripts, hasScripts, node, doc,
			i = 0,
			l = this.length,
			set = this,
			iNoClone = l - 1,
			value = args[ 0 ],
			isFunction = jQuery.isFunction( value );

		// We can't cloneNode fragments that contain checked, in WebKit
		// In that case, and for function arguments, handle each element of
		// the set separately.
		if ( isFunction || !( l <= 1 || typeof value !== "string" || jQuery.support.checkClone || !rchecked.test( value ) ) ) {
			return this.each(function( index ) {
				var self = set.eq( index );
				if ( isFunction ) {
					args[ 0 ] = value.call( this, index, self.html() );
				}
				self.domManip( args, callback, allowIntersection );
			});
		}

		if ( l ) {
			fragment = jQuery.buildFragment( args, this[ 0 ].ownerDocument, false, !allowIntersection && this );
			first = fragment.firstChild;

			// A fragment with a single child is replaced by that child
			if ( fragment.childNodes.length === 1 ) {
				fragment = first;
			}

			if ( first ) {
				// Mask script types (disableScript) while nodes are moved around
				scripts = jQuery.map( getAll( fragment, "script" ), disableScript );
				hasScripts = scripts.length;

				// Use the original fragment for the last item instead of the first because it can end up
				// being emptied incorrectly in certain situations (#8070).
for ( ; i < l; i++ ) {
					node = fragment;

					// Every target except the last one receives a clone
					if ( i !== iNoClone ) {
						node = jQuery.clone( node, true, true );

						// Keep references to cloned scripts for later restoration
						if ( hasScripts ) {
							// Support: QtWebKit
							// jQuery.merge because core_push.apply(_, arraylike) throws
							jQuery.merge( scripts, getAll( node, "script" ) );
						}
					}

					callback.call( this[ i ], node, i );
				}

				if ( hasScripts ) {
					doc = scripts[ scripts.length - 1 ].ownerDocument;

					// Reenable scripts
					jQuery.map( scripts, restoreScript );

					// Evaluate executable scripts on first document insertion
					// (only the first `hasScripts` entries, i.e. the scripts
					// of the original fragment, are considered)
					for ( i = 0; i < hasScripts; i++ ) {
						node = scripts[ i ];
						if ( rscriptType.test( node.type || "" ) &&
							!data_priv.access( node, "globalEval" ) && jQuery.contains( doc, node ) ) {

							if ( node.src ) {
								// Hope ajax is available...
								jQuery._evalUrl( node.src );
							} else {
								jQuery.globalEval( node.textContent.replace( rcleanScript, "" ) );
							}
						}
					}
				}
			}
		}

		return this;
	}
});

// Generate the "reversed" insertion methods: each one inserts this set into
// every target matched by `selector` by calling the mapped method on it.
jQuery.each({
	appendTo: "append",
	prependTo: "prepend",
	insertBefore: "before",
	insertAfter: "after",
	replaceAll: "replaceWith"
}, function( name, original ) {
	jQuery.fn[ name ] = function( selector ) {
		var elems,
			ret = [],
			insert = jQuery( selector ),
			last = insert.length - 1,
			i = 0;

		for ( ; i <= last; i++ ) {
			// The last target gets the original elements, earlier ones get clones
			elems = i === last ?
this : this.clone( true );
			jQuery( insert[ i ] )[ original ]( elems );

			// Support: QtWebKit
			// .get() because core_push.apply(_, arraylike) throws
			core_push.apply( ret, elems.get() );
		}

		return this.pushStack( ret );
	};
});

jQuery.extend({
	// Deep-clone `elem` with cloneNode( true ).
	//   dataAndEvents     - copy jQuery data/events of elem to the clone
	//   deepDataAndEvents - also copy them for all descendants
	clone: function( elem, dataAndEvents, deepDataAndEvents ) {
		var i, l, srcElements, destElements,
			clone = elem.cloneNode( true ),
			inPage = jQuery.contains( elem.ownerDocument, elem );

		// Support: IE >= 9
		// Fix Cloning issues
		if ( !jQuery.support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && !jQuery.isXMLDoc( elem ) ) {

			// We eschew Sizzle here for performance reasons: http://jsperf.com/getall-vs-sizzle/2
			destElements = getAll( clone );
			srcElements = getAll( elem );

			for ( i = 0, l = srcElements.length; i < l; i++ ) {
				fixInput( srcElements[ i ], destElements[ i ] );
			}
		}

		// Copy the events from the original to the clone
		if ( dataAndEvents ) {
			if ( deepDataAndEvents ) {
				// Reuse the lists gathered above when they exist
				srcElements = srcElements || getAll( elem );
				destElements = destElements || getAll( clone );

				for ( i = 0, l = srcElements.length; i < l; i++ ) {
					cloneCopyEvent( srcElements[ i ], destElements[ i ] );
				}
			} else {
				cloneCopyEvent( elem, clone );
			}
		}

		// Preserve script evaluation history
		destElements = getAll( clone, "script" );
		if ( destElements.length > 0 ) {
			setGlobalEval( destElements, !inPage && getAll( elem, "script" ) );
		}

		// Return the cloned set
		return clone;
	},

	// Build a document fragment (owned by `context`) from `elems`, whose
	// entries may be nodes, array-likes of nodes, plain text or HTML strings.
	//   scripts   - optional array that receives executable script elements
	//   selection - optional collection; nodes found in it are left out
	buildFragment: function( elems, context, scripts, selection ) {
		var elem, tmp, tag, wrap, contains, j,
			i = 0,
			l = elems.length,
			fragment = context.createDocumentFragment(),
			nodes = [];

		for ( ; i < l; i++ ) {
			elem = elems[ i ];

			if ( elem || elem === 0 ) {

				// Add nodes directly
				if ( jQuery.type( elem ) === "object" ) {
					// Support: QtWebKit
					// jQuery.merge because core_push.apply(_, arraylike) throws
					jQuery.merge( nodes, elem.nodeType ?
[ elem ] : elem );

				// Convert non-html into a text node
				} else if ( !rhtml.test( elem ) ) {
					nodes.push( context.createTextNode( elem ) );

				// Convert html into DOM nodes
				} else {
					// A single scratch <div> inside the fragment is reused
					// for every HTML string
					tmp = tmp || fragment.appendChild( context.createElement("div") );

					// Deserialize a standard representation
					tag = ( rtagName.exec( elem ) || ["", ""] )[ 1 ].toLowerCase();
					wrap = wrapMap[ tag ] || wrapMap._default;
					tmp.innerHTML = wrap[ 1 ] + elem.replace( rxhtmlTag, "<$1></$2>" ) + wrap[ 2 ];

					// Descend through wrappers to the right content
					j = wrap[ 0 ];
					while ( j-- ) {
						tmp = tmp.firstChild;
					}

					// Support: QtWebKit
					// jQuery.merge because core_push.apply(_, arraylike) throws
					jQuery.merge( nodes, tmp.childNodes );

					// Remember the top-level container
					tmp = fragment.firstChild;

					// Fixes #12346
					// Support: Webkit, IE
					tmp.textContent = "";
				}
			}
		}

		// Remove wrapper from fragment
		fragment.textContent = "";

		i = 0;
		while ( (elem = nodes[ i++ ]) ) {

			// #4087 - If origin and destination elements are the same, and this is
			// that element, do not do anything
			if ( selection && jQuery.inArray( elem, selection ) !== -1 ) {
				continue;
			}

			// Was the node attached to its document before being moved?
			contains = jQuery.contains( elem.ownerDocument, elem );

			// Append to fragment
			tmp = getAll( fragment.appendChild( elem ), "script" );

			// Preserve script evaluation history
			if ( contains ) {
				setGlobalEval( tmp );
			}

			// Capture executables
			if ( scripts ) {
				j = 0;
				while ( (elem = tmp[ j++ ]) ) {
					if ( rscriptType.test( elem.type || "" ) ) {
						scripts.push( elem );
					}
				}
			}
		}

		return fragment;
	},

	// Unbind the event types recorded in each element's private data and
	// discard the element's private and user data.
	cleanData: function( elems ) {
		var data, elem, type,
			l = elems.length,
			i = 0,
			special = jQuery.event.special;

		for ( ; i < l; i++ ) {
			elem = elems[ i ];

			if ( jQuery.acceptData( elem ) ) {

				data = data_priv.access( elem );

				if ( data ) {
					for ( type in data.events ) {
						// Special event types go through the full removal path
						if ( special[ type ] ) {
							jQuery.event.remove( elem, type );

						// This is a shortcut to avoid jQuery.event.remove's overhead
						} else {
							jQuery.removeEvent( elem, type, data.handle );
						}
					}
				}
			}
			// Discard any remaining `private` and `user` data
// One day we'll replace the dual arrays with a WeakMap and this won't be an issue.
			// (Splices the data objects out of the internal cache arrays)
			data_user.discard( elem );
			data_priv.discard( elem );
		}
	},

	// Fetch `url` as text with a synchronous, non-global GET request and hand
	// the response to jQuery.globalEval.
	_evalUrl: function( url ) {
		return jQuery.ajax({
			url: url,
			type: "GET",
			dataType: "text",
			async: false,
			global: false,
			success: jQuery.globalEval
		});
	}
});

// Support: 1.x compatibility
// Manipulating tables requires a tbody
// When a <tr> (or content whose first child is a <tr>) is inserted into a
// <table>, return the table's first <tbody>, creating one if needed;
// otherwise return `elem` unchanged.
function manipulationTarget( elem, content ) {
	return jQuery.nodeName( elem, "table" ) &&
		jQuery.nodeName( content.nodeType === 1 ? content : content.firstChild, "tr" ) ?

		elem.getElementsByTagName("tbody")[0] ||
			elem.appendChild( elem.ownerDocument.createElement("tbody") ) :
		elem;
}

// Replace/restore the type attribute of script elements for safe DOM manipulation
// disableScript prefixes the type with "true/" or "false/", depending on
// whether a type attribute was present.
function disableScript( elem ) {
	elem.type = (elem.getAttribute("type") !== null) + "/" + elem.type;
	return elem;
}
// restoreScript undoes disableScript: a "true/..." type is restored to its
// original value; in every other case the type attribute is removed.
function restoreScript( elem ) {
	var match = rscriptTypeMasked.exec( elem.type );

	if ( match ) {
		elem.type = match[ 1 ];
	} else {
		elem.removeAttribute("type");
	}

	return elem;
}

// Mark scripts as having already been evaluated
// With `refElements`, each flag is copied from the element at the same index
// instead of being set to true.
function setGlobalEval( elems, refElements ) {
	var l = elems.length,
		i = 0;

	for ( ; i < l; i++ ) {
		data_priv.set(
			elems[ i ], "globalEval", !refElements || data_priv.get( refElements[ i ], "globalEval" )
		);
	}
}

// Copy jQuery private data (re-adding event handlers) and user data from
// `src` to `dest`. Destinations that are not element nodes are ignored.
function cloneCopyEvent( src, dest ) {
	var i, l, type, pdataOld, pdataCur, udataOld, udataCur, events;

	if ( dest.nodeType !== 1 ) {
		return;
	}

	// 1. Copy private data: events, handlers, etc.
	if ( data_priv.hasData( src ) ) {
		pdataOld = data_priv.access( src );
		pdataCur = jQuery.extend( {}, pdataOld );
		events = pdataOld.events;

		data_priv.set( dest, pdataCur );

		if ( events ) {
			// Drop the copied handle/events and re-add every handler on dest
			delete pdataCur.handle;
			pdataCur.events = {};

			for ( type in events ) {
				for ( i = 0, l = events[ type ].length; i < l; i++ ) {
					jQuery.event.add( dest, type, events[ type ][ i ] );
				}
			}
		}
	}

	// 2. Copy user data
	if ( data_user.hasData( src ) ) {
		udataOld = data_user.access( src );
		udataCur = jQuery.extend( {}, udataOld );

		data_user.set( dest, udataCur );
	}
}


// Return the descendants of `context` with tag name `tag` ("*" when falsy).
// `context` itself is prepended when `tag` is undefined or when it matches
// context's own node name.
function getAll( context, tag ) {
	var ret = context.getElementsByTagName ? context.getElementsByTagName( tag || "*" ) :
			context.querySelectorAll ? context.querySelectorAll( tag || "*" ) :
			[];

	return tag === undefined || tag && jQuery.nodeName( context, tag ) ?
		jQuery.merge( [ context ], ret ) :
		ret;
}

// Support: IE >= 9
// Copy form state that cloneNode does not carry over from `src` to `dest`.
function fixInput( src, dest ) {
	var nodeName = dest.nodeName.toLowerCase();

	// Fails to persist the checked state of a cloned checkbox or radio button.
	if ( nodeName === "input" && manipulation_rcheckableType.test( src.type ) ) {
		dest.checked = src.checked;

	// Fails to return the selected option to the default selected state when cloning options
	// NOTE(review): the branch below copies defaultValue for input/textarea;
	// the sentence above appears to describe option cloning instead.
	} else if ( nodeName === "input" || nodeName === "textarea" ) {
		dest.defaultValue = src.defaultValue;
	}
}
jQuery.fn.extend({
	// Wrap the whole set in one copy of the structure described by `html`.
	// A function argument is called per element and applied individually.
	wrapAll: function( html ) {
		var wrap;

		if ( jQuery.isFunction( html ) ) {
			return this.each(function( i ) {
				jQuery( this ).wrapAll( html.call(this, i) );
			});
		}

		if ( this[ 0 ] ) {

			// The elements to wrap the target around
			wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true );

			if ( this[ 0 ].parentNode ) {
				wrap.insertBefore( this[ 0 ] );
			}

			// Descend to the innermost first element child of the wrapper
			// and append the set there
			wrap.map(function() {
				var elem = this;

				while ( elem.firstElementChild ) {
					elem = elem.firstElementChild;
				}

				return elem;
			}).append( this );
		}

		return this;
	},

	// Wrap the contents of each element; elements without contents get
	// `html` appended instead.
	wrapInner: function( html ) {
		if ( jQuery.isFunction( html ) ) {
			return this.each(function( i ) {
				jQuery( this ).wrapInner( html.call(this, i) );
			});
		}

		return this.each(function() {
			var self = jQuery( this ),
				contents = self.contents();

			if ( contents.length ) {
				contents.wrapAll( html );

			} else {
				self.append( html );
			}
		});
	},

	// Wrap each element of the set individually.
	wrap: function( html ) {
		var isFunction = jQuery.isFunction( html );

		return this.each(function( i ) {
			jQuery( this ).wrapAll( isFunction ?
html.call(this, i) : html );
		});
	},

	// Replace the parent of each element with the parent's child nodes;
	// <body> parents are left alone.
	unwrap: function() {
		return this.parent().each(function() {
			if ( !jQuery.nodeName( this, "body" ) ) {
				jQuery( this ).replaceWith( this.childNodes );
			}
		}).end();
	}
});
// Shared state, regular expressions and tables of the css module.
var curCSS, iframe,
	// swappable if display is none or starts with table except "table", "table-cell", or "table-caption"
	// see here for display values: https://developer.mozilla.org/en-US/docs/CSS/display
	rdisplayswap = /^(none|table(?!-c[ea]).+)/,
	rmargin = /^margin/,
	rnumsplit = new RegExp( "^(" + core_pnum + ")(.*)$", "i" ),
	rnumnonpx = new RegExp( "^(" + core_pnum + ")(?!px)[a-z%]+$", "i" ),
	rrelNum = new RegExp( "^([+-])=(" + core_pnum + ")", "i" ),
	elemdisplay = { BODY: "block" },

	cssShow = { position: "absolute", visibility: "hidden", display: "block" },
	cssNormalTransform = {
		letterSpacing: 0,
		fontWeight: 400
	},

	cssExpand = [ "Top", "Right", "Bottom", "Left" ],
	cssPrefixes = [ "Webkit", "O", "Moz", "ms" ];

// return a css property mapped to a potentially vendor prefixed property
// Falls back to the unprefixed name when no prefixed variant exists on `style`.
function vendorPropName( style, name ) {

	// shortcut for names that are not vendor prefixed
	if ( name in style ) {
		return name;
	}

	// check for vendor prefixed names
	var capName = name.charAt(0).toUpperCase() + name.slice(1),
		origName = name,
		i = cssPrefixes.length;

	// Prefixes are tried from the end of cssPrefixes to the start
	while ( i-- ) {
		name = cssPrefixes[ i ] + capName;
		if ( name in style ) {
			return name;
		}
	}

	return origName;
}

// An element counts as hidden when its "display" is "none" or when it is not
// contained in its owner document.
function isHidden( elem, el ) {
	// isHidden might be called from jQuery#filter function;
	// in that case, element will be second argument
	elem = el || elem;
	return jQuery.css( elem, "display" ) === "none" || !jQuery.contains( elem.ownerDocument, elem );
}

// NOTE: we've included the "window" in window.getComputedStyle
// because jsdom on node.js will break without it.
function getStyles( elem ) { return window.getComputedStyle( elem, null ); } function showHide( elements, show ) { var display, elem, hidden, values = [], index = 0, length = elements.length; for ( ; index < length; index++ ) { elem = elements[ index ]; if ( !elem.style ) { continue; } values[ index ] = data_priv.get( elem, "olddisplay" ); display = elem.style.display; if ( show ) { // Reset the inline display of this element to learn if it is // being hidden by cascaded rules or not if ( !values[ index ] && display === "none" ) { elem.style.display = ""; } // Set elements which have been overridden with display: none // in a stylesheet to whatever the default browser style is // for such an element if ( elem.style.display === "" && isHidden( elem ) ) { values[ index ] = data_priv.access( elem, "olddisplay", css_defaultDisplay(elem.nodeName) ); } } else { if ( !values[ index ] ) { hidden = isHidden( elem ); if ( display && display !== "none" || !hidden ) { data_priv.set( elem, "olddisplay", hidden ? display : jQuery.css(elem, "display") ); } } } } // Set the display of most of the elements in a second loop // to avoid the constant reflow for ( index = 0; index < length; index++ ) { elem = elements[ index ]; if ( !elem.style ) { continue; } if ( !show || elem.style.display === "none" || elem.style.display === "" ) { elem.style.display = show ? values[ index ] || "" : "none"; } } return elements; } jQuery.fn.extend({ css: function( name, value ) { return jQuery.access( this, function( elem, name, value ) { var styles, len, map = {}, i = 0; if ( jQuery.isArray( name ) ) { styles = getStyles( elem ); len = name.length; for ( ; i < len; i++ ) { map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); } return map; } return value !== undefined ? 
jQuery.style( elem, name, value ) : jQuery.css( elem, name ); }, name, value, arguments.length > 1 ); }, show: function() { return showHide( this, true ); }, hide: function() { return showHide( this ); }, toggle: function( state ) { var bool = typeof state === "boolean"; return this.each(function() { if ( bool ? state : isHidden( this ) ) { jQuery( this ).show(); } else { jQuery( this ).hide(); } }); } }); jQuery.extend({ // Add in style property hooks for overriding the default // behavior of getting and setting a style property cssHooks: { opacity: { get: function( elem, computed ) { if ( computed ) { // We should always get a number back from opacity var ret = curCSS( elem, "opacity" ); return ret === "" ? "1" : ret; } } } }, // Exclude the following css properties to add px cssNumber: { "columnCount": true, "fillOpacity": true, "fontWeight": true, "lineHeight": true, "opacity": true, "orphans": true, "widows": true, "zIndex": true, "zoom": true }, // Add in properties whose names you wish to fix before // setting or getting the value cssProps: { // normalize float css property "float": "cssFloat" }, // Get and set the style property on a DOM Node style: function( elem, name, value, extra ) { // Don't set styles on text and comment nodes if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { return; } // Make sure that we're working with the right name var ret, type, hooks, origName = jQuery.camelCase( name ), style = elem.style; name = jQuery.cssProps[ origName ] || ( jQuery.cssProps[ origName ] = vendorPropName( style, origName ) ); // gets hook for the prefixed version // followed by the unprefixed version hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; // Check if we're setting a value if ( value !== undefined ) { type = typeof value; // convert relative number strings (+= or -=) to relative numbers. 
// Core style API: hook registry, unit-less property list, property-name
// fixes, and the low-level jQuery.style / jQuery.css accessors.
jQuery.extend({
	// Add in style property hooks for overriding the default
	// behavior of getting and setting a style property
	cssHooks: {
		opacity: {
			get: function( elem, computed ) {
				if ( computed ) {
					// We should always get a number back from opacity
					var ret = curCSS( elem, "opacity" );
					return ret === "" ? "1" : ret;
				}
			}
		}
	},

	// Exclude the following css properties to add px.
	// flexGrow, flexShrink and order take plain numbers as well; without
	// them a numeric value would be written as e.g. "2px", which is invalid.
	cssNumber: {
		"columnCount": true,
		"fillOpacity": true,
		"flexGrow": true,
		"flexShrink": true,
		"fontWeight": true,
		"lineHeight": true,
		"opacity": true,
		"order": true,
		"orphans": true,
		"widows": true,
		"zIndex": true,
		"zoom": true
	},

	// Add in properties whose names you wish to fix before
	// setting or getting the value
	cssProps: {
		// normalize float css property
		"float": "cssFloat"
	},

	// Get and set the style property on a DOM Node.
	// With `value` undefined this reads the inline (non-computed) value;
	// otherwise it writes `value` and returns undefined.
	style: function( elem, name, value, extra ) {
		// Don't set styles on text and comment nodes
		if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) {
			return;
		}

		// Make sure that we're working with the right name
		var ret, type, hooks,
			origName = jQuery.camelCase( name ),
			style = elem.style;

		name = jQuery.cssProps[ origName ] || ( jQuery.cssProps[ origName ] = vendorPropName( style, origName ) );

		// gets hook for the prefixed version
		// followed by the unprefixed version
		hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ];

		// Check if we're setting a value
		if ( value !== undefined ) {
			type = typeof value;

			// convert relative number strings (+= or -=) to relative numbers. #7345
			if ( type === "string" && (ret = rrelNum.exec( value )) ) {
				value = ( ret[1] + 1 ) * ret[2] + parseFloat( jQuery.css( elem, name ) );
				// Fixes bug #9237
				type = "number";
			}

			// Make sure that NaN and null values aren't set. See: #7116
			if ( value == null || type === "number" && isNaN( value ) ) {
				return;
			}

			// If a number was passed in, add 'px' to the (except for certain CSS properties)
			if ( type === "number" && !jQuery.cssNumber[ origName ] ) {
				value += "px";
			}

			// Fixes #8908, it can be done more correctly by specifying setters in cssHooks,
			// but it would mean to define eight (for every problematic property) identical functions
			if ( !jQuery.support.clearCloneStyle && value === "" && name.indexOf("background") === 0 ) {
				style[ name ] = "inherit";
			}

			// If a hook was provided, use that value, otherwise just set the specified value
			if ( !hooks || !("set" in hooks) || (value = hooks.set( elem, value, extra )) !== undefined ) {
				style[ name ] = value;
			}

		} else {
			// If a hook was provided get the non-computed value from there
			if ( hooks && "get" in hooks && (ret = hooks.get( elem, false, extra )) !== undefined ) {
				return ret;
			}

			// Otherwise just get the value from the style object
			return style[ name ];
		}
	},

	// Read the computed value of a style property.
	// `extra` of "" or any truthy value requests numeric coercion;
	// `extra === true` forces a number even when the value is not numeric.
	css: function( elem, name, extra, styles ) {
		var val, num, hooks,
			origName = jQuery.camelCase( name );

		// Make sure that we're working with the right name
		name = jQuery.cssProps[ origName ] || ( jQuery.cssProps[ origName ] = vendorPropName( elem.style, origName ) );

		// gets hook for the prefixed version
		// followed by the unprefixed version
		hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ];

		// If a hook was provided get the computed value from there
		if ( hooks && "get" in hooks ) {
			val = hooks.get( elem, true, extra );
		}

		// Otherwise, if a way to get the computed value exists, use that
		if ( val === undefined ) {
			val = curCSS( elem, name, styles );
		}

		//convert "normal" to computed value
		if ( val === "normal" && name in cssNormalTransform ) {
			val = cssNormalTransform[ name ];
		}

		// Return, converting to number if forced or a qualifier was provided and val looks numeric
		if ( extra === "" || extra ) {
			num = parseFloat( val );
			return extra === true || jQuery.isNumeric( num ) ? num || 0 : val;
		}
		return val;
	}
});
// Read one computed style value for an element.
//
// elem: the element to inspect
// name: style property name
// _computed: optional, already-fetched computed style object
// Returns the computed value, or undefined when no computed styles exist.
curCSS = function( elem, name, _computed ) {
	var savedWidth, savedMin, savedMax, ret,
		computed = _computed || getStyles( elem ),
		style = elem.style;

	// Support: IE9
	// getPropertyValue is only needed for .css('filter') in IE9, see #12537
	if ( computed ) {
		ret = computed.getPropertyValue( name ) || computed[ name ];
	} else {
		ret = undefined;
	}

	if ( !computed ) {
		return ret;
	}

	// Elements outside the document report "", so use the inline style instead
	if ( ret === "" && !jQuery.contains( elem.ownerDocument, elem ) ) {
		ret = jQuery.style( elem, name );
	}

	// Support: Safari 5.1
	// A tribute to the "awesome hack by Dean Edwards"
	// Safari 5.1.7 (at least) returns percentage for a larger set of values, but width seems to be reliably pixels
	// this is against the CSSOM draft spec: http://dev.w3.org/csswg/cssom/#resolved-values
	if ( rnumnonpx.test( ret ) && rmargin.test( name ) ) {

		// Remember the original values
		savedWidth = style.width;
		savedMin = style.minWidth;
		savedMax = style.maxWidth;

		// Put in the new values to get a computed value out
		style.width = ret;
		style.maxWidth = ret;
		style.minWidth = ret;
		ret = computed.width;

		// Revert the changed values
		style.width = savedWidth;
		style.minWidth = savedMin;
		style.maxWidth = savedMax;
	}

	return ret;
};

// Clamp a "<number><unit>" style value at zero after removing `subtract`.
// Values that do not match rnumsplit are returned untouched; a missing
// unit defaults to "px".
function setPositiveNumber( elem, value, subtract ) {
	var pieces = rnumsplit.exec( value ),
		amount, unit;

	if ( !pieces ) {
		return value;
	}

	// Guard against undefined "subtract", e.g., when used as in cssHooks
	amount = Math.max( 0, pieces[ 1 ] - ( subtract || 0 ) );
	unit = pieces[ 2 ] || "px";
	return amount + unit;
}
// Compute the pixel amount to add to (or remove from) a measured dimension
// so that it describes the box named by `extra`
// ("content", "padding", "border" or "margin").
//
// name: "width" walks the Right/Left sides, anything else Top/Bottom
// isBorderBox: whether the measured value already includes padding + border
// styles: computed styles handed through to jQuery.css
function augmentWidthOrHeight( elem, name, extra, isBorderBox, styles ) {
	var total = 0,
		side, edge;

	function read( prop ) {
		return jQuery.css( elem, prop, true, styles );
	}

	if ( extra === ( isBorderBox ? "border" : "content" ) ) {
		// If we already have the right measurement, avoid augmentation
		side = 4;
	} else {
		// Otherwise initialize for horizontal or vertical properties
		side = name === "width" ? 1 : 0;
	}

	while ( side < 4 ) {
		edge = cssExpand[ side ];

		// both box models exclude margin, so add it if we want it
		if ( extra === "margin" ) {
			total += read( extra + edge );
		}

		if ( !isBorderBox ) {
			// at this point, extra isn't content, so add padding
			total += read( "padding" + edge );

			// at this point, extra isn't content nor padding, so add border
			if ( extra !== "padding" ) {
				total += read( "border" + edge + "Width" );
			}
		} else {
			// border-box includes padding, so remove it if we want content
			if ( extra === "content" ) {
				total -= read( "padding" + edge );
			}

			// at this point, extra isn't border nor margin, so remove border
			if ( extra !== "margin" ) {
				total -= read( "border" + edge + "Width" );
			}
		}

		side += 2;
	}

	return total;
}

// Measure an element's width or height for the box named by `extra`.
// Returns a "<n>px" string, or the raw computed value when that value
// uses a non-pixel unit.
function getWidthOrHeight( elem, name, extra ) {

	// Start with offset property, which is equivalent to the border-box value
	var measuredAsBorderBox = true,
		size = name === "width" ? elem.offsetWidth : elem.offsetHeight,
		styles = getStyles( elem ),
		isBorderBox = jQuery.support.boxSizing &&
			jQuery.css( elem, "boxSizing", false, styles ) === "border-box",
		targetBox;

	// some non-html elements return undefined for offsetWidth, so check for null/undefined
	// svg - https://bugzilla.mozilla.org/show_bug.cgi?id=649285
	// MathML - https://bugzilla.mozilla.org/show_bug.cgi?id=491668
	if ( size <= 0 || size == null ) {

		// Fall back to computed then uncomputed css if necessary
		size = curCSS( elem, name, styles );
		if ( size < 0 || size == null ) {
			size = elem.style[ name ];
		}

		// Computed unit is not pixels. Stop here and return.
		if ( rnumnonpx.test(size) ) {
			return size;
		}

		// we need the check for style in case a browser which returns unreliable values
		// for getComputedStyle silently falls back to the reliable elem.style
		measuredAsBorderBox = isBorderBox &&
			( jQuery.support.boxSizingReliable || size === elem.style[ name ] );

		// Normalize "", auto, and prepare for extra
		size = parseFloat( size ) || 0;
	}

	// use the active box-sizing model to add/subtract irrelevant styles
	targetBox = extra || ( isBorderBox ? "border" : "content" );
	size = size + augmentWidthOrHeight( elem, name, targetBox, measuredAsBorderBox, styles );
	return size + "px";
}
// Try to determine the default display value of an element.
// Results are memoized per node name in `elemdisplay`.
function css_defaultDisplay( nodeName ) {
	var targetDoc = document,
		known = elemdisplay[ nodeName ],
		found;

	if ( known ) {
		return known;
	}

	found = actualDisplay( nodeName, targetDoc );

	// If the simple way fails, read from inside an iframe
	if ( found === "none" || !found ) {

		// Use the already-created iframe if possible
		if ( !iframe ) {
			iframe = jQuery("<iframe frameborder='0' width='0' height='0'/>")
				.css( "cssText", "display:block !important" );
		}
		iframe = iframe.appendTo( targetDoc.documentElement );

		// Always write a new HTML skeleton so Webkit and Firefox don't choke on reuse
		targetDoc = ( iframe[0].contentWindow || iframe[0].contentDocument ).document;
		targetDoc.write("<!doctype html><html><body>");
		targetDoc.close();

		found = actualDisplay( nodeName, targetDoc );
		iframe.detach();
	}

	// Store the correct default display
	elemdisplay[ nodeName ] = found;

	return found;
}

// Called ONLY from within css_defaultDisplay.
// Creates a throw-away element in `doc`, reads its display, removes it.
function actualDisplay( name, doc ) {
	var probe = jQuery( doc.createElement( name ) ).appendTo( doc.body ),
		result = jQuery.css( probe[0], "display" );

	probe.remove();
	return result;
}
// Install the computed getter and clamping setter for "height" and "width"
jQuery.each([ "height", "width" ], function( i, name ) {
	jQuery.cssHooks[ name ] = {
		get: function( elem, computed, extra ) {
			var measure;

			if ( !computed ) {
				return;
			}

			measure = function() {
				return getWidthOrHeight( elem, name, extra );
			};

			// certain elements can have dimension info if we invisibly show them
			// however, it must have a current display style that would benefit from this
			if ( elem.offsetWidth === 0 && rdisplayswap.test( jQuery.css( elem, "display" ) ) ) {
				return jQuery.swap( elem, cssShow, measure );
			}
			return measure();
		},

		set: function( elem, value, extra ) {
			var styles = extra && getStyles( elem ),
				adjustment = 0,
				borderBox;

			// Only a requested box (`extra`) needs its padding/border accounted for
			if ( extra ) {
				borderBox = jQuery.support.boxSizing &&
					jQuery.css( elem, "boxSizing", false, styles ) === "border-box";
				adjustment = augmentWidthOrHeight( elem, name, extra, borderBox, styles );
			}

			return setPositiveNumber( elem, value, adjustment );
		}
	};
});

// These hooks cannot be added until DOM ready because the support test
// for it is not run until after DOM ready
jQuery(function() {
	var needsMarginRightFix = !jQuery.support.reliableMarginRight,
		needsPositionFix = !jQuery.support.pixelPosition && jQuery.fn.position;

	// Support: Android 2.3
	if ( needsMarginRightFix ) {
		jQuery.cssHooks.marginRight = {
			get: function( elem, computed ) {
				if ( !computed ) {
					return;
				}

				// Support: Android 2.3
				// WebKit Bug 13343 - getComputedStyle returns wrong value for margin-right
				// Work around by temporarily setting element display to inline-block
				return jQuery.swap( elem, { "display": "inline-block" },
					curCSS, [ elem, "marginRight" ] );
			}
		};
	}

	// Webkit bug: https://bugs.webkit.org/show_bug.cgi?id=29084
	// getComputedStyle returns percent when specified for top/left/bottom/right
	// rather than make the css module depend on the offset module, we just check for it here
	if ( needsPositionFix ) {
		jQuery.each( [ "top", "left" ], function( i, prop ) {
			jQuery.cssHooks[ prop ] = {
				get: function( elem, computed ) {
					if ( !computed ) {
						return;
					}

					computed = curCSS( elem, prop );

					// if curCSS returns percentage, fallback to offset
					if ( rnumnonpx.test( computed ) ) {
						return jQuery( elem ).position()[ prop ] + "px";
					}
					return computed;
				}
			};
		});
	}
});
// :hidden / :visible selector filters (only when the selector engine is present)
if ( jQuery.expr && jQuery.expr.filters ) {
	jQuery.expr.filters.hidden = function( elem ) {
		// Support: Opera <= 12.12
		// Opera reports offsetWidths and offsetHeights less than zero on some elements
		var noWidth = elem.offsetWidth <= 0;

		return noWidth && elem.offsetHeight <= 0;
	};

	jQuery.expr.filters.visible = function( elem ) {
		var isHiddenElem = jQuery.expr.filters.hidden( elem );

		return !isHiddenElem;
	};
}

// These hooks are used by animate to expand properties
jQuery.each({
	margin: "",
	padding: "",
	border: "Width"
}, function( prefix, suffix ) {
	var hookName = prefix + suffix;

	jQuery.cssHooks[ hookName ] = {

		// Turn a shorthand value into its four per-side longhand values
		expand: function( value ) {
			var side,
				longhand = {},

				// assumes a single number if not a string
				pieces = typeof value === "string" ? value.split(" ") : [ value ];

			for ( side = 0; side < 4; side++ ) {
				longhand[ prefix + cssExpand[ side ] + suffix ] =
					pieces[ side ] || pieces[ side - 2 ] || pieces[ 0 ];
			}

			return longhand;
		}
	};

	if ( !rmargin.test( prefix ) ) {
		jQuery.cssHooks[ hookName ].set = setPositiveNumber;
	}
});

var r20 = /%20/g,
	rbracket = /\[\]$/,
	rCRLF = /\r?\n/g,
	rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i,
	rsubmittable = /^(?:input|select|textarea|keygen)/i;

// Form serialization helpers
jQuery.fn.extend({

	// Encode the collection's form controls as a query string
	serialize: function() {
		var pairs = this.serializeArray();

		return jQuery.param( pairs );
	},

	// Collect { name, value } pairs for every control that would be submitted
	serializeArray: function() {

		// Line endings inside values are normalized to CRLF
		function toPair( fieldName, raw ) {
			return { name: fieldName, value: raw.replace( rCRLF, "\r\n" ) };
		}

		function expandForm() {
			// Can add propHook for "elements" to filter or add form elements
			var controls = jQuery.prop( this, "elements" );

			if ( controls ) {
				return jQuery.makeArray( controls );
			}
			return this;
		}

		function isSubmittable() {
			var type = this.type;

			// Use .is(":disabled") so that fieldset[disabled] works
			return this.name && !jQuery( this ).is( ":disabled" ) &&
				rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) &&
				( this.checked || !manipulation_rcheckableType.test( type ) );
		}

		function toPairs( i, elem ) {
			var val = jQuery( this ).val();

			if ( val == null ) {
				return null;
			}
			if ( jQuery.isArray( val ) ) {
				return jQuery.map( val, function( item ) {
					return toPair( elem.name, item );
				});
			}
			return toPair( elem.name, val );
		}

		return this.map( expandForm ).filter( isSubmittable ).map( toPairs ).get();
	}
});
// Serialize an array of form elements or a set of
// key/values into a query string.
//
// a: array / jQuery collection of { name, value } entries, or a plain object
// traditional: true for the shallow jQuery <= 1.3.2 encoding; defaults to
//              jQuery.ajaxSettings.traditional when omitted
// Returns the URL-encoded string ("" when `a` is null or undefined).
jQuery.param = function( a, traditional ) {
	var prefix,
		s = [],
		add = function( key, value ) {
			// If value is a function, invoke it and return its value
			value = jQuery.isFunction( value ) ? value() : ( value == null ? "" : value );
			s[ s.length ] = encodeURIComponent( key ) + "=" + encodeURIComponent( value );
		};

	// Nothing to serialize; also avoids a TypeError on `a.jquery` below
	if ( a == null ) {
		return "";
	}

	// Set traditional to true for jQuery <= 1.3.2 behavior.
	if ( traditional === undefined ) {
		traditional = jQuery.ajaxSettings && jQuery.ajaxSettings.traditional;
	}

	// If an array was passed in, assume that it is an array of form elements.
	if ( jQuery.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) {
		// Serialize the form elements
		jQuery.each( a, function() {
			add( this.name, this.value );
		});

	} else {
		// If traditional, encode the "old" way (the way 1.3.2 or older
		// did it), otherwise encode params recursively.
		for ( prefix in a ) {
			buildParams( prefix, a[ prefix ], traditional, add );
		}
	}

	// Return the resulting serialization
	return s.join( "&" ).replace( r20, "+" );
};
// Recursively flatten `obj` into key/value pairs, reporting each pair
// through `add( key, value )`.
//
// prefix: key accumulated so far (e.g. "a[b]")
// obj: value to serialize (array, object or scalar)
// traditional: when truthy, arrays are emitted under the bare prefix and
//              objects are not descended into
// add: callback receiving every final key/value pair
function buildParams( prefix, obj, traditional, add ) {
	var name;

	if ( jQuery.isArray( obj ) ) {
		// Serialize array item.
		jQuery.each( obj, function( i, v ) {
			if ( traditional || rbracket.test( prefix ) ) {
				// Treat each array item as a scalar.
				add( prefix, v );

			} else {
				// Item is non-scalar (array or object), encode its numeric index.
				// null is excluded explicitly: typeof null is "object", but it
				// is a scalar and must be keyed "[]" like any other scalar.
				buildParams(
					prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]",
					v,
					traditional,
					add
				);
			}
		});

	} else if ( !traditional && jQuery.type( obj ) === "object" ) {
		// Serialize object item.
		for ( name in obj ) {
			buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add );
		}

	} else {
		// Serialize scalar item.
		add( prefix, obj );
	}
}
// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport.
// Returns a registration function that files `func` under every dataType
// named in a whitespace-separated expression.
function addToPrefiltersOrTransports( structure ) {

	// dataTypeExpression is optional and defaults to "*"
	return function( dataTypeExpression, func ) {
		var names, key, prepend, bucket, idx;

		if ( typeof dataTypeExpression !== "string" ) {
			func = dataTypeExpression;
			dataTypeExpression = "*";
		}

		names = dataTypeExpression.toLowerCase().match( core_rnotwhite ) || [];

		if ( !jQuery.isFunction( func ) ) {
			return;
		}

		// For each dataType in the dataTypeExpression
		for ( idx = 0; idx < names.length; idx++ ) {
			key = names[ idx ];

			// A leading "+" asks for the handler to be put first
			prepend = key[0] === "+";
			if ( prepend ) {
				key = key.slice( 1 ) || "*";
			}

			bucket = structure[ key ] = structure[ key ] || [];
			if ( prepend ) {
				bucket.unshift( func );
			} else {
				bucket.push( func );
			}
		}
	};
}
// Base inspection function for prefilters and transports.
// Runs the handlers registered for the first dataType and then, if "*" was
// not visited yet, those registered for "*". When `structure` is the
// transports registry the first truthy handler result is returned.
function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) {

	var visited = {},
		wantTransport = ( structure === transports ),
		firstResult;

	function visit( dataType ) {
		var chosen, outcome, idx,
			handlers = structure[ dataType ] || [],
			count = handlers.length;

		visited[ dataType ] = true;

		for ( idx = 0; idx < count; idx++ ) {
			outcome = handlers[ idx ]( options, originalOptions, jqXHR );

			if ( typeof outcome === "string" && !wantTransport && !visited[ outcome ] ) {

				// A prefilter redirected to another dataType: make it the
				// current one, run its prefilters and stop this list
				options.dataTypes.unshift( outcome );
				visit( outcome );
				break;
			}

			if ( wantTransport ) {
				chosen = outcome;

				// The first factory that yields a transport wins
				if ( chosen ) {
					break;
				}
			}
		}

		return chosen;
	}

	firstResult = visit( options.dataTypes[ 0 ] );
	if ( firstResult ) {
		return firstResult;
	}
	return !visited[ "*" ] && visit( "*" );
}

// A special extend for ajax options
// that takes "flat" options (not to be deep extended)
// Fixes #9887
function ajaxExtend( target, src ) {
	var key, nested,
		flat = jQuery.ajaxSettings.flatOptions || {};

	for ( key in src ) {
		if ( src[ key ] === undefined ) {
			continue;
		}

		if ( flat[ key ] ) {
			// Flat options are copied by reference
			target[ key ] = src[ key ];
		} else {
			if ( !nested ) {
				nested = {};
			}
			nested[ key ] = src[ key ];
		}
	}

	// Everything else is deep-merged in one go
	if ( nested ) {
		jQuery.extend( true, target, nested );
	}

	return target;
}
// Load HTML from `url` into every element of the collection.
// "url selector" inserts only the part of the response matching `selector`.
// Non-string first arguments are handed to the original event-style .load().
jQuery.fn.load = function( url, params, callback ) {
	if ( typeof url !== "string" && _load ) {
		return _load.apply( this, arguments );
	}

	var selector, method, response,
		self = this,
		target = url,
		payload = params,
		onComplete = callback,
		spaceAt = url.indexOf(" ");

	if ( spaceAt >= 0 ) {
		selector = url.slice( spaceAt );
		target = url.slice( 0, spaceAt );
	}

	if ( jQuery.isFunction( payload ) ) {

		// If it's a function, we assume that it's the callback
		onComplete = payload;
		payload = undefined;

	} else if ( payload && typeof payload === "object" ) {

		// Otherwise, build a param string
		method = "POST";
	}

	// Without elements to modify there is nothing to request
	if ( self.length <= 0 ) {
		return this;
	}

	jQuery.ajax({
		url: target,

		// if "type" variable is undefined, then "GET" method will be used
		type: method,
		dataType: "html",
		data: payload
	}).done(function( responseText ) {
		var content = responseText;

		// Save response for use in complete callback
		response = arguments;

		if ( selector ) {
			// If a selector was specified, locate the right elements in a dummy div
			// Exclude scripts to avoid IE 'Permission Denied' errors
			content = jQuery("<div>").append( jQuery.parseHTML( responseText ) ).find( selector );
		}

		self.html( content );

	}).complete( onComplete && function( jqXHR, status ) {
		self.each( onComplete, response || [ jqXHR.responseText, status, jqXHR ] );
	});

	return this;
};

// Attach a bunch of functions for handling common AJAX events
jQuery.each( [ "ajaxStart", "ajaxStop", "ajaxComplete", "ajaxError", "ajaxSuccess", "ajaxSend" ], function( i, type ){
	jQuery.fn[ type ] = function( fn ){
		var collection = this;

		return collection.on( type, fn );
	};
});
// Ajax core: global counters and caches, default settings, the
// prefilter/transport registration entry points and jQuery.ajax itself.
jQuery.extend({

	// Counter for holding the number of active queries
	active: 0,

	// Last-Modified header cache for next request
	lastModified: {},
	etag: {},

	ajaxSettings: {
		url: ajaxLocation,
		type: "GET",
		isLocal: rlocalProtocol.test( ajaxLocParts[ 1 ] ),
		global: true,
		processData: true,
		async: true,
		contentType: "application/x-www-form-urlencoded; charset=UTF-8",
		/*
		timeout: 0,
		data: null,
		dataType: null,
		username: null,
		password: null,
		cache: null,
		throws: false,
		traditional: false,
		headers: {},
		*/

		accepts: {
			"*": allTypes,
			text: "text/plain",
			html: "text/html",
			xml: "application/xml, text/xml",
			json: "application/json, text/javascript"
		},

		contents: {
			xml: /xml/,
			html: /html/,
			json: /json/
		},

		responseFields: {
			xml: "responseXML",
			text: "responseText",
			json: "responseJSON"
		},

		// Data converters
		// Keys separate source (or catchall "*") and destination types with a single space
		converters: {

			// Convert anything to text
			"* text": String,

			// Text to html (true = no transformation)
			"text html": true,

			// Evaluate text as a json expression
			"text json": jQuery.parseJSON,

			// Parse text as xml
			"text xml": jQuery.parseXML
		},

		// For options that shouldn't be deep extended:
		// you can add your own custom options here if
		// and when you create one that shouldn't be
		// deep extended (see ajaxExtend)
		flatOptions: {
			url: true,
			context: true
		}
	},

	// Creates a full fledged settings object into target
	// with both ajaxSettings and settings fields.
	// If target is omitted, writes into ajaxSettings.
	ajaxSetup: function( target, settings ) {
		return settings ?

			// Building a settings object
			ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) :

			// Extending ajaxSettings
			ajaxExtend( jQuery.ajaxSettings, target );
	},

	ajaxPrefilter: addToPrefiltersOrTransports( prefilters ),
	ajaxTransport: addToPrefiltersOrTransports( transports ),

	// Main method.
	// Accepts ( url, options ) or ( options ) and returns the jqXHR object,
	// a promise-like wrapper around the request.
	ajax: function( url, options ) {

		// If url is an object, simulate pre-1.5 signature
		if ( typeof url === "object" ) {
			options = url;
			url = undefined;
		}

		// Force options to be an object
		options = options || {};

		var transport,
			// URL without anti-cache param
			cacheURL,
			// Response headers
			responseHeadersString,
			responseHeaders,
			// timeout handle
			timeoutTimer,
			// Cross-domain detection vars
			parts,
			// To know if global events are to be dispatched
			fireGlobals,
			// Loop variable
			i,
			// Create the final options object
			s = jQuery.ajaxSetup( {}, options ),
			// Callbacks context
			callbackContext = s.context || s,
			// Context for global events is callbackContext if it is a DOM node or jQuery collection
			globalEventContext = s.context && ( callbackContext.nodeType || callbackContext.jquery ) ?
				jQuery( callbackContext ) :
				jQuery.event,
			// Deferreds
			deferred = jQuery.Deferred(),
			completeDeferred = jQuery.Callbacks("once memory"),
			// Status-dependent callbacks
			statusCode = s.statusCode || {},
			// Headers (they are sent all at once)
			requestHeaders = {},
			requestHeadersNames = {},
			// The jqXHR state
			state = 0,
			// Default abort message
			strAbort = "canceled",
			// Fake xhr
			jqXHR = {
				readyState: 0,

				// Builds headers hashtable if needed
				getResponseHeader: function( key ) {
					var match;
					if ( state === 2 ) {
						if ( !responseHeaders ) {
							responseHeaders = {};
							while ( (match = rheaders.exec( responseHeadersString )) ) {
								responseHeaders[ match[1].toLowerCase() ] = match[ 2 ];
							}
						}
						match = responseHeaders[ key.toLowerCase() ];
					}
					return match == null ? null : match;
				},

				// Raw string
				getAllResponseHeaders: function() {
					return state === 2 ? responseHeadersString : null;
				},

				// Caches the header
				setRequestHeader: function( name, value ) {
					var lname = name.toLowerCase();
					if ( !state ) {
						name = requestHeadersNames[ lname ] = requestHeadersNames[ lname ] || name;
						requestHeaders[ name ] = value;
					}
					return this;
				},

				// Overrides response content-type header
				overrideMimeType: function( type ) {
					if ( !state ) {
						s.mimeType = type;
					}
					return this;
				},

				// Status-dependent callbacks
				statusCode: function( map ) {
					var code;
					if ( map ) {
						if ( state < 2 ) {
							for ( code in map ) {
								// Lazy-add the new callback in a way that preserves old ones
								statusCode[ code ] = [ statusCode[ code ], map[ code ] ];
							}
						} else {
							// Execute the appropriate callbacks
							jqXHR.always( map[ jqXHR.status ] );
						}
					}
					return this;
				},

				// Cancel the request
				abort: function( statusText ) {
					var finalText = statusText || strAbort;
					if ( transport ) {
						transport.abort( finalText );
					}
					done( 0, finalText );
					return this;
				}
			};

		// Attach deferreds
		deferred.promise( jqXHR ).complete = completeDeferred.add;
		jqXHR.success = jqXHR.done;
		jqXHR.error = jqXHR.fail;

		// Remove hash character (#7531: and string promotion)
		// Add protocol if not provided (prefilters might expect it)
		// Handle falsy url in the settings object (#10093: consistency with old signature)
		// We also use the url parameter if available
		s.url = ( ( url || s.url || ajaxLocation ) + "" ).replace( rhash, "" )
			.replace( rprotocol, ajaxLocParts[ 1 ] + "//" );

		// Alias method option to type as per ticket #12004
		s.type = options.method || options.type || s.method || s.type;

		// Extract dataTypes list
		s.dataTypes = jQuery.trim( s.dataType || "*" ).toLowerCase().match( core_rnotwhite ) || [""];

		// A cross-domain request is in order when we have a protocol:host:port mismatch
		if ( s.crossDomain == null ) {
			parts = rurl.exec( s.url.toLowerCase() );
			s.crossDomain = !!( parts &&
				( parts[ 1 ] !== ajaxLocParts[ 1 ] || parts[ 2 ] !== ajaxLocParts[ 2 ] ||
					( parts[ 3 ] || ( parts[ 1 ] === "http:" ? "80" : "443" ) ) !==
						( ajaxLocParts[ 3 ] || ( ajaxLocParts[ 1 ] === "http:" ? "80" : "443" ) ) )
			);
		}

		// Convert data if not already a string
		if ( s.data && s.processData && typeof s.data !== "string" ) {
			s.data = jQuery.param( s.data, s.traditional );
		}

		// Apply prefilters
		inspectPrefiltersOrTransports( prefilters, s, options, jqXHR );

		// If request was aborted inside a prefilter, stop there
		if ( state === 2 ) {
			return jqXHR;
		}

		// We can fire global events as of now if asked to
		fireGlobals = s.global;

		// Watch for a new set of requests
		if ( fireGlobals && jQuery.active++ === 0 ) {
			jQuery.event.trigger("ajaxStart");
		}

		// Uppercase the type
		s.type = s.type.toUpperCase();

		// Determine if request has content
		s.hasContent = !rnoContent.test( s.type );

		// Save the URL in case we're toying with the If-Modified-Since
		// and/or If-None-Match header later on
		cacheURL = s.url;

		// More options handling for requests with no content
		if ( !s.hasContent ) {

			// If data is available, append data to url
			if ( s.data ) {
				cacheURL = ( s.url += ( ajax_rquery.test( cacheURL ) ? "&" : "?" ) + s.data );
				// #9682: remove data so that it's not used in an eventual retry
				delete s.data;
			}

			// Add anti-cache in url if needed
			if ( s.cache === false ) {
				s.url = rts.test( cacheURL ) ?

					// If there is already a '_' parameter, set its value
					cacheURL.replace( rts, "$1_=" + ajax_nonce++ ) :

					// Otherwise add one to the end
					cacheURL + ( ajax_rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ajax_nonce++;
			}
		}

		// Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode.
		if ( s.ifModified ) {
			if ( jQuery.lastModified[ cacheURL ] ) {
				jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] );
			}
			if ( jQuery.etag[ cacheURL ] ) {
				jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] );
			}
		}

		// Set the correct header, if data is being sent
		if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) {
			jqXHR.setRequestHeader( "Content-Type", s.contentType );
		}

		// Set the Accepts header for the server, depending on the dataType
		jqXHR.setRequestHeader(
			"Accept",
			s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[0] ] ?
				s.accepts[ s.dataTypes[0] ] + ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) :
				s.accepts[ "*" ]
		);

		// Check for headers option
		for ( i in s.headers ) {
			jqXHR.setRequestHeader( i, s.headers[ i ] );
		}

		// Allow custom headers/mimetypes and early abort
		if ( s.beforeSend && ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || state === 2 ) ) {
			// Abort if not done already and return
			return jqXHR.abort();
		}

		// aborting is no longer a cancellation
		strAbort = "abort";

		// Install callbacks on deferreds
		for ( i in { success: 1, error: 1, complete: 1 } ) {
			jqXHR[ i ]( s[ i ] );
		}

		// Get transport
		transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR );

		// If no transport, we auto-abort
		if ( !transport ) {
			done( -1, "No Transport" );
		} else {
			jqXHR.readyState = 1;

			// Send global event
			if ( fireGlobals ) {
				globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] );
			}
			// Timeout
			if ( s.async && s.timeout > 0 ) {
				timeoutTimer = setTimeout(function() {
					jqXHR.abort("timeout");
				}, s.timeout );
			}

			try {
				state = 1;
				transport.send( requestHeaders, done );
			} catch ( e ) {
				// Propagate exception as error if not done
				if ( state < 2 ) {
					done( -1, e );
				// Simply rethrow otherwise
				} else {
					throw e;
				}
			}
		}

		// Callback for when everything is done
		function done( status, nativeStatusText, responses, headers ) {
			var isSuccess, success, error, response, modified,
				statusText = nativeStatusText;

			// Called once
			if ( state === 2 ) {
				return;
			}

			// State is "done" now
			state = 2;

			// Clear timeout if it exists
			if ( timeoutTimer ) {
				clearTimeout( timeoutTimer );
			}

			// Dereference transport for early garbage collection
			// (no matter how long the jqXHR object will be used)
			transport = undefined;

			// Cache response headers
			responseHeadersString = headers || "";

			// Set readyState
			jqXHR.readyState = status > 0 ? 4 : 0;

			// Determine if successful
			isSuccess = status >= 200 && status < 300 || status === 304;

			// Get response data
			if ( responses ) {
				response = ajaxHandleResponses( s, jqXHR, responses );
			}

			// Convert no matter what (that way responseXXX fields are always set)
			response = ajaxConvert( s, response, jqXHR, isSuccess );

			// If successful, handle type chaining
			if ( isSuccess ) {

				// Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode.
				if ( s.ifModified ) {
					modified = jqXHR.getResponseHeader("Last-Modified");
					if ( modified ) {
						jQuery.lastModified[ cacheURL ] = modified;
					}
					modified = jqXHR.getResponseHeader("etag");
					if ( modified ) {
						jQuery.etag[ cacheURL ] = modified;
					}
				}

				// if no content
				// A HEAD response never carries a body, so its conversion
				// outcome must not decide between success and failure.
				if ( status === 204 || s.type === "HEAD" ) {
					statusText = "nocontent";

				// if not modified
				} else if ( status === 304 ) {
					statusText = "notmodified";

				// If we have data, let's convert it
				} else {
					statusText = response.state;
					success = response.data;
					error = response.error;
					isSuccess = !error;
				}
			} else {
				// We extract error from statusText
				// then normalize statusText and status for non-aborts
				error = statusText;
				if ( status || !statusText ) {
					statusText = "error";
					if ( status < 0 ) {
						status = 0;
					}
				}
			}

			// Set data for the fake xhr object
			jqXHR.status = status;
			jqXHR.statusText = ( nativeStatusText || statusText ) + "";

			// Success/Error
			if ( isSuccess ) {
				deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] );
			} else {
				deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] );
			}

			// Status-dependent callbacks
			jqXHR.statusCode( statusCode );
			statusCode = undefined;

			if ( fireGlobals ) {
				globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError",
					[ jqXHR, s, isSuccess ? success : error ] );
			}

			// Complete
			completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] );

			if ( fireGlobals ) {
				globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] );
				// Handle the global AJAX counter
				if ( !( --jQuery.active ) ) {
					jQuery.event.trigger("ajaxStop");
				}
			}
		}

		return jqXHR;
	},

	// GET request expecting a JSON response
	getJSON: function( url, data, callback ) {
		return jQuery.get( url, data, callback, "json" );
	},

	// GET request expecting a script response
	getScript: function( url, callback ) {
		return jQuery.get( url, undefined, callback, "script" );
	}
});
// Define jQuery.get and jQuery.post as thin wrappers around jQuery.ajax
jQuery.each( [ "get", "post" ], function( i, method ) {
	jQuery[ method ] = function( url, data, callback, type ) {
		var settings = {
			url: url,
			type: method,
			dataType: type,
			data: data,
			success: callback
		};

		// shift arguments if data argument was omitted
		if ( jQuery.isFunction( data ) ) {
			settings.dataType = type || callback;
			settings.success = data;
			settings.data = undefined;
		}

		return jQuery.ajax( settings );
	};
});
we found a dataType // We add the dataType to the list if needed // and return the corresponding response if ( finalDataType ) { if ( finalDataType !== dataTypes[ 0 ] ) { dataTypes.unshift( finalDataType ); } return responses[ finalDataType ]; } } /* Chain conversions given the request and the original response * Also sets the responseXXX fields on the jqXHR instance */ function ajaxConvert( s, response, jqXHR, isSuccess ) { var conv2, current, conv, tmp, prev, converters = {}, // Work with a copy of dataTypes in case we need to modify it for conversion dataTypes = s.dataTypes.slice(); // Create converters map with lowercased keys if ( dataTypes[ 1 ] ) { for ( conv in s.converters ) { converters[ conv.toLowerCase() ] = s.converters[ conv ]; } } current = dataTypes.shift(); // Convert to each sequential dataType while ( current ) { if ( s.responseFields[ current ] ) { jqXHR[ s.responseFields[ current ] ] = response; } // Apply the dataFilter if provided if ( !prev && isSuccess && s.dataFilter ) { response = s.dataFilter( response, s.dataType ); } prev = current; current = dataTypes.shift(); if ( current ) { // There's only work to do if current dataType is non-auto if ( current === "*" ) { current = prev; // Convert response if prev dataType is non-auto and differs from current } else if ( prev !== "*" && prev !== current ) { // Seek a direct converter conv = converters[ prev + " " + current ] || converters[ "* " + current ]; // If none found, seek a pair if ( !conv ) { for ( conv2 in converters ) { // If conv2 outputs current tmp = conv2.split( " " ); if ( tmp[ 1 ] === current ) { // If prev can be converted to accepted input conv = converters[ prev + " " + tmp[ 0 ] ] || converters[ "* " + tmp[ 0 ] ]; if ( conv ) { // Condense equivalence converters if ( conv === true ) { conv = converters[ conv2 ]; // Otherwise, insert the intermediate dataType } else if ( converters[ conv2 ] !== true ) { current = tmp[ 0 ]; dataTypes.unshift( tmp[ 1 ] ); } break; } } } } // Apply 
converter (if not an equivalence) if ( conv !== true ) { // Unless errors are allowed to bubble, catch and return them if ( conv && s[ "throws" ] ) { response = conv( response ); } else { try { response = conv( response ); } catch ( e ) { return { state: "parsererror", error: conv ? e : "No conversion from " + prev + " to " + current }; } } } } } } return { state: "success", data: response }; } // Install script dataType jQuery.ajaxSetup({ accepts: { script: "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript" }, contents: { script: /(?:java|ecma)script/ }, converters: { "text script": function( text ) { jQuery.globalEval( text ); return text; } } }); // Handle cache's special case and crossDomain jQuery.ajaxPrefilter( "script", function( s ) { if ( s.cache === undefined ) { s.cache = false; } if ( s.crossDomain ) { s.type = "GET"; } }); // Bind script tag hack transport jQuery.ajaxTransport( "script", function( s ) { // This transport only deals with cross domain requests if ( s.crossDomain ) { var script, callback; return { send: function( _, complete ) { script = jQuery("<script>").prop({ async: true, charset: s.scriptCharset, src: s.url }).on( "load error", callback = function( evt ) { script.remove(); callback = null; if ( evt ) { complete( evt.type === "error" ? 404 : 200, evt.type ); } } ); document.head.appendChild( script[ 0 ] ); }, abort: function() { if ( callback ) { callback(); } } }; } }); var oldCallbacks = [], rjsonp = /(=)\?(?=&|$)|\?\?/; // Default jsonp settings jQuery.ajaxSetup({ jsonp: "callback", jsonpCallback: function() { var callback = oldCallbacks.pop() || ( jQuery.expando + "_" + ( ajax_nonce++ ) ); this[ callback ] = true; return callback; } }); // Detect, normalize options and install callbacks for jsonp requests jQuery.ajaxPrefilter( "json jsonp", function( s, originalSettings, jqXHR ) { var callbackName, overwritten, responseContainer, jsonProp = s.jsonp !== false && ( rjsonp.test( s.url ) ? 
"url" : typeof s.data === "string" && !( s.contentType || "" ).indexOf("application/x-www-form-urlencoded") && rjsonp.test( s.data ) && "data" ); // Handle iff the expected data type is "jsonp" or we have a parameter to set if ( jsonProp || s.dataTypes[ 0 ] === "jsonp" ) { // Get callback name, remembering preexisting value associated with it callbackName = s.jsonpCallback = jQuery.isFunction( s.jsonpCallback ) ? s.jsonpCallback() : s.jsonpCallback; // Insert callback into url or form data if ( jsonProp ) { s[ jsonProp ] = s[ jsonProp ].replace( rjsonp, "$1" + callbackName ); } else if ( s.jsonp !== false ) { s.url += ( ajax_rquery.test( s.url ) ? "&" : "?" ) + s.jsonp + "=" + callbackName; } // Use data converter to retrieve json after script execution s.converters["script json"] = function() { if ( !responseContainer ) { jQuery.error( callbackName + " was not called" ); } return responseContainer[ 0 ]; }; // force json dataType s.dataTypes[ 0 ] = "json"; // Install callback overwritten = window[ callbackName ]; window[ callbackName ] = function() { responseContainer = arguments; }; // Clean-up function (fires after converters) jqXHR.always(function() { // Restore preexisting value window[ callbackName ] = overwritten; // Save back as free if ( s[ callbackName ] ) { // make sure that re-using the options doesn't screw things around s.jsonpCallback = originalSettings.jsonpCallback; // save the callback name for future use oldCallbacks.push( callbackName ); } // Call if it was a function and we have a response if ( responseContainer && jQuery.isFunction( overwritten ) ) { overwritten( responseContainer[ 0 ] ); } responseContainer = overwritten = undefined; }); // Delegate to script return "script"; } }); jQuery.ajaxSettings.xhr = function() { try { return new XMLHttpRequest(); } catch( e ) {} }; var xhrSupported = jQuery.ajaxSettings.xhr(), xhrSuccessStatus = { // file protocol always yields status code 0, assume 200 0: 200, // Support: IE9 // #1450: sometimes IE 
// Default transport, backed by the object returned from options.xhr()
jQuery.ajaxTransport(function( options ) {

	// Abort handler of the request in flight; reset to null by whichever
	// handler (load, error or abort) runs first so the others become no-ops
	var callback;

	// Cross domain only allowed if supported through XMLHttpRequest
	if ( !( jQuery.support.cors || xhrSupported && !options.crossDomain ) ) {
		return;
	}

	return {
		send: function( headers, complete ) {
			var name, id, makeHandler,
				xhr = options.xhr();

			xhr.open( options.type, options.url, options.async, options.username, options.password );

			// Apply custom fields if provided
			if ( options.xhrFields ) {
				for ( name in options.xhrFields ) {
					xhr[ name ] = options.xhrFields[ name ];
				}
			}

			// Override mime type if needed
			if ( options.mimeType && xhr.overrideMimeType ) {
				xhr.overrideMimeType( options.mimeType );
			}

			// X-Requested-With header
			// For cross-domain requests, seeing as conditions for a preflight are
			// akin to a jigsaw puzzle, we simply never set it to be sure.
			// (it can always be set on a per-request basis or even using ajaxSetup)
			// For same-domain requests, won't change header if already provided.
			if ( !options.crossDomain && !headers["X-Requested-With"] ) {
				headers["X-Requested-With"] = "XMLHttpRequest";
			}

			// Set headers
			for ( name in headers ) {
				xhr.setRequestHeader( name, headers[ name ] );
			}

			// Builds the listener for one outcome: undefined (load), "error" or "abort"
			makeHandler = function( type ) {
				return function() {
					var status, statusText, body;

					// Another handler already settled this request
					if ( !callback ) {
						return;
					}

					delete xhrCallbacks[ id ];
					callback = xhr.onload = xhr.onerror = null;

					if ( type === "abort" ) {
						xhr.abort();
						return;
					}

					if ( type === "error" ) {
						complete(
							// file protocol always yields status 0, assume 404
							xhr.status || 404,
							xhr.statusText
						);
						return;
					}

					status = xhrSuccessStatus[ xhr.status ] || xhr.status;
					statusText = xhr.statusText;

					// Support: IE9
					// #11426: When requesting binary data, IE9 will throw an exception
					// on any attempt to access responseText
					if ( typeof xhr.responseText === "string" ) {
						body = { text: xhr.responseText };
					}

					complete( status, statusText, body, xhr.getAllResponseHeaders() );
				};
			};

			// Listen to events
			xhr.onload = makeHandler();
			xhr.onerror = makeHandler("error");

			// Create the abort callback and register it under a fresh id
			callback = xhrCallbacks[( id = xhrId++ )] = makeHandler("abort");

			// Do send the request
			// This may raise an exception which is actually
			// handled in jQuery.ajax (so no try/catch here)
			xhr.send( options.hasContent && options.data || null );
		},

		abort: function() {
			if ( callback ) {
				callback();
			}
		}
	};
});
"" : "px" ); // We need to compute starting value if ( unit !== "px" && start ) { // Iteratively approximate from a nonzero starting point // Prefer the current property, because this process will be trivial if it uses the same units // Fallback to end or a simple constant start = jQuery.css( tween.elem, prop, true ) || end || 1; do { // If previous iteration zeroed out, double until we get *something* // Use a string for doubling factor so we don't accidentally see scale as unchanged below scale = scale || ".5"; // Adjust and apply start = start / scale; jQuery.style( tween.elem, prop, start + unit ); // Update scale, tolerating zero or NaN from tween.cur() // And breaking the loop if scale is unchanged or perfect, or if we've just had enough } while ( scale !== (scale = tween.cur() / target) && scale !== 1 && --maxIterations ); } tween.unit = unit; tween.start = start; // If a +=/-= token was provided, we're doing a relative animation tween.end = parts[1] ? start + ( parts[1] + 1 ) * end : end; } return tween; }] }; // Animations created synchronously will run synchronously function createFxNow() { setTimeout(function() { fxNow = undefined; }); return ( fxNow = jQuery.now() ); } function createTweens( animation, props ) { jQuery.each( props, function( prop, value ) { var collection = ( tweeners[ prop ] || [] ).concat( tweeners[ "*" ] ), index = 0, length = collection.length; for ( ; index < length; index++ ) { if ( collection[ index ].call( animation, prop, value ) ) { // we're done with this property return; } } }); } function Animation( elem, properties, options ) { var result, stopped, index = 0, length = animationPrefilters.length, deferred = jQuery.Deferred().always( function() { // don't match elem in the :animated selector delete tick.elem; }), tick = function() { if ( stopped ) { return false; } var currentTime = fxNow || createFxNow(), remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), // archaic crash bug won't allow us 
to use 1 - ( 0.5 || 0 ) (#12497) temp = remaining / animation.duration || 0, percent = 1 - temp, index = 0, length = animation.tweens.length; for ( ; index < length ; index++ ) { animation.tweens[ index ].run( percent ); } deferred.notifyWith( elem, [ animation, percent, remaining ]); if ( percent < 1 && length ) { return remaining; } else { deferred.resolveWith( elem, [ animation ] ); return false; } }, animation = deferred.promise({ elem: elem, props: jQuery.extend( {}, properties ), opts: jQuery.extend( true, { specialEasing: {} }, options ), originalProperties: properties, originalOptions: options, startTime: fxNow || createFxNow(), duration: options.duration, tweens: [], createTween: function( prop, end ) { var tween = jQuery.Tween( elem, animation.opts, prop, end, animation.opts.specialEasing[ prop ] || animation.opts.easing ); animation.tweens.push( tween ); return tween; }, stop: function( gotoEnd ) { var index = 0, // if we are going to the end, we want to run all the tweens // otherwise we skip this part length = gotoEnd ? 
/* Normalizes an animation property map in place:
 * - property names are camelCased via jQuery.camelCase
 * - [ value, easing ] pairs are unpacked, the easing going to specialEasing
 * - properties whose cssHook defines "expand" are replaced by the expanded
 *   properties, without overwriting keys that are already present
 * Both props and specialEasing are mutated; nothing is returned.
 */
function propFilter( props, specialEasing ) {
	var rawKey, camelKey, ease, val, hook, expanded, part;

	// for-in is deliberate: keys are added to and removed from props while looping
	for ( rawKey in props ) {
		camelKey = jQuery.camelCase( rawKey );
		ease = specialEasing[ camelKey ];
		val = props[ rawKey ];

		// [ value, easing ] form
		if ( jQuery.isArray( val ) ) {
			ease = val[ 1 ];
			val = val[ 0 ];
			props[ rawKey ] = val;
		}

		// Move the value under its camelCased name
		if ( rawKey !== camelKey ) {
			props[ camelKey ] = val;
			delete props[ rawKey ];
		}

		hook = jQuery.cssHooks[ camelKey ];
		if ( !hook || !( "expand" in hook ) ) {
			specialEasing[ camelKey ] = ease;
			continue;
		}

		// Shorthand property: swap it for its expanded parts
		expanded = hook.expand( val );
		delete props[ camelKey ];

		// not quite $.extend, this wont overwrite keys already present.
		for ( part in expanded ) {
			if ( part in props ) {
				continue;
			}
			props[ part ] = expanded[ part ];
			specialEasing[ part ] = ease;
		}
	}
}
having width/height animated if ( jQuery.css( elem, "display" ) === "inline" && jQuery.css( elem, "float" ) === "none" ) { style.display = "inline-block"; } } if ( opts.overflow ) { style.overflow = "hidden"; anim.always(function() { style.overflow = opts.overflow[ 0 ]; style.overflowX = opts.overflow[ 1 ]; style.overflowY = opts.overflow[ 2 ]; }); } // show/hide pass dataShow = data_priv.get( elem, "fxshow" ); for ( index in props ) { value = props[ index ]; if ( rfxtypes.exec( value ) ) { delete props[ index ]; toggle = toggle || value === "toggle"; if ( value === ( hidden ? "hide" : "show" ) ) { // If there is dataShow left over from a stopped hide or show and we are going to proceed with show, we should pretend to be hidden if( value === "show" && dataShow !== undefined && dataShow[ index ] !== undefined ) { hidden = true; } else { continue; } } handled.push( index ); } } length = handled.length; if ( length ) { dataShow = data_priv.get( elem, "fxshow" ) || data_priv.access( elem, "fxshow", {} ); if ( "hidden" in dataShow ) { hidden = dataShow.hidden; } // store state if its toggle - enables .stop().toggle() to "reverse" if ( toggle ) { dataShow.hidden = !hidden; } if ( hidden ) { jQuery( elem ).show(); } else { anim.done(function() { jQuery( elem ).hide(); }); } anim.done(function() { var prop; data_priv.remove( elem, "fxshow" ); for ( prop in orig ) { jQuery.style( elem, prop, orig[ prop ] ); } }); for ( index = 0 ; index < length ; index++ ) { prop = handled[ index ]; tween = anim.createTween( prop, hidden ? dataShow[ prop ] : 0 ); orig[ prop ] = dataShow[ prop ] || jQuery.style( elem, prop ); if ( !( prop in dataShow ) ) { dataShow[ prop ] = tween.start; if ( hidden ) { tween.end = tween.start; tween.start = prop === "width" || prop === "height" ? 
/* A Tween animates a single property of a single element.
 * Tween( ... ) can be called without `new`; it delegates to
 * Tween.prototype.init, whose prototype is Tween.prototype.
 */
function Tween( elem, options, prop, end, easing ) {
	var Init = Tween.prototype.init;
	return new Init( elem, options, prop, end, easing );
}
jQuery.Tween = Tween;

Tween.prototype = {
	constructor: Tween,

	// Records the target and reads the current value as the starting point
	init: function( elem, options, prop, end, easing, unit ) {
		var initial;

		this.elem = elem;
		this.prop = prop;
		this.easing = easing || "swing";
		this.options = options;

		// cur() goes through the property hooks, so it runs after the
		// fields above are set
		initial = this.cur();
		this.now = initial;
		this.start = initial;

		this.end = end;

		// Default unit: none for properties listed in jQuery.cssNumber, "px" otherwise
		if ( unit ) {
			this.unit = unit;
		} else {
			this.unit = jQuery.cssNumber[ prop ] ? "" : "px";
		}
	},

	// Current value, from the property's own getter hook or the default one
	cur: function() {
		var hook = Tween.propHooks[ this.prop ];

		if ( hook && hook.get ) {
			return hook.get( this );
		}
		return Tween.propHooks._default.get( this );
	},

	// Moves the tween to `percent` (0..1) of its course and applies the value
	run: function( percent ) {
		var progress,
			hook = Tween.propHooks[ this.prop ];

		// Easing only applies when the animation has a duration
		if ( this.options.duration ) {
			progress = jQuery.easing[ this.easing ](
				percent, this.options.duration * percent, 0, 1, this.options.duration
			);
		} else {
			progress = percent;
		}
		this.pos = progress;

		this.now = ( this.end - this.start ) * progress + this.start;

		// Per-step user callback, called with the element as context
		if ( this.options.step ) {
			this.options.step.call( this.elem, this.now, this );
		}

		// Write the value through the property's setter hook or the default one
		if ( hook && hook.set ) {
			hook.set( this );
		} else {
			Tween.propHooks._default.set( this );
		}
		return this;
	}
};

Tween.prototype.init.prototype = Tween.prototype;
// Wrap the plain toggle/show/hide methods so that passing a speed animates
jQuery.each([ "toggle", "show", "hide" ], function( i, name ) {

	// The non-animated implementation being wrapped
	var plain = jQuery.fn[ name ];

	jQuery.fn[ name ] = function( speed, easing, callback ) {

		// No speed, or a boolean first argument: defer to the plain method
		// with the arguments untouched
		var animated = speed != null && typeof speed !== "boolean";

		if ( !animated ) {
			return plain.apply( this, arguments );
		}
		return this.animate( genFx( name, true ), speed, easing, callback );
	};
});
this.each( doAnimation ) : this.queue( optall.queue, doAnimation ); }, stop: function( type, clearQueue, gotoEnd ) { var stopQueue = function( hooks ) { var stop = hooks.stop; delete hooks.stop; stop( gotoEnd ); }; if ( typeof type !== "string" ) { gotoEnd = clearQueue; clearQueue = type; type = undefined; } if ( clearQueue && type !== false ) { this.queue( type || "fx", [] ); } return this.each(function() { var dequeue = true, index = type != null && type + "queueHooks", timers = jQuery.timers, data = data_priv.get( this ); if ( index ) { if ( data[ index ] && data[ index ].stop ) { stopQueue( data[ index ] ); } } else { for ( index in data ) { if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { stopQueue( data[ index ] ); } } } for ( index = timers.length; index--; ) { if ( timers[ index ].elem === this && (type == null || timers[ index ].queue === type) ) { timers[ index ].anim.stop( gotoEnd ); dequeue = false; timers.splice( index, 1 ); } } // start the next in the queue if the last step wasn't forced // timers currently will call their complete callbacks, which will dequeue // but only if they were gotoEnd if ( dequeue || !gotoEnd ) { jQuery.dequeue( this, type ); } }); }, finish: function( type ) { if ( type !== false ) { type = type || "fx"; } return this.each(function() { var index, data = data_priv.get( this ), queue = data[ type + "queue" ], hooks = data[ type + "queueHooks" ], timers = jQuery.timers, length = queue ? 
// Generate parameters to create a standard animation:
// every generated property is mapped to `type` ("show", "hide" or "toggle").
// Always covers height plus the margins/paddings taken from cssExpand;
// with includeWidth, all four sides plus width and opacity are covered.
function genFx( type, includeWidth ) {
	var side, idx,

		// step 1 walks all cssExpand values,
		// step 2 skips over Left and Right
		step = includeWidth ? 1 : 2,
		result = { height: type };

	for ( idx = 0; idx < 4; idx += step ) {
		side = cssExpand[ idx ];
		result[ "padding" + side ] = type;
		result[ "margin" + side ] = type;
	}

	if ( includeWidth ) {
		result.width = type;
		result.opacity = type;
	}

	return result;
}
// Built-in easing functions: map progress p to an eased position
jQuery.easing = {
	linear: function( p ) {
		return p;
	},
	swing: function( p ) {
		return 0.5 - Math.cos( p*Math.PI ) / 2;
	}
};

// Tick functions of the animations currently running
jQuery.timers = [];
jQuery.fx = Tween.prototype.init;

// Runs every registered timer once; a timer returning a falsy value is done
jQuery.fx.tick = function() {
	var current,
		idx = 0,
		queue = jQuery.timers;

	// All timers of this tick share one timestamp
	fxNow = jQuery.now();

	while ( idx < queue.length ) {
		current = queue[ idx ];

		// Checks the timer has not already been removed
		if ( !current() && queue[ idx ] === current ) {

			// Removal shifts the next timer into this slot: do not advance
			queue.splice( idx, 1 );
		} else {
			idx++;
		}
	}

	if ( !queue.length ) {
		jQuery.fx.stop();
	}
	fxNow = undefined;
};

// Runs the timer once right away; registers it only if it is still active
jQuery.fx.timer = function( timer ) {
	if ( !timer() ) {
		return;
	}
	if ( jQuery.timers.push( timer ) ) {
		jQuery.fx.start();
	}
};

// Milliseconds between ticks
jQuery.fx.interval = 13;

// Starts the shared interval unless it is already running
jQuery.fx.start = function() {
	if ( timerId ) {
		return;
	}
	timerId = setInterval( jQuery.fx.tick, jQuery.fx.interval );
};

jQuery.fx.stop = function() {
	clearInterval( timerId );
	timerId = null;
};

jQuery.fx.speeds = {
	slow: 600,
	fast: 200,
	// Default speed
	_default: 400
};

// Back Compat <1.8 extension point
jQuery.fx.step = {};

// :animated matches elements that own at least one running timer
if ( jQuery.expr && jQuery.expr.filters ) {
	jQuery.expr.filters.animated = function( elem ) {
		var owned = jQuery.grep( jQuery.timers, function( fn ) {
			return elem === fn.elem;
		});
		return owned.length;
	};
}
jQuery.offset = {

	// Moves elem so that its document offset matches options.top / options.left.
	// options may be a function, called with elem as context and ( i, currentOffset ),
	// returning the coordinates. If the coordinates object has a "using" key, that
	// callback receives the computed css props instead of them being applied.
	setOffset: function( elem, options, i ) {
		var nowOffset, cssTop, cssLeft, needsComputed, placed, baseTop, baseLeft, coords,
			positioning = jQuery.css( elem, "position" ),
			wrapped = jQuery( elem ),
			css = {};

		// Set position first, in-case top/left are set even on static elem
		if ( positioning === "static" ) {
			elem.style.position = "relative";
		}

		nowOffset = wrapped.offset();
		cssTop = jQuery.css( elem, "top" );
		cssLeft = jQuery.css( elem, "left" );

		// Need to be able to calculate position if either top or left is auto
		// and position is either absolute or fixed
		needsComputed = ( positioning === "absolute" || positioning === "fixed" ) &&
			( cssTop + cssLeft ).indexOf("auto") > -1;

		if ( needsComputed ) {
			placed = wrapped.position();
			baseTop = placed.top;
			baseLeft = placed.left;
		} else {
			baseTop = parseFloat( cssTop ) || 0;
			baseLeft = parseFloat( cssLeft ) || 0;
		}

		// Resolve function-style options against the current offset
		coords = jQuery.isFunction( options ) ?
			options.call( elem, i, nowOffset ) :
			options;

		// Shift the current css position by the requested offset delta
		if ( coords.top != null ) {
			css.top = ( coords.top - nowOffset.top ) + baseTop;
		}
		if ( coords.left != null ) {
			css.left = ( coords.left - nowOffset.left ) + baseLeft;
		}

		if ( "using" in coords ) {
			coords.using.call( elem, css );
		} else {
			wrapped.css( css );
		}
	}
};
// Create scrollLeft and scrollTop methods
// method: the jQuery.fn method / element property name
// prop:   the matching window property ("pageXOffset" or "pageYOffset")
jQuery.each( {scrollLeft: "pageXOffset", scrollTop: "pageYOffset"}, function( method, prop ) {
	var top = "pageYOffset" === prop;

	// Getter when called without a value, setter otherwise (via jQuery.access)
	jQuery.fn[ method ] = function( val ) {
		return jQuery.access( this, function( elem, method, val ) {

			// Window for window/document targets, falsy for plain elements
			var win = getWindow( elem );

			if ( val === undefined ) {
				return win ? win[ prop ] : elem[ method ];
			}

			if ( win ) {

				// Keep the other axis where it is. It must be read from the
				// window being scrolled (win), not the global window: for a
				// window or document of another frame the global offsets belong
				// to a different viewport and would reset the untouched axis.
				win.scrollTo(
					!top ? val : win.pageXOffset,
					top ? val : win.pageYOffset
				);

			} else {
				elem[ method ] = val;
			}
		}, method, val, arguments.length, null );
	};
});
// Create innerHeight, innerWidth, height, width, outerHeight and outerWidth methods
jQuery.each( { Height: "height", Width: "width" }, function( name, type ) {

	// Maps the box part included by default to the method name it produces
	var variants = { padding: "inner" + name, content: type, "": "outer" + name };

	jQuery.each( variants, function( defaultExtra, funcName ) {

		// margin is only for outerHeight, outerWidth
		jQuery.fn[ funcName ] = function( margin, value ) {
			var chainable = arguments.length && ( defaultExtra || typeof margin !== "boolean" ),
				extra = defaultExtra;

			// outer*: include margins only when explicitly requested with `true`
			if ( !extra ) {
				extra = ( margin === true || value === true ) ? "margin" : "border";
			}

			return jQuery.access( this, function( elem, type, value ) {
				var root, body;

				if ( jQuery.isWindow( elem ) ) {
					// As of 5/8/2012 this will yield incorrect results for Mobile Safari, but there
					// isn't a whole lot we can do. See pull request at this URL for discussion:
					// https://github.com/jquery/jquery/pull/764
					return elem.document.documentElement[ "client" + name ];
				}

				// Get document width or height
				if ( elem.nodeType === 9 ) {
					root = elem.documentElement;
					body = elem.body;

					// Either scroll[Width/Height] or offset[Width/Height] or client[Width/Height],
					// whichever is greatest
					return Math.max(
						body[ "scroll" + name ], root[ "scroll" + name ],
						body[ "offset" + name ], root[ "offset" + name ],
						root[ "client" + name ]
					);
				}

				if ( value === undefined ) {

					// Get width or height on the element, requesting but not forcing parseFloat
					return jQuery.css( elem, type, extra );
				}

				// Set width or height on the element
				return jQuery.style( elem, type, value, extra );
			}, type, chainable ? margin : undefined, chainable, null );
		};
	});
});
if ( typeof define === "function" && define.amd ) { define( "jquery", [], function () { return jQuery; } ); } } // If there is a window object, that at least has a document property, // define jQuery and $ identifiers if ( typeof window === "object" && typeof window.document === "object" ) { window.jQuery = window.$ = jQuery; } })( window ); ================================================ FILE: extensions/chrome/js/jquery/package.json ================================================ { "name": "components-jquery", "version": "2.0.0", "description": "jQuery component", "keywords": ["jquery"], "main": "./jquery.js" } ================================================ FILE: extensions/chrome/manifest.json ================================================ { "name": "DocsGPT - Documentation AI butler", "version": "0.0.1", "manifest_version": 3, "description": "AI assistant for developers, that helps you answer your questions about the documentation you are reading.", "icons": { "16": "icons/icon16.png", "48": "icons/icon48.png", "128": "icons/icon128.png" }, "default_locale": "en", "background": { "service_worker": "src/bg/service-worker.js" }, "action": { "default_title": "DocsGPT - Documentation AI butler", "default_popup": "popup.html" }, "permissions": ["activeTab", "storage"], "host_permissions": [ "*://*/*" ], "content_scripts": [{ "js": ["popup.js"], "matches": ["https://github.com/*"] }] } ================================================ FILE: extensions/chrome/package.json ================================================ { "name": "docsgpt-chrome-extension", "version": "0.0.1", "description": "DocsGPT - Documentation AI butler", "main": "popup.js", "author": "", "license": "MIT", "scripts": { "dev": "npx tailwindcss -i ./styles.css -o ./dist/output.css --watch" }, "keywords": [ "DocsGPT", "Documentation", "Chrome", "extension" ], "devDependencies": { "tailwindcss": "^3.2.4" } } ================================================ FILE: extensions/chrome/popup.html 
================================================ <!DOCTYPE html> <html> <head> <title>Chat Extension

DocsGPT

About

Hello, ask me anything about this library. I'm here to help.

How do I create an API key for API Gateway?

Import the boto3 library and create a client for the API Gateway service:

client = boto3.client('apigateway')

Create an API key:

response = client.create_api_key(
name='API_KEY_NAME',
description='API key description',
enabled=True)
api_key = response['value']
================================================ FILE: extensions/chrome/popup.js ================================================ document.getElementById("message-form").addEventListener("submit", function(event) { event.preventDefault(); var message = document.getElementById("message-input").value; chrome.runtime.sendMessage({msg: "sendMessage", message: message}, function(response) { console.log(response.response); msg_html = '

' msg_html += message msg_html += '

' document.getElementById("messages").innerHTML += msg_html; let chatWindow = document.getElementById("chat-container"); chatWindow.scrollTop = chatWindow.scrollHeight; }); document.getElementById("message-input").value = ""; var conversation_state = localStorage.getItem("conversation_state"); // check if conversation state is null if (conversation_state == null) { conversation_state = 0; localStorage.setItem("conversation_state", conversation_state); } // send post request to server http://127.0.0.1:5000/ with message in json body fetch('http://127.0.0.1:7091/api/answer', { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify({question: message, history: null}), }) .then(response => response.json()) .then(data => { console.log('Success:', data); msg_html = '
' msg_html += data.answer msg_html += '
' document.getElementById("messages").innerHTML += msg_html; let chatWindow = document.getElementById("chat-container"); chatWindow.scrollTop = chatWindow.scrollHeight; }) }); ================================================ FILE: extensions/chrome/src/bg/service-worker.js ================================================ // This is the service worker script, which executes in its own context // when the extension is installed or refreshed (or when you access its console). // It would correspond to the background script in chrome extensions v2. console.log("This prints to the console of the service worker (background script)"); chrome.runtime.onMessage.addListener( function(request, sender, sendResponse) { if (request.msg === "sendMessage") { sendResponse({response: "Message received"}); } } ); ================================================ FILE: extensions/chrome/styles.css ================================================ @tailwind base; @tailwind components; @tailwind utilities; #chat-container { width: 500px; height: 450px; background-color: white; padding: 10px; overflow: auto; } .bg-gray-200 { background-color: #edf2f7; } .bg-gray-900 { background-color: #1a202c; } .rounded-lg { border-radius: 0.5rem; } .shadow { box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24); } .text-gray-700 { color: #4a5568; } .text-sm { font-size: 0.875rem; } .p-4 { padding: 1.5rem; } ================================================ FILE: extensions/chrome/tailwind.config.js ================================================ module.exports = { content: ["./src/**/*.{html,js}", "./*.{html,js,css}"], theme: { extend: {}, }, plugins: [], } ================================================ FILE: extensions/discord/__init__.py ================================================ ================================================ FILE: extensions/discord/bot.py ================================================ import os import re import logging import aiohttp import discord from 
discord.ext import commands import dotenv dotenv.load_dotenv() # Enable logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Bot configuration TOKEN = os.getenv("DISCORD_TOKEN") PREFIX = '!' # Command prefix BASE_API_URL = os.getenv("API_BASE", "https://gptcloud.arc53.com") API_URL = BASE_API_URL + "/api/answer" API_KEY = os.getenv("API_KEY") intents = discord.Intents.default() intents.message_content = True bot = commands.Bot(command_prefix=PREFIX, intents=intents) # Store conversation history per user conversation_histories = {} def chunk_string(text, max_length=2000): """Splits a string into chunks of a specified maximum length.""" # Create list to store the split strings chunks = [] # Loop through the text, create substrings with max_length while len(text) > max_length: # Find last space within the limit idx = text.rfind(' ', 0, max_length) # Ensure we don't have an empty part if idx == -1: # If no spaces, just take chunk chunks.append(text[:max_length]) text = text[max_length:] else: # Push whatever we've got up to the last space chunks.append(text[:idx]) text = text[idx+1:] # Catches the remaining part chunks.append(text) return chunks def escape_markdown(text): """Escapes Discord markdown characters.""" escape_chars = r'\*_$$$$()~>#+-=|{}.!' 
return re.sub(f'([{re.escape(escape_chars)}])', r'\\\1', text) def split_string(input_str): """Splits the input string to detect bot mentions.""" pattern = r'^<@!?{0}>\s*'.format(bot.user.id) match = re.match(pattern, input_str) if match: content = input_str[match.end():].strip() return str(bot.user.id), content return None, input_str @bot.event async def on_ready(): print(f'{bot.user.name} has connected to Discord!') async def generate_answer(question, messages, conversation_id): """Generates an answer using the external API.""" payload = { "question": question, "api_key": API_KEY, "history": messages, "conversation_id": conversation_id } headers = { "Content-Type": "application/json; charset=utf-8" } timeout = aiohttp.ClientTimeout(total=60) async with aiohttp.ClientSession(timeout=timeout) as session: async with session.post(API_URL, json=payload, headers=headers) as resp: if resp.status == 200: data = await resp.json() conversation_id = data.get("conversation_id") answer = data.get("answer", "Sorry, I couldn't find an answer.") return {"answer": answer, "conversation_id": conversation_id} else: return {"answer": "Sorry, I couldn't find an answer.", "conversation_id": None} @bot.command(name="start") async def start(ctx): """Handles the /start command.""" await ctx.send(f"Hi {ctx.author.mention}! How can I assist you today?") @bot.command(name="custom_help") async def custom_help_command(ctx): """Handles the /custom_help command.""" help_text = ( "Here are the available commands:\n" "`!start` - Begin a new conversation with the bot\n" "`!help` - Display this help message\n\n" "You can also mention me or send a direct message to ask a question!" 
) await ctx.send(help_text) @bot.event async def on_message(message): if message.author == bot.user: return # Process commands first await bot.process_commands(message) # Check if the message is in a DM channel if isinstance(message.channel, discord.DMChannel): content = message.content.strip() else: # In guild channels, check if the message mentions the bot at the start content = message.content.strip() prefix, content = split_string(content) if prefix is None: return part_prefix = str(bot.user.id) if part_prefix != prefix: return # Bot not mentioned at the start, so do not process # Now process the message user_id = message.author.id if user_id not in conversation_histories: conversation_histories[user_id] = { "history": [], "conversation_id": None } conversation = conversation_histories[user_id] conversation["history"].append({"prompt": content}) # Generate the answer response_doc = await generate_answer( content, conversation["history"], conversation["conversation_id"] ) answer = response_doc["answer"] conversation_id = response_doc["conversation_id"] answer_chunks = chunk_string(answer) for chunk in answer_chunks: await message.channel.send(chunk) conversation["history"][-1]["response"] = answer conversation["conversation_id"] = conversation_id # Keep conversation history to last 10 exchanges conversation["history"] = conversation["history"][-10:] bot.run(TOKEN) ================================================ FILE: extensions/react-widget/.gitignore ================================================ node_modules dist .parcel-cache ================================================ FILE: extensions/react-widget/.parcelrc ================================================ { "extends": "@parcel/config-default", "resolvers": ["@parcel/resolver-glob","..."], "transformers": { "*.svg": ["...", "@parcel/transformer-svg-react", "@parcel/transformer-typescript-tsc"] }, "validators": { "*.{ts,tsx}": ["@parcel/validator-typescript"] } } 
================================================
FILE: extensions/react-widget/README.md
================================================
# DocsGPT React widget

This widget allows you to embed a DocsGPT assistant in your React app.

## Installation

```bash
# React library (used in the React examples below)
npm install docsgpt-react

# Browser bundle (exposes renderDocsGPTWidget / renderSearchBar)
npm install docsgpt
```

## Usage

### React

```javascript
import { DocsGPTWidget } from "docsgpt-react";

const App = () => {
  return <DocsGPTWidget />;
};
```

To link the widget to your API and your documents, you can pass parameters to the component.

```javascript
import { DocsGPTWidget } from "docsgpt-react";

const App = () => {
  return (
    <DocsGPTWidget
      apiHost="https://gptcloud.arc53.com"
      apiKey="your-api-key"
    />
  );
};
```

### HTML

```html
DocsGPT Widget
```

To link the widget to your API and your documents, you can pass parameters to **renderDocsGPTWidget('div id', { parameters })**.

```html
DocsGPT Widget
```

# SearchBar

The `SearchBar` component is an interactive search bar that returns results based on **vector similarity search**. It can also open the AI chatbot, so users can ask questions directly.

---

### Importing the Component

```tsx
import { SearchBar } from "docsgpt-react";
```

---

### Usage Example

```tsx
<SearchBar
  apiKey="your-api-key"
  apiHost="https://gptcloud.arc53.com"
  theme="dark"
  placeholder="Search or Ask AI..."
  width="256px"
/>
```

---

## HTML embedding for SearchBar

```html
SearchBar Embedding
```

### Props

| **Prop** | **Type** | **Default Value** | **Description** |
|-----------------|-----------|-------------------------------------|--------------------------------------------------------------------------------------------------|
| **`apiKey`** | `string` | `"74039c6d-bff7-44ce-ae55-2973cbf13837"` | Your API key generated from the app. Used for authenticating requests. |
| **`apiHost`** | `string` | `"https://gptcloud.arc53.com"` | The base URL of the server hosting the vector similarity search and chatbot services. |
| **`theme`** | `"dark" \| "light"` | `"dark"` | The theme of the search bar. Accepts `"dark"` or `"light"`. |
| **`placeholder`** | `string` | `"Search or Ask AI..."` | Placeholder text displayed in the search input field. |
| **`width`** | `string` | `"256px"` | Width of the search bar. Accepts any valid CSS width value (e.g., `"300px"`, `"100%"`, `"20rem"`). |

Feel free to reach out if you need help customizing or extending the `SearchBar`!

## Our GitHub

[DocsGPT](https://github.com/arc53/DocsGPT)

You can find the source code in the `extensions/react-widget` folder.
================================================ FILE: extensions/react-widget/custom.d.ts ================================================ declare module "*.svg" { import * as React from "react"; const ReactComponent: React.FunctionComponent< React.SVGProps & { title?: string } >; export default ReactComponent; } ================================================ FILE: extensions/react-widget/package.json ================================================ { "name": "docsgpt", "version": "0.5.1", "private": false, "description": "DocsGPT 🦖 is an innovative open-source tool designed to simplify the retrieval of information from project documentation using advanced GPT models 🤖.", "source": "./src/index.html", "main": "dist/main.js", "module": "dist/module.js", "types": "dist/types.d.ts", "files": [ "dist", "package.json" ], "targets": { "modern": { "engines": { "browsers": "Chrome 80" } }, "legacy": { "engines": { "browsers": "> 0.5%, last 2 versions, not dead" } } }, "@parcel/resolver-default": { "packageExports": true }, "resolution": { "styled-components": "^5" }, "scripts": { "build": "parcel build src/browser.tsx --public-url ./", "build:react": "parcel build src/index.ts", "serve": "parcel serve -p 3000", "dev": "parcel -p 3000", "test": "jest", "lint": "eslint", "check": "tsc --noEmit", "ci": "yarn build && yarn test && yarn lint && yarn check" }, "dependencies": { "@babel/plugin-transform-flow-strip-types": "^7.23.3", "@bpmn-io/snarkdown": "^2.2.0", "@parcel/resolver-glob": "^2.16.4", "@parcel/transformer-svg-react": "^2.16.4", "@parcel/transformer-typescript-tsc": "^2.16.4", "@parcel/validator-typescript": "^2.16.4", "@radix-ui/react-icons": "^1.3.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.0", "dompurify": "^3.1.5", "flow-bin": "^0.305.0", "markdown-it": "^14.1.0", "react": "^18.2.0", "react-dom": "^18.2.0", "styled-components": "^6.1.8" }, "devDependencies": { "@babel/core": "^7.24.0", "@babel/preset-env": "^7.24.0", "@babel/preset-react": 
"^7.23.3", "@parcel/packager-ts": "^2.16.4", "@parcel/transformer-typescript-types": "^2.16.4", "@types/dompurify": "^3.0.5", "@types/markdown-it": "^14.1.2", "@types/react": "^18.3.3", "@types/react-dom": "^18.3.0", "babel-loader": "^10.1.1", "parcel": "^2.16.4", "process": "^0.11.10", "svgo": "^3.3.3", "typescript": "^5.3.3" }, "publishConfig": { "access": "public" }, "repository": { "type": "git", "url": "git+https://github.com/arc53/DocsGPT.git" }, "keywords": [ "docsgpt", "chatbot", "assistant", "ai", "chatdocs", "widget" ], "author": "Arc53", "license": "Apache-2.0", "bugs": { "url": "https://github.com/arc53/DocsGPT/issues" }, "homepage": "https://github.com/arc53/DocsGPT#readme" } ================================================ FILE: extensions/react-widget/publish.sh ================================================ #!/bin/bash set -e # Create backup of original files cp package.json package_original.json cp package-lock.json package-lock_original.json # Store the latest version after publishing LATEST_VERSION="" # Check if a specific version was provided if [ "$1" ]; then VERSION_UPDATE_TYPE="$1" echo "Using custom version update: $VERSION_UPDATE_TYPE" else VERSION_UPDATE_TYPE="patch" echo "No version specified, defaulting to patch update" fi publish_package() { PACKAGE_NAME=$1 BUILD_COMMAND=$2 IS_REACT=$3 echo "Preparing to publish ${PACKAGE_NAME}..." # Restore original package.json state before each publish cp package_original.json package.json cp package-lock_original.json package-lock.json # Update package name in package.json jq --arg name "$PACKAGE_NAME" '.name=$name' package.json > temp.json && mv temp.json package.json # Handle targets based on package type if [ "$IS_REACT" = "true" ]; then echo "Removing targets for React library build..." jq 'del(.targets)' package.json > temp.json && mv temp.json package.json fi # Clean dist directory if [ -d "dist" ]; then echo "Cleaning dist directory..." 
rm -rf dist fi # Update version based on input parameter or default to patch if [[ "$VERSION_UPDATE_TYPE" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then # If full version number is provided (e.g., 0.5.0) LATEST_VERSION=$(npm version "$VERSION_UPDATE_TYPE" --no-git-tag-version) else # If update type is provided (patch, minor, major) LATEST_VERSION=$(npm version "$VERSION_UPDATE_TYPE" --no-git-tag-version) fi echo "New version: ${LATEST_VERSION}" # Build package npm run "$BUILD_COMMAND" # Publish package npm publish echo "Successfully published ${PACKAGE_NAME} version ${LATEST_VERSION}" } # First publish docsgpt (HTML bundle) publish_package "docsgpt" "build" "false" # Then publish docsgpt-react (React library) publish_package "docsgpt-react" "build:react" "true" # Restore original state but keep the updated version cp package_original.json package.json cp package-lock_original.json package-lock.json # Update the version in the final package.json jq --arg version "${LATEST_VERSION#v}" '.version=$version' package.json > temp.json && mv temp.json package.json # Run npm install to update package-lock-only npm install --package-lock-only # Cleanup backup files rm -f package_original.json rm -f package-lock_original.json rm -f temp.json echo "---Process completed---" echo "Final version in package.json: $(jq -r '.version' package.json)" echo "Final version in package-lock.json: $(jq -r '.version' package-lock.json)" ================================================ FILE: extensions/react-widget/src/App.tsx ================================================ import React from "react" import {DocsGPTWidget} from "./components/DocsGPTWidget" import {SearchBar} from "./components/SearchBar" export const App = () => { return (
) } ================================================ FILE: extensions/react-widget/src/browser.tsx ================================================ //exports browser ready methods import { createRoot } from "react-dom/client"; import { DocsGPTWidget } from './components/DocsGPTWidget'; import { SearchBar } from './components/SearchBar'; import React from "react"; if (typeof window !== 'undefined') { const renderWidget = (elementId: string, props = {}) => { const root = createRoot(document.getElementById(elementId) as HTMLElement); root.render(); }; const renderSearchBar = (elementId: string, props = {}) => { const root = createRoot(document.getElementById(elementId) as HTMLElement); root.render(); }; (window as any).renderDocsGPTWidget = renderWidget; (window as any).renderSearchBar = renderSearchBar; } export { DocsGPTWidget, SearchBar }; ================================================ FILE: extensions/react-widget/src/components/DocsGPTWidget.tsx ================================================ "use client"; import React, { useRef, useState, useEffect } from 'react' import DOMPurify from 'dompurify'; import styled, { keyframes, css } from 'styled-components'; import { PaperPlaneIcon, RocketIcon, ExclamationTriangleIcon, Cross2Icon } from '@radix-ui/react-icons'; import { FEEDBACK, MESSAGE_TYPE, Query, Status, WidgetCoreProps, WidgetProps } from '../types/index'; import { fetchAnswerStreaming, sendFeedback } from '../requests/streamingApi'; import { ThemeProvider } from 'styled-components'; import Like from '../assets/like.svg'; import Dislike from '../assets/dislike.svg'; import MarkdownIt from 'markdown-it'; const themes = { dark: { bg: '#222327', text: '#fff', primary: { text: "#FAFAFA", bg: '#222327' }, secondary: { text: "#A1A1AA", bg: "#38383b" } }, light: { bg: '#fff', text: '#000', primary: { text: "#222327", bg: "#fff" }, secondary: { text: "#A1A1AA", bg: "#F6F6F6" } } }; const sizesConfig = { small: { size: 'small', width: '320px', height: '400px' }, 
medium: { size: 'medium', width: '400px', height: '80vh' }, large: { size: 'large', width: '666px', height: '75vh' }, getCustom: (custom: { width: string; height: string; maxWidth?: string; maxHeight?: string }) => ({ size: 'custom', width: custom.width, height: custom.height, maxWidth: custom.maxWidth || '968px', maxHeight: custom.maxHeight || '70vh', }), }; const createBox = keyframes` 0% { transform: scale(0.6); } 90% { transform: scale(1.02); } 100% { transform: scale(1); } ` const closeBox = keyframes` 0% { transform: scale(1); } 10% { transform: scale(1.02); } 100% { transform: scale(0.6); } ` const openContainer = keyframes` 0% { width: 200px; height: 100px; } 100% { width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; border-radius: 12px; }` const closeContainer = keyframes` 0% { width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; border-radius: 12px; } 100% { width: 200px; height: 100px; } ` const fadeIn = keyframes` from { opacity: 0; width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; transform: scale(0.9); } to { opacity: 1; transform: scale(1); width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; } ` const fadeOut = keyframes` from { opacity: 1; width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; } to { opacity: 0; transform: scale(0.9); width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height}; } ` const scaleAnimation = keyframes` from { transform: scale(1.2); } to { transform: scale(1); } ` const Overlay = styled.div` position: fixed; top: 0; left: 0; width: 100%; height: 100%; background-color: rgba(0, 0, 0, 0.5); z-index: 999; transition: opacity 0.5s; ` const WidgetContainer = styled.div<{ $modal?: boolean }>` all: initial; position: fixed; right: 
${props => props.$modal ? '50%' : '10px'}; bottom: ${props => props.$modal ? '50%' : '10px'}; z-index: 1001; transform-origin:100% 100%; display: block; &.modal{ transform : translate(50%,50%); } &.open { animation: css ${createBox} 250ms cubic-bezier(0.25, 0.1, 0.25, 1) forwards; } &.close { animation: css ${closeBox} 250ms cubic-bezier(0.25, 0.1, 0.25, 1) forwards; } align-items: center; text-align: left; `; const StyledContainer = styled.div<{ $isOpen: boolean }>` all: initial; max-height: ${(props) => props.theme.dimensions.maxHeight}; max-width: ${(props) => props.theme.dimensions.maxWidth}; width: ${(props) => props.theme.dimensions.width}; height: ${(props) => props.theme.dimensions.height} ; position: relative; flex-direction: column; justify-content: space-between; bottom: 0; left: 0; background-color: ${(props) => props.theme.primary.bg}; font-family: sans-serif; display: flex; border-radius: 12px; box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05), 0 2px 4px rgba(0, 0, 0, 0.1); padding: 26px 26px 0px 26px; animation: ${({ $isOpen, theme }) => theme.dimensions.size === 'large' ? $isOpen ? css`${fadeIn} 150ms ease-in forwards` : css` ${fadeOut} 150ms ease-in forwards` : $isOpen ? css`${openContainer} 150ms ease-in forwards` : css`${closeContainer} 250ms ease-in forwards`}; @media only screen and (max-width: 768px) { max-height: 100vh; max-width: 80vw; overflow: auto; } `; const FloatingButton = styled.div<{ $bgcolor: string, $hidden: boolean, $isAnimatingButton: boolean }>` position: fixed; display: ${props => props.$hidden ? "none" : "flex"}; z-index: 500; justify-content: center; gap: 8px; padding: 14px; align-items: center; bottom: 16px; color: white; font-family: sans-serif; right: 16px; font-weight: 500; border-radius: 9999px; background: ${props => props.$bgcolor}; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); cursor: pointer; animation: ${props => props.$isAnimatingButton ? 
css`${scaleAnimation} 200ms forwards` : 'none'}; &:hover { transform: scale(1.1); transition: transform 0.2s ease-in-out; } &:not(:hover) { transition: transform 0.2s ease-in-out; } `; const CancelButton = styled.button` cursor: pointer; position: absolute; top: 0; right: 0; margin: 8px; width: 30px; padding: 0; background-color: transparent; border: none; outline: none; color: inherit; transition: opacity 0.3s ease; opacity: 0.6; &:hover { opacity: 1; } .white-filter { filter: invert(100%); } `; const Header = styled.div` display: flex; align-items: flex-start; `; const ContentWrapper = styled.div` display: flex; flex-direction: column; gap:2px; margin-left: 8px; `; const Title = styled.h3` font-size: 14px; font-weight: normal; color: ${props => props.theme.primary.text}; margin: 0; `; const Description = styled.p` font-size: 13.75px; color: ${props => props.theme.secondary.text}; margin: 0 ; padding: 0 ; `; const Conversation = styled.div` height: 70%; border-radius: 6px; text-align: left; overflow-y: auto; scrollbar-width: thin; scrollbar-color: ${props => props.theme.secondary.bg} transparent; /* thumb color track color */ `; const Feedback = styled.div` background-color: transparent; font-weight: normal; gap: 12px; display: flex; padding: 6px; clear: both; `; const MessageBubble = styled.div<{ $type: MESSAGE_TYPE }>` display: block; font-size: 16px; position: relative; width: 100%;; float: right; margin: 0px; &:hover ${Feedback} * { visibility: visible ; } `; const Message = styled.div<{ $type: MESSAGE_TYPE }>` background: ${props => props.$type === 'QUESTION' ? 'linear-gradient(to bottom right, #8860DB, #6D42C5)' : props.theme.secondary.bg}; color: ${props => props.$type === 'ANSWER' ? props.theme.primary.text : '#fff'}; border: none; float: ${props => props.$type === 'QUESTION' ? 'right' : 'left'}; max-width: ${props => props.$type === 'ANSWER' ? 
'90%' : '80%'}; overflow: auto; margin: 4px; display: block; line-height: 1.5; padding: 12px; border-radius: 6px; overflow-wrap: break-word; `; const Markdown = styled.div` pre { padding: 8px; width: 90%; font-size: 12px; border-radius: 6px; overflow-x: auto; background-color: #1B1C1F; color: #fff ; } h1 { font-size: clamp(14px,40vw,16px); } h2 { font-size: 14px; } h3 { font-size: 14px; } p { margin: 0px; } code:not(pre code) { border-radius: 6px; padding: 1px 3px; font-size: 12px; display: inline-block; background-color: #646464; color: #fff ; } code { white-space: pre-wrap ; overflow-wrap: break-word; word-break: break-all; } ul{ padding:0px; margin: 1rem 0; list-style-position: outside; list-style-type: disc; padding-left: 1rem; white-space: normal; } ol{ padding:0px; margin: 1rem 0; list-style-position: outside; list-style-type: decimal; padding-left: 1rem; white-space: normal; } li{ line-height: 1.625; } .dgpt-table-container { margin: 20px 0; width:100%; overflow-x: scroll !important; border: 1px solid #a2a2ab; border-radius: 6px; -webkit-overflow-scrolling: touch; -ms-overflow-style: scrollbar; scrollbar-width: thin; scrollbar-color: #a2a2ab #38383b; } table, .dgpt-table { width: 100%; border-collapse: collapse; text-align: left; min-width:600px; } thead, .dgpt-thead { font-size: 12px; text-transform: uppercase; } th, .dgpt-th, td, .dgpt-td { padding: 10px; border-bottom: 1px solid #a2a2ab; font-size:14px; } th{ font-weight: normal !important; } td{ font-weight: bold; } ` const ErrorAlert = styled.div` color: #b91c1c; border:0.1px solid #b91c1c; display: flex; padding:4px; margin:11.2px; opacity: 90%; max-width: 70%; font-weight: 400; border-radius: 6px; justify-content: space-evenly; ` //dot loading animation const dotBounce = keyframes` 0%, 80%, 100% { transform: translateY(0); } 40% { transform: translateY(-5px); } `; const DotAnimation = styled.div` display: inline-block; animation: ${dotBounce} 1s infinite ease-in-out; `; // delay classes as styled 
components const Delay = styled(DotAnimation) <{ $delay: number }>` animation-delay: ${props => props.$delay + 'ms'}; `; const PromptContainer = styled.form` background-color: transparent; min-height: ${props => props.theme.dimensions.size == 'large' ? '40px' : '23px'}; max-height:150px; display: flex; align-items: end; justify-content: space-evenly; `; const StyledTextarea = styled.textarea` box-sizing: border-box; width: 100%; border: 1px solid #686877; padding: ${props => props.theme.dimensions.size === 'large' ? '18px 12px 14px 12px' : '8px 12px 4px 12px'}; background-color: transparent; font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; border-radius: 6px; color: ${props => props.theme.text}; outline: none; resize: none; transition: height 0.1s ease; overflow-wrap: break-word; white-space: pre-wrap; line-height: 1.4; text-align: left; min-height: ${props => props.theme.dimensions.size === 'large' ? '60px' : '40px'}; max-height: 140px; overflow-y: auto; scrollbar-width: thin; scrollbar-color: #38383b transparent; &::-webkit-scrollbar { width: 6px; height: 6px; } &::-webkit-scrollbar-thumb { background-color: #38383b; border-radius: 6px; } &::-webkit-scrollbar-track { background: transparent; } &::placeholder { text-align: left; } `; const StyledButton = styled.button` display: flex; justify-content: center; align-items: center; background-image: linear-gradient(to bottom right, #5AF0EC, #E80D9D); background-color: rgba(0, 0, 0, 0.3); border-radius: 6px; min-width: ${props => props.theme.dimensions.size === 'large' ? '60px' : '40px'}; height: ${props => props.theme.dimensions.size === 'large' ? 
'60px' : '40px'}; margin-left:8px; padding: 0px; border: none; cursor: pointer; outline: none; &:hover{ opacity: 90%; } &:disabled { background-image: linear-gradient(to bottom right, #2d938f, #b31877); }`; const HeroContainer = styled.div` position: relative; width: 90%; max-width: 500px; background-image: linear-gradient(to bottom right, #5AF0EC, #ff1bf4); border-radius: 10px; margin: 16px auto; padding: 2px; `; const HeroWrapper = styled.div` display: flex; flex-direction: column; justify-content: flex-start; gap: 8px; align-items: middle; background-color: ${props => props.theme.primary.bg}; border-radius: 10px; font-weight: normal; padding: 12px; ` const HeroTitle = styled.h3` color: ${props => props.theme.text}; font-size: 16px; margin:0px ; padding: 0px; `; const HeroDescription = styled.p` color: ${props => props.theme.text}; font-size: 12px; line-height: 1.5; margin: 0px; padding: 0px; `; const Hyperlink = styled.a` color: #9971EC; text-decoration: none; `; const Tagline = styled.div` text-align: center; display: block; color: ${props => props.theme.secondary.text}; padding: 12px ; font-size: 12px; `; const SourcesList = styled.div` display: flex; margin:12px 0px; flex-wrap: wrap; gap: 8px; `; const SourceLink = styled.a` color: ${props => props.theme.primary.text}; text-decoration: none; background: ${props => props.theme.secondary.bg}; padding: 4px 12px; border-radius: 85px; font-size: 14px; transition: opacity 0.2s ease; display: inline-block; text-align: center; max-width: 25%; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; line-height: 1.5; &:hover { opacity: 0.8; } `; const ExtraButton = styled.button` color: #9971EC; background: transparent; border-radius: 85px; padding: 4px 12px; font-size: 14px; border: none; cursor: pointer; transition: opacity 0.2s ease; text-align: center; height:auto; &:hover { opacity: 0.8; } `; const SourcesComponent = ({ sources }: { sources: Array<{ source: string; title: string }> }) => { const [showAll, 
setShowAll] = React.useState(false); const visibleSources = showAll ? sources : sources.slice(0, 3); const extraCount = sources.length - 3; return ( {visibleSources.map((source, idx) => ( {source.title} ))} {sources.length > 3 && ( setShowAll(!showAll)}> {showAll ? "Show less" : `+ ${extraCount} more`} )} ); }; const Hero = ({ title, description, theme }: { title: string, description: string, theme: string }) => { return ( {title} {description} ); }; export const DocsGPTWidget = (props: WidgetProps) => { const { buttonIcon = 'https://d3dg1063dc54p9.cloudfront.net/widget/chat.svg', buttonText = 'Ask a question', buttonBg = 'linear-gradient(to bottom right, #5AF0EC, #E80D9D)', defaultOpen = false, ...coreProps } = props const [open, setOpen] = React.useState(defaultOpen); const [isAnimatingButton, setIsAnimatingButton] = React.useState(false); const [isFloatingButtonVisible, setIsFloatingButtonVisible] = React.useState(true); React.useEffect(() => { if (isFloatingButtonVisible) setTimeout(() => setIsAnimatingButton(true), 250); return () => { setIsAnimatingButton(false) } }, [isFloatingButtonVisible]) const handleClose = () => { setIsFloatingButtonVisible(true); setOpen(false); }; const handleOpen = () => { setOpen(true); setIsFloatingButtonVisible(false); } return ( <> {buttonText} ) } export const WidgetCore = ({ apiHost = 'https://gptcloud.arc53.com', apiKey = "527686a3-e867-4b4d-9fec-f5f45fdb613a", avatar = 'https://d3dg1063dc54p9.cloudfront.net/cute-docsgpt.png', title = 'Get AI assistance', description = 'DocsGPT\'s AI Chatbot is here to help', heroTitle = 'Welcome to DocsGPT !', heroDescription = 'This chatbot is built with DocsGPT and utilises GenAI, please review important information using sources.', size = 'small', theme = 'dark', collectFeedback = true, isOpen = false, showSources = true, handleClose, prefilledQuery = '' }: WidgetCoreProps) => { const [prompt, setPrompt] = React.useState(""); const [mounted, setMounted] = React.useState(false); const 
[status, setStatus] = React.useState('idle'); const [queries, setQueries] = React.useState([]); const [conversationId, setConversationId] = React.useState(null); const [eventInterrupt, setEventInterrupt] = React.useState(false); //click or scroll by user while autoScrolling const [hasScrolledToLast, setHasScrolledToLast] = useState(true); const isBubbleHovered = useRef(false); const conversationRef = useRef(null); const endMessageRef = React.useRef(null); const promptRef = React.useRef(null); const md = new MarkdownIt(); //Custom markdown for the table md.renderer.rules.table_open = () => '
'; md.renderer.rules.table_close = () => '
'; md.renderer.rules.thead_open = () => ''; md.renderer.rules.tr_open = () => ''; md.renderer.rules.td_open = () => ''; md.renderer.rules.th_open = () => ''; React.useEffect(() => { if (isOpen) { setMounted(true); // Mount the component appendQuery(prefilledQuery) } else { // Wait for animations before unmounting const timeout = setTimeout(() => { setMounted(false) }, 250); return () => clearTimeout(timeout); } }, [isOpen]); const handleUserInterrupt = () => { if (!eventInterrupt && status === 'loading') setEventInterrupt(true); } const scrollIntoView = () => { if (!conversationRef?.current || eventInterrupt) return; if (status === 'idle' || !queries.length || !queries[queries.length - 1].response) { conversationRef.current.scrollTo({ behavior: 'smooth', top: conversationRef.current.scrollHeight, }); } else { conversationRef.current.scrollTop = conversationRef.current.scrollHeight; } setHasScrolledToLast(true); }; const checkScroll = () => { const el = conversationRef.current; if (!el) return; const isBottom = el.scrollHeight - el.scrollTop - el.clientHeight < 10; setHasScrolledToLast(isBottom); }; React.useEffect(() => { !eventInterrupt && scrollIntoView(); conversationRef.current?.addEventListener('scroll', checkScroll); return () => { conversationRef.current?.removeEventListener('scroll', checkScroll); }; }, [queries.length, queries[queries.length - 1]?.response]); async function handleFeedback(feedback: FEEDBACK, index: number) { let query = queries[index]; if (!query.response || !conversationId) { console.log("Cannot submit feedback: missing response or conversation ID"); return; } // If clicking the same feedback button that's already active, remove the feedback by sending null if (query.feedback === feedback) { try { const response = await sendFeedback({ question: query.prompt, answer: query.response, feedback: null, apikey: apiKey, conversation_id: conversationId, question_index: index, }, apiHost); if (response.status === 200) { const updatedQuery = { 
...query }; delete updatedQuery.feedback; setQueries((prev: Query[]) => prev.map((q, i) => (i === index ? updatedQuery : q)) ); } } catch (err) { console.error("Failed to submit feedback:", err); } return; } try { const response = await sendFeedback({ question: query.prompt, answer: query.response, feedback: feedback, apikey: apiKey, conversation_id: conversationId, question_index: index, }, apiHost); if (response.status === 200) { setQueries((prev: Query[]) => { return prev.map((q, i) => { if (i === index) { return { ...q, feedback: feedback }; } return q; }); }); } } catch (err) { console.error("Failed to submit feedback:", err); } } async function stream(question: string) { setStatus('loading') try { await fetchAnswerStreaming( { question: question, apiKey: apiKey, apiHost: apiHost, history: queries, conversationId: conversationId, onEvent: (event: MessageEvent) => { const data = JSON.parse(event.data); // check if the 'end' event has been received if (data.type === 'end') { setStatus('idle'); } else if (data.type === 'id') { setConversationId(data.id) } else if (data.type === 'error') { const updatedQueries = [...queries]; updatedQueries[updatedQueries.length - 1].error = data.error; setQueries(updatedQueries); setStatus('idle') } else if (data.type === 'source' && showSources) { const updatedQueries = [...queries]; updatedQueries[updatedQueries.length - 1].sources = data.source; setQueries(updatedQueries); } else { const result = data.answer ? data.answer : ''; //Fallback to an empty string if data.answer is undefined const streamingResponse = queries[queries.length - 1].response ? queries[queries.length - 1].response : ''; const updatedQueries = [...queries]; updatedQueries[updatedQueries.length - 1].response = streamingResponse + result; setQueries(updatedQueries); } } } ); } catch (error) { const updatedQueries = [...queries]; updatedQueries[updatedQueries.length - 1].error = 'Something went wrong !' 
setQueries(updatedQueries); setStatus('idle') //setEventInterrupt(false) } } const appendQuery = async (userQuery: string) => { if (!userQuery) return; setEventInterrupt(false); queries.push({ prompt: userQuery }); setPrompt(''); await stream(userQuery); } // submit handler const handleSubmit = async (e: React.FormEvent) => { e.preventDefault(); if (!prompt.trim()) return; if (promptRef.current) { promptRef.current.style.height = "auto"; } await appendQuery(prompt); } const handlePromptKeyDown = async (e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); // Prevent sending empty messages if (promptRef.current && promptRef.current.value.trim() === "") return; //Rest the input to it's original size after submitting if(promptRef.current){ promptRef.current.value = ""; promptRef.current.style.height = "auto"; } await appendQuery(prompt); } } // Auto-resize the input textarea while typing, clamping to base or max height const handleUserInput = (e: React.KeyboardEvent) =>{ const el = promptRef.current; if (!el) return; const baseHeight = size === 'large' ? 60 : 40; const maxHeight = 140; el.style.height = 'auto'; const next = Math.min(el.scrollHeight, maxHeight); el.style.height = Math.max(baseHeight, next) + 'px'; } // Update prompt state, auto resize textarea to content, and maintain scroll on new lines const handlePromptChange = (event: React.ChangeEvent) => { const value = event.target.value; setPrompt(value); const el = event.currentTarget; const baseHeight = size === 'large' ? 60 : 40; const maxHeight = 140; el.style.height = 'auto'; const next = Math.min(el.scrollHeight, maxHeight); el.style.height = Math.max(baseHeight, next) + 'px'; if(value.includes("\n")){ el.scrollTop = el.scrollHeight; } } const handleImageError = (event: React.SyntheticEvent) => { event.currentTarget.src = "https://d3dg1063dc54p9.cloudfront.net/cute-docsgpt.png"; }; const dimensions = typeof size === 'object' && 'custom' in size ? 
sizesConfig.getCustom(size.custom) : sizesConfig[size]; if (!mounted) return null; return ( {isOpen && size === 'large' && } {(
docs-gpt {title} {description}
{ queries.length > 0 ? queries?.map((query, index) => { return ( { query.prompt && {query.prompt} } { query.response ? { isBubbleHovered.current = true }} $type='ANSWER'> {showSources && query.sources && query.sources.length > 0 && query.sources.some(source => source.source !== 'local') && ( source.source !== 'local')} /> )} {collectFeedback && } :
{ query.error ?
Network Error
{query.error}
: . . . }
}
) }) : }
Powered by  DocsGPT
) }
) } ================================================ FILE: extensions/react-widget/src/components/SearchBar.tsx ================================================ import React from 'react'; import styled, { ThemeProvider, createGlobalStyle } from 'styled-components'; import { WidgetCore } from './DocsGPTWidget'; import { SearchBarProps } from '@/types'; import { getSearchResults } from '../requests/searchAPI'; import { Result } from '@/types'; import { getOS, processMarkdownString } from '../utils/helper'; import DOMPurify from 'dompurify'; import { CodeIcon, TextAlignLeftIcon, HeadingIcon, ReaderIcon, ListBulletIcon, QuoteIcon } from '@radix-ui/react-icons'; const themes = { dark: { name: 'dark', bg: '#202124', text: '#EDEDED', primary: { text: "#FAFAFA", bg: '#111111' }, secondary: { text: "#A1A1AA", bg: "#38383b" } }, light: { name: 'light', bg: '#EAEAEA', text: '#171717', primary: { text: "#222327", bg: "#fff" }, secondary: { text: "#A1A1AA", bg: "#F6F6F6" } } } const GlobalStyle = createGlobalStyle` .highlight { color: ${props => props.theme.name === 'dark' ? 
'#4B9EFF' : '#0066CC'}; font-weight: 500; } `; const loadGeistFont = () => { const link = document.createElement('link'); link.href = 'https://fonts.googleapis.com/css2?family=Geist:wght@100..900&display=swap'; link.rel = 'stylesheet'; document.head.appendChild(link); }; const Main = styled.div` all: initial; font-family: 'Geist', sans-serif; ` const SearchButton = styled.button<{ $inputWidth: string }>` padding: 6px 6px; font-family: inherit; width: ${({ $inputWidth }) => $inputWidth}; border-radius: 8px; display: inline; color: ${props => props.theme.secondary.text}; outline: none; border: none; background-color: ${props => props.theme.secondary.bg}; -webkit-appearance: none; -moz-appearance: none; appearance: none; transition: background-color 128ms linear; text-align: left; cursor: pointer; ` const Container = styled.div` position: relative; display: inline-block; ` const SearchOverlay = styled.div` position: fixed; top: 0; left: 0; width: 100%; height: 100%; background-color: #0000001A; backdrop-filter: blur(8px); -webkit-backdrop-filter: blur(8px); z-index: 99; `; const SearchResults = styled.div` position: fixed; display: flex; flex-direction: column; background-color: ${props => props.theme.name === 'dark' ? 
'rgba(0, 0, 0, 0.15)' : 'rgba(255, 255, 255, 0.4)'}; border: 1px solid rgba(255, 255, 255, 0.18); border-radius: 15px; padding: 8px 0px 8px 0px; width: 792px; max-width: 90vw; height: 396px; z-index: 100; left: 50%; top: 50%; transform: translate(-50%, -50%); color: ${props => props.theme.primary.text}; box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.37); backdrop-filter: blur(82px); -webkit-backdrop-filter: blur(82px); border-radius: 10px; box-sizing: border-box; @media only screen and (max-width: 768px) { height: 80vh; width: 90vw; } `; const SearchResultsScroll = styled.div` flex: 1; overflow-y: auto; overflow-x: hidden; scrollbar-gutter: stable; scrollbar-width: thin; scrollbar-color: #383838 transparent; padding: 0 16px; `; const IconTitleWrapper = styled.div` display: flex; align-items: center; gap: 8px; .element-icon{ margin: 4px; } `; const Title = styled.h3` font-size: 15px; font-weight: 400; color: ${props => props.theme.primary.text}; margin: 0; overflow-wrap: break-word; white-space: normal; overflow: hidden; text-overflow: ellipsis; `; const ContentWrapper = styled.div` display: flex; flex-direction: column; gap: 12px; `; const ResultWrapper = styled.div` display: flex; align-items: flex-start; width: 100%; box-sizing: border-box; padding: 8px 16px; cursor: pointer; background-color: transparent; font-family: 'Geist', sans-serif; border-radius: 8px; word-wrap: break-word; overflow-wrap: break-word; word-break: break-word; white-space: normal; overflow: hidden; text-overflow: ellipsis; &:hover { backdrop-filter: blur(8px); -webkit-backdrop-filter: blur(8px); } `; const Content = styled.div` display: flex; margin-left: 8px; flex-direction: column; gap: 8px; padding: 4px 0px 0px 12px; font-size: 15px; color: ${props => props.theme.primary.text}; line-height: 1.6; border-left: 2px solid ${props => props.theme.primary.text}CC; overflow: hidden; `; const ContentSegment = styled.div` display: flex; align-items: flex-start; gap: 8px; padding-right: 16px; 
overflow-wrap: break-word; white-space: normal; overflow: hidden; text-overflow: ellipsis; ` const Toolkit = styled.kbd` position: absolute; right: 4px; top: 50%; transform: translateY(-50%); background-color: ${(props) => props.theme.primary.bg}; color: ${(props) => props.theme.secondary.text}; font-weight: 600; font-size: 10px; padding: 3px 6px; border: 1px solid ${(props) => props.theme.secondary.text}; border-radius: 4px; display: flex; align-items: center; justify-content: center; z-index: 1; pointer-events: none; ` const Loader = styled.div` margin: 2rem auto; border: 4px solid ${props => props.theme.name === 'dark' ? 'rgba(255, 255, 255, 0.2)' : 'rgba(0, 0, 0, 0.1)'}; border-top: 4px solid ${props => props.theme.name === 'dark' ? '#FFFFFF' : props.theme.primary.bg}; border-radius: 50%; width: 12px; height: 12px; animation: spin 1s linear infinite; @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } `; const NoResults = styled.div` margin-top: 2rem; text-align: center; font-size: 14px; color: ${props => props.theme.name === 'dark' ? '#E0E0E0' : '#505050'}; font-weight: 500; `; const AskAIButton = styled.button` display: flex; align-items: center; justify-content: flex-start; gap: 12px; width: calc(100% - 32px); margin: 0 16px 16px 16px; box-sizing: border-box; height: 50px; padding: 8px 24px; border: none; border-radius: 8px; color: ${props => props.theme.text}; cursor: pointer; font-size: 16px; backdrop-filter: blur(16px); -webkit-backdrop-filter: blur(16px); background-color: ${props => props.theme.name === 'dark' ? 'rgba(255, 255, 255, 0.05)' : 'rgba(0, 0, 0, 0.03)'}; &:hover { backdrop-filter: blur(20px); -webkit-backdrop-filter: blur(20px); background-color: ${props => props.theme.name === 'dark' ? 
'rgba(255, 255, 255, 0.1)' : 'rgba(0, 0, 0, 0.06)'}; } `; const SearchHeader = styled.div` display: flex; align-items: center; gap: 8px; margin-bottom: 12px; padding-bottom: 12px; border-bottom: 1px solid ${props => props.theme.name === 'dark' ? '#FFFFFF24' : 'rgba(0, 0, 0, 0.14)'}; `; const TextField = styled.input` width: calc(100% - 32px); margin: 0 16px; padding: 12px 16px; border: none; background-color: transparent; color: ${props => props.theme.text}; font-size: 20px; font-weight: 400; outline: none; &:focus { border-color: none; } &::placeholder { color: ${props => props.theme.name === 'dark' ? 'rgba(255, 255, 255, 0.6)' : 'rgba(0, 0, 0, 0.5)'} !important; opacity: 100%; /* Force opacity to ensure placeholder is visible */ font-weight: 500; } ` const EscapeInstruction = styled.kbd` display: flex; align-items: center; justify-content: center; margin: 12px 16px 0; padding: 4px 8px; border-radius: 4px; background-color: transparent; border: 1px solid ${props => props.theme.name === 'dark' ? 'rgba(237, 237, 237, 0.6)' : 'rgba(23, 23, 23, 0.6)'}; color: ${props => props.theme.name === 'dark' ? 
'#EDEDED' : '#171717'}; font-size: 12px; font-family: 'Geist', sans-serif; white-space: nowrap; cursor: pointer; width: fit-content; -webkit-appearance: none; -moz-appearance: none; appearance: none; `; export const SearchBar = ({ apiKey = "74039c6d-bff7-44ce-ae55-2973cbf13837", apiHost = "https://gptcloud.arc53.com", theme = "dark", placeholder = "Search or Ask AI...", width = "256px", buttonText = "Search here" }: SearchBarProps) => { const [input, setInput] = React.useState(""); const [loading, setLoading] = React.useState(false); const [isWidgetOpen, setIsWidgetOpen] = React.useState(false); const inputRef = React.useRef(null); const containerRef = React.useRef(null); const [isResultVisible, setIsResultVisible] = React.useState(false); const [results, setResults] = React.useState([]); const debounceTimeout = React.useRef | null>(null); const abortControllerRef = React.useRef(null); const browserOS = getOS(); const isTouch = 'ontouchstart' in window; const getKeyboardInstruction = () => { if (isResultVisible) return "Enter"; return browserOS === 'mac' ? 
'⌘ + K' : 'Ctrl + K'; }; React.useEffect(() => { loadGeistFont() const handleClickOutside = (event: MouseEvent) => { if (containerRef.current && !containerRef.current.contains(event.target as Node)) { setIsResultVisible(false); } }; const handleKeyDown = (event: KeyboardEvent) => { if ( ((browserOS === 'win' || browserOS === 'linux') && event.ctrlKey && event.key === 'k') || (browserOS === 'mac' && event.metaKey && event.key === 'k') ) { event.preventDefault(); inputRef.current?.focus(); setIsResultVisible(true); } else if (event.key === 'Escape') { setIsResultVisible(false); } }; document.addEventListener('mousedown', handleClickOutside); document.addEventListener('keydown', handleKeyDown); return () => { document.removeEventListener('mousedown', handleClickOutside); document.removeEventListener('keydown', handleKeyDown); }; }, []); React.useEffect(() => { if (!input) { setResults([]); setLoading(false); return; } setLoading(true); if (debounceTimeout.current) { clearTimeout(debounceTimeout.current); } if (abortControllerRef.current) { abortControllerRef.current.abort(); } const abortController = new AbortController(); abortControllerRef.current = abortController; debounceTimeout.current = setTimeout(() => { getSearchResults(input, apiKey, apiHost, abortController.signal) .then((data) => setResults(data)) .catch((err) => !abortController.signal.aborted && console.log(err)) .finally(() => setLoading(false)); }, 500); return () => { abortController.abort(); clearTimeout(debounceTimeout.current ?? undefined); }; }, [input]) const handleKeyDown = (event: React.KeyboardEvent) => { if (event.key === 'Enter') { event.preventDefault(); openWidget(); } }; const openWidget = () => { setIsWidgetOpen(true); setIsResultVisible(false); }; const handleClose = () => { setIsWidgetOpen(false); setIsResultVisible(true); }; return (
setIsResultVisible(true)} $inputWidth={width} > {buttonText} { isResultVisible && ( <> setIsResultVisible(false)} /> setInput(e.target.value)} onKeyDown={(e) => handleKeyDown(e)} placeholder={placeholder} autoFocus /> setIsResultVisible(false)}> Esc DocsGPT Ask the AI {!loading ? ( results.length > 0 ? ( results.map((res, key) => { const containsSource = res.source !== 'local'; const processedResults = processMarkdownString(res.text, input); if (processedResults) return ( { if (!containsSource) return; window.open(res.source, '_blank', 'noopener, noreferrer'); }} >
{res.title} {processedResults.map((element, index) => ( {element.tag === 'code' && } {(element.tag === 'bulletList' || element.tag === 'numberedList') && } {element.tag === 'text' && } {element.tag === 'heading' && } {element.tag === 'blockquote' && }
))}
); return null; }) ) : ( No results found ) ) : ( )} ) } { isTouch ? { setIsWidgetOpen(true) }} title={"Tap to Ask the AI"}> Tap : {getKeyboardInstruction()} }
) } ================================================ FILE: extensions/react-widget/src/index.html ================================================ DocsGPT Widget
================================================
FILE: extensions/react-widget/src/index.ts
================================================
//exports methods for React
export {SearchBar} from "./components/SearchBar"
export { DocsGPTWidget } from "./components/DocsGPTWidget";

================================================
FILE: extensions/react-widget/src/main.tsx
================================================
//development
import { createRoot } from "react-dom/client";
import { App } from "./App";
import React from "react";

const container = document.getElementById("app") as HTMLElement;
const root = createRoot(container)
// render() displays nothing without an element; mount the imported development App.
root.render(<App />);

================================================
FILE: extensions/react-widget/src/requests/searchAPI.ts
================================================
import { Result } from "@/types";

/**
 * POSTs a search question to `${apiHost}/api/search` and returns the parsed results.
 *
 * @param question - Text to search for.
 * @param apiKey - Sent to the server as `api_key`.
 * @param apiHost - Base URL of the API (no trailing slash).
 * @param signal - Aborts the in-flight request when triggered.
 * @returns The JSON response body, typed as `Result[]`.
 * @throws Error when the response status is not ok; any fetch/abort error is
 *         rethrown (abort errors are rethrown without being logged).
 */
async function getSearchResults(
  question: string,
  apiKey: string,
  apiHost: string,
  signal: AbortSignal
): Promise<Result[]> {
  const payload = {
    question,
    api_key: apiKey
  };

  try {
    const response = await fetch(`${apiHost}/api/search`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify(payload),
      signal: signal
    });

    if (!response.ok) {
      throw new Error(`Error: ${response.status}`);
    }

    const data: Result[] = await response.json();
    return data;
  } catch (error) {
    // Aborted requests are expected (a newer query superseded this one), so stay quiet.
    if (!(error instanceof DOMException && error.name === "AbortError")) {
      console.error("Failed to fetch documents:", error);
    }
    throw error;
  }
}

export { getSearchResults }

================================================
FILE: extensions/react-widget/src/requests/streamingApi.ts
================================================
import { FEEDBACK } from "@/types";

interface HistoryItem {
  prompt: string;
  response?: string;
}

interface FetchAnswerStreamingProps {
  question?: string;
  apiKey?: string;
  selectedDocs?: string;
  history?: HistoryItem[];
  conversationId?: string | null;
  apiHost?: string;
  onEvent?: (event: MessageEvent) => void;
}

export interface FeedbackPayload {
  question?: string;
  answer?: string;
  feedback: string | null;
  apikey?: string;
  conversation_id: string;
  question_index: number;
}

/**
 * POSTs a question to `${apiHost}/stream` and forwards every non-empty line of
 * the streamed response body to `onEvent` as a `MessageEvent`.
 *
 * A leading `data:` prefix is removed from a line before it is forwarded; the
 * rest of the line is passed through untouched as `event.data`.
 *
 * @param props.question - Question text sent to the server.
 * @param props.apiKey - Sent to the server as `api_key`.
 * @param props.history - Previous exchanges; sent JSON-encoded.
 * @param props.conversationId - Sent as `conversation_id` (may be null).
 * @param props.apiHost - Base URL of the API (no trailing slash).
 * @param props.onEvent - Called once per complete line, in arrival order.
 * @returns A promise that resolves when the stream ends.
 * @throws Rejects when the request fails, the status is not ok, the response
 *         has no body, reading the stream fails, or `onEvent` throws.
 */
export async function fetchAnswerStreaming({
  question = '',
  apiKey = '',
  history = [],
  conversationId = null,
  apiHost = '',
  onEvent = () => {
    console.log("Event triggered, but no handler provided.");
  }
}: FetchAnswerStreamingProps): Promise<void> {
  const body = {
    question: question,
    history: JSON.stringify(history),
    conversation_id: conversationId,
    model: 'default',
    api_key: apiKey
  };

  let reader: ReadableStreamDefaultReader<Uint8Array>;
  try {
    const response = await fetch(apiHost + '/stream', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
      },
      body: JSON.stringify(body),
    });
    // Same convention as getSearchResults: a non-2xx status is a failure,
    // not a stream of events.
    if (!response.ok) throw new Error(`Error: ${response.status}`);
    if (!response.body) throw new Error('No response body');
    reader = response.body.getReader();
  } catch (error) {
    console.error('Connection failed:', error);
    throw error;
  }

  const decoder = new TextDecoder('utf-8');

  // Forwards one complete line to the handler, skipping blank separator lines.
  const dispatchLine = (rawLine: string): void => {
    if (rawLine.trim() === '') return;
    const data = rawLine.startsWith('data:') ? rawLine.substring(5) : rawLine;
    onEvent(new MessageEvent('message', { data }));
  };

  // Network chunks do not respect line or character boundaries, so keep the
  // unfinished tail of each chunk and prepend it to the next one.
  let pending = '';
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    // stream: true makes the decoder hold back an incomplete multi-byte sequence.
    pending += decoder.decode(value, { stream: true });
    const lines = pending.split('\n');
    pending = lines.pop() ?? '';
    lines.forEach(dispatchLine);
  }

  // Flush whatever the decoder and the line buffer still hold.
  pending += decoder.decode();
  dispatchLine(pending);
}

/**
 * Sends (or clears, when `payload.feedback` is null) feedback for one answer
 * by POSTing to `${apiHost}/api/feedback`.
 *
 * @param payload - Feedback fields; `apikey` is sent to the server as `api_key`.
 * @param apiHost - Base URL of the API (no trailing slash).
 * @returns The raw fetch `Response`; callers inspect `status` themselves.
 */
export const sendFeedback = (payload: FeedbackPayload, apiHost: string): Promise<Response> => {
  return fetch(`${apiHost}/api/feedback`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      question: payload.question,
      answer: payload.answer,
      feedback: payload.feedback,
      api_key: payload.apikey,
      conversation_id: payload.conversation_id,
      question_index: payload.question_index
    }),
  });
};

================================================
FILE: extensions/react-widget/src/types/index.ts
================================================
export type MESSAGE_TYPE = 'QUESTION' | 'ANSWER' | 'ERROR';
export type Status = 'idle' | 'loading' | 'failed';
export type FEEDBACK = 'LIKE' | 'DISLIKE';
export type THEME = 'light' | 'dark';

export interface Query {
  prompt: string;
  response?: string;
  feedback?: FEEDBACK;
  error?: string;
  sources?: { title: string; text: string, source:string }[];
  conversationId?: string | null;
  title?: string | null;
}

export interface WidgetProps {
  apiHost?: string;
  apiKey?: string;
  avatar?: string;
  title?: string;
  description?: string;
  heroTitle?: string;
  heroDescription?: string;
  size?: 'small' | 'medium' | 'large' | {
    custom: {
      width: string;
      height: string;
      maxWidth?: string;
      maxHeight?: string;
    };
  };
  theme?:THEME,
  buttonIcon?:string;
  buttonText?:string;
  buttonBg?:string;
  collectFeedback?:boolean;
  showSources?: boolean;
  defaultOpen?: boolean;
}

export interface WidgetCoreProps extends WidgetProps {
  widgetRef?:React.RefObject | null;
  handleClose?:React.MouseEventHandler | undefined;
  isOpen:boolean;
  prefilledQuery?: string;
}

export interface SearchBarProps {
  apiHost?: string;
  apiKey?: string;
  theme?: THEME;
  placeholder?: string;
  width?: string;
  buttonText?: string;
}

export interface Result {
  text:string;
  title:string;
  source:string;
}

================================================
FILE: extensions/react-widget/src/utils/helper.ts
================================================
export const getOS = () => {
  const platform = window.navigator.platform;
  const userAgent = window.navigator.userAgent || window.navigator.vendor;

  if (/Mac/i.test(platform)) {
    return 'mac';
  }

  if (/Win/i.test(platform)) {
    return 'win';
  }

  if (/Linux/i.test(platform) && !/Android/i.test(userAgent)) {
    return 'linux';
  }

  if (/Android/i.test(userAgent)) {
    return 'android';
  }

  if (/iPhone|iPad|iPod/i.test(userAgent)) {
    return 'ios';
  }

  return 'other';
};

interface ParsedElement {
  content: string;
  tag: string;
}

// Classifies one trimmed, non-empty, non-code markdown line and strips its marker.
const parseMarkdownLine = (trimmedLine: string): ParsedElement => {
  const headingMatch = trimmedLine.match(/^(#{1,6})\s+(.+)$/);
  if (headingMatch) {
    return { content: headingMatch[2], tag: 'heading' };
  }

  const bulletMatch = trimmedLine.match(/^[-*]\s+(.+)$/);
  if (bulletMatch) {
    return { content: bulletMatch[1], tag: 'bulletList' };
  }

  const numberedMatch = trimmedLine.match(/^\d+\.\s+(.+)$/);
  if (numberedMatch) {
    return { content: numberedMatch[1], tag: 'numberedList' };
  }

  const blockquoteMatch = trimmedLine.match(/^>+\s*(.+)$/);
  if (blockquoteMatch) {
    return { content: blockquoteMatch[1], tag: 'blockquote' };
  }

  return { content: trimmedLine, tag: 'text' };
};

/**
 * Splits a markdown string into tagged elements and picks the ones to preview.
 *
 * Each non-blank line becomes one element tagged `heading`, `bulletList`,
 * `numberedList`, `blockquote` or `text`; the lines between two ``` fences are
 * joined (each line trimmed, blank lines dropped) into a single `code` element.
 * A fence left open at the end of the input still yields a `code` element when
 * it contains at least one line.
 *
 * @param markdown - Markdown source text.
 * @param keyword - Optional search term, matched case-insensitively.
 * @returns Every element whose content contains `keyword`; when there is no
 *          keyword or nothing matches, a one-element array holding the first
 *          element; an empty array when the input has no elements.
 */
export const processMarkdownString = (markdown: string, keyword?: string): ParsedElement[] => {
  const lines = markdown.trim().split('\n');
  const keywordLower = keyword?.toLowerCase();

  const escapeRegExp = (str: string) => str.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
  const keywordRegex = keyword ? new RegExp(`(${escapeRegExp(keyword)})`, 'gi') : null;

  const matchingLines: ParsedElement[] = [];
  let firstLine = null as ParsedElement | null;

  // Remembers the first element seen and collects elements that mention the keyword.
  const recordElement = (element: ParsedElement): void => {
    if (!firstLine) {
      firstLine = element;
    }
    if (keywordLower && keywordRegex && element.content.toLowerCase().includes(keywordLower)) {
      // NOTE(review): the replacement '$1' re-inserts the match unchanged, so as
      // written this does not alter the text. Presumably highlight markup around
      // $1 was intended (SearchBar defines a `.highlight` style) — confirm.
      element.content = element.content.replace(keywordRegex, '$1');
      matchingLines.push(element);
    }
  };

  let isInCodeBlock = false;
  let codeBlockContent: string[] = [];

  for (const rawLine of lines) {
    const trimmedLine = rawLine.trim();
    if (!trimmedLine) continue;

    if (trimmedLine.startsWith('```')) {
      if (isInCodeBlock) {
        // Closing fence: emit the accumulated block, even when it is empty.
        recordElement({ content: codeBlockContent.join('\n'), tag: 'code' });
      } else {
        codeBlockContent = [];
      }
      isInCodeBlock = !isInCodeBlock;
      continue;
    }

    if (isInCodeBlock) {
      codeBlockContent.push(trimmedLine);
      continue;
    }

    recordElement(parseMarkdownLine(trimmedLine));
  }

  // Input ended inside an unterminated fence: keep what was collected.
  if (isInCodeBlock && codeBlockContent.length > 0) {
    recordElement({ content: codeBlockContent.join('\n'), tag: 'code' });
  }

  if (keywordLower && matchingLines.length > 0) {
    return matchingLines;
  }

  return firstLine ? [firstLine] : [];
};

================================================
FILE: extensions/react-widget/tsconfig.json
================================================
{
  "compilerOptions": {
    "baseUrl": ".",
    "paths": {
      "@/*": ["src/*", "@/*"]
    },
    "target": "ES2020",
    "useDefineForClassFields": true,
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,

    /* Bundler mode */
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx",

    /* Linting */
    "strict": true,
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noFallthroughCasesInSwitch": true,
    /* The "typeRoots" configuration specifies the locations where
    TypeScript looks for type definitions (.d.ts files) to
    include in the compilation process.*/
    "typeRoots": ["./dist/index.d.ts", "node_modules/@types"]
  },
  /* include /index.ts*/
  "include": ["src/index.ts","custom.d.ts"],
  "exclude": ["node_modules"],
}

================================================
FILE: extensions/slack-bot/.gitignore
================================================
.env
.venv/
get-pip.py

================================================
FILE: extensions/slack-bot/Readme.md
================================================ # Slack Bot Configuration Guide > **Note:** Follow the steps below on the [Slack API website](https://api.slack.com/) to set up your Slack app and generate the necessary tokens. ## Step-by-Step Instructions ### 1. Navigate to Your Apps - Go to the Slack API page for apps, select **Create an App**, and then choose the “From Scratch” option. ### 2. App Creation - Name your app and choose the workspace where you wish to add the assistant. ### 3. Enabling Socket Mode - Navigate to **Settings > Socket Mode** and enable **Socket Mode**. - This action will generate an App-level token. Select the `connections:write` scope and copy the App-level token for future use. ### 4. Socket Naming - Assign a name to your socket as per your preference. ### 5. Basic Information Setup - Go to **Basic Information** (under **Settings**) and configure the following: - Assistant name - App icon - Background color ### 6. Bot Token and Permissions - In the **OAuth & Permissions** option found under the **Features** section, retrieve the Bot Token. Save it for later use. - You will also need to add specific bot token scopes: - `app_mentions:read` - `assistant:write` - `chat:write` - `chat:write.public` - `im:history` ### 7. Enable Events - From **Event Subscriptions**, enable events and add the following Bot User events: - `app_mention` - `assistant_thread_context_changed` - `assistant_thread_started` - `message.im` ### 8. Agent/Assistant Toggle - In the **Features > Agent & Assistants** section, toggle on the Agent or Assistant option. - In the **Suggested Prompts** setting, leave it as `dynamic` (this is the default setting). --- ## Code-Side Configuration Guide This section focuses on generating and setting up the necessary tokens in the `.env` file, using the `.env-example` as a template. ### Step 1: Generating Required Keys 1. 
**SLACK_APP_TOKEN** - Navigate to **Settings > Socket Mode** in the Slack API and enable **Socket Mode**. - Copy the App-level token generated (usually starts with `xapp-`). 2. **SLACK_BOT_TOKEN** - Go to **OAuth & Permissions** (under the **Features** section in Slack API). - Retrieve the **Bot Token** (starts with `xoxb-`). 3. **DOCSGPT_API_KEY** - Go to the **DocsGPT website**. - Navigate to **Settings > Chatbots > Create New** to generate a DocsGPT API Key. - Copy the generated key for use. ### Step 2: Creating and Updating the `.env` File 1. Create a new `.env` file in the root of your project (if it doesn’t already exist). 2. Use the `.env-example` as a reference and update the file with the following keys and values: ```bash # .env file SLACK_APP_TOKEN=xapp-your-generated-app-token SLACK_BOT_TOKEN=xoxb-your-generated-bot-token DOCSGPT_API_KEY=your-docsgpt-generated-api-key ``` Replace the placeholder values with the actual tokens generated from the Slack API and DocsGPT as per the steps outlined above. --- This concludes the guide for both setting up the Slack API and configuring the `.env` file on the code side. 
================================================ FILE: extensions/slack-bot/app.py ================================================ import os import hashlib import httpx import re from slack_bolt.async_app import AsyncApp from slack_bolt.adapter.socket_mode.async_handler import AsyncSocketModeHandler from dotenv import load_dotenv load_dotenv() API_BASE = os.getenv("API_BASE", "https://gptcloud.arc53.com") API_URL = API_BASE + "/api/answer" # Slack bot token and signing secret SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN") SLACK_APP_TOKEN = os.getenv("SLACK_APP_TOKEN") # OpenAI API key for DocsGPT (replace this with your actual API key) DOCSGPT_API_KEY = os.getenv("DOCSGPT_API_KEY") # Initialize Slack app app = AsyncApp(token=SLACK_BOT_TOKEN) def encode_conversation_id(conversation_id: str) -> str: """ Encodes 11 length Slack conversation_id to 12 length string Args: conversation_id (str): The 11 digit slack conversation_id. Returns: str: Hashed id. """ # Create a SHA-256 hash of the string hashed_id = hashlib.sha256(conversation_id.encode()).hexdigest() # Take the first 24 characters of the hash hashed_24_char_id = hashed_id[:24] return hashed_24_char_id async def generate_answer(question: str, messages: list, conversation_id: str | None) -> dict: """Generates an answer using the external API.""" payload = { "question": question, "api_key": DOCSGPT_API_KEY, "history": messages, "conversation_id": conversation_id, } headers = { "Content-Type": "application/json; charset=utf-8" } timeout = 60.0 async with httpx.AsyncClient() as client: response = await client.post(API_URL, json=payload, headers=headers, timeout=timeout) if response.status_code == 200: data = response.json() conversation_id = data.get("conversation_id") answer = data.get("answer", "Sorry, I couldn't find an answer.") return {"answer": answer, "conversation_id": conversation_id} else: print(response.json()) return {"answer": "Sorry, I couldn't find an answer.", "conversation_id": None} 
@app.message(".*") async def message_docs(message, say): client = app.client channel = message['channel'] thread_ts = message['thread_ts'] user_query = message['text'] await client.assistant_threads_setStatus( channel_id = channel, thread_ts = thread_ts, status = "is generating your answer...", ) docs_gpt_channel_id = encode_conversation_id(thread_ts) # Get response from DocsGPT response = await generate_answer(user_query,[], docs_gpt_channel_id) answer = convert_to_slack_markdown(response['answer']) # Respond in Slack await client.chat_postMessage(text = answer, mrkdwn= True, channel= message['channel'], thread_ts = message['thread_ts'],) def convert_to_slack_markdown(markdown_text: str): # Convert bold **text** to *text* for Slack slack_text = re.sub(r'\*\*(.*?)\*\*', r'*\1*', markdown_text) # **text** to *text* # Convert italics _text_ to _text_ for Slack slack_text = re.sub(r'_(.*?)_', r'_\1_', slack_text) # _text_ to _text_ # Convert inline code `code` to `code` (Slack supports backticks for inline code) slack_text = re.sub(r'`(.*?)`', r'`\1`', slack_text) # Convert bullet points with single or no spaces to filled bullets (•) slack_text = re.sub(r'^\s{0,1}[-*]\s+', ' • ', slack_text, flags=re.MULTILINE) # Convert bullet points with multiple spaces to hollow bullets (◦) slack_text = re.sub(r'^\s{2,}[-*]\s+', '\t◦ ', slack_text, flags=re.MULTILINE) # Convert headers (##) to bold in Slack slack_text = re.sub(r'^\s*#{1,6}\s*(.*?)$', r'*\1*', slack_text, flags=re.MULTILINE) return slack_text async def main(): handler = AsyncSocketModeHandler(app, os.environ["SLACK_APP_TOKEN"]) await handler.start_async() # Start the app if __name__ == "__main__": import asyncio asyncio.run(main()) ================================================ FILE: extensions/slack-bot/requirements.txt ================================================ aiohttp>=3,<4 certifi==2024.7.4 h11==0.14.0 httpcore==1.0.5 httpx==0.27.0 idna==3.7 python-dotenv==1.0.1 sniffio==1.3.1 slack-bolt==1.21.0 
bson==0.5.10 ================================================ FILE: extensions/web-widget/README.md ================================================ # Chat Widget A simple chat widget that can be easily integrated into any website. ## Installation 1. Host the `widget.html`, `styles.css`, and `script.js` files from the `src` folder on your own server or a Content Delivery Network (CDN). Make sure to note the URLs for these files. 2. Update the URLs in the `dist/chat-widget.js` file to match the locations of your hosted files: ```javascript fetch("https://your-server-or-cdn.com/path/to/widget.html"), fetch("https://your-server-or-cdn.com/path/to/styles.css"), fetch("https://your-server-or-cdn.com/path/to/script.js"), ``` 3. Host the `dist/chat-widget.js` file on your own server or a Content Delivery Network (CDN). Make sure to note the URL for this file. ## Integration To integrate the chat widget into a website, add the following script tag to the HTML file, replacing URL_TO_CHAT_WIDGET_JS with the actual URL of your hosted chat-widget.js file: ```javascript <script src="URL_TO_CHAT_WIDGET_JS"></script> ``` ================================================ FILE: extensions/web-widget/dist/chat-widget.js ================================================ (async function () { // Fetch the HTML, CSS, and JavaScript from your server or CDN const [htmlRes, jsRes] = await Promise.all([ fetch("https://s3-eu-west-2.amazonaws.com/arc53data/widget.html"), // fetch("https://s3-eu-west-2.amazonaws.com/arc53data/tailwind.css"), fetch("https://s3-eu-west-2.amazonaws.com/arc53data/script.js"), ]); const html = await htmlRes.text(); //const css = await cssRes.text(); const js = await jsRes.text(); // create a new link element const link = document.createElement("link"); //set the rel, href, type, and integrity attributes link.rel = "stylesheet"; link.href = "https://cdn.tailwindcss.com/"; link.type = "text/css"; link.integrity = "sha384-PDOmVviaTm8N1W35y1NSmo80w6GPaGhbDuOBAF/5hRffaeGc6yOwIo1qAt4gqLGA%"; // get the document head 
and append the link element to it // document.head.appendChild(link); // Create a style element for the CSS // const style = document.createElement("style"); // style.innerHTML = css; // document.head.appendChild(style); // Create a container for the chat widget and inject the HTML const chatWidgetContainer = document.createElement("div"); chatWidgetContainer.innerHTML = html; document.body.appendChild(chatWidgetContainer); // Execute the JavaScript code const script = document.createElement("script"); script.innerHTML = js; document.body.appendChild(script); })(); ================================================ FILE: extensions/web-widget/dist/output.css ================================================ /* ! tailwindcss v3.3.1 | MIT License | https://tailwindcss.com */ /* 1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4) 2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116) */ *, ::before, ::after { box-sizing: border-box; /* 1 */ border-width: 0; /* 2 */ border-style: solid; /* 2 */ border-color: #e5e7eb; /* 2 */ } ::before, ::after { --tw-content: ''; } /* 1. Use a consistent sensible line-height in all browsers. 2. Prevent adjustments of font size after orientation changes in iOS. 3. Use a more readable tab size. 4. Use the user's configured `sans` font-family by default. 5. Use the user's configured `sans` font-feature-settings by default. 6. Use the user's configured `sans` font-variation-settings by default. 
*/ html { line-height: 1.5; /* 1 */ -webkit-text-size-adjust: 100%; /* 2 */ -moz-tab-size: 4; /* 3 */ -o-tab-size: 4; tab-size: 4; /* 3 */ font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; /* 4 */ font-feature-settings: normal; /* 5 */ font-variation-settings: normal; /* 6 */ } /* 1. Remove the margin in all browsers. 2. Inherit line-height from `html` so users can set them as a class directly on the `html` element. */ body { margin: 0; /* 1 */ line-height: inherit; /* 2 */ } /* 1. Add the correct height in Firefox. 2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655) 3. Ensure horizontal rules are visible by default. */ hr { height: 0; /* 1 */ color: inherit; /* 2 */ border-top-width: 1px; /* 3 */ } /* Add the correct text decoration in Chrome, Edge, and Safari. */ abbr:where([title]) { -webkit-text-decoration: underline dotted; text-decoration: underline dotted; } /* Remove the default font size and weight for headings. */ h1, h2, h3, h4, h5, h6 { font-size: inherit; font-weight: inherit; } /* Reset links to optimize for opt-in styling instead of opt-out. */ a { color: inherit; text-decoration: inherit; } /* Add the correct font weight in Edge and Safari. */ b, strong { font-weight: bolder; } /* 1. Use the user's configured `mono` font family by default. 2. Correct the odd `em` font sizing in all browsers. */ code, kbd, samp, pre { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; /* 1 */ font-size: 1em; /* 2 */ } /* Add the correct font size in all browsers. */ small { font-size: 80%; } /* Prevent `sub` and `sup` elements from affecting the line height in all browsers. 
*/ sub, sup { font-size: 75%; line-height: 0; position: relative; vertical-align: baseline; } sub { bottom: -0.25em; } sup { top: -0.5em; } /* 1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297) 2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016) 3. Remove gaps between table borders by default. */ table { text-indent: 0; /* 1 */ border-color: inherit; /* 2 */ border-collapse: collapse; /* 3 */ } /* 1. Change the font styles in all browsers. 2. Remove the margin in Firefox and Safari. 3. Remove default padding in all browsers. */ button, input, optgroup, select, textarea { font-family: inherit; /* 1 */ font-size: 100%; /* 1 */ font-weight: inherit; /* 1 */ line-height: inherit; /* 1 */ color: inherit; /* 1 */ margin: 0; /* 2 */ padding: 0; /* 3 */ } /* Remove the inheritance of text transform in Edge and Firefox. */ button, select { text-transform: none; } /* 1. Correct the inability to style clickable types in iOS and Safari. 2. Remove default button styles. */ button, [type='button'], [type='reset'], [type='submit'] { -webkit-appearance: button; /* 1 */ background-color: transparent; /* 2 */ background-image: none; /* 2 */ } /* Use the modern Firefox focus style for all focusable elements. */ :-moz-focusring { outline: auto; } /* Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737) */ :-moz-ui-invalid { box-shadow: none; } /* Add the correct vertical alignment in Chrome and Firefox. */ progress { vertical-align: baseline; } /* Correct the cursor style of increment and decrement buttons in Safari. */ ::-webkit-inner-spin-button, ::-webkit-outer-spin-button { height: auto; } /* 1. 
Correct the odd appearance in Chrome and Safari. 2. Correct the outline style in Safari. */ [type='search'] { -webkit-appearance: textfield; /* 1 */ outline-offset: -2px; /* 2 */ } /* Remove the inner padding in Chrome and Safari on macOS. */ ::-webkit-search-decoration { -webkit-appearance: none; } /* 1. Correct the inability to style clickable types in iOS and Safari. 2. Change font properties to `inherit` in Safari. */ ::-webkit-file-upload-button { -webkit-appearance: button; /* 1 */ font: inherit; /* 2 */ } /* Add the correct display in Chrome and Safari. */ summary { display: list-item; } /* Removes the default spacing and border for appropriate elements. */ blockquote, dl, dd, h1, h2, h3, h4, h5, h6, hr, figure, p, pre { margin: 0; } fieldset { margin: 0; padding: 0; } legend { padding: 0; } ol, ul, menu { list-style: none; margin: 0; padding: 0; } /* Prevent resizing textareas horizontally by default. */ textarea { resize: vertical; } /* 1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300) 2. Set the default placeholder color to the user's configured gray 400 color. */ input::-moz-placeholder, textarea::-moz-placeholder { opacity: 1; /* 1 */ color: #9ca3af; /* 2 */ } input::placeholder, textarea::placeholder { opacity: 1; /* 1 */ color: #9ca3af; /* 2 */ } /* Set the default cursor for buttons. */ button, [role="button"] { cursor: pointer; } /* Make sure disabled buttons don't get the pointer cursor. */ :disabled { cursor: default; } /* 1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14) 2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210) This can trigger a poorly considered lint error in some tools but is included by design. 
*/ img, svg, video, canvas, audio, iframe, embed, object { display: block; /* 1 */ vertical-align: middle; /* 2 */ } /* Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14) */ img, video { max-width: 100%; height: auto; } /* Make elements with the HTML hidden attribute stay hidden by default */ [hidden] { display: none; } *, ::before, ::after { --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgb(59 130 246 / 0.5); --tw-ring-offset-shadow: 0 0 #0000; --tw-ring-shadow: 0 0 #0000; --tw-shadow: 0 0 #0000; --tw-shadow-colored: 0 0 #0000; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; --tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; } ::backdrop { --tw-border-spacing-x: 0; --tw-border-spacing-y: 0; --tw-translate-x: 0; --tw-translate-y: 0; --tw-rotate: 0; --tw-skew-x: 0; --tw-skew-y: 0; --tw-scale-x: 1; --tw-scale-y: 1; --tw-pan-x: ; --tw-pan-y: ; --tw-pinch-zoom: ; --tw-scroll-snap-strictness: proximity; --tw-ordinal: ; --tw-slashed-zero: ; --tw-numeric-figure: ; --tw-numeric-spacing: ; --tw-numeric-fraction: ; --tw-ring-inset: ; --tw-ring-offset-width: 0px; --tw-ring-offset-color: #fff; --tw-ring-color: rgb(59 130 246 / 0.5); --tw-ring-offset-shadow: 0 0 #0000; --tw-ring-shadow: 0 0 #0000; 
--tw-shadow: 0 0 #0000; --tw-shadow-colored: 0 0 #0000; --tw-blur: ; --tw-brightness: ; --tw-contrast: ; --tw-grayscale: ; --tw-hue-rotate: ; --tw-invert: ; --tw-saturate: ; --tw-sepia: ; --tw-drop-shadow: ; --tw-backdrop-blur: ; --tw-backdrop-brightness: ; --tw-backdrop-contrast: ; --tw-backdrop-grayscale: ; --tw-backdrop-hue-rotate: ; --tw-backdrop-invert: ; --tw-backdrop-opacity: ; --tw-backdrop-saturate: ; --tw-backdrop-sepia: ; } .fixed { position: fixed; } .absolute { position: absolute; } .relative { position: relative; } .inset-y-0 { top: 0px; bottom: 0px; } .bottom-5 { bottom: 1.25rem; } .left-5 { left: 1.25rem; } .right-2 { right: 0.5rem; } .z-50 { z-index: 50; } .m-0 { margin: 0px; } .-mx-2 { margin-left: -0.5rem; margin-right: -0.5rem; } .mt-1 { margin-top: 0.25rem; } .flex { display: flex; } .hidden { display: none; } .w-full { width: 100%; } .flex-1 { flex: 1 1 0%; } .transform { transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y)); } .items-center { align-items: center; } .justify-center { justify-content: center; } .gap-2 { gap: 0.5rem; } .divide-y > :not([hidden]) ~ :not([hidden]) { --tw-divide-y-reverse: 0; border-top-width: calc(1px * calc(1 - var(--tw-divide-y-reverse))); border-bottom-width: calc(1px * var(--tw-divide-y-reverse)); } .rounded-md { border-radius: 0.375rem; } .rounded-b { border-bottom-right-radius: 0.25rem; border-bottom-left-radius: 0.25rem; } .border { border-width: 1px; } .bg-transparent { background-color: transparent; } .bg-gradient-to-br { background-image: linear-gradient(to bottom right, var(--tw-gradient-stops)); } .from-gray-100\/80 { --tw-gradient-from: rgb(243 244 246 / 0.8) var(--tw-gradient-from-position); --tw-gradient-from-position: ; --tw-gradient-to: rgb(243 244 246 / 0) var(--tw-gradient-from-position); --tw-gradient-to-position: ; --tw-gradient-stops: 
var(--tw-gradient-from), var(--tw-gradient-to); } .via-white { --tw-gradient-via-position: ; --tw-gradient-to: rgb(255 255 255 / 0) var(--tw-gradient-to-position); --tw-gradient-to-position: ; --tw-gradient-stops: var(--tw-gradient-from), #fff var(--tw-gradient-via-position), var(--tw-gradient-to); } .to-white { --tw-gradient-to: #fff var(--tw-gradient-to-position); --tw-gradient-to-position: ; } .p-3 { padding: 0.75rem; } .px-2 { padding-left: 0.5rem; padding-right: 0.5rem; } .px-5 { padding-left: 1.25rem; padding-right: 1.25rem; } .py-3 { padding-top: 0.75rem; padding-bottom: 0.75rem; } .pl-5 { padding-left: 1.25rem; } .pr-8 { padding-right: 2rem; } .font-sans { font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; } .text-sm { font-size: 0.875rem; line-height: 1.25rem; } .text-xs { font-size: 0.75rem; line-height: 1rem; } .font-bold { font-weight: 700; } .text-gray-400 { --tw-text-opacity: 1; color: rgb(156 163 175 / var(--tw-text-opacity)); } .text-gray-600 { --tw-text-opacity: 1; color: rgb(75 85 99 / var(--tw-text-opacity)); } .text-gray-700 { --tw-text-opacity: 1; color: rgb(55 65 81 / var(--tw-text-opacity)); } .text-gray-800 { --tw-text-opacity: 1; color: rgb(31 41 55 / var(--tw-text-opacity)); } .shadow { --tw-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1); --tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color); box-shadow: var(--tw-ring-offset-shadow, 0 0 #0000), var(--tw-ring-shadow, 0 0 #0000), var(--tw-shadow); } .backdrop-blur-sm { --tw-backdrop-blur: blur(4px); -webkit-backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) 
var(--tw-backdrop-sepia); backdrop-filter: var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia); } .transition { transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, -webkit-backdrop-filter; transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter; transition-property: color, background-color, border-color, text-decoration-color, fill, stroke, opacity, box-shadow, transform, filter, backdrop-filter, -webkit-backdrop-filter; transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1); transition-duration: 150ms; } .delay-200 { transition-delay: 200ms; } .duration-300 { transition-duration: 300ms; } .hover\:bg-gray-100:hover { --tw-bg-opacity: 1; background-color: rgb(243 244 246 / var(--tw-bg-opacity)); } .focus\:outline-none:focus { outline: 2px solid transparent; outline-offset: 2px; } @media (prefers-color-scheme: dark) { .dark\:divide-gray-700 > :not([hidden]) ~ :not([hidden]) { --tw-divide-opacity: 1; border-color: rgb(55 65 81 / var(--tw-divide-opacity)); } .dark\:border-gray-700 { --tw-border-opacity: 1; border-color: rgb(55 65 81 / var(--tw-border-opacity)); } .dark\:from-gray-900\/80 { --tw-gradient-from: rgb(17 24 39 / 0.8) var(--tw-gradient-from-position); --tw-gradient-from-position: ; --tw-gradient-to: rgb(17 24 39 / 0) var(--tw-gradient-from-position); --tw-gradient-to-position: ; --tw-gradient-stops: var(--tw-gradient-from), var(--tw-gradient-to); } .dark\:via-gray-900 { --tw-gradient-via-position: ; --tw-gradient-to: rgb(17 24 39 / 0) var(--tw-gradient-to-position); --tw-gradient-to-position: ; --tw-gradient-stops: var(--tw-gradient-from), #111827 var(--tw-gradient-via-position), 
var(--tw-gradient-to); } .dark\:to-gray-900 { --tw-gradient-to: #111827 var(--tw-gradient-to-position); --tw-gradient-to-position: ; } .dark\:text-gray-200 { --tw-text-opacity: 1; color: rgb(229 231 235 / var(--tw-text-opacity)); } .dark\:text-gray-300 { --tw-text-opacity: 1; color: rgb(209 213 219 / var(--tw-text-opacity)); } .dark\:text-gray-500 { --tw-text-opacity: 1; color: rgb(107 114 128 / var(--tw-text-opacity)); } .dark\:text-white { --tw-text-opacity: 1; color: rgb(255 255 255 / var(--tw-text-opacity)); } .dark\:hover\:bg-gray-800\/70:hover { background-color: rgb(31 41 55 / 0.7); } } @media (min-width: 768px) { .md\:pl-0 { padding-left: 0px; } } ================================================ FILE: extensions/web-widget/index.html ================================================ Chat Widget Test ================================================ FILE: extensions/web-widget/package.json ================================================ { "name": "web-widget", "version": "1.0.0", "description": "", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": [], "author": "", "license": "ISC", "devDependencies": { "tailwindcss": "^3.3.1" } } ================================================ FILE: extensions/web-widget/src/html/widget.html ================================================

Looking for help with documentation?

DocsGPT AI assistant will help you with docs

================================================ FILE: extensions/web-widget/src/input.css ================================================ @tailwind base; @tailwind components; @tailwind utilities; ================================================ FILE: extensions/web-widget/src/js/script.js ================================================ const API_ENDPOINT = "http://localhost:7091/api/answer"; // Replace with your API endpoint const widgetInitMessage = document.getElementById("docsgpt-init-message"); const widgetAnswerMessage = document.getElementById("docsgpt-answer"); const widgetAnswerMessageP = widgetAnswerMessage.querySelector("p"); const askDocsGPTButton = document.getElementById("ask-docsgpt"); const chatInput = document.getElementById("docsgpt-chat-input"); const chatForm = document.getElementById("docsgpt-chat-form"); const chatProcessing = document.getElementById("docsgpt-chat-processing"); async function sendMessage(message) { const requestData = { "question": message, "active_docs": "default", "api_key": "token", "embeddings_key": "token", "model": "default", "history": null, } const response = await fetch(API_ENDPOINT, { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify(requestData), }); const data = await response.json(); return data.answer; } askDocsGPTButton.addEventListener("click", () => { askDocsGPTButton.classList.add("hidden"); chatForm.classList.remove("hidden"); chatForm.focus(); widgetInitMessage.classList.remove("hidden"); widgetAnswerMessage.classList.add("hidden"); }); chatForm.addEventListener("submit", async (e) => { e.preventDefault(); const message = chatInput.value.trim(); if (!message) return; chatInput.value = ""; chatForm.classList.add("hidden"); chatProcessing.classList.remove("hidden"); const reply = await sendMessage(message); chatProcessing.classList.add("hidden"); // inside

tag widgetAnswerMessageP.innerHTML = reply; widgetAnswerMessage.classList.remove("hidden"); widgetInitMessage.classList.add("hidden"); askDocsGPTButton.classList.remove("hidden"); }); ================================================ FILE: extensions/web-widget/tailwind.config.js ================================================ /** @type {import('tailwindcss').Config} */ module.exports = { content: ["./src/**/*.{html,js}"], theme: { extend: {}, }, plugins: [], } ================================================ FILE: frontend/.husky/pre-commit ================================================ #!/usr/bin/env sh . "$(dirname -- "$0")/_/husky.sh" # npm test cd frontend npx lint-staged ================================================ FILE: frontend/.prettierignore ================================================ node_modules/ dist/ prettier.config.cjs .eslintrc.cjs env.d.ts public/ assets/ vite-env.d.ts .prettierignore package-lock.json package.json postcss.config.cjs prettier.config.cjs tailwind.config.cjs tsconfig.json tsconfig.node.json vite.config.ts ================================================ FILE: frontend/Dockerfile ================================================ FROM node:22-bullseye-slim WORKDIR /app COPY package*.json ./ RUN npm install COPY . . 
EXPOSE 5173 CMD [ "npm", "run", "dev", "--" , "--host"] ================================================ FILE: frontend/components.json ================================================ { "$schema": "https://ui.shadcn.com/schema.json", "style": "new-york", "rsc": false, "tsx": true, "tailwind": { "config": "", "css": "src/index.css", "baseColor": "neutral", "cssVariables": true, "prefix": "" }, "iconLibrary": "lucide", "aliases": { "components": "@/components", "utils": "@/lib/utils", "ui": "@/components/ui", "lib": "@/lib", "hooks": "@/hooks" }, "registries": {} } ================================================ FILE: frontend/eslint.config.js ================================================ import js from '@eslint/js' import tsParser from '@typescript-eslint/parser' import tsPlugin from '@typescript-eslint/eslint-plugin' import react from 'eslint-plugin-react' import unusedImports from 'eslint-plugin-unused-imports' import prettier from 'eslint-plugin-prettier' import globals from 'globals' export default [ { ignores: [ 'node_modules/', 'dist/', 'prettier.config.cjs', '.eslintrc.cjs', 'env.d.ts', 'public/', 'assets/', 'vite-env.d.ts', '.prettierignore', 'package-lock.json', 'package.json', 'postcss.config.cjs', 'tailwind.config.cjs', 'tsconfig.json', 'tsconfig.node.json', 'vite.config.ts', ], }, { files: ['**/*.{js,jsx,ts,tsx}'], languageOptions: { ecmaVersion: 'latest', sourceType: 'module', parser: tsParser, parserOptions: { ecmaFeatures: { jsx: true, }, }, globals: { ...globals.browser, ...globals.es2021, ...globals.node, }, }, plugins: { '@typescript-eslint': tsPlugin, react, 'unused-imports': unusedImports, prettier, }, rules: { ...js.configs.recommended.rules, ...tsPlugin.configs.recommended.rules, ...react.configs.recommended.rules, ...prettier.configs.recommended.rules, 'react/prop-types': 'off', 'unused-imports/no-unused-imports': 'error', 'react/react-in-jsx-scope': 'off', 'no-undef': 'off', '@typescript-eslint/no-explicit-any': 'warn', 
'@typescript-eslint/no-unused-vars': 'warn', '@typescript-eslint/no-unused-expressions': 'warn', 'prettier/prettier': [ 'error', { endOfLine: 'auto', }, ], }, settings: { react: { version: 'detect', }, }, }, ] ================================================ FILE: frontend/index.html ================================================ DocsGPT

================================================ FILE: frontend/package.json ================================================ { "name": "frontend", "private": true, "version": "0.0.0", "type": "module", "scripts": { "dev": "vite", "build": "tsc && vite build", "preview": "vite preview", "lint": "eslint ./src --ext .jsx,.js,.ts,.tsx", "lint-fix": "eslint ./src --ext .jsx,.js,.ts,.tsx --fix", "format": "prettier ./src --write", "prepare": "cd .. && husky install frontend/.husky" }, "lint-staged": { "**/*.{js,jsx,ts,tsx}": [ "npm run lint-fix", "npm run format" ] }, "dependencies": { "@radix-ui/react-dialog": "^1.1.15", "@radix-ui/react-popover": "^1.1.15", "@radix-ui/react-select": "^2.2.6", "@radix-ui/react-slot": "^1.2.4", "@reduxjs/toolkit": "^2.10.1", "chart.js": "^4.4.4", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "cmdk": "^1.1.1", "copy-to-clipboard": "^3.3.3", "i18next": "^25.8.18", "i18next-browser-languagedetector": "^8.2.0", "lodash": "^4.17.21", "lucide-react": "^0.562.0", "mermaid": "^11.12.1", "prop-types": "^15.8.1", "radix-ui": "^1.4.3", "react": "^19.1.0", "react-chartjs-2": "^5.3.0", "react-dom": "^19.1.1", "react-dropzone": "^14.3.8", "react-google-drive-picker": "^1.2.2", "react-i18next": "^16.2.4", "react-markdown": "^9.0.1", "react-redux": "^9.2.0", "react-router-dom": "^7.6.1", "react-syntax-highlighter": "^16.1.1", "reactflow": "^11.11.4", "rehype-katex": "^7.0.1", "remark-gfm": "^4.0.0", "remark-math": "^6.0.0", "tailwind-merge": "^3.4.0" }, "devDependencies": { "@tailwindcss/postcss": "^4.1.10", "@types/lodash": "^4.17.20", "@types/react": "^19.1.8", "@types/react-dom": "^19.1.7", "@types/react-syntax-highlighter": "^15.5.13", "@typescript-eslint/eslint-plugin": "^8.46.3", "@typescript-eslint/parser": "^8.46.3", "@vitejs/plugin-react": "^6.0.1", "eslint": "^9.39.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-import": "^2.31.0", "eslint-plugin-n": "^17.23.1", "eslint-plugin-prettier": "^5.5.4", "eslint-plugin-promise": 
"^6.6.0", "eslint-plugin-react": "^7.37.5", "eslint-plugin-unused-imports": "^4.1.4", "husky": "^9.1.7", "lint-staged": "^16.4.0", "postcss": "^8.4.49", "prettier": "^3.5.3", "prettier-plugin-tailwindcss": "^0.7.1", "tailwindcss": "^4.2.1", "tw-animate-css": "^1.4.0", "typescript": "^5.8.3", "vite": "^8.0.0", "vite-plugin-svgr": "^4.3.0" } } ================================================ FILE: frontend/postcss.config.cjs ================================================ module.exports = { plugins: { '@tailwindcss/postcss': {}, }, } ================================================ FILE: frontend/prettier.config.cjs ================================================ module.exports = { trailingComma: 'all', tabWidth: 2, semi: true, singleQuote: true, printWidth: 80, plugins: ['prettier-plugin-tailwindcss'], }; ================================================ FILE: frontend/src/App.tsx ================================================ import './locale/i18n'; import { useState } from 'react'; import { Outlet, Route, Routes } from 'react-router-dom'; import Agents from './agents'; import SharedAgentGate from './agents/SharedAgentGate'; import ActionButtons from './components/ActionButtons'; import Spinner from './components/Spinner'; import UploadToast from './components/UploadToast'; import Conversation from './conversation/Conversation'; import { SharedConversation } from './conversation/SharedConversation'; import { useDarkTheme, useMediaQuery } from './hooks'; import useDataInitializer from './hooks/useDataInitializer'; import useTokenAuth from './hooks/useTokenAuth'; import Navigation from './Navigation'; import PageNotFound from './PageNotFound'; import Setting from './settings'; import Notification from './components/Notification'; function AuthWrapper({ children }: { children: React.ReactNode }) { const { isAuthLoading } = useTokenAuth(); useDataInitializer(isAuthLoading); if (isAuthLoading) { return (
); } return <>{children}; } function MainLayout() { const { isMobile, isTablet } = useMediaQuery(); const [navOpen, setNavOpen] = useState(!(isMobile || isTablet)); return (
); } export default function App() { const [, , componentMounted] = useDarkTheme(); const [showNotification, setShowNotification] = useState(() => { const saved = localStorage.getItem('showNotification'); return saved ? JSON.parse(saved) : true; }); const notificationText = import.meta.env.VITE_NOTIFICATION_TEXT; const notificationLink = import.meta.env.VITE_NOTIFICATION_LINK; if (!componentMounted) { return
; } return (
{notificationLink && notificationText && showNotification && ( { setShowNotification(false); localStorage.setItem('showNotification', 'false'); }} /> )} } > } /> } /> } /> } /> } /> } />
); } ================================================ FILE: frontend/src/Hero.tsx ================================================ import { useTranslation } from 'react-i18next'; import DocsGPT3 from './assets/cute_docsgpt3.svg'; import DropdownModel from './components/DropdownModel'; export default function Hero({ handleQuestion, }: { handleQuestion: ({ question, isRetry, }: { question: string; isRetry?: boolean; }) => void; }) { const { t } = useTranslation(); const demos = t('demo', { returnObjects: true }) as Array<{ header: string; query: string; }>; return (
{/* Header Section */}
DocsGPT docsgpt
{/* Model Selector */}
{/* Demo Buttons Section */}
{demos?.map( (demo: { header: string; query: string }, key: number) => demo.header && demo.query && ( ), )}
); } ================================================ FILE: frontend/src/Navigation.tsx ================================================ import { useEffect, useRef, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { useDispatch, useSelector } from 'react-redux'; import { NavLink, useNavigate } from 'react-router-dom'; import { Agent } from './agents/types'; import conversationService from './api/services/conversationService'; import userService from './api/services/userService'; import Add from './assets/add.svg'; import DocsGPT3 from './assets/cute_docsgpt3.svg'; import Discord from './assets/discord.svg'; import PanelLeftClose from './assets/panel-left-close.svg'; import PanelLeftOpen from './assets/panel-left-open.svg'; import Github from './assets/git_nav.svg'; import Hamburger from './assets/hamburger.svg'; import openNewChat from './assets/openNewChat.svg'; import Pin from './assets/pin.svg'; import AgentImage from './components/AgentImage'; import SettingGear from './assets/settingGear.svg'; import Spark from './assets/spark.svg'; import SpinnerDark from './assets/spinner-dark.svg'; import Spinner from './assets/spinner.svg'; import Twitter from './assets/TwitterX.svg'; import UnPin from './assets/unpin.svg'; import Help from './components/Help'; import { handleAbort, selectQueries, setConversation, updateConversationId, } from './conversation/conversationSlice'; import ConversationTile from './conversation/ConversationTile'; import { useDarkTheme, useMediaQuery } from './hooks'; import useTokenAuth from './hooks/useTokenAuth'; import DeleteConvModal from './modals/DeleteConvModal'; import JWTModal from './modals/JWTModal'; import { ActiveState } from './models/misc'; import { getConversations } from './preferences/preferenceApi'; import { selectAgents, selectConversationId, selectConversations, selectModalStateDeleteConv, selectSelectedAgent, selectSharedAgents, selectToken, setAgents, setConversations, 
setModalStateDeleteConv, setSelectedAgent, setSharedAgents, } from './preferences/preferenceSlice'; import Upload from './upload/Upload'; interface NavigationProps { navOpen: boolean; setNavOpen: React.Dispatch>; } export default function Navigation({ navOpen, setNavOpen }: NavigationProps) { const dispatch = useDispatch(); const navigate = useNavigate(); const { t } = useTranslation(); const token = useSelector(selectToken); const queries = useSelector(selectQueries); const conversations = useSelector(selectConversations); const conversationId = useSelector(selectConversationId); const modalStateDeleteConv = useSelector(selectModalStateDeleteConv); const agents = useSelector(selectAgents); const sharedAgents = useSelector(selectSharedAgents); const selectedAgent = useSelector(selectSelectedAgent); const { isMobile, isTablet } = useMediaQuery(); const [isDarkTheme] = useDarkTheme(); const { showTokenModal, handleTokenSubmit } = useTokenAuth(); const [isDeletingConversation, setIsDeletingConversation] = useState(false); const [uploadModalState, setUploadModalState] = useState('INACTIVE'); const [recentAgents, setRecentAgents] = useState([]); const navRef = useRef(null); useEffect(() => { function handleClickOutside(event: MouseEvent) { if ( navRef.current && !navRef.current.contains(event.target as Node) && (isMobile || isTablet) && navOpen ) { setNavOpen(false); } } //event listener only for mobile/tablet when nav is open if ((isMobile || isTablet) && navOpen) { document.addEventListener('mousedown', handleClickOutside); return () => { document.removeEventListener('mousedown', handleClickOutside); }; } }, [navOpen, isMobile, isTablet, setNavOpen]); async function fetchRecentAgents() { try { const response = await userService.getPinnedAgents(token); if (!response.ok) throw new Error('Failed to fetch pinned agents'); const pinnedAgents: Agent[] = await response.json(); if (pinnedAgents.length >= 3) { setRecentAgents(pinnedAgents); return; } let tempAgents: Agent[] = 
[];
      if (!agents) {
        // Agents are not in the store yet: load them once and cache them.
        const response = await userService.getAgents(token);
        if (!response.ok) throw new Error('Failed to fetch agents');
        const data: Agent[] = await response.json();
        dispatch(setAgents(data));
        tempAgents = data;
      } else tempAgents = agents;
      // Fill the remaining slots (three in total) with published agents that
      // are not pinned yet, most recently used first.
      const additionalAgents = tempAgents
        .filter(
          (agent: Agent) =>
            agent.status === 'published' &&
            !pinnedAgents.some((pinned) => pinned.id === agent.id),
        )
        .sort(
          (a: Agent, b: Agent) =>
            new Date(b.last_used_at ?? 0).getTime() -
            new Date(a.last_used_at ?? 0).getTime(),
        )
        .slice(0, 3 - pinnedAgents.length);
      setRecentAgents([...pinnedAgents, ...additionalAgents]);
    } catch (error) {
      console.error('Failed to fetch recent agents: ', error);
    }
  }

  /**
   * Reloads the conversation list into the store.
   * Marks the list as loading first; on failure the list is cleared and the
   * loading flag dropped. The returned promise never rejects.
   */
  async function fetchConversations() {
    dispatch(setConversations({ ...conversations, loading: true }));
    return await getConversations(token)
      .then((fetchedConversations) => {
        dispatch(setConversations(fetchedConversations));
      })
      .catch((error) => {
        console.error('Failed to fetch conversations: ', error);
        dispatch(setConversations({ data: null, loading: false }));
      });
  }

  useEffect(() => {
    fetchRecentAgents();
  }, [agents, sharedAgents, token, dispatch]);

  useEffect(() => {
    if (queries.length === 0) resetConversation();
  }, [conversations?.data, dispatch]);

  /**
   * Deletes every conversation and refreshes the list.
   * `isDeletingConversation` is raised for the duration of the delete and the
   * follow-up refetch, and is always lowered again afterwards so that later
   * loads are not stuck with the flag set.
   */
  const handleDeleteAllConversations = () => {
    setIsDeletingConversation(true);
    conversationService
      .deleteAll(token)
      .then(() => fetchConversations())
      .catch((error) => console.error(error))
      .finally(() => setIsDeletingConversation(false));
  };

  /**
   * Deletes one conversation, refreshes the list and resets the active chat.
   * The deleting flag is lowered once the refetch has settled (or the delete
   * failed).
   */
  const handleDeleteConversation = (id: string) => {
    setIsDeletingConversation(true);
    conversationService
      .delete(id, {}, token)
      .then(() => {
        // Start the refetch first, then reset, exactly as before; the promise
        // is returned only so `finally` waits for the refetch.
        const refresh = fetchConversations();
        resetConversation();
        return refresh;
      })
      .catch((error) => console.error(error))
      .finally(() => setIsDeletingConversation(false));
  };

  /** Starts a fresh chat with the given agent; toggles the nav on small screens. */
  const handleAgentClick = (agent: Agent) => {
    resetConversation();
    dispatch(setSelectedAgent(agent));
    if (isMobile || isTablet) setNavOpen(!navOpen);
    navigate('/');
  };

  /**
   * Toggles the pinned state of an agent on the server and, when the request
   * succeeds, mirrors the change in both the own and the shared agent lists.
   */
  const handleTogglePin = (agent: Agent) => {
    userService
      .togglePinAgent(agent.id ?? '', token)
      .then((response) => {
        if (response.ok) {
          const updatePinnedStatus = (a: Agent) =>
            a.id === agent.id ? { ...a, pinned: !a.pinned } : a;
          dispatch(setAgents(agents?.map(updatePinnedStatus)));
          dispatch(setSharedAgents(sharedAgents?.map(updatePinnedStatus)));
        }
      })
      // Without a handler a failed request would be an unhandled rejection.
      .catch((error) => console.error('Failed to toggle pin: ', error));
  };

  /**
   * Loads a conversation by id and routes to the matching view:
   * plain chat, the owning agent, or the shared-agent page.
   * Any failed request falls back to the root route.
   */
  const handleConversationClick = async (index: string) => {
    try {
      dispatch(setSelectedAgent(null));
      const response = await conversationService.getConversation(index, token);
      if (!response.ok) {
        navigate('/');
        return;
      }
      const data = await response.json();
      if (!data) return;
      dispatch(setConversation(data.queries));
      dispatch(updateConversationId({ query: { conversationId: index } }));
      if (!data.agent_id) {
        navigate('/');
        return;
      }
      let agent: Agent;
      if (data.is_shared_usage) {
        const sharedResponse = await userService.getSharedAgent(
          data.shared_token,
          token,
        );
        if (!sharedResponse.ok) {
          navigate('/');
          return;
        }
        agent = await sharedResponse.json();
        navigate(`/agents/shared/${agent.shared_token}`);
      } else {
        const agentResponse = await userService.getAgent(data.agent_id, token);
        if (!agentResponse.ok) {
          navigate('/');
          return;
        }
        agent = await agentResponse.json();
        if (agent.shared_token) {
          navigate(`/agents/shared/${agent.shared_token}`);
        } else {
          await Promise.resolve(dispatch(setSelectedAgent(agent)));
          navigate('/');
        }
      }
    } catch (error) {
      console.error('Error handling conversation click:', error);
      navigate('/');
    }
  };

  /** Aborts any running answer and clears queries, conversation id and agent. */
  const resetConversation = () => {
    handleAbort();
    dispatch(setConversation([]));
    dispatch(
      updateConversationId({
        query: { conversationId: null },
      }),
    );
    dispatch(setSelectedAgent(null));
  };

  /** Resets the conversation only when there is something to reset. */
  const newChat = () => {
    if (queries && queries?.length > 0) {
      resetConversation();
    }
  };

  /** Persists a renamed conversation, then returns to root and refetches. */
  async function updateConversationName(updatedConversation: {
    name: string;
    id: string;
  }) {
    await conversationService
      .update(updatedConversation, token)
      .then((response) => response.json())
      .then((data) => {
        if (data) {
          navigate('/');
          fetchConversations();
        }
      })
      .catch((err) => {
        console.error(err);
      });
  }

  useEffect(()
=> { setNavOpen(!(isMobile || isTablet)); }, [isMobile, isTablet]); return ( <> {(isMobile || isTablet) && navOpen && (
setNavOpen(false)} /> )} {
{!navOpen && ( )} {queries?.length > 0 && ( )}
DocsGPT
}
{ if (isMobile) { setNavOpen(!navOpen); } }} > DocsGPT Logo

DocsGPT

{ if (isMobile || isTablet) { setNavOpen(!navOpen); } resetConversation(); }} className={({ isActive }) => `${ isActive ? 'bg-transparent' : '' } group border-silver hover:border-rainy-gray dark:border-purple-taupe sticky mx-4 mt-4 flex cursor-pointer gap-2.5 rounded-3xl border p-3 hover:bg-transparent dark:text-white` } > Create new chat

{t('newChat')}

{conversations?.loading && !isDeletingConversation && (
Loading conversations
)} {recentAgents?.length > 0 ? (

{t('navigation.agents')}

{recentAgents.map((agent, idx) => (
handleAgentClick(agent)} >

{agent.name}

))}
{ dispatch(setSelectedAgent(null)); if (isMobile || isTablet) { setNavOpen(false); } navigate('/agents'); }} >
manage-agents

{t('manageAgents')}

) : (
{ if (isMobile || isTablet) { setNavOpen(false); } dispatch(setSelectedAgent(null)); navigate('/agents'); }} >
manage-agents

{t('manageAgents')}

)} {conversations?.data && conversations.data.length > 0 ? (

{t('chats')}

{conversations.data?.map((conversation) => ( handleConversationClick(id)} onConversationClick={() => { if (isMobile) { setNavOpen(false); } }} onDeleteConversation={(id) => handleDeleteConversation(id)} onSave={(conversation) => updateConversationName(conversation) } /> ))}
) : ( <> )}
{ if (isMobile || isTablet) { setNavOpen(false); } resetConversation(); }} to="/settings" className={({ isActive }) => `mx-4 my-auto flex h-9 cursor-pointer items-center gap-4 rounded-3xl hover:bg-gray-100 dark:hover:bg-[#28292E] ${ isActive ? 'bg-gray-3000 dark:bg-transparent' : '' }` } > Settings

{t('settings.label')}

Join Discord community Follow us on X View on GitHub
DocsGPT
{uploadModalState === 'ACTIVE' && ( setUploadModalState('INACTIVE')} > )} ); } ================================================ FILE: frontend/src/PageNotFound.tsx ================================================ import { Link } from 'react-router-dom'; import { useTranslation } from 'react-i18next'; export default function PageNotFound() { const { t } = useTranslation(); return (

{t('pageNotFound.title')}

{t('pageNotFound.message')}

);
}

================================================
FILE: frontend/src/agents/AgentCard.tsx
================================================
import { SyntheticEvent, useRef, useState } from 'react';
import { useDispatch, useSelector } from 'react-redux';
import { useNavigate } from 'react-router-dom';
import { useTranslation } from 'react-i18next';

import userService from '../api/services/userService';
import Duplicate from '../assets/duplicate.svg';
import Edit from '../assets/edit.svg';
import FolderIcon from '../assets/folder.svg';
import Link from '../assets/link-gray.svg';
import Monitoring from '../assets/monitoring.svg';
import Pin from '../assets/pin.svg';
import Trash from '../assets/red-trash.svg';
import ThreeDots from '../assets/three-dots.svg';
import UnPin from '../assets/unpin.svg';
import AgentImage from '../components/AgentImage';
import ContextMenu, { MenuOption } from '../components/ContextMenu';
import ConfirmationModal from '../modals/ConfirmationModal';
import MoveToFolderModal from '../modals/MoveToFolderModal';
import { ActiveState } from '../models/misc';
import {
  selectAgents,
  selectToken,
  setAgents,
  setSelectedAgent,
} from '../preferences/preferenceSlice';
import { Agent } from './types';

// Props: the agent to render, the list it belongs to, an optional callback
// that receives the updated list, and the section key that selects the menu.
type AgentCardProps = {
  agent: Agent;
  agents: Agent[];
  updateAgents?: (agents: Agent[]) => void;
  section: string;
};

/**
 * Card for a single agent. Builds the context-menu entries for the card's
 * section ('template', 'user' or 'shared') and the handlers behind them
 * (pin, hide, delete, duplicate, move to folder).
 */
export default function AgentCard({
  agent,
  agents,
  updateAgents,
  section,
}: AgentCardProps) {
  const { t } = useTranslation();
  const navigate = useNavigate();
  const dispatch = useDispatch();
  const token = useSelector(selectToken);
  const userAgents = useSelector(selectAgents);

  const [isMenuOpen, setIsMenuOpen] = useState(false);
  const [deleteConfirmation, setDeleteConfirmation] = useState('INACTIVE');
  const [moveModalState, setMoveModalState] = useState('INACTIVE');

  const menuRef = useRef(null);

  // Menu entries keyed by section. Every handler stops propagation so the
  // click does not also reach an enclosing click handler.
  const menuOptionsConfig: Record = {
    template: [
      {
        icon: Duplicate,
        label: 'Duplicate',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          handleDuplicate();
        },
        variant: 'primary',
        iconWidth: 18,
        iconHeight: 18,
      },
    ],
    user: [
      {
        icon: Monitoring,
        label: 'Logs',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          navigate(`/agents/logs/${agent.id}`);
        },
        variant: 'primary',
        iconWidth: 14,
        iconHeight: 14,
      },
      {
        icon: Edit,
        label: 'Edit',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          // Workflow agents are edited on a separate route.
          if (agent.agent_type === 'workflow') {
            navigate(`/agents/workflow/edit/${agent.id}`);
          } else {
            navigate(`/agents/edit/${agent.id}`);
          }
        },
        variant: 'primary',
        iconWidth: 14,
        iconHeight: 14,
      },
      // The pin entry is only offered for published agents.
      ...(agent.status === 'published'
        ? [
            {
              icon: agent.pinned ? UnPin : Pin,
              label: agent.pinned ? 'Unpin' : 'Pin agent',
              onClick: (e: SyntheticEvent) => {
                e.stopPropagation();
                togglePin();
              },
              variant: 'primary' as const,
              iconWidth: 18,
              iconHeight: 18,
            },
          ]
        : []),
      {
        icon: FolderIcon,
        label: t('agents.folders.moveToFolder'),
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          setMoveModalState('ACTIVE');
          setIsMenuOpen(false);
        },
        variant: 'primary',
        iconWidth: 16,
        iconHeight: 15,
      },
      {
        icon: Trash,
        label: 'Delete',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          setDeleteConfirmation('ACTIVE');
        },
        variant: 'danger',
        iconWidth: 13,
        iconHeight: 13,
      },
    ],
    shared: [
      {
        icon: Link,
        label: 'Open',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          navigate(`/agents/shared/${agent.shared_token}`);
        },
        variant: 'primary',
        iconWidth: 12,
        iconHeight: 12,
      },
      {
        icon: agent.pinned ? UnPin : Pin,
        label: agent.pinned ? 'Unpin' : 'Pin agent',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          togglePin();
        },
        variant: 'primary',
        iconWidth: 18,
        iconHeight: 18,
      },
      {
        icon: Trash,
        label: 'Remove',
        onClick: (e: SyntheticEvent) => {
          e.stopPropagation();
          handleHideSharedAgent();
        },
        variant: 'danger',
        iconWidth: 13,
        iconHeight: 13,
      },
    ],
  };

  // Unknown sections get an empty menu.
  const menuOptions = menuOptionsConfig[section] || [];

  // Card click: a published user agent is selected and opened at '/';
  // a shared agent opens its shared route. Other cases do nothing.
  const handleClick = () => {
    if (section === 'user') {
      if (agent.status === 'published') {
        dispatch(setSelectedAgent(agent));
        navigate(`/`);
      }
    }
    if (section === 'shared') {
      navigate(`/agents/shared/${agent.shared_token}`);
    }
  };

  // Toggles the pin on the server, then hands a copy of `agents` with this
  // agent's `pinned` flag flipped to `updateAgents`. Errors are only logged.
  const togglePin = async () => {
    try {
      const response = await userService.togglePinAgent(agent.id ?? '', token);
      if (!response.ok) throw new Error('Failed to pin agent');
      const updatedAgents = agents.map((prevAgent) => {
        if (prevAgent.id === agent.id)
          return { ...prevAgent, pinned: !prevAgent.pinned };
        return prevAgent;
      });
      updateAgents?.(updatedAgents);
    } catch (error) {
      console.error('Error:', error);
    }
  };

  // Removes the shared agent on the server, then drops it from the list
  // passed to `updateAgents`. Errors are only logged.
  const handleHideSharedAgent = async () => {
    try {
      const response = await userService.removeSharedAgent(
        agent.id ?? '',
        token,
      );
      if (!response.ok) throw new Error('Failed to hide shared agent');
      const updatedAgents = agents.filter(
        (prevAgent) => prevAgent.id !== agent.id,
      );
      updateAgents?.(updatedAgents);
    } catch (error) {
      console.error('Error:', error);
    }
  };

  // Deletes the agent on the server, then drops it from the list passed to
  // `updateAgents`. Errors are only logged.
  const handleDelete = async () => {
    try {
      const response = await userService.deleteAgent(agent.id ?? '', token);
      if (!response.ok) throw new Error('Failed to delete agent');
      const updatedAgents = agents.filter(
        (prevAgent) => prevAgent.id !== agent.id,
      );
      updateAgents?.(updatedAgents);
    } catch (error) {
      console.error('Error:', error);
    }
  };

  // Calls `adoptAgent` and appends the returned `data.agent` to the user's
  // agents in the store (or starts a new list when the store has none).
  const handleDuplicate = async () => {
    try {
      const response = await userService.adoptAgent(agent.id ?? '', token);
      if (!response.ok) throw new Error('Failed to duplicate agent');
      const data = await response.json();
      if (userAgents) {
        const updatedAgents = [...userAgents, data.agent];
        dispatch(setAgents(updatedAgents));
      } else dispatch(setAgents([data.agent]));
    } catch (error) {
      console.error('Error:', error);
    }
  };

  // Records the new folder on this agent; a null folder id is stored as
  // undefined.
  const handleMoveSuccess = (folderId: string | null) => {
    const updatedAgents = agents.map((prevAgent) => {
      if (prevAgent.id === agent.id) {
        return { ...prevAgent, folder_id: folderId ?? undefined };
      }
      return prevAgent;
    });
    updateAgents?.(updatedAgents);
  };

  return (
{ e.stopPropagation(); handleClick(); }} >
{ e.stopPropagation(); setIsMenuOpen(true); }} className="absolute top-4 right-4 z-10 cursor-pointer" > {'use-agent'}
{agent.status === 'draft' && (

{`(Draft)`}

)}

{agent.name}

{agent.description}

{ handleDelete(); setDeleteConfirmation('INACTIVE'); }} cancelLabel="Cancel" variant="danger" />
);
}

================================================
FILE: frontend/src/agents/AgentLogs.tsx
================================================
import { useEffect, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { useSelector } from 'react-redux';
import { useNavigate, useParams } from 'react-router-dom';

import userService from '../api/services/userService';
import ArrowLeft from '../assets/arrow-left.svg';
import Spinner from '../components/Spinner';
import { selectToken } from '../preferences/preferenceSlice';
import Analytics from '../settings/Analytics';
import Logs from '../settings/Logs';
import { Agent } from './types';

/**
 * Logs page for one agent. Reads `agentId` from the route, loads the agent
 * and tracks a loading flag while the request is in flight.
 */
export default function AgentLogs() {
  const { t } = useTranslation();
  const navigate = useNavigate();
  const { agentId } = useParams();
  const token = useSelector(selectToken);

  const [agent, setAgent] = useState();
  const [loadingAgent, setLoadingAgent] = useState(true);

  /**
   * Fetches the agent with the given id and stores it in state.
   * Failures are logged; the loading flag is cleared in every case.
   */
  const fetchAgent = async (id: string) => {
    setLoadingAgent(true);
    try {
      const response = await userService.getAgent(id, token);
      // The request is for a single agent, so the message names it as such.
      if (!response.ok) throw new Error('Failed to fetch agent');
      const fetchedAgent = await response.json();
      setAgent(fetchedAgent);
    } catch (error) {
      console.error(error);
    } finally {
      setLoadingAgent(false);
    }
  };

  // Reload whenever the route parameter or the auth token changes.
  useEffect(() => {
    if (agentId) fetchAgent(agentId);
  }, [agentId, token]);

  return (

{t('agents.backToAll')}

{t('agents.logs.title')}

{agent && (

{agent.name}

{agent.last_used_at ? t('agents.logs.lastUsedAt') + ' ' + new Date(agent.last_used_at).toLocaleString() : t('agents.logs.noUsageHistory')}

)}
{loadingAgent ? (
) : ( agent && )} {loadingAgent ? (
{' '}
) : ( agent && ( ) )}
);
}

================================================
FILE: frontend/src/agents/AgentPreview.tsx
================================================
import { useCallback, useEffect, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { useDispatch, useSelector } from 'react-redux';

import MessageInput from '../components/MessageInput';
import ConversationMessages from '../conversation/ConversationMessages';
import { Query } from '../conversation/conversationModels';
import { selectSelectedAgent } from '../preferences/preferenceSlice';
import { AppDispatch } from '../store';
import {
  addQuery,
  fetchPreviewAnswer,
  handlePreviewAbort,
  resendQuery,
  resetPreview,
  selectPreviewQueries,
  selectPreviewStatus,
} from './agentPreviewSlice';

/**
 * Preview chat for an agent. Keeps its queries in the preview slice,
 * dispatches `fetchPreviewAnswer` for each question and resets the preview
 * state on mount and on unmount.
 */
export default function AgentPreview() {
  const { t } = useTranslation();
  const dispatch = useDispatch();

  const queries = useSelector(selectPreviewQueries);
  const status = useSelector(selectPreviewStatus);
  const selectedAgent = useSelector(selectSelectedAgent);

  const [lastQueryReturnedErr, setLastQueryReturnedErr] = useState(false);

  // Holds whatever the last `fetchPreviewAnswer` dispatch returned so the
  // cleanup effect can call `abort()` on it.
  const fetchStream = useRef(null);

  // Dispatches the answer request and remembers the dispatch result.
  const handleFetchAnswer = useCallback(
    ({ question, index }: { question: string; index?: number }) => {
      fetchStream.current = dispatch(
        fetchPreviewAnswer({ question, indx: index }),
      );
    },
    [dispatch],
  );

  // Trims the question and ignores empty input. With an index the existing
  // query is re-sent (and overwritten unless this is a retry); without one a
  // new query is appended unless this is a retry.
  const handleQuestion = useCallback(
    ({
      question,
      isRetry = false,
      index = undefined,
    }: {
      question: string;
      isRetry?: boolean;
      index?: number;
    }) => {
      const trimmedQuestion = question.trim();
      if (trimmedQuestion === '') return;

      if (index !== undefined) {
        if (!isRetry) dispatch(resendQuery({ index, prompt: trimmedQuestion }));
        handleFetchAnswer({ question: trimmedQuestion, index });
      } else {
        if (!isRetry) {
          const newQuery: Query = { prompt: trimmedQuestion };
          dispatch(addQuery(newQuery));
        }
        handleFetchAnswer({ question: trimmedQuestion, index: undefined });
      }
    },
    [dispatch, handleFetchAnswer],
  );

  // Entry point for submissions. An explicit update of an existing query
  // (`updated === true` with an index) is always forwarded. Otherwise the
  // question is sent only while no answer is loading; if the last query
  // ended in an error it is retried in place instead of appending a new one.
  const handleQuestionSubmission = (
    question?: string,
    updated?: boolean,
    indx?: number,
  ) => {
    if (updated === true && question !== undefined && indx !== undefined) {
      handleQuestion({
        question,
        index: indx,
        isRetry: false,
      });
    } else if (question && status !== 'loading') {
      const currentInput = question.trim();
      if (lastQueryReturnedErr && queries.length > 0) {
        const lastQueryIndex = queries.length - 1;
        handleQuestion({
          question: currentInput,
          isRetry: true,
          index: lastQueryIndex,
        });
      } else {
        handleQuestion({
          question: currentInput,
          isRetry: false,
          index: undefined,
        });
      }
    }
  };

  // Start from a clean preview; on unmount abort any pending request and
  // reset again.
  useEffect(() => {
    dispatch(resetPreview());
    return () => {
      if (fetchStream.current) fetchStream.current.abort();
      handlePreviewAbort();
      dispatch(resetPreview());
    };
  }, [dispatch]);

  // Track whether the most recent query carries an error.
  useEffect(() => {
    if (queries.length > 0) {
      const lastQuery = queries[queries.length - 1];
      setLastQueryReturnedErr(!!lastQuery.error);
    } else setLastQueryReturnedErr(false);
  }, [queries]);

  return (
handleQuestionSubmission(text)} loading={status === 'loading'} showSourceButton={selectedAgent ? false : true} showToolButton={selectedAgent ? false : true} autoFocus={false} />

{t('agents.preview.testMessage')}

);
}

================================================
FILE: frontend/src/agents/AgentsList.tsx
================================================
import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';
import { useDispatch, useSelector } from 'react-redux';
import { useNavigate, useSearchParams } from 'react-router-dom';

import userService from '../api/services/userService';
import Search from '../assets/search.svg';
import Spinner from '../components/Spinner';
import {
  setConversation,
  updateConversationId,
} from '../conversation/conversationSlice';
import {
  selectAgentFolders,
  selectSelectedAgent,
  selectToken,
  setAgentFolders,
  setSelectedAgent,
} from '../preferences/preferenceSlice';
import AgentCard from './AgentCard';
import { AgentSectionId, agentSectionsConfig } from './agents.config';
import AgentTypeModal from './components/AgentTypeModal';
import FolderCard from './FolderCard';
import { AgentFilterTab, useAgentSearch } from './hooks/useAgentSearch';
import { useAgentsFetch } from './hooks/useAgentsFetch';
import { Agent, AgentFolder } from './types';

// Filter tabs: each id paired with the translation key of its label.
const FILTER_TABS: { id: AgentFilterTab; labelKey: string }[] = [
  { id: 'all', labelKey: 'agents.filters.all' },
  { id: 'template', labelKey: 'agents.filters.byDocsGPT' },
  { id: 'user', labelKey: 'agents.filters.byMe' },
  { id: 'shared', labelKey: 'agents.filters.shared' },
];

/**
 * Agents overview page. Owns the folder path (kept in sync with the
 * `folder` query parameter), the search/filter state from `useAgentSearch`
 * and the folder create/delete/rename handlers.
 */
export default function AgentsList() {
  const { t } = useTranslation();
  const dispatch = useDispatch();
  const navigate = useNavigate();
  const [searchParams] = useSearchParams();
  const token = useSelector(selectToken);
  const selectedAgent = useSelector(selectSelectedAgent);
  const folders = useSelector(selectAgentFolders);

  // Seed the path from the URL so a link with `?folder=` opens that folder.
  const [folderPath, setFolderPath] = useState(() => {
    const folderIdFromUrl = searchParams.get('folder');
    return folderIdFromUrl ? [folderIdFromUrl] : [];
  });
  const [showAgentTypeModal, setShowAgentTypeModal] = useState(false);
  const [modalFolderId, setModalFolderId] = useState(null);

  // Sync folder path with URL
  useEffect(() => {
    const currentFolderInUrl = searchParams.get('folder');
    const currentFolderId =
      folderPath.length > 0 ? folderPath[folderPath.length - 1] : null;

    if (currentFolderId !== currentFolderInUrl) {
      const newUrl = currentFolderId
        ? `/agents?folder=${currentFolderId}`
        : '/agents';
      // `replace` avoids adding a history entry for every folder change.
      navigate(newUrl, { replace: true });
    }
  }, [folderPath, searchParams, navigate]);

  const { isLoading, refetchFolders, refetchUserAgents } = useAgentsFetch();

  const {
    searchQuery,
    setSearchQuery,
    activeFilter,
    setActiveFilter,
    filteredAgentsBySection,
    totalAgentsBySection,
    hasAnyAgents,
    hasFilteredResults,
    isDataLoaded,
  } = useAgentSearch();

  // On mount only: clear the active conversation and any selected agent.
  useEffect(() => {
    dispatch(setConversation([]));
    dispatch(
      updateConversationId({
        query: { conversationId: null },
      }),
    );
    if (selectedAgent) dispatch(setSelectedAgent(null));
  }, []);

  // Creates a folder and refetches the folder list; resolves to whether the
  // request succeeded.
  const handleCreateFolder = useCallback(
    async (name: string, parentId?: string) => {
      const response = await userService.createAgentFolder(
        { name, parent_id: parentId },
        token,
      );
      if (response.ok) {
        await refetchFolders();
        return true;
      }
      return false;
    },
    [token, refetchFolders],
  );

  // Deletes a folder, then refetches folders and user agents together;
  // resolves to whether the request succeeded.
  const handleDeleteFolder = useCallback(
    async (folderId: string) => {
      const response = await userService.deleteAgentFolder(folderId, token);
      if (response.ok) {
        await Promise.all([refetchFolders(), refetchUserAgents()]);
        return true;
      }
      return false;
    },
    [token, refetchFolders, refetchUserAgents],
  );

  // Renames a folder on the server and, on success, patches the name in the
  // store without refetching.
  const handleRenameFolder = useCallback(
    async (folderId: string, newName: string) => {
      const response = await userService.updateAgentFolder(
        folderId,
        { name: newName },
        token,
      );
      if (response.ok) {
        dispatch(
          setAgentFolders(
            (folders || []).map((f) =>
              f.id === folderId ? { ...f, name: newName } : f,
            ),
          ),
        );
      }
    },
    [token, folders, dispatch],
  );

  // Same as handleCreateFolder but discards the boolean result.
  const handleSubmitNewFolder = async (name: string, parentId?: string) => {
    await handleCreateFolder(name, parentId);
  };

  // Sections to render. A specific filter shows only that section. Under
  // 'all': sections still loading are kept, a search keeps sections with
  // matches, the 'user' section is otherwise always kept, and the remaining
  // sections are kept only when they contain agents.
  const visibleSections = agentSectionsConfig.filter((config) => {
    if (activeFilter !== 'all') {
      return config.id === activeFilter;
    }
    const sectionId = config.id as AgentSectionId;
    const hasAgentsInSection = totalAgentsBySection[sectionId] > 0;
    const hasFilteredAgents = filteredAgentsBySection[sectionId].length > 0;
    const sectionDataLoaded = isDataLoaded[sectionId];

    if (!sectionDataLoaded) return true;
    if (searchQuery) return hasFilteredAgents;
    if (config.id === 'user') return true;
    return hasAgentsInSection;
  });

  // Global "no results" state: a search under the 'all' filter that matched
  // nothing although agents exist.
  const showSearchEmptyState =
    searchQuery &&
    hasAnyAgents &&
    !hasFilteredResults &&
    activeFilter === 'all';

  return (

{t('agents.title')}

{t('agents.description')}

setSearchQuery(e.target.value)} placeholder={t('agents.searchPlaceholder')} className="h-11 w-full rounded-full border border-[#E5E5E5] bg-white py-2 pr-5 pl-11 text-sm shadow-[0_1px_4px_rgba(0,0,0,0.06)] transition-shadow outline-none placeholder:text-[#9CA3AF] focus:shadow-[0_2px_8px_rgba(0,0,0,0.1)] dark:border-[#3A3A3A] dark:bg-[#2C2C2C] dark:text-white dark:shadow-none dark:placeholder:text-[#6B7280]" />
{FILTER_TABS.map((tab) => ( ))}
{visibleSections.map((sectionConfig) => ( ))} {showSearchEmptyState && (

{t('agents.noSearchResults')}

{t('agents.tryDifferentSearch')}

)} setShowAgentTypeModal(false)} folderId={modalFolderId} />
); } interface AgentSectionProps { config: (typeof agentSectionsConfig)[number]; filteredAgents: Agent[]; totalAgents: number; searchQuery: string; isFilteredView: boolean; isLoading: boolean; folders: AgentFolder[] | null; folderPath: string[]; onFolderPathChange?: (path: string[]) => void; onCreateFolder: (name: string, parentId?: string) => void; onDeleteFolder: (id: string) => Promise; onRenameFolder: (id: string, name: string) => void; setModalFolderId: (folderId: string | null) => void; setShowAgentTypeModal: (show: boolean) => void; } function AgentSection({ config, filteredAgents, totalAgents, searchQuery, isFilteredView, isLoading, folders, folderPath, onFolderPathChange, onCreateFolder, onDeleteFolder, onRenameFolder, setModalFolderId, setShowAgentTypeModal, }: AgentSectionProps) { const { t } = useTranslation(); const navigate = useNavigate(); const dispatch = useDispatch(); const allAgents = useSelector(config.selectData); const [isCreatingFolder, setIsCreatingFolder] = useState(false); const [newFolderName, setNewFolderName] = useState(''); const newFolderInputRef = useRef(null); const currentFolderId = folderPath.length > 0 ? 
folderPath[folderPath.length - 1] : null;

  /**
   * Setter with the same call shape as a `useState` setter (value or updater
   * function) that forwards the result to `onFolderPathChange`.
   * Does nothing when the parent did not supply that callback.
   */
  const setFolderPath = useCallback(
    (updater: string[] | ((prev: string[]) => string[])) => {
      if (!onFolderPathChange) return;
      if (typeof updater === 'function') {
        onFolderPathChange(updater(folderPath));
      } else {
        onFolderPathChange(updater);
      }
    },
    [onFolderPathChange, folderPath],
  );

  /** Pushes an updated agent list through the section's configured action. */
  const updateAgents = (updatedAgents: Agent[]) => {
    dispatch(config.updateAction(updatedAgents));
  };

  // Ids of the current folder and every folder nested below it; null outside
  // the 'user' section, without folder data, or at the root level.
  const currentFolderDescendantIds = useMemo(() => {
    if (config.id !== 'user' || !folders || currentFolderId === null)
      return null;

    const getDescendants = (folderId: string): string[] => {
      const children = folders.filter((f) => f.parent_id === folderId);
      return children.flatMap((child) => [
        child.id,
        ...getDescendants(child.id),
      ]);
    };

    return new Set([currentFolderId, ...getDescendants(currentFolderId)]);
  }, [folders, currentFolderId, config.id]);

  // True when the folder, or any folder nested inside it, holds at least one
  // of the currently filtered agents.
  const folderHasMatchingAgents = useCallback(
    (folderId: string): boolean => {
      const directMatches = filteredAgents.some(
        (a) => a.folder_id === folderId,
      );
      if (directMatches) return true;
      const childFolders = (folders || []).filter(
        (f) => f.parent_id === folderId,
      );
      return childFolders.some((f) => folderHasMatchingAgents(f.id));
    },
    [filteredAgents, folders],
  );

  // Get folders at the current level (root or inside current folder)
  const currentLevelFolders = useMemo(() => {
    if (config.id !== 'user' || !folders) return [];
    const foldersAtLevel = folders.filter(
      (f) => (f.parent_id || null) === currentFolderId,
    );
    if (searchQuery) {
      return foldersAtLevel.filter((f) => folderHasMatchingAgents(f.id));
    }
    return foldersAtLevel;
  }, [
    folders,
    currentFolderId,
    config.id,
    searchQuery,
    folderHasMatchingAgents,
  ]);

  // Agents listed directly at the current level (not inside a folder card).
  const unfolderedAgents = useMemo(() => {
    if (config.id !== 'user' || !folders) return filteredAgents;

    if (searchQuery) {
      // When searching at root: return ALL filtered agents
      if (currentFolderId === null) {
        return filteredAgents;
      }
      // When searching inside a folder: return agents in current folder OR any descendant
      return filteredAgents.filter(
        (a) => currentFolderDescendantIds?.has(a.folder_id ?? '') ?? false,
      );
    }

    // No search: show agents that belong to the current folder level only
    return filteredAgents.filter(
      (a) => (a.folder_id || null) === currentFolderId,
    );
  }, [
    filteredAgents,
    folders,
    config.id,
    currentFolderId,
    searchQuery,
    currentFolderDescendantIds,
  ]);

  // Build breadcrumb items from folder path.
  // This hook must run before the early returns below: React requires the
  // same hooks in the same order on every render, and calling it after a
  // conditional return changes the hook count between renders.
  const breadcrumbItems = useMemo(() => {
    if (!folders || folderPath.length === 0) return [];
    return folderPath.map((folderId) => {
      const folder = folders.find((f) => f.id === folderId);
      return { id: folderId, name: folder?.name || '' };
    });
  }, [folders, folderPath]);

  /** Filtered agents that sit directly in the given folder. */
  const getAgentsForFolder = (folderId: string) => {
    return filteredAgents.filter((a) => a.folder_id === folderId);
  };

  /** Descends one level by appending the folder to the path. */
  const handleNavigateIntoFolder = (folderId: string) => {
    setFolderPath((prev) => [...prev, folderId]);
  };

  /** Jumps to a breadcrumb position; a negative index returns to the root. */
  const handleNavigateToPath = (index: number) => {
    if (index < 0) {
      setFolderPath([]);
    } else {
      setFolderPath((prev) => prev.slice(0, index + 1));
    }
  };

  /** Creates a folder under the current one (or at the root). */
  const handleSubmitNewFolder = (name: string) => {
    onCreateFolder(name, currentFolderId || undefined);
  };

  const hasNoAgentsAtAll = !isLoading && totalAgents === 0;
  const isSearchingWithNoResults =
    !isLoading &&
    searchQuery &&
    filteredAgents.length === 0 &&
    totalAgents > 0;

  if (isFilteredView && isSearchingWithNoResults) {
    return (

{t('agents.noSearchResults')}

{t('agents.tryDifferentSearch')}

    );
  }
  if (isFilteredView && hasNoAgentsAtAll) {
    return (

{t(`agents.sections.${config.id}.emptyState`)}

{config.showNewAgentButton && ( )}
    );
  }

  const ChevronIcon = () => ( );

  return (

{config.id === 'user' && folderPath.length > 0 ? ( <> {breadcrumbItems.map((item, index) => ( {index === breadcrumbItems.length - 1 ? ( {item.name} ) : ( )} ))} ) : ( t(`agents.sections.${config.id}.title`) )}

{t(`agents.sections.${config.id}.description`)}

{config.id === 'user' && (isCreatingFolder ? ( setNewFolderName(e.target.value)} onKeyDown={(e) => { if (e.key === 'Enter' && newFolderName.trim()) { handleSubmitNewFolder(newFolderName.trim()); setNewFolderName(''); setIsCreatingFolder(false); } else if (e.key === 'Escape') { setNewFolderName(''); setIsCreatingFolder(false); } }} onBlur={() => { if (!newFolderName.trim()) { setIsCreatingFolder(false); } }} placeholder={t('agents.folders.newFolder')} className="w-28 rounded-full border border-[#E5E5E5] bg-white px-4 py-2 text-sm text-[#18181B] outline-none placeholder:text-[#9CA3AF] sm:w-auto dark:border-[#3A3A3A] dark:bg-[#2C2C2C] dark:text-white dark:placeholder:text-[#6B7280]" autoFocus /> ) : ( ))} {config.showNewAgentButton && ( )}
{isLoading ? (
) : ( <> {/* Show subfolders at current level */} {config.id === 'user' && currentLevelFolders.length > 0 && (
{currentLevelFolders.map((folder) => ( ))}
)} {/* Show agents at current level */} {unfolderedAgents.length > 0 ? (
{unfolderedAgents.map((agent) => ( ))}
) : hasNoAgentsAtAll && currentLevelFolders.length === 0 ? (

{currentFolderId ? t('agents.folders.empty') : t(`agents.sections.${config.id}.emptyState`)}

{config.showNewAgentButton && !currentFolderId && ( )}
) : null} )}
);
}

================================================
FILE: frontend/src/agents/FolderCard.tsx
================================================
import { SyntheticEvent, useRef, useState } from 'react';
import { useTranslation } from 'react-i18next';

import Edit from '../assets/edit.svg';
import Trash from '../assets/red-trash.svg';
import ThreeDots from '../assets/three-dots.svg';
import ContextMenu, { MenuOption } from '../components/ContextMenu';
import ConfirmationModal from '../modals/ConfirmationModal';
import FolderNameModal from '../modals/FolderManagementModal';
import { ActiveState } from '../models/misc';
import { AgentFolder } from './types';

// Props: the folder to show, its agent count, delete/rename callbacks and
// the expand state with its toggle callback.
type FolderCardProps = {
  folder: AgentFolder;
  agentCount: number;
  onDelete: (folderId: string) => Promise;
  onRename: (folderId: string, newName: string) => void;
  isExpanded: boolean;
  onToggleExpand: (folderId: string) => void;
};

/**
 * Card for one agent folder with a context menu offering rename and delete.
 * Both menu entries open a modal; the actual work is delegated to the
 * `onRename` / `onDelete` callbacks.
 */
export default function FolderCard({
  folder,
  agentCount,
  onDelete,
  onRename,
  isExpanded,
  onToggleExpand,
}: FolderCardProps) {
  const { t } = useTranslation();
  const [isMenuOpen, setIsMenuOpen] = useState(false);
  const [deleteConfirmation, setDeleteConfirmation] = useState('INACTIVE');
  const [renameModalState, setRenameModalState] = useState('INACTIVE');
  const menuRef = useRef(null);

  // Each entry stops propagation, opens its modal and closes the menu.
  const menuOptions: MenuOption[] = [
    {
      icon: Edit,
      label: t('agents.folders.rename'),
      onClick: (e: SyntheticEvent) => {
        e.stopPropagation();
        setRenameModalState('ACTIVE');
        setIsMenuOpen(false);
      },
      variant: 'primary',
      iconWidth: 14,
      iconHeight: 14,
    },
    {
      icon: Trash,
      label: t('agents.folders.delete'),
      onClick: (e: SyntheticEvent) => {
        e.stopPropagation();
        setDeleteConfirmation('ACTIVE');
        setIsMenuOpen(false);
      },
      variant: 'danger',
      iconWidth: 13,
      iconHeight: 13,
    },
  ];

  // Forwards the new name together with this folder's id.
  const handleRename = (newName: string) => {
    onRename(folder.id, newName);
  };

  return (
    <>
onToggleExpand(folder.id)} >
{folder.name} ({agentCount})
{ e.stopPropagation(); setIsMenuOpen(true); }} className="ml-2 shrink-0 cursor-pointer" > menu
{ onDelete(folder.id); setDeleteConfirmation('INACTIVE'); }} cancelLabel={t('cancel')} variant="danger" /> ); } ================================================ FILE: frontend/src/agents/NewAgent.tsx ================================================ import isEqual from 'lodash/isEqual'; import React, { useCallback, useEffect, useRef, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { useDispatch, useSelector } from 'react-redux'; import { useNavigate, useParams, useSearchParams } from 'react-router-dom'; import modelService from '../api/services/modelService'; import userService from '../api/services/userService'; import ArrowLeft from '../assets/arrow-left.svg'; import SourceIcon from '../assets/source.svg'; import Dropdown from '../components/Dropdown'; import { FileUpload } from '../components/FileUpload'; import MultiSelectPopup, { OptionType } from '../components/MultiSelectPopup'; import Spinner from '../components/Spinner'; import AgentDetailsModal from '../modals/AgentDetailsModal'; import ConfirmationModal from '../modals/ConfirmationModal'; import { ActiveState, Doc, Prompt } from '../models/misc'; import { selectAgentFolders, selectSelectedAgent, selectSourceDocs, selectToken, selectPrompts, setAgentFolders, setSelectedAgent, setPrompts, } from '../preferences/preferenceSlice'; import PromptsModal from '../preferences/PromptsModal'; import Prompts from '../settings/Prompts'; import { UserToolType } from '../settings/types'; import AgentPreview from './AgentPreview'; import { Agent, ToolSummary } from './types'; import WorkflowBuilder from './workflow/WorkflowBuilder'; import type { Model } from '../models/types'; export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) { const { t } = useTranslation(); const navigate = useNavigate(); const dispatch = useDispatch(); const { agentId } = useParams(); const [searchParams] = useSearchParams(); const folderIdFromUrl = searchParams.get('folder_id'); const 
token = useSelector(selectToken);
  // Values read from the Redux store.
  const sourceDocs = useSelector(selectSourceDocs);
  const selectedAgent = useSelector(selectSelectedAgent);
  const prompts = useSelector(selectPrompts);
  const agentFolders = useSelector(selectAgentFolders);

  // Folder id from the URL once it has been checked against the user's folders; null otherwise.
  const [validatedFolderId, setValidatedFolderId] = useState(
    null,
  );
  // Starts as the `mode` prop; later code switches it (e.g. 'new' -> 'draft' after a draft save).
  const [effectiveMode, setEffectiveMode] = useState(mode);
  // Form model for the agent, pre-filled with defaults for a new agent.
  const [agent, setAgent] = useState({
    id: agentId || '',
    name: '',
    description: '',
    image: '',
    source: '',
    sources: [],
    chunks: '2',
    retriever: 'classic',
    prompt_id: 'default',
    tools: [],
    agent_type: 'classic',
    status: '',
    json_schema: undefined,
    limited_token_mode: false,
    token_limit: undefined,
    limited_request_mode: false,
    request_limit: undefined,
    models: [],
    default_model_id: '',
  });
  // Newly chosen image file that has not been uploaded yet (null when none).
  const [imageFile, setImageFile] = useState(null);
  const [userTools, setUserTools] = useState([]);
  const [availableModels, setAvailableModels] = useState([]);
  // Open/closed flags for the three multi-select popups.
  const [isSourcePopupOpen, setIsSourcePopupOpen] = useState(false);
  const [isToolsPopupOpen, setIsToolsPopupOpen] = useState(false);
  const [isModelsPopupOpen, setIsModelsPopupOpen] = useState(false);
  // Current selections; effects further down mirror these into `agent`.
  const [selectedSourceIds, setSelectedSourceIds] = useState< Set >(new Set());
  const [selectedTools, setSelectedTools] = useState([]);
  const [selectedModelIds, setSelectedModelIds] = useState>(
    new Set(),
  );
  // Modal states, all starting 'INACTIVE'.
  const [deleteConfirmation, setDeleteConfirmation] = useState('INACTIVE');
  const [agentDetails, setAgentDetails] = useState('INACTIVE');
  const [addPromptModal, setAddPromptModal] = useState('INACTIVE');
  const [hasChanges, setHasChanges] = useState(false);
  const [draftLoading, setDraftLoading] = useState(false);
  const [publishLoading, setPublishLoading] = useState(false);
  // Raw JSON-schema text from the form and whether it currently parses.
  const [jsonSchemaText, setJsonSchemaText] = useState('');
  const [jsonSchemaValid, setJsonSchemaValid] = useState(true);
  const [isAdvancedSectionExpanded, setIsAdvancedSectionExpanded] =
    useState(false);

  // Snapshot of the agent as loaded/published; compared against `agent` to detect changes.
  const initialAgentRef = useRef(null);
  const sourceAnchorButtonRef = useRef(null);
  const toolAnchorButtonRef = useRef(null);
  const modelAnchorButtonRef = useRef(null);

  // Per-mode heading, primary button text and feature flags for the form.
  const modeConfig = {
    new: {
      heading: t('agents.form.headings.new'),
      buttonText: t('agents.form.buttons.publish'),
      showDelete: false,
      showSaveDraft: true,
      showLogs: false,
      showAccessDetails: false,
      trackChanges: false,
    },
    edit: {
      heading: t('agents.form.headings.edit'),
      buttonText: t('agents.form.buttons.save'),
      showDelete: true,
      showSaveDraft: false,
      showLogs: true,
      showAccessDetails: true,
      trackChanges: true,
    },
    draft: {
      heading: t('agents.form.headings.draft'),
      buttonText: t('agents.form.buttons.publish'),
      showDelete: true,
      showSaveDraft: true,
      showLogs: false,
      showAccessDetails: false,
      trackChanges: false,
    },
  };
  // Selectable chunk counts (kept as strings, matching `agent.chunks`).
  const chunks = ['0', '2', '4', '6', '8', '10'];
  const agentTypes = [
    { label: t('agents.form.agentTypes.classic'), value: 'classic' },
    { label: t('agents.form.agentTypes.react'), value: 'react' },
  ];

  /**
   * Truthy when the form can be published: name, description, prompt and
   * agent type are set, the JSON schema text is empty or valid, and at least
   * one source is selected. The result is the value of an `&&` chain, so it
   * is not necessarily a strict boolean.
   */
  const isPublishable = () => {
    const hasRequiredFields =
      agent.name && agent.description && agent.prompt_id && agent.agent_type;
    const isJsonSchemaValidOrEmpty =
      jsonSchemaText.trim() === '' || jsonSchemaValid;
    const hasSource = selectedSourceIds.size > 0;
    return hasRequiredFields && isJsonSchemaValidOrEmpty && hasSource;
  };

  /** True only when schema text has been entered and it failed to parse. */
  const isJsonSchemaInvalid = () => {
    return jsonSchemaText.trim() !== '' && !jsonSchemaValid;
  };

  /** Keeps the first file of an upload as the pending agent image; later files are ignored. */
  const handleUpload = useCallback((files: File[]) => {
    if (files && files.length > 0) {
      const file = files[0];
      setImageFile(file);
    }
  }, []);

  /** Navigates to the agents list, staying inside the validated folder when there is one. */
  const navigateBackToAgents = useCallback(() => {
    const targetPath = validatedFolderId ?
`/agents?folder=${validatedFolderId}` : '/agents';
    navigate(targetPath);
  }, [navigate, validatedFolderId]);

  /** Clears the selected agent from the store (if any) and returns to the agents list. */
  const handleCancel = () => {
    if (selectedAgent) dispatch(setSelectedAgent(null));
    navigateBackToAgents();
  };

  /**
   * Deletes the given agent and returns to the agents list.
   * Throws when the delete request does not report success.
   */
  const handleDelete = async (agentId: string) => {
    const response = await userService.deleteAgent(agentId, token);
    if (!response.ok) throw new Error('Failed to delete agent');
    navigateBackToAgents();
  };

  /**
   * Builds the multipart payload shared by the draft and publish flows.
   * Both flows previously assembled the same fields by hand; the only
   * intended difference between them is the `status` value. Fields are
   * appended in the order the draft flow used.
   */
  const buildAgentFormData = (status: 'draft' | 'published') => {
    const formData = new FormData();
    formData.append('name', agent.name);
    formData.append('description', agent.description);

    // Map every selected id to what the API expects: the built-in "Default"
    // source has no id of its own and is sent as the literal 'default'.
    const resolvedSourceIds = Array.from(selectedSourceIds).map((id) => {
      const sourceDoc = sourceDocs?.find(
        (source) =>
          source.id === id || source.retriever === id || source.name === id,
      );
      if (sourceDoc?.name === 'Default' && !sourceDoc?.id) return 'default';
      return sourceDoc?.id || id;
    });
    if (resolvedSourceIds.length > 1) {
      // Several sources: send the list and blank out the single-source field.
      formData.append(
        'sources',
        JSON.stringify(resolvedSourceIds.filter(Boolean)),
      );
      formData.append('source', '');
    } else if (resolvedSourceIds.length === 1) {
      // One source: use the single-source field and send an empty list.
      formData.append('source', String(resolvedSourceIds[0]));
      formData.append('sources', JSON.stringify([]));
    } else {
      formData.append('source', '');
      formData.append('sources', JSON.stringify([]));
    }

    formData.append('chunks', agent.chunks);
    formData.append('retriever', agent.retriever);
    formData.append('prompt_id', agent.prompt_id);
    formData.append('agent_type', agent.agent_type);
    formData.append('status', status);

    // The limit fields are always sent; a mode only counts as enabled when
    // its flag is set AND a non-zero limit is present.
    if (agent.limited_token_mode && agent.token_limit) {
      formData.append('limited_token_mode', 'True');
      formData.append('token_limit', agent.token_limit.toString());
    } else {
      formData.append('limited_token_mode', 'False');
      formData.append('token_limit', '0');
    }
    if (agent.limited_request_mode && agent.request_limit) {
      formData.append('limited_request_mode', 'True');
      formData.append('request_limit', agent.request_limit.toString());
    } else {
      formData.append('limited_request_mode', 'False');
      formData.append('request_limit', '0');
    }

    if (imageFile) formData.append('image', imageFile);
    if (agent.tools && agent.tools.length > 0)
      formData.append('tools', JSON.stringify(agent.tools));
    else formData.append('tools', '[]');
    if (agent.json_schema) {
      formData.append('json_schema', JSON.stringify(agent.json_schema));
    }
    if (agent.models && agent.models.length > 0) {
      formData.append('models', JSON.stringify(agent.models));
    }
    if (agent.default_model_id) {
      formData.append('default_model_id', agent.default_model_id);
    }
    if (agent.agent_type === 'workflow' && agent.workflow) {
      formData.append('workflow', JSON.stringify(agent.workflow));
    }
    // The folder is only attached when the agent is first created.
    if (effectiveMode === 'new' && validatedFolderId) {
      formData.append('folder_id', validatedFolderId);
    }
    return formData;
  };

  /**
   * Saves the form as a draft: creates the agent in 'new' mode, otherwise
   * updates it. On success the returned id/image are merged into `agent`
   * and a new agent switches to 'draft' mode.
   * Throws 'Failed to save draft' on any failure, after logging the cause.
   */
  const handleSaveDraft = async () => {
    const formData = buildAgentFormData('draft');
    try {
      setDraftLoading(true);
      const response =
        effectiveMode === 'new'
          ? await userService.createAgent(formData, token)
          : await userService.updateAgent(agent.id || '', formData, token);
      if (!response.ok) throw new Error('Failed to create agent draft');
      const data = await response.json();
      const updatedAgent = {
        ...agent,
        id: data.id || agent.id,
        image: data.image || agent.image,
      };
      setAgent(updatedAgent);
      if (effectiveMode === 'new') setEffectiveMode('draft');
    } catch (error) {
      console.error('Error saving draft:', error);
      throw new Error('Failed to save draft');
    } finally {
      setDraftLoading(false);
    }
  };

  /**
   * Publishes the agent: creates it in 'new' mode, otherwise updates it.
   * On success the result becomes the new change-tracking baseline, a new or
   * draft agent switches to 'edit' mode with the details modal opened, and
   * the pending image file is cleared.
   * Throws 'Failed to publish agent' on any failure, after logging the cause.
   */
  const handlePublish = async () => {
    const formData = buildAgentFormData('published');
    try {
      setPublishLoading(true);
      const response =
        effectiveMode === 'new'
          ? await userService.createAgent(formData, token)
          : await userService.updateAgent(agent.id || '', formData, token);
      if (!response.ok) throw new Error('Failed to publish agent');
      const data = await response.json();
      const updatedAgent = {
        ...agent,
        id: data.id || agent.id,
        key: data.key || agent.key,
        status: 'published',
        image: data.image || agent.image,
      };
      setAgent(updatedAgent);
      initialAgentRef.current = updatedAgent;
      if (effectiveMode === 'new' || effectiveMode === 'draft') {
        setEffectiveMode('edit');
        setAgentDetails('ACTIVE');
      }
      setImageFile(null);
    } catch (error) {
      console.error('Error publishing agent:', error);
      throw new Error('Failed to publish agent');
    } finally {
      setPublishLoading(false);
    }
  };

  /**
   * Stores the raw schema text and, when it is empty or parses as JSON,
   * mirrors it into `agent.json_schema`. Unparseable text only flips the
   * validity flag and leaves the last stored schema untouched.
   */
  const validateAndSetJsonSchema = (text: string) => {
    setJsonSchemaText(text);
    if (text.trim() === '') {
      // Functional update so changes queued by other effects are not overwritten.
      setAgent((prev) => ({ ...prev, json_schema: undefined }));
      setJsonSchemaValid(true);
      return;
    }
    try {
      const parsed = JSON.parse(text);
      setAgent((prev) => ({ ...prev, json_schema: parsed }));
      setJsonSchemaValid(true);
    } catch {
      setJsonSchemaValid(false);
    }
  };

  // Load the user's tools and the available models.
  useEffect(() => {
    const getTools = async () => {
      const response = await userService.getUserTools(token);
      if (!response.ok) throw new Error('Failed to fetch tools');
      const data = await response.json();
      const tools: OptionType[] = data.tools.map((tool: UserToolType) => ({
        id: tool.id,
        label: tool.customName ?
tool.customName : tool.displayName,
        icon: `/toolIcons/tool_${tool.name}.svg`,
      }));
      setUserTools(tools);
    };
    const getModels = async () => {
      const response = await modelService.getModels(null);
      if (!response.ok) throw new Error('Failed to fetch models');
      const data = await response.json();
      const transformed = modelService.transformModels(data.models || []);
      setAvailableModels(transformed);
      // For a brand-new agent, preselect the reported default model (or the
      // first available one) unless the user already picked something.
      if (mode === 'new' && transformed.length > 0) {
        const preferredDefaultModelId =
          transformed.find((model) => model.id === data.default_model_id)
            ?.id || transformed[0].id;
        if (preferredDefaultModelId) {
          setSelectedModelIds((prevSelectedModelIds) =>
            prevSelectedModelIds.size > 0
              ? prevSelectedModelIds
              : new Set([preferredDefaultModelId]),
          );
        }
      }
    };
    // Both loaders throw on a failed request; catch here so the failure is
    // logged instead of surfacing as an unhandled promise rejection.
    getTools().catch((error) => console.error('Error fetching tools:', error));
    getModels().catch((error) =>
      console.error('Error fetching models:', error),
    );
  }, [token, mode]);

  // Validate folder_id from URL against user's folders
  useEffect(() => {
    const validateAndSetFolder = async () => {
      if (!folderIdFromUrl) {
        setValidatedFolderId(null);
        return;
      }
      let folders = agentFolders;
      if (!folders) {
        // Folders are not in the store yet: fetch them and cache the result.
        try {
          const response = await userService.getAgentFolders(token);
          if (response.ok) {
            const data = await response.json();
            folders = data.folders || [];
            dispatch(setAgentFolders(folders));
          }
        } catch {
          setValidatedFolderId(null);
          return;
        }
      }
      const folderExists = folders?.some((f) => f.id === folderIdFromUrl);
      setValidatedFolderId(folderExists ? folderIdFromUrl : null);
    };
    validateAndSetFolder();
  }, [folderIdFromUrl, agentFolders, token, dispatch]);

  // Auto-select default source if none selected
  useEffect(() => {
    if (sourceDocs && sourceDocs.length > 0 && selectedSourceIds.size === 0) {
      // Prefer the source named 'Default'; otherwise fall back to the first one.
      const preferredSource =
        sourceDocs.find((s) => s.name === 'Default') || sourceDocs[0];
      setSelectedSourceIds(
        new Set([
          preferredSource.id ||
            preferredSource.retriever ||
            preferredSource.name,
        ]),
      );
    }
  }, [sourceDocs, selectedSourceIds.size]);

  // In edit/draft mode, load the agent and seed the form state from it.
  useEffect(() => {
    if ((mode === 'edit' || mode === 'draft') && agentId) {
      const getAgent = async () => {
        const response = await userService.getAgent(agentId, token);
        if (!response.ok) {
          navigate('/agents');
          throw new Error('Failed to fetch agent');
        }
        const data = await response.json();
        // Source selection precedence: `sources` list, then single `source`,
        // then `retriever`. A stored 'default' maps back to the Default
        // source's retriever (or 'classic' when that source is unknown).
        if (data.sources && data.sources.length > 0) {
          const mappedSources = data.sources.map((sourceId: string) => {
            if (sourceId === 'default') {
              const defaultSource = sourceDocs?.find(
                (source) => source.name === 'Default',
              );
              return defaultSource?.retriever || 'classic';
            }
            return sourceId;
          });
          setSelectedSourceIds(new Set(mappedSources));
        } else if (data.source) {
          if (data.source === 'default') {
            const defaultSource = sourceDocs?.find(
              (source) => source.name === 'Default',
            );
            setSelectedSourceIds(
              new Set([defaultSource?.retriever || 'classic']),
            );
          } else {
            setSelectedSourceIds(new Set([data.source]));
          }
        } else if (data.retriever) {
          setSelectedSourceIds(new Set([data.retriever]));
        }
        if (data.tool_details) setSelectedTools(data.tool_details);
        if (data.status === 'draft') setEffectiveMode('draft');
        if (data.json_schema) {
          const jsonText = JSON.stringify(data.json_schema, null, 2);
          setJsonSchemaText(jsonText);
          setJsonSchemaValid(true);
        }
        setAgent(data);
        initialAgentRef.current = data;
      };
      // getAgent throws after redirecting on a failed fetch; log it rather
      // than leaving an unhandled promise rejection.
      getAgent().catch((error) =>
        console.error('Error fetching agent:', error),
      );
    }
  }, [agentId, mode, token]);

  // Seed the model selection from the loaded agent once models are available,
  // but only while nothing has been selected yet.
  useEffect(() => {
    if (agent.models && agent.models.length > 0 && availableModels.length > 0) {
      const agentModelIds = new Set(agent.models);
      if (agentModelIds.size > 0 && selectedModelIds.size === 0) {
        setSelectedModelIds(agentModelIds);
      }
    }
  }, [agent.models, availableModels.length]);

  // Mirror the model selection into `agent`, keeping the current default
  // model when it is still selected and otherwise falling back to the first.
  useEffect(() => {
    const modelsArray = Array.from(selectedModelIds);
    if (modelsArray.length > 0) {
      setAgent((prev) => ({
        ...prev,
        models: modelsArray,
        default_model_id: modelsArray.includes(prev.default_model_id || '')
          ? prev.default_model_id
          : modelsArray[0],
      }));
    } else {
      setAgent((prev) => ({
        ...prev,
        models: [],
        default_model_id: '',
      }));
    }
  }, [selectedModelIds]);

  // Mirror the source selection into `agent`.
  useEffect(() => {
    const selectedSources = Array.from(selectedSourceIds)
      .map((id) =>
        sourceDocs?.find(
          (source) =>
            source.id === id || source.retriever === id || source.name === id,
        ),
      )
      .filter(Boolean);
    if (selectedSources.length > 0) {
      // Handle multiple sources
      if (selectedSources.length > 1) {
        // Multiple sources selected - store in sources array
        const sourceIds = selectedSources
          .map((source) => source?.id)
          .filter((id): id is string => Boolean(id));
        setAgent((prev) => ({
          ...prev,
          sources: sourceIds,
          source: '', // Clear single source for multiple sources
          retriever: '',
        }));
      } else {
        // Single source selected - maintain backward compatibility
        const selectedSource = selectedSources[0];
        if (selectedSource && 'id' in selectedSource) {
          setAgent((prev) => ({
            ...prev,
            source: selectedSource?.id || 'default',
            sources: [], // Clear sources array for single source
            retriever: '',
          }));
        } else {
          setAgent((prev) => ({
            ...prev,
            source: '',
            sources: [], // Clear sources array
            retriever: selectedSource?.retriever || 'classic',
          }));
        }
      }
    } else {
      // No sources selected
      setAgent((prev) => ({
        ...prev,
        source: '',
        sources: [],
        retriever: '',
      }));
    }
  }, [selectedSourceIds]);

  // Mirror the tool selection into `agent.tools` as a list of string ids.
  useEffect(() => {
    setAgent((prev) => ({
      ...prev,
      tools: Array.from(selectedTools)
        .map((tool) => tool?.id)
        .filter((id): id is string => typeof id === 'string'),
    }));
  }, [selectedTools]);

  // Publish the in-progress agent to the store once it is publishable, and
  // recompute `hasChanges`.
  useEffect(() => {
    if (isPublishable()) dispatch(setSelectedAgent(agent));
    // Modes that do not track changes always count as changed.
    if (!modeConfig[effectiveMode].trackChanges) {
      setHasChanges(true);
      return;
    }
    // Nothing to compare against until the agent has been loaded.
    if (!initialAgentRef.current) {
      setHasChanges(false);
      return;
    }
    const initialJsonSchemaText = initialAgentRef.current.json_schema
      ? JSON.stringify(initialAgentRef.current.json_schema, null, 2)
      : '';
    const isChanged =
      !isEqual(agent, initialAgentRef.current) ||
      imageFile !== null ||
      jsonSchemaText !== initialJsonSchemaText;
    setHasChanges(isChanged);
  }, [agent, dispatch, effectiveMode, imageFile, jsonSchemaText]);

  return (

{t('agents.backToAll')}

{modeConfig[effectiveMode].heading}

{agent.agent_type === 'workflow' && (
)}
{modeConfig[effectiveMode].showDelete && agent.id && ( )} {modeConfig[effectiveMode].showSaveDraft && ( )} {modeConfig[effectiveMode].showAccessDetails && ( )} {modeConfig[effectiveMode].showAccessDetails && ( )}

{t('agents.form.sections.meta')}

setAgent({ ...agent, name: e.target.value })} />