Repository: Azure-Samples/azure-search-openai-demo Branch: main Commit: 56735b70df9c Files: 471 Total size: 33.7 MB Directory structure: gitextract_llud8udi/ ├── .azdo/ │ └── pipelines/ │ └── azure-dev.yml ├── .devcontainer/ │ └── devcontainer.json ├── .gitattributes ├── .github/ │ ├── CODE_OF_CONDUCT.md │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── agents/ │ │ ├── fixer.agent.md │ │ └── triager.agent.md │ ├── dependabot.yaml │ ├── instructions/ │ │ └── bicep.instructions.md │ ├── prompts/ │ │ └── review_pr_comments.prompt.md │ ├── skills/ │ │ └── github-pr-inline-reply/ │ │ └── SKILL.md │ └── workflows/ │ ├── azure-dev-validation.yaml │ ├── azure-dev.yml │ ├── evaluate.yaml │ ├── frontend.yaml │ ├── lint-markdown.yml │ ├── nightly-jobs.yaml │ ├── python-test.yaml │ ├── stale-bot.yml │ └── validate-markdown.yml ├── .gitignore ├── .markdownlint-cli2.jsonc ├── .pre-commit-config.yaml ├── .vscode/ │ ├── extensions.json │ ├── launch.json │ ├── settings.json │ └── tasks.json ├── AGENTS.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── app/ │ ├── backend/ │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── app.py │ │ ├── approaches/ │ │ │ ├── __init__.py │ │ │ ├── approach.py │ │ │ ├── chatreadretrieveread.py │ │ │ ├── promptmanager.py │ │ │ └── prompts/ │ │ │ ├── chat_answer.system.jinja2 │ │ │ ├── chat_answer.user.jinja2 │ │ │ ├── chat_query_rewrite_tools.json │ │ │ └── query_rewrite.system.jinja2 │ │ ├── chat_history/ │ │ │ ├── __init__.py │ │ │ └── cosmosdb.py │ │ ├── config.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── authentication.py │ │ │ └── sessionhelper.py │ │ ├── custom_uvicorn_worker.py │ │ ├── decorators.py │ │ ├── error.py │ │ ├── gunicorn.conf.py │ │ ├── load_azd_env.py │ │ ├── main.py │ │ ├── prepdocs.py │ │ ├── prepdocslib/ │ │ │ ├── __init__.py │ │ │ ├── blobmanager.py │ │ │ ├── cloudingestionstrategy.py │ │ │ ├── csvparser.py │ │ │ ├── embeddings.py │ │ │ ├── figureprocessor.py │ │ │ ├── fileprocessor.py │ │ │ ├── 
filestrategy.py │ │ │ ├── htmlparser.py │ │ │ ├── integratedvectorizerstrategy.py │ │ │ ├── jsonparser.py │ │ │ ├── listfilestrategy.py │ │ │ ├── mediadescriber.py │ │ │ ├── page.py │ │ │ ├── parser.py │ │ │ ├── pdfparser.py │ │ │ ├── searchmanager.py │ │ │ ├── servicesetup.py │ │ │ ├── strategy.py │ │ │ ├── textparser.py │ │ │ ├── textprocessor.py │ │ │ └── textsplitter.py │ │ ├── requirements.in │ │ ├── requirements.txt │ │ └── setup_cloud_ingestion.py │ ├── frontend/ │ │ ├── .npmrc │ │ ├── .nvmrc │ │ ├── .prettierignore │ │ ├── .prettierrc.json │ │ ├── index.html │ │ ├── package.json │ │ ├── src/ │ │ │ ├── api/ │ │ │ │ ├── api.ts │ │ │ │ ├── index.ts │ │ │ │ └── models.ts │ │ │ ├── authConfig.ts │ │ │ ├── components/ │ │ │ │ ├── AnalysisPanel/ │ │ │ │ │ ├── AgentPlan.tsx │ │ │ │ │ ├── AnalysisPanel.module.css │ │ │ │ │ ├── AnalysisPanel.tsx │ │ │ │ │ ├── AnalysisPanelTabs.tsx │ │ │ │ │ ├── ThoughtProcess.tsx │ │ │ │ │ ├── TokenUsageGraph.tsx │ │ │ │ │ ├── agentPlanUtils.ts │ │ │ │ │ └── index.tsx │ │ │ │ ├── Answer/ │ │ │ │ │ ├── Answer.module.css │ │ │ │ │ ├── Answer.tsx │ │ │ │ │ ├── AnswerError.tsx │ │ │ │ │ ├── AnswerIcon.tsx │ │ │ │ │ ├── AnswerLoading.tsx │ │ │ │ │ ├── AnswerParser.tsx │ │ │ │ │ ├── SpeechOutputAzure.tsx │ │ │ │ │ ├── SpeechOutputBrowser.tsx │ │ │ │ │ └── index.ts │ │ │ │ ├── ClearChatButton/ │ │ │ │ │ ├── ClearChatButton.module.css │ │ │ │ │ ├── ClearChatButton.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── Example/ │ │ │ │ │ ├── Example.module.css │ │ │ │ │ ├── Example.tsx │ │ │ │ │ ├── ExampleList.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── HelpCallout/ │ │ │ │ │ ├── HelpCallout.tsx │ │ │ │ │ └── index.ts │ │ │ │ ├── HistoryButton/ │ │ │ │ │ ├── HistoryButton.module.css │ │ │ │ │ ├── HistoryButton.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── HistoryItem/ │ │ │ │ │ ├── HistoryItem.module.css │ │ │ │ │ ├── HistoryItem.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── HistoryPanel/ │ │ │ │ │ ├── HistoryPanel.module.css │ │ │ │ │ ├── HistoryPanel.tsx │ │ │ │ │ └── 
index.tsx │ │ │ │ ├── HistoryProviders/ │ │ │ │ │ ├── CosmosDB.ts │ │ │ │ │ ├── HistoryManager.ts │ │ │ │ │ ├── IProvider.ts │ │ │ │ │ ├── IndexedDB.ts │ │ │ │ │ ├── None.ts │ │ │ │ │ └── index.ts │ │ │ │ ├── LoginButton/ │ │ │ │ │ ├── LoginButton.module.css │ │ │ │ │ ├── LoginButton.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── MarkdownViewer/ │ │ │ │ │ ├── MarkdownViewer.module.css │ │ │ │ │ ├── MarkdownViewer.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── QuestionInput/ │ │ │ │ │ ├── QuestionInput.module.css │ │ │ │ │ ├── QuestionInput.tsx │ │ │ │ │ ├── SpeechInput.tsx │ │ │ │ │ └── index.ts │ │ │ │ ├── Settings/ │ │ │ │ │ ├── Settings.module.css │ │ │ │ │ └── Settings.tsx │ │ │ │ ├── SettingsButton/ │ │ │ │ │ ├── SettingsButton.module.css │ │ │ │ │ ├── SettingsButton.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── SupportingContent/ │ │ │ │ │ ├── SupportingContent.module.css │ │ │ │ │ ├── SupportingContent.tsx │ │ │ │ │ ├── SupportingContentParser.ts │ │ │ │ │ └── index.ts │ │ │ │ ├── TokenClaimsDisplay/ │ │ │ │ │ ├── TokenClaimsDisplay.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── UploadFile/ │ │ │ │ │ ├── UploadFile.module.css │ │ │ │ │ ├── UploadFile.tsx │ │ │ │ │ └── index.tsx │ │ │ │ ├── UserChatMessage/ │ │ │ │ │ ├── UserChatMessage.module.css │ │ │ │ │ ├── UserChatMessage.tsx │ │ │ │ │ └── index.ts │ │ │ │ └── VectorSettings/ │ │ │ │ ├── VectorSettings.module.css │ │ │ │ ├── VectorSettings.tsx │ │ │ │ └── index.ts │ │ │ ├── i18n/ │ │ │ │ ├── LanguagePicker.module.css │ │ │ │ ├── LanguagePicker.tsx │ │ │ │ ├── config.ts │ │ │ │ └── index.tsx │ │ │ ├── index.css │ │ │ ├── index.tsx │ │ │ ├── layoutWrapper.tsx │ │ │ ├── locales/ │ │ │ │ ├── da/ │ │ │ │ │ └── translation.json │ │ │ │ ├── en/ │ │ │ │ │ └── translation.json │ │ │ │ ├── es/ │ │ │ │ │ └── translation.json │ │ │ │ ├── fr/ │ │ │ │ │ └── translation.json │ │ │ │ ├── it/ │ │ │ │ │ └── translation.json │ │ │ │ ├── ja/ │ │ │ │ │ └── translation.json │ │ │ │ ├── nl/ │ │ │ │ │ └── translation.json │ │ │ │ ├── pl/ │ │ │ │ │ └── 
translation.json │ │ │ │ ├── ptBR/ │ │ │ │ │ └── translation.json │ │ │ │ └── tr/ │ │ │ │ └── translation.json │ │ │ ├── loginContext.tsx │ │ │ ├── pages/ │ │ │ │ ├── NoPage.tsx │ │ │ │ ├── chat/ │ │ │ │ │ ├── Chat.module.css │ │ │ │ │ └── Chat.tsx │ │ │ │ └── layout/ │ │ │ │ ├── Layout.module.css │ │ │ │ └── Layout.tsx │ │ │ └── vite-env.d.ts │ │ ├── tsconfig.json │ │ └── vite.config.ts │ ├── functions/ │ │ ├── __init__.py │ │ ├── document_extractor/ │ │ │ ├── .funcignore │ │ │ ├── function_app.py │ │ │ └── host.json │ │ ├── figure_processor/ │ │ │ ├── .funcignore │ │ │ ├── function_app.py │ │ │ └── host.json │ │ └── text_processor/ │ │ ├── .funcignore │ │ ├── function_app.py │ │ └── host.json │ ├── start.ps1 │ └── start.sh ├── azure.yaml ├── data/ │ ├── Json_Examples/ │ │ ├── 2189.json │ │ ├── 2190.json │ │ ├── 2191.json │ │ ├── 2192.json │ │ └── query.json │ └── Zava_Company_Overview.md ├── docs/ │ ├── README.md │ ├── agentic_retrieval.md │ ├── appservice.md │ ├── architecture.md │ ├── azd.md │ ├── azure_app_service.md │ ├── azure_container_apps.md │ ├── customization.md │ ├── data_ingestion.md │ ├── deploy_existing.md │ ├── deploy_features.md │ ├── deploy_freetrial.md │ ├── deploy_lowcost.md │ ├── deploy_private.md │ ├── deploy_troubleshooting.md │ ├── evaluation.md │ ├── http_protocol.md │ ├── localdev.md │ ├── login_and_acl.md │ ├── monitoring.md │ ├── multimodal.md │ ├── other_samples.md │ ├── productionizing.md │ ├── reasoning.md │ ├── safety_evaluation.md │ ├── sharing_environments.md │ └── textsplitter.md ├── evals/ │ ├── evaluate.py │ ├── evaluate_config.json │ ├── evaluate_config_multimodal.json │ ├── generate_ground_truth.py │ ├── ground_truth.jsonl │ ├── ground_truth_kg.json │ ├── ground_truth_multimodal.jsonl │ ├── requirements.txt │ ├── results/ │ │ ├── baseline/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt35turbo-ada002/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ 
│ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt4omini-ada002/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt4omini-emb3l/ │ │ │ ├── README.md │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt4omini-emb3l-2/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt5-emb3l/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt5chat-emb3l/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt5mini-emb3l/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── gpt5mini-emb3l-2/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ └── o3mini-ada002/ │ │ ├── config.json │ │ ├── eval_results.jsonl │ │ ├── evaluate_parameters.json │ │ └── summary.json │ ├── results_multimodal/ │ │ ├── baseline/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ ├── no-image-embeddings/ │ │ │ ├── config.json │ │ │ ├── eval_results.jsonl │ │ │ ├── evaluate_parameters.json │ │ │ └── summary.json │ │ └── no-image-sources/ │ │ ├── config.json │ │ ├── eval_results.jsonl │ │ ├── evaluate_parameters.json │ │ └── summary.json │ ├── safety_evaluation.py │ └── safety_results.json ├── infra/ │ ├── abbreviations.json │ ├── app/ │ │ ├── functions-app.bicep │ │ ├── functions-rbac.bicep │ │ ├── functions.bicep │ │ └── storage-containers.bicep │ ├── backend-dashboard.bicep │ ├── bicepconfig.json │ ├── core/ │ │ ├── ai/ │ │ │ ├── ai-environment.bicep │ │ │ ├── hub.bicep │ │ │ └── project.bicep │ │ ├── auth/ │ │ │ └── appregistration.bicep │ │ ├── host/ │ │ │ ├── 
appservice-appsettings.bicep │ │ │ ├── appservice.bicep │ │ │ ├── appserviceplan.bicep │ │ │ ├── container-app-upsert.bicep │ │ │ ├── container-app.bicep │ │ │ ├── container-apps-auth.bicep │ │ │ ├── container-apps-environment.bicep │ │ │ ├── container-apps.bicep │ │ │ └── container-registry.bicep │ │ ├── monitor/ │ │ │ └── monitoring.bicep │ │ ├── networking/ │ │ │ ├── private-dns-zones.bicep │ │ │ ├── private-endpoint.bicep │ │ │ └── vnet.bicep │ │ ├── search/ │ │ │ ├── search-diagnostics.bicep │ │ │ └── search-services.bicep │ │ ├── security/ │ │ │ ├── aca-identity.bicep │ │ │ ├── documentdb-sql-role.bicep │ │ │ ├── registry-access.bicep │ │ │ ├── role.bicep │ │ │ └── storage-role.bicep │ │ └── storage/ │ │ └── storage-account.bicep │ ├── main.bicep │ ├── main.parameters.json │ ├── main.test.bicep │ ├── network-isolation.bicep │ └── private-endpoints.bicep ├── locustfile.py ├── ps-rule.yaml ├── pyproject.toml ├── requirements-dev.txt ├── scripts/ │ ├── adlsgen2setup.py │ ├── auth_common.py │ ├── auth_init.ps1 │ ├── auth_init.py │ ├── auth_init.sh │ ├── auth_update.ps1 │ ├── auth_update.py │ ├── auth_update.sh │ ├── copy_prepdocslib.py │ ├── cosmosdb_migration.py │ ├── load-balance-aca-setup.sh │ ├── load_azd_env.py │ ├── load_python_env.ps1 │ ├── load_python_env.sh │ ├── manageacl.py │ ├── prepdocs.ps1 │ ├── prepdocs.sh │ ├── roles.ps1 │ ├── roles.sh │ ├── sampleacls.json │ ├── setup_cloud_ingestion.ps1 │ ├── setup_cloud_ingestion.sh │ └── verify_search_index_acls.py └── tests/ ├── __init__.py ├── conftest.py ├── e2e.py ├── mocks.py ├── snapshots/ │ ├── test_app/ │ │ ├── test_chat_followup/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_handle_exception/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_handle_exception_contentsafety/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── 
test_chat_handle_exception_contentsafety_streaming/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_handle_exception_streaming/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_hybrid/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_hybrid_semantic_captions/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_hybrid_semantic_ranker/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_prompt_template/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_prompt_template_concat/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_seed/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_session_state_persists/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_stream_followup/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_stream_handle_exception/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_stream_session_state_persists/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_stream_text/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_stream_text_filter/ │ │ │ └── auth_client0/ │ │ │ └── result.jsonlines │ │ ├── test_chat_stream_text_reasoning/ │ │ │ ├── reasoning_client0/ │ │ │ │ └── result.jsonlines │ │ │ └── reasoning_client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_stream_vision/ │ │ │ └── client0/ │ │ │ └── result.jsonlines │ │ ├── test_chat_text/ 
│ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_text_agent/ │ │ │ ├── knowledgebase_client0/ │ │ │ │ └── result.json │ │ │ ├── knowledgebase_client1_web/ │ │ │ │ └── result.json │ │ │ └── knowledgebase_client2_sharepoint/ │ │ │ └── result.json │ │ ├── test_chat_text_filter/ │ │ │ └── auth_client0/ │ │ │ └── result.json │ │ ├── test_chat_text_filter_agent/ │ │ │ └── knowledgebase_auth_client0/ │ │ │ └── result.json │ │ ├── test_chat_text_filter_public_documents/ │ │ │ └── auth_public_documents_client0/ │ │ │ └── result.json │ │ ├── test_chat_text_reasoning/ │ │ │ ├── reasoning_client0/ │ │ │ │ └── result.json │ │ │ └── reasoning_client1/ │ │ │ └── result.json │ │ ├── test_chat_text_semantic_ranker/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_text_semanticcaptions/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_text_semanticranker/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_vector/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_vector_semantic_ranker/ │ │ │ ├── client0/ │ │ │ │ └── result.json │ │ │ └── client1/ │ │ │ └── result.json │ │ ├── test_chat_vision/ │ │ │ ├── client0/ │ │ │ │ ├── result.json │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ ├── test_chat_vision_user/ │ │ │ └── auth_client0/ │ │ │ └── result.json │ │ ├── test_chat_vision_vectors/ │ │ │ ├── client0/ │ │ │ │ └── result.jsonlines │ │ │ └── client1/ │ │ │ └── result.jsonlines │ │ └── test_chat_with_history/ │ │ ├── client0/ │ │ │ └── result.json │ │ └── client1/ │ │ └── result.json │ ├── test_authenticationhelper/ │ │ ├── test_auth_setup/ │ │ │ └── result.json │ │ ├── test_auth_setup_required_access_control/ │ │ │ └── result.json │ │ └── 
test_auth_setup_required_access_control_and_unauthenticated_access/ │ │ └── result.json │ ├── test_cosmosdb/ │ │ ├── test_chathistory_getitem/ │ │ │ └── auth_public_documents_client0/ │ │ │ └── result.json │ │ ├── test_chathistory_query/ │ │ │ └── auth_public_documents_client0/ │ │ │ └── result.json │ │ └── test_chathistory_query_continuation/ │ │ └── auth_public_documents_client0/ │ │ └── result.json │ └── test_prepdocslib_textsplitter/ │ ├── test_pages_with_figures/ │ │ ├── pages_with_figures.json/ │ │ │ └── split_pages_with_figures.json │ │ └── pages_with_just_text.json/ │ │ └── split_pages_with_figures.json │ └── test_sentencetextsplitter_list_parse_and_split/ │ └── text_splitter_sections.txt ├── test-data/ │ ├── Simple Figure_content.txt │ ├── Simple Table_content.txt │ ├── pages_with_figures.json │ └── pages_with_just_text.json ├── test_adlsgen2setup.py ├── test_agentic_retrieval.py ├── test_app.py ├── test_app_config.py ├── test_auth_init.py ├── test_authenticationhelper.py ├── test_blob_manager.py ├── test_chatapproach.py ├── test_content_file.py ├── test_cosmosdb.py ├── test_cosmosdb_migration.py ├── test_csvparser.py ├── test_function_apps.py ├── test_htmlparser.py ├── test_jsonparser.py ├── test_listfilestrategy.py ├── test_manageacl.py ├── test_mediadescriber.py ├── test_pdfparser.py ├── test_prepdocs.py ├── test_prepdocslib_filestrategy.py ├── test_prepdocslib_textsplitter.py ├── test_searchmanager.py ├── test_sentencetextsplitter.py ├── test_servicesetup.py ├── test_textparser.py ├── test_textprocessor.py └── test_upload.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .azdo/pipelines/azure-dev.yml ================================================ # Run when commits are pushed to mainline branch (main or master) # Set this to the mainline branch you are using trigger: - main - master # Azure Pipelines workflow to deploy to Azure 
using azd # To configure required secrets and service connection for connecting to Azure, simply run `azd pipeline config --provider azdo` # Task "Install azd" needs to install setup-azd extension for azdo - https://marketplace.visualstudio.com/items?itemName=ms-azuretools.azd # See below for alternative task to install azd if you can't install above task in your organization pool: vmImage: ubuntu-latest steps: - task: setup-azd@0 displayName: Install azd # If you can't install above task in your organization, you can comment it and uncomment below task to install azd # - task: Bash@3 # displayName: Install azd # inputs: # targetType: 'inline' # script: | # curl -fsSL https://aka.ms/install-azd.sh | bash # azd delegate auth to az to use service connection with AzureCLI@2 - pwsh: | azd config set auth.useAzCliAuth "true" displayName: Configure AZD to Use AZ CLI Authentication. - task: AzureCLI@2 displayName: Provision Infrastructure inputs: # azconnection is the service connection created by azd. You can change it to any service connection you have in your organization. 
azureSubscription: azconnection scriptType: bash scriptLocation: inlineScript inlineScript: | azd provision --no-prompt env: AZURE_SUBSCRIPTION_ID: $(AZURE_SUBSCRIPTION_ID) AZURE_ENV_NAME: $(AZURE_ENV_NAME) AZURE_LOCATION: $(AZURE_LOCATION) AZD_INITIAL_ENVIRONMENT_CONFIG: $(AZD_INITIAL_ENVIRONMENT_CONFIG) AZURE_OPENAI_SERVICE: $(AZURE_OPENAI_SERVICE) AZURE_OPENAI_LOCATION: $(AZURE_OPENAI_LOCATION) AZURE_OPENAI_RESOURCE_GROUP: $(AZURE_OPENAI_RESOURCE_GROUP) AZURE_DOCUMENTINTELLIGENCE_SERVICE: $(AZURE_DOCUMENTINTELLIGENCE_SERVICE) AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: $(AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP) AZURE_DOCUMENTINTELLIGENCE_SKU: $(AZURE_DOCUMENTINTELLIGENCE_SKU) AZURE_DOCUMENTINTELLIGENCE_LOCATION: $(AZURE_DOCUMENTINTELLIGENCE_LOCATION) AZURE_SEARCH_INDEX: $(AZURE_SEARCH_INDEX) AZURE_SEARCH_SERVICE: $(AZURE_SEARCH_SERVICE) AZURE_SEARCH_SERVICE_RESOURCE_GROUP: $(AZURE_SEARCH_SERVICE_RESOURCE_GROUP) AZURE_SEARCH_SERVICE_LOCATION: $(AZURE_SEARCH_SERVICE_LOCATION) AZURE_SEARCH_SERVICE_SKU: $(AZURE_SEARCH_SERVICE_SKU) AZURE_SEARCH_QUERY_LANGUAGE: $(AZURE_SEARCH_QUERY_LANGUAGE) AZURE_SEARCH_QUERY_SPELLER: $(AZURE_SEARCH_QUERY_SPELLER) AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER) AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING) AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING) AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT) AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP) AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU) AZURE_APP_SERVICE_SKU: $(AZURE_APP_SERVICE_SKU) AZURE_OPENAI_CHATGPT_MODEL: $(AZURE_OPENAI_CHATGPT_MODEL) AZURE_OPENAI_CHATGPT_DEPLOYMENT: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT) AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY) AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION) AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU: $(AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU) AZURE_OPENAI_REASONING_EFFORT: $(AZURE_OPENAI_REASONING_EFFORT) 
AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT: $(AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT) AZURE_OPENAI_EMB_MODEL_NAME: $(AZURE_OPENAI_EMB_MODEL_NAME) AZURE_OPENAI_EMB_DEPLOYMENT: $(AZURE_OPENAI_EMB_DEPLOYMENT) AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY) AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: $(AZURE_OPENAI_EMB_DEPLOYMENT_VERSION) AZURE_OPENAI_EMB_DEPLOYMENT_SKU: $(AZURE_OPENAI_EMB_DEPLOYMENT_SKU) AZURE_OPENAI_EMB_DIMENSIONS: $(AZURE_OPENAI_EMB_DIMENSIONS) AZURE_OPENAI_DISABLE_KEYS: $(AZURE_OPENAI_DISABLE_KEYS) OPENAI_HOST: $(OPENAI_HOST) OPENAI_API_KEY: $(OPENAI_API_KEY) OPENAI_ORGANIZATION: $(OPENAI_ORGANIZATION) AZURE_USE_APPLICATION_INSIGHTS: $(AZURE_USE_APPLICATION_INSIGHTS) AZURE_APPLICATION_INSIGHTS: $(AZURE_APPLICATION_INSIGHTS) AZURE_APPLICATION_INSIGHTS_DASHBOARD: $(AZURE_APPLICATION_INSIGHTS_DASHBOARD) AZURE_LOG_ANALYTICS: $(AZURE_LOG_ANALYTICS) USE_VECTORS: $(USE_VECTORS) USE_MULTIMODAL: $(USE_MULTIMODAL) USE_CLOUD_INGESTION: $(USE_CLOUD_INGESTION) USE_CLOUD_INGESTION_ACLS: $(USE_CLOUD_INGESTION_ACLS) USE_EXISTING_ADLS_STORAGE: $(USE_EXISTING_ADLS_STORAGE) AZURE_ADLS_GEN2_STORAGE_ACCOUNT: $(AZURE_ADLS_GEN2_STORAGE_ACCOUNT) AZURE_ADLS_GEN2_STORAGE_RESOURCE_GROUP: $(AZURE_ADLS_GEN2_STORAGE_RESOURCE_GROUP) AZURE_VISION_ENDPOINT: $(AZURE_VISION_ENDPOINT) VISION_SECRET_NAME: $(VISION_SECRET_NAME) AZURE_VISION_SERVICE: $(AZURE_VISION_SERVICE) AZURE_VISION_RESOURCE_GROUP: $(AZURE_VISION_RESOURCE_GROUP) AZURE_VISION_LOCATION: $(AZURE_VISION_LOCATION) AZURE_VISION_SKU: $(AZURE_VISION_SKU) ENABLE_LANGUAGE_PICKER: $(ENABLE_LANGUAGE_PICKER) USE_SPEECH_INPUT_BROWSER: $(USE_SPEECH_INPUT_BROWSER) USE_SPEECH_OUTPUT_BROWSER: $(USE_SPEECH_OUTPUT_BROWSER) USE_SPEECH_OUTPUT_AZURE: $(USE_SPEECH_OUTPUT_AZURE) AZURE_SPEECH_SERVICE: $(AZURE_SPEECH_SERVICE) AZURE_SPEECH_SERVICE_RESOURCE_GROUP: $(AZURE_SPEECH_SERVICE_RESOURCE_GROUP) AZURE_SPEECH_SERVICE_LOCATION: $(AZURE_SPEECH_SERVICE_LOCATION) AZURE_SPEECH_SERVICE_SKU: $(AZURE_SPEECH_SERVICE_SKU) 
AZURE_SPEECH_SERVICE_VOICE: $(AZURE_SPEECH_SERVICE_VOICE) AZURE_KEY_VAULT_NAME: $(AZURE_KEY_VAULT_NAME) AZURE_USE_AUTHENTICATION: $(AZURE_USE_AUTHENTICATION) AZURE_ENFORCE_ACCESS_CONTROL: $(AZURE_ENFORCE_ACCESS_CONTROL) AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS: $(AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS) AZURE_ENABLE_UNAUTHENTICATED_ACCESS: $(AZURE_ENABLE_UNAUTHENTICATED_ACCESS) AZURE_TENANT_ID: $(AZURE_TENANT_ID) AZURE_AUTH_TENANT_ID: $(AZURE_AUTH_TENANT_ID) AZURE_SERVER_APP_ID: $(AZURE_SERVER_APP_ID) AZURE_CLIENT_APP_ID: $(AZURE_CLIENT_APP_ID) ALLOWED_ORIGIN: $(ALLOWED_ORIGIN) AZURE_SERVER_APP_SECRET: $(AZURE_SERVER_APP_SECRET) AZURE_CLIENT_APP_SECRET: $(AZURE_CLIENT_APP_SECRET) AZURE_ADLS_GEN2_FILESYSTEM: $(AZURE_ADLS_GEN2_FILESYSTEM) DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET) AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE) USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER) USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU) RAG_SEARCH_TEXT_EMBEDDINGS: $(RAG_SEARCH_TEXT_EMBEDDINGS) RAG_SEARCH_IMAGE_EMBEDDINGS: $(RAG_SEARCH_IMAGE_EMBEDDINGS) RAG_SEND_TEXT_SOURCES: $(RAG_SEND_TEXT_SOURCES) RAG_SEND_IMAGE_SOURCES: $(RAG_SEND_IMAGE_SOURCES) USE_AGENTIC_KNOWLEDGEBASE: $(USE_AGENTIC_KNOWLEDGEBASE) USE_WEB_SOURCE: $(USE_WEB_SOURCE) USE_SHAREPOINT_SOURCE: $(USE_SHAREPOINT_SOURCE) - task: AzureCLI@2 displayName: Deploy Application inputs: azureSubscription: azconnection scriptType: bash scriptLocation: inlineScript inlineScript: | azd deploy --no-prompt ================================================ FILE: .devcontainer/devcontainer.json ================================================ { "name": "Azure Search OpenAI Demo", "image": "mcr.microsoft.com/devcontainers/python:3.13-bookworm", "features": { "ghcr.io/devcontainers/features/node:1": { // This should match the version of Node.js in Github Actions workflows "version": "22", "nodeGypDependencies": false }, "ghcr.io/devcontainers/features/azure-cli:1.2.5": {}, 
"ghcr.io/devcontainers/features/docker-in-docker:2": {}, "ghcr.io/azure/azure-dev/azd:latest": {} }, "customizations": { "vscode": { "extensions": [ "ms-azuretools.azure-dev", "ms-azuretools.vscode-bicep", "ms-python.python", "astral-sh.ty", "esbenp.prettier-vscode", "DavidAnson.vscode-markdownlint" ] } }, "forwardPorts": [ 50505 ], "postCreateCommand": "", "remoteUser": "vscode", "hostRequirements": { "memory": "8gb" } } ================================================ FILE: .gitattributes ================================================ *.sh text eol=lf *.jsonlines text eol=lf ================================================ FILE: .github/CODE_OF_CONDUCT.md ================================================ # Microsoft Open Source Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). Resources: - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ > Please provide us with the following information: > --------------------------------------------------------------- ### This issue is for a: (mark with an `x`) ``` - [ ] bug report -> please search issues before submitting - [ ] feature request - [ ] documentation issue or request - [ ] regression (a behavior that used to work and stopped in a new release) ``` ### Minimal steps to reproduce > ### Any log messages given by the failure > ### Expected/desired behavior > ### OS and Version? > Windows 7, 8 or 10. Linux (which distribution). macOS (Yosemite? El Capitan? Sierra?) ### azd version? > run `azd version` and copy paste here. 
### Versions > ### Mention any other details that might be useful > --------------------------------------------------------------- > Thanks! We'll be in touch soon. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Purpose ## Does this introduce a breaking change? When developers merge from main and run the server, azd up, or azd deploy, will this produce an error? If you're not sure, try it out on an old environment. ``` [ ] Yes [ ] No ``` ## Does this require changes to learn.microsoft.com docs? This repository is referenced by [this tutorial](https://learn.microsoft.com/azure/developer/python/get-started-app-chat-template) which includes deployment, settings and usage instructions. If text or screenshot need to change in the tutorial, check the box below and notify the tutorial author. A Microsoft employee can do this for you if you're an external contributor. ``` [ ] Yes [ ] No ``` ## Type of change ``` [ ] Bugfix [ ] Feature [ ] Code style update (formatting, local variables) [ ] Refactoring (no functional changes, no api changes) [ ] Documentation content changes [ ] Other... Please describe: ``` ## Code quality checklist See [CONTRIBUTING.md](https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/CONTRIBUTING.md#submit-pr) for more details. - [ ] The current tests all pass (`python -m pytest`). - [ ] I added tests that prove my fix is effective or that my feature works - [ ] I ran `python -m pytest --cov` to verify 100% coverage of added lines - [ ] I ran `ty check` to check for type errors - [ ] I either used the pre-commit hooks or ran `ruff` and `black` manually on my code. 
================================================ FILE: .github/agents/fixer.agent.md ================================================ --- description: 'Fix and verify issues in app' tools: ['vscode', 'execute', 'read', 'edit', 'search', 'web', 'agent', 'azure-mcp/search', 'github/create_pull_request', 'github/issue_read', 'github/list_issues', 'github/search_issues', 'playwright/*', 'pylance-mcp-server/*', 'microsoftdocs/mcp/*'] --- # Fixer Mode Instructions You are in fixer mode. When given an issue to fix, follow these steps: 1. **Gather context**: Read error messages/stack traces/related code. If the issue is a GitHub issue link, use the `github/issue_read` tool to fetch the issue and its comments. 2. **Make targeted fix**: Make minimal changes to fix the issue. Do not fix any issues that weren't identified. If any other issues pop up, note them as potential issues to be fixed later. 3. **Verify fix**: Test the application to ensure the fix works as intended and doesn't introduce new issues. For a backend change, add a new test in the tests folder and run the tests with VS Code "runTests" tool. RUN all the tests using that tool, not just the tests you added. Try to add tests to existing test files when possible, like test_app.py. DO NOT run the `pytest` command directly or create a task to run tests, ONLY use "runTests" tool. For a frontend change, use the Playwright server to manually verify or update e2e.py tests. ## Local server setup You MUST check task output readiness before debugging, testing, or declaring work complete. - Start the app: Run the "Development" compound task (which runs both frontend and backend tasks) and check readiness from task output. Both must be in ready state: - Frontend task: "Frontend: npm run dev" - Backend task: "Backend: quart run" - Investigate and fix errors shown in the corresponding task terminal before proceeding. 
You may sometimes see an error with /auth_setup in the frontend task; that's due to the backend server taking longer to start up, and it can be ignored. - Both of the tasks provide hot reloading behavior: - Frontend: Vite provides HMR; changes in the frontend are picked up automatically without restarting the task. - Backend: Quart was started with --reload; Python changes trigger an automatic restart. - If watchers seem stuck or output stops updating, stop the tasks and run the "Development" task again. - To interact with a running application, use the Playwright MCP server. If testing login, you will need to navigate to 'localhost' instead of '127.0.0.1' since that's the URL allowed by the Entra application. ## Running Python scripts If you are running Python scripts that depend on installed requirements, you must run them using the virtual environment in `.venv`. ## Committing the change When the change is complete, offer to make a new branch, git commit, and pull request. DO NOT check out a new branch unless explicitly confirmed - sometimes the user is already on a branch. ## Making the PR * Use the `github/create_pull_request` tool to create the PR. * Follow the `.github/PULL_REQUEST_TEMPLATE.md` format, with all sections filled out and appropriate checkboxes checked. If any section does not apply, write "N/A" in that section. * Include a "Fixes #" sentence (followed by the issue number) in the PR description to auto-close the issue when the PR is merged. 
================================================ FILE: .github/agents/triager.agent.md ================================================ --- description: 'Triage old stale issues for obsolescence and recommend closures' tools: ['edit', 'search/usages', 'web', 'azure-mcp/search', 'github/add_issue_comment', 'github/get_commit', 'github/get_file_contents', 'github/get_latest_release', 'github/get_me', 'github/get_release_by_tag', 'github/get_tag', 'github/issue_read', 'github/issue_write', 'github/list_branches', 'github/list_commits', 'github/list_issue_types', 'github/list_issues', 'github/list_pull_requests', 'github/list_releases', 'github/list_tags', 'github/pull_request_read', 'github/search_code', 'github/search_issues', 'github/search_pull_requests', 'github/search_repositories', 'github/search_users', 'github/assign_copilot_to_issue', 'todo'] --- # Issue Triager You are a GitHub issue triage specialist tasked with finding old stale issues that can be safely closed as obsolete. DO NOT actually close them yourself unless specifically told to do so. Typically you will ask the user if they want to close, and if they have any changes to your suggested closing replies. ## Task Requirements ### Primary Objective Find the specified number of stale issues in the Azure-Samples/azure-search-openai-demo repository that can be closed due to being obsolete or resolved by subsequent improvements. ### Analysis Process 1. **Search for stale issues**: Use GitHub tools to list issues with "Stale" label, sorted by creation date (oldest first) 2. **Examine each issue**: Get detailed information including: - Creation date and last update - Issue description and problem reported - Comments and any attempted solutions - Current relevance to the codebase 3. **Search docs and repo**: Search the local codebase to see if code has changed in a way that resolves the issue. 
Also look at README.md and all the markdown files in /docs to see if the app provides more options that weren't available before. 4. **Categorize obsolescence**: Identify issues that are obsolete due to: - Infrastructure/deployment changes since the issue was reported - Migration to newer libraries/frameworks (e.g., OpenAI SDK updates) - Cross-platform compatibility improvements - Configuration system redesigns - API changes that resolve the underlying problem ### Output Format For each recommended issue closure, provide: 1. **Issue Number and Title** 2. **GitHub Link**: Direct URL to the issue 3. **Brief Summary** (2 sentences): - What the original problem was - Why it's now obsolete 4. **Suggested Closing Reply**: A professional comment explaining: - Why the issue is being closed as obsolete - What changes have made it irrelevant (Only high confidence changes) - Invitation to open a new issue if the problem persists with current version ### Success Criteria - Issues should be at least 1 year old - Issues should have "Stale" label - Must provide clear rationale for why each issue is obsolete - Closing replies should be professional and helpful - Focus on issues that won't recur with current codebase ### Constraints - Do not recommend closing issues that represent ongoing valid feature requests - Avoid closing issues that highlight fundamental design limitations - Skip issues that could still affect current users even if less common - Ensure the obsolescence is due to actual code/infrastructure changes, not just age ### Example Categories to Target - Deployment failures from early 2023 that were fixed by infrastructure improvements - Cross-platform compatibility issues resolved by script migrations - API errors from old library versions that have been updated - Configuration issues resolved by azd template redesigns - Authentication/permissions errors fixed by improved role assignment logic ================================================ FILE: .github/dependabot.yaml 
================================================ version: 2 updates: # Maintain dependencies for GitHub Actions - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" groups: github-actions: patterns: - "*" # Maintain dependencies for npm - package-ecosystem: "npm" directory: "/app/frontend" schedule: interval: "weekly" # Ignore Vite 7.x for now: Vite >=7 requires Node >=20.19.0 while project engines.node is currently >=20.0.0 ignore: - dependency-name: "vite" versions: - ">=7.0.0" # Maintain dependencies for pip - package-ecosystem: "pip" directory: "/" schedule: interval: "weekly" groups: telemetry: patterns: - "opentelemetry-*" - "azure-monitor-opentelemetry*" pydantic: patterns: - "pydantic" - "pydantic-*" - "typing-extensions" pallets: patterns: - "flask" - "werkzeug" - "blinker" - "quart" - "jinja2" - "click" - "itsdangerous" - "markupsafe" ================================================ FILE: .github/instructions/bicep.instructions.md ================================================ --- description: 'Infrastructure as Code with Bicep' applyTo: '**/*.bicep' --- # Bicep best-practices This list of best-practices builds on top of information available at https://learn.microsoft.com/azure/azure-resource-manager/bicep. It provides a more opinionated and up-to-date set of rules for generating high-quality Bicep code. You should aim to follow these rules whenever generating or modifying Bicep code. ## Rules ### General 1. Avoid setting the `name` field for `module` statements - it is no longer required. 1. If you need to input or output a set of logically-grouped values, generate a single `param` or `output` statement with a User-defined type instead of emitting a `param` or `output` statement for each value. 1. If generating parameters, default to generating Bicep parameters files (`*.bicepparam`), instead of ARM parameters files (`*.json`). ### Resources 1. 
Do not add references from child resources to parent resources by using `/` characters in the child resource `name` property. Instead, use the `parent` property with a symbolic reference to the parent resource. 1. If you are generating a child resource type, sometimes this may require you to add an `existing` resource for the parent if the parent is not already present in the file. 1. If you see diagnostic codes `BCP036`, `BCP037` or `BCP081`, this may indicate you have hallucinated resource types or resource type properties. You may need to double-check against available resource type schema to tune your output. 1. Avoid using multiple `resourceId()` functions and `reference()` function where possible. Instead use symbolic names to refer to ids or properties, creating `existing` resources if needed. For example, write `foo.id` or `foo.properties.id`, instead of `resourceId('...')` or `reference('...').id`. ### Types 1. Avoid using open types such as `array` or `object` when referencing types where possible (e.g. in `output` or `param` statements). Instead, use User-defined types to define a more precise type. 1. Use typed variables instead of untyped variables when exporting values with the `@export()` decorator. For example, use `var foo string = 'blah'` instead of `var foo = bar`. 1. When using User-defined types, aim to avoid repetition, and comment properties with `@description()` where the context is unclear. 1. If you are passing data directly to or from a resource body via a `param` or `output` statement, try to use existing Resource-derived types (`resourceInput<'type@version'>` and `resourceOutput<'type@version'>`) instead of writing User-defined types. ### Security 1. When generating `param` or `output` statements, ALWAYS use the `@secure()` decorator if sensitive data is present. ### Syntax 1. 
If you hit warnings or errors with null properties, prefer solving them with the safe-dereference (`.?`) operator, in conjunction with the coalesce (`??`) operator. For example, `a.?b ?? c` is better than `a!.b` which may cause runtime errors, or `a != null ? a.b : c` which is unnecessarily verbose. ## Glossary * Child resource: an Azure resource type with type name consisting of more than 1 `/` characters. For example, `Microsoft.Network/virtualNetworks/subnets` is a child resource. `Microsoft.Network/virtualNetworks` is not. ================================================ FILE: .github/prompts/review_pr_comments.prompt.md ================================================ --- agent: agent --- We have received comments on the current active pull request. Together, we will go through each comment one by one and discuss whether to accept the change, iterate on it, or reject the change. ## Steps to follow: 1. Fetch the active pull request: If available, use the `activePullRequest` tool from the `GitHub Pull Requests` toolset to get the details of the active pull request including the comments. If not, use the GitHub MCP server or GitHub CLI to get the details of the active pull request. Fetch both top level comments and inline comments. 2. Present a list of the comments with a one-sentence summary of each. 3. One at a time, present each comment in full detail and ask me whether to accept, iterate, or reject the change. Provide your recommendation for each comment based on best practices, code quality, and project guidelines. Await user's decision before proceeding to the next comment. DO NOT make any changes to the code or files until I have responded with my decision for each comment. 4. If the decision is to accept or iterate, make the necessary code changes to address the comment. If the decision is to reject, provide a brief explanation of why the change was not made. 5. Wait for user to affirm completion of any code changes made before moving to the next comment. 
6. Reply to each comment on the pull request with the outcome of our discussion (accepted, iterated, or rejected) along with any relevant explanations. ================================================ FILE: .github/skills/github-pr-inline-reply/SKILL.md ================================================ --- name: github-pr-inline-reply description: Reply to inline PR review comments on GitHub pull requests using the GitHub API. Use this skill when you need to respond to individual review comments on a PR, acknowledge feedback, or mark comments as resolved by posting direct replies to comment threads. --- # GitHub PR Inline Reply Skill This skill enables replying directly to inline review comments on GitHub pull requests. ## When to use - Replying to individual PR review comments - Acknowledging reviewer feedback on specific lines of code - Marking review comments as addressed with a reply ## API Endpoint To reply to an inline PR comment, use: ```http POST /repos/{owner}/{repo}/pulls/{pull_number}/comments/{comment_id}/replies ``` With body: ```json { "body": "Your reply message" } ``` ## Using gh CLI ```bash gh api repos/{owner}/{repo}/pulls/{pull_number}/comments/{comment_id}/replies \ -X POST \ -f body="Your reply message" ``` ## Workflow 1. **Get PR comments**: First fetch the PR review comments to get their IDs: ```bash gh api repos/{owner}/{repo}/pulls/{pull_number}/comments ``` 2. **Identify comment IDs**: Each comment has an `id` field. For threaded comments, use the root comment's `id`. 3. 
**Post replies**: For each comment you want to reply to: ```bash gh api repos/{owner}/{repo}/pulls/{pull_number}/comments/{comment_id}/replies \ -X POST \ -f body="Fixed in commit abc123" ``` ## Example Replies For accepted changes: - "Fixed in {commit_sha}" - "Accepted - fixed in {commit_sha}" For rejected changes: - "Rejected - {reason}" - "Won't fix - {explanation}" For questions: - "Good catch, addressed in {commit_sha}" ## Notes - The `comment_id` is the numeric ID from the comment object, NOT the `node_id` - Replies appear as threaded responses under the original comment - You can reply to any comment, including bot comments (like Copilot reviews) ## Resolving Conversations To resolve (mark as resolved) PR review threads, use the GraphQL API: 1. **Get thread IDs**: Query for unresolved threads: ```bash gh api graphql -f query=' query { repository(owner: "{owner}", name: "{repo}") { pullRequest(number: {pull_number}) { reviewThreads(first: 50) { nodes { id isResolved comments(first: 1) { nodes { body path } } } } } } }' ``` 2. **Resolve threads**: Use the `resolveReviewThread` mutation: ```bash gh api graphql -f query=' mutation { resolveReviewThread(input: {threadId: "PRRT_xxx"}) { thread { isResolved } } }' ``` 3. **Resolve multiple threads at once**: ```bash gh api graphql -f query=' mutation { t1: resolveReviewThread(input: {threadId: "PRRT_xxx"}) { thread { isResolved } } t2: resolveReviewThread(input: {threadId: "PRRT_yyy"}) { thread { isResolved } } }' ``` The thread ID starts with `PRRT_` and can be found in the GraphQL query response. Note: This skill can be removed once the GitHub MCP server has added built-in support for replying to PR review comments and resolving threads. 
See: https://github.com/github/github-mcp-server/issues/1323 https://github.com/github/github-mcp-server/issues/1768 ================================================ FILE: .github/workflows/azure-dev-validation.yaml ================================================ name: Validate AZD template on: push: branches: [ main ] paths: - "infra/**" pull_request: branches: [ main ] paths: - "infra/**" workflow_dispatch: jobs: bicep: runs-on: ubuntu-latest permissions: security-events: write steps: - name: Checkout uses: actions/checkout@v6 - name: Build Bicep for linting uses: azure/CLI@v2 with: inlineScript: | export DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 az config set bicep.use_binary_from_path=false && az bicep build -f infra/main.bicep --stdout psrule: runs-on: ubuntu-latest permissions: security-events: write steps: - name: Checkout uses: actions/checkout@v6 - name: Run PSRule analysis uses: microsoft/ps-rule@v2.9.0 with: modules: PSRule.Rules.Azure baseline: Azure.Pillar.Security inputPath: infra/*.test.bicep outputFormat: Sarif outputPath: reports/ps-rule-results.sarif summary: true continue-on-error: true env: PSRULE_CONFIGURATION_AZURE_BICEP_FILE_EXPANSION: 'true' PSRULE_CONFIGURATION_AZURE_BICEP_FILE_EXPANSION_TIMEOUT: '30' - name: Upload results to security tab uses: github/codeql-action/upload-sarif@v4 if: github.repository == 'Azure-Samples/azure-search-openai-demo' with: sarif_file: reports/ps-rule-results.sarif ================================================ FILE: .github/workflows/azure-dev.yml ================================================ name: Deploy on: workflow_dispatch: push: # Run when commits are pushed to mainline branch (main or master) # Set this to the mainline branch you are using branches: - main - master # GitHub Actions workflow to deploy to Azure using azd # To configure required secrets for connecting to Azure, simply run `azd pipeline config` # Set up permissions for deploying with secretless Azure federated credentials # 
https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication permissions: id-token: write contents: read jobs: build: runs-on: ubuntu-latest env: # azd required AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} # project specific AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }} AZURE_OPENAI_LOCATION: ${{ vars.AZURE_OPENAI_LOCATION }} AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }} AZURE_DOCUMENTINTELLIGENCE_SERVICE: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SERVICE }} AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }} AZURE_DOCUMENTINTELLIGENCE_SKU: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SKU }} AZURE_DOCUMENTINTELLIGENCE_LOCATION: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_LOCATION }} AZURE_VISION_SERVICE: ${{ vars.AZURE_VISION_SERVICE }} AZURE_VISION_RESOURCE_GROUP: ${{ vars.AZURE_VISION_RESOURCE_GROUP }} AZURE_VISION_LOCATION: ${{ vars.AZURE_VISION_LOCATION }} AZURE_VISION_SKU: ${{ vars.AZURE_VISION_SKU }} AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX }} AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE }} AZURE_SEARCH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SEARCH_SERVICE_RESOURCE_GROUP }} AZURE_SEARCH_SERVICE_LOCATION: ${{ vars.AZURE_SEARCH_SERVICE_LOCATION }} AZURE_SEARCH_SERVICE_SKU: ${{ vars.AZURE_SEARCH_SERVICE_SKU }} AZURE_SEARCH_QUERY_LANGUAGE: ${{ vars.AZURE_SEARCH_QUERY_LANGUAGE }} AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }} AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }} AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }} AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }} AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }} 
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }} AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }} AZURE_APP_SERVICE_PLAN: ${{ vars.AZURE_APP_SERVICE_PLAN }} AZURE_APP_SERVICE_SKU: ${{ vars.AZURE_APP_SERVICE_SKU }} AZURE_APP_SERVICE: ${{ vars.AZURE_APP_SERVICE }} AZURE_OPENAI_CHATGPT_MODEL: ${{ vars.AZURE_OPENAI_CHATGPT_MODEL }} AZURE_OPENAI_CHATGPT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT }} AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION }} AZURE_OPENAI_REASONING_EFFORT: ${{ vars.AZURE_OPENAI_REASONING_EFFORT }} AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT: ${{ vars.AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT }} AZURE_OPENAI_EMB_MODEL_NAME: ${{ vars.AZURE_OPENAI_EMB_MODEL_NAME }} AZURE_OPENAI_EMB_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT }} AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_VERSION }} AZURE_OPENAI_EMB_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMB_DIMENSIONS }} USE_EVAL: ${{ vars.USE_EVAL }} AZURE_OPENAI_EVAL_MODEL: ${{ vars.AZURE_OPENAI_EVAL_MODEL }} AZURE_OPENAI_EVAL_MODEL_VERSION: ${{ vars.AZURE_OPENAI_EVAL_MODEL_VERSION }} AZURE_OPENAI_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT }} AZURE_OPENAI_EVAL_DEPLOYMENT_SKU: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT_SKU }} AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_DISABLE_KEYS: ${{ vars.AZURE_OPENAI_DISABLE_KEYS }} OPENAI_HOST: ${{ vars.OPENAI_HOST }} OPENAI_API_KEY: ${{ vars.OPENAI_API_KEY }} OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }} AZURE_USE_APPLICATION_INSIGHTS: ${{ vars.AZURE_USE_APPLICATION_INSIGHTS }} AZURE_APPLICATION_INSIGHTS: ${{ vars.AZURE_APPLICATION_INSIGHTS }} AZURE_APPLICATION_INSIGHTS_DASHBOARD: ${{ vars.AZURE_APPLICATION_INSIGHTS_DASHBOARD }} 
AZURE_LOG_ANALYTICS: ${{ vars.AZURE_LOG_ANALYTICS }} USE_VECTORS: ${{ vars.USE_VECTORS }} USE_MULTIMODAL: ${{ vars.USE_MULTIMODAL }} USE_CLOUD_INGESTION: ${{ vars.USE_CLOUD_INGESTION }} USE_CLOUD_INGESTION_ACLS: ${{ vars.USE_CLOUD_INGESTION_ACLS }} USE_EXISTING_ADLS_STORAGE: ${{ vars.USE_EXISTING_ADLS_STORAGE }} AZURE_ADLS_GEN2_STORAGE_ACCOUNT: ${{ vars.AZURE_ADLS_GEN2_STORAGE_ACCOUNT }} AZURE_ADLS_GEN2_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_ADLS_GEN2_STORAGE_RESOURCE_GROUP }} AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }} VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }} ENABLE_LANGUAGE_PICKER: ${{ vars.ENABLE_LANGUAGE_PICKER }} USE_SPEECH_INPUT_BROWSER: ${{ vars.USE_SPEECH_INPUT_BROWSER }} USE_SPEECH_OUTPUT_BROWSER: ${{ vars.USE_SPEECH_OUTPUT_BROWSER }} USE_SPEECH_OUTPUT_AZURE: ${{ vars.USE_SPEECH_OUTPUT_AZURE }} AZURE_SPEECH_SERVICE: ${{ vars.AZURE_SPEECH_SERVICE }} AZURE_SPEECH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SPEECH_RESOURCE_GROUP }} AZURE_SPEECH_SERVICE_LOCATION: ${{ vars.AZURE_SPEECH_SERVICE_LOCATION }} AZURE_SPEECH_SERVICE_SKU: ${{ vars.AZURE_SPEECH_SERVICE_SKU }} AZURE_SPEECH_SERVICE_VOICE: ${{ vars.AZURE_SPEECH_SERVICE_VOICE }} AZURE_KEY_VAULT_NAME: ${{ vars.AZURE_KEY_VAULT_NAME }} AZURE_USE_AUTHENTICATION: ${{ vars.AZURE_USE_AUTHENTICATION }} AZURE_ENFORCE_ACCESS_CONTROL: ${{ vars.AZURE_ENFORCE_ACCESS_CONTROL }} AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS: ${{ vars.AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS }} AZURE_ENABLE_UNAUTHENTICATED_ACCESS: ${{ vars.AZURE_ENABLE_UNAUTHENTICATED_ACCESS }} AZURE_AUTH_TENANT_ID: ${{ vars.AZURE_AUTH_TENANT_ID }} AZURE_SERVER_APP_ID: ${{ vars.AZURE_SERVER_APP_ID }} AZURE_CLIENT_APP_ID: ${{ vars.AZURE_CLIENT_APP_ID }} ALLOWED_ORIGIN: ${{ vars.ALLOWED_ORIGIN }} AZURE_ADLS_GEN2_FILESYSTEM: ${{ vars.AZURE_ADLS_GEN2_FILESYSTEM }} DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }} AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }} USE_CHAT_HISTORY_BROWSER: ${{ 
vars.USE_CHAT_HISTORY_BROWSER }} USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }} USE_AI_PROJECT: ${{ vars.USE_AI_PROJECT }} RAG_SEARCH_TEXT_EMBEDDINGS: ${{ vars.RAG_SEARCH_TEXT_EMBEDDINGS }} RAG_SEARCH_IMAGE_EMBEDDINGS: ${{ vars.RAG_SEARCH_IMAGE_EMBEDDINGS }} RAG_SEND_TEXT_SOURCES: ${{ vars.RAG_SEND_TEXT_SOURCES }} RAG_SEND_IMAGE_SOURCES: ${{ vars.RAG_SEND_IMAGE_SOURCES }} USE_AGENTIC_KNOWLEDGEBASE: ${{ vars.USE_AGENTIC_KNOWLEDGEBASE }} USE_WEB_SOURCE: ${{ vars.USE_WEB_SOURCE }} USE_SHAREPOINT_SOURCE: ${{ vars.USE_SHAREPOINT_SOURCE }} steps: - name: Checkout uses: actions/checkout@v6 - name: Install azd uses: Azure/setup-azd@v2.2.1 - name: Install Nodejs uses: actions/setup-node@v6 with: node-version: 20 - name: Log in with Azure (Federated Credentials) run: | azd auth login ` --client-id "$Env:AZURE_CLIENT_ID" ` --federated-credential-provider "github" ` --tenant-id "$Env:AZURE_TENANT_ID" shell: pwsh - name: Provision Infrastructure run: azd provision --no-prompt env: AZD_INITIAL_ENVIRONMENT_CONFIG: ${{ secrets.AZD_INITIAL_ENVIRONMENT_CONFIG }} AZURE_SERVER_APP_SECRET: ${{ secrets.AZURE_SERVER_APP_SECRET }} AZURE_CLIENT_APP_SECRET: ${{ secrets.AZURE_CLIENT_APP_SECRET }} - name: Deploy Application run: azd deploy --no-prompt ================================================ FILE: .github/workflows/evaluate.yaml ================================================ name: Evaluate RAG answer flow on: issue_comment: types: [created] # Set up permissions for deploying with secretless Azure federated credentials # https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication permissions: id-token: write contents: read issues: write pull-requests: write jobs: evaluate: if: | contains('["OWNER", "CONTRIBUTOR", "COLLABORATOR", "MEMBER"]', github.event.comment.author_association) && github.event.issue.pull_request && github.event.comment.body == '/evaluate' runs-on: 
ubuntu-latest env: # azd required AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }} AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }} AZURE_LOCATION: ${{ vars.AZURE_LOCATION }} # project specific AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }} AZURE_OPENAI_LOCATION: ${{ vars.AZURE_OPENAI_LOCATION }} AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }} AZURE_DOCUMENTINTELLIGENCE_SERVICE: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SERVICE }} AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }} AZURE_DOCUMENTINTELLIGENCE_SKU: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SKU }} AZURE_DOCUMENTINTELLIGENCE_LOCATION: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_LOCATION }} AZURE_VISION_SERVICE: ${{ vars.AZURE_VISION_SERVICE }} AZURE_VISION_RESOURCE_GROUP: ${{ vars.AZURE_VISION_RESOURCE_GROUP }} AZURE_VISION_LOCATION: ${{ vars.AZURE_VISION_LOCATION }} AZURE_VISION_SKU: ${{ vars.AZURE_VISION_SKU }} AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX }} AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE }} AZURE_SEARCH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SEARCH_SERVICE_RESOURCE_GROUP }} AZURE_SEARCH_SERVICE_LOCATION: ${{ vars.AZURE_SEARCH_SERVICE_LOCATION }} AZURE_SEARCH_SERVICE_SKU: ${{ vars.AZURE_SEARCH_SERVICE_SKU }} AZURE_SEARCH_QUERY_LANGUAGE: ${{ vars.AZURE_SEARCH_QUERY_LANGUAGE }} AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }} AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }} AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }} AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }} AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }} AZURE_APP_SERVICE_PLAN: ${{ vars.AZURE_APP_SERVICE_PLAN }} AZURE_APP_SERVICE_SKU: ${{ vars.AZURE_APP_SERVICE_SKU }} AZURE_APP_SERVICE: ${{ vars.AZURE_APP_SERVICE }} AZURE_OPENAI_CHATGPT_MODEL: ${{ vars.AZURE_OPENAI_CHATGPT_MODEL }} 
AZURE_OPENAI_CHATGPT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT }} AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION }} AZURE_OPENAI_EMB_MODEL_NAME: ${{ vars.AZURE_OPENAI_EMB_MODEL_NAME }} AZURE_OPENAI_EMB_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT }} AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_VERSION }} AZURE_OPENAI_EMB_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMB_DIMENSIONS }} USE_EVAL: ${{ vars.USE_EVAL }} AZURE_OPENAI_EVAL_MODEL: ${{ vars.AZURE_OPENAI_EVAL_MODEL }} AZURE_OPENAI_EVAL_MODEL_VERSION: ${{ vars.AZURE_OPENAI_EVAL_MODEL_VERSION }} AZURE_OPENAI_EVAL_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT }} AZURE_OPENAI_EVAL_DEPLOYMENT_SKU: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT_SKU }} AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_DISABLE_KEYS: ${{ vars.AZURE_OPENAI_DISABLE_KEYS }} OPENAI_HOST: ${{ vars.OPENAI_HOST }} OPENAI_API_KEY: ${{ vars.OPENAI_API_KEY }} OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }} AZURE_USE_APPLICATION_INSIGHTS: ${{ vars.AZURE_USE_APPLICATION_INSIGHTS }} AZURE_APPLICATION_INSIGHTS: ${{ vars.AZURE_APPLICATION_INSIGHTS }} AZURE_APPLICATION_INSIGHTS_DASHBOARD: ${{ vars.AZURE_APPLICATION_INSIGHTS_DASHBOARD }} AZURE_LOG_ANALYTICS: ${{ vars.AZURE_LOG_ANALYTICS }} USE_VECTORS: ${{ vars.USE_VECTORS }} USE_MULTIMODAL: ${{ vars.USE_MULTIMODAL }} AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }} VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }} ENABLE_LANGUAGE_PICKER: ${{ vars.ENABLE_LANGUAGE_PICKER }} USE_SPEECH_INPUT_BROWSER: ${{ vars.USE_SPEECH_INPUT_BROWSER }} USE_SPEECH_OUTPUT_BROWSER: ${{ vars.USE_SPEECH_OUTPUT_BROWSER }} USE_SPEECH_OUTPUT_AZURE: ${{ vars.USE_SPEECH_OUTPUT_AZURE }} AZURE_SPEECH_SERVICE: ${{ 
vars.AZURE_SPEECH_SERVICE }} AZURE_SPEECH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SPEECH_RESOURCE_GROUP }} AZURE_SPEECH_SERVICE_LOCATION: ${{ vars.AZURE_SPEECH_SERVICE_LOCATION }} AZURE_SPEECH_SERVICE_SKU: ${{ vars.AZURE_SPEECH_SERVICE_SKU }} AZURE_SPEECH_SERVICE_VOICE: ${{ vars.AZURE_SPEECH_SERVICE_VOICE }} AZURE_KEY_VAULT_NAME: ${{ vars.AZURE_KEY_VAULT_NAME }} AZURE_USE_AUTHENTICATION: ${{ vars.AZURE_USE_AUTHENTICATION }} AZURE_ENFORCE_ACCESS_CONTROL: ${{ vars.AZURE_ENFORCE_ACCESS_CONTROL }} AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS: ${{ vars.AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS }} AZURE_ENABLE_UNAUTHENTICATED_ACCESS: ${{ vars.AZURE_ENABLE_UNAUTHENTICATED_ACCESS }} AZURE_AUTH_TENANT_ID: ${{ vars.AZURE_AUTH_TENANT_ID }} AZURE_SERVER_APP_ID: ${{ vars.AZURE_SERVER_APP_ID }} AZURE_CLIENT_APP_ID: ${{ vars.AZURE_CLIENT_APP_ID }} ALLOWED_ORIGIN: ${{ vars.ALLOWED_ORIGIN }} AZURE_ADLS_GEN2_STORAGE_ACCOUNT: ${{ vars.AZURE_ADLS_GEN2_STORAGE_ACCOUNT }} AZURE_ADLS_GEN2_FILESYSTEM: ${{ vars.AZURE_ADLS_GEN2_FILESYSTEM }} DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }} AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }} USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }} USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }} USE_AI_PROJECT: ${{ vars.USE_AI_PROJECT }} steps: - name: Comment on pull request uses: actions/github-script@v8 with: script: | github.rest.issues.createComment({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, body: "Starting evaluation! Check the Actions tab for progress, or wait for a comment with the results." 
}) - name: Checkout pull request uses: actions/checkout@v6 with: ref: refs/pull/${{ github.event.issue.number }}/head - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true version: "0.9.5" cache-dependency-glob: "requirements**.txt" python-version: "3.11" - name: Setup node uses: actions/setup-node@v6 with: node-version: 20 - name: Install azd uses: Azure/setup-azd@v2.2.1 - name: Login to Azure with az CLI uses: azure/login@v2 with: client-id: ${{ env.AZURE_CLIENT_ID }} tenant-id: ${{ env.AZURE_TENANT_ID }} subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }} - name: Set az account uses: azure/CLI@v2 with: inlineScript: | az account set --subscription ${{env.AZURE_SUBSCRIPTION_ID}} - name: Login to Azure with azd (Federated Credentials) run: | azd auth login ` --client-id "$Env:AZURE_CLIENT_ID" ` --federated-credential-provider "github" ` --tenant-id "$Env:AZURE_TENANT_ID" shell: pwsh - name: Refresh azd environment variables run: | azd env refresh -e $AZURE_ENV_NAME --no-prompt env: AZD_INITIAL_ENVIRONMENT_CONFIG: ${{ secrets.AZD_INITIAL_ENVIRONMENT_CONFIG }} - name: Build frontend run: | cd ./app/frontend npm install npm run build - name: Install dependencies run: | uv pip install -r requirements-dev.txt - name: Run local server in background run: | cd app/backend RUNNER_TRACKING_ID="" && (nohup python3 -m quart --app main:app run --port 50505 > serverlogs.out 2> serverlogs.err &) cd ../.. 
- name: Install evaluate dependencies run: | uv pip install -r evals/requirements.txt - name: Evaluate local RAG flow run: | python evals/evaluate.py --targeturl=http://127.0.0.1:50505/chat --resultsdir=evals/results/pr${{ github.event.issue.number }} - name: Upload eval results as build artifact if: ${{ success() }} uses: actions/upload-artifact@v7 with: name: eval_result path: ./evals/results/pr${{ github.event.issue.number }} - name: Upload server logs as build artifact uses: actions/upload-artifact@v7 with: name: server_logs path: ./app/backend/serverlogs.out - name: Upload server error logs as build artifact uses: actions/upload-artifact@v7 with: name: server_error_logs path: ./app/backend/serverlogs.err - name: Summarize results if: ${{ success() }} run: | echo "## Evaluation results" >> eval-summary.md python -m evaltools summary evals/results --output=markdown >> eval-summary.md echo "## Answer differences across runs" >> run-diff.md python -m evaltools diff evals/results/baseline evals/results/pr${{ github.event.issue.number }} --output=markdown >> run-diff.md cat eval-summary.md >> $GITHUB_STEP_SUMMARY cat run-diff.md >> $GITHUB_STEP_SUMMARY - name: Comment on pull request uses: actions/github-script@v8 with: script: | const fs = require('fs'); const summaryPath = "eval-summary.md"; const summary = fs.readFileSync(summaryPath, 'utf8'); const runId = process.env.GITHUB_RUN_ID; const repo = process.env.GITHUB_REPOSITORY; const actionsUrl = `https://github.com/${repo}/actions/runs/${runId}`; github.rest.issues.createComment({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, body: `${summary}\n\n[Check the workflow run for more details](${actionsUrl}).` }) ================================================ FILE: .github/workflows/frontend.yaml ================================================ name: Frontend linting on: push: branches: [ main ] paths: - "app/frontend/**" pull_request: branches: [ main ] paths: - 
"app/frontend/**" jobs: prettier: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Run prettier on frontend run: | cd ./app/frontend npm install npx prettier --check . ================================================ FILE: .github/workflows/lint-markdown.yml ================================================ name: Validate Markdown on: pull_request: branches: - main paths: - '**.md' jobs: lint-markdown: name: Check for Markdown linting errors runs-on: ubuntu-latest steps: - name: Checkout repo uses: actions/checkout@v6 - name: Run markdownlint-cli2 uses: DavidAnson/markdownlint-cli2-action@v22 with: config: .markdownlint-cli2.jsonc globs: | **/*.md !data/** !.github/** ================================================ FILE: .github/workflows/nightly-jobs.yaml ================================================ name: Nightly Jobs on: schedule: - cron: '0 0 * * *' workflow_dispatch: jobs: python-test: uses: ./.github/workflows/python-test.yaml ================================================ FILE: .github/workflows/python-test.yaml ================================================ name: Python check on: push: branches: [ main ] paths-ignore: - "**.md" - ".azdo/**" - ".devcontainer/**" - ".github/**" pull_request: branches: [ main ] paths-ignore: - "**.md" - ".azdo/**" - ".devcontainer/**" - ".github/**" workflow_call: jobs: test_package: name: Test ${{ matrix.os }} Python ${{ matrix.python_version }} Node ${{ matrix.node_version }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "windows-latest"] python_version: ["3.10", "3.11", "3.12", "3.13", "3.14"] node_version: ["20.14", "22"] steps: - uses: actions/checkout@v6 with: # Fetch full history so diff-cover can compute a merge base with origin/main fetch-depth: 0 - name: Install uv uses: astral-sh/setup-uv@v7 with: enable-cache: true version: "0.9.5" cache-dependency-glob: "requirements**.txt" python-version: ${{ matrix.python_version }} activate-environment: true - name: 
Setup node uses: actions/setup-node@v6 with: node-version: ${{ matrix.node_version }} - name: Build frontend run: | cd ./app/frontend npm install npm run build - name: Check i18n translations run: npx -y @lingual/i18n-check@0.8.12 --locales app/frontend/src/locales -s en -f i18next -r summary - name: Install dependencies run: | uv pip install -r requirements-dev.txt - name: Lint with ruff run: ruff check . - name: Check types with ty run: ty check - name: Check formatting with black run: black . --check --verbose - name: Run Python tests if: runner.os != 'Windows' run: pytest -s -vv --cov --cov-report=xml --cov-fail-under=90 - name: Check diff coverage if: runner.os != 'Windows' run: | BASE_REF="${{ github.base_ref }}" if [ -z "$BASE_REF" ]; then BASE_REF="main"; fi git fetch origin "$BASE_REF:refs/remotes/origin/$BASE_REF" diff-cover coverage.xml --compare-branch="origin/$BASE_REF" --fail-under=90 - name: Run E2E tests with Playwright id: e2e if: runner.os != 'Windows' run: | playwright install chromium --with-deps pytest tests/e2e.py --tracing=retain-on-failure - name: Upload test artifacts if: ${{ failure() && steps.e2e.conclusion == 'failure' }} uses: actions/upload-artifact@v7 with: name: playwright-traces${{ matrix.python_version }} path: test-results ================================================ FILE: .github/workflows/stale-bot.yml ================================================ name: 'Close stale issues and PRs' on: schedule: - cron: '30 1 * * *' jobs: stale: runs-on: ubuntu-latest steps: - uses: actions/stale@v10 with: stale-issue-message: 'This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this issue will be closed.' stale-pr-message: 'This PR is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed.' close-issue-message: 'This issue was closed because it has been stalled for 7 days with no activity.' 
close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.' days-before-issue-stale: 60 days-before-pr-stale: 60 days-before-issue-close: -1 days-before-pr-close: -1 ================================================ FILE: .github/workflows/validate-markdown.yml ================================================ name: Validate Markdown on: # Trigger the workflow on pull request pull_request_target: branches: - main paths: - '**.md' - '**.ipynb' permissions: contents: read pull-requests: write jobs: check-broken-paths: name: Check Broken Relative Paths runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} - name: Check broken Paths id: check-broken-paths uses: john0isaac/action-check-markdown@v1.1.0 with: command: check_broken_paths directory: ./ guide-url: 'https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/CONTRIBUTING.md' github-token: ${{ secrets.GITHUB_TOKEN }} check-urls-locale: if: ${{ always() }} needs: check-broken-paths name: Check URLs Don't Have Locale runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} - name: Run Check URLs Country Locale id: check-urls-locale uses: john0isaac/action-check-markdown@v1.1.0 with: command: check_urls_locale directory: ./ guide-url: 'https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/CONTRIBUTING.md' github-token: ${{ secrets.GITHUB_TOKEN }} check-broken-urls: if: ${{ always() }} name: Check Broken URLs runs-on: ubuntu-latest steps: - name: Checkout Repo uses: actions/checkout@v6 with: ref: ${{ github.event.pull_request.head.sha }} - name: Run Check Broken URLs id: check-broken-urls uses: john0isaac/action-check-markdown@v1.1.0 with: command: check_broken_urls directory: ./ guide-url: 'https://github.com/Azure-Samples/azure-search-openai-demo/blob/main/CONTRIBUTING.md' github-token: ${{ 
secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ # Azure az webapp deployment details .azure *_env # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ coverage_report.html # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv .evalenv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # ty .ty_cache/ # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # NPM npm-debug.log* node_modules static/ app/functions/*/prepdocslib/ app/functions/*/requirements.txt data/**/*.md5 .DS_Store ================================================ FILE: .markdownlint-cli2.jsonc ================================================ { "config": { "default": true, "line-length": false, "table-column-style": false, "MD033": { "allowed_elements": ["br", "details", "summary"] } } } ================================================ FILE: .pre-commit-config.yaml ================================================ exclude: '^tests/snapshots/' repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.2 hooks: - id: ruff - repo: https://github.com/psf/black rev: 26.1.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.1.0 hooks: - id: prettier types_or: [css, javascript, ts, tsx, html] ================================================ FILE: .vscode/extensions.json ================================================ { "recommendations": [ "ms-azuretools.azure-dev", "ms-azuretools.vscode-bicep", "ms-python.python", "astral-sh.ty", "esbenp.prettier-vscode", "DavidAnson.vscode-markdownlint" ] } ================================================ FILE: .vscode/launch.json ================================================ { // Use IntelliSense to learn about possible attributes. 
// Hover to view descriptions of existing attributes. // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ { "name": "Backend (Python)", "type": "debugpy", "request": "launch", "module": "quart", "cwd": "${workspaceFolder}/app/backend", // Use the currently selected interpreter in VS Code. MAC/Linux use /bin, Windows uses /Scripts "python": "${command:python.interpreterPath}", "env": { "QUART_APP": "main:app", "QUART_ENV": "development", "QUART_DEBUG": "0", // Set this to "no-override" if you want env vars here to override AZD env vars "LOADING_MODE_FOR_AZD_ENV_VARS": "override" }, "args": [ "run", "--no-reload", "-p 50505" ], "console": "integratedTerminal", "justMyCode": false }, { "name": "Frontend", "type": "node-terminal", "request": "launch", "command": "npm run dev", "cwd": "${workspaceFolder}/app/frontend", }, { "name": "Tests (Python)", "type": "debugpy", "request": "launch", "program": "${file}", "purpose": ["debug-test"], "console": "integratedTerminal", "justMyCode": false } ], "compounds": [ { "name": "Frontend & Backend", "configurations": ["Backend (Python)", "Frontend"], "stopAll": true } ] } ================================================ FILE: .vscode/settings.json ================================================ { "python.languageServer": "None", // Disabling due to ty using its own full-featured language server "[javascript]": { "editor.defaultFormatter": "esbenp.prettier-vscode", "editor.formatOnSave": true }, "[typescript]": { "editor.defaultFormatter": "esbenp.prettier-vscode", "editor.formatOnSave": true }, "[typescriptreact]": { "editor.defaultFormatter": "esbenp.prettier-vscode", "editor.formatOnSave": true }, "[css]": { "editor.defaultFormatter": "esbenp.prettier-vscode", "editor.formatOnSave": true }, "files.exclude": { "**/__pycache__": true, "**/.coverage": true, "**/.pytest_cache": true, "**/.ruff_cache": true, "**/.mypy_cache": true, "**/.ty_cache": true }, 
"search.exclude": { "**/node_modules": true, "static": true }, "python.testing.pytestArgs": [ "tests" ], "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true } ================================================ FILE: .vscode/tasks.json ================================================ { "version": "2.0.0", "tasks": [ { "label": "Start App", "type": "shell", "command": "${workspaceFolder}/app/start.sh", "windows": { "command": "pwsh ${workspaceFolder}/app/start.ps1" }, "presentation": { "reveal": "silent" }, "options": { "cwd": "${workspaceFolder}/app" }, "problemMatcher": [] }, { "label": "Development", "dependsOn": [ "Frontend: npm run dev", "Backend: quart run" ], "group": { "kind": "build", "isDefault": true } }, { "label": "Frontend: npm run dev", "type": "npm", "script": "dev", "isBackground": true, "options": { "cwd": "${workspaceFolder}/app/frontend" }, "presentation": { "reveal": "always", "group": "buildWatchers", "panel": "dedicated", "clear": false }, "problemMatcher": { "pattern": { "regexp": "" }, "background": { "activeOnStart": true, "beginsPattern": ".*VITE v.*", "endsPattern": ".*(?:➜\\s*)?Local:\\s+https?://.*" } } }, { "label": "Backend: quart run", "type": "shell", "command": "${workspaceFolder}/.venv/bin/python", "windows": { "command": "${workspaceFolder}\\.venv\\Scripts\\python.exe" }, "args": ["-m", "quart", "run", "--reload", "-p", "50505"], "options": { "cwd": "${workspaceFolder}/app/backend", "env": { "QUART_APP": "main:app", "QUART_ENV": "development", "QUART_DEBUG": "0", "LOADING_MODE_FOR_AZD_ENV_VARS": "override" } }, "isBackground": true, "presentation": { "reveal": "always", "group": "buildWatchers", "panel": "dedicated" }, "problemMatcher": { "pattern": { "regexp": "" }, "background": { "activeOnStart": true, "beginsPattern": ".*Serving Quart app.*", "endsPattern": ".*hypercorn.*Running on http://.*" } } } ] } ================================================ FILE: AGENTS.md 
================================================ # Instructions for Coding Agents This file contains instructions for developers working on the Azure Search and OpenAI demo application. It covers the overall code layout, how to add new data, how to add new azd environment variables, how to add new developer settings, and how to add tests for new features. Always keep this file up to date with any changes to the codebase or development process. If necessary, edit this file to ensure it accurately reflects the current state of the project. ## Overall code layout * app: Contains the main application code, including frontend and backend. * app/backend: Contains the Python backend code, written with Quart framework. * app/backend/approaches: Contains the different approaches * app/backend/approaches/approach.py: Base class for all approaches * app/backend/approaches/chatreadretrieveread.py: Chat approach, includes query rewriting step first * app/backend/approaches/promptmanager.py: Manages loading and rendering of Jinja2 prompt templates * app/backend/approaches/prompts/query_rewrite.system.jinja2: Jinja2 template used to rewrite the query based off search history into a better search query * app/backend/approaches/prompts/chat_query_rewrite_tools.json: Tools used by the query rewriting prompt * app/backend/approaches/prompts/chat_answer.system.jinja2: Jinja2 template for the system message used by the Chat approach to answer questions * app/backend/approaches/prompts/chat_answer.user.jinja2: Jinja2 template for the user message used by the Chat approach, including sources * app/backend/prepdocslib: Contains the document ingestion library used by both local and cloud ingestion * app/backend/prepdocslib/blobmanager.py: Manages uploads to Azure Blob Storage * app/backend/prepdocslib/cloudingestionstrategy.py: Builds the Azure AI Search indexer and skillset for the cloud ingestion pipeline * app/backend/prepdocslib/csvparser.py: Parses CSV files * 
app/backend/prepdocslib/embeddings.py: Generates embeddings for text and images using Azure OpenAI * app/backend/prepdocslib/figureprocessor.py: Generates figure descriptions for both local ingestion and the cloud figure-processor skill * app/backend/prepdocslib/fileprocessor.py: Orchestrates parsing and chunking of individual files * app/backend/prepdocslib/filestrategy.py: Strategy for uploading and indexing files (local ingestion) * app/backend/prepdocslib/htmlparser.py: Parses HTML files * app/backend/prepdocslib/integratedvectorizerstrategy.py: Strategy using Azure AI Search integrated vectorization * app/backend/prepdocslib/jsonparser.py: Parses JSON files * app/backend/prepdocslib/listfilestrategy.py: Lists files from local filesystem or Azure Data Lake * app/backend/prepdocslib/mediadescriber.py: Interfaces for describing images (Azure OpenAI GPT-4o, Content Understanding) * app/backend/prepdocslib/page.py: Data classes for pages, images, and chunks * app/backend/prepdocslib/parser.py: Base parser interface * app/backend/prepdocslib/pdfparser.py: Parses PDFs using Azure Document Intelligence or local parser * app/backend/prepdocslib/searchmanager.py: Manages Azure AI Search index creation and updates * app/backend/prepdocslib/servicesetup.py: Shared service setup helpers for OpenAI, embeddings, blob storage, etc. * app/backend/prepdocslib/strategy.py: Base strategy interface for document ingestion * app/backend/prepdocslib/textparser.py: Parses plain text and markdown files * app/backend/prepdocslib/textprocessor.py: Processes text chunks for cloud ingestion (merges figures, generates embeddings) * app/backend/prepdocslib/textsplitter.py: Splits text into chunks using different strategies * app/backend/app.py: The main entry point for the backend application. * app/functions: Azure Functions used for cloud ingestion custom skills (document extraction, figure processing, text processing). 
Each function bundles a synchronized copy of `prepdocslib`; run `python scripts/copy_prepdocslib.py` to refresh the local copies if you modify the library. * app/frontend: Contains the React frontend code, built with TypeScript, built with vite. * app/frontend/src/api: Contains the API client code for communicating with the backend. * app/frontend/src/components: Contains the React components for the frontend. * app/frontend/src/locales: Contains the translation files for internationalization. * app/frontend/src/locales/da/translation.json: Danish translations * app/frontend/src/locales/en/translation.json: English translations * app/frontend/src/locales/es/translation.json: Spanish translations * app/frontend/src/locales/fr/translation.json: French translations * app/frontend/src/locales/it/translation.json: Italian translations * app/frontend/src/locales/ja/translation.json: Japanese translations * app/frontend/src/locales/nl/translation.json: Dutch translations * app/frontend/src/locales/ptBR/translation.json: Portuguese translations * app/frontend/src/locales/tr/translation.json: Turkish translations * app/frontend/src/pages: Contains the main pages of the application * infra: Contains the Bicep templates for provisioning Azure resources. * tests: Contains the test code, including e2e tests, app integration tests, and unit tests. ## Adding new data New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or scripts/prepdocs.ps1 to ingest the data. ## Adding a new azd environment variable An azd environment variable is stored by the azd CLI for each environment. It is passed to the "azd up" command and can configure both provisioning options and application settings. When adding new azd environment variables, update: 1. infra/main.parameters.json : Add the new parameter with a Bicep-friendly variable name and map to the new environment variable 1. 
infra/main.bicep: Add the new Bicep parameter at the top, and add it to the `appEnvVariables` object 1. .azdo/pipelines/azure-dev.yml: Add the new environment variable under `env` section 1. .github/workflows/azure-dev.yml: Add the new environment variable under `env` section You may also need to update: 1. app/backend/prepdocs.py: If the variable is used in the ingestion script, retrieve it from environment variables here. Not always needed. 1. app/backend/app.py: If the variable is used in the backend application, retrieve it from environment variables in setup_clients() function. Not always needed. ## Adding a new setting to "Developer Settings" in RAG app When adding a new developer setting, update: * frontend: * app/frontend/src/api/models.ts : Add to ChatAppRequestOverrides * app/frontend/src/components/Settings.tsx : Add a UI element for the setting * app/frontend/src/locales/*/translation.json: Add a translation for the setting label/tooltip for all languages * app/frontend/src/pages/chat/Chat.tsx: Add the setting to the component, pass it to Settings * backend: * app/backend/approaches/chatreadretrieveread.py : Retrieve from overrides parameter * app/backend/app.py: Some settings may need to be sent down in the /config route. ## When adding tests for a new feature All tests are in the `tests` folder and use the pytest framework. There are three styles of tests: * e2e tests: These use playwright to run the app in a browser and test the UI end-to-end. They are in e2e.py and they mock the backend using the snapshots from the app tests. (Before running e2e tests, make sure to run `npm run build` in app/frontend first to build the frontend code.) * app integration tests: Mostly in test_app.py, these test the app's API endpoints and use mocks for services like Azure OpenAI and Azure Search. * unit tests: The rest of the tests are unit tests that test individual functions and methods. They are in test_*.py files.
When adding a new feature, add tests for it in the appropriate file. If the feature is a UI element, add an e2e test for it. If it is an API endpoint, add an app integration test for it. If it is a function or method, add a unit test for it. Use mocks from tests/conftest.py to mock external services. Prefer mocking at the HTTP/requests level when possible. When you're running tests, make sure you activate the .venv virtual environment first: ```shell source .venv/bin/activate ``` To check for coverage, run the following command: ```shell pytest --cov --cov-report=annotate:cov_annotate ``` Open the cov_annotate directory to view the annotated source code. There will be one file per source file. If a file has 100% source coverage, it means all lines are covered by tests, so you do not need to open the file. For each file that has less than 100% test coverage, find the matching file in cov_annotate and review the file. If a line starts with a ! (exclamation mark), it means that the line is not covered by tests. Add tests to cover the missing lines. ## Sending pull requests When sending pull requests, make sure to follow the PULL_REQUEST_TEMPLATE.md format. ## Upgrading dependencies ### Python backend dependencies To upgrade a particular package in the backend, use the following command, replacing `<package-name>` with the name of the package you want to upgrade: ```shell cd app/backend && uv pip compile requirements.in -o requirements.txt --python-version 3.10 --upgrade-package <package-name> ``` After upgrading, run tests to verify compatibility: ```shell source .venv/bin/activate pytest tests/ ``` ### npm frontend dependencies To upgrade a particular package in the frontend: 1. **Navigate to the frontend directory**: ```shell cd app/frontend ``` 2. **Upgrade the package** (replace `<package-name>` with the package you want to upgrade): ```shell npm install <package-name>@latest ``` 3. **Build the frontend** to verify the upgrade works: ```shell npm run build ``` 4.
**Run all tests** to ensure nothing broke: ```shell # Run e2e tests from the root directory cd ../.. source .venv/bin/activate pytest tests/e2e.py ``` 5. **Commit changes** if the upgrade is successful: ```shell git add package.json package-lock.json git commit -m "chore: upgrade <package-name> to <version>" ``` **Important notes for frontend upgrades**: * When upgrading React or related core packages, you may need to upgrade multiple packages together (e.g., `react`, `react-dom`, `@types/react`, `@types/react-dom`) * Some upgrades may require code changes for API compatibility - check the package's changelog * For major version upgrades of UI libraries like Fluent UI or MSAL, review breaking changes carefully. Manual tests are required for any MSAL changes since the E2E tests do not cover authentication flows. * If npm reports peer dependency conflicts, the `.npmrc` file has `legacy-peer-deps=true` which allows the install to proceed. This is currently needed because `react-helmet-async` declares peer dependencies on React 17/18, but works fine with React 19. ## Checking Python type hints To check Python type hints, use the following command: ```shell ty check ``` Note that we do not currently enforce type hints in the tests folder, as it would require adding a lot of `# type: ignore` comments to the existing tests. We only enforce type hints in the main application code and scripts. ## Python code style Do not use single underscores in front of "private" methods or variables in Python code. We do not follow that convention in this codebase, since this is an application and not a library. ## Deploying the application To deploy the application, use the `azd` CLI tool. Make sure you have the latest version of the `azd` CLI installed. Then, run the following command from the root of the repository: ```shell azd up ``` That command will BOTH provision the Azure resources AND deploy the application code.
If you only changed the Bicep templates and want to re-provision the Azure resources, run: ```shell azd provision ``` If you only changed the application code and want to re-deploy the code, run: ```shell azd deploy ``` If you are using cloud ingestion and only want to deploy individual functions, run the necessary deploy commands, for example: ```shell azd deploy document-extractor azd deploy figure-processor azd deploy text-processor ``` ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit <https://cla.opensource.microsoft.com>. When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
- [Submitting a Pull Request (PR)](#submitting-a-pull-request-pr) - [Setting up the development environment](#setting-up-the-development-environment) - [Running unit tests](#running-unit-tests) - [Running E2E tests](#running-e2e-tests) - [Code style](#code-style) - [Adding new features](#adding-new-features) - [Adding new azd environment variables](#adding-new-azd-environment-variables) - [Adding new UI strings](#adding-new-ui-strings) ## Submitting a Pull Request (PR) Before you submit your Pull Request (PR) consider the following guidelines: - Search the repository (<https://github.com/Azure-Samples/azure-search-openai-demo/pulls>) for an open or closed PR that relates to your submission. You don't want to duplicate effort. - Make your changes in a new git fork - Follow [Code style conventions](#code-style) - [Run the tests](#running-unit-tests) (and write new ones, if needed) - Commit your changes using a descriptive commit message - Push your fork to GitHub - In GitHub, create a pull request to the `main` branch of the repository - Ask a maintainer to review your PR and address any comments they might have ## Setting up the development environment Install the development dependencies: ```shell python -m pip install -r requirements-dev.txt ``` Install the pre-commit hooks: ```shell pre-commit install ``` Compile the JavaScript: ```shell ( cd ./app/frontend ; npm install ; npm run build ) ``` ## Running unit tests Run the tests: ```shell python -m pytest ``` If test snapshots need updating (and the changes are expected), you can update them by running: ```shell python -m pytest --snapshot-update ``` Once tests are passing, generate a coverage report to make sure your changes are covered: ```shell pytest --cov --cov-report=xml && \ diff-cover coverage.xml --html-report coverage_report.html && \ open coverage_report.html ``` ## Running E2E tests Install Playwright browser dependencies: ```shell playwright install --with-deps ``` Run the tests: ```shell python -m pytest tests/e2e.py
--tracing=retain-on-failure ``` When a failure happens, the trace zip will be saved in the test-results folder. You can view that using the Playwright CLI: ```shell playwright show-trace test-results/<trace-zip> ``` You can also use the online trace viewer at <https://trace.playwright.dev/> ## Code style This codebase includes several languages: TypeScript, Python, Bicep, PowerShell, and Bash. Code should follow the standard conventions of each language. For Python, you can enforce the conventions using `ruff` and `black`. Install the development dependencies: ```shell python -m pip install -r requirements-dev.txt ``` Run `ruff` to lint a file: ```shell python -m ruff <path-to-file> ``` Run `black` to format a file: ```shell python -m black <path-to-file> ``` If you followed the steps above to install the pre-commit hooks, then you can just wait for those hooks to run `ruff` and `black` for you. ## Adding new features We recommend using GitHub Copilot Agent mode when adding new features, as this project includes an [AGENTS.md](AGENTS.md) file that instructs Copilot (and other coding agents) about how to generate code for common code changes. If you are not using Copilot Agent mode, consult both that file and the suggestions below. ### Adding new azd environment variables When adding new azd environment variables, please remember to update: 1. [main.parameters.json](./infra/main.parameters.json) 1. [appEnvVariables in main.bicep](./infra/main.bicep) 1. [ADO pipeline](.azdo/pipelines/azure-dev.yml). 1. [GitHub workflows](.github/workflows/azure-dev.yml) ### Adding new UI strings When adding new UI strings, please remember to update all translations. For any translations that you generate with an AI tool, please indicate in the PR description which language's strings were AI-generated.
Here are community contributors that can review translations: | Language | Contributor | |----------|---------------------| | Danish | @EMjetrot | | French | @manekinekko | | Japanese | @bnodir | | Norwegian| @jeannotdamoiseaux | | Portuguese| @glaucia86 | | Spanish | @miguelmsft | | Turkish | @mertcakdogan | | Italian | @ivanvaccarics | | Dutch | | | Polish | @michuhu | ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Azure Samples Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # RAG chat app with Azure OpenAI and Azure AI Search (Python) This solution creates a ChatGPT-like frontend experience over your own documents using RAG (Retrieval Augmented Generation). It uses Azure OpenAI Service to access GPT models, and Azure AI Search for data indexing and retrieval.
This solution's backend is written in Python. There are also [**JavaScript**](https://aka.ms/azai/js/code), [**.NET**](https://aka.ms/azai/net/code), and [**Java**](https://aka.ms/azai/java/code) samples based on this one. Learn more about [developing AI apps using Azure AI Services](https://aka.ms/azai). [![Open in GitHub Codespaces](https://img.shields.io/static/v1?style=for-the-badge&label=GitHub+Codespaces&message=Open&color=brightgreen&logo=github)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=599293758&machine=standardLinux32gb&devcontainer_path=.devcontainer%2Fdevcontainer.json&location=WestUs2) [![Open in Dev Containers](https://img.shields.io/static/v1?style=for-the-badge&label=Dev%20Containers&message=Open&color=blue&logo=visualstudiocode)](https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/azure-samples/azure-search-openai-demo) [![Open in VS Code for the Web](https://img.shields.io/static/v1?style=for-the-badge&label=VS+Code+for+the+Web&message=Open&color=purple&logo=visualstudiocode)](https://vscode.dev/azure?azdTemplateUrl=https://github.com/azure-samples/azure-search-openai-demo) ## Important Security Notice This template, the application code and configuration it contains, has been built to showcase Microsoft Azure specific services and tools. We strongly advise our customers not to make this code part of their production environments without implementing or enabling additional security features. See our [productionizing guide](docs/productionizing.md) for tips, and consult the [Azure OpenAI Landing Zone reference architecture](https://techcommunity.microsoft.com/blog/azurearchitectureblog/azure-openai-landing-zone-reference-architecture/3882102) for more best practices. 
## Table of Contents - [Features](#features) - [Azure account requirements](#azure-account-requirements) - [Cost estimation](#cost-estimation) - [Getting Started](#getting-started) - [GitHub Codespaces](#github-codespaces) - [VS Code Dev Containers](#vs-code-dev-containers) - [Local environment](#local-environment) - [Deploying](#deploying) - [Deploying again](#deploying-again) - [Running the development server](#running-the-development-server) - [Using the app](#using-the-app) - [Clean up](#clean-up) - [Guidance](#guidance) - [Resources](#resources) ![Chat screen](docs/images/chatscreen.png) [📺 Watch a video overview of the app.](https://youtu.be/3acB0OWmLvM) This sample demonstrates a few approaches for creating ChatGPT-like experiences over your own data using the Retrieval Augmented Generation pattern. It uses Azure OpenAI Service to access a GPT model (gpt-4.1-mini), and Azure AI Search for data indexing and retrieval. The repo includes sample data so it's ready to try end to end. In this sample application we use a fictitious company called Zava, and the experience allows its employees to ask questions about the benefits, internal policies, as well as job descriptions and roles. 
## Features - Chat (multi-turn) interface - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options - Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud data ingestion](/docs/data_ingestion.md#cloud-data-ingestion) - Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra - Performance tracing and monitoring with Application Insights ### Architecture Diagram ![RAG Architecture](docs/images/appcomponents.png) ## Azure account requirements **IMPORTANT:** In order to deploy and run this example, you'll need: - **Azure account**. If you're new to Azure, [get an Azure account for free](https://azure.microsoft.com/free/cognitive-search/) and you'll get some free Azure credits to get started. See [guide to deploying with the free trial](docs/deploy_freetrial.md). - **Azure account permissions**: - Your Azure account must have `Microsoft.Authorization/roleAssignments/write` permissions, such as [Role Based Access Control Administrator](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#role-based-access-control-administrator-preview), [User Access Administrator](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#user-access-administrator), or [Owner](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#owner). 
If you don't have subscription-level permissions, you must be granted [RBAC](https://learn.microsoft.com/azure/role-based-access-control/built-in-roles#role-based-access-control-administrator-preview) for an existing resource group and [deploy to that existing group](docs/deploy_existing.md#resource-group). - Your Azure account also needs `Microsoft.Resources/deployments/write` permissions on the subscription level. ### Cost estimation Pricing varies per region and usage, so it isn't possible to predict exact costs for your usage. However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2372a4f9b909b0d032560e41b) for the resources below. - Azure Container Apps: Default host for app deployment as of 10/28/2024. See more details in [the ACA deployment guide](docs/azure_container_apps.md). Consumption plan with 1 CPU core, 2 GB RAM, minimum of 0 replicas. Pricing with Pay-as-You-Go. [Pricing](https://azure.microsoft.com/pricing/details/container-apps/) - Azure Container Registry: Basic tier. [Pricing](https://azure.microsoft.com/pricing/details/container-registry/) - Azure App Service: Only provisioned if you deploy to Azure App Service following [the App Service deployment guide](docs/azure_app_service.md). Basic Tier with 1 CPU core, 1.75 GB RAM. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/app-service/linux/) - Azure OpenAI: Standard tier, GPT and Ada models. Pricing per 1K tokens used, and at least 1K tokens are used per question. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/openai-service/) - Azure AI Document Intelligence: S0 (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/) - Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour.
[Pricing](https://azure.microsoft.com/pricing/details/search/) - Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/) - Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) - Azure AI Vision: Only provisioned if you enabled [multimodal approach](docs/multimodal.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/) - Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per 1K images. [Pricing](https://azure.microsoft.com/pricing/details/content-understanding/) - Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/) To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations. See this guide on [deploying with minimal costs](docs/deploy_lowcost.md) for more details. ⚠️ To avoid unnecessary costs, remember to take down your app if it's no longer in use, either by deleting the resource group in the Portal or running `azd down`. ## Getting Started You have a few options for setting up this project. The easiest way to get started is GitHub Codespaces, since it will set up all the tools for you, but you can also [set it up locally](#local-environment) if desired. 
### GitHub Codespaces You can run this repo virtually by using GitHub Codespaces, which will open a web-based VS Code in your browser: [![Open in GitHub Codespaces](https://img.shields.io/static/v1?style=for-the-badge&label=GitHub+Codespaces&message=Open&color=brightgreen&logo=github)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=599293758&machine=standardLinux32gb&devcontainer_path=.devcontainer%2Fdevcontainer.json&location=WestUs2) Once the codespace opens (this may take several minutes), open a terminal window. ### VS Code Dev Containers A related option is VS Code Dev Containers, which will open the project in your local VS Code using the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers): 1. Start Docker Desktop (install it if not already installed) 2. Open the project: [![Open in Dev Containers](https://img.shields.io/static/v1?style=for-the-badge&label=Dev%20Containers&message=Open&color=blue&logo=visualstudiocode)](https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/azure-samples/azure-search-openai-demo) 3. In the VS Code window that opens, once the project files show up (this may take several minutes), open a terminal window. ### Local environment 1. Install the required tools: - [Azure Developer CLI](https://aka.ms/azure-dev/install) - [Python 3.10, 3.11, 3.12, 3.13, or 3.14](https://www.python.org/downloads/) - **Important**: Python and the pip package manager must be in the path in Windows for the setup scripts to work. - **Important**: Ensure you can run `python --version` from console. On Ubuntu, you might need to run `sudo apt install python-is-python3` to link `python` to `python3`. - [Node.js 20+](https://nodejs.org/download/) - [Git](https://git-scm.com/downloads) - [Powershell 7+ (pwsh)](https://github.com/powershell/powershell) - For Windows users only. 
- **Important**: Ensure you can run `pwsh.exe` from a PowerShell terminal. If this fails, you likely need to upgrade PowerShell. 2. Create a new folder and switch to it in the terminal. 3. Run this command to download the project code: ```shell azd init -t azure-search-openai-demo ``` Note that this command will initialize a git repository, so you do not need to clone this repository. ## Deploying The steps below will provision Azure resources and deploy the application code to Azure Container Apps. To deploy to Azure App Service instead, follow [the app service deployment guide](docs/azure_app_service.md). 1. Login to your Azure account: ```shell azd auth login ``` For GitHub Codespaces users, if the previous command fails, try: ```shell azd auth login --use-device-code ``` 1. Create a new azd environment: ```shell azd env new ``` Enter a name that will be used for the resource group. This will create a new folder in the `.azure` folder, and set it as the active environment for any calls to `azd` going forward. 1. (Optional) This is the point where you can customize the deployment by setting environment variables, in order to [use existing resources](docs/deploy_existing.md), [enable optional features (such as auth or vision)](docs/deploy_features.md), or [deploy low-cost options](docs/deploy_lowcost.md), or [deploy with the Azure free trial](docs/deploy_freetrial.md). 1. Run `azd up` - This will provision Azure resources and deploy this sample to those resources, including building the search index based on the files found in the `./data` folder. - **Important**: Beware that the resources created by this command will incur immediate costs, primarily from the AI Search resource. These resources may accrue costs even if you interrupt the command before it is fully executed. You can run `azd down` or delete the resources manually to avoid unnecessary spending. 
- You will be prompted to select two locations, one for the majority of resources and one for the OpenAI resource, which is currently a short list. That location list is based on the [OpenAI model availability table](https://learn.microsoft.com/azure/cognitive-services/openai/concepts/models#model-summary-table-and-region-availability) and may become outdated as availability changes. 1. After the application has been successfully deployed you will see a URL printed to the console. Click that URL to interact with the application in your browser. It will look like the following: !['Output from running azd up'](docs/images/endpoint.png) > NOTE: It may take 5-10 minutes after you see 'SUCCESS' for the application to be fully deployed. If you see a "Python Developer" welcome screen or an error page, then wait a bit and refresh the page. ### Deploying again If you've only changed the backend/frontend code in the `app` folder, then you don't need to re-provision the Azure resources. You can just run: ```shell azd deploy ``` If you've changed the infrastructure files (`infra` folder or `azure.yaml`), then you'll need to re-provision the Azure resources. You can do that by running: ```shell azd up ``` ## Running the development server You can only run a development server locally **after** having successfully run the `azd up` command. If you haven't yet, follow the [deploying](#deploying) steps above. 1. Run `azd auth login` if you have not logged in recently. 2. Start the server: Windows: ```shell ./app/start.ps1 ``` Linux/Mac: ```shell ./app/start.sh ``` VS Code: Run the "VS Code Task: Start App" task. It's also possible to enable hotloading or the VS Code debugger. See more tips in [the local development guide](docs/localdev.md). ## Using the app - In Azure: navigate to the Azure WebApp deployed by azd. The URL is printed out when azd completes (as "Endpoint"), or you can find it in the Azure portal. 
- Running locally: navigate to 127.0.0.1:50505 Once in the web app: - Try different topics in chat. Try follow up questions, clarifications, ask to simplify or elaborate on answer, etc. - Explore citations and sources - Click on "settings" to try different options, tweak prompts, etc. ## Clean up To clean up all the resources created by this sample: 1. Run `azd down` 2. When asked if you are sure you want to continue, enter `y` 3. When asked if you want to permanently delete the resources, enter `y` The resource group and all the resources will be deleted. ## Guidance You can find extensive documentation in the [docs](docs/README.md) folder: - Deploying: - [Troubleshooting deployment](docs/deploy_troubleshooting.md) - [Debugging the app on App Service](docs/appservice.md) - [Deploying with azd: deep dive and CI/CD](docs/azd.md) - [Deploying with existing Azure resources](docs/deploy_existing.md) - [Deploying from a free account](docs/deploy_lowcost.md) - [Enabling optional features](docs/deploy_features.md) - [All features](docs/deploy_features.md) - [Login and access control](docs/login_and_acl.md) - [Multimodal](docs/multimodal.md) - [Reasoning](docs/reasoning.md) - [Private endpoints](docs/deploy_private.md) - [Agentic retrieval](docs/agentic_retrieval.md) - [Sharing deployment environments](docs/sharing_environments.md) - [Local development](docs/localdev.md) - [Customizing the app](docs/customization.md) - [App architecture](docs/architecture.md) - [HTTP Protocol](docs/http_protocol.md) - [Data ingestion](docs/data_ingestion.md) - [Evaluation](docs/evaluation.md) - [Safety evaluation](docs/safety_evaluation.md) - [Monitoring with Application Insights](docs/monitoring.md) - [Productionizing](docs/productionizing.md) - [Alternative RAG chat samples](docs/other_samples.md) ### Resources - [📖 Docs: Get started using the chat with your data 
sample](https://learn.microsoft.com/azure/developer/python/get-started-app-chat-template?toc=%2Fazure%2Fdeveloper%2Fai%2Ftoc.json&bc=%2Fazure%2Fdeveloper%2Fai%2Fbreadcrumb%2Ftoc.json&tabs=github-codespaces) - [📖 Blog: Revolutionize your Enterprise Data with ChatGPT: Next-gen Apps w/ Azure OpenAI and AI Search](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/revolutionize-your-enterprise-data-with-chatgpt-next-gen-apps-w-azure-openai-and/3762087) - [📖 Docs: Azure AI Search](https://learn.microsoft.com/azure/search/search-what-is-azure-search) - [📖 Docs: Azure OpenAI Service](https://learn.microsoft.com/azure/cognitive-services/openai/overview) - [📖 Docs: Comparing Azure OpenAI and OpenAI](https://learn.microsoft.com/azure/cognitive-services/openai/overview#comparing-azure-openai-and-openai/) - [📖 Blog: Access Control in Generative AI applications with Azure AI Search](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/access-control-in-generative-ai-applications-with-azure-ai-search/3956408) - [📺 Talk: Quickly build and deploy OpenAI apps on Azure, infused with your own data](https://www.youtube.com/watch?v=j8i-OM5kwiY) - [📺 Video: RAG Deep Dive Series](https://techcommunity.microsoft.com/blog/azuredevcommunityblog/rag-deep-dive-watch-all-the-recordings/4383171) ### Getting help This is a sample built to demonstrate the capabilities of modern Generative AI apps and how they can be built in Azure. For help with deploying this sample, please post in [GitHub Issues](/issues). If you're a Microsoft employee, you can also post in [our Teams channel](https://aka.ms/azai-python-help). This repository is supported by the maintainers, _not_ by Microsoft Support, so please use the support mechanisms described above, and we will do our best to help you out. 
For general questions about developing AI solutions on Azure, join the Azure AI Foundry Developer Community: [![Azure AI Foundry Discord](https://img.shields.io/badge/Discord-Azure_AI_Foundry_Community_Discord-blue?style=for-the-badge&logo=discord&color=5865f2&logoColor=fff)](https://aka.ms/foundry/discord) [![Azure AI Foundry Developer Forum](https://img.shields.io/badge/GitHub-Azure_AI_Foundry_Developer_Forum-blue?style=for-the-badge&logo=github&color=000000&logoColor=fff)](https://aka.ms/foundry/forum) ### Note >Note: The PDF documents used in this demo contain information generated using a language model (Azure OpenAI Service). The information contained in these documents is only for demonstration purposes and does not reflect the opinions or beliefs of Microsoft. Microsoft makes no representations or warranties of any kind, express or implied, about the completeness, accuracy, reliability, suitability or availability with respect to the information contained in this document. All rights reserved to Microsoft. ================================================ FILE: SECURITY.md ================================================ # Security Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 
## Reporting Security Issues **Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/msrc/pgp-key-msrc). You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - Full paths of source file(s) related to the manifestation of the issue - The location of the affected source code (tag/branch/commit or direct URL) - Any special configuration required to reproduce the issue - Step-by-step instructions to reproduce the issue - Proof-of-concept or exploit code (if possible) - Impact of the issue, including how an attacker might exploit the issue This information will help us triage your report more quickly. If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. ## Preferred Languages We prefer all communications to be in English. ## Policy Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/msrc/cvd). 
================================================ FILE: app/backend/.dockerignore ================================================ .git __pycache__ *.pyc *.pyo *.pyd .Python env ================================================ FILE: app/backend/Dockerfile ================================================ FROM python:3.13-bookworm WORKDIR /app COPY ./ /app RUN python -m pip install -r requirements.txt RUN python -m pip install gunicorn CMD ["python3", "-m", "gunicorn", "-b", "0.0.0.0:8000", "main:app"] ================================================ FILE: app/backend/app.py ================================================ import dataclasses import io import json import logging import mimetypes import os import time from collections.abc import AsyncGenerator, Awaitable, Callable from pathlib import Path from typing import Any, cast from azure.cognitiveservices.speech import ( ResultReason, SpeechConfig, SpeechSynthesisOutputFormat, SpeechSynthesisResult, SpeechSynthesizer, ) from azure.identity.aio import ( AzureDeveloperCliCredential, ManagedIdentityCredential, get_bearer_token_provider, ) from azure.monitor.opentelemetry import configure_azure_monitor from azure.search.documents.aio import SearchClient from azure.search.documents.indexes.aio import SearchIndexClient from azure.search.documents.knowledgebases.aio import KnowledgeBaseRetrievalClient from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor from opentelemetry.instrumentation.asgi import OpenTelemetryMiddleware from opentelemetry.instrumentation.httpx import ( HTTPXClientInstrumentor, ) from opentelemetry.instrumentation.openai import OpenAIInstrumentor from quart import ( Blueprint, Quart, abort, current_app, jsonify, make_response, request, send_file, send_from_directory, ) from quart_cors import cors from approaches.approach import Approach, DataPoints from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach from approaches.promptmanager import PromptManager 
from chat_history.cosmosdb import chat_history_cosmosdb_bp from config import ( CONFIG_AGENTIC_KNOWLEDGEBASE_ENABLED, CONFIG_AUTH_CLIENT, CONFIG_CHAT_APPROACH, CONFIG_CHAT_HISTORY_BROWSER_ENABLED, CONFIG_CHAT_HISTORY_COSMOS_ENABLED, CONFIG_CREDENTIAL, CONFIG_DEFAULT_REASONING_EFFORT, CONFIG_DEFAULT_RETRIEVAL_REASONING_EFFORT, CONFIG_GLOBAL_BLOB_MANAGER, CONFIG_INGESTER, CONFIG_KNOWLEDGEBASE_CLIENT, CONFIG_KNOWLEDGEBASE_CLIENT_WITH_SHAREPOINT, CONFIG_KNOWLEDGEBASE_CLIENT_WITH_WEB, CONFIG_KNOWLEDGEBASE_CLIENT_WITH_WEB_AND_SHAREPOINT, CONFIG_LANGUAGE_PICKER_ENABLED, CONFIG_MULTIMODAL_ENABLED, CONFIG_OPENAI_CLIENT, CONFIG_QUERY_REWRITING_ENABLED, CONFIG_RAG_SEARCH_IMAGE_EMBEDDINGS, CONFIG_RAG_SEARCH_TEXT_EMBEDDINGS, CONFIG_RAG_SEND_IMAGE_SOURCES, CONFIG_RAG_SEND_TEXT_SOURCES, CONFIG_REASONING_EFFORT_ENABLED, CONFIG_SEARCH_CLIENT, CONFIG_SEMANTIC_RANKER_DEPLOYED, CONFIG_SHAREPOINT_SOURCE_ENABLED, CONFIG_SPEECH_INPUT_ENABLED, CONFIG_SPEECH_OUTPUT_AZURE_ENABLED, CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED, CONFIG_SPEECH_SERVICE_ID, CONFIG_SPEECH_SERVICE_LOCATION, CONFIG_SPEECH_SERVICE_TOKEN, CONFIG_SPEECH_SERVICE_VOICE, CONFIG_STREAMING_ENABLED, CONFIG_USER_BLOB_MANAGER, CONFIG_USER_UPLOAD_ENABLED, CONFIG_VECTOR_SEARCH_ENABLED, CONFIG_WEB_SOURCE_ENABLED, ) from core.authentication import AuthenticationHelper from core.sessionhelper import create_session_id from decorators import authenticated, authenticated_path from error import error_dict, error_response from prepdocs import ( OpenAIHost, setup_embeddings_service, setup_file_processors, setup_image_embeddings_service, setup_openai_client, setup_search_info, ) from prepdocslib.blobmanager import AdlsBlobManager, BlobManager from prepdocslib.embeddings import ImageEmbeddings from prepdocslib.filestrategy import UploadUserFileStrategy from prepdocslib.listfilestrategy import File bp = Blueprint("routes", __name__, static_folder="static") # Fix Windows registry issue with mimetypes mimetypes.add_type("application/javascript", 
".js") mimetypes.add_type("text/css", ".css") @bp.route("/") async def index(): return await bp.send_static_file("index.html") # Empty page is recommended for login redirect to work. # See https://github.com/AzureAD/microsoft-authentication-library-for-js/blob/dev/lib/msal-browser/docs/initialization.md#redirecturi-considerations for more information @bp.route("/redirect") async def redirect(): return "" @bp.route("/favicon.ico") async def favicon(): return await bp.send_static_file("favicon.ico") @bp.route("/assets/") async def assets(path): return await send_from_directory(Path(__file__).resolve().parent / "static" / "assets", path) @bp.route("/content/") @authenticated_path async def content_file(path: str, auth_claims: dict[str, Any]): """ Serve content files from blob storage from within the app to keep the example self-contained. *** NOTE *** if you are using app services authentication, this route will return unauthorized to all users that are not logged in if AZURE_ENFORCE_ACCESS_CONTROL is not set or false, logged in users can access all files regardless of access control if AZURE_ENFORCE_ACCESS_CONTROL is set to true, logged in users can only access files they have access to This is also slow and memory hungry. 
""" # Remove page number from path, filename-1.txt -> filename.txt # This shouldn't typically be necessary as browsers don't send hash fragments to servers if path.find("#page=") > 0: path_parts = path.rsplit("#page=", 1) path = path_parts[0] current_app.logger.info("Opening file %s", path) blob_manager: BlobManager = current_app.config[CONFIG_GLOBAL_BLOB_MANAGER] # Get bytes and properties from the blob manager result = await blob_manager.download_blob(path) if result is None: current_app.logger.info("Path not found in general Blob container: %s", path) if current_app.config[CONFIG_USER_UPLOAD_ENABLED]: user_oid = auth_claims["oid"] user_blob_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER] result = await user_blob_manager.download_blob(path, user_oid=user_oid) if result is None: current_app.logger.exception("Path not found in DataLake: %s", path) if not result: abort(404) content, properties = result if not properties or "content_settings" not in properties: abort(404) mime_type = properties["content_settings"]["content_type"] if mime_type == "application/octet-stream": mime_type = mimetypes.guess_type(path)[0] or "application/octet-stream" # Create a BytesIO object from the bytes blob_file = io.BytesIO(content) return await send_file(blob_file, mimetype=mime_type, as_attachment=False, attachment_filename=path) class JSONEncoder(json.JSONEncoder): def default(self, o): if dataclasses.is_dataclass(o) and not isinstance(o, type): as_dict = dataclasses.asdict(o) if isinstance(o, DataPoints): # Drop optional data point collections that are not populated to keep API surface stable return {k: v for k, v in as_dict.items() if v is not None} data_points_payload = as_dict.get("data_points") if isinstance(as_dict, dict) else None if isinstance(data_points_payload, dict) and data_points_payload.get("citation_activity_details") is None: data_points_payload.pop("citation_activity_details") return as_dict return super().default(o) async def 
format_as_ndjson(r: AsyncGenerator[dict, None]) -> AsyncGenerator[str, None]: try: async for event in r: yield json.dumps(event, ensure_ascii=False, cls=JSONEncoder) + "\n" except Exception as error: logging.exception("Exception while generating response stream: %s", error) yield json.dumps(error_dict(error)) @bp.route("/chat", methods=["POST"]) @authenticated async def chat(auth_claims: dict[str, Any]): if not request.is_json: return jsonify({"error": "request must be json"}), 415 request_json = await request.get_json() context = request_json.get("context", {}) context["auth_claims"] = auth_claims try: approach: Approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) # If session state is provided, persists the session state, # else creates a new session_id depending on the chat history options enabled. session_state = request_json.get("session_state") if session_state is None: session_state = create_session_id( current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED], current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED], ) result = await approach.run( request_json["messages"], context=context, session_state=session_state, ) return jsonify(result) except Exception as error: return error_response(error, "/chat") @bp.route("/chat/stream", methods=["POST"]) @authenticated async def chat_stream(auth_claims: dict[str, Any]): if not request.is_json: return jsonify({"error": "request must be json"}), 415 request_json = await request.get_json() context = request_json.get("context", {}) context["auth_claims"] = auth_claims try: approach: Approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) # If session state is provided, persists the session state, # else creates a new session_id depending on the chat history options enabled. 
session_state = request_json.get("session_state") if session_state is None: session_state = create_session_id( current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED], current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED], ) result = await approach.run_stream( request_json["messages"], context=context, session_state=session_state, ) response = await make_response(format_as_ndjson(result)) response.timeout = None # type: ignore response.mimetype = "application/json-lines" return response except Exception as error: return error_response(error, "/chat") # Send MSAL.js settings to the client UI @bp.route("/auth_setup", methods=["GET"]) def auth_setup(): auth_helper = current_app.config[CONFIG_AUTH_CLIENT] return jsonify(auth_helper.get_auth_setup_for_client()) @bp.route("/config", methods=["GET"]) def config(): return jsonify( { "showMultimodalOptions": current_app.config[CONFIG_MULTIMODAL_ENABLED], "showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED], "showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED], "showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED], "streamingEnabled": current_app.config[CONFIG_STREAMING_ENABLED], "defaultReasoningEffort": current_app.config[CONFIG_DEFAULT_REASONING_EFFORT], "defaultRetrievalReasoningEffort": current_app.config[CONFIG_DEFAULT_RETRIEVAL_REASONING_EFFORT], "showVectorOption": current_app.config[CONFIG_VECTOR_SEARCH_ENABLED], "showUserUpload": current_app.config[CONFIG_USER_UPLOAD_ENABLED], "showLanguagePicker": current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED], "showSpeechInput": current_app.config[CONFIG_SPEECH_INPUT_ENABLED], "showSpeechOutputBrowser": current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED], "showSpeechOutputAzure": current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED], "showChatHistoryBrowser": current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED], "showChatHistoryCosmos": 
current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED], "showAgenticRetrievalOption": current_app.config[CONFIG_AGENTIC_KNOWLEDGEBASE_ENABLED], "ragSearchTextEmbeddings": current_app.config[CONFIG_RAG_SEARCH_TEXT_EMBEDDINGS], "ragSearchImageEmbeddings": current_app.config[CONFIG_RAG_SEARCH_IMAGE_EMBEDDINGS], "ragSendTextSources": current_app.config[CONFIG_RAG_SEND_TEXT_SOURCES], "ragSendImageSources": current_app.config[CONFIG_RAG_SEND_IMAGE_SOURCES], "webSourceEnabled": current_app.config[CONFIG_WEB_SOURCE_ENABLED], "sharepointSourceEnabled": current_app.config[CONFIG_SHAREPOINT_SOURCE_ENABLED], } ) @bp.route("/speech", methods=["POST"]) async def speech(): if not request.is_json: return jsonify({"error": "request must be json"}), 415 speech_token = current_app.config.get(CONFIG_SPEECH_SERVICE_TOKEN) if speech_token is None or speech_token.expires_on < time.time() + 60: speech_token = await current_app.config[CONFIG_CREDENTIAL].get_token( "https://cognitiveservices.azure.com/.default" ) current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = speech_token request_json = await request.get_json() text = request_json["text"] try: # Construct a token as described in documentation: # https://learn.microsoft.com/azure/ai-services/speech-service/how-to-configure-azure-ad-auth?pivots=programming-language-python auth_token = ( "aad#" + current_app.config[CONFIG_SPEECH_SERVICE_ID] + "#" + current_app.config[CONFIG_SPEECH_SERVICE_TOKEN].token ) speech_config = SpeechConfig(auth_token=auth_token, region=current_app.config[CONFIG_SPEECH_SERVICE_LOCATION]) speech_config.speech_synthesis_voice_name = current_app.config[CONFIG_SPEECH_SERVICE_VOICE] speech_config.set_speech_synthesis_output_format(SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3) synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None) result: SpeechSynthesisResult = synthesizer.speak_text_async(text).get() if result.reason == ResultReason.SynthesizingAudioCompleted: return 
result.audio_data, 200, {"Content-Type": "audio/mp3"} elif result.reason == ResultReason.Canceled: cancellation_details = result.cancellation_details current_app.logger.error( "Speech synthesis canceled: %s %s", cancellation_details.reason, cancellation_details.error_details ) raise Exception("Speech synthesis canceled. Check logs for details.") else: current_app.logger.error("Unexpected result reason: %s", result.reason) raise Exception("Speech synthesis failed. Check logs for details.") except Exception as e: current_app.logger.exception("Exception in /speech") return jsonify({"error": str(e)}), 500 @bp.post("/upload") @authenticated async def upload(auth_claims: dict[str, Any]): request_files = await request.files if "file" not in request_files: return jsonify({"message": "No file part in the request", "status": "failed"}), 400 try: user_oid = auth_claims["oid"] file = request_files.getlist("file")[0] adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER] file_url = await adls_manager.upload_blob(file, file.filename, user_oid) ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER] await ingester.add_file(File(content=file, url=file_url, acls={"oids": [user_oid]}), user_oid=user_oid) return jsonify({"message": "File uploaded successfully"}), 200 except Exception as error: current_app.logger.error("Error uploading file: %s", error) return jsonify({"message": "Error uploading file, check server logs for details.", "status": "failed"}), 500 @bp.post("/delete_uploaded") @authenticated async def delete_uploaded(auth_claims: dict[str, Any]): request_json = await request.get_json() filename = request_json.get("filename") user_oid = auth_claims["oid"] adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER] await adls_manager.remove_blob(filename, user_oid) ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER] await ingester.remove_file(filename, user_oid) return jsonify({"message": f"File 
@bp.get("/list_uploaded")
@authenticated
async def list_uploaded(auth_claims: dict[str, Any]):
    """Lists the uploaded documents for the current user.
    Only returns files directly in the user's directory, not in subdirectories.
    Excludes image files and the images directory."""
    user_oid = auth_claims["oid"]
    adls_manager: AdlsBlobManager = current_app.config[CONFIG_USER_BLOB_MANAGER]
    files = await adls_manager.list_blobs(user_oid)
    return jsonify(files), 200


@bp.before_app_serving
async def setup_clients():
    """Read configuration from the environment, build the service clients and publish them on the app.

    Runs once before the app starts serving. It creates the Azure credential, the search,
    knowledge base and blob clients, the authentication helper, the OpenAI client and,
    when enabled, the speech settings and the user-upload ingester. Everything is stored
    in ``current_app.config`` under the ``CONFIG_*`` keys, ending with the chat approach.

    Raises:
        KeyError: a required variable read via ``os.environ[...]`` is missing.
        ValueError: speech output, user upload or web source settings are inconsistent
            (see the individual checks below).
    """
    # Replace these with your own values, either in environment variables or directly here
    AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"]
    AZURE_STORAGE_CONTAINER = os.environ["AZURE_STORAGE_CONTAINER"]
    AZURE_IMAGESTORAGE_CONTAINER = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER")
    AZURE_USERSTORAGE_ACCOUNT = os.environ.get("AZURE_USERSTORAGE_ACCOUNT")
    AZURE_USERSTORAGE_CONTAINER = os.environ.get("AZURE_USERSTORAGE_CONTAINER")
    AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
    AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"
    AZURE_SEARCH_INDEX = os.environ["AZURE_SEARCH_INDEX"]
    AZURE_SEARCH_KNOWLEDGEBASE_NAME = os.getenv("AZURE_SEARCH_KNOWLEDGEBASE_NAME", "")
    # Shared by all OpenAI deployments
    OPENAI_HOST = OpenAIHost(os.getenv("OPENAI_HOST", "azure"))
    OPENAI_CHATGPT_MODEL = os.environ["AZURE_OPENAI_CHATGPT_MODEL"]
    AZURE_OPENAI_KNOWLEDGEBASE_MODEL = os.getenv("AZURE_OPENAI_KNOWLEDGEBASE_MODEL")
    AZURE_OPENAI_KNOWLEDGEBASE_DEPLOYMENT = os.getenv("AZURE_OPENAI_KNOWLEDGEBASE_DEPLOYMENT")
    OPENAI_EMB_MODEL = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-ada-002")
    # "or 1536" also covers a variable that is set but empty
    OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS") or 1536)
    OPENAI_REASONING_EFFORT = os.getenv("AZURE_OPENAI_REASONING_EFFORT")
    # Used with Azure OpenAI deployments
    AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
    AZURE_OPENAI_CHATGPT_DEPLOYMENT = (
        os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT")
        if OPENAI_HOST in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]
        else None
    )
    AZURE_OPENAI_EMB_DEPLOYMENT = (
        os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") if OPENAI_HOST in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM] else None
    )
    AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL")
    AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "")
    AZURE_OPENAI_API_KEY_OVERRIDE = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE")
    # Used only with non-Azure OpenAI deployments
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION")

    AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
    AZURE_USE_AUTHENTICATION = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true"
    AZURE_ENFORCE_ACCESS_CONTROL = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true"
    AZURE_ENABLE_UNAUTHENTICATED_ACCESS = os.getenv("AZURE_ENABLE_UNAUTHENTICATED_ACCESS", "").lower() == "true"
    AZURE_SERVER_APP_ID = os.getenv("AZURE_SERVER_APP_ID")
    AZURE_SERVER_APP_SECRET = os.getenv("AZURE_SERVER_APP_SECRET")
    AZURE_CLIENT_APP_ID = os.getenv("AZURE_CLIENT_APP_ID")
    # Falls back to AZURE_TENANT_ID when no dedicated auth tenant is configured
    AZURE_AUTH_TENANT_ID = os.getenv("AZURE_AUTH_TENANT_ID", AZURE_TENANT_ID)

    KB_FIELDS_CONTENT = os.getenv("KB_FIELDS_CONTENT", "content")
    KB_FIELDS_SOURCEPAGE = os.getenv("KB_FIELDS_SOURCEPAGE", "sourcepage")

    AZURE_SEARCH_QUERY_LANGUAGE = os.getenv("AZURE_SEARCH_QUERY_LANGUAGE") or "en-us"
    AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
    AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
    AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
    # This defaults to the previous field name "embedding", for backwards compatibility
    AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")

    AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
    AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
    AZURE_SPEECH_SERVICE_VOICE = os.getenv("AZURE_SPEECH_SERVICE_VOICE") or "en-US-AndrewMultilingualNeural"

    # Feature flags: most are off unless the variable is exactly "true" (case-insensitive);
    # the RAG_* flags default to on, and USE_VECTORS is on unless it is exactly "false".
    USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "").lower() == "true"
    RAG_SEARCH_TEXT_EMBEDDINGS = os.getenv("RAG_SEARCH_TEXT_EMBEDDINGS", "true").lower() == "true"
    RAG_SEARCH_IMAGE_EMBEDDINGS = os.getenv("RAG_SEARCH_IMAGE_EMBEDDINGS", "true").lower() == "true"
    RAG_SEND_TEXT_SOURCES = os.getenv("RAG_SEND_TEXT_SOURCES", "true").lower() == "true"
    RAG_SEND_IMAGE_SOURCES = os.getenv("RAG_SEND_IMAGE_SOURCES", "true").lower() == "true"
    USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true"
    ENABLE_LANGUAGE_PICKER = os.getenv("ENABLE_LANGUAGE_PICKER", "").lower() == "true"
    USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true"
    USE_SPEECH_OUTPUT_BROWSER = os.getenv("USE_SPEECH_OUTPUT_BROWSER", "").lower() == "true"
    USE_SPEECH_OUTPUT_AZURE = os.getenv("USE_SPEECH_OUTPUT_AZURE", "").lower() == "true"
    USE_CHAT_HISTORY_BROWSER = os.getenv("USE_CHAT_HISTORY_BROWSER", "").lower() == "true"
    USE_CHAT_HISTORY_COSMOS = os.getenv("USE_CHAT_HISTORY_COSMOS", "").lower() == "true"
    USE_AGENTIC_KNOWLEDGEBASE = os.getenv("USE_AGENTIC_KNOWLEDGEBASE", "").lower() == "true"
    USE_WEB_SOURCE = os.getenv("USE_WEB_SOURCE", "").lower() == "true"
    USE_SHAREPOINT_SOURCE = os.getenv("USE_SHAREPOINT_SOURCE", "").lower() == "true"
    AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT = os.getenv("AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT", "low")
    USE_VECTORS = os.getenv("USE_VECTORS", "").lower() != "false"

    # WEBSITE_HOSTNAME is always set by App Service, RUNNING_IN_PRODUCTION is set in main.bicep
    RUNNING_ON_AZURE = os.getenv("WEBSITE_HOSTNAME") is not None or os.getenv("RUNNING_IN_PRODUCTION") is not None

    # Use the current user identity for keyless authentication to Azure services.
    # This assumes you use 'azd auth login' locally, and managed identity when deployed on Azure.
    # The managed identity is setup in the infra/ folder.
    azure_credential: AzureDeveloperCliCredential | ManagedIdentityCredential
    azure_ai_token_provider: Callable[[], Awaitable[str]]
    if RUNNING_ON_AZURE:
        # NOTE(review): this message is logged before branching, so the branch without
        # AZURE_CLIENT_ID emits the same line twice.
        current_app.logger.info("Setting up Azure credential using ManagedIdentityCredential")
        if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"):
            # ManagedIdentityCredential should use AZURE_CLIENT_ID if set in env, but its not working for some reason,
            # so we explicitly pass it in as the client ID here. This is necessary for user-assigned managed identities.
            current_app.logger.info(
                "Setting up Azure credential using ManagedIdentityCredential with client_id %s", AZURE_CLIENT_ID
            )
            azure_credential = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID)
        else:
            current_app.logger.info("Setting up Azure credential using ManagedIdentityCredential")
            azure_credential = ManagedIdentityCredential()
    elif AZURE_TENANT_ID:
        current_app.logger.info(
            "Setting up Azure credential using AzureDeveloperCliCredential with tenant_id %s", AZURE_TENANT_ID
        )
        azure_credential = AzureDeveloperCliCredential(tenant_id=AZURE_TENANT_ID, process_timeout=60)
    else:
        current_app.logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant")
        azure_credential = AzureDeveloperCliCredential(process_timeout=60)
    azure_ai_token_provider = get_bearer_token_provider(
        azure_credential, "https://cognitiveservices.azure.com/.default"
    )

    # Set the Azure credential in the app config for use in other parts of the app
    current_app.config[CONFIG_CREDENTIAL] = azure_credential

    # Set up clients for AI Search and Storage
    search_client = SearchClient(
        endpoint=AZURE_SEARCH_ENDPOINT,
        index_name=AZURE_SEARCH_INDEX,
        credential=azure_credential,
    )
    # NOTE(review): this client is created even when AZURE_SEARCH_KNOWLEDGEBASE_NAME is
    # empty (its default); only the source-specific variants below are gated on the name.
    knowledgebase_client = KnowledgeBaseRetrievalClient(
        endpoint=AZURE_SEARCH_ENDPOINT, knowledge_base_name=AZURE_SEARCH_KNOWLEDGEBASE_NAME, credential=azure_credential
    )
    knowledgebase_client_with_web = None
    knowledgebase_client_with_sharepoint = None
    knowledgebase_client_with_web_and_sharepoint = None
    if AZURE_SEARCH_KNOWLEDGEBASE_NAME:
        # One client per knowledge base variant; variants are addressed by a name suffix.
        if USE_WEB_SOURCE:
            knowledgebase_client_with_web = KnowledgeBaseRetrievalClient(
                endpoint=AZURE_SEARCH_ENDPOINT,
                knowledge_base_name=f"{AZURE_SEARCH_KNOWLEDGEBASE_NAME}-with-web",
                credential=azure_credential,
            )
        if USE_SHAREPOINT_SOURCE:
            knowledgebase_client_with_sharepoint = KnowledgeBaseRetrievalClient(
                endpoint=AZURE_SEARCH_ENDPOINT,
                knowledge_base_name=f"{AZURE_SEARCH_KNOWLEDGEBASE_NAME}-with-sp",
                credential=azure_credential,
            )
        if USE_WEB_SOURCE and USE_SHAREPOINT_SOURCE:
            knowledgebase_client_with_web_and_sharepoint = KnowledgeBaseRetrievalClient(
                endpoint=AZURE_SEARCH_ENDPOINT,
                knowledge_base_name=f"{AZURE_SEARCH_KNOWLEDGEBASE_NAME}-with-web-and-sp",
                credential=azure_credential,
            )

    # Set up the global blob storage manager (used for global content/images, but not user uploads)
    global_blob_manager = BlobManager(
        endpoint=f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net",
        credential=azure_credential,
        container=AZURE_STORAGE_CONTAINER,
        image_container=AZURE_IMAGESTORAGE_CONTAINER,
    )
    current_app.config[CONFIG_GLOBAL_BLOB_MANAGER] = global_blob_manager

    # Set up authentication helper
    search_index = None
    if AZURE_USE_AUTHENTICATION:
        current_app.logger.info("AZURE_USE_AUTHENTICATION is true, setting up search index client")
        search_index_client = SearchIndexClient(
            endpoint=AZURE_SEARCH_ENDPOINT,
            credential=azure_credential,
        )
        # The index client is only needed for this one lookup, so it is closed right away.
        search_index = await search_index_client.get_index(AZURE_SEARCH_INDEX)
        await search_index_client.close()
    auth_helper = AuthenticationHelper(
        search_index=search_index,
        use_authentication=AZURE_USE_AUTHENTICATION,
        server_app_id=AZURE_SERVER_APP_ID,
        server_app_secret=AZURE_SERVER_APP_SECRET,
        client_app_id=AZURE_CLIENT_APP_ID,
        tenant_id=AZURE_AUTH_TENANT_ID,
        enforce_access_control=AZURE_ENFORCE_ACCESS_CONTROL,
        enable_unauthenticated_access=AZURE_ENABLE_UNAUTHENTICATED_ACCESS,
    )

    if USE_SPEECH_OUTPUT_AZURE:
        current_app.logger.info("USE_SPEECH_OUTPUT_AZURE is true, setting up Azure speech service")
        # NOTE(review): the `== ""` halves of these checks are already covered by `not ...`.
        if not AZURE_SPEECH_SERVICE_ID or AZURE_SPEECH_SERVICE_ID == "":
            raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_ID")
        if not AZURE_SPEECH_SERVICE_LOCATION or AZURE_SPEECH_SERVICE_LOCATION == "":
            raise ValueError("Azure speech resource not configured correctly, missing AZURE_SPEECH_SERVICE_LOCATION")
        current_app.config[CONFIG_SPEECH_SERVICE_ID] = AZURE_SPEECH_SERVICE_ID
        current_app.config[CONFIG_SPEECH_SERVICE_LOCATION] = AZURE_SPEECH_SERVICE_LOCATION
        current_app.config[CONFIG_SPEECH_SERVICE_VOICE] = AZURE_SPEECH_SERVICE_VOICE
        # Wait until token is needed to fetch for the first time
        current_app.config[CONFIG_SPEECH_SERVICE_TOKEN] = None

    openai_client, azure_openai_endpoint = setup_openai_client(
        openai_host=OPENAI_HOST,
        azure_credential=azure_credential,
        azure_openai_service=AZURE_OPENAI_SERVICE,
        azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL,
        azure_openai_api_key=AZURE_OPENAI_API_KEY_OVERRIDE,
        openai_api_key=OPENAI_API_KEY,
        openai_organization=OPENAI_ORGANIZATION,
    )

    user_blob_manager = None
    if USE_USER_UPLOAD:
        current_app.logger.info("USE_USER_UPLOAD is true, setting up user upload feature")
        if not AZURE_USERSTORAGE_ACCOUNT or not AZURE_USERSTORAGE_CONTAINER:
            raise ValueError(
                "AZURE_USERSTORAGE_ACCOUNT and AZURE_USERSTORAGE_CONTAINER must be set when USE_USER_UPLOAD is true"
            )
        if not AZURE_ENFORCE_ACCESS_CONTROL:
            raise ValueError("AZURE_ENFORCE_ACCESS_CONTROL must be true when USE_USER_UPLOAD is true")
        user_blob_manager = AdlsBlobManager(
            endpoint=f"https://{AZURE_USERSTORAGE_ACCOUNT}.dfs.core.windows.net",
            container=AZURE_USERSTORAGE_CONTAINER,
            credential=azure_credential,
        )
        current_app.config[CONFIG_USER_BLOB_MANAGER] = user_blob_manager

        # Set up ingester
        file_processors, figure_processor = setup_file_processors(
            azure_credential=azure_credential,
            document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"),
            local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true",
            local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER", "").lower() == "true",
            use_content_understanding=os.getenv("USE_CONTENT_UNDERSTANDING", "").lower() == "true",
            content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
            use_multimodal=USE_MULTIMODAL,
            openai_client=openai_client,
            openai_model=OPENAI_CHATGPT_MODEL,
            # NOTE(review): only OpenAIHost.AZURE gets the deployment here, whereas the
            # deployment itself is read for AZURE and AZURE_CUSTOM above — confirm intended.
            openai_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT if OPENAI_HOST == OpenAIHost.AZURE else None,
        )
        search_info = setup_search_info(
            search_service=AZURE_SEARCH_SERVICE,
            index_name=AZURE_SEARCH_INDEX,
            azure_credential=azure_credential,
            use_agentic_knowledgebase=USE_AGENTIC_KNOWLEDGEBASE,
            azure_openai_endpoint=azure_openai_endpoint,
            knowledgebase_name=AZURE_SEARCH_KNOWLEDGEBASE_NAME,
            azure_openai_knowledgebase_deployment=AZURE_OPENAI_KNOWLEDGEBASE_DEPLOYMENT,
            azure_openai_knowledgebase_model=AZURE_OPENAI_KNOWLEDGEBASE_MODEL,
        )

        text_embeddings_service = None
        if USE_VECTORS:
            text_embeddings_service = setup_embeddings_service(
                open_ai_client=openai_client,
                openai_host=OPENAI_HOST,
                emb_model_name=OPENAI_EMB_MODEL,
                emb_model_dimensions=OPENAI_EMB_DIMENSIONS,
                azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
                azure_openai_endpoint=azure_openai_endpoint,
            )

        image_embeddings_service = setup_image_embeddings_service(
            azure_credential=azure_credential,
            vision_endpoint=AZURE_VISION_ENDPOINT,
            use_multimodal=USE_MULTIMODAL,
        )

        ingester = UploadUserFileStrategy(
            search_info=search_info,
            file_processors=file_processors,
            embeddings=text_embeddings_service,
            image_embeddings=image_embeddings_service,
            search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
            blob_manager=user_blob_manager,
            figure_processor=figure_processor,
        )
        current_app.config[CONFIG_INGESTER] = ingester

    image_embeddings_client = None
    if USE_MULTIMODAL:
        image_embeddings_client = ImageEmbeddings(AZURE_VISION_ENDPOINT, azure_ai_token_provider)

    # Publish clients and feature flags for the route handlers
    current_app.config[CONFIG_OPENAI_CLIENT] = openai_client
    current_app.config[CONFIG_SEARCH_CLIENT] = search_client
    current_app.config[CONFIG_KNOWLEDGEBASE_CLIENT] = knowledgebase_client
    current_app.config[CONFIG_KNOWLEDGEBASE_CLIENT_WITH_WEB] = knowledgebase_client_with_web
    current_app.config[CONFIG_KNOWLEDGEBASE_CLIENT_WITH_SHAREPOINT] = knowledgebase_client_with_sharepoint
    current_app.config[CONFIG_KNOWLEDGEBASE_CLIENT_WITH_WEB_AND_SHAREPOINT] = (
        knowledgebase_client_with_web_and_sharepoint
    )
    current_app.config[CONFIG_AUTH_CLIENT] = auth_helper

    current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
    # Query rewriting is only enabled together with the semantic ranker
    current_app.config[CONFIG_QUERY_REWRITING_ENABLED] = (
        AZURE_SEARCH_QUERY_REWRITING == "true" and AZURE_SEARCH_SEMANTIC_RANKER != "disabled"
    )
    current_app.config[CONFIG_DEFAULT_REASONING_EFFORT] = OPENAI_REASONING_EFFORT
    current_app.config[CONFIG_DEFAULT_RETRIEVAL_REASONING_EFFORT] = AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT
    current_app.config[CONFIG_REASONING_EFFORT_ENABLED] = OPENAI_CHATGPT_MODEL in Approach.GPT_REASONING_MODELS
    # Streaming stays on for non-reasoning models and for reasoning models flagged as streaming-capable
    current_app.config[CONFIG_STREAMING_ENABLED] = (
        OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS
        or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming
    )
    current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = bool(USE_VECTORS)
    current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD)
    current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER
    current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER
    current_app.config[CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED] = USE_SPEECH_OUTPUT_BROWSER
    current_app.config[CONFIG_SPEECH_OUTPUT_AZURE_ENABLED] = USE_SPEECH_OUTPUT_AZURE
    current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED] = USE_CHAT_HISTORY_BROWSER
    current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED] = USE_CHAT_HISTORY_COSMOS
    current_app.config[CONFIG_AGENTIC_KNOWLEDGEBASE_ENABLED] = USE_AGENTIC_KNOWLEDGEBASE
    current_app.config[CONFIG_MULTIMODAL_ENABLED] = USE_MULTIMODAL
    current_app.config[CONFIG_RAG_SEARCH_TEXT_EMBEDDINGS] = RAG_SEARCH_TEXT_EMBEDDINGS
    current_app.config[CONFIG_RAG_SEARCH_IMAGE_EMBEDDINGS] = RAG_SEARCH_IMAGE_EMBEDDINGS
    current_app.config[CONFIG_RAG_SEND_TEXT_SOURCES] = RAG_SEND_TEXT_SOURCES
    current_app.config[CONFIG_RAG_SEND_IMAGE_SOURCES] = RAG_SEND_IMAGE_SOURCES
    current_app.config[CONFIG_WEB_SOURCE_ENABLED] = USE_WEB_SOURCE
    if AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT == "minimal" and current_app.config[CONFIG_WEB_SOURCE_ENABLED]:
        raise ValueError("Web source cannot be used with minimal retrieval reasoning effort")
    current_app.config[CONFIG_SHAREPOINT_SOURCE_ENABLED] = USE_SHAREPOINT_SOURCE

    prompt_manager = PromptManager()

    # ChatReadRetrieveReadApproach is used by /chat for multi-turn conversation
    current_app.config[CONFIG_CHAT_APPROACH] = ChatReadRetrieveReadApproach(
        search_client=search_client,
        search_index_name=AZURE_SEARCH_INDEX,
        knowledgebase_model=AZURE_OPENAI_KNOWLEDGEBASE_MODEL,
        knowledgebase_deployment=AZURE_OPENAI_KNOWLEDGEBASE_DEPLOYMENT,
        knowledgebase_client=knowledgebase_client,
        knowledgebase_client_with_web=knowledgebase_client_with_web,
        knowledgebase_client_with_sharepoint=knowledgebase_client_with_sharepoint,
        knowledgebase_client_with_web_and_sharepoint=knowledgebase_client_with_web_and_sharepoint,
        openai_client=openai_client,
        chatgpt_model=OPENAI_CHATGPT_MODEL,
        chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
        embedding_model=OPENAI_EMB_MODEL,
        embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
        embedding_dimensions=OPENAI_EMB_DIMENSIONS,
        embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
        sourcepage_field=KB_FIELDS_SOURCEPAGE,
        content_field=KB_FIELDS_CONTENT,
        query_language=AZURE_SEARCH_QUERY_LANGUAGE,
        query_speller=AZURE_SEARCH_QUERY_SPELLER,
        prompt_manager=prompt_manager,
        reasoning_effort=OPENAI_REASONING_EFFORT,
        multimodal_enabled=USE_MULTIMODAL,
        image_embeddings_client=image_embeddings_client,
        global_blob_manager=global_blob_manager,
        user_blob_manager=user_blob_manager,
        use_web_source=current_app.config[CONFIG_WEB_SOURCE_ENABLED],
        use_sharepoint_source=current_app.config[CONFIG_SHAREPOINT_SOURCE_ENABLED],
        retrieval_reasoning_effort=AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT,
    )
use_sharepoint_source=current_app.config[CONFIG_SHAREPOINT_SOURCE_ENABLED], retrieval_reasoning_effort=AGENTIC_KNOWLEDGEBASE_REASONING_EFFORT, ) @bp.after_app_serving async def close_clients(): await current_app.config[CONFIG_SEARCH_CLIENT].close() await current_app.config[CONFIG_GLOBAL_BLOB_MANAGER].close_clients() if user_blob_manager := current_app.config.get(CONFIG_USER_BLOB_MANAGER): await user_blob_manager.close_clients() await current_app.config[CONFIG_CREDENTIAL].close() def create_app(): app = Quart(__name__) app.register_blueprint(bp) app.register_blueprint(chat_history_cosmosdb_bp) if os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING"): app.logger.info("APPLICATIONINSIGHTS_CONNECTION_STRING is set, enabling Azure Monitor") configure_azure_monitor( instrumentation_options={ "django": {"enabled": False}, "psycopg2": {"enabled": False}, "fastapi": {"enabled": False}, } ) # This tracks HTTP requests made by aiohttp: AioHttpClientInstrumentor().instrument() # This tracks HTTP requests made by httpx: HTTPXClientInstrumentor().instrument() # This tracks OpenAI SDK requests: OpenAIInstrumentor().instrument() # This middleware tracks app route requests: app.asgi_app = OpenTelemetryMiddleware(app.asgi_app) # type: ignore[assignment] # Log levels should be one of https://docs.python.org/3/library/logging.html#logging-levels # Set root level to WARNING to avoid seeing overly verbose logs from SDKS logging.basicConfig(level=logging.WARNING) # Set our own logger levels to INFO by default app_level = os.getenv("APP_LOG_LEVEL", "INFO") app.logger.setLevel(os.getenv("APP_LOG_LEVEL", app_level)) logging.getLogger("scripts").setLevel(app_level) if allowed_origin := os.getenv("ALLOWED_ORIGIN"): allowed_origins = allowed_origin.split(";") if len(allowed_origins) > 0: app.logger.info("CORS enabled for %s", allowed_origins) cors(app, allow_origin=allowed_origins, allow_methods=["GET", "POST"]) return app ================================================ FILE: 
app/backend/approaches/__init__.py
================================================

================================================
FILE: app/backend/approaches/approach.py
================================================
import base64
import json
import re
from abc import ABC
from collections.abc import AsyncGenerator, Awaitable
from dataclasses import asdict, dataclass, field
from typing import Any, Optional, TypedDict, cast

from azure.search.documents.aio import SearchClient
from azure.search.documents.knowledgebases.aio import KnowledgeBaseRetrievalClient
from azure.search.documents.knowledgebases.models import (
    KnowledgeBaseMessage,
    KnowledgeBaseMessageTextContent,
    KnowledgeBaseRemoteSharePointActivityRecord,
    KnowledgeBaseRemoteSharePointReference,
    KnowledgeBaseRetrievalRequest,
    KnowledgeBaseRetrievalResponse,
    KnowledgeBaseSearchIndexActivityRecord,
    KnowledgeBaseSearchIndexReference,
    KnowledgeBaseWebActivityRecord,
    KnowledgeBaseWebReference,
    KnowledgeRetrievalLowReasoningEffort,
    KnowledgeRetrievalMediumReasoningEffort,
    KnowledgeRetrievalMinimalReasoningEffort,
    KnowledgeRetrievalSemanticIntent,
    KnowledgeSourceParams,
    RemoteSharePointKnowledgeSourceParams,
    SearchIndexKnowledgeSourceParams,
    WebKnowledgeSourceParams,
)
from azure.search.documents.models import (
    QueryCaptionResult,
    QueryType,
    VectorizedQuery,
    VectorQuery,
)
from openai import AsyncOpenAI, AsyncStream
from openai.types import CompletionUsage
from openai.types.chat import (
    ChatCompletion,
    ChatCompletionChunk,
    ChatCompletionMessageFunctionToolCall,
    ChatCompletionMessageParam,
    ChatCompletionReasoningEffort,
    ChatCompletionToolParam,
)

from approaches.promptmanager import PromptManager
from prepdocslib.blobmanager import AdlsBlobManager, BlobManager
from prepdocslib.embeddings import ImageEmbeddings


@dataclass
class ActivityDetail:
    """One retrieval activity reported by agentic retrieval, attached to the results it produced."""

    id: int  # Activity id as reported in the retrieval response
    number: int  # 1-based position of the activity in the response's activity list
    type: str  # Activity type; empty string when the response has none
    source: str  # Knowledge source name; empty string when the activity has none
    query: str  # Search query the activity ran; empty string when none was recorded


@dataclass
class Document:
    """A search index hit, or an agentic retrieval reference backed by the search index."""

    id: Optional[str] = None
    ref_id: Optional[str] = None  # Reference id from agentic retrieval (if applicable)
    content: Optional[str] = None
    category: Optional[str] = None
    sourcepage: Optional[str] = None
    sourcefile: Optional[str] = None
    oids: Optional[list[str]] = None
    groups: Optional[list[str]] = None
    captions: Optional[list[QueryCaptionResult]] = None
    score: Optional[float] = None
    reranker_score: Optional[float] = None
    activity: Optional[ActivityDetail] = None
    images: Optional[list[dict[str, Any]]] = None

    def serialize_for_results(self) -> dict[str, Any]:
        """Return a plain-dict view of the document, tagged with type "searchIndex".

        Captions are flattened to dicts (an empty list when there are none) and the
        activity is converted with ``asdict``. ``ref_id`` is not part of the output.
        """
        result_dict = {
            "type": "searchIndex",
            "id": self.id,
            "content": self.content,
            "category": self.category,
            "sourcepage": self.sourcepage,
            "sourcefile": self.sourcefile,
            "oids": self.oids,
            "groups": self.groups,
            "captions": (
                [
                    {
                        "additional_properties": caption.additional_properties,
                        "text": caption.text,
                        "highlights": caption.highlights,
                    }
                    for caption in self.captions
                ]
                if self.captions
                else []
            ),
            "score": self.score,
            "reranker_score": self.reranker_score,
            "activity": asdict(self.activity) if self.activity else None,
            "images": self.images,
        }
        return result_dict
@dataclass
class WebResult:
    """A web reference returned by agentic retrieval."""

    id: Optional[str] = None
    title: Optional[str] = None
    url: Optional[str] = None
    activity: Optional[ActivityDetail] = None

    def serialize_for_results(self) -> dict[str, Any]:
        """Return a plain-dict view tagged with type "web"; ``ref_id`` is ``str(id)``."""
        activity_payload = None
        if self.activity:
            activity_payload = asdict(self.activity)
        return dict(
            type="web",
            id=self.id,
            ref_id=str(self.id),
            title=self.title,
            url=self.url,
            activity=activity_payload,
        )


@dataclass
class SharePointResult:
    """A remote SharePoint reference returned by agentic retrieval."""

    id: Optional[str] = None
    web_url: Optional[str] = None
    content: Optional[str] = None
    title: Optional[str] = None
    reranker_score: Optional[float] = None
    activity: Optional[ActivityDetail] = None

    def serialize_for_results(self) -> dict[str, Any]:
        """Return a plain-dict view tagged with type "remoteSharePoint"; ``ref_id`` is ``str(id)``."""
        activity_payload = None
        if self.activity:
            activity_payload = asdict(self.activity)
        return dict(
            type="remoteSharePoint",
            id=self.id,
            ref_id=str(self.id),
            web_url=self.web_url,
            content=self.content,
            title=self.title,
            reranker_score=self.reranker_score,
            activity=activity_payload,
        )


@dataclass
class RewriteQueryResult:
    """Outcome of a query-rewrite call: the chosen query plus the request and response behind it."""

    query: str
    messages: list[ChatCompletionMessageParam]
    completion: ChatCompletion
    reasoning_effort: ChatCompletionReasoningEffort
@dataclass
class ThoughtStep:
    """One entry of the "thought process" trace returned alongside an answer."""

    title: str
    description: Optional[Any]
    props: Optional[dict[str, Any]] = None

    def update_token_usage(self, usage: CompletionUsage) -> None:
        """Record ``usage`` under ``props["token_usage"]``.

        Does nothing when the step has no ``props`` dict at all. An empty dict is a
        valid container and receives the usage; a plain truthiness test would skip it.
        """
        if self.props is not None:
            self.props["token_usage"] = TokenUsageProps.from_completion_usage(usage)


@dataclass
class AgenticRetrievalResults:
    """Results from agentic retrieval including activities, documents, web results, SharePoint results, and optional answer."""

    response: KnowledgeBaseRetrievalResponse
    documents: list[Document]
    web_results: list[WebResult]
    sharepoint_results: list[SharePointResult] = field(default_factory=list)
    answer: Optional[str] = None  # Synthesized answer when web knowledge source is used
    rewrite_result: Optional[RewriteQueryResult] = None
    activity_details_by_id: Optional[dict[int, ActivityDetail]] = None
    thoughts: list[ThoughtStep] = field(default_factory=list)


@dataclass
class DataPoints:
    """Source material attached to an answer: text, images, citations and external-result metadata."""

    text: Optional[list[str]] = None
    images: Optional[list] = None
    citations: Optional[list[str]] = None
    external_results_metadata: Optional[list[dict[str, Any]]] = None
    citation_activity_details: Optional[dict[str, dict[str, Any]]] = None


@dataclass
class ExtraInfo:
    """Supplementary payload returned with an answer: data points, thought steps and follow-ups."""

    data_points: DataPoints
    thoughts: list[ThoughtStep] = field(default_factory=list)
    followup_questions: Optional[list[Any]] = None
    answer: Optional[str] = None  # Only when web knowledge source is used


@dataclass
class TokenUsageProps:
    """Token counts copied from a completion's usage record."""

    prompt_tokens: int
    completion_tokens: int
    reasoning_tokens: Optional[int]
    total_tokens: int

    @classmethod
    def from_completion_usage(cls, usage: CompletionUsage) -> "TokenUsageProps":
        """Build the props from ``usage``; ``reasoning_tokens`` is None when no token details are present."""
        return cls(
            prompt_tokens=usage.prompt_tokens,
            completion_tokens=usage.completion_tokens,
            reasoning_tokens=(
                usage.completion_tokens_details.reasoning_tokens if usage.completion_tokens_details else None
            ),
            total_tokens=usage.total_tokens,
        )


# GPT reasoning models don't support the same set of parameters as other models
#
# https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning
@dataclass
class GPTReasoningModelSupport:
    """Capability flags recorded per GPT reasoning model."""

    streaming: bool  # Whether the model is flagged as supporting streamed responses
    minimal_effort: bool  # Presumably whether the "minimal" reasoning effort is accepted — confirm against usage


class Approach(ABC):
    """Base class holding the shared clients, model settings and retrieval helpers for approaches."""

    # List of GPT reasoning models support
    GPT_REASONING_MODELS = {
        "o1": GPTReasoningModelSupport(streaming=False, minimal_effort=False),
        "o3": GPTReasoningModelSupport(streaming=True, minimal_effort=False),
        "o3-mini": GPTReasoningModelSupport(streaming=True, minimal_effort=False),
        "o4-mini": GPTReasoningModelSupport(streaming=True, minimal_effort=False),
        "gpt-5": GPTReasoningModelSupport(streaming=True, minimal_effort=True),
        "gpt-5-nano": GPTReasoningModelSupport(streaming=True, minimal_effort=True),
        "gpt-5-mini": GPTReasoningModelSupport(streaming=True, minimal_effort=True),
    }
    # Set a higher token limit for GPT reasoning models
    RESPONSE_DEFAULT_TOKEN_LIMIT = 1024
    RESPONSE_REASONING_DEFAULT_TOKEN_LIMIT = 8192

    # Sentinel passed as no_response_token when rewriting queries; a rewrite equal to it is ignored
    QUERY_REWRITE_NO_RESPONSE = "0"

    def __init__(
        self,
        search_client: SearchClient,
        openai_client: AsyncOpenAI,
        knowledgebase_model: Optional[str],
        knowledgebase_deployment: Optional[str],
        query_language: Optional[str],
        query_speller: Optional[str],
        embedding_deployment: Optional[str],  # Not needed for non-Azure OpenAI or for retrieval_mode="text"
        embedding_model: str,
        embedding_dimensions: int,
        embedding_field: str,
        openai_host: str,
        chatgpt_model: str,
        chatgpt_deployment: Optional[str],  # Not needed for non-Azure OpenAI
        prompt_manager: PromptManager,
        reasoning_effort: Optional[str] = None,
        multimodal_enabled: bool = False,
        image_embeddings_client: Optional[ImageEmbeddings] = None,
        global_blob_manager: Optional[BlobManager] = None,
        user_blob_manager: Optional[AdlsBlobManager] = None,
    ):
        """Store the clients and settings on the instance and load the query-rewrite tool definitions.

        Every argument is kept under an attribute of the same name. In addition,
        ``query_rewrite_tools`` is loaded from "chat_query_rewrite_tools.json" through
        the prompt manager and ``include_token_usage`` is set to True.
        """
        self.search_client = search_client
        self.openai_client = openai_client
        self.query_language = query_language
        self.query_speller = query_speller
        self.knowledgebase_model = knowledgebase_model
        self.knowledgebase_deployment = knowledgebase_deployment
        self.embedding_deployment = embedding_deployment
        self.embedding_model = embedding_model
        self.embedding_dimensions = embedding_dimensions
        self.embedding_field = embedding_field
        self.openai_host = openai_host
        self.chatgpt_model = chatgpt_model
        self.chatgpt_deployment = chatgpt_deployment
        self.prompt_manager = prompt_manager
        self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json")
        self.reasoning_effort = reasoning_effort
        self.include_token_usage = True
        self.multimodal_enabled = multimodal_enabled
        self.image_embeddings_client = image_embeddings_client
        self.global_blob_manager = global_blob_manager
        self.user_blob_manager = user_blob_manager
def build_filter(self, overrides: dict[str, Any]) -> Optional[str]:
    """Turn the category overrides into a search filter expression.

    Args:
        overrides: request overrides; "include_category" and "exclude_category" are read.

    Returns:
        The clauses joined with " and ", or None when neither override is set.
        Single quotes in a category are doubled so the value stays one string literal.
    """
    clause_templates = (
        ("include_category", "category eq '{}'"),
        ("exclude_category", "category ne '{}'"),
    )
    clauses = []
    for override_key, template in clause_templates:
        category = overrides.get(override_key)
        if category:
            clauses.append(template.format(category.replace("'", "''")))
    if not clauses:
        return None
    return " and ".join(clauses)


async def search(
    self,
    top: int,
    query_text: Optional[str],
    filter: Optional[str],
    vectors: list[VectorQuery],
    use_text_search: bool,
    use_vector_search: bool,
    use_semantic_ranker: bool,
    use_semantic_captions: bool,
    minimum_search_score: Optional[float] = None,
    minimum_reranker_score: Optional[float] = None,
    use_query_rewriting: Optional[bool] = None,
    access_token: Optional[str] = None,
) -> list[Document]:
    """Query the search index and return the hits that clear both score thresholds.

    Text search and vector search can be switched off independently; with the semantic
    ranker enabled the semantic query options are added to the request. A missing score
    or threshold counts as 0 when comparing.
    """
    # Arguments common to both the plain and the semantic request
    request_args: dict[str, Any] = {
        "search_text": query_text if use_text_search else "",
        "filter": filter,
        "top": top,
        "vector_queries": vectors if use_vector_search else [],
        "x_ms_query_source_authorization": access_token,
    }
    if use_semantic_ranker:
        request_args.update(
            query_caption="extractive|highlight-false" if use_semantic_captions else None,
            query_rewrites="generative" if use_query_rewriting else None,
            query_type=QueryType.SEMANTIC,
            query_language=self.query_language,
            query_speller=self.query_speller,
            semantic_configuration_name="default",
            semantic_query=query_text,
        )
    results = await self.search_client.search(**request_args)

    score_floor = minimum_search_score or 0
    reranker_floor = minimum_reranker_score or 0
    qualified_documents: list[Document] = []
    async for page in results.by_page():
        async for hit in page:
            candidate = Document(
                id=hit.get("id"),
                content=hit.get("content"),
                category=hit.get("category"),
                sourcepage=hit.get("sourcepage"),
                sourcefile=hit.get("sourcefile"),
                oids=hit.get("oids"),
                groups=hit.get("groups"),
                captions=cast(list[QueryCaptionResult], hit.get("@search.captions")),
                score=hit.get("@search.score"),
                reranker_score=hit.get("@search.reranker_score"),
                images=hit.get("images"),
            )
            if (candidate.score or 0) >= score_floor and (candidate.reranker_score or 0) >= reranker_floor:
                qualified_documents.append(candidate)
    return qualified_documents
def extract_rewritten_query(
    self,
    chat_completion: ChatCompletion,
    user_query: str,
    no_response_token: Optional[str] = None,
) -> str:
    """Pull the rewritten search query out of a chat completion.

    Function tool calls are inspected first: the first one whose JSON arguments are an
    object with a non-empty string "search_query" wins. Otherwise the stripped message
    content is used. A candidate equal to ``no_response_token`` is ignored.

    Args:
        chat_completion: completion returned by the query-rewrite request.
        user_query: original query, returned when no usable rewrite is found.
        no_response_token: sentinel meaning "no rewrite"; None disables the check.

    Returns:
        The rewritten query, or ``user_query`` as the fallback.
    """
    response_message = chat_completion.choices[0].message

    if response_message.tool_calls:
        for tool_call in response_message.tool_calls:
            if tool_call.type != "function":
                continue
            arguments_payload = cast("ChatCompletionMessageFunctionToolCall", tool_call).function.arguments or "{}"
            try:
                parsed_arguments = json.loads(arguments_payload)
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object: a list or bare string has no .get()
            if not isinstance(parsed_arguments, dict):
                continue
            search_query = parsed_arguments.get("search_query")
            # Only a non-empty string honours the declared return type
            if not isinstance(search_query, str) or not search_query:
                continue
            if no_response_token is None or search_query != no_response_token:
                return search_query

    if response_message.content:
        candidate = response_message.content.strip()
        if candidate and (no_response_token is None or candidate != no_response_token):
            return candidate

    return user_query


async def rewrite_query(
    self,
    *,
    prompt_template: str,
    prompt_variables: dict[str, Any],
    overrides: dict[str, Any],
    chatgpt_model: str,
    chatgpt_deployment: Optional[str],
    user_query: str,
    response_token_limit: int,
    tools: Optional[list[ChatCompletionToolParam]] = None,
    temperature: float = 0.0,
    no_response_token: Optional[str] = None,
) -> RewriteQueryResult:
    """Ask the chat model to rewrite ``user_query`` into a search query.

    The system prompt is built from ``prompt_template`` and ``prompt_variables``, sent
    as the only message, and the reply is reduced with ``extract_rewritten_query``.

    Returns:
        A RewriteQueryResult carrying the chosen query, the messages sent, the raw
        completion and the reasoning effort used for the call.
    """
    query_messages = [self.prompt_manager.build_system_prompt(prompt_template, prompt_variables)]
    # Derive the effort from the model this call actually targets, not from the
    # instance default, so the two stay consistent if a caller passes another model.
    rewrite_reasoning_effort = self.get_lowest_reasoning_effort(chatgpt_model)

    chat_completion = cast(
        ChatCompletion,
        await self.create_chat_completion(
            chatgpt_deployment,
            chatgpt_model,
            messages=query_messages,
            overrides=overrides,
            response_token_limit=response_token_limit,
            temperature=temperature,
            tools=tools,
            reasoning_effort=rewrite_reasoning_effort,
        ),
    )

    rewritten_query = self.extract_rewritten_query(
        chat_completion,
        user_query,
        no_response_token=no_response_token,
    )

    return RewriteQueryResult(
        query=rewritten_query,
        messages=query_messages,
        completion=chat_completion,
        reasoning_effort=rewrite_reasoning_effort,
    )
chatgpt_deployment: Optional[str], user_query: str, response_token_limit: int, tools: Optional[list[ChatCompletionToolParam]] = None, temperature: float = 0.0, no_response_token: Optional[str] = None, ) -> RewriteQueryResult: query_messages = [self.prompt_manager.build_system_prompt(prompt_template, prompt_variables)] rewrite_reasoning_effort = self.get_lowest_reasoning_effort(self.chatgpt_model) chat_completion = cast( ChatCompletion, await self.create_chat_completion( chatgpt_deployment, chatgpt_model, messages=query_messages, overrides=overrides, response_token_limit=response_token_limit, temperature=temperature, tools=tools, reasoning_effort=rewrite_reasoning_effort, ), ) rewritten_query = self.extract_rewritten_query( chat_completion, user_query, no_response_token=no_response_token, ) return RewriteQueryResult( query=rewritten_query, messages=query_messages, completion=chat_completion, reasoning_effort=rewrite_reasoning_effort, ) async def run_agentic_retrieval( self, messages: list[ChatCompletionMessageParam], knowledgebase_client: KnowledgeBaseRetrievalClient, search_index_name: str, filter_add_on: Optional[str] = None, minimum_reranker_score: Optional[float] = None, access_token: Optional[str] = None, use_web_source: bool = False, use_sharepoint_source: bool = False, retrieval_reasoning_effort: Optional[str] = None, should_rewrite_query: bool = True, ) -> AgenticRetrievalResults: # STEP 1: Invoke agentic retrieval thoughts = [] knowledge_source_params = [ SearchIndexKnowledgeSourceParams( knowledge_source_name=search_index_name, filter_add_on=filter_add_on, include_references=True, include_reference_source_data=True, always_query_source=False, reranker_threshold=minimum_reranker_score, ) ] # Build list as KnowledgeSourceParams for type variance knowledge_source_params_list: list[KnowledgeSourceParams] = cast( list[KnowledgeSourceParams], knowledge_source_params ) if use_web_source: knowledge_source_params_list.append( WebKnowledgeSourceParams( 
knowledge_source_name="web", include_references=True, include_reference_source_data=True, always_query_source=False, ) ) if use_sharepoint_source: knowledge_source_params_list.append( RemoteSharePointKnowledgeSourceParams( knowledge_source_name="sharepoint", include_references=True, include_reference_source_data=True, always_query_source=False, ) ) agentic_retrieval_input: dict[str, Any] = {} rewrite_result = None if retrieval_reasoning_effort == "minimal" and should_rewrite_query: original_user_query = messages[-1]["content"] if not isinstance(original_user_query, str): raise ValueError("The most recent message content must be a string.") rewrite_result = await self.rewrite_query( prompt_template="query_rewrite.system.jinja2", prompt_variables={"user_query": original_user_query, "past_messages": messages[:-1]}, overrides={}, chatgpt_model=self.chatgpt_model, chatgpt_deployment=self.chatgpt_deployment, user_query=original_user_query, response_token_limit=self.get_response_token_limit( self.chatgpt_model, 100 ), # Setting too low risks malformed JSON, setting too high may affect performance tools=self.query_rewrite_tools, temperature=0.0, # Minimize creativity for search query generation no_response_token=self.QUERY_REWRITE_NO_RESPONSE, ) thoughts.append( self.format_thought_step_for_chatcompletion( title="Prompt to generate search query", messages=rewrite_result.messages, overrides={}, model=self.chatgpt_model, deployment=self.chatgpt_deployment, usage=rewrite_result.completion.usage, reasoning_effort=rewrite_result.reasoning_effort, ) ) agentic_retrieval_input["intents"] = [KnowledgeRetrievalSemanticIntent(search=rewrite_result.query)] elif retrieval_reasoning_effort == "minimal": last_content = messages[-1]["content"] if not isinstance(last_content, str): raise ValueError("The most recent message content must be a string.") agentic_retrieval_input["intents"] = [KnowledgeRetrievalSemanticIntent(search=last_content)] else: kb_messages: list[KnowledgeBaseMessage] = 
[ KnowledgeBaseMessage( role=str(msg["role"]), content=[KnowledgeBaseMessageTextContent(text=str(msg["content"]))] ) for msg in messages if msg["role"] != "system" ] agentic_retrieval_input["messages"] = kb_messages # When we're not using a web source, set output mode to extractiveData to avoid synthesized answer if not use_web_source: agentic_retrieval_input["output_mode"] = "extractiveData" retrieval_effort: Optional[ KnowledgeRetrievalMinimalReasoningEffort | KnowledgeRetrievalLowReasoningEffort | KnowledgeRetrievalMediumReasoningEffort ] = None if retrieval_reasoning_effort == "minimal": retrieval_effort = KnowledgeRetrievalMinimalReasoningEffort() elif retrieval_reasoning_effort == "low": retrieval_effort = KnowledgeRetrievalLowReasoningEffort() elif retrieval_reasoning_effort == "medium": retrieval_effort = KnowledgeRetrievalMediumReasoningEffort() request_kwargs: dict[str, Any] = { "knowledge_source_params": knowledge_source_params_list, "include_activity": True, "retrieval_reasoning_effort": retrieval_effort, } request_kwargs.update(agentic_retrieval_input) response = await knowledgebase_client.retrieve( retrieval_request=KnowledgeBaseRetrievalRequest(**request_kwargs), x_ms_query_source_authorization=access_token, ) # Map activity id -> agent's internal search query and citation activities = response.activity or [] activity_details_by_id: dict[int, ActivityDetail] = {} for index, activity in enumerate(activities): search_query = None if isinstance(activity, KnowledgeBaseSearchIndexActivityRecord): if activity.search_index_arguments: search_query = activity.search_index_arguments.search elif isinstance(activity, KnowledgeBaseWebActivityRecord): if activity.web_arguments: search_query = activity.web_arguments.search elif isinstance(activity, KnowledgeBaseRemoteSharePointActivityRecord): if activity.remote_share_point_arguments: search_query = activity.remote_share_point_arguments.search activity_details_by_id[activity.id] = ActivityDetail( id=activity.id, 
number=index + 1, type=activity.type or "", source=getattr(activity, "knowledge_source_name", "") or "", # Not all activity types have knowledge_source_name query=search_query or "", ) # Extract references references = response.references or [] document_refs = [ r for r in references if isinstance(r, KnowledgeBaseSearchIndexReference) or hasattr(r, "doc_key") ] document_results: list[Document] = [] # Create documents from reference source data for ref in document_refs: if ref.source_data and ref.doc_key: # Note that ref.doc_key is the same as source_data["id"] document_results.append( Document( id=cast(str, ref.doc_key), ref_id=ref.id, content=ref.source_data.get("content"), category=ref.source_data.get("category"), sourcepage=ref.source_data.get("sourcepage"), sourcefile=ref.source_data.get("sourcefile"), oids=ref.source_data.get("oids"), groups=ref.source_data.get("groups"), reranker_score=getattr(ref, "reranker_score", None), images=ref.source_data.get("images"), activity=activity_details_by_id[ref.activity_source], ) ) # We need to handle KnowledgeBaseWebReference separately if web knowledge source is used web_refs = [r for r in references if isinstance(r, KnowledgeBaseWebReference)] web_results: list[WebResult] = [] for ref in web_refs: web_result = WebResult( id=ref.id, title=ref.title, url=ref.url, activity=activity_details_by_id[ref.activity_source] ) web_results.append(web_result) # Handle KnowledgeBaseRemoteSharePointReference if SharePoint knowledge source is used sharepoint_refs = [r for r in references if isinstance(r, KnowledgeBaseRemoteSharePointReference)] sharepoint_results: list[SharePointResult] = [] for ref in sharepoint_refs: # Extract content from all sourceData.extracts[].text and concatenate content = None if ref.source_data and "extracts" in ref.source_data and len(ref.source_data["extracts"]) > 0: extracts = [extract.get("text", "") for extract in ref.source_data["extracts"]] content = "\n\n".join(extracts) if extracts else None # Extract 
def replace_all_ref_ids(
    self,
    answer: str,
    documents: list[Document],
    web_results: list[WebResult],
    sharepoint_results: Optional[list[SharePointResult]] = None,
) -> str:
    """Swap every ``[ref_id:<id>]`` token in ``answer`` for a human-readable citation.

    Args:
        answer: Text that may contain ``[ref_id:<id>]`` tokens.
        documents: Search-index documents; each contributes ``ref_id -> sourcepage``.
        web_results: Web results; each contributes ``str(id) -> url``.
        sharepoint_results: Optional SharePoint results; each contributes
            ``str(id) -> last path segment of web_url``.

    Returns:
        ``answer`` with known ids replaced by ``[<citation>]``. Lookup priority is
        web, then SharePoint, then document; tokens whose id is unknown (or maps
        to an empty value) are left exactly as they were.
    """
    web_lookup = {}
    for web in web_results:
        if web.id and web.url:
            web_lookup[str(web.id)] = web.url

    sharepoint_lookup = {}
    for item in sharepoint_results or []:
        if item.id and item.web_url:
            # Only the trailing path segment (the file name) is used as the citation
            sharepoint_lookup[str(item.id)] = item.web_url.split("/")[-1]

    document_lookup = {}
    for doc in documents:
        if doc.ref_id and doc.sourcepage:
            document_lookup[doc.ref_id] = doc.sourcepage

    # Ordered from highest to lowest priority
    lookups_by_priority = (web_lookup, sharepoint_lookup, document_lookup)

    def _resolve(match: re.Match) -> str:
        token_id = match.group(1)
        for lookup in lookups_by_priority:
            citation = lookup.get(token_id)
            if citation:
                return f"[{citation}]"
        # Unknown id: keep the original token untouched
        return match.group(0)

    return re.sub(r"\[ref_id:([^\]]+)\]", _resolve, answer)
""" def clean_source(s: str) -> str: s = s.replace("\n", " ").replace("\r", " ") # normalize newlines to spaces s = s.replace(":::", ":::") # escape DocFX/markdown triple colons return s citations = [] text_sources = [] image_sources = [] seen_urls = set() external_results_metadata: list[dict[str, Any]] = [] citation_activity_details: dict[str, dict[str, Any]] = {} for doc in results: # Get the citation for the source page citation = self.get_citation(doc.sourcepage) if citation not in citations: citations.append(citation) # Add activity details if available if doc.activity: citation_activity_details[citation] = asdict(doc.activity) # If semantic captions are used, extract captions; otherwise, use content if include_text_sources: if use_semantic_captions and doc.captions: cleaned = clean_source(" . ".join([cast(str, c.text) for c in doc.captions])) else: cleaned = clean_source(doc.content or "") text_sources.append(f"{citation}: {cleaned}") if download_image_sources and hasattr(doc, "images") and doc.images: for img in doc.images: # Skip if we've already processed this URL if img["url"] in seen_urls or not img["url"]: continue seen_urls.add(img["url"]) url = await self.download_blob_as_base64(img["url"], user_oid=user_oid) if url: image_sources.append(url) image_citation = self.get_image_citation(doc.sourcepage or "", img["url"]) citations.append(image_citation) if web_results: for web in web_results: citation = self.get_citation(web.url) if citation and citation not in citations: citations.append(citation) # Add activity details if available if web.activity: citation_activity_details[citation] = asdict(web.activity) external_results_metadata.append( { "id": web.id, "title": web.title, "url": web.url, "activity": asdict(web.activity) if web.activity else None, } ) if sharepoint_results: for sp in sharepoint_results: # Extract filename from web_url for citation filename = sp.web_url.split("/")[-1] if sp.web_url else "" citation = self.get_citation(filename) if 
async def download_blob_as_base64(self, blob_url: str, user_oid: Optional[str] = None) -> Optional[str]:
    """
    Downloads a blob from either Azure Blob Storage or Azure Data Lake Storage and returns it as a base64 encoded string.

    Args:
        blob_url: The URL or path to the blob to download
        user_oid: The user's object ID, required for Data Lake Storage operations and access control

    Returns:
        Optional[str]: The base64 encoded image data with data URI scheme prefix, or None if the blob
        cannot be downloaded (including a URL that has no container/blob path segment).
    """
    # Local import keeps this method self-contained; urllib is part of the standard library.
    from urllib.parse import unquote

    # Handle full URLs for both Blob Storage and Data Lake Storage
    container: Optional[str] = None
    if blob_url.startswith("http"):
        url_parts = blob_url.split("/")
        # For blob: https://{account}.blob.core.windows.net/{container}/{blob_path}
        # For dfs:  https://{account}.dfs.core.windows.net/{filesystem}/{path}
        # => ["https:", "", host, container, *blob_path_segments]
        if len(url_parts) < 5 or not url_parts[3]:
            # No container and/or blob path in the URL: nothing to download
            return None
        container = url_parts[3]
        # Everything after the container/filesystem segment is the blob path.
        # Decode every percent-escape (not just %20) so names containing
        # characters such as '#' or non-ASCII letters resolve correctly.
        blob_path = unquote("/".join(url_parts[4:]))
        if not blob_path:
            return None
    else:
        # Treat as a direct blob path
        blob_path = blob_url

    # Download the blob using the appropriate client
    result = None
    if ".dfs.core.windows.net" in blob_url and self.user_blob_manager:
        result = await self.user_blob_manager.download_blob(blob_path, user_oid=user_oid, container=container)
    elif self.global_blob_manager:
        result = await self.global_blob_manager.download_blob(blob_path, container=container)

    if result:
        content, _ = result  # Unpack the tuple, ignoring properties
        img = base64.b64encode(content).decode("utf-8")
        # NOTE(review): the MIME type is always reported as image/png regardless of the blob's real type
        return f"data:image/png;base64,{img}"
    return None
def get_system_prompt_variables(self, override_prompt: Optional[str]) -> dict[str, str]:
    """Translate a client-supplied prompt override into template variables.

    Args:
        override_prompt: ``None`` for no override; text starting with ``>>>`` to inject
            into the existing prompt; any other text to replace the prompt entirely.

    Returns:
        ``{}`` when there is no override, ``{"injected_prompt": <text after >>>>}`` for an
        injection, otherwise ``{"override_prompt": <text>}``.
    """
    if override_prompt is None:
        return {}

    injection_marker = ">>>"
    is_injection = override_prompt.startswith(injection_marker)
    variable_name = "injected_prompt" if is_injection else "override_prompt"
    variable_value = override_prompt[len(injection_marker) :] if is_injection else override_prompt
    return {variable_name: variable_value}
""" if model not in self.GPT_REASONING_MODELS: return None if self.GPT_REASONING_MODELS[model].minimal_effort: return "minimal" return "low" def create_chat_completion( self, chatgpt_deployment: Optional[str], chatgpt_model: str, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], response_token_limit: int, should_stream: bool = False, tools: Optional[list[ChatCompletionToolParam]] = None, temperature: Optional[float] = None, n: Optional[int] = None, reasoning_effort: Optional[ChatCompletionReasoningEffort] = None, ) -> Awaitable[ChatCompletion] | Awaitable[AsyncStream[ChatCompletionChunk]]: if chatgpt_model in self.GPT_REASONING_MODELS: params: dict[str, Any] = { # max_tokens is not supported "max_completion_tokens": response_token_limit } # Adjust parameters for reasoning models supported_features = self.GPT_REASONING_MODELS[chatgpt_model] if supported_features.streaming and should_stream: params["stream"] = True params["stream_options"] = {"include_usage": True} params["reasoning_effort"] = reasoning_effort or overrides.get("reasoning_effort") or self.reasoning_effort else: # Include parameters that may not be supported for reasoning models params = { "max_tokens": response_token_limit, "temperature": temperature or overrides.get("temperature", 0.3), } if should_stream: params["stream"] = True params["stream_options"] = {"include_usage": True} if tools is not None: params["tools"] = tools # Azure OpenAI takes the deployment name as the model name seed_value: Optional[int] = overrides.get("seed", None) return self.openai_client.chat.completions.create( # type: ignore[no-matching-overload] model=chatgpt_deployment if chatgpt_deployment else chatgpt_model, messages=messages, seed=seed_value, n=n or 1, **params, ) def format_thought_step_for_chatcompletion( self, title: str, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], model: str, deployment: Optional[str], usage: Optional[CompletionUsage] = None, reasoning_effort: 
Optional[ChatCompletionReasoningEffort] = None, ) -> ThoughtStep: properties: dict[str, Any] = {"model": model} if deployment: properties["deployment"] = deployment # Only add reasoning_effort setting if the model supports it if model in self.GPT_REASONING_MODELS: properties["reasoning_effort"] = reasoning_effort or overrides.get( "reasoning_effort", self.reasoning_effort ) if usage: properties["token_usage"] = TokenUsageProps.from_completion_usage(usage) return ThoughtStep(title, messages, properties) async def run( self, messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, ) -> dict[str, Any]: raise NotImplementedError async def run_stream( self, messages: list[ChatCompletionMessageParam], session_state: Any = None, context: dict[str, Any] = {}, ) -> AsyncGenerator[dict[str, Any], None]: raise NotImplementedError ================================================ FILE: app/backend/approaches/chatreadretrieveread.py ================================================ import re from collections.abc import AsyncGenerator, Awaitable from dataclasses import asdict from typing import Any, Optional, cast from azure.search.documents.aio import SearchClient from azure.search.documents.knowledgebases.aio import KnowledgeBaseRetrievalClient from azure.search.documents.models import VectorQuery from openai import AsyncOpenAI, AsyncStream from openai.types.chat import ( ChatCompletion, ChatCompletionChunk, ChatCompletionMessageParam, ) from openai.types.chat.chat_completion import Choice from openai.types.chat.chat_completion_message import ChatCompletionMessage from approaches.approach import ( Approach, ExtraInfo, ThoughtStep, ) from approaches.promptmanager import PromptManager from prepdocslib.blobmanager import AdlsBlobManager, BlobManager from prepdocslib.embeddings import ImageEmbeddings class ChatReadRetrieveReadApproach(Approach): """ A multi-step approach that first uses OpenAI to turn the user's question into a search 
query, then uses Azure AI Search to retrieve relevant documents, and then sends the conversation history, original user question, and search results to OpenAI to generate a response. """ NO_RESPONSE = Approach.QUERY_REWRITE_NO_RESPONSE def __init__( self, *, search_client: SearchClient, search_index_name: str, knowledgebase_model: Optional[str], knowledgebase_deployment: Optional[str], knowledgebase_client: Optional[KnowledgeBaseRetrievalClient], knowledgebase_client_with_web: Optional[KnowledgeBaseRetrievalClient] = None, knowledgebase_client_with_sharepoint: Optional[KnowledgeBaseRetrievalClient] = None, knowledgebase_client_with_web_and_sharepoint: Optional[KnowledgeBaseRetrievalClient] = None, openai_client: AsyncOpenAI, chatgpt_model: str, chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" embedding_model: str, embedding_dimensions: int, embedding_field: str, sourcepage_field: str, content_field: str, query_language: str, query_speller: str, prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, multimodal_enabled: bool = False, image_embeddings_client: Optional[ImageEmbeddings] = None, global_blob_manager: Optional[BlobManager] = None, user_blob_manager: Optional[AdlsBlobManager] = None, use_web_source: bool = False, use_sharepoint_source: bool = False, retrieval_reasoning_effort: Optional[str] = None, ): self.search_client = search_client self.search_index_name = search_index_name self.knowledgebase_model = knowledgebase_model self.knowledgebase_deployment = knowledgebase_deployment self.knowledgebase_client = knowledgebase_client self.knowledgebase_client_with_web = knowledgebase_client_with_web self.knowledgebase_client_with_sharepoint = knowledgebase_client_with_sharepoint self.knowledgebase_client_with_web_and_sharepoint = knowledgebase_client_with_web_and_sharepoint self.openai_client = openai_client self.chatgpt_model = 
def extract_followup_questions(self, content: Optional[str]):
    """Split an answer into its main text and the follow-up questions it embeds.

    Follow-up questions are delimited as ``<<question>>``.

    Args:
        content: The model's answer text, or ``None``.

    Returns:
        A ``(text, questions)`` tuple. ``text`` is everything before the first ``<<``
        (or ``None`` when ``content`` is ``None``); ``questions`` is the list of
        delimited questions, in order of appearance.
    """
    if content is None:
        return content, []
    # Non-greedy match up to the first closing ">>". The previous character class
    # "[^>>]" only excluded a single ">" and therefore dropped any question that
    # itself contained ">" (e.g. "<<Is 3 > 2?>>"). DOTALL keeps multi-line questions working.
    questions = re.findall(r"<<(.+?)>>", content, flags=re.DOTALL)
    return content.split("<<")[0], questions
async def run_with_streaming(
    self,
    messages: list[ChatCompletionMessageParam],
    overrides: dict[str, Any],
    auth_claims: dict[str, Any],
    session_state: Any = None,
) -> AsyncGenerator[dict, None]:
    """Run the approach and yield the answer incrementally as delta dictionaries.

    Args:
        messages: Conversation so far; forwarded to ``run_until_final_call``.
        overrides: Request overrides; ``suggest_followup_questions`` is read here.
        auth_claims: Caller's auth claims; forwarded to ``run_until_final_call``.
        session_state: Opaque value echoed back in context-bearing events.

    Yields:
        First an event carrying the assistant role, ``extra_info`` as context and the
        session state. Then either a single content delta (when the awaited result is a
        complete ``ChatCompletion``) or one delta per streamed chunk. Text from the first
        ``<<`` onwards is withheld from content deltas when follow-up questions are
        requested and is emitted at the end as ``followup_questions``.
    """
    extra_info, chat_coroutine = await self.run_until_final_call(
        messages, overrides, auth_claims, should_stream=True
    )
    # Send retrieval context to the client before any answer text
    yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}

    # State carried across streamed chunks: once "<<" is seen, everything after it is
    # buffered as follow-up question text instead of being yielded as answer content.
    followup_questions_started = False
    followup_content = ""

    chat_result = await chat_coroutine

    # The awaited result may already be a complete (non-streamed) completion;
    # in that case emit it in one go and stop.
    if isinstance(chat_result, ChatCompletion):
        message = chat_result.choices[0].message
        content = message.content or ""
        role = message.role or "assistant"
        followup_questions: list[str] = []
        if overrides.get("suggest_followup_questions"):
            content, followup_questions = self.extract_followup_questions(content)
            extra_info.followup_questions = followup_questions
        # Token usage is recorded on the last thought step
        if self.include_token_usage and extra_info.thoughts and chat_result.usage:
            extra_info.thoughts[-1].update_token_usage(chat_result.usage)
        delta_payload: dict[str, Any] = {"role": role}
        if content:
            delta_payload["content"] = content
        yield {"delta": delta_payload}
        # Re-send context so the client sees the updated thoughts / follow-ups
        yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}
        if followup_questions:
            yield {
                "delta": {"role": "assistant"},
                "context": {"context": extra_info, "followup_questions": followup_questions},
            }
        return

    # Otherwise the result is treated as a stream of chunks
    chat_result = cast(AsyncStream[ChatCompletionChunk], chat_result)

    async for event_chunk in chat_result:
        # "2023-07-01-preview" API version has a bug where first response has empty choices
        event = event_chunk.model_dump()  # Convert pydantic model to dict
        if event["choices"]:
            # No usage during streaming
            completion = {
                "delta": {
                    "content": event["choices"][0]["delta"].get("content"),
                    "role": event["choices"][0]["delta"]["role"],
                }
            }
            # if event contains << and not >>, it is start of follow-up question, truncate
            delta_content_raw = completion["delta"].get("content")
            delta_content: str = (
                delta_content_raw or ""
            )  # content may either not exist in delta, or explicitly be None
            if overrides.get("suggest_followup_questions") and "<<" in delta_content:
                followup_questions_started = True
                # Text before "<<" is still answer content; only yield it if non-empty
                earlier_content = delta_content[: delta_content.index("<<")]
                if earlier_content:
                    completion["delta"]["content"] = earlier_content
                    yield completion
                followup_content += delta_content[delta_content.index("<<") :]
            elif followup_questions_started:
                # Already inside the follow-up section: buffer, do not yield
                followup_content += delta_content
            else:
                yield completion
        else:
            # Final chunk at end of streaming should contain usage
            # https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response
            if event_chunk.usage and extra_info.thoughts and self.include_token_usage:
                extra_info.thoughts[-1].update_token_usage(event_chunk.usage)
                yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state}

    # Emit the buffered follow-up questions (the text part of the split is discarded)
    if followup_content:
        _, followup_questions = self.extract_followup_questions(followup_content)
        yield {
            "delta": {"role": "assistant"},
            "context": {"context": extra_info, "followup_questions": followup_questions},
        }
async def run_stream(
    self,
    messages: list[ChatCompletionMessageParam],
    session_state: Any = None,
    context: dict[str, Any] = {},
) -> AsyncGenerator[dict[str, Any], None]:
    """Streaming entry point: unpack ``context`` and delegate to ``run_with_streaming``.

    Args:
        messages: Conversation so far.
        session_state: Opaque value passed through to ``run_with_streaming``.
        context: May contain ``"overrides"`` and ``"auth_claims"``; each defaults to ``{}``.

    Returns:
        The async generator produced by ``run_with_streaming`` (not yet iterated).
    """
    return self.run_with_streaming(
        messages,
        context.get("overrides", {}),
        context.get("auth_claims", {}),
        session_state,
    )
async def run_search_approach(
    self, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any]
):
    """Rewrite the user's question into a search query, search the index, and collect sources.

    Args:
        messages: Conversation so far; the last message must have string content.
        overrides: Request overrides controlling retrieval mode, ranking, thresholds,
            and which sources are sent.
        auth_claims: Caller's auth claims; ``access_token`` and ``oid`` are read.

    Returns:
        An ``ExtraInfo`` holding the collected data points and three thought steps
        (query-rewrite prompt, search parameters, search results).

    Raises:
        ValueError: If the most recent message content is not a string.
    """
    # --- Resolve settings from overrides -------------------------------------
    retrieval_mode = overrides.get("retrieval_mode")
    text_search_enabled = retrieval_mode in ("text", "hybrid", None)
    vector_search_enabled = retrieval_mode in ("vectors", "hybrid", None)
    semantic_ranker_enabled = bool(overrides.get("semantic_ranker"))
    semantic_captions_enabled = bool(overrides.get("semantic_captions"))
    query_rewriting_enabled = bool(overrides.get("query_rewriting"))
    result_count = overrides.get("top", 3)
    search_score_floor = overrides.get("minimum_search_score", 0.0)
    reranker_score_floor = overrides.get("minimum_reranker_score", 0.0)
    index_filter = self.build_filter(overrides)
    token = auth_claims.get("access_token")
    include_text = overrides.get("send_text_sources", True)
    # Image options can only be enabled when multimodal support is on
    include_images = overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled
    embed_text = overrides.get("search_text_embeddings", True)
    embed_images = overrides.get("search_image_embeddings", self.multimodal_enabled) and self.multimodal_enabled

    latest_question = messages[-1]["content"]
    if not isinstance(latest_question, str):
        raise ValueError("The most recent message content must be a string.")

    # STEP 1: Generate an optimized keyword search query based on the chat history and the last question
    rewrite_outcome = await self.rewrite_query(
        prompt_template="query_rewrite.system.jinja2",
        prompt_variables={
            "user_query": latest_question,
            "past_messages": messages[:-1],
        },
        overrides=overrides,
        chatgpt_model=self.chatgpt_model,
        chatgpt_deployment=self.chatgpt_deployment,
        user_query=latest_question,
        # Setting too low risks malformed JSON, setting too high may affect performance
        response_token_limit=self.get_response_token_limit(self.chatgpt_model, 100),
        tools=self.query_rewrite_tools,
        temperature=0.0,  # Minimize creativity for search query generation
        no_response_token=self.NO_RESPONSE,
    )
    search_query = rewrite_outcome.query

    # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query
    query_vectors: list[VectorQuery] = []
    if vector_search_enabled and embed_text:
        query_vectors.append(await self.compute_text_embedding(search_query))
    if vector_search_enabled and embed_images:
        query_vectors.append(await self.compute_multimodal_embedding(search_query))

    hits = await self.search(
        result_count,
        search_query,
        index_filter,
        query_vectors,
        text_search_enabled,
        vector_search_enabled,
        semantic_ranker_enabled,
        semantic_captions_enabled,
        search_score_floor,
        reranker_score_floor,
        query_rewriting_enabled,
        token,
    )

    # STEP 3: Generate a contextual and content specific answer using the search results and chat history
    sources = await self.get_sources_content(
        hits,
        semantic_captions_enabled,
        include_text_sources=include_text,
        download_image_sources=include_images,
        user_oid=auth_claims.get("oid"),
    )

    rewrite_thought = self.format_thought_step_for_chatcompletion(
        title="Prompt to generate search query",
        messages=rewrite_outcome.messages,
        overrides=overrides,
        model=self.chatgpt_model,
        deployment=self.chatgpt_deployment,
        usage=rewrite_outcome.completion.usage,
        reasoning_effort=rewrite_outcome.reasoning_effort,
    )
    search_thought = ThoughtStep(
        "Search using generated search query",
        search_query,
        {
            "use_semantic_captions": semantic_captions_enabled,
            "use_semantic_ranker": semantic_ranker_enabled,
            "use_query_rewriting": query_rewriting_enabled,
            "top": result_count,
            "filter": index_filter,
            "use_vector_search": vector_search_enabled,
            "use_text_search": text_search_enabled,
            "search_text_embeddings": embed_text,
            "search_image_embeddings": embed_images,
        },
    )
    results_thought = ThoughtStep(
        "Search results",
        [hit.serialize_for_results() for hit in hits],
    )
    return ExtraInfo(sources, thoughts=[rewrite_thought, search_thought, results_thought])
overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled retrieval_reasoning_effort = overrides.get("retrieval_reasoning_effort", self.retrieval_reasoning_effort) # Overrides can only disable web source support configured at construction time. use_web_source = self.web_source_enabled override_use_web_source = overrides.get("use_web_source") if isinstance(override_use_web_source, bool): use_web_source = use_web_source and override_use_web_source # Overrides can only disable sharepoint source support configured at construction time. use_sharepoint_source = self.use_sharepoint_source override_use_sharepoint_source = overrides.get("use_sharepoint_source") if isinstance(override_use_sharepoint_source, bool): use_sharepoint_source = use_sharepoint_source and override_use_sharepoint_source if use_web_source and retrieval_reasoning_effort == "minimal": raise Exception("Web source cannot be used with minimal retrieval reasoning effort.") selected_client, effective_web_source, effective_sharepoint_source = self._select_knowledgebase_client( use_web_source, use_sharepoint_source, ) agentic_results = await self.run_agentic_retrieval( messages=messages, knowledgebase_client=selected_client, search_index_name=self.search_index_name, filter_add_on=search_index_filter, minimum_reranker_score=minimum_reranker_score, access_token=access_token, use_web_source=effective_web_source, use_sharepoint_source=effective_sharepoint_source, retrieval_reasoning_effort=retrieval_reasoning_effort, ) data_points = await self.get_sources_content( agentic_results.documents, use_semantic_captions=False, include_text_sources=send_text_sources, download_image_sources=send_image_sources, user_oid=auth_claims.get("oid"), web_results=agentic_results.web_results, sharepoint_results=agentic_results.sharepoint_results, ) return ExtraInfo( data_points, thoughts=agentic_results.thoughts, answer=agentic_results.answer, ) def _select_knowledgebase_client( self, use_web_source: 
class PromptManager:
    """Builds OpenAI chat completion messages from Jinja2 templates."""

    PROMPTS_DIRECTORY = pathlib.Path(__file__).parent / "prompts"

    def __init__(self):
        # Prompts are plain text sent to the model, so HTML autoescaping is turned off.
        self.env = Environment(
            loader=FileSystemLoader(self.PROMPTS_DIRECTORY),
            autoescape=False,
            trim_blocks=True,
            lstrip_blocks=True,
        )

    def build_system_prompt(
        self, template_path: str, template_variables: dict[str, Any]
    ) -> ChatCompletionSystemMessageParam:
        """Build a single system message. Use for simple prompts like query rewrite.

        Args:
            template_path: Path to the system message template file
            template_variables: Dictionary of variables to pass to the template

        Returns:
            A system message
        """
        content = self.env.get_template(template_path).render(**template_variables).strip()
        return {"role": "system", "content": content}

    def build_user_prompt(
        self,
        template_path: str,
        template_variables: dict[str, Any],
        image_sources: list[str] | None = None,
    ) -> ChatCompletionUserMessageParam:
        """Build a single user message with optional images.

        Args:
            template_path: Path to the user message template file
            template_variables: Dictionary of variables to pass to the template
            image_sources: Optional list of image URLs to include in the message

        Returns:
            A user message
        """
        user_text = self.env.get_template(template_path).render(**template_variables).strip()

        if not image_sources:
            return {"role": "user", "content": user_text}

        # With images, the content becomes a list of parts: the text first, then one part per image.
        user_content: list[dict[str, Any]] = [{"type": "text", "text": user_text}]
        user_content.extend(
            {"type": "image_url", "image_url": {"url": image, "detail": "auto"}} for image in image_sources
        )
        return cast(ChatCompletionUserMessageParam, {"role": "user", "content": user_content})

    def build_conversation(
        self,
        system_template_path: str,
        system_template_variables: dict[str, Any],
        user_template_path: str,
        user_template_variables: dict[str, Any],
        user_image_sources: list[str] | None = None,
        past_messages: list[ChatCompletionMessageParam] | None = None,
    ) -> list[ChatCompletionMessageParam]:
        """Build a full conversation with system, history, and user message.

        Args:
            system_template_path: Path to the system message template file
            system_template_variables: Dictionary of variables to pass to the system template
            user_template_path: Path to the user message template file
            user_template_variables: Dictionary of variables to pass to the user template
            user_image_sources: Optional list of image URLs to include in the user message
            past_messages: Optional list of past messages to include as conversation history

        Returns:
            A list of ChatCompletionMessageParam messages
        """
        # Order matters: system message, then the history, then the new user message.
        messages: list[ChatCompletionMessageParam] = [
            self.build_system_prompt(system_template_path, system_template_variables)
        ]
        messages.extend(past_messages or [])
        messages.append(self.build_user_prompt(user_template_path, user_template_variables, user_image_sources))
        return messages

    def load_tools(self, path: str) -> list[ChatCompletionToolParam]:
        """Load tools from a JSON file.

        Args:
            path: File name relative to ``PROMPTS_DIRECTORY``

        Returns:
            The parsed JSON content, typed as a list of tool definitions
        """
        # JSON is read as UTF-8 explicitly so parsing does not depend on the
        # platform's default text encoding.
        with open(self.PROMPTS_DIRECTORY / path, encoding="utf-8") as f:
            tools: list[ChatCompletionToolParam] = json.load(f)
        return tools
Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]. {% if image_sources %} Each image source has the document file name in the top left corner of the image with coordinates (10,10) pixels with format <filename.ext#page=N>, and the image figure name is right-aligned in the top right corner of the image. The filename of the actual image is in the top right corner of the image and is in the format <figureN_N.png>. Each text source starts in a new line and has the file name followed by colon and the actual information Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N]. If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)]. {% endif %} Possible citations for current question: {% for citation in citations %} [{{ citation }}] {% endfor %} {{ injected_prompt }} {% endif %} {% if include_follow_up_questions %} Generate 3 very brief follow-up questions that the user would likely ask next. Enclose the follow-up questions in double angle brackets. Example: <<Are there exclusions for prescriptions?>> <<Which pharmacies can be ordered from?>> <<What is the limit for over-the-counter medication?>> Do not repeat questions that have already been asked. Make sure the last question ends with ">>".
{% endif %} ================================================ FILE: app/backend/approaches/prompts/chat_answer.user.jinja2 ================================================ {{ user_query }} {% if text_sources is defined and text_sources %} Sources: {% for text_source in text_sources %} {{ text_source }} {% endfor %} {% endif %} ================================================ FILE: app/backend/approaches/prompts/chat_query_rewrite_tools.json ================================================ [{ "type": "function", "function": { "name": "search_sources", "description": "Retrieve sources from the Azure AI Search index", "parameters": { "type": "object", "properties": { "search_query": { "type": "string", "description": "Query string to retrieve documents from azure search eg: 'Health care plan'" } }, "required": ["search_query"] } } }] ================================================ FILE: app/backend/approaches/prompts/query_rewrite.system.jinja2 ================================================ Below is a history of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. You have access to Azure AI Search index with 100's of documents. Generate a search query based on the conversation and the new question. Do not include cited source filenames and document names e.g. info.txt or doc.pdf in the search query terms. Do not include any text inside [] or <<>> in the search query terms. Do not include any special characters like '+'. If the question is not in English, translate the question to English before generating the search query. If you cannot generate a search query, return just the number 0. Generate search query for: How did crypto do last year? Search query: Summarize Cryptocurrency Market Dynamics from last year Generate search query for: What are my health plans? 
@chat_history_cosmosdb_bp.post("/chat_history")
@authenticated
async def post_chat_history(auth_claims: dict[str, Any]):
    """Store a chat session and its question/response pairs in Cosmos DB.

    The JSON body must contain ``id`` (the session id) and ``answers`` (a non-empty list of
    ``[question, response]`` pairs). One ``session`` item and one ``message_pair`` item per
    pair are upserted in a single batch under the partition key ``[entra_oid, session_id]``.

    Args:
        auth_claims: Claims supplied by the ``authenticated`` decorator; ``oid`` is required.

    Returns:
        ``({}, 201)`` on success; a 400 response when chat history is not enabled or the
        payload lacks ``id``/``answers``; 401 when no ``oid`` is available; otherwise the
        response built by ``error_response``.
    """
    if not current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED]:
        return jsonify({"error": "Chat history not enabled"}), 400

    container: ContainerProxy = current_app.config[CONFIG_COSMOS_HISTORY_CONTAINER]
    if not container:
        return jsonify({"error": "Chat history not enabled"}), 400

    entra_oid = auth_claims.get("oid")
    if not entra_oid:
        return jsonify({"error": "User OID not found"}), 401

    try:
        request_json = await request.get_json()
        if not isinstance(request_json, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        session_id = request_json.get("id")
        message_pairs = request_json.get("answers")
        # Reject malformed payloads up front instead of failing on message_pairs[0][0] below.
        if not session_id or not message_pairs:
            return jsonify({"error": "Request must include 'id' and a non-empty 'answers' list"}), 400

        # The title is the first question, cut to max_title_length characters; the ellipsis
        # marks that the question was shortened.
        max_title_length = 50
        first_question = message_pairs[0][0]
        if len(first_question) > max_title_length:
            title = first_question[:max_title_length] + "..."
        else:
            title = first_question
        timestamp = int(time.time() * 1000)  # milliseconds since the epoch
        history_version = current_app.config[CONFIG_COSMOS_HISTORY_VERSION]

        # Insert the session item:
        session_item = {
            "id": session_id,
            "version": history_version,
            "session_id": session_id,
            "entra_oid": entra_oid,
            "type": "session",
            "title": title,
            "timestamp": timestamp,
        }

        # Now insert a message item for each question/response pair:
        message_pair_items = [
            {
                "id": f"{session_id}-{ind}",
                "version": history_version,
                "session_id": session_id,
                "entra_oid": entra_oid,
                "type": "message_pair",
                "question": message_pair[0],
                "response": message_pair[1],
            }
            for ind, message_pair in enumerate(message_pairs)
        ]

        batch_operations = [("upsert", (item,)) for item in [session_item, *message_pair_items]]
        await container.execute_item_batch(batch_operations=batch_operations, partition_key=[entra_oid, session_id])
        return jsonify({}), 201
    except Exception as error:
        return error_response(error, "/chat_history")
@chat_history_cosmosdb_bp.get("/chat_history/sessions/<session_id>")
@authenticated
async def get_chat_history_session(auth_claims: dict[str, Any], session_id: str):
    """Return the stored question/response pairs of one chat session.

    The route declares the ``<session_id>`` URL variable so that the framework can bind the
    ``session_id`` argument of this handler.

    Args:
        auth_claims: Claims supplied by the ``authenticated`` decorator; ``oid`` is required.
        session_id: Session identifier taken from the URL.

    Returns:
        ``({"id", "entra_oid", "answers"}, 200)`` on success; 400 when chat history is not
        enabled; 401 when no ``oid`` is available; otherwise the response built by
        ``error_response``.
    """
    if not current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED]:
        return jsonify({"error": "Chat history not enabled"}), 400

    container: ContainerProxy = current_app.config[CONFIG_COSMOS_HISTORY_CONTAINER]
    if not container:
        return jsonify({"error": "Chat history not enabled"}), 400

    entra_oid = auth_claims.get("oid")
    if not entra_oid:
        return jsonify({"error": "User OID not found"}), 401

    try:
        # The query is scoped to the caller's own partition ([entra_oid, session_id]).
        res = container.query_items(
            query="SELECT * FROM c WHERE c.session_id = @session_id AND c.type = @type",
            parameters=[dict(name="@session_id", value=session_id), dict(name="@type", value="message_pair")],
            partition_key=[entra_oid, session_id],
        )

        message_pairs = []
        async for page in res.by_page():
            async for item in page:
                message_pairs.append([item["question"], item["response"]])

        return (
            jsonify(
                {
                    "id": session_id,
                    "entra_oid": entra_oid,
                    "answers": message_pairs,
                }
            ),
            200,
        )
    except Exception as error:
        return error_response(error, f"/chat_history/sessions/{session_id}")
@chat_history_cosmosdb_bp.before_app_serving
async def setup_clients():
    """Create the Cosmos DB client and container used for chat history, when enabled.

    Does nothing unless the ``USE_CHAT_HISTORY_COSMOS`` environment variable equals "true"
    (case-insensitive). When enabled, the client, the container client and the history
    version are stored in ``current_app.config``.

    Raises:
        ValueError: If chat history is enabled and one of the required environment
            variables is not set.
    """
    USE_CHAT_HISTORY_COSMOS = os.getenv("USE_CHAT_HISTORY_COSMOS", "").lower() == "true"
    AZURE_COSMOSDB_ACCOUNT = os.getenv("AZURE_COSMOSDB_ACCOUNT")
    AZURE_CHAT_HISTORY_DATABASE = os.getenv("AZURE_CHAT_HISTORY_DATABASE")
    AZURE_CHAT_HISTORY_CONTAINER = os.getenv("AZURE_CHAT_HISTORY_CONTAINER")
    AZURE_CHAT_HISTORY_VERSION = os.getenv("AZURE_CHAT_HISTORY_VERSION")

    azure_credential: AzureDeveloperCliCredential | ManagedIdentityCredential = current_app.config[CONFIG_CREDENTIAL]

    if USE_CHAT_HISTORY_COSMOS:
        current_app.logger.info("USE_CHAT_HISTORY_COSMOS is true, setting up CosmosDB client")
        if not AZURE_COSMOSDB_ACCOUNT:
            raise ValueError("AZURE_COSMOSDB_ACCOUNT must be set when USE_CHAT_HISTORY_COSMOS is true")
        if not AZURE_CHAT_HISTORY_DATABASE:
            raise ValueError("AZURE_CHAT_HISTORY_DATABASE must be set when USE_CHAT_HISTORY_COSMOS is true")
        if not AZURE_CHAT_HISTORY_CONTAINER:
            raise ValueError("AZURE_CHAT_HISTORY_CONTAINER must be set when USE_CHAT_HISTORY_COSMOS is true")
        # Checked before the client is created, so a missing version cannot leave behind a
        # client that is never stored in the app config and therefore never closed.
        # Only "unset" is rejected; an empty string is still accepted.
        if AZURE_CHAT_HISTORY_VERSION is None:
            raise ValueError("AZURE_CHAT_HISTORY_VERSION must be set when USE_CHAT_HISTORY_COSMOS is true")

        cosmos_client = CosmosClient(
            url=f"https://{AZURE_COSMOSDB_ACCOUNT}.documents.azure.com:443/", credential=azure_credential
        )
        cosmos_db = cosmos_client.get_database_client(AZURE_CHAT_HISTORY_DATABASE)
        cosmos_container = cosmos_db.get_container_client(AZURE_CHAT_HISTORY_CONTAINER)

        current_app.config[CONFIG_COSMOS_HISTORY_CLIENT] = cosmos_client
        current_app.config[CONFIG_COSMOS_HISTORY_CONTAINER] = cosmos_container
        current_app.config[CONFIG_COSMOS_HISTORY_VERSION] = AZURE_CHAT_HISTORY_VERSION
CONFIG_SPEECH_OUTPUT_BROWSER_ENABLED = "speech_output_browser_enabled" CONFIG_SPEECH_OUTPUT_AZURE_ENABLED = "speech_output_azure_enabled" CONFIG_SPEECH_SERVICE_ID = "speech_service_id" CONFIG_SPEECH_SERVICE_LOCATION = "speech_service_location" CONFIG_SPEECH_SERVICE_TOKEN = "speech_service_token" CONFIG_SPEECH_SERVICE_VOICE = "speech_service_voice" CONFIG_STREAMING_ENABLED = "streaming_enabled" CONFIG_CHAT_HISTORY_BROWSER_ENABLED = "chat_history_browser_enabled" CONFIG_CHAT_HISTORY_COSMOS_ENABLED = "chat_history_cosmos_enabled" CONFIG_AGENTIC_KNOWLEDGEBASE_ENABLED = "agentic_knowledgebase_enabled" CONFIG_COSMOS_HISTORY_CLIENT = "cosmos_history_client" CONFIG_COSMOS_HISTORY_CONTAINER = "cosmos_history_container" CONFIG_COSMOS_HISTORY_VERSION = "cosmos_history_version" CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled" CONFIG_RAG_SEARCH_TEXT_EMBEDDINGS = "rag_search_text_embeddings" CONFIG_RAG_SEARCH_IMAGE_EMBEDDINGS = "rag_search_image_embeddings" CONFIG_RAG_SEND_TEXT_SOURCES = "rag_send_text_sources" CONFIG_RAG_SEND_IMAGE_SOURCES = "rag_send_image_sources" CONFIG_WEB_SOURCE_ENABLED = "web_source_enabled" CONFIG_SHAREPOINT_SOURCE_ENABLED = "sharepoint_source_enabled" ================================================ FILE: app/backend/core/__init__.py ================================================ ================================================ FILE: app/backend/core/authentication.py ================================================ # Refactored from https://github.com/Azure-Samples/ms-identity-python-on-behalf-of import base64 import json import logging from typing import Any, Optional import aiohttp import jwt from azure.search.documents.aio import SearchClient from azure.search.documents.indexes.models import SearchIndex from cryptography.hazmat.primitives import serialization from cryptography.hazmat.primitives.asymmetric import rsa from msal import ConfidentialClientApplication from msal.token_cache import TokenCache from tenacity import ( AsyncRetrying, 
retry_if_exception_type, stop_after_attempt, wait_random_exponential, ) # AuthError is raised when the authentication token sent by the client UI cannot be parsed or there is an authentication error accessing the graph API class AuthError(Exception): def __init__(self, error, status_code): self.error = error self.status_code = status_code def __str__(self) -> str: return self.error or "" class AuthenticationHelper: scope: str = "https://search.azure.com/.default" def __init__( self, search_index: Optional[SearchIndex], use_authentication: bool, server_app_id: Optional[str], server_app_secret: Optional[str], client_app_id: Optional[str], tenant_id: Optional[str], enforce_access_control: bool = False, enable_unauthenticated_access: bool = False, ): self.use_authentication = use_authentication self.server_app_id = server_app_id self.server_app_secret = server_app_secret self.client_app_id = client_app_id self.tenant_id = tenant_id self.authority = f"https://login.microsoftonline.com/{tenant_id}" # Depending on if requestedAccessTokenVersion is 1 or 2, the issuer and audience of the token may be different # See https://learn.microsoft.com/graph/api/resources/apiapplication self.valid_issuers = [ f"https://sts.windows.net/{tenant_id}/", f"https://login.microsoftonline.com/{tenant_id}/v2.0", ] self.valid_audiences = [f"api://{server_app_id}", str(server_app_id)] # See https://learn.microsoft.com/entra/identity-platform/access-tokens#validate-the-issuer for more information on token validation self.key_url = f"{self.authority}/discovery/v2.0/keys" if self.use_authentication: field_names = [field.name for field in search_index.fields] if search_index else [] self.has_auth_fields = "oids" in field_names and "groups" in field_names self.enforce_access_control = enforce_access_control self.enable_unauthenticated_access = enable_unauthenticated_access self.confidential_client = ConfidentialClientApplication( server_app_id, authority=self.authority, 
client_credential=server_app_secret, token_cache=TokenCache() ) else: self.has_auth_fields = False self.enforce_access_control = False self.enable_unauthenticated_access = True def get_auth_setup_for_client(self) -> dict[str, Any]: # returns MSAL.js settings used by the client app return { "useLogin": self.use_authentication, # Whether or not login elements are enabled on the UI "requireAccessControl": self.enforce_access_control, # Whether or not access control is required to access documents with access control lists "enableUnauthenticatedAccess": self.enable_unauthenticated_access, # Whether or not the user can access the app without login "msalConfig": { "auth": { "clientId": self.client_app_id, # Client app id used for login "authority": self.authority, # Directory to use for login https://learn.microsoft.com/entra/identity-platform/msal-client-application-configuration#authority "redirectUri": "/redirect", # Points to window.location.origin. You must register this URI on Azure Portal/App Registration. "postLogoutRedirectUri": "/", # Indicates the page to navigate after logout. "navigateToLoginRequestUrl": False, # If "true", will navigate back to the original request location before processing the auth code response. }, "cache": { # Configures cache location. "sessionStorage" is more secure, but "localStorage" gives you SSO between tabs. "cacheLocation": "localStorage", # Set this to "true" if you are having issues on IE11 or Edge "storeAuthStateInCookie": False, }, }, "loginRequest": { # Scopes you add here will be prompted for user consent during sign-in. # By default, MSAL.js will add OIDC scopes (openid, profile, email) to any login request. 
# For more information about OIDC scopes, visit: # https://learn.microsoft.com/entra/identity-platform/permissions-consent-overview#openid-connect-scopes "scopes": [".default"], # Uncomment the following line to cause a consent dialog to appear on every login # For more information, please visit https://learn.microsoft.com/entra/identity-platform/v2-oauth2-auth-code-flow#request-an-authorization-code # "prompt": "consent" }, "tokenRequest": { "scopes": [f"api://{self.server_app_id}/access_as_user"], }, } @staticmethod def get_token_auth_header(headers: dict) -> str: # Obtains the Access Token from the Authorization Header auth = headers.get("Authorization") if auth: parts = auth.split() if parts[0].lower() != "bearer": raise AuthError(error="Authorization header must start with Bearer", status_code=401) elif len(parts) == 1: raise AuthError(error="Token not found", status_code=401) elif len(parts) > 2: raise AuthError(error="Authorization header must be Bearer token", status_code=401) token = parts[1] return token # App services built-in authentication passes the access token directly as a header # To learn more, please visit https://learn.microsoft.com/azure/app-service/configure-authentication-oauth-tokens token = headers.get("x-ms-token-aad-access-token") if token: return token raise AuthError(error="Authorization header is expected", status_code=401) async def get_auth_claims_if_enabled(self, headers: dict) -> dict[str, Any]: if not self.use_authentication: return {} try: # Read the authentication token from the authorization header and exchange it using the On Behalf Of Flow # The scope is set to Azure Search for authentication # https://learn.microsoft.com/entra/identity-platform/v2-oauth2-on-behalf-of-flow auth_token = AuthenticationHelper.get_token_auth_header(headers) # Validate the token before use await self.validate_access_token(auth_token) # Use the on-behalf-of-flow to acquire another token for use with Azure Search # See 
https://learn.microsoft.com/entra/identity-platform/v2-oauth2-on-behalf-of-flow for more information search_resource_access_token = self.confidential_client.acquire_token_on_behalf_of( user_assertion=auth_token, scopes=[self.scope] ) if "error" in search_resource_access_token: raise AuthError(error=str(search_resource_access_token), status_code=401) id_token_claims = search_resource_access_token["id_token_claims"] auth_claims = {"oid": id_token_claims["oid"]} # Only pass on the access token if access control is required # See https://learn.microsoft.com/azure/search/search-query-access-control-rbac-enforcement for more information if self.enforce_access_control: access_token = search_resource_access_token["access_token"] auth_claims["access_token"] = access_token return auth_claims except AuthError as e: logging.exception("Exception getting authorization information - " + json.dumps(e.error)) if not self.enable_unauthenticated_access: raise return {} except Exception: logging.exception("Exception getting authorization information") if not self.enable_unauthenticated_access: raise return {} async def check_path_auth(self, path: str, auth_claims: dict[str, Any], search_client: SearchClient) -> bool: # If there was no access control or no path, then the path is allowed if not self.enforce_access_control or len(path) == 0: return True # Remove any fragment string from the path before checking fragment_index = path.find("#") if fragment_index != -1: path = path[:fragment_index] # Filter down to only chunks that are from the specific source file # Sourcepage is used for GPT-4V # Replace ' with '' to escape the single quote for the filter # https://learn.microsoft.com/azure/search/query-odata-filter-orderby-syntax#escaping-special-characters-in-string-constants path_for_filter = path.replace("'", "''") filter = f"(sourcefile eq '{path_for_filter}') or (sourcepage eq '{path_for_filter}')" # If the filter returns any results, the user is allowed to access the document # 
async def create_pem_format(self, jwks, token):
    """Return the PEM-encoded RSA public key from ``jwks`` whose ``kid`` matches the token header.

    Args:
        jwks: Key set dictionary with a ``"keys"`` list; matching entries must provide the
            base64url-encoded ``"e"`` and ``"n"`` values.
        token: Encoded JWT whose unverified header names the signing key via ``kid``.

    Returns:
        The public key as PEM bytes (SubjectPublicKeyInfo), or ``None`` when the token
        header has no ``kid`` or no key in ``jwks`` matches it. Callers treat ``None`` as
        "no appropriate key found".
    """
    token_kid = jwt.get_unverified_header(token).get("kid")
    if token_kid is None:
        return None

    for key in jwks["keys"]:
        if key.get("kid") != token_kid:
            continue

        # Construct the RSA public key; "==" is appended because the JWKS values are
        # base64url-encoded without padding.
        public_numbers = rsa.RSAPublicNumbers(
            e=int.from_bytes(base64.urlsafe_b64decode(key["e"] + "=="), byteorder="big"),
            n=int.from_bytes(base64.urlsafe_b64decode(key["n"] + "=="), byteorder="big"),
        )
        public_key = public_numbers.public_key()

        # Convert to PEM format
        return public_key.public_bytes(
            encoding=serialization.Encoding.PEM, format=serialization.PublicFormat.SubjectPublicKeyInfo
        )

    return None
def create_session_id(
    config_chat_history_cosmos_enabled: bool, config_chat_history_browser_enabled: bool
) -> Optional[str]:
    """Return a fresh random UUID string when any chat history backend is enabled, else ``None``."""
    history_enabled = config_chat_history_cosmos_enabled or config_chat_history_browser_enabled
    return str(uuid.uuid4()) if history_enabled else None
}, }, "loggers": { "root": {"handlers": ["default"]}, "uvicorn.error": { "level": "INFO", "handlers": ["default"], "propagate": False, }, "uvicorn.access": { "level": "INFO", "handlers": ["access"], "propagate": False, }, }, } class CustomUvicornWorker(UvicornWorker): CONFIG_KWARGS = { "log_config": logconfig_dict, } ================================================ FILE: app/backend/decorators.py ================================================ import logging from collections.abc import Callable from functools import wraps from typing import Any, TypeVar, cast from quart import abort, current_app, request from config import CONFIG_AUTH_CLIENT, CONFIG_SEARCH_CLIENT from core.authentication import AuthError from error import error_response def authenticated_path(route_fn: Callable[[str, dict[str, Any]], Any]): """ Decorator for routes that request a specific file that might require access control enforcement """ @wraps(route_fn) async def auth_handler(path=""): # If authentication is enabled, validate the user can access the file auth_helper = current_app.config[CONFIG_AUTH_CLIENT] search_client = current_app.config[CONFIG_SEARCH_CLIENT] authorized = False try: auth_claims = await auth_helper.get_auth_claims_if_enabled(request.headers) authorized = await auth_helper.check_path_auth(path, auth_claims, search_client) except AuthError: abort(403) except Exception as error: logging.exception("Problem checking path auth %s", error) return error_response(error, route="/content") if not authorized: abort(403) return await route_fn(path, auth_claims) return auth_handler _C = TypeVar("_C", bound=Callable[..., Any]) def authenticated(route_fn: _C) -> _C: """ Decorator for routes that might require access control. 
Unpacks Authorization header information into an auth_claims dictionary """ @wraps(route_fn) async def auth_handler(*args, **kwargs): auth_helper = current_app.config[CONFIG_AUTH_CLIENT] try: auth_claims = await auth_helper.get_auth_claims_if_enabled(request.headers) except AuthError: abort(403) return await route_fn(auth_claims, *args, **kwargs) return cast(_C, auth_handler) ================================================ FILE: app/backend/error.py ================================================ import logging from openai import APIError from quart import jsonify ERROR_MESSAGE = """The app encountered an error processing your request. If you are an administrator of the app, check the application logs for a full traceback. Error type: {error_type} """ ERROR_MESSAGE_FILTER = """Your message contains content that was flagged by the OpenAI content filter.""" ERROR_MESSAGE_LENGTH = """Your message exceeded the context length limit for this OpenAI model. Please shorten your message or change your settings to retrieve fewer search results.""" def error_dict(error: Exception) -> dict: if isinstance(error, APIError) and error.code == "content_filter": return {"error": ERROR_MESSAGE_FILTER} if isinstance(error, APIError) and error.code == "context_length_exceeded": return {"error": ERROR_MESSAGE_LENGTH} return {"error": ERROR_MESSAGE.format(error_type=type(error))} def error_response(error: Exception, route: str, status_code: int = 500): logging.exception("Exception in %s: %s", route, error) if isinstance(error, APIError) and error.code == "content_filter": status_code = 400 return jsonify(error_dict(error)), status_code ================================================ FILE: app/backend/gunicorn.conf.py ================================================ import multiprocessing import os max_requests = 1000 max_requests_jitter = 50 log_file = "-" bind = "0.0.0.0" timeout = 230 # 
https://learn.microsoft.com/troubleshoot/azure/app-service/web-apps-performance-faqs#why-does-my-request-time-out-after-230-seconds num_cpus = multiprocessing.cpu_count() if os.getenv("WEBSITE_SKU") == "LinuxFree": # Free tier reports 2 CPUs but can't handle multiple workers workers = 1 else: workers = (num_cpus * 2) + 1 worker_class = "custom_uvicorn_worker.CustomUvicornWorker" ================================================ FILE: app/backend/load_azd_env.py ================================================ import json import logging import os import subprocess from dotenv import load_dotenv logger = logging.getLogger("scripts") def load_azd_env(): """Get path to current azd env file and load file using python-dotenv""" result = subprocess.run("azd env list -o json", shell=True, capture_output=True, text=True) if result.returncode != 0: raise Exception("Error loading azd env") env_json = json.loads(result.stdout) env_file_path = None for entry in env_json: if entry["IsDefault"]: env_file_path = entry["DotEnvPath"] if not env_file_path: raise Exception("No default azd env file found") loading_mode = os.getenv("LOADING_MODE_FOR_AZD_ENV_VARS") or "override" if loading_mode == "no-override": logger.info("Loading azd env from %s, but not overriding existing environment variables", env_file_path) load_dotenv(env_file_path, override=False) else: logger.info("Loading azd env from %s, which may override existing environment variables", env_file_path) load_dotenv(env_file_path, override=True) ================================================ FILE: app/backend/main.py ================================================ import os from app import create_app from load_azd_env import load_azd_env # WEBSITE_HOSTNAME is always set by App Service, RUNNING_IN_PRODUCTION is set in main.bicep RUNNING_ON_AZURE = os.getenv("WEBSITE_HOSTNAME") is not None or os.getenv("RUNNING_IN_PRODUCTION") is not None if not RUNNING_ON_AZURE: load_azd_env() app = create_app() 
================================================ FILE: app/backend/prepdocs.py ================================================ import argparse import asyncio import logging import os from typing import Optional import aiohttp from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential from openai import AsyncOpenAI from rich.logging import RichHandler from load_azd_env import load_azd_env from prepdocslib.filestrategy import FileStrategy from prepdocslib.integratedvectorizerstrategy import ( IntegratedVectorizerStrategy, ) from prepdocslib.listfilestrategy import ( LocalListFileStrategy, ) from prepdocslib.servicesetup import ( OpenAIHost, build_file_processors, clean_key_if_exists, setup_blob_manager, setup_embeddings_service, setup_figure_processor, setup_image_embeddings_service, setup_openai_client, setup_search_info, ) from prepdocslib.strategy import DocumentAction, Strategy logger = logging.getLogger("scripts") async def check_search_service_connectivity(search_service: str) -> bool: """Check if the search service is accessible by hitting the /ping endpoint.""" ping_url = f"https://{search_service}.search.windows.net/ping" try: async with aiohttp.ClientSession() as session: async with session.get(ping_url, timeout=aiohttp.ClientTimeout(total=10)) as response: return response.status == 200 except Exception as e: logger.debug(f"Search service ping failed: {e}") return False def setup_list_file_strategy( azure_credential: AsyncTokenCredential, local_files: str, enable_global_documents: bool = False, ): logger.info("Using local files: %s", local_files) list_file_strategy = LocalListFileStrategy( path_pattern=local_files, enable_global_documents=enable_global_documents ) return list_file_strategy def setup_file_processors( azure_credential: AsyncTokenCredential, document_intelligence_service: Optional[str], document_intelligence_key: Optional[str] = None, local_pdf_parser: bool = False, 
local_html_parser: bool = False, use_content_understanding: bool = False, use_multimodal: bool = False, openai_client: Optional[AsyncOpenAI] = None, openai_model: Optional[str] = None, openai_deployment: Optional[str] = None, content_understanding_endpoint: Optional[str] = None, ): """Setup file processors and figure processor for document ingestion. Uses build_file_processors from servicesetup to ensure consistent parser/splitter selection logic with the Azure Functions cloud ingestion pipeline. """ file_processors = build_file_processors( azure_credential=azure_credential, document_intelligence_service=document_intelligence_service, document_intelligence_key=document_intelligence_key, use_local_pdf_parser=local_pdf_parser, use_local_html_parser=local_html_parser, process_figures=use_multimodal, ) figure_processor = setup_figure_processor( credential=azure_credential, use_multimodal=use_multimodal, use_content_understanding=use_content_understanding, content_understanding_endpoint=content_understanding_endpoint, openai_client=openai_client, openai_model=openai_model, openai_deployment=openai_deployment, ) return file_processors, figure_processor async def main(strategy: Strategy, setup_index: bool = True): if setup_index: await strategy.setup() await strategy.run() if __name__ == "__main__": # pragma: no cover parser = argparse.ArgumentParser( description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index." 
) parser.add_argument("files", nargs="?", help="Files to be processed") parser.add_argument( "--category", help="Value for the category field in the search index for all sections indexed in this run" ) parser.add_argument( "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" ) parser.add_argument( "--remove", action="store_true", help="Remove references to this document from blob storage and the search index", ) parser.add_argument( "--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index", ) # Optional key specification: parser.add_argument( "--searchkey", required=False, help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( "--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument( "--documentintelligencekey", required=False, help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)", ) parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() if args.verbose: logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) # We only set the level to INFO for our logger, # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.DEBUG) load_azd_env() if os.getenv("USE_CLOUD_INGESTION", "").lower() == "true": logger.warning( "Cloud ingestion is enabled. Please use setup_cloud_ingestion.py instead of prepdocs.py. Exiting." 
) exit(0) if ( os.getenv("AZURE_PUBLIC_NETWORK_ACCESS") == "Disabled" and os.getenv("AZURE_USE_VPN_GATEWAY", "").lower() != "true" ): logger.error("AZURE_PUBLIC_NETWORK_ACCESS is set to Disabled. Exiting.") exit(0) use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" use_multimodal = os.getenv("USE_MULTIMODAL", "").lower() == "true" use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" enable_global_documents = os.getenv("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS", "").lower() == "true" dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" use_agentic_knowledgebase = os.getenv("USE_AGENTIC_KNOWLEDGEBASE", "").lower() == "true" use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" use_web_source = os.getenv("USE_WEB_SOURCE", "").lower() == "true" use_sharepoint_source = os.getenv("USE_SHAREPOINT_SOURCE", "").lower() == "true" # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. 
if tenant_id := os.getenv("AZURE_TENANT_ID"): logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) else: logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) if args.removeall: document_action = DocumentAction.RemoveAll elif args.remove: document_action = DocumentAction.Remove else: document_action = DocumentAction.Add loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) OPENAI_HOST = OpenAIHost(os.environ["OPENAI_HOST"]) # Check for incompatibility # if openai host is not azure if use_agentic_knowledgebase and OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: raise Exception("Agentic retrieval requires an Azure OpenAI chat completion service") search_info = setup_search_info( search_service=os.environ["AZURE_SEARCH_SERVICE"], index_name=os.environ["AZURE_SEARCH_INDEX"], use_agentic_knowledgebase=use_agentic_knowledgebase, knowledgebase_name=os.getenv("AZURE_SEARCH_KNOWLEDGEBASE_NAME"), azure_openai_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], azure_openai_knowledgebase_deployment=os.getenv("AZURE_OPENAI_KNOWLEDGEBASE_DEPLOYMENT"), azure_openai_knowledgebase_model=os.getenv("AZURE_OPENAI_KNOWLEDGEBASE_MODEL"), azure_credential=azd_credential, search_key=clean_key_if_exists(args.searchkey), azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), ) # Check search service connectivity search_service = os.environ["AZURE_SEARCH_SERVICE"] is_connected = loop.run_until_complete(check_search_service_connectivity(search_service)) if not is_connected: if os.getenv("AZURE_USE_PRIVATE_ENDPOINT"): logger.error( "Unable to connect to Azure AI Search service, which indicates either a network issue or a misconfiguration. You have AZURE_USE_PRIVATE_ENDPOINT enabled. Perhaps you're not yet connected to the VPN? 
Download the VPN configuration from the Azure portal here: %s", os.getenv("AZURE_VPN_CONFIG_DOWNLOAD_LINK"), ) else: logger.error( "Unable to connect to Azure AI Search service, which indicates either a network issue or a misconfiguration." ) exit(1) blob_manager = setup_blob_manager( azure_credential=azd_credential, storage_account=os.environ["AZURE_STORAGE_ACCOUNT"], storage_container=os.environ["AZURE_STORAGE_CONTAINER"], storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], storage_key=clean_key_if_exists(args.storagekey), image_storage_container=os.environ.get("AZURE_IMAGESTORAGE_CONTAINER"), # Pass the image container ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, local_files=args.files, enable_global_documents=enable_global_documents, ) emb_model_dimensions = 1536 if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_client, azure_openai_endpoint = setup_openai_client( openai_host=OPENAI_HOST, azure_credential=azd_credential, azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), openai_organization=os.getenv("OPENAI_ORGANIZATION"), ) openai_embeddings_service = None if not dont_use_vectors: openai_embeddings_service = setup_embeddings_service( OPENAI_HOST, openai_client, emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], emb_model_dimensions=emb_model_dimensions, azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), azure_openai_endpoint=azure_openai_endpoint, disable_batch=args.disablebatchvectors, ) ingestion_strategy: Strategy if use_int_vectorization: if not openai_embeddings_service or OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: raise Exception("Integrated 
vectorization strategy requires an Azure OpenAI embeddings service") ingestion_strategy = IntegratedVectorizerStrategy( search_info=search_info, list_file_strategy=list_file_strategy, blob_manager=blob_manager, document_action=document_action, embeddings=openai_embeddings_service, search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"], subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), use_acls=use_acls, category=args.category, enforce_access_control=enforce_access_control, ) else: file_processors, figure_processor = setup_file_processors( azure_credential=azd_credential, document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", use_content_understanding=use_content_understanding, use_multimodal=use_multimodal, content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), openai_client=openai_client, openai_model=os.getenv("AZURE_OPENAI_CHATGPT_MODEL"), openai_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if OPENAI_HOST == OpenAIHost.AZURE else None, ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), use_multimodal=use_multimodal, ) ingestion_strategy = FileStrategy( search_info=search_info, list_file_strategy=list_file_strategy, blob_manager=blob_manager, file_processors=file_processors, document_action=document_action, embeddings=openai_embeddings_service, image_embeddings=image_embeddings_service, search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), # Default to the previous field names for backward compatibility search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"), use_acls=use_acls, 
category=args.category, figure_processor=figure_processor, enforce_access_control=enforce_access_control, use_web_source=use_web_source, use_sharepoint_source=use_sharepoint_source, ) try: loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) finally: # Gracefully close any async clients/credentials to avoid noisy destructor warnings try: loop.run_until_complete(blob_manager.close_clients()) loop.run_until_complete(openai_client.close()) loop.run_until_complete(azd_credential.close()) except Exception as e: logger.debug(f"Failed to close async clients cleanly: {e}") loop.close() ================================================ FILE: app/backend/prepdocslib/__init__.py ================================================ ================================================ FILE: app/backend/prepdocslib/blobmanager.py ================================================ import io import logging import os import re from pathlib import Path from typing import IO, Any, Optional, TypedDict, cast from urllib.parse import unquote from azure.core.credentials_async import AsyncTokenCredential from azure.core.exceptions import ResourceNotFoundError from azure.storage.blob.aio import BlobServiceClient from azure.storage.filedatalake.aio import ( DataLakeDirectoryClient, FileSystemClient, ) from PIL import Image, ImageDraw, ImageFont from .listfilestrategy import File logger = logging.getLogger("scripts") class BlobProperties(TypedDict, total=False): """Properties of a blob, with optional fields for content settings""" content_settings: dict[str, Any] class BaseBlobManager: """ Base class for Azure Storage operations, providing common file naming and path utilities """ @classmethod def sourcepage_from_file_page(cls, filename, page=0) -> str: if os.path.splitext(filename)[1].lower() == ".pdf": return f"{os.path.basename(filename)}#page={page+1}" else: return os.path.basename(filename) @classmethod def blob_name_from_file_name(cls, filename) -> 
str: return os.path.basename(filename) @classmethod def add_image_citation( cls, image_bytes: bytes, document_filename: str, image_filename: str, page_num: int ) -> bytes: """ Adds citation text to an image from a document. Args: image_bytes: The original image bytes document_filename: The name of the document containing the image image_filename: The name of the image file page_num: The page number where the image appears Returns: A tuple containing (BytesIO of the modified image, format of the image) """ # Load and modify the image to add text image = Image.open(io.BytesIO(image_bytes)) line_height = 30 text_height = line_height * 2 # Two lines of text new_img = Image.new("RGB", (image.width, image.height + text_height), "white") new_img.paste(image, (0, text_height)) # Add text draw = ImageDraw.Draw(new_img) sourcepage = cls.sourcepage_from_file_page(document_filename, page=page_num) text = sourcepage figure_text = image_filename # Load the Jupiteroid font which is included in the repo font_path = Path(__file__).parent / "Jupiteroid-Regular.ttf" font = ImageFont.truetype(str(font_path), 20) # Slightly smaller font for better fit # Calculate text widths for right alignment fig_width = draw.textlength(figure_text, font=font) # Left align document name, right align figure name padding = 20 # Padding from edges draw.text((padding, 5), text, font=font, fill="black") # Left aligned draw.text( (new_img.width - fig_width - padding, line_height + 5), figure_text, font=font, fill="black" ) # Right aligned # Convert back to bytes output = io.BytesIO() format = image.format or "PNG" new_img.save(output, format=format) return output.getvalue() async def upload_document_image( self, document_filename: str, image_bytes: bytes, image_filename: str, image_page_num: int, user_oid: Optional[str] = None, ) -> Optional[str]: raise NotImplementedError("Subclasses must implement this method") async def download_blob( self, blob_path: str, user_oid: Optional[str] = None, container: 
Optional[str] = None ) -> Optional[tuple[bytes, BlobProperties]]: """ Downloads a blob from Azure Storage. If user_oid is provided, it checks if the blob belongs to the user. Args: blob_path: The path to the blob in the storage user_oid: The user's object ID (optional) container: Optional container name override (defaults to the manager's configured container) Returns: Optional[tuple[bytes, BlobProperties]]: - A tuple containing the blob content as bytes and the blob properties - None if blob not found or access denied """ raise NotImplementedError("Subclasses must implement this method") class AdlsBlobManager(BaseBlobManager): """ Manager for Azure Data Lake Storage blob operations, particularly for user-specific file operations. Documents are stored directly in the user's directory for backwards compatibility. Images are stored in a separate images subdirectory for better organization. """ def __init__(self, endpoint: str, container: str, credential: AsyncTokenCredential): """ Initializes the AdlsBlobManager with the necessary parameters. Args: endpoint: The ADLS endpoint URL container: The name of the container (file system) credential: The credential for accessing ADLS """ self.endpoint = endpoint self.container = container self.credential = credential self.file_system_client = FileSystemClient( account_url=self.endpoint, file_system_name=self.container, credential=self.credential, ) async def close_clients(self): await self.file_system_client.close() async def _ensure_directory(self, directory_path: str, user_oid: str) -> DataLakeDirectoryClient: """ Ensures that a directory path exists and has proper permissions. Creates the entire path in a single operation if it doesn't exist. 
Args: directory_path: Full path of directory to create (e.g., 'user123/images/mydoc') user_oid: The owner to set for all created directories """ directory_client = self.file_system_client.get_directory_client(directory_path) try: await directory_client.get_directory_properties() # Check directory properties to ensure it has the correct owner props = await directory_client.get_access_control() if props.get("owner") != user_oid: raise PermissionError(f"User {user_oid} does not have permission to access {directory_path}") except ResourceNotFoundError: logger.info("Creating directory path %s", directory_path) await directory_client.create_directory() await directory_client.set_access_control(owner=user_oid) return directory_client async def upload_blob(self, file: File | IO, filename: str, user_oid: str) -> str: """ Uploads a file directly to the user's directory in ADLS (no subdirectory). Args: file: Either a File object or an IO object to upload filename: The name of the file to upload user_oid: The user's object ID Returns: str: The URL of the uploaded file, with forward slashes (not URL-encoded) """ # Ensure user directory exists but don't create a subdirectory user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid) # Create file directly in user directory file_client = user_directory_client.get_file_client(filename) # Handle both File and IO objects if isinstance(file, File): file_io = file.content else: file_io = file # Ensure the file is at the beginning file_io.seek(0) await file_client.upload_data(file_io, overwrite=True) # Reset the file position for any subsequent reads file_io.seek(0) # Decode the URL to convert %2F back to / and other escaped characters return unquote(file_client.url) def _get_image_directory_path(self, document_filename: str, user_oid: str, page_num: Optional[int] = None) -> str: """ Returns the standardized path for storing document images. 
Args: document_filename: The name of the document user_oid: The user's object ID page_num: Optional page number. If provided, includes a page-specific subdirectory Returns: str: Full path to the image directory """ if page_num is not None: return f"{user_oid}/images/{document_filename}/page_{page_num}" return f"{user_oid}/images/{document_filename}" async def upload_document_image( self, document_filename: str, image_bytes: bytes, image_filename: str, image_page_num: int, user_oid: Optional[str] = None, ) -> Optional[str]: """ Uploads an image from a document to ADLS in a directory structure: {user_oid}/{document_name}/images/{image_name} This structure allows for easy cleanup when documents are deleted. Args: document_filename: The name of the document containing the image image_bytes: The image data to upload image_filename: The name to give the image file image_page_num: The page number where the image appears in the document user_oid: The user's object ID Returns: str: The URL of the uploaded file, with forward slashes (not URL-encoded) """ if user_oid is None: raise ValueError("user_oid must be provided for user-specific operations.") await self._ensure_directory(directory_path=user_oid, user_oid=user_oid) image_directory_path = self._get_image_directory_path(document_filename, user_oid, image_page_num) image_directory_client = await self._ensure_directory(directory_path=image_directory_path, user_oid=user_oid) file_client = image_directory_client.get_file_client(image_filename) image_bytes = BaseBlobManager.add_image_citation(image_bytes, document_filename, image_filename, image_page_num) logger.info("Uploading document image '%s' to '%s'", image_filename, image_directory_path) await file_client.upload_data(image_bytes, overwrite=True, metadata={"UploadedBy": user_oid}) return unquote(file_client.url) async def download_blob( self, blob_path: str, user_oid: Optional[str] = None, container: Optional[str] = None ) -> Optional[tuple[bytes, BlobProperties]]: """ 
Downloads a blob from Azure Data Lake Storage. Args: blob_path: The path to the blob in the format {user_oid}/{document_name}/images/{image_name} user_oid: The user's object ID container: Optional filesystem name override (ignored; this manager uses its configured filesystem) Returns: Optional[tuple[bytes, BlobProperties]]: - A tuple containing the blob content as bytes and the blob properties as a dictionary - None if blob not found or access denied """ if user_oid is None: logger.warning("user_oid must be provided for Data Lake Storage operations.") return None # Get the directory path and file name from the blob path path_parts = blob_path.split("/") if len(path_parts) < 2: # If no slashes in path, we assume it's a file in the user's root directory filename = blob_path directory_path = user_oid else: # First verify that the root directory matches the user_oid root_dir = path_parts[0] if root_dir != user_oid: logger.warning(f"User {user_oid} does not have permission to access {blob_path}") return None # Get the directory client for the full path except the filename directory_path = "/".join(path_parts[:-1]) filename = path_parts[-1] try: user_directory_client = await self._ensure_directory(directory_path=directory_path, user_oid=user_oid) file_client = user_directory_client.get_file_client(filename) download_response = await file_client.download_file() content = await download_response.readall() # Convert FileProperties to our BlobProperties format properties: BlobProperties = { "content_settings": { "content_type": download_response.properties.get("content_type", "application/octet-stream") } } return content, properties except ResourceNotFoundError: logger.warning(f"Directory or file not found: {directory_path}/{filename}") return None except Exception as e: logging.error(f"Error accessing directory {directory_path}: {str(e)}") return None async def remove_blob(self, filename: str, user_oid: str) -> None: """ Deletes a file from the user's directory in ADLS and 
any associated image directories. The following will be deleted: - {user_oid}/{filename} - {user_oid}/images/{filename}/* (recursively) Args: filename: The name of the file to delete user_oid: The user's object ID Raises: ResourceNotFoundError: If the file does not exist """ # Ensure the user directory exists user_directory_client = await self._ensure_directory(directory_path=user_oid, user_oid=user_oid) # Delete the main document file file_client = user_directory_client.get_file_client(filename) await file_client.delete_file() # Try to delete any associated image directories image_directory_path = self._get_image_directory_path(filename, user_oid) try: image_directory_client = await self._ensure_directory( directory_path=image_directory_path, user_oid=user_oid ) await image_directory_client.delete_directory() logger.info(f"Deleted associated image directory: {image_directory_path}") except ResourceNotFoundError: # It's okay if there was no image directory logger.debug(f"No image directory found at {image_directory_path}") pass async def list_blobs(self, user_oid: str) -> list[str]: """ Lists the uploaded documents for the given user. Only returns files directly in the user's directory, not in subdirectories. Excludes image files and the images directory. Args: user_oid: The user's object ID Returns: list[str]: List of filenames that belong to the user """ await self._ensure_directory(directory_path=user_oid, user_oid=user_oid) files = [] try: all_paths = self.file_system_client.get_paths(path=user_oid, recursive=True) async for path in all_paths: # Split path into parts (user_oid/filename or user_oid/directory/files) path_parts = path.name.split("/", 1) if len(path_parts) != 2: continue filename = path_parts[1] # Only include files that are: # 1. Directly in the user's directory (no additional slashes) # 2. Not image files # 3. 
Not in a directory containing 'images' if ( "/" not in filename and not any(filename.lower().endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp"]) and "images" not in filename ): files.append(filename) except ResourceNotFoundError as error: if error.status_code != 404: logger.exception("Error listing uploaded files", error) # Return empty list for 404 (no directory) as this is expected for new users return files class BlobManager(BaseBlobManager): """ Class to manage uploading and deleting blobs containing citation information from a blob storage account """ def __init__( self, endpoint: str, container: str, credential: AsyncTokenCredential | str, image_container: Optional[str] = None, account: Optional[str] = None, resource_group: Optional[str] = None, subscription_id: Optional[str] = None, ): self.endpoint = endpoint self.credential = credential self.account = account self.container = container self.resource_group = resource_group self.subscription_id = subscription_id self.image_container = image_container self.blob_service_client = BlobServiceClient( account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024 ) async def close_clients(self): await self.blob_service_client.close() def get_managedidentity_connectionstring(self): if not self.account or not self.resource_group or not self.subscription_id: raise ValueError("Account, resource group, and subscription ID must be set to generate connection string.") return f"ResourceId=/subscriptions/{self.subscription_id}/resourceGroups/{self.resource_group}/providers/Microsoft.Storage/storageAccounts/{self.account};" async def upload_blob(self, file: File) -> str: container_client = self.blob_service_client.get_container_client(self.container) if not await container_client.exists(): await container_client.create_container() # Re-open and upload the original file # URL may be a path to a local file or already set to a blob URL if file.url is None or os.path.exists(file.url): 
async def upload_document_image(
    self,
    document_filename: str,
    image_bytes: bytes,
    image_filename: str,
    image_page_num: int,
    user_oid: Optional[str] = None,
) -> Optional[str]:
    """
    Upload one image extracted from a document to the image container.

    The bytes are first passed through ``self.add_image_citation`` and then stored under
    ``<document blob name>/page<image_page_num>/<image_filename>``. The image container is
    created when it does not exist yet.

    Returns:
        The URL of the uploaded blob.

    Raises:
        ValueError: If no image container is configured, or if ``user_oid`` is given
            (per-user uploads are not supported by this manager).
    """
    if self.image_container is None:
        raise ValueError(
            "Image container name is not set. Re-run `azd provision` to automatically set up the images container."
        )
    if user_oid is not None:
        raise ValueError(
            "user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations."
        )

    images = self.blob_service_client.get_container_client(self.image_container)
    container_present = await images.exists()
    if not container_present:
        await images.create_container()

    cited_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num)
    document_folder = self.blob_name_from_file_name(document_filename)
    target_name = f"{document_folder}/page{image_page_num}/{image_filename}"
    logger.info("Uploading blob for document image '%s'", target_name)

    uploaded = await images.upload_blob(target_name, cited_bytes, overwrite=True)
    return uploaded.url
async def remove_blob(self, path: Optional[str] = None):
    """
    Delete blobs from the content container.

    Args:
        path: Path (or bare file name) of the source document whose blobs should be removed.
            When None, every blob in the container is deleted.

    When a path is given, a blob is deleted only if its name equals the document's file name
    or it is a per-page blob named ``<stem>-<number>.pdf`` / ``<stem>-<number>.png``, where
    ``<stem>`` is the file name without its extension. All other blobs are left untouched.
    Returns without doing anything when the container does not exist.
    """
    container_client = self.blob_service_client.get_container_client(self.container)
    if not await container_client.exists():
        return

    if path is None:
        file_name = None
        page_blob_pattern = None
        blobs = container_client.list_blob_names()
    else:
        file_name = os.path.basename(path)
        stem = os.path.splitext(file_name)[0]
        # The stem is escaped because file names commonly contain regex metacharacters
        # ('.', '+', '(' ...). This still supports PDFs split into individual pages, but we
        # could remove that in future to simplify the code.
        page_blob_pattern = re.compile(rf"{re.escape(stem)}-\d+\.(?:pdf|png)")
        blobs = container_client.list_blob_names(name_starts_with=stem)

    async for blob_path in blobs:
        # With a path, keep every blob that is neither the document itself nor one of its pages.
        # The listing prefix alone is not enough: "report" also prefixes "report-final.pdf".
        if page_blob_pattern is not None and blob_path != file_name and not page_blob_pattern.fullmatch(blob_path):
            continue
        logger.info("Removing blob %s", blob_path)
        await container_client.delete_blob(blob_path)
def __init__(
    self,
    *,
    list_file_strategy: ListFileStrategy,
    blob_manager: BlobManager,
    search_info: SearchInfo,
    embeddings: OpenAIEmbeddings,
    search_field_name_embedding: str,
    document_extractor_uri: str,
    document_extractor_auth_resource_id: str,
    figure_processor_uri: str,
    figure_processor_auth_resource_id: str,
    text_processor_uri: str,
    text_processor_auth_resource_id: str,
    subscription_id: str,
    document_action: DocumentAction = DocumentAction.Add,
    search_analyzer_name: str | None = None,
    use_acls: bool = False,
    use_multimodal: bool = False,
    enforce_access_control: bool = False,
    use_web_source: bool = False,
    search_user_assigned_identity_resource_id: str,
) -> None:
    """Store the ingestion settings and derive the names of the search resources and custom skills."""
    # Collaborators and identity settings.
    self.list_file_strategy = list_file_strategy
    self.blob_manager = blob_manager
    self.search_info = search_info
    self.embeddings = embeddings
    self.subscription_id = subscription_id
    self.search_user_assigned_identity_resource_id = search_user_assigned_identity_resource_id
    self._search_manager: SearchManager | None = None

    # Index and ingestion options.
    self.document_action = document_action
    self.search_field_name_embedding = search_field_name_embedding
    self.search_analyzer_name = search_analyzer_name
    self.use_acls = use_acls
    self.use_multimodal = use_multimodal
    self.enforce_access_control = enforce_access_control
    self.use_web_source = use_web_source

    # Every resource created by this strategy is named after the target index.
    base_name = f"{search_info.index_name}-cloud"
    self.skillset_name = f"{base_name}-skillset"
    self.indexer_name = f"{base_name}-indexer"
    self.data_source_name = f"{base_name}-blob"

    def make_skill(suffix: str, description: str, uri: str, auth_resource_id: str) -> SkillConfig:
        return SkillConfig(
            name=f"{base_name}-{suffix}",
            description=description,
            uri=uri,
            auth_resource_id=auth_resource_id,
        )

    self.document_extractor = make_skill(
        "document-extractor-skill",
        "Custom skill that downloads and parses source documents",
        document_extractor_uri,
        document_extractor_auth_resource_id,
    )
    self.figure_processor = make_skill(
        "figure-processor-skill",
        "Custom skill that enriches individual figures",
        figure_processor_uri,
        figure_processor_auth_resource_id,
    )
    self.text_processor = make_skill(
        "text-processor-skill",
        "Custom skill that merges figures, chunks text, and generates embeddings",
        text_processor_uri,
        text_processor_auth_resource_id,
    )
http_method="POST", timeout=DEFAULT_SKILL_TIMEOUT, batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. auth_resource_id=self.document_extractor.auth_resource_id, auth_identity=SearchIndexerDataUserAssignedIdentity( resource_id=self.search_user_assigned_identity_resource_id ), inputs=[ # Always provide the blob URL so the function can download large files (> 16MB) InputFieldMappingEntry(name="metadata_storage_path", source="/document/metadata_storage_path"), # We are not using the SAS token since the functions have RBAC access via managed identity ], outputs=[ OutputFieldMappingEntry(name="pages", target_name="pages"), OutputFieldMappingEntry(name="figures", target_name="figures"), ] + ( [ # ACL outputs for document-level access control (populated by manual ADLS Gen2 extraction) OutputFieldMappingEntry(name="oids", target_name="oids"), OutputFieldMappingEntry(name="groups", target_name="groups"), ] if self.use_acls else [] ), ) figure_processor_skill = WebApiSkill( name=self.figure_processor.name, description=self.figure_processor.description, context="/document/figures/*", uri=self.figure_processor.uri, http_method="POST", timeout=DEFAULT_SKILL_TIMEOUT, batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. 
auth_resource_id=self.figure_processor.auth_resource_id, auth_identity=SearchIndexerDataUserAssignedIdentity( resource_id=self.search_user_assigned_identity_resource_id ), inputs=[ InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), InputFieldMappingEntry(name="bytes_base64", source="/document/figures/*/bytes_base64"), InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), InputFieldMappingEntry(name="title", source="/document/figures/*/title"), ], outputs=[ # Only output the enriched fields to avoid cyclic dependency OutputFieldMappingEntry(name="description", target_name="description"), OutputFieldMappingEntry(name="url", target_name="url"), OutputFieldMappingEntry(name="embedding", target_name="embedding"), ], ) # Shaper skill to consolidate pages and enriched figures into a single object shaper_skill = ShaperSkill( name=f"{prefix}-document-shaper-skill", description="Consolidates pages and enriched figures into a single document object", context="/document", inputs=[ InputFieldMappingEntry(name="pages", source="/document/pages"), InputFieldMappingEntry( name="figures", source_context="/document/figures/*", inputs=[ InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), InputFieldMappingEntry( name="document_file_name", source="/document/figures/*/document_file_name" ), InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), InputFieldMappingEntry(name="page_num", 
source="/document/figures/*/page_num"), InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), InputFieldMappingEntry(name="title", source="/document/figures/*/title"), InputFieldMappingEntry(name="description", source="/document/figures/*/description"), InputFieldMappingEntry(name="url", source="/document/figures/*/url"), InputFieldMappingEntry(name="embedding", source="/document/figures/*/embedding"), ], ), InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), ] + ( [ # ACL fields from document_extractor's manual ADLS Gen2 ACL extraction InputFieldMappingEntry(name="oids", source="/document/oids"), InputFieldMappingEntry(name="groups", source="/document/groups"), ] if self.use_acls else [] ), outputs=[OutputFieldMappingEntry(name="output", target_name="consolidated_document")], ) text_processor_skill = WebApiSkill( name=self.text_processor.name, description=self.text_processor.description, context="/document", uri=self.text_processor.uri, http_method="POST", timeout=DEFAULT_SKILL_TIMEOUT, batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. 
auth_resource_id=self.text_processor.auth_resource_id, auth_identity=SearchIndexerDataUserAssignedIdentity( resource_id=self.search_user_assigned_identity_resource_id ), inputs=[ InputFieldMappingEntry(name="consolidated_document", source="/document/consolidated_document"), ], outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")], ) return SearchIndexerSkillset( name=self.skillset_name, description="Skillset linking document extraction, figure enrichment, and text processing functions", skills=[document_extractor_skill, figure_processor_skill, shaper_skill, text_processor_skill], index_projection=index_projection, ) async def setup(self) -> None: logger.info("Setting up search index and skillset for cloud ingestion") if not self.embeddings.azure_endpoint or not self.embeddings.azure_deployment_name: raise ValueError("Cloud ingestion requires Azure OpenAI endpoint and deployment") if not isinstance(self.embeddings, OpenAIEmbeddings): raise TypeError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.") # Warn if access control is enforced but ACL extraction is not enabled if self.enforce_access_control and not self.use_acls: logger.warning( "AZURE_ENFORCE_ACCESS_CONTROL is enabled but USE_CLOUD_INGESTION_ACLS is not. " "Documents will not have ACLs extracted automatically from ADLS Gen2. " "If you intend to use document-level access control, either set USE_CLOUD_INGESTION_ACLS=true " "or manually set ACLs using scripts/manageacl.py after ingestion." ) # Verify the storage container exists before attempting to create the data source container_client = self.blob_manager.blob_service_client.get_container_client(self.blob_manager.container) if not await container_client.exists(): raise ValueError( f"Storage container '{self.blob_manager.container}' does not exist in storage account '{self.blob_manager.account}'. " f"Please create the container first, or set AZURE_STORAGE_CONTAINER to an existing container name." 
) self._search_manager = SearchManager( search_info=self.search_info, search_analyzer_name=self.search_analyzer_name, use_acls=self.use_acls, use_parent_index_projection=True, embeddings=self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=self.use_multimodal, enforce_access_control=self.enforce_access_control, use_web_source=self.use_web_source, ) await self._search_manager.create_index() async with self.search_info.create_search_indexer_client() as indexer_client: # Use ADLS_GEN2 when ACLs are enabled (requires hierarchical namespace storage) # Note: We do NOT use indexer_permission_options because that's incompatible with # Custom WebAPI skills. Instead, ACLs are extracted manually in document_extractor. data_source_type = ( SearchIndexerDataSourceType.ADLS_GEN2 if self.use_acls else SearchIndexerDataSourceType.AZURE_BLOB ) data_source_connection = SearchIndexerDataSourceConnection( name=self.data_source_name, type=data_source_type, connection_string=self.blob_manager.get_managedidentity_connectionstring(), container=SearchIndexerDataContainer(name=self.blob_manager.container), data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(), ) await indexer_client.create_or_update_data_source_connection(data_source_connection) skillset = self._build_skillset() await indexer_client.create_or_update_skillset(skillset) indexer = SearchIndexer( name=self.indexer_name, description="Indexer orchestrating cloud ingestion pipeline", data_source_name=self.data_source_name, target_index_name=self.search_info.index_name, skillset_name=self.skillset_name, parameters=IndexingParameters( configuration=IndexingParametersConfiguration( query_timeout=None, # type: ignore data_to_extract="storageMetadata", allow_skillset_to_read_file_data=False, ) ), ) await indexer_client.create_or_update_indexer(indexer) async def run(self) -> None: files = self.list_file_strategy.list() async for file in files: try: await 
async def parse(self, content: IO) -> AsyncGenerator["Page", None]:
    """
    Yield one Page per CSV data row; the first row is treated as a header and skipped.

    Args:
        content: The CSV data, either as a readable stream (binary or text mode) or directly
            as ``bytes``/``bytearray``/``str``. Binary data is decoded as UTF-8.

    Yields:
        ``Page(row_index, offset, text)`` where ``text`` is the row's fields joined with commas
        and ``offset`` is the running length of the previously yielded texts, counting one
        extra character per row.

    Raises:
        TypeError: If ``content`` (or what its ``read()`` returns) is neither bytes nor str.
    """
    # Read streams first, then normalise whatever came out. A text-mode stream returns str,
    # which has no .decode(), and a plain str has no .read().
    raw = content.read() if hasattr(content, "read") else content
    if isinstance(raw, (bytes, bytearray)):
        content_str = raw.decode("utf-8")
    elif isinstance(raw, str):
        content_str = raw
    else:
        raise TypeError(f"Unsupported CSV content type: {type(raw).__name__}")

    # Create a CSV reader from the text content
    reader = csv.reader(content_str.splitlines())
    offset = 0

    # Skip the header row
    next(reader, None)

    for i, row in enumerate(reader):
        page_text = ",".join(row)
        yield Page(i, offset, page_text)
        offset += len(page_text) + 1  # Account for newline character
def split_text_into_batches(self, texts: list[str]) -> list[EmbeddingBatch]:
    """
    Group texts, in order, into batches bounded by the model's token and item limits.

    A batch is closed before adding a text that would bring its token total to the limit or
    beyond (unless the batch is still empty), and again as soon as it holds the maximum
    number of items.

    Raises:
        NotImplementedError: If the configured model has no entry in SUPPORTED_BATCH_MODEL.
    """
    limits = OpenAIEmbeddings.SUPPORTED_BATCH_MODEL.get(self.open_ai_model_name)
    if not limits:
        raise NotImplementedError(
            f"Model {self.open_ai_model_name} is not supported with batch embedding operations"
        )
    max_tokens, max_items = limits["token_limit"], limits["max_batch_size"]

    completed: list[EmbeddingBatch] = []
    pending: list[str] = []
    pending_tokens = 0

    def flush() -> None:
        # Close the batch being built and start a fresh, empty one.
        nonlocal pending, pending_tokens
        completed.append(EmbeddingBatch(pending, pending_tokens))
        pending, pending_tokens = [], 0

    for text in texts:
        tokens = self.calculate_token_length(text)
        if pending and pending_tokens + tokens >= max_tokens:
            flush()
        pending.append(text)
        pending_tokens += tokens
        if len(pending) == max_items:
            flush()

    if pending:
        flush()
    return completed
async def create_embeddings(self, texts: list[str]) -> list[list[float]]:
    """
    Compute one embedding per input text, in input order.

    The ``dimensions`` argument is forwarded only for models flagged in
    SUPPORTED_DIMENSIONS_MODEL. Batching is used unless it is disabled or the model has no
    entry in SUPPORTED_BATCH_MODEL; otherwise texts are embedded one at a time.
    """
    model_name = self.open_ai_model_name

    extra_args: ExtraArgs = {}
    if OpenAIEmbeddings.SUPPORTED_DIMENSIONS_MODEL.get(model_name):
        extra_args = {"dimensions": self.open_ai_dimensions}

    can_batch = not self.disable_batch and model_name in OpenAIEmbeddings.SUPPORTED_BATCH_MODEL
    if can_batch:
        return await self.create_embedding_batch(texts, extra_args)

    vectors = []
    for text in texts:
        vectors.append(await self.create_embedding_single(text, extra_args))
    return vectors
class FigureProcessor:
    """Creates a media describer on first use and uses it to describe figure images."""

    def __init__(
        self,
        *,
        credential: AsyncTokenCredential | AzureKeyCredential | None = None,
        strategy: MediaDescriptionStrategy = MediaDescriptionStrategy.NONE,
        openai_client: Any | None = None,
        openai_model: str | None = None,
        openai_deployment: str | None = None,
        content_understanding_endpoint: str | None = None,
    ) -> None:
        self.strategy = strategy
        self.credential = credential
        self.content_understanding_endpoint = content_understanding_endpoint
        self.openai_client = openai_client
        self.openai_model = openai_model
        self.openai_deployment = openai_deployment
        # Filled in by get_media_describer() the first time a describer is needed.
        self.media_describer: MediaDescriber | None = None
        self.content_understanding_ready = False

    def _new_content_understanding_describer(self) -> MediaDescriber:
        """Validate the Content Understanding settings and build the matching describer."""
        if self.content_understanding_endpoint is None:
            raise ValueError("Content Understanding strategy requires an endpoint")
        if self.credential is None:
            raise ValueError("Content Understanding strategy requires a credential")
        if isinstance(self.credential, AzureKeyCredential):
            raise ValueError(
                "Content Understanding does not support key credentials; provide a token credential instead"
            )
        return ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)

    def _new_multimodal_describer(self) -> MediaDescriber:
        """Validate the OpenAI settings and build the multimodal-model describer."""
        if self.openai_client is None or self.openai_model is None:
            raise ValueError("OpenAI strategy requires both a client and a model name")
        return MultimodalModelDescriber(
            self.openai_client, model=self.openai_model, deployment=self.openai_deployment
        )

    async def get_media_describer(self) -> MediaDescriber | None:
        """Return the describer for the configured strategy, creating and caching it if needed."""
        chosen = self.strategy
        if chosen == MediaDescriptionStrategy.NONE:
            return None

        existing = self.media_describer
        if existing is not None:
            return existing

        if chosen == MediaDescriptionStrategy.CONTENTUNDERSTANDING:
            self.media_describer = self._new_content_understanding_describer()
        elif chosen == MediaDescriptionStrategy.OPENAI:
            self.media_describer = self._new_multimodal_describer()
        else:
            logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy)
            return None
        return self.media_describer

    def mark_content_understanding_ready(self) -> None:
        """Flag the Content Understanding analyzer as already created so describe() skips creating it."""
        self.content_understanding_ready = True

    async def describe(self, image_bytes: bytes) -> str | None:
        """Describe the image with the configured describer, or return None when there is none."""
        describer = await self.get_media_describer()
        if describer is None:
            return None

        if isinstance(describer, ContentUnderstandingDescriber):
            # The analyzer is created once, just before the first description request.
            if not self.content_understanding_ready:
                await describer.create_analyzer()
                self.content_understanding_ready = True

        return await describer.describe_image(image_bytes)
def build_figure_markup(image: "ImageOnPage", description: Optional[str] = None) -> str: """Create consistent HTML markup for a figure description on demand.""" caption_parts = [image.figure_id] if image.title: caption_parts.append(image.title) caption = " ".join(part for part in caption_parts if part) if description: return f"
{caption}
{description}
" return f"
{caption}
" async def process_page_image( *, image: "ImageOnPage", document_filename: str, blob_manager: Optional[BaseBlobManager], image_embeddings_client: Optional[ImageEmbeddings], figure_processor: Optional[FigureProcessor] = None, user_oid: Optional[str] = None, ) -> "ImageOnPage": """Generate description, upload image, and optionally compute embedding for a figure.""" if blob_manager is None: raise ValueError("BlobManager must be provided to process images.") # Generate plain (model) description text only; do not wrap in HTML markup here. description_text: str | None = None if figure_processor is not None: description_text = await figure_processor.describe(image.bytes) # Store plain descriptive text (can be None). HTML rendering is deferred to build_figure_markup. image.description = description_text if image.url is None: image.url = await blob_manager.upload_document_image( document_filename, image.bytes, image.filename, image.page_num, user_oid=user_oid ) if image_embeddings_client is not None: try: image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes) except Exception: # pragma: no cover - embedding failures shouldn't abort figure processing logger.warning("Image embedding generation failed for figure %s", image.figure_id, exc_info=True) return image ================================================ FILE: app/backend/prepdocslib/fileprocessor.py ================================================ from dataclasses import dataclass from .parser import Parser from .textsplitter import TextSplitter @dataclass(frozen=True) class FileProcessor: parser: Parser splitter: TextSplitter ================================================ FILE: app/backend/prepdocslib/filestrategy.py ================================================ import logging from typing import Optional from .blobmanager import AdlsBlobManager, BaseBlobManager, BlobManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings from .figureprocessor import ( FigureProcessor, 
class FileStrategy(Strategy):
    """
    Ingestion strategy for documents that live on the local disk or in a data lake
    storage account: each file is uploaded to blob storage, parsed into sections and
    pushed into the search index (or removed from both, depending on the action).
    """

    def __init__(
        self,
        list_file_strategy: ListFileStrategy,
        blob_manager: BlobManager,
        search_info: SearchInfo,
        file_processors: dict[str, FileProcessor],
        document_action: DocumentAction = DocumentAction.Add,
        embeddings: Optional[OpenAIEmbeddings] = None,
        image_embeddings: Optional[ImageEmbeddings] = None,
        search_analyzer_name: Optional[str] = None,
        search_field_name_embedding: Optional[str] = None,
        use_acls: bool = False,
        category: Optional[str] = None,
        figure_processor: Optional[FigureProcessor] = None,
        enforce_access_control: bool = False,
        use_web_source: bool = False,
        use_sharepoint_source: bool = False,
    ):
        # Sources and sinks
        self.list_file_strategy = list_file_strategy
        self.blob_manager = blob_manager
        self.search_info = search_info

        # How files are parsed, described and embedded
        self.file_processors = file_processors
        self.figure_processor = figure_processor
        self.embeddings = embeddings
        self.image_embeddings = image_embeddings

        # What to do and how the index is configured
        self.document_action = document_action
        self.category = category
        self.search_analyzer_name = search_analyzer_name
        self.search_field_name_embedding = search_field_name_embedding
        self.use_acls = use_acls
        self.enforce_access_control = enforce_access_control
        self.use_web_source = use_web_source
        self.use_sharepoint_source = use_sharepoint_source

    def setup_search_manager(self):
        """(Re)create ``self.search_manager`` from the current configuration."""
        self.search_manager = SearchManager(
            self.search_info,
            self.search_analyzer_name,
            self.use_acls,
            False,  # use_parent_index_projection disabled for file-based ingestion
            self.embeddings,
            field_name_embedding=self.search_field_name_embedding,
            search_images=self.image_embeddings is not None,
            enforce_access_control=self.enforce_access_control,
            use_web_source=self.use_web_source,
            use_sharepoint_source=self.use_sharepoint_source,
        )

    async def setup(self):
        """Create the search index and, for Content Understanding, its analyzer."""
        self.setup_search_manager()
        await self.search_manager.create_index()

        processor = self.figure_processor
        if processor is None or processor.strategy != MediaDescriptionStrategy.CONTENTUNDERSTANDING:
            return

        media_describer = await processor.get_media_describer()
        if isinstance(media_describer, ContentUnderstandingDescriber):
            await media_describer.create_analyzer()
            processor.mark_content_understanding_ready()

    async def run(self):
        """Apply ``self.document_action`` to every listed file or path."""
        self.setup_search_manager()
        action = self.document_action

        if action == DocumentAction.Add:
            async for file in self.list_file_strategy.list():
                try:
                    blob_url = await self.blob_manager.upload_blob(file)
                    sections = await parse_file(
                        file,
                        self.file_processors,
                        self.category,
                        self.blob_manager,
                        self.image_embeddings,
                        figure_processor=self.figure_processor,
                    )
                    if sections:
                        await self.search_manager.update_content(sections, url=blob_url)
                finally:
                    # Always release the file handle, even when ingestion fails.
                    if file:
                        file.close()
            return

        if action == DocumentAction.Remove:
            async for path in self.list_file_strategy.list_paths():
                await self.blob_manager.remove_blob(path)
                await self.search_manager.remove_content(path)
            return

        if action == DocumentAction.RemoveAll:
            await self.blob_manager.remove_blob()
            await self.search_manager.remove_content()
def cleanup_data(data: str) -> str:
    """Normalise whitespace and hyphen runs in extracted text.

    Args:
        data (str): The text to clean up.

    Returns:
        str: The cleaned text, with leading and trailing whitespace removed.
    """
    # Applied in order: (pattern, replacement)
    substitutions = (
        (r"\n{2,}", "\n"),  # runs of newlines collapse to a single newline
        (r"[^\S\n]{2,}", " "),  # runs of non-newline whitespace collapse to one space
        (r"-{2,}", "--"),  # runs of hyphens collapse to exactly two hyphens
    )
    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
class IntegratedVectorizerStrategy(Strategy):  # pragma: no cover
    """
    Strategy for ingesting and vectorizing documents into a search service from files
    stored in a storage account, using a search indexer with a chunking + embedding skillset.
    """

    def __init__(
        self,
        list_file_strategy: ListFileStrategy,
        blob_manager: BlobManager,
        search_info: SearchInfo,
        embeddings: OpenAIEmbeddings,
        search_field_name_embedding: str,
        subscription_id: str,
        document_action: DocumentAction = DocumentAction.Add,
        search_analyzer_name: Optional[str] = None,
        use_acls: bool = False,
        category: Optional[str] = None,
        enforce_access_control: bool = False,
        use_web_source: bool = False,
    ):
        self.list_file_strategy = list_file_strategy
        self.blob_manager = blob_manager
        self.document_action = document_action
        self.embeddings = embeddings
        self.search_field_name_embedding = search_field_name_embedding
        self.subscription_id = subscription_id
        self.search_analyzer_name = search_analyzer_name
        self.use_acls = use_acls
        self.category = category
        self.search_info = search_info
        # Skillset, indexer and data source share one prefix derived from the
        # index name and the embedding field name.
        prefix = f"{self.search_info.index_name}-{self.search_field_name_embedding}"
        self.skillset_name = f"{prefix}-skillset"
        self.indexer_name = f"{prefix}-indexer"
        self.data_source_name = f"{prefix}-blob"
        self.enforce_access_control = enforce_access_control
        self.use_web_source = use_web_source

    async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
        """
        Create a skillset for the indexer to chunk documents and generate embeddings

        Args:
            index_name: Name of the index that receives the projected chunks.

        Returns:
            The skillset definition (split skill + embedding skill + index projection).

        Raises:
            ValueError: If the embeddings client has no Azure OpenAI endpoint or deployment name.
        """
        split_skill = SplitSkill(
            name="split-skill",
            description="Split skill to chunk documents",
            text_split_mode="pages",
            context="/document",
            maximum_page_length=2048,
            page_overlap_length=20,
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
            ],
            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
        )

        if not self.embeddings.azure_endpoint or not self.embeddings.azure_deployment_name:
            raise ValueError("Integrated vectorization requires Azure OpenAI endpoint and deployment")

        embedding_skill = AzureOpenAIEmbeddingSkill(
            name="embedding-skill",
            description="Skill to generate embeddings via Azure OpenAI",
            context="/document/pages/*",
            resource_url=self.embeddings.azure_endpoint,
            deployment_name=self.embeddings.azure_deployment_name,
            model_name=self.embeddings.open_ai_model_name,
            dimensions=self.embeddings.open_ai_dimensions,
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/pages/*"),
            ],
            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
        )

        # Each chunk produced by the split skill is projected into the index as its own document.
        index_projection = SearchIndexerIndexProjection(
            selectors=[
                SearchIndexerIndexProjectionSelector(
                    target_index_name=index_name,
                    parent_key_field_name="parent_id",
                    source_context="/document/pages/*",
                    mappings=[
                        InputFieldMappingEntry(name="content", source="/document/pages/*"),
                        InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
                        InputFieldMappingEntry(
                            name=self.search_field_name_embedding, source="/document/pages/*/vector"
                        ),
                    ],
                ),
            ],
            parameters=SearchIndexerIndexProjectionsParameters(
                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
            ),
        )

        skillset = SearchIndexerSkillset(
            name=self.skillset_name,
            description="Skillset to chunk documents and generate embeddings",
            skills=[split_skill, embedding_skill],
            index_projection=index_projection,
        )

        return skillset

    async def setup(self):
        """Create the index, the blob data source connection and the skillset."""
        logger.info("Setting up search index using integrated vectorization...")
        search_manager = SearchManager(
            search_info=self.search_info,
            search_analyzer_name=self.search_analyzer_name,
            use_acls=self.use_acls,
            use_parent_index_projection=True,
            embeddings=self.embeddings,
            field_name_embedding=self.search_field_name_embedding,
            search_images=False,
            enforce_access_control=self.enforce_access_control,
            use_web_source=self.use_web_source,
        )

        await search_manager.create_index()

        ds_client = self.search_info.create_search_indexer_client()
        try:
            ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
            data_source_connection = SearchIndexerDataSourceConnection(
                name=self.data_source_name,
                type=SearchIndexerDataSourceType.AZURE_BLOB,
                connection_string=self.blob_manager.get_managedidentity_connectionstring(),
                container=ds_container,
                data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
            )

            await ds_client.create_or_update_data_source_connection(data_source_connection)

            embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
            await ds_client.create_or_update_skillset(embedding_skillset)
        finally:
            # Close the client even when one of the service calls above fails.
            await ds_client.close()

    async def run(self):
        """Apply the document action to blob storage, then create and run the indexer."""
        if self.document_action == DocumentAction.Add:
            files = self.list_file_strategy.list()
            async for file in files:
                try:
                    await self.blob_manager.upload_blob(file)
                finally:
                    if file:
                        file.close()
        elif self.document_action == DocumentAction.Remove:
            paths = self.list_file_strategy.list_paths()
            async for path in paths:
                await self.blob_manager.remove_blob(path)
        elif self.document_action == DocumentAction.RemoveAll:
            await self.blob_manager.remove_blob()

        # Create an indexer
        indexer = SearchIndexer(
            name=self.indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=self.skillset_name,
            target_index_name=self.search_info.index_name,
            data_source_name=self.data_source_name,
        )

        indexer_client = self.search_info.create_search_indexer_client()
        try:
            indexer_result = await indexer_client.create_or_update_indexer(indexer)

            # Run the indexer
            await indexer_client.run_indexer(self.indexer_name)
        finally:
            # Close the client even when creating or running the indexer fails.
            await indexer_client.close()

        logger.info(
            "Successfully created index, indexer: %s, and skillset. Please navigate to search service in Azure Portal to view the status of the indexer.",
            indexer_result.name,
        )
class File:
    """
    A document to ingest, held as an open file-like object that came either from the
    local disk or from a data lake storage account. It may also carry access control
    information describing which users or groups are allowed to see it.
    """

    def __init__(self, content: IO, acls: Optional[dict[str, list]] = None, url: Optional[str] = None):
        self.content = content
        self.acls = acls or {}
        self.url = url

    def filename(self) -> str:
        """
        Return the base name of the wrapped content object.

        Two attributes are consulted, in this order:
        - ``.filename`` (used by HTTP uploads such as werkzeug.datastructures.FileStorage)
        - ``.name`` (used by file objects returned from open()); the value "file" is ignored

        Returns:
            str: The basename of the file

        Raises:
            ValueError: If neither attribute yields a usable name.
        """
        uploaded_name = getattr(self.content, "filename", None)
        if uploaded_name:
            return os.path.basename(uploaded_name)

        stream_name = getattr(self.content, "name", None)
        if stream_name and stream_name != "file":
            return os.path.basename(stream_name)

        raise ValueError("The content object does not have a filename or name attribute. ")

    def file_extension(self):
        """Return the extension of the filename, including the leading dot."""
        _, extension = os.path.splitext(self.filename())
        return extension

    def filename_to_id(self):
        """Build an identifier from the filename and, when present, the ACLs.

        The id is ``file-<sanitised name>-<hex of name><hex of str(acls)>``; the ACL
        part is empty when no ACLs are set.
        """
        name = self.filename()
        safe_name = re.sub("[^0-9a-zA-Z_-]", "_", name)
        name_hex = base64.b16encode(name.encode("utf-8")).decode("ascii")
        acls_hex = base64.b16encode(str(self.acls).encode("utf-8")).decode("ascii") if self.acls else ""
        return f"file-{safe_name}-{name_hex}{acls_hex}"

    def close(self):
        """Close the underlying content object, if there is one."""
        if self.content:
            self.content.close()
class ContentUnderstandingDescriber(MediaDescriber):
    """Describes images by calling a Content Understanding analyzer over HTTP.

    The analyzer is created from ``analyzer_schema`` via ``create_analyzer`` and is
    then used by ``describe_image``; both talk to ``{endpoint}/contentunderstanding/...``
    with a bearer token for the ``https://cognitiveservices.azure.com/.default`` scope.
    """

    # API version sent as the ``api-version`` query parameter on every request.
    CU_API_VERSION = "2024-12-01-preview"

    # Analyzer definition sent as the JSON body when the analyzer is created.
    # Its "analyzerId" is also used to build the request URLs below.
    analyzer_schema = {
        "analyzerId": "image_analyzer",
        "name": "Image understanding",
        "description": "Extract detailed structured information from images extracted from documents.",
        "baseAnalyzerId": "prebuilt-image",
        "scenario": "image",
        "config": {"returnDetails": False},
        "fieldSchema": {
            "name": "ImageInformation",
            "descriptions": "Description of image.",
            "fields": {
                "Description": {
                    "type": "string",
                    "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
                },
            },
        },
    }

    def __init__(self, endpoint: str, credential: AsyncTokenCredential):
        self.endpoint = endpoint
        self.credential = credential

    async def poll_api(self, session, poll_url, headers):
        """Poll ``poll_url`` until the operation is no longer reported as running.

        Args:
            session: HTTP session used for the GET requests (an aiohttp session in the callers here).
            poll_url: URL of the long-running operation to poll.
            headers: Headers sent with every poll request.

        Returns:
            The decoded JSON body of the first response whose status is neither
            "Failed" nor "Running".

        Raises:
            Exception: If the response reports status "Failed" (not retried).
        """

        # Only ValueError triggers a retry, so "Running" is retried (every 2 seconds,
        # up to 60 attempts) while "Failed" and HTTP errors propagate immediately.
        @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
        async def poll():
            async with session.get(poll_url, headers=headers) as response:
                response.raise_for_status()
                response_json = await response.json()
                if response_json["status"] == "Failed":
                    raise Exception("Failed")
                if response_json["status"] == "Running":
                    raise ValueError("Running")
                return response_json

        return await poll()

    async def create_analyzer(self) -> None:
        """Create the analyzer described by ``analyzer_schema`` and wait for it to finish.

        A 409 response is treated as "already exists" and returns without polling; any
        status other than 201 raises; on 201 the operation URL from the
        ``Operation-Location`` header is polled via ``poll_api``.

        Raises:
            Exception: If the service answers with a status other than 201 or 409.
        """
        logger.info("Creating analyzer '%s'...", self.analyzer_schema["analyzerId"])

        token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
        token = await token_provider()
        headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
        params = {"api-version": self.CU_API_VERSION}
        analyzer_id = self.analyzer_schema["analyzerId"]
        cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}"
        async with aiohttp.ClientSession() as session:
            async with session.put(
                url=cu_endpoint, params=params, headers=headers, json=self.analyzer_schema
            ) as response:
                if response.status == 409:
                    logger.info("Analyzer '%s' already exists.", analyzer_id)
                    return
                elif response.status != 201:
                    data = await response.text()
                    raise Exception("Error creating analyzer", data)
                else:
                    # NOTE(review): .get() yields None when the header is missing;
                    # describe_image indexes the header directly instead.
                    poll_url = response.headers.get("Operation-Location")

            # The progress task has no total and is not started - presumably only a
            # visual indicator while polling; confirm against rich's Progress docs.
            with Progress() as progress:
                progress.add_task("Creating analyzer...", total=None, start=False)
                await self.poll_api(session, poll_url, headers)

    async def describe_image(self, image_bytes: bytes) -> str:
        """Send ``image_bytes`` to the analyzer and return the generated description.

        Args:
            image_bytes: Raw image content, posted as the request body.

        Returns:
            The ``valueString`` of the ``Description`` field of the first content
            entry in the analysis result.
        """
        async with aiohttp.ClientSession() as session:
            token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
            headers = {"Authorization": "Bearer " + token.token}
            params = {"api-version": self.CU_API_VERSION}
            analyzer_name = self.analyzer_schema["analyzerId"]
            async with session.post(
                url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze",
                params=params,
                headers=headers,
                data=image_bytes,
            ) as response:
                response.raise_for_status()
                # The analysis is asynchronous: the result is fetched from this URL.
                poll_url = response.headers["Operation-Location"]

                with Progress() as progress:
                    progress.add_task("Processing...", total=None, start=False)
                    results = await self.poll_api(session, poll_url, headers)

                fields = results["result"]["contents"][0]["fields"]
                return fields["Description"]["valueString"]
@dataclass
class ImageOnPage:
    """A figure extracted from a document page, plus the data attached to it during ingestion.

    Instances can be converted to and from plain dictionaries ("skill payloads") with
    ``to_skill_payload`` / ``from_skill_payload``.
    """

    bytes: bytes
    bbox: tuple[float, float, float, float]  # Pixels
    filename: str
    figure_id: str
    page_num: int  # 0-indexed
    placeholder: str  # HTML placeholder in page text, e.g. '<figure id="..."></figure>'
    mime_type: str = "image/png"  # Set by parser; default assumes PNG rendering
    url: Optional[str] = None
    title: str = ""
    embedding: Optional[list[float]] = None
    description: Optional[str] = None

    def to_skill_payload(
        self,
        file_name: str,
        *,
        include_bytes_base64: bool = True,
    ) -> dict[str, Any]:
        """Serialise this image into a dictionary.

        Args:
            file_name: Stored in the payload under ``document_file_name``.
            include_bytes_base64: When true, the image bytes are added base64-encoded
                under ``bytes_base64``.

        Returns:
            All dataclass fields except the raw ``bytes``, plus the keys described above.
        """
        data = asdict(self)

        # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling).
        data.pop("bytes", None)

        # Optionally include base64-encoded bytes for skills that need it
        if include_bytes_base64:
            b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b""
            data["bytes_base64"] = base64.b64encode(b).decode("utf-8")

        data["document_file_name"] = file_name
        return data

    @classmethod
    def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]:
        """Rebuild an image from a dictionary produced by ``to_skill_payload``.

        Args:
            data: The payload. ``filename`` and ``figure_id`` are required; everything
                else falls back to a default when missing.

        Returns:
            The image and the ``document_file_name`` from the payload ("" when absent).

        Raises:
            ValueError: If ``filename`` or ``figure_id`` is missing, or ``bytes_base64``
                cannot be decoded.
        """
        # Decode base64 image data (optional - may be omitted if already persisted to blob)
        bytes_base64 = data.get("bytes_base64")
        if bytes_base64:
            try:
                raw_bytes = base64.b64decode(bytes_base64)
            except Exception as exc:  # pragma: no cover - defensive
                raise ValueError("Invalid bytes_base64 image data") from exc
        else:
            raw_bytes = b""  # Empty bytes if not provided (already uploaded to blob)

        # page_num may arrive as str; coerce
        try:
            page_num = int(data.get("page_num") or 0)
        except Exception:
            page_num = 0

        # bbox arrives as a list after JSON decoding, but to_skill_payload itself emits
        # a tuple (asdict keeps tuples), so both are accepted to make the round trip lossless.
        bbox_val = data.get("bbox")
        if isinstance(bbox_val, (list, tuple)) and len(bbox_val) == 4:
            bbox = tuple(bbox_val)
        else:
            bbox = (0, 0, 0, 0)

        filename = data.get("filename")
        figure_id = data.get("figure_id")
        placeholder = data.get("placeholder")
        if filename is None:
            raise ValueError("filename is required")
        if figure_id is None:
            raise ValueError("figure_id is required for ImageOnPage deserialization")

        # Generate placeholder if not provided
        if placeholder is None:
            placeholder = f'<figure id="{figure_id}"></figure>'

        image = cls(
            bytes=raw_bytes,
            bbox=bbox,
            page_num=page_num,
            filename=filename,
            figure_id=figure_id,
            placeholder=placeholder,
            mime_type=data.get("mime_type") or "image/png",
            title=data.get("title") or "",
            description=data.get("description"),
            url=data.get("url"),
        )
        return image, data.get("document_file_name", "")
class LocalPdfParser(Parser):
    """
    Parser that extracts text from PDFs locally with pypdf, one Page per PDF page.
    To learn more, please visit https://pypi.org/project/pypdf/
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """Yield a Page for every page of the PDF in ``content``.

        Each Page's offset is the total length of the text extracted from all
        preceding pages.
        """
        logger.info("Extracting text from '%s' using local PDF parser (pypdf)", content.name)

        running_offset = 0
        for index, pdf_page in enumerate(PdfReader(content).pages):
            text = pdf_page.extract_text()
            yield Page(page_num=index, offset=running_offset, text=text)
            running_offset += len(text)
Intelligence that can parse many document formats into pages To learn more, please visit https://learn.microsoft.com/azure/ai-services/document-intelligence/overview """ def __init__( self, endpoint: str, credential: AsyncTokenCredential | AzureKeyCredential, model_id: str = "prebuilt-layout", process_figures: bool = False, ) -> None: self.model_id = model_id self.endpoint = endpoint self.credential = credential self.process_figures = process_figures async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name) async with DocumentIntelligenceClient( endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: # Always convert to bytes up front to avoid passing a FileStorage/stream object try: content.seek(0) except Exception: pass content_bytes = content.read() poller = None doc_for_pymupdf = None if self.process_figures: try: poller = await document_intelligence_client.begin_analyze_document( model_id="prebuilt-layout", body=AnalyzeDocumentRequest(bytes_source=content_bytes), output=["figures"], features=["ocrHighResolution"], output_content_format="markdown", ) doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) except HttpResponseError as e: if e.error and e.error.code == "InvalidArgument": logger.error( "This document type does not support media description. Proceeding with standard analysis." ) else: logger.error( "Unexpected error analyzing document for media description: %s. 
Proceeding with standard analysis.", e, ) poller = None if poller is None: poller = await document_intelligence_client.begin_analyze_document( model_id=self.model_id, body=AnalyzeDocumentRequest(bytes_source=content_bytes), ) analyze_result: AnalyzeResult = await poller.result() offset = 0 for page in analyze_result.pages: tables_on_page = [ table for table in (analyze_result.tables or []) if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] figures_on_page = [] if self.process_figures: figures_on_page = [ figure for figure in (analyze_result.figures or []) if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number ] page_images: list[ImageOnPage] = [] page_tables: list[str] = [] class ObjectType(Enum): NONE = -1 TABLE = 0 FIGURE = 1 MaskEntry = tuple[ObjectType, Optional[int]] page_offset = page.spans[0].offset page_length = page.spans[0].length mask_chars: list[MaskEntry] = cast(list[MaskEntry], [(ObjectType.NONE, None)] * page_length) # mark all positions of the table spans in the page for table_idx, table in enumerate(tables_on_page): for span in table.spans: # replace all table spans with "table_id" in table_chars array for i in range(span.length): idx = span.offset - page_offset + i if idx >= 0 and idx < page_length: mask_chars[idx] = (ObjectType.TABLE, table_idx) # mark all positions of the figure spans in the page for figure_idx, figure in enumerate(figures_on_page): for span in figure.spans: # replace all figure spans with "figure_id" in figure_chars array for i in range(span.length): idx = span.offset - page_offset + i if idx >= 0 and idx < page_length: mask_chars[idx] = (ObjectType.FIGURE, figure_idx) # build page text by replacing characters in table spans with table html page_text = "" added_objects: set[MaskEntry] = set() for idx, mask_char in enumerate(mask_chars): object_type, object_idx = mask_char if object_type == ObjectType.NONE: page_text += analyze_result.content[page_offset + 
idx] elif object_type == ObjectType.TABLE: if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: table_html = DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) page_tables.append(table_html) page_text += table_html added_objects.add(mask_char) elif object_type == ObjectType.FIGURE: if object_idx is None: raise ValueError("Expected object_idx to be set") if doc_for_pymupdf is None: # pragma: no cover raise ValueError("Expected doc_for_pymupdf to be set for figure processing") if mask_char not in added_objects: image_on_page = await DocumentAnalysisParser.figure_to_image( doc_for_pymupdf, figures_on_page[object_idx] ) page_images.append(image_on_page) page_text += image_on_page.placeholder added_objects.add(mask_char) # We remove these comments since they are not needed and skew the page numbers page_text = page_text.replace("", "") # We remove excess newlines at the beginning and end of the page page_text = page_text.strip() yield Page( page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images, tables=page_tables, ) offset += len(page_text) @staticmethod async def figure_to_image(doc: pymupdf.Document, figure: DocumentFigure) -> ImageOnPage: figure_title = figure.caption.content if figure.caption and figure.caption.content else "" # Generate a random UUID if figure.id is None figure_id = figure.id or f"fig_{uuid.uuid4().hex[:8]}" figure_filename = f"figure{figure_id.replace('.', '_')}.png" logger.info("Cropping figure %s with title '%s'", figure_id, figure_title) placeholder = f'
' if not figure.bounding_regions: return ImageOnPage( bytes=b"", page_num=0, # 0-indexed figure_id=figure_id, bbox=(0, 0, 0, 0), filename=figure_filename, title=figure_title, placeholder=placeholder, mime_type="image/png", ) if len(figure.bounding_regions) > 1: logger.warning("Figure %s has more than one bounding region, using the first one", figure_id) first_region = figure.bounding_regions[0] # To learn more about bounding regions, see https://aka.ms/bounding-region bounding_box = ( first_region.polygon[0], # x0 (left) first_region.polygon[1], # y0 (top first_region.polygon[4], # x1 (right) first_region.polygon[5], # y1 (bottom) ) page_number = first_region["pageNumber"] # 1-indexed cropped_img, bbox_pixels = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) return ImageOnPage( bytes=cropped_img, page_num=page_number - 1, # Convert to 0-indexed figure_id=figure_id, bbox=bbox_pixels, filename=figure_filename, title=figure_title, placeholder=placeholder, mime_type="image/png", ) @staticmethod def table_to_html(table: DocumentTable): table_html = "
" rows = [ sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count) ] for row_cells in rows: table_html += "" for cell in row_cells: tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td" cell_spans = "" if cell.column_span is not None and cell.column_span > 1: cell_spans += f" colSpan={cell.column_span}" if cell.row_span is not None and cell.row_span > 1: cell_spans += f" rowSpan={cell.row_span}" table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" table_html += "" table_html += "
@staticmethod
def crop_image_from_pdf_page(
    doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
) -> tuple[bytes, tuple[float, float, float, float]]:
    """
    Render a rectangular region of one PDF page and return it as PNG bytes.

    :param doc: The already-opened PyMuPDF document to read the page from.
    :param page_number: The page number to crop from (0-indexed).
    :param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
    :return: A tuple of (image_bytes, bbox_pixels), where bbox_pixels is bbox_inches
        multiplied by 72 and rounded to two decimals.
    """
    units_per_inch = 72
    # The page is assumed to be 300 DPI; the zoom factor converts between the two resolutions
    assumed_page_dpi = 300
    zoom = assumed_page_dpi / units_per_inch

    # Unpacking into exactly four names guarantees the scaled box has four coordinates
    left, top, right, bottom = (round(edge * units_per_inch, 2) for edge in bbox_inches)
    bbox_pixels = (left, top, right, bottom)
    clip_rect = pymupdf.Rect(bbox_pixels)

    pdf_page = doc.load_page(page_number)
    pixmap = pdf_page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), clip=clip_rect)

    # Round-trip through PIL to encode the raw RGB samples as PNG
    png_buffer = io.BytesIO()
    cropped = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
    cropped.save(png_buffer, format="PNG")
    return png_buffer.getvalue(), bbox_pixels
class Section:
    """
    A section of a page that is stored in a search service. These sections are used as context by Azure OpenAI service
    """

    def __init__(self, chunk: Chunk, content: File, category: Optional[str] = None):
        self.chunk = chunk  # content comes from here
        self.content = content  # sourcepage and sourcefile come from here
        self.category = category
        # this also needs images which will become the images field


class SearchManager:
    """
    Class to manage a search service. It can create indexes, and update or remove sections stored in these indexes
    To learn more, please visit https://learn.microsoft.com/azure/search/search-what-is-azure-search
    """

    def __init__(
        self,
        search_info: SearchInfo,
        search_analyzer_name: Optional[str] = None,
        use_acls: bool = False,
        use_parent_index_projection: bool = False,
        embeddings: Optional[OpenAIEmbeddings] = None,
        field_name_embedding: Optional[str] = None,
        search_images: bool = False,
        enforce_access_control: bool = False,
        use_web_source: bool = False,
        use_sharepoint_source: bool = False,
    ):
        """
        Store the configuration used by the index / knowledge base / content operations.

        ``embedding_dimensions`` is derived from ``embeddings.open_ai_dimensions`` when an
        embeddings service is supplied, otherwise it is None.
        """
        self.search_info = search_info
        self.search_analyzer_name = search_analyzer_name
        self.use_acls = use_acls
        self.use_parent_index_projection = use_parent_index_projection
        self.embeddings = embeddings
        self.embedding_dimensions = self.embeddings.open_ai_dimensions if self.embeddings else None
        self.field_name_embedding = field_name_embedding
        self.search_images = search_images
        self.enforce_access_control = enforce_access_control
        self.use_web_source = use_web_source
        self.use_sharepoint_source = use_sharepoint_source

    async def create_index(self):
        """
        Create the search index if it does not exist, otherwise upgrade the existing one in place.

        For an existing index, missing pieces are added incrementally (storageUrl field, text
        embedding field, images field, default semantic configuration, vectorizer, ACL fields).
        Finally, the knowledge base(s) are created when agentic retrieval is configured.

        :raises ValueError: if embeddings are configured without dimensions or a field name, if
            image search is requested without an Azure AI Vision endpoint, or if an existing
            index has no vector search configuration to extend.
        """
        logger.info("Checking whether search index %s exists...", self.search_info.index_name)

        async with self.search_info.create_search_index_client() as search_index_client:

            embedding_field = None
            images_field = None
            text_vector_search_profile = None
            text_vector_algorithm = None
            text_vector_compression = None
            image_vector_search_profile = None
            image_vector_algorithm = None
            permission_filter_option = None

            if self.embeddings:
                if self.embedding_dimensions is None:
                    raise ValueError(
                        "Embedding dimensions must be set in order to add an embedding field to the search index"
                    )
                if self.field_name_embedding is None:
                    raise ValueError(
                        "Embedding field must be set in order to add an embedding field to the search index"
                    )

                # A vectorizer can only be defined when an Azure OpenAI deployment is known
                text_vectorizer = None
                if self.embeddings.azure_endpoint and self.embeddings.azure_deployment_name:
                    text_vectorizer = AzureOpenAIVectorizer(
                        vectorizer_name=f"{self.embeddings.open_ai_model_name}-vectorizer",
                        parameters=AzureOpenAIVectorizerParameters(
                            resource_url=self.embeddings.azure_endpoint,
                            deployment_name=self.embeddings.azure_deployment_name,
                            model_name=self.embeddings.open_ai_model_name,
                        ),
                    )

                text_vector_algorithm = HnswAlgorithmConfiguration(
                    name="hnsw_config",
                    parameters=HnswParameters(metric="cosine"),
                )
                text_vector_compression = BinaryQuantizationCompression(
                    compression_name=f"{self.field_name_embedding}-compression",
                    truncation_dimension=1024,  # should this be a parameter? maybe not yet?
                    rescoring_options=RescoringOptions(
                        enable_rescoring=True,
                        default_oversampling=10,
                        rescore_storage_method=VectorSearchCompressionRescoreStorageMethod.PRESERVE_ORIGINALS,
                    ),
                )
                text_vector_search_profile = VectorSearchProfile(
                    name=f"{self.field_name_embedding}-profile",
                    algorithm_configuration_name=text_vector_algorithm.name,
                    compression_name=text_vector_compression.compression_name,
                    **({"vectorizer_name": text_vectorizer.vectorizer_name if text_vectorizer else None}),
                )

                embedding_field = SearchField(
                    name=self.field_name_embedding,
                    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    hidden=True,
                    searchable=True,
                    filterable=False,
                    sortable=False,
                    facetable=False,
                    vector_search_dimensions=self.embedding_dimensions,
                    vector_search_profile_name=f"{self.field_name_embedding}-profile",
                    stored=False,
                )

            if self.search_images:
                if not self.search_info.azure_vision_endpoint:
                    raise ValueError("Azure AI Vision endpoint must be provided to use image embeddings")
                image_vector_algorithm = HnswAlgorithmConfiguration(
                    name="images_hnsw_config",
                    parameters=HnswParameters(metric="cosine"),
                )

                # Create the AI Vision vectorizer for image embeddings
                image_vectorizer = AIServicesVisionVectorizer(
                    vectorizer_name="images-vision-vectorizer",
                    ai_services_vision_parameters=AIServicesVisionParameters(
                        resource_uri=self.search_info.azure_vision_endpoint,
                        model_version="2023-04-15",
                    ),
                )
                image_vector_search_profile = VectorSearchProfile(
                    name="images_embedding_profile",
                    algorithm_configuration_name=image_vector_algorithm.name,
                    vectorizer_name=image_vectorizer.vectorizer_name,
                )
                images_field = SearchField(
                    name="images",
                    type=SearchFieldDataType.Collection(SearchFieldDataType.ComplexType),
                    fields=[
                        SearchField(
                            name="embedding",
                            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                            searchable=True,
                            stored=False,
                            vector_search_dimensions=1024,
                            vector_search_profile_name=image_vector_search_profile.name,
                        ),
                        SearchField(
                            name="url",
                            type=SearchFieldDataType.String,
                            searchable=False,
                            filterable=True,
                            sortable=False,
                            facetable=True,
                        ),
                        SearchField(
                            name="description",
                            type=SearchFieldDataType.String,
                            searchable=True,
                            filterable=False,
                            sortable=False,
                            facetable=False,
                        ),
                        SearchField(
                            name="boundingbox",
                            type=SearchFieldDataType.Collection(SearchFieldDataType.Double),
                            searchable=False,
                            filterable=False,
                            sortable=False,
                            facetable=False,
                        ),
                    ],
                )

            if self.use_acls:
                oids_field = SearchField(
                    name="oids",
                    type=SearchFieldDataType.Collection(SearchFieldDataType.String),
                    filterable=True,
                    permission_filter=PermissionFilter.USER_IDS,
                )
                groups_field = SearchField(
                    name="groups",
                    type=SearchFieldDataType.Collection(SearchFieldDataType.String),
                    filterable=True,
                    permission_filter=PermissionFilter.GROUP_IDS,
                )

            if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
                logger.info("Creating new search index %s", self.search_info.index_name)
                fields = [
                    (
                        SimpleField(name="id", type="Edm.String", key=True)
                        if not self.use_parent_index_projection
                        else SearchField(
                            name="id",
                            type="Edm.String",
                            key=True,
                            sortable=True,
                            filterable=True,
                            facetable=True,
                            analyzer_name="keyword",
                        )
                    ),
                    SearchableField(
                        name="content",
                        type="Edm.String",
                        analyzer_name=self.search_analyzer_name,
                    ),
                    SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
                    SimpleField(
                        name="sourcepage",
                        type="Edm.String",
                        filterable=True,
                        facetable=True,
                    ),
                    SimpleField(
                        name="sourcefile",
                        type="Edm.String",
                        filterable=True,
                        facetable=True,
                    ),
                    SimpleField(
                        name="storageUrl",
                        type="Edm.String",
                        filterable=True,
                        facetable=False,
                    ),
                ]
                if self.use_acls:
                    fields.append(oids_field)
                    fields.append(groups_field)
                    permission_filter_option = (
                        SearchIndexPermissionFilterOption.ENABLED
                        if self.enforce_access_control
                        else SearchIndexPermissionFilterOption.DISABLED
                    )
                if self.use_parent_index_projection:
                    logger.info("Including parent_id field for parent/child index projection support in new index")
                    fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))

                vectorizers: list[VectorSearchVectorizer] = []
                vector_search_profiles = []
                vector_algorithms: list[VectorSearchAlgorithmConfiguration] = []
                vector_compressions: list[VectorSearchCompression] = []
                if embedding_field:
                    logger.info("Including %s field for text vectors in new index", embedding_field.name)
                    fields.append(embedding_field)
                    if text_vectorizer is not None:
                        vectorizers.append(text_vectorizer)
                    if (
                        text_vector_search_profile is None
                        or text_vector_algorithm is None
                        or text_vector_compression is None
                    ):
                        raise ValueError("Text vector search profile, algorithm and compression must be set")
                    vector_search_profiles.append(text_vector_search_profile)
                    vector_algorithms.append(text_vector_algorithm)
                    vector_compressions.append(text_vector_compression)

                if images_field:
                    logger.info("Including %s field for image descriptions and vectors in new index", images_field.name)
                    fields.append(images_field)
                    if image_vector_search_profile is None or image_vector_algorithm is None:
                        raise ValueError("Image search profile and algorithm must be set")
                    vector_search_profiles.append(image_vector_search_profile)
                    vector_algorithms.append(image_vector_algorithm)
                    # Add image vectorizer to vectorizers list
                    vectorizers.append(image_vectorizer)

                index = SearchIndex(
                    name=self.search_info.index_name,
                    fields=fields,
                    semantic_search=SemanticSearch(
                        default_configuration_name="default",
                        configurations=[
                            SemanticConfiguration(
                                name="default",
                                prioritized_fields=SemanticPrioritizedFields(
                                    title_field=SemanticField(field_name="sourcepage"),
                                    content_fields=[SemanticField(field_name="content")],
                                ),
                            )
                        ],
                    ),
                    vector_search=VectorSearch(
                        profiles=vector_search_profiles,
                        algorithms=vector_algorithms,
                        compressions=vector_compressions,
                        vectorizers=vectorizers,
                    ),
                    permission_filter_option=permission_filter_option,
                )

                await search_index_client.create_index(index)
            else:
                logger.info("Search index %s already exists", self.search_info.index_name)
                existing_index = await search_index_client.get_index(self.search_info.index_name)
                if not any(field.name == "storageUrl" for field in existing_index.fields):
                    logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
                    existing_index.fields.append(
                        SimpleField(
                            name="storageUrl",
                            type="Edm.String",
                            filterable=True,
                            facetable=False,
                        ),
                    )
                    await search_index_client.create_or_update_index(existing_index)

                if embedding_field and not any(
                    field.name == self.field_name_embedding for field in existing_index.fields
                ):
                    logger.info("Adding %s field for text embeddings", self.field_name_embedding)
                    embedding_field.stored = True
                    existing_index.fields.append(embedding_field)
                    if existing_index.vector_search is None:
                        raise ValueError("Vector search is not enabled for the existing index")
                    if text_vectorizer is not None:
                        if existing_index.vector_search.vectorizers is None:
                            existing_index.vector_search.vectorizers = []
                        existing_index.vector_search.vectorizers.append(text_vectorizer)
                    if (
                        text_vector_search_profile is None
                        or text_vector_algorithm is None
                        or text_vector_compression is None
                    ):
                        raise ValueError("Text vector search profile, algorithm and compression must be set")
                    if existing_index.vector_search.profiles is None:
                        existing_index.vector_search.profiles = []
                    existing_index.vector_search.profiles.append(text_vector_search_profile)
                    if existing_index.vector_search.algorithms is None:
                        existing_index.vector_search.algorithms = []
                    # The new profile references text_vector_algorithm by name, so that algorithm
                    # must be present in the index. Only add it when no algorithm with the same
                    # name is already registered, to avoid defining the name twice.
                    if not any(
                        algorithm.name == text_vector_algorithm.name
                        for algorithm in existing_index.vector_search.algorithms
                    ):
                        existing_index.vector_search.algorithms.append(text_vector_algorithm)
                    if existing_index.vector_search.compressions is None:
                        existing_index.vector_search.compressions = []
                    existing_index.vector_search.compressions.append(text_vector_compression)
                    await search_index_client.create_or_update_index(existing_index)

                if (
                    images_field
                    and images_field.fields
                    and not any(field.name == "images" for field in existing_index.fields)
                ):
                    logger.info("Adding %s field for image embeddings", images_field.name)
                    images_field.fields[0].stored = True
                    existing_index.fields.append(images_field)
                    if image_vector_search_profile is None or image_vector_algorithm is None:
                        raise ValueError("Image vector search profile and algorithm must be set")
                    if existing_index.vector_search is None:
                        raise ValueError("Image vector search is not enabled for the existing index")
                    if existing_index.vector_search.profiles is None:
                        existing_index.vector_search.profiles = []
                    existing_index.vector_search.profiles.append(image_vector_search_profile)
                    if existing_index.vector_search.algorithms is None:
                        existing_index.vector_search.algorithms = []
                    existing_index.vector_search.algorithms.append(image_vector_algorithm)
                    if existing_index.vector_search.vectorizers is None:
                        existing_index.vector_search.vectorizers = []
                    existing_index.vector_search.vectorizers.append(image_vectorizer)
                    await search_index_client.create_or_update_index(existing_index)

                if existing_index.semantic_search:
                    if not existing_index.semantic_search.default_configuration_name:
                        logger.info("Adding default semantic configuration to index %s", self.search_info.index_name)
                        existing_index.semantic_search.default_configuration_name = "default"

                    if existing_index.semantic_search.configurations:
                        existing_semantic_config = existing_index.semantic_search.configurations[0]
                        if (
                            existing_semantic_config.prioritized_fields
                            and existing_semantic_config.prioritized_fields.title_field
                            and not existing_semantic_config.prioritized_fields.title_field.field_name == "sourcepage"
                        ):
                            logger.info("Updating semantic configuration for index %s", self.search_info.index_name)
                            existing_semantic_config.prioritized_fields.title_field = SemanticField(
                                field_name="sourcepage"
                            )

                if existing_index.vector_search is not None and (
                    existing_index.vector_search.vectorizers is None
                    or len(existing_index.vector_search.vectorizers) == 0
                ):
                    if (
                        self.embeddings is not None
                        and self.embeddings.azure_endpoint
                        and self.embeddings.azure_deployment_name
                    ):
                        logger.info("Adding vectorizer to search index %s", self.search_info.index_name)
                        existing_index.vector_search.vectorizers = [
                            AzureOpenAIVectorizer(
                                vectorizer_name=f"{self.search_info.index_name}-vectorizer",
                                parameters=AzureOpenAIVectorizerParameters(
                                    resource_url=self.embeddings.azure_endpoint,
                                    deployment_name=self.embeddings.azure_deployment_name,
                                    model_name=self.embeddings.open_ai_model_name,
                                ),
                            )
                        ]
                        await search_index_client.create_or_update_index(existing_index)
                    else:
                        # Log the index name (not the whole SearchInfo object) like every other message
                        logger.info(
                            "Can't add vectorizer to search index %s since no Azure OpenAI embeddings service is defined",
                            self.search_info.index_name,
                        )

                if self.use_acls:
                    if self.enforce_access_control:
                        logger.info("Enabling permission filtering on index %s", self.search_info.index_name)
                        existing_index.permission_filter_option = SearchIndexPermissionFilterOption.ENABLED
                    else:
                        logger.info("Disabling permission filtering on index %s", self.search_info.index_name)
                        existing_index.permission_filter_option = SearchIndexPermissionFilterOption.DISABLED

                    existing_oids_field = next((field for field in existing_index.fields if field.name == "oids"), None)
                    if existing_oids_field:
                        existing_oids_field.permission_filter = PermissionFilter.USER_IDS
                    else:
                        existing_index.fields.append(oids_field)
                    existing_groups_field = next(
                        (field for field in existing_index.fields if field.name == "groups"), None
                    )
                    if existing_groups_field:
                        existing_groups_field.permission_filter = PermissionFilter.GROUP_IDS
                    else:
                        existing_index.fields.append(groups_field)

                    await search_index_client.create_or_update_index(existing_index)

        if self.search_info.use_agentic_knowledgebase and self.search_info.knowledgebase_name:
            await self.create_knowledgebase()

    async def create_knowledgebase(self):
        """Creates one or more Knowledge Bases in the search index based on desired knowledge sources."""
        if self.search_info.knowledgebase_name:
            field_names = ["id", "sourcepage", "sourcefile", "content", "category"]
            if self.use_acls:
                field_names.extend(["oids", "groups"])
            if self.search_images:
                field_names.append("images/url")

            # Create field references using the new SDK pattern
            source_data_fields = [SearchIndexFieldReference(name=field) for field in field_names]

            async with self.search_info.create_search_index_client() as search_index_client:
                search_index_knowledge_source = SearchIndexKnowledgeSource(
                    name=self.search_info.index_name,  # Use the same name for convenience
                    description="Default knowledge source using the main search index",
                    search_index_parameters=SearchIndexKnowledgeSourceParameters(
                        search_index_name=self.search_info.index_name,
                        source_data_fields=source_data_fields,
                    ),
                )
                await search_index_client.create_or_update_knowledge_source(
                    knowledge_source=search_index_knowledge_source
                )

                knowledge_source_refs: dict[str, KnowledgeSourceReference] = {
                    "index": KnowledgeSourceReference(name=search_index_knowledge_source.name)
                }

                if self.use_web_source:
                    logger.info("Adding web knowledge source to the knowledge base")
                    web_knowledge_source = WebKnowledgeSource(
                        name="web"
                        # We do not specify a description here, since the default description is quite detailed already
                    )
                    await search_index_client.create_or_update_knowledge_source(knowledge_source=web_knowledge_source)
                    knowledge_source_refs["web"] = KnowledgeSourceReference(name=web_knowledge_source.name)

                if self.use_sharepoint_source:
                    logger.info("Adding SharePoint knowledge source to the knowledge base")
                    sharepoint_knowledge_source = RemoteSharePointKnowledgeSource(
                        name="sharepoint",
                        description="SharePoint knowledge source",
                        remote_share_point_parameters=RemoteSharePointKnowledgeSourceParameters(),
                    )
                    await search_index_client.create_or_update_knowledge_source(
                        knowledge_source=sharepoint_knowledge_source
                    )
                    knowledge_source_refs["sharepoint"] = KnowledgeSourceReference(
                        name=sharepoint_knowledge_source.name
                    )

                # Build the set of knowledge bases that should exist based on optional sources
                base_knowledgebase_name = self.search_info.knowledgebase_name
                knowledge_bases_to_upsert: list[tuple[str, list[KnowledgeSourceReference]]] = [
                    (base_knowledgebase_name, [knowledge_source_refs["index"]])
                ]

                if "web" in knowledge_source_refs:
                    knowledge_bases_to_upsert.append(
                        (
                            f"{base_knowledgebase_name}-with-web",
                            [knowledge_source_refs["index"], knowledge_source_refs["web"]],
                        )
                    )

                if "sharepoint" in knowledge_source_refs:
                    knowledge_bases_to_upsert.append(
                        (
                            f"{base_knowledgebase_name}-with-sp",
                            [knowledge_source_refs["index"], knowledge_source_refs["sharepoint"]],
                        )
                    )

                if "web" in knowledge_source_refs and "sharepoint" in knowledge_source_refs:
                    knowledge_bases_to_upsert.append(
                        (
                            f"{base_knowledgebase_name}-with-web-and-sp",
                            [
                                knowledge_source_refs["index"],
                                knowledge_source_refs["web"],
                                knowledge_source_refs["sharepoint"],
                            ],
                        )
                    )

                created_kb_names: list[str] = []
                for kb_name, sources in knowledge_bases_to_upsert:
                    logger.info("Creating (or updating) knowledge base '%s'...", kb_name)
                    await search_index_client.create_or_update_knowledge_base(
                        knowledge_base=KnowledgeBase(
                            name=kb_name,
                            knowledge_sources=sources,
                            models=[
                                KnowledgeBaseAzureOpenAIModel(
                                    azure_open_ai_parameters=AzureOpenAIVectorizerParameters(
                                        resource_url=self.search_info.azure_openai_endpoint,
                                        deployment_name=self.search_info.azure_openai_knowledgebase_deployment,
                                        model_name=self.search_info.azure_openai_knowledgebase_model,
                                    )
                                )
                            ],
                            output_mode=KnowledgeRetrievalOutputMode.ANSWER_SYNTHESIS,
                        )
                    )
                    created_kb_names.append(kb_name)

                if created_kb_names:
                    logger.info(
                        "Knowledge bases created successfully: %s",
                        ", ".join(created_kb_names),
                    )

    async def update_content(self, sections: list[Section], url: Optional[str] = None):
        """
        Upload the given sections to the search index in batches of at most 1000 documents.

        :param sections: Sections to index; each contributes one search document.
        :param url: When truthy, stored in the ``storageUrl`` field of every uploaded document.
        :raises ValueError: if embeddings are configured but no embedding field name is set.
        """
        MAX_BATCH_SIZE = 1000
        section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)]

        async with self.search_info.create_search_client() as search_client:
            for batch_index, batch in enumerate(section_batches):
                documents = []
                for section_index, section in enumerate(batch):
                    image_fields = {}
                    if self.search_images:
                        image_fields = {
                            "images": [
                                {
                                    "url": image.url,
                                    "description": image.description,
                                    "boundingbox": image.bbox,
                                    "embedding": image.embedding,
                                }
                                for image in section.chunk.images
                            ]
                        }
                    document = {
                        # The id suffix is the section's position across all batches
                        "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
                        "content": section.chunk.text,
                        "category": section.category,
                        "sourcepage": BlobManager.sourcepage_from_file_page(
                            filename=section.content.filename(), page=section.chunk.page_num
                        ),
                        "sourcefile": section.content.filename(),
                        **image_fields,
                        **section.content.acls,
                    }
                    documents.append(document)
                if url:
                    for document in documents:
                        document["storageUrl"] = url
                if self.embeddings:
                    if self.field_name_embedding is None:
                        raise ValueError("Embedding field name must be set")
                    embeddings = await self.embeddings.create_embeddings(
                        texts=[section.chunk.text for section in batch]
                    )
                    for i, document in enumerate(documents):
                        document[self.field_name_embedding] = embeddings[i]
                logger.info(
                    "Uploading batch %d with %d sections to search index '%s'",
                    batch_index + 1,
                    len(documents),
                    self.search_info.index_name,
                )
                await search_client.upload_documents(documents)

    async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None):
        """
        Delete indexed sections, optionally restricted to one source file and/or one owner.

        :param path: When given, only documents whose ``sourcefile`` equals the basename of
            this path are considered; when None, every document in the index is considered.
        :param only_oid: When given, only documents whose ``oids`` list is exactly
            ``[only_oid]`` are deleted; other matching documents are left in place.
        """
        logger.info("Removing sections from '%s' from search index '%s'", path or "", self.search_info.index_name)
        async with self.search_info.create_search_client() as search_client:
            odata_filter = None
            if path is not None:
                # Replace ' with '' to escape the single quote for the filter
                # https://learn.microsoft.com/azure/search/query-odata-filter-orderby-syntax#escaping-special-characters-in-string-constants
                path_for_filter = os.path.basename(path).replace("'", "''")
                odata_filter = f"sourcefile eq '{path_for_filter}'"
            max_results = 1000
            # Number of leading results already inspected and deliberately kept. Without this
            # offset, a full page containing nothing to delete would be fetched again forever.
            # NOTE(review): the service caps the skip value, so a very large run of kept
            # documents would surface as a service error rather than a silent hang.
            skip = 0
            while True:
                result = await search_client.search(
                    search_text="", filter=odata_filter, top=max_results, skip=skip, include_total_count=True
                )
                result_count = await result.get_count()
                if result_count == 0:
                    break
                page_size = 0
                documents_to_remove = []
                async for document in result:
                    page_size += 1
                    # If only_oid is set, only remove documents that have only this oid
                    if not only_oid or document.get("oids") == [only_oid]:
                        documents_to_remove.append({"id": document["id"]})
                if len(documents_to_remove) == 0:
                    # Nothing to delete on this page: move past it, or stop at the end of the results
                    skip += page_size
                    if page_size == 0 or skip >= result_count:
                        break
                    continue
                removed_docs = await search_client.delete_documents(documents_to_remove)
                logger.info("Removed %d sections from index", len(removed_docs))
                # It can take a few seconds for search results to reflect changes, so wait a bit
                await asyncio.sleep(2)
TextParser from .textsplitter import SentenceTextSplitter, SimpleTextSplitter logger = logging.getLogger("scripts") def clean_key_if_exists(key: Optional[str]) -> Optional[str]: """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None.""" if key is not None and key.strip() != "": return key.strip() return None class OpenAIHost(str, Enum): """Supported OpenAI hosting styles. OPENAI: Public OpenAI API. AZURE: Standard Azure OpenAI (service name becomes endpoint). AZURE_CUSTOM: A fully custom endpoint URL (for Network Isolation / APIM). LOCAL: A locally hosted OpenAI-compatible endpoint (no key required). """ OPENAI = "openai" AZURE = "azure" AZURE_CUSTOM = "azure_custom" LOCAL = "local" def setup_search_info( search_service: str, index_name: str, azure_credential: AsyncTokenCredential, use_agentic_knowledgebase: Optional[bool] = None, azure_openai_endpoint: Optional[str] = None, knowledgebase_name: Optional[str] = None, azure_openai_knowledgebase_deployment: Optional[str] = None, azure_openai_knowledgebase_model: Optional[str] = None, search_key: Optional[str] = None, azure_vision_endpoint: Optional[str] = None, ) -> SearchInfo: """Setup search service information.""" search_creds: AsyncTokenCredential | AzureKeyCredential = ( azure_credential if search_key is None else AzureKeyCredential(search_key) ) if use_agentic_knowledgebase and azure_openai_knowledgebase_deployment is None: raise ValueError("Azure OpenAI deployment for Knowledge Base must be specified for agentic retrieval.") return SearchInfo( endpoint=f"https://{search_service}.search.windows.net/", credential=search_creds, index_name=index_name, knowledgebase_name=knowledgebase_name, use_agentic_knowledgebase=use_agentic_knowledgebase, azure_openai_endpoint=azure_openai_endpoint, azure_openai_knowledgebase_model=azure_openai_knowledgebase_model, azure_openai_knowledgebase_deployment=azure_openai_knowledgebase_deployment, azure_vision_endpoint=azure_vision_endpoint, 
def setup_openai_client(
    openai_host: OpenAIHost,
    azure_credential: AsyncTokenCredential,
    azure_openai_api_key: Optional[str] = None,
    azure_openai_service: Optional[str] = None,
    azure_openai_custom_url: Optional[str] = None,
    openai_api_key: Optional[str] = None,
    openai_organization: Optional[str] = None,
) -> tuple[AsyncOpenAI, Optional[str]]:
    """Build an ``AsyncOpenAI`` client for the selected hosting style.

    Args:
        openai_host: Which hosting style to configure the client for.
        azure_credential: Credential used to obtain bearer tokens when no Azure API key is given.
        azure_openai_api_key: Optional API key used instead of the credential for Azure hosts.
        azure_openai_service: Azure OpenAI service name; required for ``OpenAIHost.AZURE``.
        azure_openai_custom_url: Full base URL; required for ``OpenAIHost.AZURE_CUSTOM``.
        openai_api_key: API key; required for any host that is neither Azure nor local.
        openai_organization: Optional organization passed through to the non-Azure client.

    Returns:
        A ``(client, azure_openai_endpoint)`` tuple. The endpoint is only populated for
        ``OpenAIHost.AZURE`` (derived from the service name); it is ``None`` otherwise.

    Raises:
        ValueError: If the setting required by the chosen host is missing.
        KeyError: If the host is ``OpenAIHost.LOCAL`` and ``OPENAI_BASE_URL`` is not in the environment.
    """
    uses_custom_url = openai_host == OpenAIHost.AZURE_CUSTOM

    if uses_custom_url or openai_host == OpenAIHost.AZURE:
        endpoint: Optional[str] = None
        if uses_custom_url:
            logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client")
            if not azure_openai_custom_url:
                raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom")
            client_base_url = azure_openai_custom_url
        else:
            logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client")
            if not azure_openai_service:
                raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure")
            endpoint = f"https://{azure_openai_service}.openai.azure.com"
            client_base_url = f"{endpoint}/openai/v1"

        # An explicit key wins; otherwise authenticate with a token provider built from the credential.
        if azure_openai_api_key:
            logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client")
            key_or_provider = azure_openai_api_key
        else:
            logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client")
            key_or_provider = get_bearer_token_provider(
                azure_credential, "https://cognitiveservices.azure.com/.default"
            )
        return AsyncOpenAI(base_url=client_base_url, api_key=key_or_provider), endpoint

    if openai_host == OpenAIHost.LOCAL:
        logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key")
        local_client = AsyncOpenAI(
            base_url=os.environ["OPENAI_BASE_URL"],
            api_key="no-key-required",
        )
        return local_client, None

    logger.info(
        "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables"
    )
    if openai_api_key is None:
        raise ValueError("OpenAI key is required when using the non-Azure OpenAI API")
    return AsyncOpenAI(api_key=openai_api_key, organization=openai_organization), None
def setup_figure_processor(
    *,
    credential: AsyncTokenCredential | None,
    use_multimodal: bool,
    use_content_understanding: bool,
    content_understanding_endpoint: str | None,
    openai_client: object | None,
    openai_model: str | None,
    openai_deployment: str | None,
) -> FigureProcessor | None:
    """Pick the figure-description strategy from the feature flags and build its processor.

    Selection order:
        1. ``use_multimodal`` -> ``MediaDescriptionStrategy.OPENAI`` (uses the OpenAI client/model/deployment).
        2. ``use_content_understanding`` with a non-empty endpoint -> ``MediaDescriptionStrategy.CONTENTUNDERSTANDING``.
        3. Otherwise no processor is created.

    Returns:
        A configured ``FigureProcessor``, or ``None`` when neither feature applies.
    """
    if use_multimodal:
        strategy_options = {
            "strategy": MediaDescriptionStrategy.OPENAI,
            "openai_client": openai_client,
            "openai_model": openai_model,
            "openai_deployment": openai_deployment,
        }
    elif use_content_understanding and content_understanding_endpoint:
        strategy_options = {
            "strategy": MediaDescriptionStrategy.CONTENTUNDERSTANDING,
            "content_understanding_endpoint": content_understanding_endpoint,
        }
    else:
        # No figure description requested (or the endpoint is missing).
        return None

    return FigureProcessor(credential=credential, **strategy_options)
def select_processor_for_filename(file_name: str, file_processors: dict[str, FileProcessor]) -> FileProcessor:
    """Look up the processor registered for a file's extension.

    Args:
        file_name: Name of the file to process; only its final extension is used.
        file_processors: Mapping from lower-case extension (including the dot) to processor.

    Returns:
        The processor registered for the file's extension.

    Raises:
        ValueError: If no processor is registered for the extension.
    """
    _, extension = os.path.splitext(file_name)
    # Extensions are matched case-insensitively by lower-casing before the lookup.
    processor = file_processors.get(extension.lower())
    if processor:
        return processor
    raise ValueError(f"Unsupported file type: {file_name}")
def cleanup_data(data: str) -> str:
    """Collapse redundant whitespace in a block of text.

    Args:
        data (str): The text to clean up.

    Returns:
        str: The text with newline runs and non-newline whitespace runs collapsed,
        and leading/trailing whitespace removed.
    """
    # Applied in order: (pattern, replacement)
    substitutions = (
        (r"\n{2,}", "\n"),  # two or more newlines -> one newline
        (r"[^\S\n]{2,}", " "),  # two or more non-newline whitespace characters -> one space
    )
    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
""" # Step 1: Combine text with figures on each page for page in pages: combine_text_with_figures(page) # Step 2: Split combined text into chunks logger.info("Splitting '%s' into sections", file.filename()) sections = [Section(chunk, content=file, category=category) for chunk in splitter.split_pages(pages)] # Step 3: Add images back to each section based on page number for section in sections: section.chunk.images = [ image for page in pages if page.page_num == section.chunk.page_num for image in page.images ] return sections ================================================ FILE: app/backend/prepdocslib/textsplitter.py ================================================ import logging import re from abc import ABC from collections.abc import Generator from dataclasses import dataclass, field from typing import Optional import tiktoken from .page import Chunk, Page logger = logging.getLogger("scripts") class TextSplitter(ABC): """ Splits a list of pages into smaller chunks. :param pages: The pages to split :return: A generator of Chunk """ def split_pages(self, pages: list[Page]) -> Generator[Chunk, None, None]: if False: # pragma: no cover - this is necessary for mypy to type check yield ENCODING_MODEL = "text-embedding-ada-002" STANDARD_WORD_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"] # See W3C document https://www.w3.org/TR/jlreq/#cl-01 CJK_WORD_BREAKS = [ "、", ",", ";", ":", "(", ")", "【", "】", "「", "」", "『", "』", "〔", "〕", "〈", "〉", "《", "》", "〖", "〗", "〘", "〙", "〚", "〛", "〝", "〞", "〟", "〰", "–", "—", "‘", "’", "‚", "‛", "“", "”", "„", "‟", "‹", "›", ] STANDARD_SENTENCE_ENDINGS = [".", "!", "?"] # See CL05 and CL06, based on JIS X 4051:2004 # https://www.w3.org/TR/jlreq/#cl-04 CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"] # NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002 bpe = tiktoken.encoding_for_model(ENCODING_MODEL) DEFAULT_OVERLAP_PERCENT = 10 # See semantic search article for 10% overlap performance 
DEFAULT_SECTION_LENGTH = 1000 # Roughly 400-500 tokens for English def _safe_concat(a: str, b: str) -> str: """Concatenate two non-empty segments, inserting a space only when both sides end/start with alphanumerics and no natural boundary exists. Rules: - Both input strings are expected to be non-empty - Preserve existing whitespace if either side already provides a boundary. - Do not insert a space after a closing HTML tag marker '>'. - If both boundary characters are alphanumeric, insert a single space. - Otherwise concatenate directly. """ assert a and b, "_safe_concat expects non-empty strings" a_last = a[-1] b_first = b[0] if a_last.isspace() or b_first.isspace(): # pre-existing boundary return a + b if a_last == ">": # HTML tag end acts as a boundary return a + b if a_last.isalnum() and b_first.isalnum(): # need explicit separator return a + " " + b return a + b def _normalize_chunk(text: str, max_chars: int) -> str: """Normalize a non-figure chunk that may slightly exceed max_chars. Allows overflow for any chunk containing a
@dataclass
class _ChunkBuilder:
    """Buffers text spans for one page and emits them as a single `Chunk` when flushed.

    - `add` accepts a span only if the buffered text plus the span stays within both
      `max_chars` and `max_tokens`.
    - `force_append` bypasses those checks (used for figures, which may overflow).
    - `flush_into` joins the buffered spans into one `Chunk` and resets the buffer.

    Token counts are supplied by the caller; this class never invokes an encoder.
    """

    page_num: int
    max_chars: int
    max_tokens: int
    parts: list[str] = field(default_factory=list)
    token_len: int = 0

    def can_fit(self, text: str, token_count: int) -> bool:
        """Report whether `text` can be added without exceeding the character or token limit."""
        if self.parts:
            used_chars = sum(len(part) for part in self.parts)
            used_tokens = self.token_len
        else:
            # With an empty buffer only the span itself is measured against the limits.
            used_chars = 0
            used_tokens = 0
        within_chars = used_chars + len(text) <= self.max_chars
        within_tokens = used_tokens + token_count <= self.max_tokens
        return within_chars and within_tokens

    def add(self, text: str, token_count: int) -> bool:
        """Buffer `text` if it fits; return whether it was buffered."""
        accepted = self.can_fit(text, token_count)
        if accepted:
            self.parts.append(text)
            self.token_len += token_count
        return accepted

    def force_append(self, text: str):
        """Buffer `text` unconditionally; the tracked token count is left unchanged."""
        self.parts.append(text)

    def flush_into(self, out: list[Chunk]):
        """Append the buffered text to `out` as one `Chunk` (skipped if blank) and reset the buffer."""
        if not self.parts:
            return
        combined = "".join(self.parts)
        if combined.strip():
            out.append(Chunk(page_num=self.page_num, text=combined))
        self.parts.clear()
        self.token_len = 0

    # Convenience helpers for readability at call sites
    def has_content(self) -> bool:
        """Return True when at least one span is buffered."""
        return len(self.parts) > 0

    def append_figure_and_flush(self, figure_text: str, out: list[Chunk]):
        """Buffer a figure (overflow allowed) behind any pending text, then flush immediately."""
        self.force_append(figure_text)
        self.flush_into(out)
self.semantic_overlap_percent = 10 def _find_split_pos(self, text: str) -> tuple[int, bool]: """Find a good split position near midpoint. Returns (index, use_overlap_fallback). Priority: 1. Sentence-ending punctuation near midpoint (scan outward within central third). 2. Word-break character near midpoint (space / punctuation) within same window. 3. Fallback: caller should use midpoint + overlap strategy. """ length = len(text) if length < 4: return -1, True mid = length // 2 window_limit = length // 3 # defines central region scan boundary # 1. Sentence endings pos = 0 while mid - pos > window_limit: left = mid - pos right = mid + pos if left >= 0 and text[left] in self.sentence_endings: return left, False if right < length and text[right] in self.sentence_endings: return right, False pos += 1 # 2. Word breaks pos = 0 while mid - pos > window_limit: left = mid - pos right = mid + pos if left >= 0 and text[left] in self.word_breaks: return left, False if right < length and text[right] in self.word_breaks: return right, False pos += 1 # 3. Fallback return -1, True def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[Chunk, None, None]: """Recursively split plain text by token count. Boundary preference order when an oversized span is encountered: 1. Sentence-ending punctuation near midpoint. 2. Word-break character near midpoint (space/punctuation) to avoid mid-word cuts. 3. Midpoint split with symmetric overlap (DEFAULT_OVERLAP_PERCENT). 
""" tokens = bpe.encode(text) if len(tokens) <= self.max_tokens_per_section: yield Chunk(page_num=page_num, text=text) return split_pos, use_overlap = self._find_split_pos(text) if not use_overlap and split_pos > 0: first_half = text[: split_pos + 1] second_half = text[split_pos + 1 :] else: middle = len(text) // 2 overlap = int(len(text) * (DEFAULT_OVERLAP_PERCENT / 100)) first_half = text[: middle + overlap] second_half = text[middle - overlap :] yield from self.split_page_by_max_tokens(page_num, first_half) yield from self.split_page_by_max_tokens(page_num, second_half) def _is_heading_like(self, line: str) -> bool: """Heuristic heading detector used to suppress cross-page semantic overlap when a new section starts.""" line_str = line.strip() if not line_str: return False if line_str.startswith("#"): return True # Short Title Case or ALL CAPS lines (limited word count) often represent headings if len(line_str) <= 80 and (line_str.isupper() or (line_str.istitle() and len(line_str.split()) <= 12)): return True import re as _re # Numbered / roman numeral list or section forms: '1. ', 'II) ', 'III. ' if _re.match(r"^(?:\d+|[IVXLCM]+)[.)]\s", line_str): return True if line_str.startswith(("- ", "* ", "• ")): return True return False def _should_cross_page_overlap(self, prev: Chunk, nxt: Chunk) -> bool: if not prev or not nxt: return False if " Chunk: """Return a modified copy of prev_chunk whose text has an appended semantic overlap prefix from next_chunk; next_chunk itself is left unchanged so it continues to start at its natural sentence boundary. Strategy: - Take ~semantic_overlap_percent tail size (in chars) from the START of next_chunk. - Extend that region forward to the first sentence-ending (preferred) or word break so we end on a natural boundary (avoid chopping mid-word/mid-sentence). - Refuse overlap if either chunk contains a
to avoid duplicating figures. - Enforce hard token + soft char limits; shrink overlap if necessary. """ if not prev_chunk or not next_chunk: return prev_chunk if " 20: # fallback boundary after some progress boundary_found = True break if not boundary_found: # Trim trailing partial word if we stopped without boundary while prefix and prefix[-1].isalnum() and len(prefix) > target: prefix = prefix[:-1] # Avoid appending text that already exists at end (rare but possible due to prior operations) if prev_chunk.text.endswith(prefix): return prev_chunk candidate = prev_chunk.text + prefix max_chars = int(self.max_section_length * 1.2) if len(candidate) > max_chars or len(bpe.encode(candidate)) > self.max_tokens_per_section: # Attempt to shrink prefix at word / sentence boundaries from its start shrink = prefix while shrink and ( len(prev_chunk.text + shrink) > max_chars or len(bpe.encode(prev_chunk.text + shrink)) > self.max_tokens_per_section ): cut_index = 1 for i, ch in enumerate(shrink): if ch in self.word_breaks or ch in self.sentence_endings: cut_index = i + 1 break shrink = shrink[:-cut_index] if cut_index < len(shrink) else "" if not shrink: return prev_chunk candidate = prev_chunk.text + shrink if len(candidate) > max_chars or len(bpe.encode(candidate)) > self.max_tokens_per_section: return prev_chunk return Chunk(page_num=prev_chunk.page_num, text=candidate) def split_pages(self, pages: list[Page]) -> Generator[Chunk, None, None]: """Split each page into semantic chunks using token-aware accumulation with atomic figures. Strategy (per page): 1. Extract balanced
...
blocks as atomic "figure" blocks. 2. Treat intervening text as "text" blocks. 3. For text blocks, break into sentence-ish spans (using sentence ending chars) and accumulate until adding the next span would exceed either character or token limit. Flush when needed. 4. When a figure block arrives: - If there is accumulated text (builder), append the figure even if this exceeds token limit and flush. - If no accumulated text, emit the figure as its own chunk. 5. Ignore token limits for any chunk that contains a figure (never split figures). This avoids partial/duplicated figures and keeps headings with their following figure when space permits. """ figure_regex = re.compile(r"", re.IGNORECASE | re.DOTALL) previous_chunk: Optional[Chunk] = None for page in pages: raw = page.text or "" if not raw.strip(): continue # Build ordered list of blocks: (type, text) blocks: list[tuple[str, str]] = [] last = 0 for m in figure_regex.finditer(raw): if m.start() > last: blocks.append(("text", raw[last : m.start()])) blocks.append(("figure", m.group())) last = m.end() if last < len(raw): blocks.append(("text", raw[last:])) page_chunks: list[Chunk] = [] builder = _ChunkBuilder( page_num=page.page_num, max_chars=self.max_section_length, max_tokens=self.max_tokens_per_section, ) for btype, btext in blocks: if btype == "figure": if builder.has_content(): # Append figure to existing text (allow overflow) and flush builder.append_figure_and_flush(btext, page_chunks) else: # Emit figure standalone if btext.strip(): page_chunks.append(Chunk(page_num=page.page_num, text=btext)) continue # Process text block: split into sentence-like spans spans: list[str] = [] current_chars: list[str] = [] for ch in btext: current_chars.append(ch) if ch in self.sentence_endings: spans.append("".join(current_chars)) current_chars = [] if current_chars: # remaining tail spans.append("".join(current_chars)) for span in spans: span_tokens = len(bpe.encode(span)) # If a single span itself exceeds token limit 
(rare, very long sentence), split it directly if span_tokens > self.max_tokens_per_section: builder.flush_into(page_chunks) for chunk in self.split_page_by_max_tokens(page.page_num, span): page_chunks.append(chunk) continue if not builder.add(span, span_tokens): # Flush and retry (guaranteed to fit because span_tokens <= limit) builder.flush_into(page_chunks) if not builder.add(span, span_tokens): page_chunks.append(Chunk(page_num=page.page_num, text=span)) # Flush any trailing builder content builder.flush_into(page_chunks) # Attempt cross-page merge with previous_chunk (look-behind) if semantic continuation if previous_chunk and page_chunks: prev_last_char = previous_chunk.text.rstrip()[-1:] if previous_chunk.text.rstrip() else "" first_new = page_chunks[0] first_new_stripped = first_new.text.lstrip() first_char = first_new_stripped[:1] if ( prev_last_char and prev_last_char not in self.sentence_endings and not first_new_stripped.startswith("#") and first_char and first_char.islower() and " bool: combined = candidate + first_new_text if len(combined) > max_chars: return False if len(bpe.encode(combined)) > self.max_tokens_per_section: return False return True move_fragment = fragment_full if not fits(move_fragment): # Hard trim path: fragment begins after the last sentence-ending punctuation # of the previous chunk. Reduce to remaining character budget, then iteratively # shrink until token constraints are satisfied. 
remaining_chars = max_chars - len(first_new_text) # always > 0 given builder invariants move_fragment = move_fragment[:remaining_chars] while ( move_fragment and len(bpe.encode(move_fragment + first_new_text)) > self.max_tokens_per_section ): move_fragment = ( move_fragment[:-50] if len(move_fragment) > 50 else move_fragment[:-1] ) leftover_fragment = fragment_full[len(move_fragment) :] # Prepend the allowed fragment if move_fragment: page_chunks[0] = Chunk( page_num=page_chunks[0].page_num, text=_safe_concat(move_fragment, first_new_text), ) # Adjust previous_chunk retained portion if retained.strip(): previous_chunk = Chunk(page_num=previous_chunk.page_num, text=retained) else: previous_chunk = None # Insert leftover fragment as its own chunk (split if needed) BEFORE modified first_new if leftover_fragment.strip(): # Ensure leftover respects limits by splitting if needed leftover_pages = list( self.split_page_by_max_tokens(page_chunks[0].page_num, leftover_fragment) ) # Insert these before current first chunk page_chunks = leftover_pages + page_chunks # Normalize chunks (non-figure) that barely exceed char limit due to added boundary space max_chars = int(self.max_section_length * 1.2) if previous_chunk: previous_chunk = Chunk( page_num=previous_chunk.page_num, text=_normalize_chunk(previous_chunk.text, max_chars) ) if page_chunks: page_chunks = [ Chunk(page_num=chunk.page_num, text=_normalize_chunk(chunk.text, max_chars)) for chunk in page_chunks ] # Apply semantic overlap duplication (append style). We append a small # prefix of the NEXT chunk onto the PREVIOUS chunk, keeping natural starts. 
if self.semantic_overlap_percent > 0: # Cross-page overlap: modify previous_chunk (look-ahead to first new chunk) if previous_chunk and page_chunks and self._should_cross_page_overlap(previous_chunk, page_chunks[0]): previous_chunk = self._append_overlap(previous_chunk, page_chunks[0]) # Intra-page overlaps if len(page_chunks) > 1: for i in range(1, len(page_chunks)): prev_c = page_chunks[i - 1] curr_c = page_chunks[i] if " Generator[Chunk, None, None]: all_text = "".join(page.text for page in pages) if len(all_text.strip()) == 0: return length = len(all_text) if length <= self.max_object_length: yield Chunk(page_num=0, text=all_text) return # its too big, so we need to split it for i in range(0, length, self.max_object_length): yield Chunk(page_num=i // self.max_object_length, text=all_text[i : i + self.max_object_length]) return ================================================ FILE: app/backend/requirements.in ================================================ azure-functions>=1.24.0 azure-identity quart quart-cors openai>=1.109.1 tiktoken tenacity azure-ai-documentintelligence==1.0.2 azure-cognitiveservices-speech azure-cosmos azure-search-documents==11.7.0b2 azure-storage-blob azure-storage-file-datalake uvicorn aiohttp azure-monitor-opentelemetry opentelemetry-instrumentation-asgi opentelemetry-instrumentation-httpx opentelemetry-instrumentation-aiohttp-client opentelemetry-instrumentation-openai msal cryptography PyJWT Pillow types-Pillow pypdf PyMuPDF beautifulsoup4 types-beautifulsoup4 msgraph-sdk python-dotenv jinja2 rich typing-extensions ================================================ FILE: app/backend/requirements.txt ================================================ # This file was autogenerated by uv via the following command: # uv pip compile requirements.in -o requirements.txt --python-version 3.10 aiofiles==24.1.0 # via quart aiohappyeyeballs==2.6.1 # via aiohttp aiohttp==3.13.3 # via # -r requirements.in # microsoft-kiota-authentication-azure 
aiosignal==1.4.0 # via aiohttp annotated-types==0.7.0 # via pydantic anyio==4.4.0 # via # httpx # openai asgiref==3.10.0 # via opentelemetry-instrumentation-asgi attrs==25.3.0 # via aiohttp azure-ai-documentintelligence==1.0.2 # via -r requirements.in azure-cognitiveservices-speech==1.40.0 # via -r requirements.in azure-common==1.1.28 # via azure-search-documents azure-core==1.38.0 # via # azure-ai-documentintelligence # azure-core-tracing-opentelemetry # azure-cosmos # azure-identity # azure-monitor-opentelemetry # azure-monitor-opentelemetry-exporter # azure-search-documents # azure-storage-blob # azure-storage-file-datalake # microsoft-kiota-authentication-azure # msrest azure-core-tracing-opentelemetry==1.0.0b11 # via azure-monitor-opentelemetry azure-cosmos==4.9.0 # via -r requirements.in azure-functions==1.24.0 # via -r requirements.in azure-identity==1.17.1 # via # -r requirements.in # azure-monitor-opentelemetry-exporter # msgraph-sdk azure-monitor-opentelemetry==1.8.4 # via -r requirements.in azure-monitor-opentelemetry-exporter==1.0.0b46 # via azure-monitor-opentelemetry azure-search-documents==11.7.0b2 # via -r requirements.in azure-storage-blob==12.22.0 # via # -r requirements.in # azure-storage-file-datalake azure-storage-file-datalake==12.16.0 # via -r requirements.in beautifulsoup4==4.14.3 # via -r requirements.in blinker==1.9.0 # via # flask # quart certifi==2026.1.4 # via # httpcore # httpx # msrest # requests cffi==2.0.0 # via cryptography charset-normalizer==3.3.2 # via requests click==8.3.1 # via # flask # quart # uvicorn cryptography==46.0.5 # via # -r requirements.in # azure-identity # azure-storage-blob # msal # pyjwt distro==1.9.0 # via openai flask==3.1.3 # via quart frozenlist==1.8.0 # via # aiohttp # aiosignal h11==0.16.0 # via # httpcore # hypercorn # uvicorn # wsproto h2==4.3.0 # via # httpx # hypercorn hpack==4.1.0 # via h2 httpcore==1.0.9 # via httpx httpx[http2]==0.28.1 # via # microsoft-kiota-http # msgraph-core # openai 
hypercorn==0.18.0 # via quart hyperframe==6.1.0 # via h2 idna==3.10 # via # anyio # httpx # requests # yarl importlib-metadata==8.0.0 # via opentelemetry-api isodate==0.7.2 # via # azure-ai-documentintelligence # azure-search-documents # azure-storage-blob # azure-storage-file-datalake # msrest itsdangerous==2.2.0 # via # flask # quart jinja2==3.1.6 # via # -r requirements.in # flask # quart jiter==0.11.0 # via openai markdown-it-py==3.0.0 # via rich markupsafe==3.0.3 # via # flask # jinja2 # quart # werkzeug mdurl==0.1.2 # via markdown-it-py microsoft-kiota-abstractions==1.9.3 # via # microsoft-kiota-authentication-azure # microsoft-kiota-http # microsoft-kiota-serialization-form # microsoft-kiota-serialization-json # microsoft-kiota-serialization-multipart # microsoft-kiota-serialization-text # msgraph-core microsoft-kiota-authentication-azure==1.9.3 # via msgraph-core microsoft-kiota-http==1.9.3 # via msgraph-core microsoft-kiota-serialization-form==1.9.3 # via msgraph-sdk microsoft-kiota-serialization-json==1.9.3 # via msgraph-sdk microsoft-kiota-serialization-multipart==1.9.3 # via msgraph-sdk microsoft-kiota-serialization-text==1.9.3 # via msgraph-sdk msal==1.33.0 # via # -r requirements.in # azure-identity # msal-extensions msal-extensions==1.3.1 # via azure-identity msgraph-core==1.3.3 # via msgraph-sdk msgraph-sdk==1.45.0 # via -r requirements.in msrest==0.7.1 # via azure-monitor-opentelemetry-exporter multidict==6.7.0 # via # aiohttp # yarl oauthlib==3.3.1 # via requests-oauthlib openai==2.6.1 # via -r requirements.in opentelemetry-api==1.39.0 # via # azure-core-tracing-opentelemetry # azure-monitor-opentelemetry-exporter # microsoft-kiota-abstractions # microsoft-kiota-authentication-azure # microsoft-kiota-http # opentelemetry-instrumentation # opentelemetry-instrumentation-aiohttp-client # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-django # opentelemetry-instrumentation-fastapi # 
opentelemetry-instrumentation-flask # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-openai # opentelemetry-instrumentation-psycopg2 # opentelemetry-instrumentation-requests # opentelemetry-instrumentation-urllib # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi # opentelemetry-sdk # opentelemetry-semantic-conventions opentelemetry-instrumentation==0.60b0 # via # opentelemetry-instrumentation-aiohttp-client # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-django # opentelemetry-instrumentation-fastapi # opentelemetry-instrumentation-flask # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-openai # opentelemetry-instrumentation-psycopg2 # opentelemetry-instrumentation-requests # opentelemetry-instrumentation-urllib # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi opentelemetry-instrumentation-aiohttp-client==0.60b0 # via -r requirements.in opentelemetry-instrumentation-asgi==0.60b0 # via # -r requirements.in # opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-dbapi==0.60b0 # via opentelemetry-instrumentation-psycopg2 opentelemetry-instrumentation-django==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-fastapi==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-flask==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-httpx==0.60b0 # via -r requirements.in opentelemetry-instrumentation-openai==0.51.1 # via -r requirements.in opentelemetry-instrumentation-psycopg2==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-requests==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-urllib==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-urllib3==0.60b0 # via azure-monitor-opentelemetry opentelemetry-instrumentation-wsgi==0.60b0 # via # opentelemetry-instrumentation-django # 
opentelemetry-instrumentation-flask opentelemetry-resource-detector-azure==0.1.5 # via azure-monitor-opentelemetry opentelemetry-sdk==1.39.0 # via # azure-monitor-opentelemetry # azure-monitor-opentelemetry-exporter # microsoft-kiota-abstractions # microsoft-kiota-authentication-azure # microsoft-kiota-http # opentelemetry-resource-detector-azure opentelemetry-semantic-conventions==0.60b0 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-aiohttp-client # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-django # opentelemetry-instrumentation-fastapi # opentelemetry-instrumentation-flask # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-openai # opentelemetry-instrumentation-requests # opentelemetry-instrumentation-urllib # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi # opentelemetry-sdk opentelemetry-semantic-conventions-ai==0.4.13 # via opentelemetry-instrumentation-openai opentelemetry-util-http==0.60b0 # via # opentelemetry-instrumentation-aiohttp-client # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-django # opentelemetry-instrumentation-fastapi # opentelemetry-instrumentation-flask # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests # opentelemetry-instrumentation-urllib # opentelemetry-instrumentation-urllib3 # opentelemetry-instrumentation-wsgi packaging==24.1 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask pillow==12.1.1 # via -r requirements.in priority==2.0.0 # via hypercorn propcache==0.2.0 # via # aiohttp # yarl psutil==7.1.2 # via azure-monitor-opentelemetry-exporter pycparser==2.22 # via cffi pydantic==2.12.5 # via openai pydantic-core==2.41.5 # via pydantic pygments==2.19.2 # via rich pyjwt[crypto]==2.11.0 # via # -r requirements.in # msal pymupdf==1.26.0 # via -r requirements.in pypdf==6.8.0 # via -r requirements.in python-dotenv==1.1.1 # via -r 
requirements.in quart==0.20.0 # via # -r requirements.in # quart-cors quart-cors==0.7.0 # via -r requirements.in regex==2025.7.34 # via tiktoken requests==2.32.4 # via # azure-core # msal # msrest # requests-oauthlib # tiktoken requests-oauthlib==2.0.0 # via msrest rich==14.1.0 # via -r requirements.in sniffio==1.3.1 # via # anyio # openai soupsieve==2.7 # via beautifulsoup4 std-uritemplate==2.0.8 # via microsoft-kiota-abstractions tenacity==9.1.2 # via -r requirements.in tiktoken==0.12.0 # via -r requirements.in tqdm==4.66.5 # via openai types-beautifulsoup4==4.12.0.20250516 # via -r requirements.in types-html5lib==1.1.11.20251014 # via types-beautifulsoup4 types-pillow==10.2.0.20240822 # via -r requirements.in typing-extensions==4.15.0 # via # -r requirements.in # azure-ai-documentintelligence # azure-core # azure-cosmos # azure-identity # azure-search-documents # azure-storage-blob # azure-storage-file-datalake # beautifulsoup4 # openai # opentelemetry-api # opentelemetry-sdk # opentelemetry-semantic-conventions # pydantic # pydantic-core # typing-inspection typing-inspection==0.4.2 # via pydantic urllib3==2.6.3 # via requests uvicorn==0.30.6 # via -r requirements.in werkzeug==3.1.6 # via # azure-functions # flask # quart wrapt==1.16.0 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-aiohttp-client # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-urllib3 wsproto==1.3.2 # via hypercorn yarl==1.17.2 # via aiohttp zipp==3.23.0 # via importlib-metadata ================================================ FILE: app/backend/setup_cloud_ingestion.py ================================================ """Script to setup cloud ingestion for Azure AI Search.""" import asyncio import logging import os from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential from openai import AsyncOpenAI from rich.logging import RichHandler from 
load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy from prepdocslib.listfilestrategy import LocalListFileStrategy from prepdocslib.servicesetup import ( OpenAIHost, clean_key_if_exists, setup_blob_manager, setup_embeddings_service, setup_openai_client, setup_search_info, ) from prepdocslib.strategy import DocumentAction logger = logging.getLogger("scripts") async def setup_cloud_ingestion_strategy( azure_credential: AsyncTokenCredential, document_action: DocumentAction = DocumentAction.Add, ) -> tuple[CloudIngestionStrategy, AsyncOpenAI, AsyncTokenCredential, BlobManager]: """Setup the cloud ingestion strategy with all required services.""" # Get environment variables search_service = os.environ["AZURE_SEARCH_SERVICE"] index_name = os.environ["AZURE_SEARCH_INDEX"] search_user_assigned_identity_resource_id = os.environ["AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID"] storage_container = os.environ["AZURE_STORAGE_CONTAINER"] subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"] image_storage_container = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER") search_embedding_field = os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"] # Cloud ingestion storage account (ADLS Gen2 when ACLs enabled, standard blob otherwise) # Fallback to AZURE_STORAGE_ACCOUNT is for legacy deployments only - may be removed in future storage_account = os.getenv("AZURE_CLOUD_INGESTION_STORAGE_ACCOUNT") or os.environ["AZURE_STORAGE_ACCOUNT"] storage_resource_group = ( os.getenv("AZURE_CLOUD_INGESTION_STORAGE_RESOURCE_GROUP") or os.environ["AZURE_STORAGE_RESOURCE_GROUP"] ) # Cloud ingestion specific endpoints document_extractor_uri = os.environ["DOCUMENT_EXTRACTOR_SKILL_ENDPOINT"] document_extractor_resource_id = os.environ["DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID"] figure_processor_uri = os.environ["FIGURE_PROCESSOR_SKILL_ENDPOINT"] figure_processor_resource_id = 
os.environ["FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] text_processor_uri = os.environ["TEXT_PROCESSOR_SKILL_ENDPOINT"] text_processor_resource_id = os.environ["TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] # Feature flags use_multimodal = os.getenv("USE_MULTIMODAL", "").lower() == "true" use_acls = os.getenv("USE_CLOUD_INGESTION_ACLS", "").lower() == "true" enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" use_web_source = os.getenv("USE_WEB_SOURCE", "").lower() == "true" # Warn if access control is enforced but ACL extraction is not enabled if enforce_access_control and not use_acls: logger.warning( "AZURE_ENFORCE_ACCESS_CONTROL is enabled but USE_CLOUD_INGESTION_ACLS is not. " "Documents will be indexed without ACLs, so access control filtering will not work. " "Either set USE_CLOUD_INGESTION_ACLS=true to extract ACLs from ADLS Gen2, " "or manually set ACLs using scripts/manageacl.py after ingestion." ) # Setup search info search_info = setup_search_info( search_service=search_service, index_name=index_name, azure_credential=azure_credential, azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), ) # Setup blob manager blob_manager = setup_blob_manager( azure_credential=azure_credential, storage_account=storage_account, storage_container=storage_container, storage_resource_group=storage_resource_group, subscription_id=subscription_id, storage_key=None, image_storage_container=image_storage_container, ) # Setup OpenAI embeddings OPENAI_HOST = OpenAIHost(os.environ["OPENAI_HOST"]) openai_client, azure_openai_endpoint = setup_openai_client( openai_host=OPENAI_HOST, azure_credential=azure_credential, azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), openai_organization=os.getenv("OPENAI_ORGANIZATION"), ) emb_model_dimensions = 1536 if 
os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( OPENAI_HOST, openai_client, emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], emb_model_dimensions=emb_model_dimensions, azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), azure_openai_endpoint=azure_openai_endpoint, disable_batch=False, ) # Create a list file strategy for uploading files from the data folder list_file_strategy = LocalListFileStrategy(path_pattern="data/*", enable_global_documents=False) # Create the cloud ingestion strategy ingestion_strategy = CloudIngestionStrategy( list_file_strategy=list_file_strategy, blob_manager=blob_manager, search_info=search_info, embeddings=openai_embeddings_service, search_field_name_embedding=search_embedding_field, document_extractor_uri=document_extractor_uri, document_extractor_auth_resource_id=document_extractor_resource_id, figure_processor_uri=figure_processor_uri, figure_processor_auth_resource_id=figure_processor_resource_id, text_processor_uri=text_processor_uri, text_processor_auth_resource_id=text_processor_resource_id, subscription_id=subscription_id, document_action=document_action, search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), use_acls=use_acls, use_multimodal=use_multimodal, enforce_access_control=enforce_access_control, use_web_source=use_web_source, search_user_assigned_identity_resource_id=search_user_assigned_identity_resource_id, ) return ingestion_strategy, openai_client, azure_credential, blob_manager async def main(): """Main function to setup cloud ingestion.""" load_azd_env() # Check if cloud ingestion is enabled use_cloud_ingestion = os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" if not use_cloud_ingestion: logger.info("Cloud ingestion is not enabled. 
Skipping setup.") return # Setup logging logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) logger.setLevel(logging.INFO) logger.info("Setting up cloud ingestion...") # Use the current user identity to connect to Azure services if tenant_id := os.getenv("AZURE_TENANT_ID"): logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) else: logger.info("Connecting to Azure services using the azd credential for home tenant") azd_credential = AzureDeveloperCliCredential(process_timeout=60) try: ingestion_strategy, openai_client, credential, blob_manager = await setup_cloud_ingestion_strategy( azure_credential=azd_credential, document_action=DocumentAction.Add, ) # Setup the indexer, skillset, and data source logger.info("Setting up indexer, skillset, and data source...") await ingestion_strategy.setup() logger.info("Triggering initial indexing run...") await ingestion_strategy.run() finally: await blob_manager.close_clients() await openai_client.close() await azd_credential.close() if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: app/frontend/.npmrc ================================================ engine-strict=true fund=false # Required because react-helmet-async declares peer deps for @types/react <19.0.0, # but works fine with React 19. 
See https://github.com/staylor/react-helmet-async/issues/238 legacy-peer-deps=true ================================================ FILE: app/frontend/.nvmrc ================================================ 22.0.0 ================================================ FILE: app/frontend/.prettierignore ================================================ # Ignore JSON **/*.json ================================================ FILE: app/frontend/.prettierrc.json ================================================ { "tabWidth": 4, "printWidth": 160, "arrowParens": "avoid", "trailingComma": "none" } ================================================ FILE: app/frontend/index.html ================================================ Azure OpenAI + AI Search
================================================ FILE: app/frontend/package.json ================================================
{ "name": "frontend", "private": true, "version": "0.0.0", "type": "module", "engines": { "node": ">=20.0.0" }, "scripts": { "dev": "vite --host 127.0.0.1", "build": "tsc && vite build", "preview": "vite preview", "format": "prettier --write ." }, "dependencies": { "@azure/msal-browser": "^4.16.0", "@azure/msal-react": "^3.0.16", "@fluentui/react-components": "^9.73.1", "@fluentui/react-icons": "^2.0.319", "@fluentui/react-table": "^9.19.9", "@react-spring/web": "^10.0.3", "dompurify": "^3.3.2", "i18next": "^24.2.0", "i18next-browser-languagedetector": "^8.0.2", "i18next-http-backend": "^3.0.1", "idb": "^8.0.0", "ndjson-readablestream": "^1.4.0", "react": "^19.2.4", "react-dom": "^19.2.4", "react-helmet-async": "^2.0.5", "react-i18next": "^15.4.1", "react-markdown": "^9.0.1", "react-router-dom": "^7.12.0", "react-syntax-highlighter": "^16.1.0", "rehype-raw": "^7.0.0", "remark-gfm": "^4.0.0", "scheduler": "^0.27.0" }, "devDependencies": { "@types/dom-speech-recognition": "^0.0.7", "@types/dompurify": "^3.0.5", "@types/react": "^19.2.13", "@types/react-dom": "^19.2.3", "@types/react-syntax-highlighter": "^15.5.13", "@vitejs/plugin-react": "^4.3.3", "prettier": "^3.3.3", "rollup-plugin-visualizer": "^5.12.0", "typescript": "^5.6.3", "vite": "^6.0.0" } }
================================================ FILE: app/frontend/src/api/api.ts ================================================
const BACKEND_URI = "";

import { ChatAppResponse, ChatAppResponseOrError, ChatAppRequest, Config, SimpleAPIResponse, HistoryListApiResponse, HistoryApiResponse } from "./models";
import { useLogin, getToken, isUsingAppServicesLogin } from "../authConfig";

/**
 * Builds the authorization headers for a backend request.
 *
 * @param idToken - Token of the logged in account, if any.
 * @returns A `Bearer` Authorization header when login is enabled, app services login is not in use
 * and a token was supplied; otherwise an empty object.
 */
export async function getHeaders(idToken: string | undefined): Promise<Record<string, string>> {
    // If using login and not using app services, add the id token of the logged in account as the authorization
    if (useLogin && !isUsingAppServicesLogin && idToken) {
        return { Authorization: `Bearer ${idToken}` };
    }
    return {};
}

/**
 * Fetches the application configuration from `/config`.
 * The HTTP status is not checked; the body is parsed as JSON as-is.
 */
export async function configApi(): Promise<Config> {
    const response = await fetch(`${BACKEND_URI}/config`, {
        method: "GET"
    });

    return (await response.json()) as Config;
}

/**
 * Posts a chat request to `/chat`, or to `/chat/stream` when streaming is requested.
 *
 * @param request - Chat request payload, sent as JSON.
 * @param shouldStream - Whether to call the streaming endpoint.
 * @param idToken - Token used to build the authorization header.
 * @param signal - Abort signal forwarded to `fetch`.
 * @returns The raw `Response`; the caller is responsible for reading it.
 */
export async function chatApi(request: ChatAppRequest, shouldStream: boolean, idToken: string | undefined, signal: AbortSignal): Promise<Response> {
    let url = `${BACKEND_URI}/chat`;
    if (shouldStream) {
        url += "/stream";
    }
    const headers = await getHeaders(idToken);
    return await fetch(url, {
        method: "POST",
        headers: { ...headers, "Content-Type": "application/json" },
        body: JSON.stringify(request),
        signal: signal
    });
}

/**
 * Requests synthesized speech for the given text from `/speech`.
 *
 * @param text - Text to synthesize.
 * @returns An object URL for the returned audio blob, or null when the backend does not answer with status 200.
 */
export async function getSpeechApi(text: string): Promise<string | null> {
    const response = await fetch(`${BACKEND_URI}/speech`, {
        method: "POST",
        headers: {
            "Content-Type": "application/json"
        },
        body: JSON.stringify({
            text: text
        })
    });

    if (response.status === 200) {
        const blob = await response.blob();
        return blob ? URL.createObjectURL(blob) : null;
    }
    if (response.status === 400) {
        console.log("Speech synthesis is not enabled.");
    } else {
        console.error("Unable to get speech synthesis.");
    }
    return null;
}

/**
 * Maps a citation label to the backend `/content/` path that serves it.
 *
 * @param citation - Citation text, optionally ending in a parenthesized suffix.
 * @returns The content URL with any trailing parenthesized part removed.
 */
export function getCitationFilePath(citation: string): string {
    // If there are parentheses at end of citation, remove part in parentheses
    const cleanedCitation = citation.replace(/\s*\(.*?\)\s*$/, "").trim();
    return `${BACKEND_URI}/content/${cleanedCitation}`;
}

/**
 * Uploads files to `/upload`.
 *
 * @param request - Form data containing the upload.
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function uploadFileApi(request: FormData, idToken: string): Promise<SimpleAPIResponse> {
    const response = await fetch(`${BACKEND_URI}/upload`, {
        method: "POST",
        headers: await getHeaders(idToken),
        body: request
    });

    if (!response.ok) {
        throw new Error(`Uploading files failed: ${response.statusText}`);
    }

    const dataResponse: SimpleAPIResponse = await response.json();
    return dataResponse;
}

/**
 * Deletes a previously uploaded file via `/delete_uploaded`.
 *
 * @param filename - Name of the file to delete.
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function deleteUploadedFileApi(filename: string, idToken: string): Promise<SimpleAPIResponse> {
    const headers = await getHeaders(idToken);
    const response = await fetch(`${BACKEND_URI}/delete_uploaded`, {
        method: "POST",
        headers: { ...headers, "Content-Type": "application/json" },
        body: JSON.stringify({ filename })
    });

    if (!response.ok) {
        throw new Error(`Deleting file failed: ${response.statusText}`);
    }

    const dataResponse: SimpleAPIResponse = await response.json();
    return dataResponse;
}

/**
 * Lists the uploaded file names returned by `/list_uploaded`.
 *
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function listUploadedFilesApi(idToken: string): Promise<string[]> {
    const response = await fetch(`${BACKEND_URI}/list_uploaded`, {
        method: "GET",
        headers: await getHeaders(idToken)
    });

    if (!response.ok) {
        throw new Error(`Listing files failed: ${response.statusText}`);
    }

    const dataResponse: string[] = await response.json();
    return dataResponse;
}

/**
 * Stores a chat history item via `/chat_history`.
 *
 * @param item - Payload to persist, sent as JSON.
 * @param idToken - Token used to build the authorization header.
 * @returns The parsed JSON body of the response.
 * @throws Error when the response status is not ok.
 */
export async function postChatHistoryApi(item: any, idToken: string): Promise<any> {
    const headers = await getHeaders(idToken);
    const response = await fetch(`${BACKEND_URI}/chat_history`, {
        method: "POST",
        headers: { ...headers, "Content-Type": "application/json" },
        body: JSON.stringify(item)
    });

    if (!response.ok) {
        throw new Error(`Posting chat history failed: ${response.statusText}`);
    }

    const dataResponse: any = await response.json();
    return dataResponse;
}

/**
 * Fetches a page of chat history sessions.
 *
 * @param count - Number of sessions to request.
 * @param continuationToken - Token from a previous page, if any.
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function getChatHistoryListApi(count: number, continuationToken: string | undefined, idToken: string): Promise<HistoryListApiResponse> {
    const headers = await getHeaders(idToken);
    let url = `${BACKEND_URI}/chat_history/sessions?count=${count}`;
    if (continuationToken) {
        // The token is opaque to the client; encode it so characters such as
        // "&", "+" or "#" cannot corrupt the query string.
        url += `&continuationToken=${encodeURIComponent(continuationToken)}`;
    }

    const response = await fetch(url, {
        method: "GET",
        headers: { ...headers, "Content-Type": "application/json" }
    });

    if (!response.ok) {
        throw new Error(`Getting chat histories failed: ${response.statusText}`);
    }

    const dataResponse: HistoryListApiResponse = await response.json();
    return dataResponse;
}

/**
 * Fetches a single chat history session.
 *
 * @param id - Session identifier; encoded before being placed in the URL path.
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function getChatHistoryApi(id: string, idToken: string): Promise<HistoryApiResponse> {
    const headers = await getHeaders(idToken);
    const response = await fetch(`${BACKEND_URI}/chat_history/sessions/${encodeURIComponent(id)}`, {
        method: "GET",
        headers: { ...headers, "Content-Type": "application/json" }
    });

    if (!response.ok) {
        throw new Error(`Getting chat history failed: ${response.statusText}`);
    }

    const dataResponse: HistoryApiResponse = await response.json();
    return dataResponse;
}

/**
 * Deletes a single chat history session.
 *
 * @param id - Session identifier; encoded before being placed in the URL path.
 * @param idToken - Token used to build the authorization header.
 * @throws Error when the response status is not ok.
 */
export async function deleteChatHistoryApi(id: string, idToken: string): Promise<void> {
    const headers = await getHeaders(idToken);
    const response = await fetch(`${BACKEND_URI}/chat_history/sessions/${encodeURIComponent(id)}`, {
        method: "DELETE",
        headers: { ...headers, "Content-Type": "application/json" }
    });

    if (!response.ok) {
        throw new Error(`Deleting chat history failed: ${response.statusText}`);
    }
}
================================================ FILE: app/frontend/src/api/index.ts ================================================
export * from "./api";
export * from "./models";
================================================ FILE: app/frontend/src/api/models.ts ================================================
export const enum RetrievalMode {
    Hybrid = "hybrid",
    Vectors = "vectors",
    Text = "text"
}

export type ChatAppRequestOverrides = {
    retrieval_mode?: RetrievalMode;
    semantic_ranker?: boolean;
    semantic_captions?: boolean;
    query_rewriting?: boolean;
    reasoning_effort?: string;
    include_category?: string;
exclude_category?: string;
    seed?: number;
    top?: number;
    retrieval_reasoning_effort?: string;
    temperature?: number;
    minimum_search_score?: number;
    minimum_reranker_score?: number;
    prompt_template?: string;
    prompt_template_prefix?: string;
    prompt_template_suffix?: string;
    suggest_followup_questions?: boolean;
    send_text_sources: boolean;
    send_image_sources: boolean;
    search_text_embeddings: boolean;
    search_image_embeddings: boolean;
    language: string;
    use_agentic_knowledgebase: boolean;
    use_web_source?: boolean;
    use_sharepoint_source?: boolean;
};

export type ResponseMessage = {
    content: string;
    role: string;
};

export type Thoughts = {
    title: string;
    description: any; // It can be any output from the api
    props?: { [key: string]: any };
};

export type ActivityDetail = {
    id?: number;
    number?: number;
    type?: string;
    label?: string;
    source?: string;
    query?: string;
};

export type ExternalResultMetadata = {
    id?: string;
    title?: string;
    url?: string;
    snippet?: string;
    activity?: ActivityDetail;
};

export type CitationActivityDetail = {
    id?: string;
    number?: number;
    type?: string;
    source?: string;
    query?: string;
};

export type DataPoints = {
    text: string[];
    images: string[];
    citations: string[];
    // NOTE(review): the type arguments were lost in extraction; string keys mapping to
    // CitationActivityDetail is inferred from the neighbouring declarations - confirm.
    citation_activity_details?: Record<string, CitationActivityDetail>;
    external_results_metadata?: ExternalResultMetadata[];
};

export type ResponseContext = {
    data_points: DataPoints;
    followup_questions: string[] | null;
    thoughts: Thoughts[];
    answer?: string;
};

export type ChatAppResponseOrError = {
    message: ResponseMessage;
    delta: ResponseMessage;
    context: ResponseContext;
    session_state: any;
    error?: string;
};

export type ChatAppResponse = {
    message: ResponseMessage;
    delta: ResponseMessage;
    context: ResponseContext;
    session_state: any;
};

export type ChatAppRequestContext = {
    overrides?: ChatAppRequestOverrides;
};

export type ChatAppRequest = {
    messages: ResponseMessage[];
    context?: ChatAppRequestContext;
    session_state: any;
};

export type Config = {
    defaultReasoningEffort: string;
    defaultRetrievalReasoningEffort: string;
    showMultimodalOptions: boolean;
    showSemanticRankerOption: boolean;
    showQueryRewritingOption: boolean;
    showReasoningEffortOption: boolean;
    streamingEnabled: boolean;
    showVectorOption: boolean;
    showUserUpload: boolean;
    showLanguagePicker: boolean;
    showSpeechInput: boolean;
    showSpeechOutputBrowser: boolean;
    showSpeechOutputAzure: boolean;
    showChatHistoryBrowser: boolean;
    showChatHistoryCosmos: boolean;
    showAgenticRetrievalOption: boolean;
    ragSearchTextEmbeddings: boolean;
    ragSearchImageEmbeddings: boolean;
    ragSendTextSources: boolean;
    ragSendImageSources: boolean;
    webSourceEnabled: boolean;
    sharepointSourceEnabled: boolean;
};

export type SimpleAPIResponse = {
    message?: string;
};

export interface SpeechConfig {
    speechUrls: (string | null)[];
    setSpeechUrls: (urls: (string | null)[]) => void;
    audio: HTMLAudioElement;
    isPlaying: boolean;
    setIsPlaying: (isPlaying: boolean) => void;
}

export type HistoryListApiResponse = {
    sessions: {
        id: string;
        entra_oid: string;
        title: string;
        timestamp: number;
    }[];
    continuation_token?: string;
};

export type HistoryApiResponse = {
    id: string;
    entra_oid: string;
    answers: any;
};
================================================ FILE: app/frontend/src/authConfig.ts ================================================
// Refactored from https://github.com/Azure-Samples/ms-identity-javascript-react-tutorial/blob/main/1-Authentication/1-sign-in/SPA/src/authConfig.js

import { IPublicClientApplication } from "@azure/msal-browser";

const appServicesAuthTokenUrl = ".auth/me";
const appServicesAuthTokenRefreshUrl = ".auth/refresh";
const appServicesAuthLogoutUrl = ".auth/logout?post_logout_redirect_uri=/";

interface AppServicesToken {
    id_token: string;
    access_token: string;
    // Claims keyed by claim type; values are whatever the token store returned
    user_claims: Record<string, any>;
    expires_on: string;
}

interface AuthSetup {
    // Set to true if login elements should be shown in the UI
    useLogin: boolean;
    // Set to true if access control is enforced by the application
    requireAccessControl: boolean;
    // Set to true if the application allows unauthenticated access (only applies for documents without access control)
    enableUnauthenticatedAccess: boolean;
    /**
     * Configuration object to be passed to MSAL instance on creation.
     * For a full list of MSAL.js configuration parameters, visit:
     * https://github.com/AzureAD/microsoft-authentication-library-for-js/blob/dev/lib/msal-browser/docs/configuration.md
     */
    msalConfig: {
        auth: {
            clientId: string; // Client app id used for login
            authority: string; // Directory to use for login https://learn.microsoft.com/entra/identity-platform/msal-client-application-configuration#authority
            redirectUri: string; // Points to window.location.origin. You must register this URI on Azure Portal/App Registration.
            postLogoutRedirectUri: string; // Indicates the page to navigate after logout.
            navigateToLoginRequestUrl: boolean; // If "true", will navigate back to the original request location before processing the auth code response.
        };
        cache: {
            cacheLocation: string; // Configures cache location. "sessionStorage" is more secure, but "localStorage" gives you SSO between tabs.
            storeAuthStateInCookie: boolean; // Set this to "true" if you are having issues on IE11 or Edge
        };
    };
    loginRequest: {
        /**
         * Scopes you add here will be prompted for user consent during sign-in.
         * By default, MSAL.js will add OIDC scopes (openid, profile, email) to any login request.
         * For more information about OIDC scopes, visit:
         * https://learn.microsoft.com/entra/identity-platform/permissions-consent-overview#openid-connect-scopes
         */
        scopes: Array<string>;
    };
    tokenRequest: {
        scopes: Array<string>;
    };
}

// Fetch the auth setup JSON data from the API. It is requested once, when this
// module is first loaded, and the result is kept in the module-level `authSetup`.
async function fetchAuthSetup(): Promise<AuthSetup> {
    const response = await fetch("/auth_setup");
    if (!response.ok) {
        throw new Error(`auth setup response was not ok: ${response.status}`);
    }
    return await response.json();
}

const authSetup = await fetchAuthSetup();

export const useLogin = authSetup.useLogin;

export const requireAccessControl = authSetup.requireAccessControl;

export const enableUnauthenticatedAccess = authSetup.enableUnauthenticatedAccess;

export const requireLogin = requireAccessControl && !enableUnauthenticatedAccess;

/**
 * Configuration object to be passed to MSAL instance on creation.
 * For a full list of MSAL.js configuration parameters, visit:
 * https://github.com/AzureAD/microsoft-authentication-library-for-js/blob/dev/lib/msal-browser/docs/configuration.md
 */
export const msalConfig = authSetup.msalConfig;

/**
 * Scopes you add here will be prompted for user consent during sign-in.
 * By default, MSAL.js will add OIDC scopes (openid, profile, email) to any login request.
 * For more information about OIDC scopes, visit:
 * https://learn.microsoft.com/entra/identity-platform/permissions-consent-overview#openid-connect-scopes
 */
export const loginRequest = authSetup.loginRequest;

const tokenRequest = authSetup.tokenRequest;

// Build an absolute redirect URI using the current window's location and the relative redirect URI from auth setup
export const getRedirectUri = () => {
    return window.location.origin + authSetup.msalConfig.auth.redirectUri;
};

// Cache the app services token if it's available
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/this#global_context
declare global {
    var cachedAppServicesToken: AppServicesToken | null;
}
globalThis.cachedAppServicesToken = null;

/**
 * Retrieves an access token if the user is logged in using app services authentication.
 * Checks if the current token is expired and fetches a new token if necessary.
 * Returns null if the app doesn't support app services authentication.
 *
 * @returns {Promise<AppServicesToken | null>} A promise that resolves to an AppServicesToken if the user is authenticated, or null if authentication is not supported or fails.
*/
const getAppServicesToken = async (): Promise<AppServicesToken | null> => {
    // A token is usable while its expires_on timestamp is still in the future
    const checkNotExpired = (appServicesToken: AppServicesToken): boolean => {
        return new Date(appServicesToken.expires_on) > new Date();
    };

    const cachedToken = globalThis.cachedAppServicesToken;
    if (cachedToken && checkNotExpired(cachedToken)) {
        return cachedToken;
    }

    // Reads the current session from the token store; null when there is no session
    const getAppServicesTokenFromMe = async (): Promise<AppServicesToken | null> => {
        const response = await fetch(appServicesAuthTokenUrl);
        if (!response.ok) {
            return null;
        }
        const json = await response.json();
        if (!(json.length > 0)) {
            return null;
        }
        const session = json[0];
        const token: AppServicesToken = {
            id_token: session["id_token"] as string,
            access_token: session["access_token"] as string,
            // The store returns claims as a list of { typ, val } pairs; flatten them into a lookup
            user_claims: session["user_claims"].reduce((acc: Record<string, any>, item: Record<string, any>) => {
                acc[item.typ] = item.val;
                return acc;
            }, {}) as Record<string, any>,
            expires_on: session["expires_on"] as string
        };
        return token;
    };

    try {
        const token = await getAppServicesTokenFromMe();
        if (!token) {
            return null;
        }
        if (checkNotExpired(token)) {
            globalThis.cachedAppServicesToken = token;
            return token;
        }

        // The session token has expired: ask the token store to refresh it, then read it again
        const refreshResponse = await fetch(appServicesAuthTokenRefreshUrl);
        if (!refreshResponse.ok) {
            return null;
        }
        const refreshedToken = await getAppServicesTokenFromMe();
        if (refreshedToken && checkNotExpired(refreshedToken)) {
            // Cache the refreshed token too, so the next call does not hit the network again
            globalThis.cachedAppServicesToken = refreshedToken;
        }
        return refreshedToken;
    } catch (error: unknown) {
        // Honor the documented contract (null on failure). This function is awaited at module
        // load below, so a rejected fetch or a non-JSON body must not reject the whole module.
        console.error("Unable to retrieve the app services token.", error);
        return null;
    }
};

export const isUsingAppServicesLogin = (await getAppServicesToken()) != null;

// Sign out of app services
// Learn more at https://learn.microsoft.com/azure/app-service/configure-authentication-customize-sign-in-out#sign-out-of-a-session
export const appServicesLogout = () => {
    window.location.href = appServicesAuthLogoutUrl;
};

/**
 * Determines if the user is logged in either via the MSAL public client application or the app services login.
 * @param {IPublicClientApplication | undefined} client - The MSAL public client application instance, or undefined if not available.
 * @returns {Promise<boolean>} A promise that resolves to true if the user is logged in, false otherwise.
 */
export const checkLoggedIn = async (client: IPublicClientApplication | undefined): Promise<boolean> => {
    if (client?.getActiveAccount()) {
        return true;
    }

    const appServicesToken = await getAppServicesToken();
    return appServicesToken !== null;
};

// Get an access token for use with the API server.
// ID token received when logging in may not be used for this purpose because it has the incorrect audience
// Use the access token from app services login if available
export const getToken = async (client: IPublicClientApplication): Promise<string | undefined> => {
    const appServicesToken = await getAppServicesToken();
    if (appServicesToken) {
        return appServicesToken.access_token;
    }

    try {
        const result = await client.acquireTokenSilent({
            ...tokenRequest,
            redirectUri: getRedirectUri()
        });
        return result.accessToken;
    } catch (error: unknown) {
        // Silent acquisition failed; log it and report that no token is available
        console.log(error);
        return undefined;
    }
};

/**
 * Retrieves the username of the active account.
 * If no active account is found, attempts to retrieve the username from the app services login token if available.
 * @param {IPublicClientApplication} client - The MSAL public client application instance.
 * @returns {Promise<string | null>} The username of the active account, or null if no username is found.
 */
export const getUsername = async (client: IPublicClientApplication): Promise<string | null> => {
    const activeAccount = client.getActiveAccount();
    if (activeAccount) {
        return activeAccount.username;
    }

    const appServicesToken = await getAppServicesToken();
    if (appServicesToken?.user_claims) {
        // A missing preferred_username claim must surface as null, not undefined
        return appServicesToken.user_claims.preferred_username ?? null;
    }

    return null;
};

/**
 * Retrieves the token claims of the active account.
 * If no active account is found, attempts to retrieve the token claims from the app services login token if available.
 * @param {IPublicClientApplication} client - The MSAL public client application instance.
* @returns {Promise<Record<string, unknown> | undefined>} A promise that resolves to the token claims of the active account, the user claims from the app services login token, or undefined if no claims are found.
*/
export const getTokenClaims = async (client: IPublicClientApplication): Promise<Record<string, unknown> | undefined> => {
    // Claims of the active MSAL account take precedence when one exists.
    const activeAccount = client.getActiveAccount();
    if (activeAccount) {
        return activeAccount.idTokenClaims;
    }

    // Otherwise fall back to the claims carried by the app services login token, if any.
    const appServicesToken = await getAppServicesToken();
    if (appServicesToken) {
        return appServicesToken.user_claims;
    }

    return undefined;
};

================================================
FILE: app/frontend/src/components/AnalysisPanel/AgentPlan.tsx
================================================
import React from "react";
import { Light as SyntaxHighlighter } from "react-syntax-highlighter";
import json from "react-syntax-highlighter/dist/esm/languages/hljs/json";
import { a11yLight } from "react-syntax-highlighter/dist/esm/styles/hljs";
import { getCitationFilePath } from "../../api";
import { QueryPlanStep, getStepLabel } from "./agentPlanUtils";
import styles from "./AnalysisPanel.module.css";
import { TokenUsage, TokenUsageGraph } from "./TokenUsageGraph";

SyntaxHighlighter.registerLanguage("json", json);

const renderDetail = (step: QueryPlanStep) => { switch (step.type) { case "modelQueryPlanning": { const usage: TokenUsage = { prompt_tokens: step.input_tokens ?? 0, completion_tokens: step.output_tokens ?? 0, reasoning_tokens: 0, total_tokens: (step.input_tokens ?? 0) + (step.output_tokens ?? 0) }; return ; } case "searchIndex": { const search = step.search_index_arguments?.search ?? "—"; return ( <>
Source: {step.knowledge_source_name ?? "search index"}
Search: {search}
); } case "web": { const webSearch = step.web_arguments?.search ?? "—"; return ( <>
Source: {step.knowledge_source_name ?? "web"}
Search: {webSearch}
); } case "remoteSharePoint": { const sharepointSearch = step.remote_share_point_arguments?.search ?? "—"; return ( <>
Source: {step.knowledge_source_name ?? "SharePoint"}
Search: {sharepointSearch}
); } case "agenticReasoning": { const usage: TokenUsage = { prompt_tokens: 0, completion_tokens: step.reasoning_tokens ?? 0, reasoning_tokens: step.reasoning_tokens ?? 0, total_tokens: step.reasoning_tokens ?? 0 }; return ( <>
This step uses Azure AI Search models, so the token capacity does not affect the deployed model.
); } case "modelAnswerSynthesis": { const usage: TokenUsage = { prompt_tokens: step.input_tokens ?? 0, completion_tokens: step.output_tokens ?? 0, reasoning_tokens: 0, total_tokens: (step.input_tokens ?? 0) + (step.output_tokens ?? 0) }; return ; } default: return ( {JSON.stringify(step, null, 2)} ); } }; interface Props { queryPlan: QueryPlanStep[]; onEffortExtracted?: (effort: string | undefined) => void; onCitationClicked?: (citationFilePath: string) => void; results?: any[]; } export const AgentPlan: React.FC = ({ queryPlan, onEffortExtracted, onCitationClicked, results }) => { // Helper to get search query for a step const getStepQuery = (step: QueryPlanStep): string | undefined => { if (step.search_index_arguments?.search) return step.search_index_arguments.search; if (step.web_arguments?.search) return step.web_arguments.search; if (step.remote_share_point_arguments?.search) return step.remote_share_point_arguments.search; return undefined; }; // Helper to get results for a specific step const getResultsForStep = (step: QueryPlanStep): any[] => { if (!results || results.length === 0) return []; const stepQuery = getStepQuery(step); if (!stepQuery) return []; // Filter by both query and step type, then de-duplicate by filename const filtered = results.filter(result => result.activity?.query === stepQuery && result.type == step.type); const uniqueMap = new Map(filtered.map(r => [r.sourcepage || r.web_url || r.url, r])); return Array.from(uniqueMap.values()); }; const stepNumberLookup = React.useMemo(() => { const lookup: Record = {}; queryPlan.forEach((step, index) => { if (step != null && step.id !== undefined && step.id !== null) { lookup[String(step.id)] = index + 1; } }); return lookup; }, [queryPlan]); const iterations = React.useMemo(() => { if (!queryPlan || queryPlan.length === 0) { return [] as QueryPlanStep[][]; } const planningIndices = queryPlan.reduce((indices, step, index) => { if (step.type === "modelQueryPlanning") { indices.push(index); } 
return indices; }, []); if (planningIndices.length <= 1) { return [queryPlan]; } const iterationsList: QueryPlanStep[][] = []; const prePlanningSteps = planningIndices[0] > 0 ? queryPlan.slice(0, planningIndices[0]) : []; planningIndices.forEach((planningIndex, idx) => { const nextPlanningIndex = planningIndices[idx + 1] ?? queryPlan.length; const iterationSteps = queryPlan.slice(planningIndex, nextPlanningIndex); if (idx === 0 && prePlanningSteps.length > 0) { iterationsList.push([...prePlanningSteps, ...iterationSteps]); } else if (iterationSteps.length > 0) { iterationsList.push(iterationSteps); } }); return iterationsList; }, [queryPlan]); React.useEffect(() => { // Extract effort from first agentic reasoning step const agenticStep = queryPlan.find(step => step.type === "agenticReasoning"); const effort = agenticStep?.retrieval_reasoning_effort?.kind; if (onEffortExtracted) { onEffortExtracted(effort); } }, [queryPlan, onEffortExtracted]); if (iterations.length === 0) { return null; } return (
{iterations.map((iterationSteps, iterationIndex) => { const hasMultipleIterations = iterations.length > 1; const headerLabel = hasMultipleIterations ? `Iteration ${iterationIndex + 1} Execution steps` : "Execution steps"; return (
{headerLabel}
{iterationSteps.map(step => { const stepId = step?.id; const stepNumber = stepId !== undefined ? stepNumberLookup[String(stepId)] : undefined; const stepResults = getResultsForStep(step); return ( ); })}
Step Details Elapsed MS
{stepNumber && {`Step ${stepNumber}:`}} {getStepLabel(step)}
{renderDetail(step)} {(step.type === "searchIndex" || step.type === "remoteSharePoint" || step.type === "web") && (stepResults.length > 0 ? (
{stepResults.map((result, idx) => { // Handle different result types if (result.type === "remoteSharePoint" && result.web_url) { return ( ); } else if (result.url) { // Web result return ( ); } else if (result.sourcepage) { // Document result - make it clickable to open citation tab const path = getCitationFilePath(result.sourcepage); return ( ); } return null; })}
) : (
No results found
))}
{step.elapsed_ms ?? "—"}
); })}
); }; ================================================ FILE: app/frontend/src/components/AnalysisPanel/AnalysisPanel.module.css ================================================ .thoughtProcess { font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", monospace; word-wrap: break-word; padding-top: 0.75em; padding-bottom: 0.75em; } .tList { padding: 1.25em 1.25em 0 1.25em; display: block; /* allow it to shrink within container */ background: #e9e9e9; width: 100%; box-sizing: border-box; overflow-wrap: anywhere; /* aggressive wrapping for very long tokens/urls */ } .tListItem { list-style: none; margin: auto; margin-left: 1.25em; min-height: 3.125em; border-left: 0.0625em solid #123bb6; padding: 0 0 1.875em 1.875em; position: relative; } .tListItem:last-child { border-left: 0; } .tListItem::before { position: absolute; left: -18px; top: -5px; content: " "; border: 8px solid #d1dbfa; border-radius: 500%; background: #123bb6; height: 20px; width: 20px; } .tStep { color: #123bb6; position: relative; font-size: 0.875em; margin-bottom: 0.5em; } .tCodeBlock { max-height: 18.75em; overflow: auto; white-space: pre-wrap; /* allow wrapping */ word-break: break-word; overflow-wrap: anywhere; width: 100%; box-sizing: border-box; } .tPropRow { flex-wrap: wrap; gap: 5px; max-width: 100%; margin-bottom: 0.5em; } .tProp { display: inline-block; background-color: #d7d7d7; font-size: 0.95em; padding: 0.1875em 0.625em; border-radius: 0.625em; margin-bottom: 0.3em; word-break: break-word; white-space: normal; overflow-wrap: anywhere; } .citationImg { height: 28.125rem; max-width: 100%; object-fit: contain; } .header { color: #123bb6; position: relative; font-size: 0.875em; margin-bottom: 0.5em; } .reasoningEffort { font-size: 14px; margin-bottom: 8px; } .tokenUsageGraph { padding: 6px 6px; width: 100%; box-sizing: border-box; } .tokenBar { min-height: 28px; height: auto; margin-bottom: 8px; padding: 6px 12px; font-size: inherit; display: flex; flex-wrap: wrap; align-items: 
center;
    background-color: #d7d7d7;
    white-space: normal;
    overflow: hidden;
    word-break: break-word;
    overflow-wrap: anywhere;
    min-width: 0;
    box-sizing: border-box;
}

/* Adjust tokenLabel to allow bar-specific text color overrides */
.tokenLabel {
    font-size: inherit;
    padding-right: 4px;
    flex: 1 1 auto;
    word-break: break-word;
    overflow-wrap: anywhere;
}

/* Full-width flex row that keeps its bars on a single line (no wrapping). */
.primaryBarContainer {
    width: 100%;
    display: flex;
    gap: 8px;
    padding: 4px 0;
    flex-wrap: nowrap;
}

/* Bars inside the primary row drop their bottom margin. */
.primaryBarContainer .tokenBar {
    margin-bottom: 0;
}

/* Per-bar colour variants: each sets a background colour with white text. */
.promptBar {
    background-color: #a82424;
    color: #ffffff; /* White text for contrast */
}

.reasoningBar {
    background-color: #265e29;
    color: #ffffff;
}

.outputBar {
    background-color: #12579b;
    color: #ffffff;
    min-width: 120px;
}

.totalBar {
    background-color: #424242;
    color: #ffffff;
}

.secondaryTotalBar {
    background-color: #6d6d6d;
    color: #ffffff;
}

/* Vertical spacing between segments; the "First" variant removes the top margin. */
.segmentWrapper {
    margin-top: 16px;
    padding-top: 4px;
}

.segmentWrapperFirst {
    margin-top: 0;
}

.standaloneTotalBar {
    margin-top: 10px;
}

.groupedTotalBar {
    margin-top: 2px;
}

/* White background for the table itself and for its header cells, data cells and rows. */
.subqueriesTable,
.subqueriesTable th,
.subqueriesTable td,
.subqueriesTable tr {
    background: #fff;
}

/* Sections are separated by a top margin, except the first of its type. */
.iterationSection {
    margin-top: 1.5em;
}

.iterationSection:first-of-type {
    margin-top: 0;
}

/* Column layout, with children aligned to the start of the cross axis. */
.stepHeaderCell {
    display: flex;
    flex-direction: column;
    align-items: flex-start;
    gap: 0.25em;
}

.stepNumberText {
    font-weight: 600;
    color: #123bb6;
}

.stepLabel {
    font-weight: 600;
}

.stepCitations {
    margin-top: 0.75em;
    display: flex;
    flex-direction: column;
    gap: 0.375em;
}

.stepResults {
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    display: flex;
    flex-direction: column;
    gap: 0.25em;
}

/* Smaller, grey, italic text. */
.noResults {
    margin-top: 0.5em;
    font-size: 0.85em;
    color: #666;
    font-style: italic;
}

/* Long words and URLs are allowed to break so they do not overflow. */
.stepResult {
    display: inline-block;
    font-size: 0.85em;
    word-break: break-word;
    overflow-wrap: anywhere;
}

/* Links inside a result: rounded, light-blue background with dark-blue text, not underlined by default. */
.stepResult a {
    font-weight: 500;
    line-height: 1.5em;
    text-align: center;
    border-radius: 0.25em;
    padding: 0em 0.5em;
    background: #d1dbfa;
    color: #123bb6;
    text-decoration: none;
    cursor: pointer;
display: inline-block; } .stepResult a:hover { text-decoration: underline; } .stepCitationEntry { display: inline-flex; flex-wrap: wrap; align-items: center; gap: 0.25em; white-space: normal; word-break: break-word; overflow-wrap: anywhere; max-width: 100%; } .stepCitationEntry a { white-space: normal; word-break: break-word; overflow-wrap: anywhere; text-align: left; max-width: 100%; } ================================================ FILE: app/frontend/src/components/AnalysisPanel/AnalysisPanel.tsx ================================================ import { useMsal } from "@azure/msal-react"; import { Tab, TabList, SelectTabData, SelectTabEvent } from "@fluentui/react-components"; import { useEffect, useState } from "react"; import { useTranslation } from "react-i18next"; import { ChatAppResponse, getHeaders } from "../../api"; import { getToken, useLogin } from "../../authConfig"; import { MarkdownViewer } from "../MarkdownViewer"; import { SupportingContent } from "../SupportingContent"; import styles from "./AnalysisPanel.module.css"; import { AnalysisPanelTabs } from "./AnalysisPanelTabs"; import { ThoughtProcess } from "./ThoughtProcess"; interface Props { className: string; activeTab: AnalysisPanelTabs; onActiveTabChanged: (tab: AnalysisPanelTabs) => void; activeCitation: string | undefined; citationHeight: string; answer: ChatAppResponse; onCitationClicked?: (citationFilePath: string) => void; } export const AnalysisPanel = ({ answer, activeTab, activeCitation, citationHeight, className, onActiveTabChanged, onCitationClicked }: Props) => { const isDisabledThoughtProcessTab: boolean = !answer.context.thoughts; const dataPoints = answer.context.data_points; const hasSupportingContent = Boolean( dataPoints && ((dataPoints.text && dataPoints.text.length > 0) || (dataPoints.images && dataPoints.images.length > 0) || (dataPoints.external_results_metadata && dataPoints.external_results_metadata.length > 0)) ); const isDisabledSupportingContentTab: boolean = 
!hasSupportingContent; const isDisabledCitationTab: boolean = !activeCitation; const [citation, setCitation] = useState(""); const client = useLogin ? useMsal().instance : undefined; const { t } = useTranslation(); const fetchCitation = async () => { const token = client ? await getToken(client) : undefined; if (activeCitation) { // Get hash from the URL as it may contain #page=N // which helps browser PDF renderer jump to correct page N const originalHash = activeCitation.indexOf("#") ? activeCitation.split("#")[1] : ""; const response = await fetch(activeCitation, { method: "GET", headers: await getHeaders(token) }); const citationContent = await response.blob(); let citationObjectUrl = URL.createObjectURL(citationContent); // Add hash back to the new blob URL if (originalHash) { citationObjectUrl += "#" + originalHash; } setCitation(citationObjectUrl); } }; useEffect(() => { fetchCitation(); }, []); const renderFileViewer = () => { if (!activeCitation) { return null; } const fileExtension = activeCitation.split(".").pop()?.toLowerCase(); switch (fileExtension) { case "png": return Citation Image; case "md": return ; default: return