Repository: SPThole/CoexistAI Branch: main Commit: b9e037a8b3ec Files: 40 Total size: 10.4 MB Directory structure: gitextract_gxmiyyip/ ├── .dockerignore ├── Dockerfile ├── Dockerfile.searxng ├── LICENSE ├── README.docker.md ├── README.md ├── README_MCP.md ├── __init__.py ├── app.py ├── coexist_tutorial.ipynb ├── config/ │ └── model_config.json ├── demo_queries.ipynb ├── docker-compose.yml ├── entrypoint.sh ├── model_config.py ├── output/ │ └── map_with_route_and_pois.html ├── quick_setup.sh ├── quick_setup_docker.sh ├── requirements.txt ├── searxng/ │ ├── settings.yml │ ├── settings.yml.new │ ├── uwsgi.ini │ └── uwsgi.ini.new ├── static/ │ └── admin.html ├── system_prompt.py └── utils/ ├── __init__.py ├── answer_generation.py ├── config.py ├── crawler_utils.py ├── git_utils.py ├── knowledge_base.py ├── map.py ├── process_content.py ├── profiler_utils.py ├── reddit_utils.py ├── retriever_utils.py ├── startup_banner.py ├── tts_utils.py ├── utils.py └── websearch_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ __pycache__ *.pyc *.pyo *.pyd .pytest_cache .venv env/ infinity_env/ coexistaienv/ *.log artifacts/ output/ downloads/ ================================================ FILE: Dockerfile ================================================ FROM python:3.13-slim ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 # Build-time args that will be copied into the image as environment variables. # Users can pass these via `docker build --build-arg KEY=VALUE` to bake defaults. 
ARG LLM_MODEL_NAME=gemini-2.0-flash ARG LLM_TYPE=google ARG LLM_TEMPERATURE=0.1 ARG PORT_NUM_APP=8000 ARG PORT_NUM_SEARXNG=8085 ARG HOST_APP=0.0.0.0 ARG HOST_SEARXNG=0.0.0.0 ARG EMBED_MODE=google ARG EMBEDDING_MODEL_NAME=models/embedding-001 # Export non-secret build args as environment variables so model_config.py can read them at runtime ENV LLM_MODEL_NAME=${LLM_MODEL_NAME} ENV LLM_TYPE=${LLM_TYPE} ENV LLM_TEMPERATURE=${LLM_TEMPERATURE} ENV PORT_NUM_APP=${PORT_NUM_APP} ENV PORT_NUM_SEARXNG=${PORT_NUM_SEARXNG} ENV HOST_APP=${HOST_APP} ENV HOST_SEARXNG=${HOST_SEARXNG} ENV EMBED_MODE=${EMBED_MODE} ENV EMBEDDING_MODEL_NAME=${EMBEDDING_MODEL_NAME} # Install small set of system deps commonly needed by ML/audio packages RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ wget \ ffmpeg \ build-essential \ libsndfile1 \ && rm -rf /var/lib/apt/lists/* # Use /app as the workdir so the Dockerfile can be built from the CoexistAI folder WORKDIR /app # Copy only requirements first to leverage Docker cache (build context is the CoexistAI folder) COPY ./requirements.txt ./requirements.txt RUN python -m pip install --upgrade pip setuptools wheel # Copy application code (copy the current folder contents into /app) COPY ./ ./ # Reproduce quick_setup.sh virtualenv installs inside the image (mirrors the script) # Create a separate infinity_env and install packages there to avoid conflicts as in the script RUN python3.13 -m venv /opt/infinity_env && \ /opt/infinity_env/bin/pip install --no-cache-dir 'infinity-emb[all]' && \ /opt/infinity_env/bin/pip install --no-cache-dir 'optimum==1.27.0' && \ /opt/infinity_env/bin/pip install --no-cache-dir 'transformers<4.49' && \ /opt/infinity_env/bin/pip install --no-cache-dir --upgrade "typer==0.19.1" "click>=8.1.3" || true # Create a second venv similar to coexistaienv and install markitdown[all] RUN python3.13 -m venv /opt/coexistaienv && \ /opt/coexistaienv/bin/pip install --no-cache-dir 'markitdown[all]' || true # Now 
install the project requirements into the coexistaienv (matches quick_setup.sh order) RUN /opt/coexistaienv/bin/pip install --no-cache-dir -r requirements.txt || true # Entrypoint will be executed via shell; no need to force executable bit when host may mount files EXPOSE 8000 # Invoke the entrypoint from the copied project path. The entrypoint lives at CoexistAI/entrypoint.sh CMD ["sh", "/app/entrypoint.sh"] ================================================ FILE: Dockerfile.searxng ================================================ FROM searxng/searxng:latest # Copy custom settings COPY ./searxng/settings.yml /etc/searxng/settings.yml # Optionally copy other config files if needed # COPY ./searxng/uwsgi.ini /etc/searxng/uwsgi.ini ================================================ FILE: LICENSE ================================================ NON-COMMERCIAL RESEARCH AND EDUCATIONAL USE LICENSE Copyright (c) 2025 Sidhant Thole and CoexistAI Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use, copy, modify, and distribute the Software, subject to the following conditions: 1. **Non-Commercial Use Only** - The Software may be used, copied, modified, and distributed solely for non-commercial research, prototyping, and educational purposes. - Commercial use, including but not limited to use in a product, service, or offering for which a fee is charged or which is used in the operation of a business, is strictly prohibited without the express prior written consent of the copyright holders. 2. **No Redistribution for Commercial Purposes** - Redistribution of the Software or any derivative works for commercial purposes is not permitted. - Integration of the Software into commercial products or services is not permitted without explicit written permission. 3. 
**Attribution** - Any use, copy, or distribution of the Software must retain this license notice, copyright notice, and all disclaimers. 4. **No Warranty** - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 5. **Compliance** - You are responsible for ensuring that your use of the Software complies with all applicable laws, regulations, and terms of service of any data sources or third-party services accessed through the Software. 6. **Contact for Commercial Licensing** - For commercial licensing or other use not permitted by this license, please contact the maintainers at: [GitHub Issues or project contact email]. By using the Software, you agree to be bound by the terms of this license. ================================================ FILE: README.docker.md ================================================ # CoexistAI — Docker Quickstart ### Short, step-by-step instructions for two ways to start CoexistAI. Pick either Method A (helper script) or Method B (direct Docker Compose). ## Prerequisites - Docker Engine installed. ## Before you start (one-time) 1. Open a terminal and change into the repository folder: ```bash cd /path/to/CoexistAI ``` 2. Edit the .env file for keys and admin token (which will be used while editing model params): ## Method A — Helper script (recommended for beginners) This script automates the compose start and waits until the app reports ready. 1. 
Run the helper (from repo root): ```bash ./quick_setup_docker.sh ``` or ```bash # default timeout 300s ./quick_setup_docker.sh 600 # pass timeout in seconds (example: 600s = 10min) ``` For subsequent starts, run in similar way (it detects the existing image and skips building/installing). 2. What the script does (so you know what to expect): - Checks if the Docker image 'coexistai-app' already exists; if yes, runs `docker compose up -d` (no build); if not, runs `docker compose up -d --build` to start containers detached. - Polls `http://localhost:8000/status` every few seconds and prints a spinner. - Exits with code 0 when the app reports `{"status":"ready"}`. - Exits non-zero if the app reports `error` or the timeout is reached. 3. After the script finishes successfully, open: - http://localhost:8000/admin ![Admin ui](./artifacts/admin_ui.png) - If using local models ignore api_keys fields - By default ADMIN_TOKEN=123456, you can change it via .env This opens the Admin UI, where you can edit model configurations, API keys, and reload settings without rebuilding the container. When to use Method A: you're new to Docker or want a simple way to wait until the app is ready. ## Method B — Direct Docker Compose (fast, manual) 1. Start the stack: - **First time** (builds the image): ```bash docker compose up -d --build ``` - **Subsequent times** (uses existing image): ```bash docker compose up -d ``` To stop: `docker compose down` To restart: `docker compose restart` 2. Wait for ready signal in terminal where you ran docker compose, then open the admin UI: - http://localhost:8000/admin 3. Verify status from the host: ```bash curl http://localhost:8000/status # expected JSON: {"status":"starting"} or {"status":"ready"} ``` 4. Edit configuration: - Use the Admin UI `/admin` and click "Save & Reload" to apply changes without rebuilding. 
- Or from the host (curl): ```bash curl -X POST -H "X-Admin-Token: $ADMIN_TOKEN" http://localhost:8000/admin/reload-config ``` When to use Method B: you prefer to run compose directly and watch logs yourself. Secrets (recommended pattern) - Do not store API keys in the repo. Use `.env` or file-backed secrets. - Recommended: create `CoexistAI/config/keys/` on the host, place key files there, and mount that folder into the container. Reference them in `config/model_config.json` with `llm_api_key_file` / `embed_api_key_file`. Quick troubleshooting - App unreachable? Check app logs: ```bash docker compose logs app --tail=200 ``` - App timed out in `quick_setup_docker.sh` or reports `error`? Inspect logs and increase timeout: ```bash docker compose logs app --tail=400 ./quick_setup_docker.sh 600 ``` - Long model downloads or HF errors: allow more time on first start or mount `artifacts/` (HF cache) into the container to avoid repeated downloads. Helpful commands ```bash # Check status curl http://localhost:8000/status # Ask app to reload config (from host) curl -X POST -H "X-Admin-Token: $ADMIN_TOKEN" http://localhost:8000/admin/reload-config # Follow logs interactively docker compose logs -f app --tail=200 ``` ================================================ FILE: README.md ================================================ # CoexistAI CoexistAI is a modular, developer-friendly research assistant framework. It enables you to build, search, summarize, and automate research workflows using LLMs, web search, Reddit, YouTube, git and mapping tools—all with simple API calls or Python functions.

CoexistAI Logo

## 🎙️ New Features & Updates - 🔥 _Docker Installation available (Thanks for all the feedback, hope this makes installations easy)_. For a containerized setup with Docker, follow the instructions in [README.docker.md](README.docker.md). - **Text → Podcast**: Instantly turn written content into engaging podcast episodes—ideal for on-the-go listening or repurposing articles/notes/blogs. Example: Converted [this article](https://www.theatlantic.com/newsletters/archive/2025/08/ai-high-school-college/684057/) to a podcast. **[Listen here](output/podcasts/podcast_58fc33d6.wav)** - **Text → Speech**: Convert text to high-quality audio using advanced TTS. Check [Notebook](coexist_tutorial.ipynb) for examples. - **Flexible Integration**: Generate audio files via FastAPI or MCP—integrate with agents or use standalone. - **Direct Location Search**: Search for any place, not just routes. - **Advanced Reddit Search**: Custom phrases with BM25 ranking for sharper discovery. - **YouTube Power-Up**: Search/summarize videos or URLs with custom prompts. - **File/Folder Exploration**: Explore local folders/files with vision support for images (.png, .jpg, etc.). - **Sharper Web Search**: More focused, actionable results. - **MCP Support Everywhere**: Full integration with LM Studio and other MCP hosts. [See Guide](README_MCP.md) - **GitHub & Local Repo Explorer**: Ask questions about codebases (GitHub or local). ## 🚀 Features - **Web Explorer**: Query the web, summarize results, and extract context using LLMs. - **Reddit Explorer**: Fetch and summarize reddit via search phrase or subreddit focused queries - **YouTube Transcript Explorer**: Search youtube with search phrases and summarise/QA any video - **Map Explorer**: Generate maps, explore routes, locations with points of interest like hotels, cafes near given locations. 
- **Github Explorer**: Explore/summarise/explain/QA any github or even local git codebases - **Pluggable LLMs and Embedders**: Use any LLMs Google Gemini, OpenAI, Ollama, and any embedders - **Async & Parallel**: Fast, scalable, and robust asynchronous execution. - **Notebook & API Ready**: Use as a Python library or via a FAST API. - **MCP ready**: Spins up the MCP server on the fly along with FAST API server --- ## 🛠️ Installation **Prerequisite:** Make sure Docker is installed and the Docker daemon is running. ### Method 1: Docker (Recommended) New 🔥 For a containerized setup with Docker, follow the instructions in [README.docker.md](README.docker.md). This method uses Method A (helper script) to automate the process and provides an Admin UI for easy configuration. ### Method 2: Local Setup 1. **Clone the repository:** ```sh git clone https://github.com/SPThole/CoexistAI.git coexistai cd coexistai ``` 2. **Configure your model and embedding settings:** - [NEW] Edit `config/model_config.json` to set your preferred LLM and embedding model. - Edit above file to set your preferred SearxNG host and port (if needed) - Add LLM and Embedder API Key (for google mode both would be same) - Example (for full local mode): ```json { "llm_model_name": "jan-nano", "llm_type": "local", // based on baseurl dict given below "embed_mode": "infinity_emb", "embedding_model_name": "nomic-ai/nomic-embed-text-v1", "llm_kwargs": { "temperature": 0.1, "max_tokens": null, "timeout": null, "max_retries": 2 }, "embed_kwargs": {}, "llm_api_key": "dummy", "HOST_APP": "localhost", "PORT_NUM_APP": 8000, "HOST_SEARXNG": "localhost", "PORT_NUM_SEARXNG": 8080, "openai_compatible": { "google": "https://generativelanguage.googleapis.com/v1beta/openai/", "local": "http://localhost:1234/v1", "groq": "https://api.groq.com/openai/v1", "openai": "https://api.openai.com/v1", "others": "https://openrouter.ai/api/v1" } } ``` - See the file for all available options and defaults. 
- If you are using the "others" llm type, then check the openai_compatible url dict for the others key; you can generally find it by "googling YOUR provider name openai api base compatible url" 3. **Run the setup script:** - For macOS or Linux with zsh: ```sh zsh quick_setup.sh ``` - For Linux with bash: ```sh bash quick_setup.sh ``` > The script will: > - Pull the SearxNG Docker image > - Create and activate a Python virtual environment > - **USER ACTION NEEDED** Set your `GOOGLE_API_KEY` (edit the script to use your real key). [Obtain your API key (Currently Gemini, OpenAI and ollama is supported)](https://ai.google.dev/gemini-api/docs/api-key) from your preferred LLM provider. (Only needed when google mode is set, else set in model_config.py) > - Start the SearxNG Docker container > - Install Python dependencies > - Start the FastAPI server 4. **That’s it!** The FastAPI and MCP server will start automatically and you’re ready to go. **Note:** - Make sure Docker, Python 3, and pip are installed on your system. - Edit quick_setup.sh to set your real `GOOGLE_API_KEY` before running (needed if using google models) - Windows users can use [WSL](https://docs.microsoft.com/en-us/windows/wsl/) or Git Bash to run the script, or follow manual setup steps. --- ### Get Your API Key (optional if you want to use gemini llm/google embedders) [Obtain your API key (Currently Gemini, OpenAI and ollama is supported)](https://ai.google.dev/gemini-api/docs/api-key) from your preferred LLM provider. Once you have the key, update the `app.py` file or your environment variables as follows: ```python import os os.environ['GOOGLE_API_KEY'] = "YOUR_API_KEY" ``` Alternatively, you can set the API key in your shell before starting the server: ```bash export YOUR_LLM_API_KEY=your-api-key-here ``` > **Note:** For optimal quality and speed, use Google models with `embedding-001` embeddings and Gemini Flash models. They provide free API keys.
Update the place (default: India) in utils/config.py for personalized results ## 🔧 How to use FASTAPI/tools **Remove comments after // before pasting** Swagger UI: http://127.0.0.1:8000/docs if you haven't changed the host and port ### 1. Web Search **Search the web, summarize, and get actionable answers—automatically.** **Endpoint:** POST `/web-search` **Request Example:** ```json { "query": "Top news of today worldwide", // Query you want to ask; if you provide a URL and ask to summarise, it will summarize the full page. "rerank": true, // Set to true for better result ranking. "num_results": 2, // Number of top results per subquery to explore (higher values = more tokens, slower/more costly). "local_mode": false, // Set to true to explore local documents (currently, only PDF supported). "split": true, // Set to false if you want full pages as input to LLMs; false may cause slower/more costly response. "document_paths": [] // If local_mode is true, add a list of document paths, e.g., ["documents/1706.03762v7.pdf"] } ``` or QA/sumamrise local documents ```json { "query": "Summarise this research paper", "rerank": true, "num_results": 3, "local_mode": true, "split": true, "document_paths": ["documents/1706.03762v7.pdf"] // Must be a list. } ``` --- ### 2. Summarize Any Web Page **Summarize any article or research paper by URL.** **Endpoint:** POST `/web-summarize` **Request Example:** ```json { "query": "Write a short blog on the model", // Instruction or question for the fetched page content. "url": "https://huggingface.co/unsloth/Qwen3-8B-GGUF", // Webpage to fetch content from. "local_mode": false // Set to true if summarizing a local document. } ``` --- ### 3. YouTube Search **Search YouTube (supports prompts and batch).** **Endpoint:** POST `/youtube-search` **Request Example:** ```json { "query": "switzerland itinerary", // Query to search on YouTube; if a URL is provided, it fetches content from that URL. 
url should be in format: https://www.youtube.com/watch?v=videoID "prompt": "I want to plan my Switzerland trip", // Instruction or question for using the fetched content. "n": 2 // Number of top search results to summarize (only works if query is not a URL). } ``` --- ### 4. Reddit Deep Dive **Custom Reddit search, sort, filter, and get top comments.** **Endpoint:** POST `/reddit-search` **Request Example:** ```json { "subreddit": "", // Subreddit to fetch content from (use if url_type is not 'search'). "url_type": "search", // 'search' for phrase search; "url" for url, otherwise, use 'hot', 'top', 'best', etc. "n": 3, // Number of posts to fetch. "k": 1, // Number of top comments per post. "custom_url": "", // Use if you already have a specific Reddit URL. "time_filter": "all", // Time range: 'all', 'today', 'week', 'month', 'year'. "search_query": "gemma 3n reviews", // Search phrase (useful if url_type is 'search'). "sort_type": "relevance" // 'top', 'hot', 'new', 'relevance' — controls how results are sorted. } ``` --- ### 5. Map & Location/Route Search **Find places, routes, and nearby points of interest.** **Endpoint:** POST `/map-search` **Request Example:** ```json { "start_location": "MG Road, Bangalore", // Starting point. "end_location": "Lalbagh, Bangalore", // Destination. "pois_radius": 500, // Search radius in meters for amenities. "amenities": "restaurant|cafe|bar|hotel", // Amenities to search near start or end location. "limit": 3, // Maximum number of results if address not found exactly. "task": "route_and_pois" // Use 'location_only' for address/coordinates only, or 'route_and_pois' for routes and POIs. } ``` OR search for any single location (open street map has api rate limit) ```json { "start_location": "MG Road, Bangalore", "end_location": "Lalbagh, Bangalore", "pois_radius": 500, "amenities": "restaurant|cafe|bar|hotel", "limit": 3, "task": "location_only" } ``` --- ### 6. 
GitHub & Local Repo Directory Tree **Get the directory structure of any GitHub or local repo.** **Endpoint:** POST `/git-tree-search` **Request Example:** ```json { "repobaseurl": "https://github.com/SPThole/CoexistAI/" // Base URL of the repository to explore. } ``` or for local repo: ```json { "repobaseurl": "/home/user/projects/myrepo" } ``` --- ### 7. Ask Questions or Search Inside GitHub/Local Code **Fetch, search, and analyze code in any repo.** **Endpoint:** POST `/git-search` **Request Example:** ```json { "repobaseurl": "https://github.com/google-deepmind/gemma", // Base URL of the repository. "parttoresearch": "research/t5gemma/t5gemma.py", // Folder or file path relative to the base URL. "query": "explain t5gemma", // Instruction or question to answer from the file/folder. "type": "file" // Either 'file' or 'folder'. } ``` or: ```json { "repobaseurl": "https://github.com/openai", "parttoresearch": "openai-cookbook/examples/mcp", "query": "Write a medium blog, for beginners", "type": "folder" } ``` --- ## 🧑‍💻 Usage in Python (use method 2 install else use requests to hit fastapi endpoints) - [see example notebook](coexist_tutorial.ipynb) - [Example Usage patterns](demo_queries.ipynb) ```python from utils.websearch_utils import query_web_response from utils.reddit_utils import reddit_reader_response # Web Exploration result = await query_web_response( query="latest AI research in the last 7 days", date="2025-07-08", day="Tuesday", websearcher=searcher, #Searxng hf_embeddings=hf_embeddings,#embedder rerank=True, cross_encoder=cross_encoder,#reranker model=llmgoogle, #replace with llm text_model=llmgoogle,#replace with llm num_results=1,#topk results for each subquery document_paths=[], local_mode=False, # True if you have local files in document_paths split=True ) result = await query_web_response( query="summarise in the form of linkedin post https://modelcontextprotocol.io/introduction", date="2025-07-08", day="Tuesday", websearcher=searcher, #Searxng 
hf_embeddings=hf_embeddings,#embedder rerank=True, cross_encoder=cross_encoder,#reranker model=llmgoogle, #replace with llm text_model=llmgoogle,#replace with llm num_results=1,#topk results for each subquery document_paths=[], local_mode=False, # True if you have local files in document_paths split=True ) ## Reddit Exploration summary = reddit_reader_response( subreddit="", url_type="search", n=5, k=2, custom_url=None,# Replace with llm time_filter="month", search_query="Gemma 3N reviews", sort_type="relevance", model=llmgoogle ) ## Map Exploration from utils.map import generate_map # Generate a map with route and POIs html_path = generate_map("MG Road, Bangalore", "Indiranagar, Bangalore", 500, "hotel", 3) locations = generate_map("MG Road, Bangalore", "Indiranagar, Bangalore", 500, "", 3,"location_only") ## Youtube Exploration from utils.websearch_utils import * learnings = youtube_transcript_response("https://www.youtube.com/watch?v=DB9mjd-65gw", "Summarise this podcast and share me top learnings as a data scientist", llmgoogle) podcast = youtube_transcript_response("History of India top 5 interesting facts", "Make a podcast of this in Hindi, 5 minutes long", llmgoogle, 1) ## Git exploration from utils.git_utils import * tree = await git_tree_search("https://github.com/SPThole/CoexistAI") content = await git_specific_content("https://github.com/SPThole/CoexistAI","README.md","file") ``` --- ## 🤖 Advanced Patterns & Extensibility - **Plug in your own LLMs**: Swap out Google Gemini for OpenAI, Ollama, or any LangChain-supported model. - **Custom Tools**: Add your own tools to the agent system for new capabilities (see `utils/` for examples). - **Async/Parallel**: All web and document search utilities are asynchronous for high performance. - **MCP Servers**: Connect your local apps like lmstudio with coexistAI MCP server, all local --- ## 🤝 Contributing Pull requests, issues, and feature suggestions are welcome! Please open an issue or PR on GitHub. 
--- ## ⚖️ Legality & Responsible Use **Non-Commercial Use Only:** CoexistAI is intended strictly for research, prototyping, and educational purposes. Commercial or production use of this project or its outputs is **not permitted**. **Web and Reddit Data:** CoexistAI uses public web scraping and Reddit JSON endpoints. It does not use the official Reddit API. Always respect robots.txt, site terms, and copyright law when using this tool. **YouTube & Other Sources:** Use responsibly and in accordance with the terms of service of each data provider. **Compliance:** You are responsible for ensuring your use of this tool complies with all relevant terms, conditions, and laws. --- ## 📄 License This project is licensed under a custom Non-Commercial Research and Educational Use License. Use of this software is permitted only for non-commercial research, prototyping, and educational purposes. Commercial or production use is strictly prohibited. See the LICENSE file for full terms and conditions. --- ## ⭐ Star & Share If you find this project useful, please star the repo and share it with your network! --- ## Acknowledgement: Special thanks to users like @[TotallyTofu](https://github.com/TotallyTofu) for their valuable feedback. ## 📬 Contact For questions, reach out via GitHub Issues or open a discussion. ================================================ FILE: README_MCP.md ================================================ # CoexistAI v0.0.2

CoexistAI MCP Logo

## 🚀 What's New in v2: [Example Usage patterns](demo_queries.ipynb) - **Direct location search:** You can now search for any place, not just find routes! - **Advanced Reddit search:** Use your own phrases to search across reddit; results ranked better with BM25 for sharper discovery. - **YouTube power-up:** Search and summarize YouTube using your own search phrases or video URLs and even add a prompt for custom responses. - **Explore to your folders/files**: Explore local folders and files with extended support to diverse files including (vision integrated)'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.tiff', '.svg', etc, more to come.. - **Sharper web search:** More focused and actionable results than ever before. - **MCP support everywhere:** Now fully connect coexistai to LM Studio and other MCP hosts—seamless integration! [See Guide](README_MCP.md) - **GitHub & local repo explorer:** Explore ask questions about codebases - works with both GitHub and local repos! --- ## 🛠 Quick Start ### Method (Less flexible but faster): **Prerequisite:** Make sure Docker is installed and the Docker daemon is running. 1. **Clone the repository:** ```sh git clone https://github.com/SPThole/CoexistAI.git coexistai cd coexistai ``` 2. **Configure your model and embedding settings:** - Edit `model_config.py` to set your preferred LLM and embedding model. - Edit above file to set your preferred SearxNG host and port (if needed) - Add LLM and Embedder API Key (for google mode both would be same) - Example (for full local mode): ```py model_config = { # Name of the LLM model to use. For local models, use the model name served by your local server. "llm_model_name": "google/gemma-3-12b", # LLM provider type: choose from 'google', 'local', 'groq', or 'openai' or 'others' # in case of 'others' (base url needs to be updated in openai_compatible given below accordingly). # Make sure to update the api_key variable above to match the provider. 
"llm_type": "local", # List of tools or plugins to use with the LLM, if any. Set to None if not used. "llm_tools": None, # Additional keyword arguments for LLM initialization. "llm_kwargs": { "temperature": 0.1, # Sampling temperature for generation. "max_tokens": None, # Maximum number of tokens to generate (None for default). "timeout": None, # Timeout for API requests (None for default). "max_retries": 2, # Maximum number of retries for failed requests. "api_key": llm_api_key, # API key for authentication. }, # Name of the embedding model to use. # For Google, use their embedding model names. For local/HuggingFace, use the model path or name. "embedding_model_name": "nomic-ai/nomic-embed-text-v1", "embed_kwargs":{}, #additional kwargs for embedding model initialization # Embedding backend: 'google' for Google, 'infinity_emb' for local/HuggingFace models. "embed_mode": "infinity_emb", # Name of the cross-encoder model for reranking, typically a HuggingFace model. "cross_encoder_name": "BAAI/bge-reranker-base" } ``` - See the file for all available options and defaults. - If you using others llm type, then check the openai_compatible url dict for others key, you can generally find it by "googling YOUR provider name openai api base compatilble url" 3. **Run the setup script:** - For macOS or Linux with zsh: ```sh zsh quick_setup.sh ``` - For Linux with bash: ```sh bash quick_setup.sh ``` > The script will: > - Pull the SearxNG Docker image > - Create and activate a Python virtual environment > - **USER ACTION NEEDED** Set your `GOOGLE_API_KEY` (edit the script to use your real key). [Obtain your API key (Currently Gemini, OpenAI and ollama is supported)](https://ai.google.dev/gemini-api/docs/api-key) from your preferred LLM provider. (Only needed when google mode is set, else set in model_config.py) > - Start the SearxNG Docker container > - Install Python dependencies > - Start the FastAPI server 4. 
**That’s it!** The FastAPI and MCP server will start automatically and you’re ready to go. **Note:** - Make sure Docker, Python 3, and pip are installed on your system. - Edit quick_setup.sh to set your real `GOOGLE_API_KEY` before running (needed if using google models) - Windows users can use [WSL](https://docs.microsoft.com/en-us/windows/wsl/) or Git Bash to run the script, or follow manual setup steps. ## 🔍 What Can You Do? (API Highlights & Examples) **Remove comments after // before pasting** Swagger UI: http://127.0.0.1:8000/docs if you haven't changed the host and port ### 1. Web Search **Search the web, summarize, and get actionable answers—automatically.** **Endpoint:** POST `/web-search` **Request Example:** ```json { "query": "Top news of today worldwide", // Query you want to ask; if you provide a URL and ask to summarise, it will summarize the full page. "rerank": true, // Set to true for better result ranking. "num_results": 2, // Number of top results per subquery to explore (higher values = more tokens, slower/more costly). "local_mode": false, // Set to true to explore local documents (currently, only PDF supported). "split": true, // Set to false if you want full pages as input to LLMs; false may cause slower/more costly response. "document_paths": [] // If local_mode is true, add a list of document paths, e.g., ["documents/1706.03762v7.pdf"] } ``` or QA/sumamrise local documents ```json { "query": "Summarise this research paper", "rerank": true, "num_results": 3, "local_mode": true, "split": true, "document_paths": ["documents/1706.03762v7.pdf"] // Must be list. } ``` --- ### 2. Summarize Any Web Page **Summarize any article or research paper by URL.** **Endpoint:** POST `/web-summarize` **Request Example:** ```json { "query": "Write a short blog on the model", // Instruction or question for the fetched page content. "url": "https://huggingface.co/unsloth/Qwen3-8B-GGUF", // Webpage to fetch content from. 
"local_mode": false // Set to true if summarizing a local document. } ``` --- ### 3. YouTube Search **Search YouTube (supports prompts and batch).** **Endpoint:** POST `/youtube-search` **Request Example:** ```json { "query": "switzerland itinerary", // Query to search on YouTube; if a URL is provided, it fetches content from that URL. url should be in format: https://www.youtube.com/watch?v=videoID "prompt": "I want to plan my Switzerland trip", // Instruction or question for using the fetched content. "n": 2 // Number of top search results to summarize (only works if query is not a URL). } ``` --- ### 4. Reddit Deep Dive **Custom Reddit search, sort, filter, and get top comments.** **Endpoint:** POST `/reddit-search` **Request Example:** ```json { "subreddit": "", // Subreddit to fetch content from (use if url_type is not 'search'). "url_type": "search", // 'search' for phrase search; "url" for url, otherwise, use 'hot', 'top', 'best', etc. "n": 3, // Number of posts to fetch. "k": 1, // Number of top comments per post. "custom_url": "", // Use if you already have a specific Reddit URL. "time_filter": "all", // Time range: 'all', 'today', 'week', 'month', 'year'. "search_query": "gemma 3n reviews", // Search phrase (useful if url_type is 'search'). "sort_type": "relevance" // 'top', 'hot', 'new', 'relevance' — controls how results are sorted. } ``` --- ### 5. Map & Location/Route Search **Find places, routes, and nearby points of interest.** **Endpoint:** POST `/map-search` **Request Example:** ```json { "start_location": "MG Road, Bangalore", // Starting point. "end_location": "Lalbagh, Bangalore", // Destination. "pois_radius": 500, // Search radius in meters for amenities. "amenities": "restaurant|cafe|bar|hotel", // Amenities to search near start or end location. "limit": 3, // Maximum number of results if address not found exactly. "task": "route_and_pois" // Use 'location_only' for address/coordinates only, or 'route_and_pois' for routes and POIs. 
}
```
OR search for any single location (open street map has api rate limit)
```json
{
    "start_location": "MG Road, Bangalore",
    "end_location": "Lalbagh, Bangalore",
    "pois_radius": 500,
    "amenities": "restaurant|cafe|bar|hotel",
    "limit": 3,
    "task": "location_only"
}
```
---
### 6. GitHub & Local Repo Directory Tree
**Get the directory structure of any GitHub or local repo.**

**Endpoint:** POST `/git-tree-search`

**Request Example:**
```json
{
    "repobaseurl": "https://github.com/SPThole/CoexistAI/" // Base URL of the repository to explore.
}
```
or for local repo:
```json
{
    "repobaseurl": "/home/user/projects/myrepo"
}
```
---
## 🧑‍💻 Integrate coexistai as an MCP Server (LM Studio, Cursor, etc.)

Starting with LM Studio 0.3.17, LM Studio acts as a Model Context Protocol (MCP) Host. This means you can connect MCP servers to the app and make them available to your models.

You can now run coexistai as an MCP server—**plug it into LM Studio** or any other MCP-compatible tool!

### How to Integrate with LM Studio
1. Download the latest LM Studio (> 0.3.17) (https://lmstudio.ai/docs/app)
2. Find the [MCP guide](https://lmstudio.ai/docs/app/plugins/mcp)

CoexistAI MCP Logo

1. **Edit your `mcp.json` in LM Studio:** - Go to the Program tab → `Install > Edit mcp.json` - Add coexistai as a server. Example: ```json { "mcpServers": { "coexistai": { "url": "http://127.0.0.1:8000/mcp", "timeout": 180000 } } } ``` - Replace with your actual server address and token (if needed). 2. **Or use an "Add to LM Studio" button** (if provided on coexistai website). **Security note:** Only use MCP servers you trust—servers can access files/network. 3. Use [system prompt](system_prompt.py) as context (system prompt) in lmstudio --- ## 🏆 Best Local Model For fastest, highest-quality local LLM results, I have personally liked following: **unsloth/Qwen3-8B-GGUF** and **google/gemma-3-12b** as lmstudio model for model behind MCP, prefer the model which is faster and yet good at structural output generation I am working towards making system work with local smaller models which are not so great. --- ================================================ FILE: __init__.py ================================================ ================================================ FILE: app.py ================================================ from utils.websearch_utils import * from utils.reddit_utils import * from utils.map import * from fastapi import FastAPI, Request from pydantic import BaseModel from utils.utils import * from utils.map import * from utils.git_utils import * from utils.startup_banner import display_startup_banner, display_shutdown_banner, get_ascii_banner from utils.knowledge_base import create_knowledge_base from utils.crawler_utils import crawl_and_create_kb import html as _html from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import FileResponse, HTMLResponse from fastapi.staticfiles import StaticFiles from uuid import uuid4 import subprocess from utils.tts_utils import * from fastapi_mcp import FastApiMCP import json import os import atexit from model_config import * import time from typing import List, Optional, Union from 
utils.knowledge_base import create_knowledge_base # Application state for startup/reload notifications app_state = {"status": "starting", "message": "Initializing components..."} def init_components(): """(Re)initialize model and embedding components from model_config. This is safe to call at startup or after config reload. It updates module-level globals used by request handlers. """ global llm, hf_embeddings, cross_encoder, text_splitter, searcher, date, day, llm_model_name, llm_type, llm_kwargs, embedding_model_name, embed_mode, cross_encoder_name app_state['status'] = 'starting' app_state['message'] = 'Loading models and embeddings (this may take a minute)...' print("=== CoexistAI Startup: Loading models and embeddings ===", flush=True) try: # Read config values print("Reading configuration from model_config...", flush=True) llm_model_name = model_config.get("llm_model_name", llm_model_name if 'llm_model_name' in globals() else 'google/gemma-3-12b') llm_type = model_config.get("llm_type", llm_type if 'llm_type' in globals() else 'local') llm_kwargs = model_config.get("llm_kwargs", llm_kwargs if 'llm_kwargs' in globals() else {'temperature':0.1,'api_key': llm_api_key}) embedding_model_name = model_config.get("embedding_model_name", embedding_model_name if 'embedding_model_name' in globals() else 'models/embedding-001') embed_mode = model_config.get("embed_mode", embed_mode if 'embed_mode' in globals() else 'google') cross_encoder_name = model_config.get("cross_encoder_name", cross_encoder_name if 'cross_encoder_name' in globals() else 'BAAI/bge-reranker-base') print(f"Config loaded: llm_type={llm_type}, llm_model={llm_model_name}, embed_mode={embed_mode}", flush=True) # instantiate generative LLM print(f"Initializing LLM: {llm_model_name} ({llm_type})...", flush=True) llm = get_generative_model( model_name=llm_model_name, type=llm_type, base_url=openai_compatible.get(llm_type, 'https://api.openai.com/v1'), _tools=None, kwargs=llm_kwargs ) print("LLM initialized 
successfully", flush=True) # load embeddings and cross-encoder print(f"Loading embeddings: {embedding_model_name} (mode={embed_mode})...", flush=True) hf_embeddings, cross_encoder = load_model(embedding_model_name, _embed_mode=embed_mode, cross_encoder_name=cross_encoder_name, kwargs=model_config.get('embed_kwargs', {})) print("Embeddings and cross-encoder loaded successfully", flush=True) print("Initializing text splitter...", flush=True) text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=32) # recreate searxng searcher print(f"Initializing SearchWeb with {HOST_SEARXNG}:{PORT_NUM_SEARXNG}...", flush=True) searcher = SearchWeb(PORT_NUM_SEARXNG, HOST_SEARXNG) print("Getting local date and time...", flush=True) date, day = get_local_data() app_state['status'] = 'ready' app_state['message'] = 'Ready' print("=== CoexistAI Startup Complete: All components ready ===", flush=True) except Exception as e: app_state['status'] = 'error' app_state['message'] = f'Initialization failed: {e}' # keep exception visible in logs print(f"=== CoexistAI Startup FAILED: {e} ===", flush=True) logger.exception('Failed to initialize components') raise # Initialize components once at import/startup # (This is now done in the lifespan startup handler) # try: # init_components() # except Exception as e: # # already logged; keep going so admin endpoints can be used to diagnose/fix # logger.error(f'Failed to initialize at startup: {e}') # # Update status to show startup failed but app is running for diagnostics # app_state['status'] = 'error' # app_state['message'] = f'Startup failed: {e}' # Use config values for model and embedding paths llm_model_name = model_config.get("llm_model_name", 'google/gemma-3-12b') llm_type = model_config.get("llm_type", 'local') llm_tools = model_config.get("llm_tools",None) llm_base_url = openai_compatible.get(model_config['llm_type'], 'https://api.openai.com/v1') llm_kwargs = model_config.get("llm_kwargs", {'temperature': 0.1, 'max_tokens': None, 
'timeout': None, 'api_key':llm_api_key, 'max_retries': 2}) embed_kwargs = model_config.get("embed_kwargs", {}) embedding_model_name = model_config.get("embedding_model_name", "models/embedding-001") embed_mode = model_config.get("embed_mode", "google") cross_encoder_name = model_config.get("cross_encoder_name", "BAAI/bge-reranker-base") if not is_searxng_running(): # Running `docker` from inside a container is not supported in most environments # (docker binary may not exist or there are permission restrictions). Instead, # log a clear warning and let orchestration (docker-compose / external admin) # manage the searxng service. try: logger.warning(f"SearxNG not reachable at {HOST_SEARXNG}:{PORT_NUM_SEARXNG}. Please start the searxng service (e.g. `docker compose up searxng`) or ensure it's reachable from this container.") except Exception: print(f"SearxNG not reachable at {HOST_SEARXNG}:{PORT_NUM_SEARXNG}. Please start searxng service.") else: try: logger.info("SearxNG is reachable.") except Exception: print("SearxNG docker container is already running.") llm = get_generative_model( model_name=llm_model_name, type=llm_type, base_url=llm_base_url, _tools=None, kwargs=llm_kwargs ) hf_embeddings, cross_encoder = load_model(embedding_model_name, _embed_mode=embed_mode, cross_encoder_name=cross_encoder_name, kwargs=embed_kwargs) text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128) searcher = SearchWeb(PORT_NUM_SEARXNG, HOST_SEARXNG) date, day = get_local_data() # Lifespan context manager for startup/shutdown from contextlib import asynccontextmanager @asynccontextmanager async def lifespan(app_instance): # Startup print("\n" + "="*80, flush=True) print("FastAPI app starting up...", flush=True) logger.info("FastAPI app starting up...") app_state['status'] = 'starting' app_state['message'] = 'Initializing components...' 
app = FastAPI(title='coexistai', lifespan=lifespan)

# Mount static files
app.mount("/artifacts", StaticFiles(directory="artifacts"), name="artifacts")

# --- Admin endpoints for runtime config reload/update ---------------------------------
from fastapi import HTTPException, Depends

def _check_admin_token(token: str = None):
    """Validate the supplied admin token against the ADMIN_TOKEN env var.

    Raises:
        HTTPException 403: no ADMIN_TOKEN configured (admin actions are
            disabled by default to avoid accidental exposure).
        HTTPException 401: token missing or wrong.
    """
    import secrets  # local import: only needed here for constant-time comparison
    # token supplied via header X-Admin-Token or env ADMIN_TOKEN
    env_token = os.environ.get('ADMIN_TOKEN')
    if env_token is None:
        # no admin token configured; disallow by default to avoid accidental exposure
        raise HTTPException(status_code=403, detail='Admin actions disabled (no ADMIN_TOKEN set)')
    # constant-time comparison so the token cannot be recovered via timing
    if token is None or not secrets.compare_digest(str(token), env_token):
        raise HTTPException(status_code=401, detail='Invalid admin token')
    return True

@app.post('/admin/reload-config')
async def admin_reload_config(request: Request):
    """Reload model config from the configured JSON file.

    Protected by ADMIN_TOKEN env var. Send header 'X-Admin-Token: <token>'
    to authenticate. Returns the reloaded config on success.
    """
    token = request.headers.get('X-Admin-Token')
    # _check_admin_token raises HTTPException itself; no need to catch/re-raise
    _check_admin_token(token)
    try:
        new_cfg = reload_model_config()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Failed to reload config: {e}')
    # apply config immediately
    try:
        init_components()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Config reloaded but applying failed: {e}')
    return {"status": "ok", "model_config": new_cfg, "app_state": app_state}

@app.post('/admin/update-config')
async def admin_update_config(request: Request):
    """Overwrite the config file with the posted JSON body.

    Protected by ADMIN_TOKEN. Body must be a JSON object compatible with the
    config schema. Returns saved config on success.
    """
    token = request.headers.get('X-Admin-Token')
    # _check_admin_token raises HTTPException itself; no need to catch/re-raise
    _check_admin_token(token)
    try:
        body = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail='Invalid JSON body')
    cfg_path = os.environ.get('CONFIG_PATH', os.path.join(os.path.dirname(__file__), 'config', 'model_config.json'))
    cfg_dir = os.path.dirname(cfg_path)
    os.makedirs(cfg_dir, exist_ok=True)
    try:
        with open(cfg_path, 'w') as f:
            json.dump(body, f, indent=2)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Failed to write config: {e}')
    try:
        new_cfg = reload_model_config(cfg_path)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Config saved but reload failed: {e}')
    # apply new config immediately
    try:
        init_components()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f'Config saved but applying failed: {e}')
    return {"status": "ok", "saved": cfg_path, "model_config": new_cfg, "app_state": app_state}
# --------------------------------------------------------------------------------------

@app.get('/admin', response_class=HTMLResponse)
async def admin_page():
    """Serve the static admin UI and inject the ASCII banner at request time.

    The static UI lives at ./static/admin.html so it's easier to edit and
    keep app.py small.
    """
    try:
        static_path = os.path.join(os.path.dirname(__file__), 'static', 'admin.html')
        with open(static_path, 'r', encoding='utf-8') as f:
            html = f.read()
    except Exception as e:
        return HTMLResponse(content=f"Error loading admin UI: {e}", status_code=500)
    # inject the ascii banner into the HTML, escaped for safety
    try:
        banner = get_ascii_banner() or ''
        banner_html = _html.escape(banner)
        html = html.replace('BANNER_PLACEHOLDER', banner_html)
    except Exception:
        pass
    return HTMLResponse(content=html)
@app.get('/status')
async def status():
    """Return basic app startup/reload status for UI and health checks."""
    return app_state

@app.get('/admin/config')
async def admin_get_config():
    """Return the effective model_config plus helper globals for the admin UI."""
    # safe copy of model config; include openai_compatible and host/port defaults
    def _mask(s):
        # Mask secrets for display: short values become '***', longer values
        # keep only the first and last 3 characters.
        try:
            if not s:
                return ''
            s = str(s)
            if len(s) <= 6:
                return '*' * len(s)
            return s[:3] + '...' + s[-3:]
        except Exception:
            return ''
    cfg = dict(model_config)
    cfg['_meta'] = {
        'openai_compatible': openai_compatible,
        'HOST_APP': globals().get('HOST_APP'),
        'PORT_NUM_APP': globals().get('PORT_NUM_APP'),
        'HOST_SEARXNG': globals().get('HOST_SEARXNG'),
        'PORT_NUM_SEARXNG': globals().get('PORT_NUM_SEARXNG'),
        'llm_api_key': _mask(globals().get('llm_api_key')),
        'embed_api_key': _mask(globals().get('embed_api_key')),
    }
    return cfg

# Register shutdown handler
atexit.register(display_shutdown_banner)

origins = [
    "*",  # Allow all origins (use specific domains in production)
]

# NOTE(review): browsers reject the combination of wildcard origins and
# allow_credentials=True per the CORS spec — list concrete origins if
# credentialed requests are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # e.g. ["http://localhost", "http://localhost:3000"]
    allow_credentials=True,
    allow_methods=["*"],  # Allow all HTTP methods (including OPTIONS)
    allow_headers=["*"],  # Allow all headers
)

@app.get('/')
async def root():
    return {"message": "Welcome to CoexistAI!"}

class WebSearchRequest(BaseModel):
    query: str
    rerank: bool = True
    num_results: int = 2
    local_mode: bool = False
    split: bool = True
    document_paths: list[str] = []  # List of paths for local documents
    vectordb: str = ""  # Optional vector database name to use instead of search
    quick_answer: bool = False  # Whether to force quick answer mode (disables summary mode)

class YouTubeSearchRequest(BaseModel):
    query: str
    prompt: str
    n: int = 1  # Number of videos to summarize, default is 1

class RedditSearchRequest(BaseModel):
    # Nullable fields are typed Optional[str]: with a bare `str = None`
    # annotation an explicit JSON null fails pydantic validation.
    subreddit: Optional[str] = None
    url_type: str = "hot"
    n: int = 3
    k: int = 1
    custom_url: Optional[str] = None
    time_filter: str = "all"
    search_query: Optional[str] = None
    sort_type: str = "relevance"

class MapSearchRequest(BaseModel):
    start_location: Optional[str] = None  # Start location can be a string or None
    end_location: Optional[str] = None  # End location can be a string or None
    pois_radius: int = 500  # Default radius for POIs in meters
    amenities: str = "restaurant|cafe|bar|hotel"  # Default amenities to search for
    limit: int = 3  # Default number of results to return
    task: str = "route_and_pois"  # Default task is to find a route

class WebSummarizeRequest(BaseModel):
    query: str
    url: str
    local_mode: bool = False

class GitTreeRequest(BaseModel):
    repobaseurl: str

class GitSearchRequest(BaseModel):
    repobaseurl: str
    parttoresearch: str
    query: str
    type: str  # "Folder" or "file" (field name mirrors the public API)

class LocalFolderTreeRequest(BaseModel):
    folder_path: str
    level: str = 'broad-first'
    prefix: str = ''

class ResearchCheckRequest(BaseModel):
    query: str
    toolsshorthand: str  # Default budget for deep research, can be adjusted as needed

class ClickableElementRequest(BaseModel):
    url: str
    query: str
    topk: int = 10

class PodcastRequest(BaseModel):
    text: Optional[str] = None
    prompt: Optional[str] = None  # Optional theme for the podcast
class BasicTTSRequest(BaseModel):
    # Optional[str] so an explicit JSON null validates (bare `str = None` rejects it).
    text: Optional[str] = None
    voice: str = "am_santa"
    lang: str = "en-us"
    filename: str = ""

class KnowledgeBaseRequest(BaseModel):
    document_paths: list[str]  # List of paths to create knowledge base from

class CrawlerRequest(BaseModel):
    url_or_urls: Union[str, List[str]]  # Single URL to crawl or list of URLs to scrape
    keywords: Optional[List[str]] = [""]  # Optional keywords to filter content
    depth: Optional[int] = None  # Crawl depth for crawling (None for full website crawl)
    crawl: bool = True  # Whether to crawl (True) or process URLs directly (False)
    min_delay: float = 1.0  # Minimum delay between requests in seconds
    max_delay: float = 2.0  # Maximum delay between requests in seconds
    max_pages: int = 10000  # Maximum number of pages to collect during crawling
    url_keyword: Optional[str] = ""  # Optional keyword to filter URLs by presence in the URL string

@app.post('/clickable-elements', operation_id="get_website_structure")
async def get_website_structure(request: ClickableElementRequest):
    """
    Retrieves the top-k clickable elements from a given URL based on a query.
    This helps to find out whether there are any clickable elements on the page
    that match the query; connected pieces of information are often linked
    together, so this can surface deeper links.

    RECOMMENDATION: Be specific with the query to get the most relevant clickable elements.

    Args:
        url (str): The URL to search for clickable elements.
        query (str): The query to filter the clickable elements.
        topk (int): The number of top clickable elements to return.

    Returns:
        list: A list of dictionaries containing the title, URL, and score of each clickable element.
    """
    return await get_topk_bm25_clickable_elements(request.url, request.query, request.topk)

@app.post('/local-folder-tree', operation_id="get_local_folder_tree")
async def get_local_folder_tree(request: LocalFolderTreeRequest):
    """
    Async Markdown folder tree.

    Args:
        folder_path (str): Root directory.
        level (str):
            - 'full': Show all folders and files, recursively, except hidden/system/cache entries.
            - 'broad-first': Only show immediate (top-level) folders and files (no nesting).
            - 'broad-second': Show top-level folders/files and their immediate child folders/files (two levels, no deeper).
        prefix (str): Indentation (internal)

    Returns:
        str: Markdown tree string
    """
    return await folder_tree(request.folder_path, level=request.level, prefix=request.prefix)

@app.post('/git-tree-search', operation_id="get_git_tree")
async def get_git_tree(request: GitTreeRequest):
    """
    Retrieves and returns the directory tree structure of a GitHub repository
    or a local Git repository.

    Args:
        url (str): The base URL of the GitHub repository
            (e.g., 'https://github.com/user/repo') or the path to the local
            repository on your system.

    Returns:
        str: The directory tree structure as a string.
    """
    return await git_tree_search(request.repobaseurl)

@app.post('/git-search', operation_id="get_git_search")
async def get_git_search(request: GitSearchRequest):
    """
    Fetches the content of a specific part (directory or file) from either
    - a GitHub repository (via URL), or
    - a local Git repository (via local path),
    and answers the user's query based on that content. First use get_git_tree
    to understand the structure of the repo and which part might be useful to
    answer the query.

    Args:
        base_url (str): The base URL of the GitHub repository
            (e.g., 'https://github.com/user/repo'), or the local path to the
            root of the repository.
        part (str): The path inside the repository you wish to access
            (e.g., 'basefolder/subfolder'); use get_git_tree for getting the
            specific part if needed.
        query (str): Users query
        type (str): "Folder" or "file"

    Returns:
        str: Response of the users query based on the content fetched
    """
    content = await git_specific_content(request.repobaseurl, request.parttoresearch, request.type)
    prompt = f"""You are a professional coder, your task is to answer the users query based on the content fetched from git repo
    User Query: {request.query}
    Fetched Content: {content}
    """
    result = await llm.ainvoke(
        prompt
    )
    return result.content
use get_git_tree for getting specific part if needed query (str): Users query type (str): "Folder" or "file" Returns: str: Response of the users query based on the content fetched """ content = await git_specific_content(request.repobaseurl,request.parttoresearch,request.type) prompt = f"""You are a professional coder, your task is to answer the users query based on the content fetched from git repo User Query: {request.query} Fetched Content: {content} """ result = await llm.ainvoke( prompt ) return result.content @app.post('/web-search',operation_id="get_web_search") async def websearch(request: WebSearchRequest): """ Performs a web search and retrieves results, then generates a response based on those results. It also throws back the next steps, you should carry out your research until there are no next steps left. RECOMMENDATION: Be specific with the query to get the most relevant results. and Set num_results to 2 (for better results) Args: query (str): The input query. rerank (bool): Whether to rerank results. num_results (int, optional): Number of search results to retrieve. Defaults to 3. (can take values from 1-5) document_paths (list of str, optional): List of paths for local documents/folders. Defaults to empty list. for an example [path1,path2,path3]. if different tasks are related to different documents local_mode (bool, optional): Whether to process local documents. Defaults to False. split (bool, optional): Whether to split documents into chunks. Defaults to True. vectordb (str, optional): Name of an existing vector database to query instead of performing search. Defaults to None. quick_answer (bool, optional): Whether to force quick answer mode (disables summary mode). Defaults to False. 
Returns: str: Generated response to query based on the retrieved and reranked search results and sources """ # You may need to adjust these arguments based on your actual setup # For demonstration, using None for models and embeddings try: result = await query_web_response( query=request.query, date=date, day=day, websearcher=searcher, # Replace with your actual searcher instance if needed hf_embeddings=hf_embeddings, rerank=request.rerank, cross_encoder=cross_encoder, model=llm, text_model=llm, num_results=min(2,request.num_results), document_paths=request.document_paths, local_mode=request.local_mode, split=request.split, vectordb=request.vectordb, quick_answer=request.quick_answer ) return "result:" + result[0] + '\n\nsources:' + result[1] except: return "No Websites found, Try rephrasing query" @app.post('/create-knowledge-base', operation_id="get_knowledge_base") async def create_kb(request: KnowledgeBaseRequest): """ Creates a knowledge base from the provided document paths. Processes all files in the paths, embeds them, and saves to a vector database. Args: document_paths (list of str): List of paths to folders or files to include in the knowledge base. Returns: str: The name of the created vector database collection. """ try: collection_name = await create_knowledge_base( document_paths=request.document_paths, hf_embeddings=hf_embeddings ) return f"Knowledge base created successfully. Collection name: {collection_name}" except Exception as e: return f"Error creating knowledge base: {str(e)}" @app.post('/crawl-and-create-knowledge-base', operation_id="get_crawled_knowledge_base") async def crawl_kb(request: CrawlerRequest): """ Crawls a website or processes a list of URLs and creates a knowledge base from the content. 
Args: url_or_urls: Single URL to crawl or list of URLs to scrape directly keywords: Optional list of keywords to filter content by depth: Maximum crawl depth for crawling (None for full website crawl) crawl: Whether to crawl (True) or process URLs directly (False) min_delay: Minimum delay between requests in seconds (default: 1.0) max_delay: Maximum delay between requests in seconds (default: 3.0) max_pages: Maximum number of pages to collect during crawling (default: 100) url_keyword: Optional keyword to filter URLs by presence in the URL string Returns: str: Message with the collection name and list of scraped URLs. """ try: collection_name, scraped_urls = await crawl_and_create_kb( url_or_urls=request.url_or_urls, keywords=request.keywords, depth=request.depth, crawl=request.crawl, min_delay=request.min_delay, max_delay=request.max_delay, max_pages=request.max_pages, url_keyword=request.url_keyword, hf_embeddings=hf_embeddings ) return f"Crawled knowledge base created successfully. Collection name: {collection_name}. Scraped URLs: {scraped_urls}" except Exception as e: return f"Error creating crawled knowledge base: {str(e)}" @app.post('/web-summarize', operation_id="get_web_summarize") async def websummarize(request: WebSummarizeRequest): """Generates a summary of a web page based on the provided query and URL. Args: query (str): The input query. url (str): The URL of the web page to summarize. model (str): The model to use for summarization. local_mode (bool): Whether to process local documents. 
Returns: str: The generated summary of the url provided to answer query""" try: result = await summary_of_url( query=request.query, url=request.url, model=llm, # Replace with your actual model if needed local_mode=request.local_mode ) return result except: return "URL is not reacheable, try different URL" @app.post('/youtube-search', operation_id="get_youtube_search") async def youtube_search(request: YouTubeSearchRequest): """Performs a YouTube search and return summaries of it. Args: query (str): The YouTube video URL if provided else search term prompt (str): The prompt to generate a response from the transcript. n (int): Number of videos to summarize if search term is provided instead of URL. Returns: str: response from the YouTube transcripts based on the given query""" # You may need to adjust the model argument as per your setup result = youtube_transcript_response( request.query, request.prompt, n = request.n, #number of videos to summarise model=llm # Replace with your actual model if needed ) return result @app.post('/reddit-search', operation_id="get_reddit_search") async def reddit_search(request: RedditSearchRequest): """Performs a Reddit search and retrieves posts based on the provided parameters. Args: subreddit (str): The subreddit to search in. When search_query is provided url_type (str): The type of Reddit URL to fetch (e.g., 'search','hot', 'new','top','best','controversial','rising'). set to 'search' if specific search_query is provided n (int): Number of posts to retrieve. k (int): Number of comments on each post to return after processing. When more perspectives needed increase this. custom_url (str): Custom URL for Reddit search. time_filter (str): Time filter for the search (e.g., 'all', 'day'). search_query (str): Search query for Reddit posts. IF NOT SEARCHING FOR A QUERY, dont set this value, keep it "" sort_type (str): Sorting type for the results. 
Returns: str: A response containing the summary of the Reddit search results""" # You may need to adjust the model argument as per your setup if request.search_query: request.url_type = 'search' result = reddit_reader_response( subreddit=request.subreddit, url_type=request.url_type, n=request.n, k=request.k, custom_url=request.custom_url, time_filter=request.time_filter, search_query=request.search_query, sort_type=request.sort_type, model=llm # Replace with your actual model if needed ) return result @app.post('/map-search', operation_id="get_map_search") async def map_search(request: MapSearchRequest): """Performs a map search and retrieves the route and points of interest like (POIs) between two locations. Args: start_location (optional str): The starting location for the route. can be None as well end_location (optional str): The destination location for the route.can be None as well pois_radius (int): Radius in meters to search for points of interest around the route. amenities (str): Types of amenities to search for, separated by '|'. For example, "restaurant|cafe|bar|hotel". limit (int): Maximum number of POIs to return. task (str): The task to perform, either "location_only" - if lat long of start and end location is needed, else by default is "route_and_pois" - if route and POIs are needed. Returns: dict: location or route and POIs or both""" result = generate_map(request.start_location, request.end_location, pois_radius=request.pois_radius, amenities=request.amenities, limit=request.limit, task=request.task, ) return result @app.post('/check-response', operation_id="get_response_check") async def check_response(request: ResearchCheckRequest): """ Evaluates whether the agent's collected information is complete for writing answer to the user's query. If any aspect is missing, list them all in bullet format Args: query (str): The user's original query. 
toolsshorthand (str): Exact Facts/Information collected in bullets from every past tool usage which would be useful to answer Returns: str: Suggestions for improvement or confirmation that all aspects are addressed. """ system_prompt = f"""You are a professional researcher. Review the following user query and the agent's short hand of informations collected. If not explicitly asked for deep research, you should just check if most necessary information and all aspects present in query are covered, NO NEED TO SUGGEST EXTRA, SINCE ITS QUICK QUERY Determine if the shorthand fully addresses every aspect and intent of the query. If any part is missing or could be improved, list the specific aspects or suggestions for further research or value addition.(IF DEEP RESEARCH ASKED EXPLICITLY) If the response is complete, state that all aspects have been addressed. User Query: {request.query} Agent Shorthand: {request.toolsshorthand} """ result = await llm.ainvoke( system_prompt ) return result.content @app.post('/text-to-podcast', operation_id="get_podcast") async def podcaster(request: PodcastRequest): """ Converts a list of sentences with specified voices into a podcast audio file. Each sentence is spoken in the specified voice, and random pauses are added between sentences for natural flow. Args: prompt: The theme or topic of the podcast episode. You can even provide length instructions, like shorter/longer duration, tone, etc. text: The detailed content over which the podcast is to be made. Returns: FileResponse: The generated podcast .wav file. or str """ system_prompt = f"""You are an experienced podcaster who can create engaging episodes on any topic. Your style makes complex concepts simple, clear, and enjoyable to listen to. When writing scripts: Use natural, conversational language. Avoid special characters (like *, #, etc.) and TTS markup (such as tags). Do not include background descriptions or stage directions. Always stay on the provided theme (if one is given). 
If no theme is provided, use the given text to generate engaging, informative content. The podcast script should be formatted as follows: [Person1] What Person1 says [Person2] What Person2 says ... Where each [Person] represents a speaker, followed by their dialogue. Theme: {request.prompt} Text: {request.text} """ result = await llm.ainvoke( system_prompt ) voice_choices = ["af_heart","am_michael","am_adam","am_eric","am_echo","am_puck", "am_fenrir","am_santa","am_liam","af_river" ] podcast_segments = await parse_podcast(result.content, voice_choices) try: if os.path.exists("output/podcasts") is False: os.makedirs("output/podcasts") file_path = f"output/podcasts/podcast_{str(uuid4())[:8]}.wav" _ = await podcasting(podcast_segments, filename=file_path) logger.info(f"Current working directory: {os.getcwd()}") logger.info(f"Podcast file created at: {file_path}") try: return FileResponse( file_path, media_type="audio/wav", filename=os.path.basename(file_path) ) except: return f"Generated podcast and stored at {file_path}" except Exception as e: return {"error": f"Error occurred while creating podcast: {e}"} @app.post('/basic-tts', operation_id="get_basic_tts") async def basic_tts(request: BasicTTSRequest): """Converts input text to speech using the specified voice and language, and returns the generated audio file. Args: request (BasicTTSRequest): The request object containing the following fields: - text (str): The text to be converted to speech. - voice (str): The voice to use for speech synthesis. - lang (str): The language code for speech synthesis. - filename (str, optional): The output filename for the generated audio file. Returns: FileResponse: The generated audio file in WAV format if successful. dict: An error message if text is not provided or if an exception occurs during TTS generation. 
""" text = request.text voice = request.voice lang = request.lang filename = request.filename if not filename: filename = f"output/basic_tts_{str(uuid4())[:8]}.wav" if not text: return {"error": "Text is required for TTS."} try: await text_to_speech(text, voice, filename, lang) return FileResponse( filename, media_type="audio/wav", filename=os.path.basename(filename) ) except Exception as e: return {"error": f"Error occurred while creating TTS: {e}"} mcp = FastApiMCP(app,include_operations=['get_web_search', 'get_web_summarize', 'get_youtube_search', 'get_reddit_search', 'get_map_search', "get_git_tree", "get_git_search", "get_local_folder_tree", "get_response_check", "get_website_structure", "get_podcast", "get_basic_tts" ],) mcp.mount() # Display startup banner when the app starts display_startup_banner(host=HOST_APP, port=PORT_NUM_APP) ================================================ FILE: coexist_tutorial.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "6dd23e8d", "metadata": {}, "source": [ "# CoexistAI Tool Tutorial\n", "\n", "Welcome to the tutorial for the coexistAI tool! This notebook will guide you through the main functionalities of the tool, including web search, document processing, generative models, answer generation, YouTube summarization, and more. Each section contains explanations and code examples to help you get started quickly." ] }, { "cell_type": "markdown", "id": "d0bbf059", "metadata": {}, "source": [ "## 1. Setup and Initialization\n", "\n", "First, let's import the required libraries, set up environment variables, and initialize the main components. This ensures that all dependencies are loaded and ready for use." 
] }, { "cell_type": "code", "execution_count": null, "id": "3849e2e0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "USER_AGENT environment variable not set, consider setting it to identify your requests.\n" ] }, { "data": { "text/html": [ "
SearxNG docker container is already running.\n",
       "
\n" ], "text/plain": [ "SearxNG docker container is already running.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from utils.utils import *\n", "from utils.websearch_utils import *\n", "set_logging(True) \n", "from langchain_text_splitters import TokenTextSplitter\n", "import os\n", "port_num_searxng = 8085\n", "host_searxng = \"localhost\"\n", "if not is_searxng_running():\n", " subprocess.run([\n", " \"docker\", \"run\", \"--rm\",\n", " \"-d\", \"-p\", f\"{port_num_searxng}:8080\",\n", " \"-v\", f\"{os.getcwd()}/searxng:/etc/searxng:rw\",\n", " \"-e\", f\"BASE_URL=http://{host_searxng}:{port_num_searxng}/\",\n", " \"-e\", \"INSTANCE_NAME=my-instance\",\n", " \"searxng/searxng\"\n", " ])\n", "else:\n", " print(\"SearxNG docker container is already running.\")\n", "\n", "\n", "os.environ['GOOGLE_API_KEY'] = 'YOUR_LLM_KEY' # Replace with your actual if google models are being used\n", "\n", "text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)\n", "from utils.websearch_utils import *\n", "searcher = SearchWeb(port_num_searxng,host_searxng) # Initialize web search with a result limit" ] }, { "cell_type": "markdown", "id": "ad2fddd7", "metadata": {}, "source": [ "## 2. Loading Models\n", "\n", "Load embedding models and cross-encoders using the `load_model` function. 
You can choose between different embedding modes such as 'gemini', 'huggingface', or 'infinity_emb'.\n", "\n", "for local embedders, either use infinity_emb or huggingface" ] }, { "cell_type": "code", "execution_count": 9, "id": "237dc7be", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:05:45,487] INFO utils.utils: Loading model: models/embedding-001 with embedding mode: google\n", "[2025-07-27 12:05:45,487] INFO utils.utils: Loading model: models/embedding-001 with embedding mode: google\n", "[2025-07-27 12:05:45,749] INFO transformers.configuration_utils: loading configuration file config.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/config.json\n", "[2025-07-27 12:05:45,749] INFO transformers.configuration_utils: loading configuration file config.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/config.json\n", "[2025-07-27 12:05:45,751] INFO transformers.configuration_utils: Model config XLMRobertaConfig {\n", " \"architectures\": [\n", " \"XLMRobertaForSequenceClassification\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"label2id\": {\n", " \"LABEL_0\": 0\n", " },\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"xlm-roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"torch_dtype\": \"float32\",\n", " 
\"transformers_version\": \"4.52.4\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"vocab_size\": 250002\n", "}\n", "\n", "[2025-07-27 12:05:45,751] INFO transformers.configuration_utils: Model config XLMRobertaConfig {\n", " \"architectures\": [\n", " \"XLMRobertaForSequenceClassification\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"bos_token_id\": 0,\n", " \"classifier_dropout\": null,\n", " \"eos_token_id\": 2,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"label2id\": {\n", " \"LABEL_0\": 0\n", " },\n", " \"layer_norm_eps\": 1e-05,\n", " \"max_position_embeddings\": 514,\n", " \"model_type\": \"xlm-roberta\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 1,\n", " \"position_embedding_type\": \"absolute\",\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.52.4\",\n", " \"type_vocab_size\": 1,\n", " \"use_cache\": true,\n", " \"vocab_size\": 250002\n", "}\n", "\n", "[2025-07-27 12:05:45,755] INFO transformers.modeling_utils: loading weights file model.safetensors from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/model.safetensors\n", "[2025-07-27 12:05:45,755] INFO transformers.modeling_utils: loading weights file model.safetensors from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/model.safetensors\n", "[2025-07-27 12:05:45,826] INFO transformers.modeling_utils: All model checkpoint weights were used when initializing XLMRobertaForSequenceClassification.\n", "\n", "[2025-07-27 12:05:45,826] INFO transformers.modeling_utils: All model checkpoint weights were used when 
initializing XLMRobertaForSequenceClassification.\n", "\n", "[2025-07-27 12:05:45,827] INFO transformers.modeling_utils: All the weights of XLMRobertaForSequenceClassification were initialized from the model checkpoint at BAAI/bge-reranker-base.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use XLMRobertaForSequenceClassification for predictions without further training.\n", "[2025-07-27 12:05:45,827] INFO transformers.modeling_utils: All the weights of XLMRobertaForSequenceClassification were initialized from the model checkpoint at BAAI/bge-reranker-base.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use XLMRobertaForSequenceClassification for predictions without further training.\n", "[2025-07-27 12:05:46,461] INFO transformers.tokenization_utils_base: loading file sentencepiece.bpe.model from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/sentencepiece.bpe.model\n", "[2025-07-27 12:05:46,461] INFO transformers.tokenization_utils_base: loading file sentencepiece.bpe.model from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/sentencepiece.bpe.model\n", "[2025-07-27 12:05:46,463] INFO transformers.tokenization_utils_base: loading file tokenizer.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/tokenizer.json\n", "[2025-07-27 12:05:46,463] INFO transformers.tokenization_utils_base: loading file tokenizer.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/tokenizer.json\n", "[2025-07-27 12:05:46,464] INFO transformers.tokenization_utils_base: loading file added_tokens.json from cache at None\n", 
"[2025-07-27 12:05:46,464] INFO transformers.tokenization_utils_base: loading file added_tokens.json from cache at None\n", "[2025-07-27 12:05:46,465] INFO transformers.tokenization_utils_base: loading file special_tokens_map.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/special_tokens_map.json\n", "[2025-07-27 12:05:46,465] INFO transformers.tokenization_utils_base: loading file special_tokens_map.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/special_tokens_map.json\n", "[2025-07-27 12:05:46,466] INFO transformers.tokenization_utils_base: loading file tokenizer_config.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/tokenizer_config.json\n", "[2025-07-27 12:05:46,466] INFO transformers.tokenization_utils_base: loading file tokenizer_config.json from cache at /Users/sidhantthole/.cache/huggingface/hub/models--BAAI--bge-reranker-base/snapshots/2cfc18c9415c912f9d8155881c133215df768a70/tokenizer_config.json\n", "[2025-07-27 12:05:46,467] INFO transformers.tokenization_utils_base: loading file chat_template.jinja from cache at None\n", "[2025-07-27 12:05:46,467] INFO transformers.tokenization_utils_base: loading file chat_template.jinja from cache at None\n", "[2025-07-27 12:05:47,331] INFO sentence_transformers.cross_encoder.CrossEncoder: Use pytorch device: mps\n", "[2025-07-27 12:05:47,331] INFO sentence_transformers.cross_encoder.CrossEncoder: Use pytorch device: mps\n" ] } ], "source": [ "hf_embeddings, cross_encoder = load_model(\"models/embedding-001\", _embed_mode='google',\n", " kwargs={'api_key': os.environ['GOOGLE_API_KEY']})" ] }, { "cell_type": "markdown", "id": "eab297bd", "metadata": {}, "source": [ "## 4. 
Web Search Integration\n", "\n", "Use the `SearchWeb` class to perform web searches and retrieve results. This is useful for augmenting LLMs with up-to-date information from the web." ] }, { "cell_type": "code", "execution_count": 10, "id": "0b1c2e46", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:05:51,924] INFO utils.websearch_utils: Search results for query 'latest news in AI': [{'snippet': 'SoftBank chief: Forget AGI, ASI will be here within 10 years · Anthropic deploys AI agents to audit models for safety · Sam Altman: AI will cause job losses and ...', 'title': 'AI News | Latest AI News, Analysis & Events', 'link': 'https://www.artificialintelligence-news.com/', 'engines': ['google'], 'category': 'general'}, {'snippet': 'AI · Meta names Shengjia Zhao as chief scientist of AI superintelligence unit · AI referrals to top websites were up 357% year-over-year in June, reaching 1.13B.', 'title': 'AI News & Artificial Intelligence', 'link': 'https://techcrunch.com/category/artificial-intelligence/', 'engines': ['google'], 'category': 'general'}, {'snippet': \"2 Jul 2025 — Here's a recap of some of our biggest AI updates from June, including more ways to search with AI Mode, a new way to share your NotebookLM notebooks publicly.\", 'title': 'The latest AI news we announced in June', 'link': 'https://blog.google/technology/ai/google-ai-updates-june-2025/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:05:51,924] INFO utils.websearch_utils: Search results for query 'latest news in AI': [{'snippet': 'SoftBank chief: Forget AGI, ASI will be here within 10 years · Anthropic deploys AI agents to audit models for safety · Sam Altman: AI will cause job losses and ...', 'title': 'AI News | Latest AI News, Analysis & Events', 'link': 'https://www.artificialintelligence-news.com/', 'engines': ['google'], 'category': 'general'}, {'snippet': 'AI · Meta names Shengjia Zhao as chief scientist of AI 
superintelligence unit · AI referrals to top websites were up 357% year-over-year in June, reaching 1.13B.', 'title': 'AI News & Artificial Intelligence', 'link': 'https://techcrunch.com/category/artificial-intelligence/', 'engines': ['google'], 'category': 'general'}, {'snippet': \"2 Jul 2025 — Here's a recap of some of our biggest AI updates from June, including more ways to search with AI Mode, a new way to share your NotebookLM notebooks publicly.\", 'title': 'The latest AI news we announced in June', 'link': 'https://blog.google/technology/ai/google-ai-updates-june-2025/', 'engines': ['google'], 'category': 'general'}]\n" ] }, { "data": { "text/html": [ "
[\n",
       "    {\n",
       "        'snippet': 'SoftBank chief: Forget AGI, ASI will be here within 10 years · Anthropic deploys AI agents to \n",
       "audit models for safety · Sam Altman: AI will cause job losses and ...',\n",
       "        'title': 'AI News | Latest AI News, Analysis & Events',\n",
       "        'link': 'https://www.artificialintelligence-news.com/',\n",
       "        'engines': ['google'],\n",
       "        'category': 'general'\n",
       "    },\n",
       "    {\n",
       "        'snippet': 'AI · Meta names Shengjia Zhao as chief scientist of AI superintelligence unit · AI referrals to\n",
       "top websites were up 357% year-over-year in June, reaching 1.13B.',\n",
       "        'title': 'AI News & Artificial Intelligence',\n",
       "        'link': 'https://techcrunch.com/category/artificial-intelligence/',\n",
       "        'engines': ['google'],\n",
       "        'category': 'general'\n",
       "    },\n",
       "    {\n",
       "        'snippet': \"2 Jul 2025 — Here's a recap of some of our biggest AI updates from June, including more ways to\n",
       "search with AI Mode, a new way to share your NotebookLM notebooks publicly.\",\n",
       "        'title': 'The latest AI news we announced in June',\n",
       "        'link': 'https://blog.google/technology/ai/google-ai-updates-june-2025/',\n",
       "        'engines': ['google'],\n",
       "        'category': 'general'\n",
       "    }\n",
       "]\n",
       "
\n" ], "text/plain": [ "\u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", " \u001b[32m'snippet'\u001b[0m: \u001b[32m'SoftBank chief: Forget AGI, ASI will be here within 10 years · Anthropic deploys AI agents to \u001b[0m\n", "\u001b[32maudit models for safety · Sam Altman: AI will cause job losses and ...'\u001b[0m,\n", " \u001b[32m'title'\u001b[0m: \u001b[32m'AI News | Latest AI News, Analysis & Events'\u001b[0m,\n", " \u001b[32m'link'\u001b[0m: \u001b[32m'https://www.artificialintelligence-news.com/'\u001b[0m,\n", " \u001b[32m'engines'\u001b[0m: \u001b[1m[\u001b[0m\u001b[32m'google'\u001b[0m\u001b[1m]\u001b[0m,\n", " \u001b[32m'category'\u001b[0m: \u001b[32m'general'\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", " \u001b[32m'snippet'\u001b[0m: \u001b[32m'AI · Meta names Shengjia Zhao as chief scientist of AI superintelligence unit · AI referrals to\u001b[0m\n", "\u001b[32mtop websites were up 357% year-over-year in June, reaching 1.13B.'\u001b[0m,\n", " \u001b[32m'title'\u001b[0m: \u001b[32m'AI News & Artificial Intelligence'\u001b[0m,\n", " \u001b[32m'link'\u001b[0m: \u001b[32m'https://techcrunch.com/category/artificial-intelligence/'\u001b[0m,\n", " \u001b[32m'engines'\u001b[0m: \u001b[1m[\u001b[0m\u001b[32m'google'\u001b[0m\u001b[1m]\u001b[0m,\n", " \u001b[32m'category'\u001b[0m: \u001b[32m'general'\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", " \u001b[32m'snippet'\u001b[0m: \u001b[32m\"2 Jul 2025 — Here's a recap of some of our biggest AI updates from June, including more ways to\u001b[0m\n", "\u001b[32msearch with AI Mode, a new way to share your NotebookLM notebooks publicly.\"\u001b[0m,\n", " \u001b[32m'title'\u001b[0m: \u001b[32m'The latest AI news we announced in June'\u001b[0m,\n", " \u001b[32m'link'\u001b[0m: \u001b[32m'https://blog.google/technology/ai/google-ai-updates-june-2025/'\u001b[0m,\n", " \u001b[32m'engines'\u001b[0m: \u001b[1m[\u001b[0m\u001b[32m'google'\u001b[0m\u001b[1m]\u001b[0m,\n", " 
\u001b[32m'category'\u001b[0m: \u001b[32m'general'\u001b[0m\n", " \u001b[1m}\u001b[0m\n", "\u001b[1m]\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Get the top 3 results from querying the web\n", "results = searcher.query_search(\"latest news in AI\", num_results=3)\n", "print(results)" ] }, { "cell_type": "markdown", "id": "ed3c2eb8", "metadata": {}, "source": [ "## 5. Document Conversion from URLs\n", "\n", "Convert URLs into document objects using the `urls_to_docs` function. This allows you to process and analyze web content as structured documents." ] }, { "cell_type": "code", "execution_count": 11, "id": "ca473c3a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:05:53,392] INFO utils.websearch_utils: Fetching URL: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:53,392] INFO utils.websearch_utils: Fetching URL: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:53,395] INFO utils.websearch_utils: Fetching URL: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:53,395] INFO utils.websearch_utils: Fetching URL: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:53,696] INFO utils.websearch_utils: Fetched content from https://en.wikipedia.org/wiki/Bangalore with type text/html; charset=UTF-8\n", "[2025-07-27 12:05:53,696] INFO utils.websearch_utils: Fetched content from https://en.wikipedia.org/wiki/Bangalore with type text/html; charset=UTF-8\n", "[2025-07-27 12:05:53,719] INFO utils.websearch_utils: Fetched content from https://en.wikipedia.org/wiki/India with type text/html; charset=UTF-8\n", "[2025-07-27 12:05:53,719] INFO 
utils.websearch_utils: Fetched content from https://en.wikipedia.org/wiki/India with type text/html; charset=UTF-8\n", "[2025-07-27 12:05:56,040] INFO utils.websearch_utils: Processed markdown for: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:56,040] INFO utils.websearch_utils: Processed markdown for: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:56,073] INFO utils.websearch_utils: Processed markdown for: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:56,073] INFO utils.websearch_utils: Processed markdown for: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:56,074] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:56,074] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://en.wikipedia.org/wiki/India\n", "[2025-07-27 12:05:56,074] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:56,074] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://en.wikipedia.org/wiki/Bangalore\n", "[2025-07-27 12:05:57,351] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:05:57,351] INFO utils.websearch_utils: Total URLs processed: 2\n" ] }, { "data": { "text/html": [ "
Loaded 2 documents.\n",
       "
\n" ], "text/plain": [ "Loaded \u001b[1;36m2\u001b[0m documents.\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Document(\n",
       "    metadata={'source': 'https://en.wikipedia.org/wiki/India'},\n",
       "    page_content='India - Wikipedia\\n\\nJump to content\\n\\nCoordinates: 21°N 78°E\\ufeff / \\ufeff21°N 78°E\\ufeff \n",
       "/ 21; 78\\n\\n![Featured \\n\\n![Extended-protected \\n\\nFrom Wikipedia, the free encyclopedia\\n\\nCountry in South \n",
       "Asia\\n\\nThis article is about the country. For other uses, see India (disambiguation).\\n\\n| Republic of India  \n",
       "*BhÄ\\x81rat Gaá¹\\x87arÄ\\x81jya* | |\\n| --- | --- |\\n| Horizontal tricolour flag bearing, from top to bottom, deep \n",
       "saffron, white, and green horizontal bands. In the centre of the white band is a navy-blue wheel with 24 spokes.  \n",
       "Flag  State emblem | |\\n| **Motto:**Satyameva Jayate\\xa0(Sanskrit) \"Truth Alone Triumphs\"[1] | |\\n| \n",
       "**Anthem:**\\xa0Jana Gana Mana\\xa0(Hindi)[a][2][3] \"Thou Art the Ruler of the Minds of All People\"[4][2] | |\\n| \n",
       "**National song: Vande Mataram\\xa0(Sanskrit)[c]** \"I Bow to Thee, Mother\"[b][1][2] | |\\n| Image of a globe centred \n",
       "on India, with India highlighted. Territory controlled by India  Territory claimed but not controlled | |\\n| \n",
       "Capital | New Delhi 28°36â\\x80²50â\\x80³N 77°12â\\x80²30â\\x80³E / 28.61389°N 77.20833°E / 28.61389; \n",
       "77.20833 |\\n| Largest city by city proper population | Mumbai |\\n| Largest city by metropolitan area population | \n",
       "Delhi |\\n| Official\\xa0languages |  |\\n| Recognised regional\\xa0languages | State level and Eighth Schedule[9] |\\n|\n",
       "Native languages | 424 languages[g] |\\n| Religion (2011)[11] |  |\\n| Demonym(s) |  |\\n| Government | Federal \n",
       "parliamentary republic |\\n|  | |\\n| â\\x80¢\\xa0President | Droupadi Murmu |\\n| â\\x80¢\\xa0Prime Minister | Narendra \n",
       "Modi |\\n|  | |\\n| Legislature | Parliament |\\n| â\\x80¢\\xa0Upper house | Rajya Sabha |\\n| â\\x80¢\\xa0Lower house | \n",
       "Lok Sabha |\\n| Independence from the United Kingdom | |\\n|  | |\\n| â\\x80¢\\xa0Dominion | 15 August 1947 |\\n| \n",
       "â\\x80¢\\xa0Republic | 26 January 1950 |\\n|  | |\\n| Area | |\\n| â\\x80¢\\xa0Total | 3,287,263\\xa0km2 \n",
       "(1,269,219\\xa0sq\\xa0mi)[2][h] (7th) |\\n| â\\x80¢\\xa0Water\\xa0(%) | 9.6 |\\n| Population | |\\n| â\\x80¢\\xa02023 \n",
       "estimate | Neutral increase 1,428,627,663[13] (1st) |\\n| â\\x80¢\\xa02011\\xa0census | Neutral increase  (2nd) |\\n| \n",
       "â\\x80¢\\xa0Density | 430.5/km2 (1,115.0/sq\\xa0mi) (30th) |\\n| GDP\\xa0(PPP) | 2025\\xa0estimate |\\n| â\\x80¢\\xa0Total |\n",
       "Increase $17.647 trillion[16] (3rd) |\\n| â\\x80¢\\xa0Per capita | Increase $12,132[16] (119th) |\\n| GDP\\xa0(nominal) \n",
       "| 2025\\xa0estimate |\\n| â\\x80¢\\xa0Total | Increase $4.187 trillion[16] (4th) |\\n| â\\x80¢\\xa0Per capita | Increase \n",
       "$2,878[16] (136th) |\\n| Gini\\xa0(2021) | Positive decrease\\xa025.5[17] low inequality |\\n| HDI\\xa0(2023) | \n",
       "Increase\\xa00.685[18] medium\\xa0(130th) |\\n| Currency | Indian rupee (â\\x82¹) (INR) |\\n| Time zone | UTC+05:30 \n",
       "(IST) |\\n| Date format |  |\\n| Calling code | +91 |\\n| ISO 3166 code | IN |\\n| Internet TLD | .in (others) \n",
       "|\\n\\n**India**, officially the **Republic of India**,[j][20] is a country in South Asia. It is the seventh-largest \n",
       "country by area; the most populous country since 2023;[21] and, since its independence in 1947, the world\\'s most \n",
       "populous  Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the \n",
       "southeast, it shares land borders with Pakistan to the west;[k] China, Nepal, and Bhutan to the north; and \n",
       "Bangladesh and Myanmar to the east. In the Indian Ocean, India is near Sri Lanka and the Maldives; its Andaman and \n",
       "Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.\\n\\nModern humans arrived on the \n",
       "Indian subcontinent from Africa no later than 55,000 years ago.[26][27][28] Their long occupation, predominantly in\n",
       "isolation as hunter-gatherers, has made the region highly diverse.[29] Settled life emerged on the subcontinent in \n",
       "the western margins of the Indus river basin 9,000 years ago, evolving gradually into the Indus Valley Civilisation\n",
       "of the third millennium BCE.[30] By 1200\\xa0BCE, an archaic form of Sanskrit, an Indo-European language, had \n",
       "diffused into India from the northwest.[31][32] Its hymns recorded the early dawnings of Hinduism in India.[33] \n",
       "India\\'s pre-existing Dravidian languages were supplanted in the northern regions.[34] By 400\\xa0BCE, caste had \n",
       "emerged within Hinduism,[35] and Buddhism and Jainism had arisen, proclaiming social orders unlinked to \n",
       "heredity.[36] Early political consolidations gave rise to the loose-knit Maurya and Gupta Empires.[37] Widespread \n",
       "creativity suffused this era,[38] but the status of women declined,[39] and untouchability became an organized \n",
       "belief.[l][40] In South India, the Middle kingdoms exported Dravidian language scripts and religious cultures to \n",
       "the kingdoms of Southeast Asia.[41]\\n\\nIn the early medieval era, Christianity, Islam, Judaism, and Zoroastrianism \n",
       "became established on India\\'s southern and western coasts.[42] Muslim armies from Central Asia intermittently \n",
       "overran India\\'s northern plains in the second millennium.[43] The resulting Delhi Sultanate drew northern India \n",
       "into the cosmopolitan networks of medieval Islam.[44] In south India, the Vijayanagara Empire created a \n",
       "long-lasting composite Hindu culture.[45] In the Punjab, Sikhism emerged, rejecting institutionalised religion.[46]\n",
       "The Mughal Empire ushered in two centuries of economic expansion and relative peace,[47] leaving a rich \n",
       "architectural legacy.[48][49] Gradually expanding rule of the British East India Company turned India into a \n",
       "colonial economy but consolidated its sovereignty.[50] British Crown rule began in 1858. The rights promised to \n",
       "Indians were granted slowly,[51][52] but technological changes were introduced, and modern ideas of education and \n",
       "the public life took root.[53] A nationalist movement emerged in India, the first in the non-European British \n",
       "empire and an influence on other nationalist movements.[54][55] Noted for nonviolent resistance after 1920,[56] it \n",
       "became the primary factor in ending British rule.[57] In 1947, the British Indian Empire was partitioned into two \n",
       "independent  a Hindu-majority dominion of India and a Muslim-majority dominion of Pakistan. A large-scale loss of \n",
       "life and an unprecedented migration accompanied the partition.[62]\\n\\nIndia has been a federal republic since 1950,\n",
       "governed through a democratic parliamentary system. It is a pluralistic, multilingual and multi-ethnic society. \n",
       "India\\'s population grew from 361 million in 1951 to over 1.4 billion in 2023.[63] During this time, its nominal \n",
       "per capita income increased from US$64 annually to US$2,601, and its literacy rate from 16.6% to 74%. A \n",
       "comparatively destitute country in 1951,[64] India has become a fast-growing major economy and hub for information \n",
       "technology services; it has an expanding middle class.[65] Indian movies and music increasingly influence global \n",
       "culture.[66] India has reduced its poverty rate, though at the cost of increasing economic inequality.[67] It is a \n",
       "nuclear-weapon state that ranks high in military expenditure. It has disputes over Kashmir with its neighbours, \n",
       "Pakistan and China, unresolved since the mid-20th century.[68] Among the socio-economic challenges India faces are \n",
       "gender inequality, child malnutrition,[69] and rising levels of air pollution.[70] India\\'s land is megadiverse \n",
       "with four biodiversity hotspots.[71] India\\'s wildlife, which has traditionally been viewed with tolerance in its \n",
       "culture,[72] is supported in protected habitats.\\n\\nEtymology\\n---------\\n\\nMain article: Names for \n",
       "India\\n\\nAccording to the *Oxford English Dictionary* (2009), the name \"India\" is derived from the Classical Latin \n",
       "*India*, a reference to South Asia and an uncertain region to its east. In turn \"India\" derived successively from \n",
       "Hellenistic Greek *India* (Ἰνδία), Ancient Greek *Indos* (ἸνδÏ\\x8cÏ\\x82), Old Persian *Hindush* (an \n",
       "eastern province of the Achaemenid Empire), and ultimately its cognate, the Sanskrit *Sindhu*, or  the Indus River,\n",
       "and by extension its well-settled southern basin.[73][74] The Ancient Greeks referred to the Indians as *Indoi*, \n",
       "\\'the people of the Indus\\'.[75]\\n\\nThe term *Bharat* (*BhÄ\\x81rat*; pronounced [Ë\\x88bʱaË\\x90ɾÉ\\x99t] \n",
       "â\\x93\\x98), mentioned in both Indian epic poetry and the Constitution of India,[76][77] is used in its variations \n",
       "by many Indian languages. A modern rendering of the historical name *Bharatavarsha*, which applied originally to \n",
       "North India,[78][79] *Bharat* gained increased currency from the mid-19th century as a native name for \n",
       "India.[76][80]\\n\\n*Hindustan* ([ɦɪndÊ\\x8aË\\x88staË\\x90n] â\\x93\\x98) is a Middle Persian name for India that \n",
       "became popular by the 13th century,[81] and was used widely since the era of the Mughal Empire. The meaning of \n",
       "*Hindustan* has varied, referring to a region encompassing the northern Indian subcontinent (present-day northern \n",
       "India and Pakistan) or to India in its near \\n\\nHistory\\n-------\\n\\nMain articles: History of India and History of \n",
       "the Republic of India\\n\\n### Ancient India\\n\\n\\n\\nManuscript illustration, c.â\\x80\\x891650, of the Sanskrit epic \n",
       "Ramayana, composed in story-telling fashion c.â\\x80\\x89400\\xa0BCE\\xa0â\\x80\\x93 c.â\\x80\\x89300\\xa0CE[83]\\n\\nBy \n",
       "55,000 years ago, the first modern humans, or *Homo sapiens*, had arrived on the Indian subcontinent from \n",
       "Africa.[26][27][28] The earliest known modern human remains in South Asia date to about 30,000 years ago.[26] After\n",
       "6500\\xa0BCE, evidence for domestication of food crops and animals, construction of permanent structures, and \n",
       "storage of agricultural surplus appeared in Mehrgarh and other sites in Balochistan, Pakistan.[84] These gradually \n",
       "developed into the Indus Valley  the first urban culture in South Asia,[86] which flourished during \n",
       "2500â\\x80\\x931900\\xa0BCE in Pakistan and western India.[87] Centred around cities such as Mohenjo-daro, Harappa, \n",
       "Dholavira, and Kalibangan, and relying on varied forms of subsistence, the civilisation engaged robustly in crafts \n",
       "production and wide-ranging trade.[86]\\n\\nDuring the period 2000â\\x80\\x93500\\xa0BCE, many regions of the \n",
       "subcontinent transitioned from the Chalcolithic cultures to the Iron Age ones.[88] The Vedas, the oldest scriptures\n",
       "associated with Hinduism,[89] were composed during this period,[90] and historians have analysed these to posit a \n",
       "Vedic culture in the Punjab region and the upper Gangetic Plain.[88] Most historians also consider this period to \n",
       "have encompassed several waves of Indo-Aryan migration into the subcontinent from the north-west.[89] The caste \n",
       "system, which created a hierarchy of priests, warriors, and free peasants, but which excluded indigenous peoples by\n",
       "labelling their occupations impure, arose during this period.[91] On the Deccan Plateau, archaeological evidence \n",
       "from this period suggests the existence of a chiefdom stage of political organisation.[88] In South India, a \n",
       "progression to sedentary life is indicated by the large number of megalithic monuments dating from this period,[92]\n",
       "as well as by nearby traces of agriculture, irrigation tanks, and craft traditions.[92]\\n\\n\\n\\nCave 26 of the \n",
       "rock-cut Ajanta Caves\\n\\nIn the late Vedic period, around the 6th century BCE, the small states and chiefdoms of \n",
       "the Ganges Plain and the north-western regions had consolidated into 16 major oligarchies and monarchies that were \n",
       "known as the  The emerging urbanisation gave rise to non-Vedic religious movements, two of which became independent\n",
       "religions. Jainism came into prominence during the life of its exemplar, Mahavira.[95] Buddhism, based on the \n",
       "teachings of Gautama Buddha, attracted followers from all social classes excepting the middle class; chronicling \n",
       "the life of the Buddha was central to the beginnings of recorded history in India.[96][97][98] In an age of \n",
       "increasing urban wealth, both religions held up renunciation as an ideal,[99] and both established long-lasting \n",
       "monastic traditions. Politically, by the 3rd century BCE, the kingdom of Magadha had annexed or reduced other \n",
       "states to emerge as the Maurya Empire.[100] The empire was once thought to have controlled most of the subcontinent\n",
       "except the far south, but its core regions are now thought to have been separated by large autonomous \n",
       "areas.[101][102] The Mauryan kings are known as much for their empire-building and determined management of public \n",
       "life as for Ashoka\\'s renunciation of militarism and far-flung advocacy of the Buddhist *dhamma*.[103][104]\\n\\nThe \n",
       "Sangam literature of the Tamil language reveals that, between 200\\xa0BCE and 200\\xa0CE, the southern peninsula was \n",
       "ruled by the Cheras, the Cholas, and the Pandyas, dynasties that traded extensively with the Roman Empire and with \n",
       "West and Southeast Asia.[105][106] In North India, Hinduism asserted patriarchal control within the family, leading\n",
       "to increased subordination of women.[107][100] By the 4th and 5th centuries, the Gupta Empire had created a complex\n",
       "system of administration and taxation in the greater Ganges Plain; this system became a model for later Indian \n",
       "kingdoms.[108][109] Under the Guptas, a renewed Hinduism based on devotion, rather than the management of ritual, \n",
       "began to assert itself.[110] This renewal was reflected in a flowering of sculpture and architecture, which found \n",
       "patrons among an urban elite.[109] Classical Sanskrit literature flowered as well, and Indian science, astronomy, \n",
       "medicine, and mathematics made significant advances.[109]\\n\\n### Medieval India\\n\\nMain article: Medieval \n",
       "India\\n\\n\\n\\nBrihadeshwara temple, Thanjavur, completed in 1010\\xa0CE\\n\\n\\n\\nThe Qutub Minar, 73\\xa0m (240\\xa0ft) \n",
       "tall, completed by the Sultan of Delhi, Iltutmish\\n\\nThe Indian early medieval age, from 600 to 1200\\xa0CE, is \n",
       "defined by regional kingdoms and cultural diversity.[111] When Harsha of Kannauj, who ruled much of the \n",
       "Indo-Gangetic Plain from 606 to 647\\xa0CE, attempted to expand southwards, he was defeated by the Chalukya ruler of\n",
       "the Deccan.[112] When his successor attempted to expand eastwards, he was defeated by the Pala king of Bengal.[112]\n",
       "When the Chalukyas attempted to expand southwards, they were defeated by the Pallavas from farther south, who in \n",
       "turn were opposed by the Pandyas and the Cholas from still farther south.[112] No ruler of this period was able to \n",
       "create an empire and consistently control lands much beyond their core region.[111] During this time, pastoral \n",
       "peoples, whose land had been cleared to make way for the growing agricultural economy, were accommodated within \n",
       "caste society, as were new non-traditional ruling classes.[113] The caste system consequently began to show \n",
       "regional differences.[113]\\n\\nIn the 6th and 7th centuries, the first devotional hymns were created in the Tamil \n",
       "language.[114] They were imitated all over India and led to both the resurgence of Hinduism and the development of \n",
       "all modern languages of the subcontinent.[114] Indian royalty, big and small, and the temples they patronised drew \n",
       "citizens in great numbers to the capital cities, which became economic hubs as well.[115] Temple towns of various \n",
       "sizes began to appear everywhere as India underwent another urbanisation.[115] By the 8th and 9th centuries, the \n",
       "effects were felt in Southeast Asia, as South Indian culture and political systems were exported to lands that \n",
       "became part of modern-day Myanmar, Thailand, Laos, Brunei, Cambodia, Vietnam, Philippines, Malaysia, and \n",
       "Indonesia.[116] Indian merchants, scholars, and sometimes armies were involved in this transmission; Southeast \n",
       "Asians took the initiative as well, with many sojourning in Indian seminaries and translating Buddhist and Hindu \n",
       "texts into their languages.[116]\\n\\nAfter the 10th century, Muslim Central Asian nomadic clans, using swift-horse \n",
       "cavalry and raising vast armies united by ethnicity and religion, repeatedly overran South Asia\\'s north-western \n",
       "plains, leading eventually to the establishment of the Islamic Delhi Sultanate in 1206.[117] The sultanate was to \n",
       "control much of North India and to make many forays into South India. Although at first disruptive for the Indian \n",
       "elites, the sultanate largely left its vast non-Muslim subject population to its own laws and customs.[118][119] By\n",
       "repeatedly repulsing Mongol raiders in the 13th century, the sultanate saved India from the devastation visited on \n",
       "West and Central Asia, setting the scene for centuries of migration of fleeing soldiers, learned men, mystics, \n",
       "traders, artists, and artisans from that region into the subcontinent, thereby creating a syncretic Indo-Islamic \n",
       "culture in the north.[120][121] The sultanate\\'s raiding and weakening of the regional kingdoms of South India \n",
       "paved the way for the indigenous Vijayanagara Empire.[122] Embracing a strong Shaivite tradition and building upon \n",
       "the military technology of the sultanate, the empire came to control much of peninsular India,[123] and was to \n",
       "influence South Indian society for long afterwards.[122]\\n\\n### Early modern India\\n\\n\\n\\nA distant view of the Taj\n",
       "Mahal from the Agra Fort\\n\\n\\n\\nA two mohur Company gold coin, issued in 1835, the obverse inscribed \"William IIII,\n",
       "King\"\\n\\nIn the early 16th century, northern India, then under mainly Muslim rulers,[124] fell again to the \n",
       "superior mobility and firepower of a new generation of Central Asian warriors.[125] The resulting Mughal Empire did\n",
       "not stamp out the local societies it came to rule. Instead, it balanced and pacified them through new \n",
       "administrative practices[126][127] and diverse and inclusive ruling elites,[128] leading to more systematic, \n",
       "centralised, and uniform rule.[129] Eschewing tribal bonds and Islamic identity, especially under Akbar, the \n",
       "Mughals united their far-flung realms through loyalty, expressed through a Persianised culture, to an emperor who \n",
       "had near-divine status.[128] The Mughal state\\'s economic policies, deriving most revenues from agriculture[130] \n",
       "and mandating that taxes be paid in the well-regulated silver currency,[131] caused peasants and artisans to enter \n",
       "larger markets.[129] The relative peace maintained by the empire during much of the 17th century was a factor in \n",
       "India\\'s economic expansion,[129] resulting in greater patronage of painting, literary forms, textiles, and \n",
       "architecture.[132] Newly coherent social groups in northern and western India, such as the Marathas, the Rajputs, \n",
       "and the Sikhs, gained military and governing ambitions during Mughal rule, which, through collaboration or \n",
       "adversity, gave them both recognition and military experience.[133] Expanding commerce during Mughal rule gave rise\n",
       "to new Indian commercial and political elites along the coasts of southern and eastern India.[133] As the empire \n",
       "disintegrated, many among these elites were able to seek and control their own affairs.[134]\\n\\nBy the early 18th \n",
       "century, with the lines between commercial and political dominance being increasingly blurred, a number of European\n",
       "trading companies, including the English East India Company, had established coastal outposts.[135][136] The East \n",
       "India Company\\'s control of the seas, greater resources, and more advanced military training and technology led it \n",
       "to increasingly assert its military strength and caused it to become attractive to a portion of the Indian elite; \n",
       "these factors were crucial in allowing the company to gain control over the Bengal region by 1765 and sideline the \n",
       "other European  Its further access to the riches of Bengal and the subsequent increased strength and size of its \n",
       "army enabled it to annex or subdue most of India by the 1820s.[140] India was then no longer exporting manufactured\n",
       "goods as it long had, but was instead supplying the British Empire with raw materials. Many historians consider \n",
       "this to be the onset of India\\'s colonial period.[135] By this time, with its economic power severely curtailed by \n",
       "the British parliament and having effectively been made an arm of British administration, the East India Company \n",
       "began more consciously to enter non-economic arenas, including education, social reform, and culture.[141]\\n\\n### \n",
       "Modern India\\n\\nMain article: History of India (1947â\\x80\\x93present)\\n\\n\\n\\n1909 map of the British Indian \n",
       "Empire\\n\\n\\n\\nJawaharlal Nehru sharing a light moment with Mahatma Gandhi, Mumbai, 6 July 1946\\n\\nHistorians \n",
       "consider India\\'s modern age to have begun sometime between 1848 and 1885. The appointment in 1848 of Lord \n",
       "Dalhousie as Governor General of the East India Company set the stage for changes essential to a modern state. \n",
       "These included the consolidation and demarcation of sovereignty, the surveillance of the population, and the \n",
       "education of citizens. Technological changesâ\\x80\\x94among them, railways, canals, and the telegraphâ\\x80\\x94were \n",
       "introduced not long after their introduction in  However, disaffection with the company also grew during this time \n",
       "and set off the Indian Rebellion of 1857. Fed by diverse resentments and perceptions, including invasive \n",
       "British-style social reforms, harsh land taxes, and summary treatment of some rich landowners and princes, the \n",
       "rebellion rocked many regions of northern and central India and shook the foundations of Company rule.[146][147] \n",
       "Although the rebellion was suppressed by 1858, it led to the dissolution of the East India Company and the direct \n",
       "administration of India by the British government. Proclaiming a unitary state and a gradual but limited \n",
       "British-style parliamentary system, the new rulers also protected princes and landed gentry as a feudal safeguard \n",
       "against future unrest.[148][149] In the decades following, public life gradually emerged all over India, leading \n",
       "eventually to the founding of the Indian National Congress in \\n\\nThe rush of technology and the commercialisation \n",
       "of agriculture in the second half of the 19th century was marked by economic setbacks, and many small farmers \n",
       "became dependent on the whims of far-away markets.[154] There was an increase in the number of large-scale \n",
       "famines,[155] and, despite the risks of infrastructure development borne by Indian taxpayers, little industrial \n",
       "employment was generated for Indians.[156] There were also salutary effects: commercial cropping, especially in the\n",
       "newly canalled Punjab, led to increased food production for internal consumption.[157] The railway network provided\n",
       "critical famine relief,[158] notably reduced the cost of moving goods,[158] and helped nascent Indian-owned \n",
       "industry.[157]\\n\\nAfter World War I, in which approximately one million Indians served,[159] a new period began. It\n",
       "was marked by British reforms but also repressive legislation, by more strident Indian calls for self-rule, and by \n",
       "the beginnings of a nonviolent movement of non-co-operation, of which Mahatma Gandhi would become the leader and \n",
       "enduring symbol.[160] During the 1930s, slow legislative reform was enacted by the British; the Indian National \n",
       "Congress won victories in the resulting elections.[161] The next decade was beset with crises: Indian participation\n",
       "in World War\\xa0II, the Congress\\'s final push for non-co-operation, and an upsurge of Muslim nationalism. All were\n",
       "capped by the advent of independence in 1947, but tempered by the partition of India into two states: India and \n",
       "Pakistan.[162]\\n\\nVital to India\\'s self-image as an independent nation was its constitution, completed in 1950, \n",
       "which put in place a secular and democratic republic.[163] Economic liberalisation, which began in the 1980s and \n",
       "with the collaboration with Soviet Union for technical knowledge,[164] has created a large urban middle class, \n",
       "transformed India into one of the world\\'s fastest-growing economies,[165] and increased its geopolitical \n",
       "influence. Yet, India is also shaped by persistent poverty, both rural and urban;[166] by religious and \n",
       "caste-related violence;[167] by Maoist-inspired Naxalite insurgencies;[168] and by separatism in Jammu and Kashmir \n",
       "and in Northeast India.[169] It has unresolved territorial disputes with China and with Pakistan.[170] India\\'s \n",
       "sustained democratic freedoms are unique among the world\\'s newer nations; however, in spite of its recent economic\n",
       "successes, freedom from want for its disadvantaged population remains a goal yet to be achieved.[171] As of 2025, \n",
       "poverty in India declined sharply, mainly due to government welfare programs.[172]\\n\\nGeography\\n---------\\n\\nMain \n",
       "article: Geography of India\\n\\n\\n\\nThe Tungabhadra, with rocky outcrops, flows into the peninsular Krishna \n",
       "River[173]\\n\\n\\n\\nFishing boats lashed together in a tidal creek in Anjarle village, Maharashtra\\n\\nIndia accounts \n",
       "for the bulk of the Indian subcontinent, lying atop the Indian tectonic plate, a part of the Indo-Australian \n",
       "Plate.[174] India\\'s defining geological processes began 75 million years ago when the Indian Plate, then part of \n",
       "the southern supercontinent Gondwana, began a north-eastward drift caused by seafloor spreading to its south-west, \n",
       "and later, south and south-east.[174] Simultaneously, the vast Tethyan oceanic crust, to its northeast, began to \n",
       "subduct under the Eurasian Plate.[174] These dual processes, driven by convection in the Earth\\'s mantle, both \n",
       "created the Indian Ocean and caused the Indian continental crust eventually to under-thrust Eurasia and to uplift \n",
       "the Himalayas.[174] Immediately south of the emerging Himalayas, plate movement created a vast crescent-shaped \n",
       "trough that rapidly filled with river-borne sediment[175] and now constitutes the Indo-Gangetic Plain.[176] The \n",
       "original Indian plate makes its first appearance above the sediment in the ancient Aravalli range, which extends \n",
       "from the Delhi Ridge in a southwesterly direction. To the west lies the Thar Desert, the eastern spread of which is\n",
       "checked by the \\n\\nThe remaining Indian Plate survives as peninsular India, the oldest and geologically most stable\n",
       "part of India. It extends as far north as the Satpura and Vindhya ranges in central India. These parallel chains \n",
       "run from the Arabian Sea coast in Gujarat in the west to the coal-rich Chota Nagpur Plateau in Jharkhand in the \n",
       "east.[180] To the south, the remaining peninsular landmass, the Deccan Plateau, is flanked on the west and east by \n",
       "coastal ranges known as the Western and Eastern Ghats;[181] the plateau contains the country\\'s oldest rock \n",
       "formations, some over one billion years old. Constituted in such fashion, India lies to the north of the equator \n",
       "between 6° 44â\\x80² and 35° 30â\\x80² north latitude[m] and 68° 7â\\x80² and 97° 25â\\x80² east \n",
       "longitude.[182]\\n\\nIndia\\'s coastline measures 7,517 kilometres (4,700\\xa0mi) in length; of this distance, 5,423 \n",
       "kilometres (3,400\\xa0mi) belong to peninsular India and 2,094 kilometres (1,300\\xa0mi) to the Andaman, Nicobar, and\n",
       "Lakshadweep island chains.[183] According to the Indian naval hydrographic charts, the mainland coastline consists \n",
       "of the following: 43% sandy beaches; 11% rocky shores, including cliffs; and 46% mudflats or marshy shores.[183] \n",
       "Major Himalayan-origin rivers that substantially flow through India include the Ganges and the Brahmaputra, both of\n",
       "which drain into the Bay of Bengal.[184] Important tributaries of the Ganges include the Yamuna and the Kosi; the \n",
       "latter\\'s extremely low gradient, caused by long-term silt deposition, leads to severe floods and course \n",
       "changes.[185][186] Major peninsular rivers, whose steeper gradients prevent their waters from flooding, include the\n",
       "Godavari, the Mahanadi, the Kaveri, and the Krishna, which also drain into the Bay of Bengal;[187] and the Narmada \n",
       "and the Tapti, which drain into the Arabian Sea.[188] Coastal features include the marshy Rann of Kutch of western \n",
       "India and the alluvial Sundarbans delta of eastern India; the latter is shared with Bangladesh.[189] India has two \n",
       "archipelagos: the Lakshadweep, coral atolls off India\\'s south-western coast; and the Andaman and Nicobar Islands, \n",
       "a volcanic chain in the Andaman Sea.[190]\\n\\nIndian climate is strongly influenced by the Himalayas and the Thar \n",
       "Desert, both of which drive the economically and culturally pivotal summer and winter monsoons.[191] The Himalayas \n",
       "prevent cold Central Asian katabatic winds from blowing in, keeping the bulk of the Indian subcontinent warmer than\n",
       "most locations at similar latitudes.[192][193] The Thar Desert plays a crucial role in attracting the \n",
       "moisture-laden south-west summer monsoon winds that, between June and October, provide the majority of India\\'s \n",
       "rainfall.[191] Four major climatic groupings predominate in India: tropical wet, tropical dry, subtropical humid, \n",
       "and montane.[194] Temperatures in India have risen by 0.7\\xa0°C (1.3\\xa0°F) between 1901 and 2018.[195] Climate \n",
       "change in India is often thought to be the cause. The retreat of Himalayan glaciers has adversely affected the flow\n",
       "rate of the major Himalayan rivers, including the Ganges and the Brahmaputra.[196] According to some current \n",
       "projections, the number and severity of droughts in India will have markedly increased by the end of the present \n",
       "century.[197]\\n\\n### Biodiversity\\n\\nMain articles: Forestry in India and Wildlife of India\\n\\n\\n\\nIndia has the \n",
       "majority of the world\\'s wild tigers, approximately 3,170 in 2022.[198]\\n\\n\\n\\nA chital (*Axis axis*) stag in the \n",
       "Nagarhole National Park in a region covered by a moderately dense[n] forest.\\n\\n\\n\\nThree of the last Asiatic \n",
       "cheetahs in India were shot dead in 1948 in Surguja district, Madhya Pradesh, Central India by Maharajah Ramanuj \n",
       "Pratap Singh Deo. The young male cheetahs, all from the same litter, were sitting together when they were shot at \n",
       "night.\\n\\nIndia is a megadiverse country, a term employed for 17 countries that display high biological diversity \n",
       "and contain many species exclusively indigenous, or endemic, to them.[199] India is the habitat for 8.6% of all \n",
       "mammals, 13.7% of bird species, 7.9% of reptile species, 6% of amphibian species, 12.2% of fish species, and 6.0% \n",
       "of all flowering plant species.[200][201] Fully a third of Indian plant species are endemic.[202] India also \n",
       "contains four of the world\\'s 34 biodiversity hotspots,[71] or regions that display significant habitat loss in the\n",
       "presence of high endemism.[o][203]\\n\\nIndia\\'s most dense forests, such as the tropical moist forest of the Andaman\n",
       "Islands, the Western Ghats, and Northeast India, occupy approximately 3% of its land area.[204][205] *Moderately \n",
       "dense forest*, whose canopy density is between 40% and 70%, occupies 9.39% of India\\'s land area.[204][205] It \n",
       "predominates in the temperate coniferous forest of the Himalayas, the moist deciduous *sal* forest of eastern \n",
       "India, and the dry deciduous teak forest of central and southern India.[206] India has two natural zones of thorn \n",
       "forest, one in the Deccan Plateau, immediately east of the Western Ghats, and the other in the western part of the \n",
       "Indo-Gangetic plain, now turned into rich agricultural land by irrigation, its features no longer visible.[207] \n",
       "Among the Indian subcontinent\\'s notable indigenous trees are the astringent *Azadirachta indica*, or *neem*, which\n",
       "is widely used in rural Indian herbal medicine,[208] and the luxuriant *Ficus religiosa*, or *peepul*,[209] which \n",
       "is displayed on the ancient seals of Mohenjo-daro,[210] and under which the Buddha is recorded in the Pali canon to\n",
       "have sought enlightenment.[211]\\n\\nMany Indian species have descended from those of Gondwana, the southern \n",
       "supercontinent from which India separated more than 100 million years ago.[212] India\\'s subsequent collision with \n",
       "Eurasia set off a mass exchange of species. However, volcanism and climatic changes later caused the extinction of \n",
       "many endemic Indian forms.[213] Still later, mammals entered India from Asia through two zoogeographic passes \n",
       "flanking the Himalayas.[214] This had the effect of lowering endemism among India\\'s mammals, which stands at \n",
       "12.6%, contrasting with 45.8% among reptiles and 55.8% among amphibians.[201] Among endemics are the \n",
       "vulnerable[215] hooded leaf monkey[216] and the threatened Beddome\\'s toad[217][218] of the Western Ghats.\\n\\nIndia\n",
       "contains 172 IUCN-designated threatened animal species, or 2.9% of endangered forms.[219] These include the \n",
       "endangered Bengal tiger and the Ganges river dolphin. Critically endangered species include the gharial, a \n",
       "crocodilian; the great Indian bustard; and the Indian white-rumped vulture, which has become nearly extinct by \n",
       "having ingested the carrion of diclofenac-treated cattle.[220] Before they were extensively used for agriculture \n",
       "and cleared for human settlement, the thorn forests of Punjab were mingled at intervals with open grasslands that \n",
       "were grazed by large herds of blackbuck preyed on by the Asiatic cheetah; the blackbuck, no longer extant in \n",
       "Punjab, is now severely endangered in India, and the cheetah is extinct.[221] The pervasive and ecologically \n",
       "devastating human encroachment of recent decades has critically endangered Indian wildlife. In response, the system\n",
       "of national parks and protected areas, first established in 1935, was expanded substantially. In 1972, India \n",
       "enacted the Wildlife Protection Act[222] and Project Tiger to safeguard crucial wilderness; the Forest Conservation\n",
       "Act was enacted in 1980 and amendments added in 1988.[223] India hosts more than five hundred wildlife sanctuaries \n",
       "and eighteen\\xa0biosphere reserves,[224] four of which are part of the World Network of Biosphere Reserves; its \n",
       "eighty-nine wetlands are registered under the Ramsar Convention.[225]\\n\\nPolitics and government\\n\\n\\n### \n",
       "Politics\\n\\nMain article: Politics of India\\n\\nSee also: Democracy in India\\n\\n\\n\\nAs part of Janadesh 2007, 25,000\n",
        "pro–land reform landless people in Madhya Pradesh listen to Rajagopal P. V.[226]\n\n\n\nUS president Barack\n",
       "Obama addresses the members of the Parliament of India in New Delhi in November 2010.\\n\\nIndia is a parliamentary \n",
       "republic with a multi-party system.[227] It has six\\xa0recognised national parties, including the Indian National \n",
       "Congress (INC) and the Bharatiya Janata Party (BJP), and over 50\\xa0regional parties.[228] Congress is considered \n",
        "the ideological centre in Indian political culture,[229] whereas the BJP is right-wing. From 1950 to the late \n",
        "1980s, Congress held a majority in India\\'s parliament. Afterwards, it increasingly shared power with the \n",
       "BJP,[233] as well as with powerful regional parties, which forced multi-party coalition governments at the \n",
       "centre.[234]\\n\\nIn the Republic of India\\'s general elections in 1951, 1957, and 1962, Congress, led by Jawaharlal \n",
       "Nehru, won easy victories. On Nehru\\'s death in 1964, Lal Bahadur Shastri briefly became prime minister; he was \n",
       "succeeded in 1966, by Nehru\\'s daughter Indira Gandhi, who led the Congress to election victories in 1967 and 1971.\n",
       "Following public discontent with the state of emergency Indira Gandhi had declared in 1975, Congress was voted out \n",
       "of power in 1977; Janata Party, which had opposed the emergency, was voted in. Its government lasted two years; \n",
       "Morarji Desai and Charan Singh served as prime ministers. After Congress was returned to power in 1980, Indira \n",
       "Gandhi was assassinated and succeeded by Rajiv Gandhi, who won easily in the elections later that year. In the 1989\n",
       "elections a National Front coalition, led by the Janata Dal in alliance with the Left Front, won, lasting just \n",
        "under two years, with V.P. Singh and Chandra Shekhar serving as prime ministers.[235] In the 1991 Indian general \n",
       "election, Congress, as the largest single party, formed a minority government led by P. V. Narasimha \n",
       "Rao.[236]\\n\\nAfter the 1996 Indian general election, the BJP formed a government briefly; it was followed by United\n",
       "Front coalitions, which depended on external political support. Two prime ministers served during this period: H.D.\n",
        "Deve Gowda and I.K. Gujral. In 1998, the BJP formed a coalition—the National Democratic Alliance (NDA). Led\n",
       "by Atal Bihari Vajpayee, the NDA became the first non-Congress, coalition government to complete a five-year \n",
       "term.[237] In the 2004 Indian general elections, no party won an absolute majority. Still, the Congress emerged as \n",
       "the largest single party, forming another successful coalition: the United Progressive Alliance (UPA). It had the \n",
       "support of left-leaning parties and MPs who opposed the BJP. The UPA returned to power in the 2009 general election\n",
       "with increased numbers, and it no longer required external support from India\\'s communist parties.[238] Manmohan \n",
       "Singh became the first prime minister since Jawaharlal Nehru in 1957 and 1962 to be re-elected to a consecutive \n",
       "five-year term.[239] In the 2014 general election, the BJP became the first political party since 1984 to win an \n",
       "absolute majority.[240] In the 2019 general election, the BJP regained an absolute majority. In the 2024 general \n",
       "election, a BJP-led NDA coalition formed the government. Narendra Modi, a former chief minister of Gujarat, is \n",
       "serving as the prime minister of India in his third term since May 26, 2014.[241]\\n\\n### Government\\n\\nMain \n",
       "article: Government of India\\n\\nSee also: Constitution of India\\n\\n\\n\\nRashtrapati Bhavan, the official residence \n",
       "of the President of India, was designed by British architects Edwin Lutyens and Herbert Baker for the Viceroy of \n",
       "India, and constructed between 1911 and 1931 during the British Raj.[242]\\n\\nIndia is a federation with a \n",
       "parliamentary system governed under the Constitution of India. Federalism in India defines the power distribution \n",
       "between the union and the states. India\\'s form of government, traditionally described as \"quasi-federal\" with a \n",
       "strong centre and weak states,[243] has grown increasingly federal since the late 1990s as a result of political, \n",
       "economic, and social changes.[244][245]\\n\\nThe Government of India comprises three branches: the Executive, \n",
       "Legislature, and Judiciary.[246] The President of India is the ceremonial head of state,[247] who is elected \n",
        "indirectly for a five-year term by an electoral college comprising members of national and state legislatures. The Prime \n",
       "Minister of India is the head of government and exercises most executive power.[250] Appointed by the \n",
       "president,[251] the prime minister is supported by the party or political alliance with a majority of seats in the \n",
       "lower house of parliament.[250] The executive of the Indian government consists of the president, the \n",
        "vice-president, and the Union Council of Ministers—with the cabinet being its executive \n",
        "committee—headed by the prime minister. Any minister holding a portfolio must be a member of one of the \n",
       "houses of parliament.[247] In the Indian parliamentary system, the executive is subordinate to the legislature; the\n",
       "prime minister and their council are directly responsible to the lower house of the parliament. Civil servants act \n",
       "as permanent executives and all decisions of the executive are implemented by them.[252]\\n\\nThe legislature of \n",
       "India is the bicameral parliament. Operating under a Westminster-style parliamentary system, it comprises an upper \n",
       "house called the Rajya Sabha (Council of States) and a lower house called the Lok Sabha (House of the People).[253]\n",
       "The Rajya Sabha is a permanent body of 245\\xa0members who serve staggered six-year terms with elections every 2 \n",
       "years.[254] Most are elected indirectly by the state and union territorial legislatures in numbers proportional to \n",
       "their state\\'s share of the national population.[251] The Lok Sabha\\'s 543\\xa0members are elected directly by \n",
       "popular vote among citizens aged at least 18;[255] they represent single-member constituencies for \n",
       "five-year\\xa0terms.[256] Several seats from each state are reserved for candidates from Scheduled Castes and \n",
       "Scheduled Tribes in proportion to their population within that state.[255]\\n\\nIndia has a three-tier\\xa0unitary \n",
       "independent judiciary[257] comprising the supreme court, headed by the Chief Justice of India, 25\\xa0high courts, \n",
       "and a large number of trial courts.[257] The supreme court has original jurisdiction over cases involving \n",
       "fundamental rights and over disputes between states and the centre and has appellate jurisdiction over the high \n",
       "courts.[258] It has the power to both strike down union or state laws which contravene the constitution[259] and \n",
        "invalidate any government action it deems unconstitutional.\n\n### Administrative divisions\n\nMain article: Administrative \n",
       "divisions of India\\n\\nSee also: Political integration of India\\n\\n\\n\\nA clickable map of the 28 states and 8 union \n",
       "territories of India\\n\\nIndia is a federal union comprising 28 states and 8 union territories.[12] All states, as \n",
       "well as the union territories of Jammu and Kashmir, Puducherry and the National Capital Territory of Delhi, have \n",
       "elected legislatures and governments following the Westminster system. The remaining five union territories are \n",
       "directly ruled by the central government through appointed administrators. In 1956, under the States Reorganisation\n",
       "Act, states were reorganised on a linguistic basis.[261] There are over a quarter of a million local government \n",
       "bodies at city, town, block, district and village levels.[262]\\n\\n#### States\\n\\n#### Union territories\\n\\nForeign,\n",
       "economic, and strategic relations\\n\\n\\nMain article: Foreign relations of India\\n\\nSee also: Indian Armed \n",
       "Forces\\n\\n\\n\\nDuring the 1950s and 60s, India played a pivotal role in the Non-Aligned Movement.[263] From left to \n",
       "right: Gamal Abdel Nasser of United Arab Republic (now Egypt), Josip Broz Tito of Yugoslavia and Jawaharlal Nehru \n",
       "in Belgrade, September 1961.\\n\\nIndia became a republic in 1950, remaining a member of the Commonwealth of \n",
       "Nations.[264][265] India strongly supported decolonisation in Africa and Asia in the 1950s; it played a leading \n",
       "role in the Non-Aligned Movement.[266] After cordial relations initially, India went to war with China in 1962. \n",
       "India was widely thought to have been humiliated.[267] Another military conflict followed in 1967 in which India \n",
       "successfully repelled a Chinese attack.[268] India has had uneasy relations with its western neighbour, Pakistan. \n",
       "The two countries went to war in 1947, 1965, 1971, and 1999. Three of these wars were fought over the disputed \n",
       "territory of Kashmir. In contrast, the 1971 war followed India\\'s support for the independence of Bangladesh.[269] \n",
       "After the 1965 war with Pakistan, India began to pursue close military and economic ties with the Soviet Union; by \n",
       "the late 1960s, the Soviet Union was its largest arms supplier.[270] India has played a key role in the South Asian\n",
       "Association for Regional Cooperation and the World Trade Organization. The nation has supplied 100,000 military and\n",
        "police personnel in 35 UN peacekeeping operations.[*citation needed*]\n\n\n\nThe Indian Air Force contingent marching at the 221st \n",
       "Bastille Day military parade in Paris, on 14 July 2009. The parade at which India was the foreign guest was led by \n",
       "India\\'s oldest regiment, the Maratha Light Infantry, founded in 1768.[271]\\n\\nChina\\'s nuclear test of 1964 and \n",
       "threats to intervene in support of Pakistan in the 1965 war caused India to produce nuclear weapons.[272] India \n",
       "conducted its first nuclear weapons test in 1974 and carried out additional underground testing in 1998. India has \n",
       "signed neither the Comprehensive Nuclear-Test-Ban Treaty nor the Nuclear Non-Proliferation Treaty, considering both\n",
       "to be flawed and discriminatory.[273] India maintains a \"no first use\" nuclear policy and is developing a nuclear \n",
       "triad capability as a part of its \"Minimum Credible Deterrence\" doctrine.[274][275]\\n\\nSince the end of the Cold \n",
       "War, India has increased its economic, strategic, and military cooperation with the United States and the European \n",
       "Union.[276] In 2008, a civilian nuclear agreement was signed between India and the United States. Although India \n",
       "possessed nuclear weapons at the time and was not a party to the Nuclear Non-Proliferation Treaty, it received \n",
       "waivers from the International Atomic Energy Agency and the Nuclear Suppliers Group, ending earlier restrictions on\n",
       "India\\'s nuclear technology and commerce; India subsequently signed co-operation agreements involving civilian \n",
       "nuclear energy with Russia,[277] France,[278] the United Kingdom,[279] and Canada.[280]\\n\\nThe President of India \n",
       "is the supreme commander of the nation\\'s armed forces; with 1.45\\xa0million active troops, they compose the \n",
       "world\\'s second-largest military. It comprises the Indian Army, the Indian Navy, the Indian Air Force, and the \n",
       "Indian Coast Guard.[281] The official Indian defence budget for 2011 was US$36.03\\xa0billion, or 1.83% of GDP.[282]\n",
        "Defence expenditure was pegged at US$70.12\\xa0billion for fiscal year 2022–23, an increase of 9.8% over the \n",
        "previous fiscal year.[283][284] India is the world\\'s second-largest arms importer; between 2016 and 2020, it \n",
       "accounted for 9.5% of the total global arms imports.[285] Much of the military expenditure was focused on defence \n",
       "against Pakistan and countering growing Chinese influence in the Indian Ocean.[286]\\n\\nEconomy\\n-------\\n\\nMain \n",
       "article: Economy of India\\n\\n\\n\\nIn 2019, 43% of India\\'s total workforce was employed in \n",
       "agriculture.[287]\\n\\n\\n\\nIndia is the world\\'s largest producer of milk, with the largest population of cattle. In \n",
       "2018, nearly 80% of India\\'s milk was sourced from small farms with herd size between one and two, the milk \n",
       "harvested by hand milking.[289]\\n\\n\\n\\n55% of India\\'s female workforce was employed in agriculture in \n",
       "2019.[288]\\n\\nAccording to the International Monetary Fund (IMF), the Indian economy in 2024 was nominally worth \n",
       "$3.94\\xa0trillion; it was the fifth-largest economy by market exchange rates and is, at around $15.0\\xa0trillion, \n",
       "the third-largest by purchasing power parity (PPP).[16] With its average annual GDP growth rate of 5.8% over the \n",
        "past two decades, and reaching 6.1% during 2011–2012,[290] India is one of the world\\'s fastest-growing \n",
        "economies.[291] However, due to its low GDP per capita—which ranks 136th in the world in nominal per capita\n",
        "income and 125th in per capita income adjusted for purchasing power parity (PPP)—the vast majority of \n",
       "Indians fall into the low-income group.[292][293] Until 1991, all Indian governments followed protectionist \n",
       "policies that were influenced by socialist economics. Widespread state intervention and regulation largely walled \n",
       "the economy off from the outside world. An acute balance of payments crisis in 1991 forced the nation to liberalise\n",
       "its economy;[294] since then, it has moved increasingly towards a free-market system[295][296] by emphasising both \n",
       "foreign trade and direct investment inflows.[297] India has been a member of World Trade Organization since 1 \n",
       "January 1995.[298]\\n\\nThe 522-million-worker Indian labour force is the world\\'s second largest, as of \n",
       "2017[update].[281] The service sector makes up 55.6% of GDP, the industrial sector 26.3% and the agricultural \n",
       "sector 18.1%. India\\'s foreign exchange remittances of US$100 billion in 2022,[299] highest in the world, were \n",
       "contributed to its economy by 32 million Indians working in foreign countries.[300] In 2006, the share of external \n",
       "trade in India\\'s GDP stood at 24%, up from 6% in 1985.[295] In 2008, India\\'s share of world trade was 1.7%;[301] \n",
       "In 2021, India was the world\\'s ninth-largest importer and the sixteenth-largest exporter.[302] Between 2001 and \n",
       "2011, the contribution of petrochemical and engineering goods to total exports grew from 14% to 42%.[303] India was\n",
       "the world\\'s second-largest textile exporter after China in the 2013 calendar year.[304]\\n\\nAveraging an economic \n",
       "growth rate of 7.5% for several years before 2007,[295] India has more than doubled its hourly wage rates during \n",
       "the first decade of the 21st century.[305] Some 431 million Indians have left poverty since 1985; India\\'s middle \n",
       "classes are projected to number around 580\\xa0million by 2030.[306] In 2023, India\\'s consumer market was the \n",
       "world\\'s fifth largest.[307] India\\'s nominal GDP per capita increased steadily from US$308 in 1991, when economic \n",
       "liberalisation began, to US$1,380 in 2010, to an estimated US$2,731 in 2024. It is expected to grow to US$3,264 by \n",
       "2026.[16]\\n\\n### Industries\\n\\n\\n\\nA tea garden in Sikkim. India, the world\\'s second-largest producer of tea, is a\n",
       "nation of one billion tea drinkers, who consume 70% of India\\'s tea output.\\n\\nThe Indian automotive industry, the \n",
        "world\\'s second-fastest growing, increased domestic sales by 26% during 2009–2010,[308] and exports by 36% \n",
        "during 2008–2009.[309] In 2022, India became the world\\'s third-largest vehicle market after China and the \n",
       "United States, surpassing Japan.[310] At the end of 2011, the Indian IT industry employed 2.8\\xa0million \n",
       "professionals, generated revenues close to US$100\\xa0billion equalling 7.5% of Indian GDP, and contributed 26% of \n",
       "India\\'s merchandise exports.[311]\\n\\nThe pharmaceutical industry in India includes 3,000 pharmaceutical companies \n",
       "and 10,500 manufacturing units; India is the world\\'s third-largest pharmaceutical producer, largest producer of \n",
        "generic medicines and supplies up to 50–60% of global vaccine demand; these all contribute up to \n",
       "US$24.44\\xa0billions in exports and India\\'s local pharmaceutical market is estimated up to \n",
       "US$42\\xa0billion.[312][313] India is among the top 12 biotech destinations in the world.[314][315] The Indian \n",
        "biotech industry grew by 15.1% in 2012–2013, increasing its revenues from ₹204.4\\xa0billion (Indian \n",
        "rupees) to ₹235.24\\xa0billion (US$3.94\\xa0billion at June 2013 exchange rates).[316]\n\n### Energy\n\nMain \n",
       "article: Energy in India\\n\\nSee also: Energy policy of India\\n\\nIndia\\'s capacity to generate electrical power is \n",
       "300 gigawatts, of which 42 gigawatts is renewable.[317] The country\\'s usage of coal is a major cause of India\\'s \n",
        "greenhouse gas emissions, but its renewable energy is competing.[*better source needed*] India emits about 7% of \n",
       "global greenhouse gas emissions. This equates to about 2.5 tons of carbon dioxide per person per year, which is \n",
       "half the world average.[319][320] Increasing access to electricity and clean cooking with liquefied petroleum gas \n",
       "have been priorities for energy in India.[321]\\n\\n### Socio-economic challenges\\n\\nMain articles: Poverty in India,\n",
       "Income inequality in India, and Debt bondage in India\\n\\n\\n\\nHealth workers about to begin another day of \n",
       "immunisation against infectious diseases in 2006. Eight years later, and three years after India\\'s last case of \n",
       "polio, the World Health Organization declared India to be polio-free.[322]\\n\\nDespite economic growth during recent\n",
       "decades, India continues to face socio-economic challenges. In 2006, India contained the largest number of people \n",
       "living below the World Bank\\'s international poverty line of US$1.25 per day.[323] The proportion decreased from \n",
        "60% in 1981 to 42% in 2005.[324] Under the World Bank\\'s later revised poverty line, it was 21%–22.5% in \n",
       "2011.[p][326][327] In 2019, the estimates had gone down to 10.2%.[327] In 2014, 30.7% of India\\'s children under \n",
       "the age of five were underweight.[328] According to a Food and Agriculture Organization report in 2015, 15% of the \n",
        "population was undernourished. The Midday Meal Scheme attempts to lower these rates.[331]\n\nA 2018 Walk Free Foundation report \n",
       "estimated that nearly 8\\xa0million people in India were living in different forms of modern slavery, such as bonded\n",
       "labour, child labour, human trafficking, and forced begging.[332] According to the 2011 census, there were \n",
       "10.1\\xa0million child labourers in the country, a decline of 2.6\\xa0million from 12.6\\xa0million in \n",
       "2001.[333]\\n\\nSince 1991, economic inequality between India\\'s states has consistently grown: the per-capita net \n",
       "state domestic product of the richest states in 2007 was 3.2 times that of the poorest.[334] Corruption in India is\n",
       "perceived to have decreased. According to the Corruption Perceptions Index, India ranked 78th out of 180 countries \n",
       "in 2018, an improvement from 85th in 2014.[335][336]\\n\\nAs of 2025, poverty in India declined sharply. According to\n",
        "the World Bank report, extreme poverty fell from 16.2% in 2011-12 to 2.3% in 2022-23. In rural areas it fell from \n",
        "18.4% to 2.8%, and in urban areas, from 10.7% to 1.1%. 378 million people were lifted from poverty and 171 million\n",
       "from extreme poverty. The main reason, according to the World Bank, is not economic growth but different government\n",
       "welfare programs, like transferring food and money to the people with low income, improving their access to \n",
       "services.[172]\\n\\nDemographics, languages, and religion\\n\\n\\nMain articles: Demographics of India, Languages of \n",
       "India, and Religion in India\\n\\nSee also: South Asian ethnic groups\\n\\n\\n\\nA Sikh pilgrim at the Harmandir Sahib, \n",
       "or Golden Temple, in Amritsar, Punjab\\n\\n\\n\\nThe interior of San Thome Basilica, Chennai, Tamil Nadu. Christianity \n",
       "is believed to have been introduced to India by the late 2nd century by Syriac-speaking Christians.\\n\\nWith an \n",
       "estimated 1,428,627,663 residents in 2023, India is the world\\'s most populous country.[13] 1,210,193,422 residents\n",
       "were reported in the 2011 provisional census report.[337] Its population grew by 17.64% from 2001 to 2011,[338] \n",
        "compared to 21.54% growth in the previous decade (1991–2001).[338] The human sex ratio, according to the \n",
       "2011 census, is 940 females per 1,000 males.[337] The median age was 28.7 in 2020.[281] The first post-colonial \n",
       "census, conducted in 1951, counted 361\\xa0million people.[339] Medical advances made in the last 50 years as well \n",
       "as increased agricultural productivity brought about by the \"Green Revolution\" have caused India\\'s population to \n",
        "grow rapidly.[340]\n\nThe life expectancy in India is at 70 years—71.5 years for women, 68.7 years for \n",
       "men.[281] There are around 93 physicians per 100,000 people.[341] Migration from rural to urban areas has been an \n",
       "important dynamic in India\\'s recent history. The number of people living in urban areas grew by 31.2% between 1991\n",
       "and 2001.[342] Yet, in 2001, over 70% still lived in rural areas.[343][344] The level of urbanisation increased \n",
       "further from 27.81% in the 2001 Census to 31.16% in the 2011 Census. The slowing down of the overall population \n",
       "growth rate was due to the sharp decline in the growth rate in rural areas since 1991.[345] According to the 2011 \n",
       "census, there are 53 million-plus urban agglomerations in India; among them Mumbai, Delhi, Kolkata, Chennai, \n",
       "Bengaluru, Hyderabad and Ahmedabad, in decreasing order by population.[346] The literacy rate in 2011 was 74.04%: \n",
       "65.46% among females and 82.14% among males.[347] The rural-urban literacy gap, which was 21.2 percentage points in\n",
       "2001, dropped to 16.1 percentage points in 2011. The improvement in the rural literacy rate is twice that of urban \n",
       "areas.[345] Kerala is the most literate state with 93.91% literacy; while Bihar the least with \n",
       "63.82%.[347]\\n\\nAmong speakers of the Indian languages, 74% speak Indo-Aryan languages, the easternmost branch of \n",
       "the Indo-European languages; 24% speak Dravidian languages, indigenous to South Asia and spoken widely before the \n",
       "spread of Indo-Aryan languages and 2% speak Austroasiatic languages or the Sino-Tibetan languages. India has no \n",
        "national language.[348] Hindi, with the largest number of speakers, is the official language of the union government. English is \n",
       "used extensively in business and administration and has the status of a \"subsidiary official language\";[6] it is \n",
       "important in education, especially as a medium of higher education. Each state and union territory has one or more \n",
       "official languages, and the constitution recognises in particular 22 \"scheduled languages\".\\n\\nThe 2011 census \n",
       "reported the religion in India with the largest number of followers was Hinduism (79.80% of the population), \n",
       "followed by Islam (14.23%); the remaining were Christianity (2.30%), Sikhism (1.72%), Buddhism (0.70%), Jainism \n",
       "(0.36%) and others[q] (0.9%).[11] India has the third-largest Muslim populationâ\\x80\\x94the largest for a \n",
       "non-Muslim majority country.[351][352]\\n\\nCulture\\n-------\\n\\nMain article: Culture of India\\n\\n### Visual \n",
       "art\\n\\nMain article: Indian art\\n\\nIndia has a very ancient tradition of art, which has exchanged many influences \n",
       "with the rest of Eurasia, especially in the first millennium, when Buddhist art spread with Indian religions to \n",
       "Central, East and Southeast Asia, the last also greatly influenced by Hindu art.[353] Thousands of seals from the \n",
       "Indus Valley Civilization of the third millennium BCE have been found, usually carved with animals, but also some \n",
       "with human figures. The Pashupati seal, excavated in Mohenjo-daro, Pakistan, in 1928â\\x80\\x9329, is the best \n",
       "known.[354][355] After this there is a long period with virtually nothing surviving.[355][356] Almost all surviving\n",
       "ancient Indian art thereafter is in various forms of religious sculpture in durable materials, or coins. There was \n",
       "probably originally far more in wood, which is lost. In north India Mauryan art is the first imperial  In the first\n",
       "millennium CE, Buddhist art spread with Indian religions to Central, East and Southeast Asia, the last also greatly\n",
       "influenced by Hindu art.[360] Over the following centuries a distinctly Indian style of sculpting the human figure \n",
       "developed, with less interest in articulating precise anatomy than ancient Greek sculpture but showing smoothly \n",
       "flowing forms expressing *prana* (\"breath\" or  This is often complicated by the need to give figures multiple arms \n",
       "or heads, or represent different genders on the left and right of figures, as with the Ardhanarishvara form of \n",
       "Shiva and Parvati.[363][364]\\n\\nMost of the earliest large sculpture is Buddhist, either excavated from Buddhist \n",
       "stupas such as Sanchi, Sarnath and Amaravati,[365] or is rock cut reliefs at sites such as Ajanta, Karla and \n",
       "Ellora. Hindu and Jain sites appear rather later.[366][367] In spite of this complex mixture of religious \n",
       "traditions, generally, the prevailing artistic style at any time and place has been shared by the major religious \n",
       "groups, and sculptors probably usually served all communities.[368] Gupta art, at its peak \n",
       "c.â\\x80\\x89300\\xa0CE\\xa0â\\x80\\x93 c.â\\x80\\x89500\\xa0CE, is often regarded as a classical period whose influence \n",
       "lingered for many centuries after; it saw a new dominance of Hindu sculpture, as at the Elephanta Caves.[369][370] \n",
       "Across the north, this became rather stiff and formulaic after c.â\\x80\\x89800\\xa0CE, though rich with finely carved\n",
       "detail in the surrounds of statues.[371] But in the South, under the Pallava and Chola dynasties, sculpture in both\n",
       "stone and bronze had a sustained period of great achievement; the large bronzes with Shiva as Nataraja have become \n",
       "an iconic symbol of India.[372][373]\\n\\nAncient paintings have only survived at a few sites, of which the crowded \n",
       "scenes of court life in the Ajanta Caves are some of the most important.[374][375] Painted manuscripts of religious\n",
       "texts survive from Eastern India from 10th century onwards, most of the earliest being Buddhist and later Jain. \n",
       "These significantly influenced later artistic styles.[376] The Persian-derived Deccan painting, starting just \n",
       "before the Mughal miniature, between them give the first large body of secular painting, with an emphasis on \n",
       "portraits, and the recording of princely pleasures and wars.[377][378] The style spread to Hindu courts, especially\n",
       "among the Rajputs, and developed a variety of styles, with the smaller courts often the most innovative, with \n",
       "figures such as Nihâl Chand and Nainsukh.[379][380] As a market developed among European residents, it was \n",
       "supplied by Company painting by Indian artists with considerable Western influence.[381][382] In the 19th century, \n",
       "cheap Kalighat paintings of gods and everyday life, done on paper, were urban folk art from Calcutta, which later \n",
       "saw the Bengal School of Art, reflecting the art colleges founded by the British, the first movement in modern \n",
       "Indian  Society\\n\\nMain articles: Caste system in India and Gender inequality in India\\n\\n\\n\\nMuslims offer *namaz*\n",
       "at a mosque in Srinagar, Jammu and Kashmir.\\n\\nThe Indian caste system embodies much of the social stratification \n",
       "and many of the social restrictions found on the Indian subcontinent. Social classes are defined by thousands of \n",
       "endogamous hereditary groups, often termed as *jÄ\\x81tis*, or \"castes\".[385] India abolished untouchability in 1950\n",
       "with the adoption of the constitution and has since enacted other anti-discriminatory laws and social welfare \n",
       "initiatives.[r] However, the system continues to be dominant in India, and caste-based inequality, discrimination, \n",
       "segregation, and violence persist.[387][388]\\n\\nMulti-generational patrilineal joint families have been the norm in\n",
       "India, though nuclear families are becoming common in urban areas.[389] An overwhelming majority of Indians have \n",
       "their marriages arranged by their parents or other family elders.[390] Marriage is thought to be for life,[390] and\n",
       "the divorce rate is extremely low,[391] with less than one in a thousand marriages ending in divorce.[392] Child \n",
       "marriages are common, especially in rural areas; many women wed before reaching 18, which is their legal \n",
       "marriageable age.[393] Female infanticide in India, and lately female foeticide, have created skewed gender ratios;\n",
       "the number of missing women in the country quadrupled from 15 million to 63\\xa0million in the 50 years ending in \n",
       "2014, faster than the population growth during the same period.[394] According to an Indian government study, an \n",
       "additional 21 million girls are unwanted and do not receive adequate care.[395] Despite a government ban on \n",
       "sex-selective foeticide, the practice remains commonplace in India, the result of a preference for boys in a \n",
       "patriarchal society.[396] The payment of dowry, although illegal, remains widespread across class lines.[397] \n",
       "Deaths resulting from dowry, mostly from bride burning, are on the rise, despite stringent anti-dowry \n",
       "laws.[398]\\n\\n### Education\\n\\nMain article: Education in India\\n\\nSee also: Literacy in India and History of \n",
       "education in the Indian subcontinent\\n\\n\\n\\nChildren awaiting school lunch in Rayka (also Raika), a village in \n",
       "rural Gujarat. The salutation *Jai Bhim* written on the blackboard honours the jurist, social reformer, and Dalit \n",
       "leader B. R. Ambedkar.\\n\\nIn the 2011 census, about 73% of the population was literate, with 81% for men and 65% \n",
       "for women. This compares to 1981 when the respective rates were 41%, 53% and 29%. In 1951 the rates were 18%, 27% \n",
       "and 9%. In 1921 the rates 7%, 12% and 2%. In 1891 they were 5%, 9% and 1%,[399][400] According to Latika Chaudhary,\n",
       "in 1911 there were under three primary schools for every ten villages. Statistically, more caste and religious \n",
       "diversity reduced private spending. Primary schools taught literacy, so local diversity limited its \n",
       "growth.[401]\\n\\nThe education system of India is the world\\'s second-largest.[402] India has over 900 universities,\n",
       "40,000 colleges[403] and 1.5\\xa0million schools.[404] In India\\'s higher education system, a significant number of \n",
       "seats are reserved under affirmative action policies for the historically disadvantaged. In recent decades India\\'s\n",
       "improved education system is often cited as one of the main contributors to its economic \\n\\n### Clothing\\n\\nMain \n",
       "article: Clothing in India\\n\\n\\n\\nWomen in sari at an adult literacy class in Tamil Nadu\\n\\n\\n\\nWomen (from left to\n",
       "right) in churidars and kameez (with back to the camera), jeans and sweater, and pink shalwar kameez\\n\\nFrom \n",
       "ancient times until the advent of the modern, the most widely worn traditional dress in India was draped.[407] For \n",
       "women it took the form of a sari, a single piece of cloth many yards long.[407] The sari was traditionally wrapped \n",
       "around the lower body and the shoulder.[407] In its modern form, it is combined with an underskirt, or Indian \n",
       "petticoat, and tucked in the waist band for more secure fastening. It is also commonly worn with an Indian blouse, \n",
       "or choli, which serves as the primary upper-body garment, the sari\\'s endâ\\x80\\x94passing over the \n",
       "shoulderâ\\x80\\x94covering the midriff and obscuring the upper body\\'s contours.[407] For men, a similar but shorter\n",
       "length of cloth, the dhoti, has served as a lower-body garment.[408]\\n\\nThe use of stitched clothes became \n",
       "widespread after Muslim rule was established at first by the Delhi sultanate (c.â\\x80\\x891300 CE) and then \n",
       "continued by the Mughal Empire (c.â\\x80\\x891525 CE).[409] Among the garments introduced during this time and still \n",
       "commonly worn are: the shalwars and pyjamas, both styles of trousers, and the tunics kurta and kameez.[409] In \n",
       "southern India, the traditional draped garments were to see much longer continuous use.[409]\\n\\nSalwars are \n",
       "atypically wide at the waist but narrow to a cuffed bottom. They are held up by a drawstring, which causes them to \n",
       "become pleated around the waist.[410] The pants can be wide and baggy, or they can be cut quite narrow, on the \n",
       "bias, in which case they are called churidars. When they are ordinarily wide at the waist and their bottoms are \n",
       "hemmed but not cuffed, they are called pyjamas. The kameez is a long shirt or tunic,[411] its side seams left open \n",
       "below the waistline.[412] The kurta is traditionally collarless and made of cotton or silk; it is worn plain or \n",
       "with embroidered decoration, such as chikan; and typically falls to either just above or just below the wearer\\'s \n",
       "knees.[413]\\n\\nIn the last 50 years, fashions have changed a great deal in India. Increasingly, in urban northern \n",
       "India, the sari is no longer the apparel of everyday wear, though they remain popular on formal occasions.[414] The\n",
       "traditional shalwar kameez is rarely worn by younger urban women, who favour churidars or jeans.[414] In office \n",
       "settings, ubiquitous air conditioning allows men to wear sports jackets year-round.[414] For weddings and formal \n",
       "occasions, men in the middle and upper classes often wear bandgala, or short Nehru jackets, with pants, with the \n",
       "groom and his groomsmen sporting sherwanis and churidars.[414] The dhoti, once the universal garment of Hindu \n",
       "males, the wearing of which in the homespun and handwoven khadi allowed Gandhi to bring Indian nationalism to the \n",
       "millions,[415]\\nis seldom seen in the cities.[414]\\n\\n### Cuisine\\n\\nMain article: Indian cuisine\\n\\n\\n\\nSouth \n",
       "Indian vegetarian thali, or platter\\n\\nThe foundation of a typical Indian meal is a cereal cooked plainly and \n",
       "complemented with flavourful savoury dishes.[416] The cooked cereal could be steamed rice; chapati, a thin \n",
       "unleavened bread;[417] the idli, a steamed breakfast cake; or dosa, a griddled pancake.[418] The savoury dishes \n",
       "might include lentils, pulses and vegetables commonly spiced with ginger and garlic, but also with a combination of\n",
       "spices that may include coriander, cumin, turmeric, cinnamon, cardamon and others.[416] They might also include \n",
       "poultry, fish, or meat dishes. In some instances, the ingredients may be mixed during the cooking \n",
       "process.[419]\\n\\nA platter, or thali, used for eating usually has a central place reserved for the cooked cereal, \n",
       "and peripheral ones for the flavourful accompaniments. The cereal and its accompaniments are eaten simultaneously \n",
       "rather than a piecemeal manner. This is accomplished by mixingâ\\x80\\x94for example of rice and lentilsâ\\x80\\x94or \n",
       "folding, wrapping, scooping or dippingâ\\x80\\x94such as chapati and cooked vegetables.[416]\\n\\n\\n\\nA tandoor chef in\n",
       "the Turkman Gate, Old Delhi, makes Khameeri roti (a Muslim-influenced style of leavened bread).[420]\\n\\nIndia has \n",
       "distinctive vegetarian cuisines, each a feature of the geographical and cultural histories of its adherents.[421] \n",
       "About 20% to 39% of India\\'s population consists of  Much of this stems from caste hierarchy, as upper castes, such\n",
       "as the Brahmins, consider vegetarian food to be \"pure\".[424][425] Although meat is eaten widely in India, the \n",
       "proportional consumption of meat in the overall diet is low.[426] Unlike China, which has increased its per capita \n",
       "meat consumption substantially in its years of increased economic growth, in India the strong dietary traditions \n",
       "have contributed to dairy, rather than meat, becoming the preferred form of animal protein consumption.[427]\\n\\nThe\n",
       "most significant import of cooking techniques into India during the last millennium occurred during the Mughal \n",
       "Empire. Dishes such as the pilaf,[428] developed in the Abbasid caliphate,[429] and cooking techniques such as the \n",
       "marinating of meat in yogurt, spread into northern India from regions to its northwest.[430] To the simple yogurt \n",
       "marinade of Persia, onions, garlic, almonds, and spices began to be added in India.[430] Rice was partially cooked \n",
       "and layered alternately with the sauteed meat, the pot sealed tightly, and slow cooked according to another Persian\n",
       "cooking technique, to produce what has today become biryani,[430] a feature of festive dining in many parts of \n",
       "India.[431] In the food served in Indian restaurants worldwide, the diversity of Indian food has been partially \n",
       "concealed by the dominance of Punjabi cuisine. The popularity of tandoori chickenâ\\x80\\x94cooked in the tandoor \n",
       "oven, which had traditionally been used for baking bread in the rural Punjab and the Delhi region, especially among\n",
       "Muslims, but which is originally from Central Asiaâ\\x80\\x94dates to the 1950s, and was caused in large part by an \n",
       "entrepreneurial response among people from the Punjab who had been displaced by the 1947 partition.[421]\\n\\n### \n",
       "Sports and recreation\\n\\nMain article: Sport in India\\n\\nSee also: Indian physical culture\\n\\n\\n\\nGirls play \n",
       "hopscotch in Jaora, Madhya Pradesh. Hopscotch has been commonly played by girls in rural India.[432]\\n\\nSeveral \n",
       "traditional sportsâ\\x80\\x94such as *kabaddi*, *kho kho*, pehlwani, *gilli-danda,* hopscotch and martial arts such \n",
       "as *Kalarippayattu* and *marma adiâ\\x80\\x94*remain popular. Chess is commonly held to have originated in India as \n",
       "*chaturaá¹\\x85ga*;[433] in recent years, there has been a rise in the number of Indian grandmasters,[434] and world\n",
       "champions.[435] Parcheesi is derived from *Pachisi*, another traditional Indian pastime, which in early modern \n",
       "times was played on a giant marble court by Mughal emperor Akbar the Great.[436]\\n\\nCricket is the most popular \n",
       "sport in India.[437] India has won two Cricket World Cups, the 1983 edition and the 2011 edition. India has won \n",
       "eight field hockey gold medals in the summer Olympics.[438]\\n\\nSee \n",
       "also\\n--------Notes\\n-----\\n\\nReferences\\n----------\\n\\nBibliography\\n------------\\n\\n### Overview\\n\\n### \n",
       "Etymology\\n\\n### History\\n\\n### Geography\\n\\n### Biodiversity\\n\\n### Politics\\n\\n### Foreign relations and \n",
       "military\\n\\n### Economy\\n\\n### Demographics\\n\\n### Art\\n\\n### Culture\\n\\nExternal \n",
       "links\\n--------------\\n\\n**India** at Wikipedia\\'s sister projects\\n\\n information**| India topics | |\\n| --- | ---\n",
       "|\\n| | History | | | --- | --- | | |  |  | | --- | --- | | Overviews |  | | Ancient |  | | Middle Kingdoms |  | | \n",
       "Middle Ages |  | | Early Modern |  | | Late modern |  | | Republic |  | | Contemporary |  | | | | |\\n| | Geography \n",
       "| | | --- | --- | | |  |  | | --- | --- | | Geology |  | | Heritage |  | | Environment |  | | Landforms |  | | \n",
       "Regions |  | | Subdivisions |  | | See also |  | | | | |\\n| | Politics | | | --- | --- | | |  |  | | --- | --- | | \n",
       "Government |  | | Law | |  |  | | --- | --- | |  | | | Enforcement | |  |  | | --- | --- | | Federal |  | | \n",
       "Intelligence |  | | | | Military |  | | Politics |  | | | | |\\n| | Economy | | | --- | --- | | |  |  | | --- | --- \n",
       "| | Companies |  | | Governance |  | | Currency |  | | Financial services |  | | History |  | | People |  | | \n",
       "States |  | | Sectors |  | | Regulator |  | | Other |  | | | | |\\n| |  | | | --- | --- | | |  |  | | --- | --- | | \n",
       "Society |  | | Culture |  | | | | |\\n\\n| Related topics | |\\n| --- | --- |\\n| | States and union territories of \n",
       "India | | | --- | --- | | States |  | | Union territories |  | |  | |    | South Asian topics | | | --- | --- | | \n",
       "Countries and regions |  | | Ethnic groups |  | | Culture |  | | Environment |  | | Economy and Politics |  | | \n",
       "History |  | | Religion |  | | Sports |  | | Education |  | | Military |  | | Science and technology |  |    | \n",
       "Members of the Commonwealth of Nations | | | --- | --- | | Sovereign states (members) |  | | Associated states |  |\n",
       "| Dependent territories | |  |  | | --- | --- | | Australia |  | | New Zealand |  | | United Kingdom |  | | | \n",
       "Source: Commonwealth Secretariat â\\x80\\x93 Member Countries | |    | South Asian Association for Regional \n",
       "Cooperation (SAARC) | | | --- | --- | | Agreements | South Asian Free Trade Area | | Summits |  | | Members |  | | \n",
       "Observers |  | | Guests |  | | Specialised agencies |  | | Related Articles |  |    | Group of Eight (G8) and Group\n",
       "of Eight + Five (G8+5) | | | --- | --- | | G8 members |  | | Representative |  | | G8+5 |  | | See also |  |    | \n",
       "G20 major economies | | | --- | --- | |  | |    | BRICS | | | --- | --- | | Membership |  | | Summits |  | | \n",
       "Leaders |  | | Sports |  | | Related |  |    | East Asia Summit (EAS) | | | --- | --- | |  | | |  | |    | Shanghai\n",
       "Cooperation Organisation (SCO) | | | --- | --- | | Summits |  | | Member states |  | | Observer states |  | | \n",
       "Dialogue partners |  | | Guests |  | | Applicants |  | | See also |  |    | Countries and dependencies of Asia | | \n",
       "| --- | --- | | Sovereign states |  | | States with limited recognition |  | | Dependent territories | |  |  | | \n",
       "--- | --- | | Australia |  | | Greece |  | | United Kingdom |  | | | Special administrative regions of China |  | |\n",
       "| | | |\\n\\nPortals:\\n\\n| Authority control databases Edit this at Wikidata | |\\n| --- | --- |\\n| International |  \n",
       "|\\n| National |  |\\n| Geographic |  |\\n| Academics |  |\\n| Artists |  |\\n| People |  |\\n| Other |  |\\n\\n21°N \n",
       "78°E / 21°N 78°E / 21; 78\\n\\n\\n\\nRetrieved from \\n\\nCategories:\\n\\nHidden \n",
       "categories:\\n\\nSearch\\n\\nSearch\\n\\nIndia\\n\\n305 languages\\n\\nAdd topic'\n",
       ")\n",
       "
\n" ], "text/plain": [ "\u001b[1;35mDocument\u001b[0m\u001b[1m(\u001b[0m\n", " \u001b[33mmetadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'source'\u001b[0m: \u001b[32m'https://en.wikipedia.org/wiki/India'\u001b[0m\u001b[1m}\u001b[0m,\n", " \u001b[33mpage_content\u001b[0m=\u001b[32m'India - Wikipedia\\n\\nJump to content\\n\\nCoordinates: 21°N 78°E\\ufeff / \\ufeff21°N 78°E\\ufeff \u001b[0m\n", "\u001b[32m/ 21; 78\\n\\n!\u001b[0m\u001b[32m[\u001b[0m\u001b[32mFeatured \\n\\n!\u001b[0m\u001b[32m[\u001b[0m\u001b[32mExtended-protected \\n\\nFrom Wikipedia, the free encyclopedia\\n\\nCountry in South \u001b[0m\n", "\u001b[32mAsia\\n\\nThis article is about the country. For other uses, see India \u001b[0m\u001b[32m(\u001b[0m\u001b[32mdisambiguation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n\\n| Republic of India \u001b[0m\n", "\u001b[32m*BhÄ\\x81rat Gaá¹\\x87arÄ\\x81jya* | |\\n| --- | --- |\\n| Horizontal tricolour flag bearing, from top to bottom, deep \u001b[0m\n", "\u001b[32msaffron, white, and green horizontal bands. In the centre of the white band is a navy-blue wheel with 24 spokes. 
\u001b[0m\n", "\u001b[32mFlag State emblem | |\\n| **Motto:**Satyameva Jayate\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32mSanskrit\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \"Truth Alone Triumphs\"\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m | |\\n| \u001b[0m\n", "\u001b[32m**Anthem:**\\xa0Jana Gana Mana\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32mHindi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m[\u001b[0m\u001b[32ma\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m3\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \"Thou Art the Ruler of the Minds of All People\"\u001b[0m\u001b[32m[\u001b[0m\u001b[32m4\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m | |\\n| \u001b[0m\n", "\u001b[32m**National song: Vande Mataram\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32mSanskrit\u001b[0m\u001b[32m)\u001b[0m\u001b[32m[\u001b[0m\u001b[32mc\u001b[0m\u001b[32m]\u001b[0m\u001b[32m** \"I Bow to Thee, Mother\"\u001b[0m\u001b[32m[\u001b[0m\u001b[32mb\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m1\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m | |\\n| Image of a globe centred \u001b[0m\n", "\u001b[32mon India, with India highlighted. 
Territory controlled by India Territory claimed but not controlled | |\\n| \u001b[0m\n", "\u001b[32mCapital | New Delhi 28°36â\\x80²50â\\x80³N 77°12â\\x80²30â\\x80³E / 28.61389°N 77.20833°E / 28.61389; \u001b[0m\n", "\u001b[32m77.20833 |\\n| Largest city by city proper population | Mumbai |\\n| Largest city by metropolitan area population | \u001b[0m\n", "\u001b[32mDelhi |\\n| Official\\xa0languages | |\\n| Recognised regional\\xa0languages | State level and Eighth Schedule\u001b[0m\u001b[32m[\u001b[0m\u001b[32m9\u001b[0m\u001b[32m]\u001b[0m\u001b[32m |\\n|\u001b[0m\n", "\u001b[32mNative languages | 424 languages\u001b[0m\u001b[32m[\u001b[0m\u001b[32mg\u001b[0m\u001b[32m]\u001b[0m\u001b[32m |\\n| Religion \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2011\u001b[0m\u001b[32m)\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m | |\\n| Demonym\u001b[0m\u001b[32m(\u001b[0m\u001b[32ms\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | |\\n| Government | Federal \u001b[0m\n", "\u001b[32mparliamentary republic |\\n| | |\\n| â\\x80¢\\xa0President | Droupadi Murmu |\\n| â\\x80¢\\xa0Prime Minister | Narendra \u001b[0m\n", "\u001b[32mModi |\\n| | |\\n| Legislature | Parliament |\\n| â\\x80¢\\xa0Upper house | Rajya Sabha |\\n| â\\x80¢\\xa0Lower house | \u001b[0m\n", "\u001b[32mLok Sabha |\\n| Independence from the United Kingdom | |\\n| | |\\n| â\\x80¢\\xa0Dominion | 15 August 1947 |\\n| \u001b[0m\n", "\u001b[32mâ\\x80¢\\xa0Republic | 26 January 1950 |\\n| | |\\n| Area | |\\n| â\\x80¢\\xa0Total | 3,287,263\\xa0km2 \u001b[0m\n", "\u001b[32m(\u001b[0m\u001b[32m1,269,219\\xa0sq\\xa0mi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m[\u001b[0m\u001b[32m2\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32mh\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m7th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| â\\x80¢\\xa0Water\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32m%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | 9.6 |\\n| Population | |\\n| 
â\\x80¢\\xa02023 \u001b[0m\n", "\u001b[32mestimate | Neutral increase 1,428,627,663\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1st\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| â\\x80¢\\xa02011\\xa0census | Neutral increase \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2nd\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| \u001b[0m\n", "\u001b[32mâ\\x80¢\\xa0Density | 430.5/km2 \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1,115.0/sq\\xa0mi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m30th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| GDP\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32mPPP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | 2025\\xa0estimate |\\n| â\\x80¢\\xa0Total |\u001b[0m\n", "\u001b[32mIncrease $17.647 trillion\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m3rd\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| â\\x80¢\\xa0Per capita | Increase $12,132\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m119th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| GDP\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32mnominal\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32m| 2025\\xa0estimate |\\n| â\\x80¢\\xa0Total | Increase $4.187 trillion\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m4th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| â\\x80¢\\xa0Per capita | Increase \u001b[0m\n", "\u001b[32m$2,878\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m136th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| Gini\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32m2021\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | Positive decrease\\xa025.5\u001b[0m\u001b[32m[\u001b[0m\u001b[32m17\u001b[0m\u001b[32m]\u001b[0m\u001b[32m low 
inequality |\\n| HDI\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32m2023\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | \u001b[0m\n", "\u001b[32mIncrease\\xa00.685\u001b[0m\u001b[32m[\u001b[0m\u001b[32m18\u001b[0m\u001b[32m]\u001b[0m\u001b[32m medium\\xa0\u001b[0m\u001b[32m(\u001b[0m\u001b[32m130th\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| Currency | Indian rupee \u001b[0m\u001b[32m(\u001b[0m\u001b[32mâ\\x82¹\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32mINR\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| Time zone | UTC+05:30 \u001b[0m\n", "\u001b[32m(\u001b[0m\u001b[32mIST\u001b[0m\u001b[32m)\u001b[0m\u001b[32m |\\n| Date format | |\\n| Calling code | +91 |\\n| ISO 3166 code | IN |\\n| Internet TLD | .in \u001b[0m\u001b[32m(\u001b[0m\u001b[32mothers\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32m|\\n\\n**India**, officially the **Republic of India**,\u001b[0m\u001b[32m[\u001b[0m\u001b[32mj\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m20\u001b[0m\u001b[32m]\u001b[0m\u001b[32m is a country in South Asia. It is the seventh-largest \u001b[0m\n", "\u001b[32mcountry by area; the most populous country since 2023;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m21\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and, since its independence in 1947, the world\\'s most \u001b[0m\n", "\u001b[32mpopulous Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the \u001b[0m\n", "\u001b[32msoutheast, it shares land borders with Pakistan to the west;\u001b[0m\u001b[32m[\u001b[0m\u001b[32mk\u001b[0m\u001b[32m]\u001b[0m\u001b[32m China, Nepal, and Bhutan to the north; and \u001b[0m\n", "\u001b[32mBangladesh and Myanmar to the east. 
In the Indian Ocean, India is near Sri Lanka and the Maldives; its Andaman and \u001b[0m\n", "\u001b[32mNicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.\\n\\nModern humans arrived on the \u001b[0m\n", "\u001b[32mIndian subcontinent from Africa no later than 55,000 years ago.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m26\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m27\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m28\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Their long occupation, predominantly in\u001b[0m\n", "\u001b[32misolation as hunter-gatherers, has made the region highly diverse.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m29\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Settled life emerged on the subcontinent in \u001b[0m\n", "\u001b[32mthe western margins of the Indus river basin 9,000 years ago, evolving gradually into the Indus Valley Civilisation\u001b[0m\n", "\u001b[32mof the third millennium BCE.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m30\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By 1200\\xa0BCE, an archaic form of Sanskrit, an Indo-European language, had \u001b[0m\n", "\u001b[32mdiffused into India from the northwest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m31\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m32\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Its hymns recorded the early dawnings of Hinduism in India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m33\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mIndia\\'s pre-existing Dravidian languages were supplanted in the northern regions.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m34\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By 400\\xa0BCE, caste had \u001b[0m\n", "\u001b[32memerged within Hinduism,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m35\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and Buddhism and Jainism had arisen, proclaiming social orders unlinked to \u001b[0m\n", "\u001b[32mheredity.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m36\u001b[0m\u001b[32m]\u001b[0m\u001b[32m 
Early political consolidations gave rise to the loose-knit Maurya and Gupta Empires.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m37\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Widespread \u001b[0m\n", "\u001b[32mcreativity suffused this era,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m38\u001b[0m\u001b[32m]\u001b[0m\u001b[32m but the status of women declined,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m39\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and untouchability became an organized \u001b[0m\n", "\u001b[32mbelief.\u001b[0m\u001b[32m[\u001b[0m\u001b[32ml\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m40\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In South India, the Middle kingdoms exported Dravidian language scripts and religious cultures to \u001b[0m\n", "\u001b[32mthe kingdoms of Southeast Asia.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m41\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIn the early medieval era, Christianity, Islam, Judaism, and Zoroastrianism \u001b[0m\n", "\u001b[32mbecame established on India\\'s southern and western coasts.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m42\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Muslim armies from Central Asia intermittently \u001b[0m\n", "\u001b[32moverran India\\'s northern plains in the second millennium.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m43\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The resulting Delhi Sultanate drew northern India \u001b[0m\n", "\u001b[32minto the cosmopolitan networks of medieval Islam.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m44\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In south India, the Vijayanagara Empire created a \u001b[0m\n", "\u001b[32mlong-lasting composite Hindu culture.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m45\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the Punjab, Sikhism emerged, rejecting institutionalised religion.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m46\u001b[0m\u001b[32m]\u001b[0m\n", "\u001b[32mThe Mughal Empire ushered in two centuries of economic expansion and relative 
peace,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m47\u001b[0m\u001b[32m]\u001b[0m\u001b[32m leaving a rich \u001b[0m\n", "\u001b[32marchitectural legacy.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m48\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m49\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Gradually expanding rule of the British East India Company turned India into a \u001b[0m\n", "\u001b[32mcolonial economy but consolidated its sovereignty.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m50\u001b[0m\u001b[32m]\u001b[0m\u001b[32m British Crown rule began in 1858. The rights promised to \u001b[0m\n", "\u001b[32mIndians were granted slowly,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m51\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m52\u001b[0m\u001b[32m]\u001b[0m\u001b[32m but technological changes were introduced, and modern ideas of education and \u001b[0m\n", "\u001b[32mthe public life took root.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m53\u001b[0m\u001b[32m]\u001b[0m\u001b[32m A nationalist movement emerged in India, the first in the non-European British \u001b[0m\n", "\u001b[32mempire and an influence on other nationalist movements.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m54\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m55\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Noted for nonviolent resistance after 1920,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m56\u001b[0m\u001b[32m]\u001b[0m\u001b[32m it \u001b[0m\n", "\u001b[32mbecame the primary factor in ending British rule.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m57\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 1947, the British Indian Empire was partitioned into two \u001b[0m\n", "\u001b[32mindependent a Hindu-majority dominion of India and a Muslim-majority dominion of Pakistan. 
A large-scale loss of \u001b[0m\n", "\u001b[32mlife and an unprecedented migration accompanied the partition.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m62\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia has been a federal republic since 1950,\u001b[0m\n", "\u001b[32mgoverned through a democratic parliamentary system. It is a pluralistic, multilingual and multi-ethnic society. \u001b[0m\n", "\u001b[32mIndia\\'s population grew from 361 million in 1951 to over 1.4 billion in 2023.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m63\u001b[0m\u001b[32m]\u001b[0m\u001b[32m During this time, its nominal \u001b[0m\n", "\u001b[32mper capita income increased from US$64 annually to US$2,601, and its literacy rate from 16.6% to 74%. A \u001b[0m\n", "\u001b[32mcomparatively destitute country in 1951,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m64\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has become a fast-growing major economy and hub for information \u001b[0m\n", "\u001b[32mtechnology services; it has an expanding middle class.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m65\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Indian movies and music increasingly influence global \u001b[0m\n", "\u001b[32mculture.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m66\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has reduced its poverty rate, though at the cost of increasing economic inequality.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m67\u001b[0m\u001b[32m]\u001b[0m\u001b[32m It is a \u001b[0m\n", "\u001b[32mnuclear-weapon state that ranks high in military expenditure. 
It has disputes over Kashmir with its neighbours, \u001b[0m\n", "\u001b[32mPakistan and China, unresolved since the mid-20th century.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m68\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Among the socio-economic challenges India faces are \u001b[0m\n", "\u001b[32mgender inequality, child malnutrition,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m69\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and rising levels of air pollution.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m70\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s land is megadiverse \u001b[0m\n", "\u001b[32mwith four biodiversity hotspots.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m71\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s wildlife, which has traditionally been viewed with tolerance in its \u001b[0m\n", "\u001b[32mculture,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m72\u001b[0m\u001b[32m]\u001b[0m\u001b[32m is supported in protected habitats.\\n\\nEtymology\\n---------\\n\\nMain article: Names for \u001b[0m\n", "\u001b[32mIndia\\n\\nAccording to the *Oxford English Dictionary* \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2009\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, the name \"India\" is derived from the Classical Latin \u001b[0m\n", "\u001b[32m*India*, a reference to South Asia and an uncertain region to its east. 
In turn \"India\" derived successively from \u001b[0m\n", "\u001b[32mHellenistic Greek *India* \u001b[0m\u001b[32m(\u001b[0m\u001b[32mἸνδία\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Ancient Greek *Indos* \u001b[0m\u001b[32m(\u001b[0m\u001b[32mἸνδÏ\\x8cÏ\\x82\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Old Persian *Hindush* \u001b[0m\u001b[32m(\u001b[0m\u001b[32man \u001b[0m\n", "\u001b[32meastern province of the Achaemenid Empire\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and ultimately its cognate, the Sanskrit *Sindhu*, or the Indus River,\u001b[0m\n", "\u001b[32mand by extension its well-settled southern basin.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m73\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m74\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Ancient Greeks referred to the Indians as *Indoi*, \u001b[0m\n", "\u001b[32m\\'the people of the Indus\\'.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m75\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe term *Bharat* \u001b[0m\u001b[32m(\u001b[0m\u001b[32m*BhÄ\\x81rat*; pronounced \u001b[0m\u001b[32m[\u001b[0m\u001b[32mË\\x88bʱaË\\x90ɾÉ\\x99t\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mâ\\x93\\x98\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, mentioned in both Indian epic poetry and the Constitution of India,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m76\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m77\u001b[0m\u001b[32m]\u001b[0m\u001b[32m is used in its variations \u001b[0m\n", "\u001b[32mby many Indian languages. 
A modern rendering of the historical name *Bharatavarsha*, which applied originally to \u001b[0m\n", "\u001b[32mNorth India,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m78\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m79\u001b[0m\u001b[32m]\u001b[0m\u001b[32m *Bharat* gained increased currency from the mid-19th century as a native name for \u001b[0m\n", "\u001b[32mIndia.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m76\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m80\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n*Hindustan* \u001b[0m\u001b[32m(\u001b[0m\u001b[32m[\u001b[0m\u001b[32mɦɪndÊ\\x8aË\\x88staË\\x90n\u001b[0m\u001b[32m]\u001b[0m\u001b[32m â\\x93\\x98\u001b[0m\u001b[32m)\u001b[0m\u001b[32m is a Middle Persian name for India that \u001b[0m\n", "\u001b[32mbecame popular by the 13th century,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m81\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and was used widely since the era of the Mughal Empire. The meaning of \u001b[0m\n", "\u001b[32m*Hindustan* has varied, referring to a region encompassing the northern Indian subcontinent \u001b[0m\u001b[32m(\u001b[0m\u001b[32mpresent-day northern \u001b[0m\n", "\u001b[32mIndia and Pakistan\u001b[0m\u001b[32m)\u001b[0m\u001b[32m or to India in its near \\n\\nHistory\\n-------\\n\\nMain articles: History of India and History of \u001b[0m\n", "\u001b[32mthe Republic of India\\n\\n### Ancient India\\n\\n\\n\\nManuscript illustration, c.â\\x80\\x891650, of the Sanskrit epic \u001b[0m\n", "\u001b[32mRamayana, composed in story-telling fashion c.â\\x80\\x89400\\xa0BCE\\xa0â\\x80\\x93 c.â\\x80\\x89300\\xa0CE\u001b[0m\u001b[32m[\u001b[0m\u001b[32m83\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nBy \u001b[0m\n", "\u001b[32m55,000 years ago, the first modern humans, or *Homo sapiens*, had arrived on the Indian subcontinent from \u001b[0m\n", 
"\u001b[32mAfrica.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m26\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m27\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m28\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The earliest known modern human remains in South Asia date to about 30,000 years ago.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m26\u001b[0m\u001b[32m]\u001b[0m\u001b[32m After\u001b[0m\n", "\u001b[32m6500\\xa0BCE, evidence for domestication of food crops and animals, construction of permanent structures, and \u001b[0m\n", "\u001b[32mstorage of agricultural surplus appeared in Mehrgarh and other sites in Balochistan, Pakistan.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m84\u001b[0m\u001b[32m]\u001b[0m\u001b[32m These gradually \u001b[0m\n", "\u001b[32mdeveloped into the Indus Valley the first urban culture in South Asia,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m86\u001b[0m\u001b[32m]\u001b[0m\u001b[32m which flourished during \u001b[0m\n", "\u001b[32m2500â\\x80\\x931900\\xa0BCE in Pakistan and western India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m87\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Centred around cities such as Mohenjo-daro, Harappa, \u001b[0m\n", "\u001b[32mDholavira, and Kalibangan, and relying on varied forms of subsistence, the civilisation engaged robustly in crafts \u001b[0m\n", "\u001b[32mproduction and wide-ranging trade.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m86\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nDuring the period 2000â\\x80\\x93500\\xa0BCE, many regions of the \u001b[0m\n", "\u001b[32msubcontinent transitioned from the Chalcolithic cultures to the Iron Age ones.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Vedas, the oldest scriptures\u001b[0m\n", "\u001b[32massociated with Hinduism,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m89\u001b[0m\u001b[32m]\u001b[0m\u001b[32m were composed during this period,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m90\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and historians have 
analysed these to posit a \u001b[0m\n", "\u001b[32mVedic culture in the Punjab region and the upper Gangetic Plain.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Most historians also consider this period to \u001b[0m\n", "\u001b[32mhave encompassed several waves of Indo-Aryan migration into the subcontinent from the north-west.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m89\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The caste \u001b[0m\n", "\u001b[32msystem, which created a hierarchy of priests, warriors, and free peasants, but which excluded indigenous peoples by\u001b[0m\n", "\u001b[32mlabelling their occupations impure, arose during this period.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m91\u001b[0m\u001b[32m]\u001b[0m\u001b[32m On the Deccan Plateau, archaeological evidence \u001b[0m\n", "\u001b[32mfrom this period suggests the existence of a chiefdom stage of political organisation.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m88\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In South India, a \u001b[0m\n", "\u001b[32mprogression to sedentary life is indicated by the large number of megalithic monuments dating from this period,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m92\u001b[0m\u001b[32m]\u001b[0m\n", "\u001b[32mas well as by nearby traces of agriculture, irrigation tanks, and craft traditions.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m92\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nCave 26 of the \u001b[0m\n", "\u001b[32mrock-cut Ajanta Caves\\n\\nIn the late Vedic period, around the 6th century BCE, the small states and chiefdoms of \u001b[0m\n", "\u001b[32mthe Ganges Plain and the north-western regions had consolidated into 16 major oligarchies and monarchies that were \u001b[0m\n", "\u001b[32mknown as the The emerging urbanisation gave rise to non-Vedic religious movements, two of which became independent\u001b[0m\n", "\u001b[32mreligions. 
Jainism came into prominence during the life of its exemplar, Mahavira.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m95\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Buddhism, based on the \u001b[0m\n", "\u001b[32mteachings of Gautama Buddha, attracted followers from all social classes excepting the middle class; chronicling \u001b[0m\n", "\u001b[32mthe life of the Buddha was central to the beginnings of recorded history in India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m96\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m97\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m98\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In an age of \u001b[0m\n", "\u001b[32mincreasing urban wealth, both religions held up renunciation as an ideal,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m99\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and both established long-lasting \u001b[0m\n", "\u001b[32mmonastic traditions. Politically, by the 3rd century BCE, the kingdom of Magadha had annexed or reduced other \u001b[0m\n", "\u001b[32mstates to emerge as the Maurya Empire.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m100\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The empire was once thought to have controlled most of the subcontinent\u001b[0m\n", "\u001b[32mexcept the far south, but its core regions are now thought to have been separated by large autonomous \u001b[0m\n", "\u001b[32mareas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m101\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m102\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Mauryan kings are known as much for their empire-building and determined management of public \u001b[0m\n", "\u001b[32mlife as for Ashoka\\'s renunciation of militarism and far-flung advocacy of the Buddhist *dhamma*.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m103\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m104\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe \u001b[0m\n", "\u001b[32mSangam literature of the Tamil language reveals that, between 200\\xa0BCE and 200\\xa0CE, the 
southern peninsula was \u001b[0m\n", "\u001b[32mruled by the Cheras, the Cholas, and the Pandyas, dynasties that traded extensively with the Roman Empire and with \u001b[0m\n", "\u001b[32mWest and Southeast Asia.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m105\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m106\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In North India, Hinduism asserted patriarchal control within the family, leading\u001b[0m\n", "\u001b[32mto increased subordination of women.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m107\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m100\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By the 4th and 5th centuries, the Gupta Empire had created a complex\u001b[0m\n", "\u001b[32msystem of administration and taxation in the greater Ganges Plain; this system became a model for later Indian \u001b[0m\n", "\u001b[32mkingdoms.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m108\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m109\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Under the Guptas, a renewed Hinduism based on devotion, rather than the management of ritual, \u001b[0m\n", "\u001b[32mbegan to assert itself.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m110\u001b[0m\u001b[32m]\u001b[0m\u001b[32m This renewal was reflected in a flowering of sculpture and architecture, which found \u001b[0m\n", "\u001b[32mpatrons among an urban elite.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m109\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Classical Sanskrit literature flowered as well, and Indian science, astronomy, \u001b[0m\n", "\u001b[32mmedicine, and mathematics made significant advances.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m109\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Medieval India\\n\\nMain article: Medieval \u001b[0m\n", "\u001b[32mIndia\\n\\n\\n\\nBrihadeshwara temple, Thanjavur, completed in 1010\\xa0CE\\n\\n\\n\\nThe Qutub Minar, 73\\xa0m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m240\\xa0ft\u001b[0m\u001b[32m)\u001b[0m\u001b[32m \u001b[0m\n", 
"\u001b[32mtall, completed by the Sultan of Delhi, Iltutmish\\n\\nThe Indian early medieval age, from 600 to 1200\\xa0CE, is \u001b[0m\n", "\u001b[32mdefined by regional kingdoms and cultural diversity.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m111\u001b[0m\u001b[32m]\u001b[0m\u001b[32m When Harsha of Kannauj, who ruled much of the \u001b[0m\n", "\u001b[32mIndo-Gangetic Plain from 606 to 647\\xa0CE, attempted to expand southwards, he was defeated by the Chalukya ruler of\u001b[0m\n", "\u001b[32mthe Deccan.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m112\u001b[0m\u001b[32m]\u001b[0m\u001b[32m When his successor attempted to expand eastwards, he was defeated by the Pala king of Bengal.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m112\u001b[0m\u001b[32m]\u001b[0m\n", "\u001b[32mWhen the Chalukyas attempted to expand southwards, they were defeated by the Pallavas from farther south, who in \u001b[0m\n", "\u001b[32mturn were opposed by the Pandyas and the Cholas from still farther south.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m112\u001b[0m\u001b[32m]\u001b[0m\u001b[32m No ruler of this period was able to \u001b[0m\n", "\u001b[32mcreate an empire and consistently control lands much beyond their core region.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m111\u001b[0m\u001b[32m]\u001b[0m\u001b[32m During this time, pastoral \u001b[0m\n", "\u001b[32mpeoples, whose land had been cleared to make way for the growing agricultural economy, were accommodated within \u001b[0m\n", "\u001b[32mcaste society, as were new non-traditional ruling classes.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m113\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The caste system consequently began to show \u001b[0m\n", "\u001b[32mregional differences.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m113\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIn the 6th and 7th centuries, the first devotional hymns were created in the Tamil \u001b[0m\n", "\u001b[32mlanguage.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m114\u001b[0m\u001b[32m]\u001b[0m\u001b[32m They were 
imitated all over India and led to both the resurgence of Hinduism and the development of \u001b[0m\n", "\u001b[32mall modern languages of the subcontinent.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m114\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Indian royalty, big and small, and the temples they patronised drew \u001b[0m\n", "\u001b[32mcitizens in great numbers to the capital cities, which became economic hubs as well.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m115\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Temple towns of various \u001b[0m\n", "\u001b[32msizes began to appear everywhere as India underwent another urbanisation.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m115\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By the 8th and 9th centuries, the \u001b[0m\n", "\u001b[32meffects were felt in Southeast Asia, as South Indian culture and political systems were exported to lands that \u001b[0m\n", "\u001b[32mbecame part of modern-day Myanmar, Thailand, Laos, Brunei, Cambodia, Vietnam, Philippines, Malaysia, and \u001b[0m\n", "\u001b[32mIndonesia.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m116\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Indian merchants, scholars, and sometimes armies were involved in this transmission; Southeast \u001b[0m\n", "\u001b[32mAsians took the initiative as well, with many sojourning in Indian seminaries and translating Buddhist and Hindu \u001b[0m\n", "\u001b[32mtexts into their languages.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m116\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAfter the 10th century, Muslim Central Asian nomadic clans, using swift-horse \u001b[0m\n", "\u001b[32mcavalry and raising vast armies united by ethnicity and religion, repeatedly overran South Asia\\'s north-western \u001b[0m\n", "\u001b[32mplains, leading eventually to the establishment of the Islamic Delhi Sultanate in 1206.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m117\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The sultanate was to \u001b[0m\n", "\u001b[32mcontrol much of North India and to make many forays 
into South India. Although at first disruptive for the Indian \u001b[0m\n", "\u001b[32melites, the sultanate largely left its vast non-Muslim subject population to its own laws and customs.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m118\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m119\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By\u001b[0m\n", "\u001b[32mrepeatedly repulsing Mongol raiders in the 13th century, the sultanate saved India from the devastation visited on \u001b[0m\n", "\u001b[32mWest and Central Asia, setting the scene for centuries of migration of fleeing soldiers, learned men, mystics, \u001b[0m\n", "\u001b[32mtraders, artists, and artisans from that region into the subcontinent, thereby creating a syncretic Indo-Islamic \u001b[0m\n", "\u001b[32mculture in the north.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m120\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m121\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The sultanate\\'s raiding and weakening of the regional kingdoms of South India \u001b[0m\n", "\u001b[32mpaved the way for the indigenous Vijayanagara Empire.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m122\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Embracing a strong Shaivite tradition and building upon \u001b[0m\n", "\u001b[32mthe military technology of the sultanate, the empire came to control much of peninsular India,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m123\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and was to \u001b[0m\n", "\u001b[32minfluence South Indian society for long afterwards.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m122\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Early modern India\\n\\n\\n\\nA distant view of the Taj\u001b[0m\n", "\u001b[32mMahal from the Agra Fort\\n\\n\\n\\nA two mohur Company gold coin, issued in 1835, the obverse inscribed \"William IIII,\u001b[0m\n", "\u001b[32mKing\"\\n\\nIn the early 16th century, northern India, then under mainly Muslim 
rulers,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m124\u001b[0m\u001b[32m]\u001b[0m\u001b[32m fell again to the \u001b[0m\n", "\u001b[32msuperior mobility and firepower of a new generation of Central Asian warriors.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m125\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The resulting Mughal Empire did\u001b[0m\n", "\u001b[32mnot stamp out the local societies it came to rule. Instead, it balanced and pacified them through new \u001b[0m\n", "\u001b[32madministrative practices\u001b[0m\u001b[32m[\u001b[0m\u001b[32m126\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m127\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and diverse and inclusive ruling elites,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m128\u001b[0m\u001b[32m]\u001b[0m\u001b[32m leading to more systematic, \u001b[0m\n", "\u001b[32mcentralised, and uniform rule.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m129\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Eschewing tribal bonds and Islamic identity, especially under Akbar, the \u001b[0m\n", "\u001b[32mMughals united their far-flung realms through loyalty, expressed through a Persianised culture, to an emperor who \u001b[0m\n", "\u001b[32mhad near-divine status.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m128\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Mughal state\\'s economic policies, deriving most revenues from agriculture\u001b[0m\u001b[32m[\u001b[0m\u001b[32m130\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mand mandating that taxes be paid in the well-regulated silver currency,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m131\u001b[0m\u001b[32m]\u001b[0m\u001b[32m caused peasants and artisans to enter \u001b[0m\n", "\u001b[32mlarger markets.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m129\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The relative peace maintained by the empire during much of the 17th century was a factor in \u001b[0m\n", "\u001b[32mIndia\\'s economic expansion,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m129\u001b[0m\u001b[32m]\u001b[0m\u001b[32m 
resulting in greater patronage of painting, literary forms, textiles, and \u001b[0m\n", "\u001b[32marchitecture.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m132\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Newly coherent social groups in northern and western India, such as the Marathas, the Rajputs, \u001b[0m\n", "\u001b[32mand the Sikhs, gained military and governing ambitions during Mughal rule, which, through collaboration or \u001b[0m\n", "\u001b[32madversity, gave them both recognition and military experience.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m133\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Expanding commerce during Mughal rule gave rise\u001b[0m\n", "\u001b[32mto new Indian commercial and political elites along the coasts of southern and eastern India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m133\u001b[0m\u001b[32m]\u001b[0m\u001b[32m As the empire \u001b[0m\n", "\u001b[32mdisintegrated, many among these elites were able to seek and control their own affairs.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m134\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nBy the early 18th \u001b[0m\n", "\u001b[32mcentury, with the lines between commercial and political dominance being increasingly blurred, a number of European\u001b[0m\n", "\u001b[32mtrading companies, including the English East India Company, had established coastal outposts.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m135\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m136\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The East \u001b[0m\n", "\u001b[32mIndia Company\\'s control of the seas, greater resources, and more advanced military training and technology led it \u001b[0m\n", "\u001b[32mto increasingly assert its military strength and caused it to become attractive to a portion of the Indian elite; \u001b[0m\n", "\u001b[32mthese factors were crucial in allowing the company to gain control over the Bengal region by 1765 and sideline the \u001b[0m\n", "\u001b[32mother European Its further access to the riches of Bengal and the 
subsequent increased strength and size of its \u001b[0m\n", "\u001b[32marmy enabled it to annex or subdue most of India by the 1820s.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m140\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India was then no longer exporting manufactured\u001b[0m\n", "\u001b[32mgoods as it long had, but was instead supplying the British Empire with raw materials. Many historians consider \u001b[0m\n", "\u001b[32mthis to be the onset of India\\'s colonial period.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m135\u001b[0m\u001b[32m]\u001b[0m\u001b[32m By this time, with its economic power severely curtailed by \u001b[0m\n", "\u001b[32mthe British parliament and having effectively been made an arm of British administration, the East India Company \u001b[0m\n", "\u001b[32mbegan more consciously to enter non-economic arenas, including education, social reform, and culture.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m141\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### \u001b[0m\n", "\u001b[32mModern India\\n\\nMain article: History of India \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1947â\\x80\\x93present\u001b[0m\u001b[32m)\u001b[0m\u001b[32m\\n\\n\\n\\n1909 map of the British Indian \u001b[0m\n", "\u001b[32mEmpire\\n\\n\\n\\nJawaharlal Nehru sharing a light moment with Mahatma Gandhi, Mumbai, 6 July 1946\\n\\nHistorians \u001b[0m\n", "\u001b[32mconsider India\\'s modern age to have begun sometime between 1848 and 1885. The appointment in 1848 of Lord \u001b[0m\n", "\u001b[32mDalhousie as Governor General of the East India Company set the stage for changes essential to a modern state. \u001b[0m\n", "\u001b[32mThese included the consolidation and demarcation of sovereignty, the surveillance of the population, and the \u001b[0m\n", "\u001b[32meducation of citizens. 
Technological changesâ\\x80\\x94among them, railways, canals, and the telegraphâ\\x80\\x94were \u001b[0m\n", "\u001b[32mintroduced not long after their introduction in However, disaffection with the company also grew during this time \u001b[0m\n", "\u001b[32mand set off the Indian Rebellion of 1857. Fed by diverse resentments and perceptions, including invasive \u001b[0m\n", "\u001b[32mBritish-style social reforms, harsh land taxes, and summary treatment of some rich landowners and princes, the \u001b[0m\n", "\u001b[32mrebellion rocked many regions of northern and central India and shook the foundations of Company rule.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m146\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m147\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mAlthough the rebellion was suppressed by 1858, it led to the dissolution of the East India Company and the direct \u001b[0m\n", "\u001b[32madministration of India by the British government. Proclaiming a unitary state and a gradual but limited \u001b[0m\n", "\u001b[32mBritish-style parliamentary system, the new rulers also protected princes and landed gentry as a feudal safeguard \u001b[0m\n", "\u001b[32magainst future unrest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m148\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m149\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the decades following, public life gradually emerged all over India, leading \u001b[0m\n", "\u001b[32meventually to the founding of the Indian National Congress in \\n\\nThe rush of technology and the commercialisation \u001b[0m\n", "\u001b[32mof agriculture in the second half of the 19th century was marked by economic setbacks, and many small farmers \u001b[0m\n", "\u001b[32mbecame dependent on the whims of far-away markets.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m154\u001b[0m\u001b[32m]\u001b[0m\u001b[32m There was an increase in the number of large-scale \u001b[0m\n", 
"\u001b[32mfamines,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m155\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and, despite the risks of infrastructure development borne by Indian taxpayers, little industrial \u001b[0m\n", "\u001b[32memployment was generated for Indians.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m156\u001b[0m\u001b[32m]\u001b[0m\u001b[32m There were also salutary effects: commercial cropping, especially in the\u001b[0m\n", "\u001b[32mnewly canalled Punjab, led to increased food production for internal consumption.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m157\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The railway network provided\u001b[0m\n", "\u001b[32mcritical famine relief,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m158\u001b[0m\u001b[32m]\u001b[0m\u001b[32m notably reduced the cost of moving goods,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m158\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and helped nascent Indian-owned \u001b[0m\n", "\u001b[32mindustry.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m157\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAfter World War I, in which approximately one million Indians served,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m159\u001b[0m\u001b[32m]\u001b[0m\u001b[32m a new period began. 
It\u001b[0m\n", "\u001b[32mwas marked by British reforms but also repressive legislation, by more strident Indian calls for self-rule, and by \u001b[0m\n", "\u001b[32mthe beginnings of a nonviolent movement of non-co-operation, of which Mahatma Gandhi would become the leader and \u001b[0m\n", "\u001b[32menduring symbol.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m160\u001b[0m\u001b[32m]\u001b[0m\u001b[32m During the 1930s, slow legislative reform was enacted by the British; the Indian National \u001b[0m\n", "\u001b[32mCongress won victories in the resulting elections.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m161\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The next decade was beset with crises: Indian participation\u001b[0m\n", "\u001b[32min World War\\xa0II, the Congress\\'s final push for non-co-operation, and an upsurge of Muslim nationalism. All were\u001b[0m\n", "\u001b[32mcapped by the advent of independence in 1947, but tempered by the partition of India into two states: India and \u001b[0m\n", "\u001b[32mPakistan.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m162\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nVital to India\\'s self-image as an independent nation was its constitution, completed in 1950, \u001b[0m\n", "\u001b[32mwhich put in place a secular and democratic republic.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m163\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Economic liberalisation, which began in the 1980s and \u001b[0m\n", "\u001b[32mwith the collaboration with Soviet Union for technical knowledge,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m164\u001b[0m\u001b[32m]\u001b[0m\u001b[32m has created a large urban middle class, \u001b[0m\n", "\u001b[32mtransformed India into one of the world\\'s fastest-growing economies,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m165\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and increased its geopolitical \u001b[0m\n", "\u001b[32minfluence. 
Yet, India is also shaped by persistent poverty, both rural and urban;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m166\u001b[0m\u001b[32m]\u001b[0m\u001b[32m by religious and \u001b[0m\n", "\u001b[32mcaste-related violence;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m167\u001b[0m\u001b[32m]\u001b[0m\u001b[32m by Maoist-inspired Naxalite insurgencies;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m168\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and by separatism in Jammu and Kashmir \u001b[0m\n", "\u001b[32mand in Northeast India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m169\u001b[0m\u001b[32m]\u001b[0m\u001b[32m It has unresolved territorial disputes with China and with Pakistan.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m170\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s \u001b[0m\n", "\u001b[32msustained democratic freedoms are unique among the world\\'s newer nations; however, in spite of its recent economic\u001b[0m\n", "\u001b[32msuccesses, freedom from want for its disadvantaged population remains a goal yet to be achieved.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m171\u001b[0m\u001b[32m]\u001b[0m\u001b[32m As of 2025, \u001b[0m\n", "\u001b[32mpoverty in India declined sharply, mainly due to government welfare programs.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m172\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nGeography\\n---------\\n\\nMain \u001b[0m\n", "\u001b[32marticle: Geography of India\\n\\n\\n\\nThe Tungabhadra, with rocky outcrops, flows into the peninsular Krishna \u001b[0m\n", "\u001b[32mRiver\u001b[0m\u001b[32m[\u001b[0m\u001b[32m173\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nFishing boats lashed together in a tidal creek in Anjarle village, Maharashtra\\n\\nIndia accounts \u001b[0m\n", "\u001b[32mfor the bulk of the Indian subcontinent, lying atop the Indian tectonic plate, a part of the Indo-Australian \u001b[0m\n", "\u001b[32mPlate.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m174\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s defining geological processes began 75 million years ago 
when the Indian Plate, then part of \u001b[0m\n", "\u001b[32mthe southern supercontinent Gondwana, began a north-eastward drift caused by seafloor spreading to its south-west, \u001b[0m\n", "\u001b[32mand later, south and south-east.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m174\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Simultaneously, the vast Tethyan oceanic crust, to its northeast, began to \u001b[0m\n", "\u001b[32msubduct under the Eurasian Plate.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m174\u001b[0m\u001b[32m]\u001b[0m\u001b[32m These dual processes, driven by convection in the Earth\\'s mantle, both \u001b[0m\n", "\u001b[32mcreated the Indian Ocean and caused the Indian continental crust eventually to under-thrust Eurasia and to uplift \u001b[0m\n", "\u001b[32mthe Himalayas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m174\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Immediately south of the emerging Himalayas, plate movement created a vast crescent-shaped \u001b[0m\n", "\u001b[32mtrough that rapidly filled with river-borne sediment\u001b[0m\u001b[32m[\u001b[0m\u001b[32m175\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and now constitutes the Indo-Gangetic Plain.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m176\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The \u001b[0m\n", "\u001b[32moriginal Indian plate makes its first appearance above the sediment in the ancient Aravalli range, which extends \u001b[0m\n", "\u001b[32mfrom the Delhi Ridge in a southwesterly direction. To the west lies the Thar Desert, the eastern spread of which is\u001b[0m\n", "\u001b[32mchecked by the \\n\\nThe remaining Indian Plate survives as peninsular India, the oldest and geologically most stable\u001b[0m\n", "\u001b[32mpart of India. It extends as far north as the Satpura and Vindhya ranges in central India. 
These parallel chains \u001b[0m\n", "\u001b[32mrun from the Arabian Sea coast in Gujarat in the west to the coal-rich Chota Nagpur Plateau in Jharkhand in the \u001b[0m\n", "\u001b[32meast.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m180\u001b[0m\u001b[32m]\u001b[0m\u001b[32m To the south, the remaining peninsular landmass, the Deccan Plateau, is flanked on the west and east by \u001b[0m\n", "\u001b[32mcoastal ranges known as the Western and Eastern Ghats;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m181\u001b[0m\u001b[32m]\u001b[0m\u001b[32m the plateau contains the country\\'s oldest rock \u001b[0m\n", "\u001b[32mformations, some over one billion years old. Constituted in such fashion, India lies to the north of the equator \u001b[0m\n", "\u001b[32mbetween 6° 44â\\x80² and 35° 30â\\x80² north latitude\u001b[0m\u001b[32m[\u001b[0m\u001b[32mm\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and 68° 7â\\x80² and 97° 25â\\x80² east \u001b[0m\n", "\u001b[32mlongitude.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m182\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia\\'s coastline measures 7,517 kilometres \u001b[0m\u001b[32m(\u001b[0m\u001b[32m4,700\\xa0mi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m in length; of this distance, 5,423 \u001b[0m\n", "\u001b[32mkilometres \u001b[0m\u001b[32m(\u001b[0m\u001b[32m3,400\\xa0mi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m belong to peninsular India and 2,094 kilometres \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1,300\\xa0mi\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to the Andaman, Nicobar, and\u001b[0m\n", "\u001b[32mLakshadweep island chains.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m183\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to the Indian naval hydrographic charts, the mainland coastline consists \u001b[0m\n", "\u001b[32mof the following: 43% sandy beaches; 11% rocky shores, including cliffs; and 46% mudflats or marshy shores.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m183\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mMajor Himalayan-origin rivers that 
substantially flow through India include the Ganges and the Brahmaputra, both of\u001b[0m\n", "\u001b[32mwhich drain into the Bay of Bengal.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m184\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Important tributaries of the Ganges include the Yamuna and the Kosi; the \u001b[0m\n", "\u001b[32mlatter\\'s extremely low gradient, caused by long-term silt deposition, leads to severe floods and course \u001b[0m\n", "\u001b[32mchanges.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m185\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m186\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Major peninsular rivers, whose steeper gradients prevent their waters from flooding, include the\u001b[0m\n", "\u001b[32mGodavari, the Mahanadi, the Kaveri, and the Krishna, which also drain into the Bay of Bengal;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m187\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and the Narmada \u001b[0m\n", "\u001b[32mand the Tapti, which drain into the Arabian Sea.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m188\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Coastal features include the marshy Rann of Kutch of western \u001b[0m\n", "\u001b[32mIndia and the alluvial Sundarbans delta of eastern India; the latter is shared with Bangladesh.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m189\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has two \u001b[0m\n", "\u001b[32marchipelagos: the Lakshadweep, coral atolls off India\\'s south-western coast; and the Andaman and Nicobar Islands, \u001b[0m\n", "\u001b[32ma volcanic chain in the Andaman Sea.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m190\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndian climate is strongly influenced by the Himalayas and the Thar \u001b[0m\n", "\u001b[32mDesert, both of which drive the economically and culturally pivotal summer and winter monsoons.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m191\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Himalayas \u001b[0m\n", "\u001b[32mprevent cold Central Asian katabatic winds from 
blowing in, keeping the bulk of the Indian subcontinent warmer than\u001b[0m\n", "\u001b[32mmost locations at similar latitudes.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m192\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m193\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Thar Desert plays a crucial role in attracting the \u001b[0m\n", "\u001b[32mmoisture-laden south-west summer monsoon winds that, between June and October, provide the majority of India\\'s \u001b[0m\n", "\u001b[32mrainfall.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m191\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Four major climatic groupings predominate in India: tropical wet, tropical dry, subtropical humid, \u001b[0m\n", "\u001b[32mand montane.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m194\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Temperatures in India have risen by 0.7\\xa0°C \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1.3\\xa0°F\u001b[0m\u001b[32m)\u001b[0m\u001b[32m between 1901 and 2018.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m195\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Climate \u001b[0m\n", "\u001b[32mchange in India is often thought to be the cause. 
The retreat of Himalayan glaciers has adversely affected the flow\u001b[0m\n", "\u001b[32mrate of the major Himalayan rivers, including the Ganges and the Brahmaputra.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m196\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to some current \u001b[0m\n", "\u001b[32mprojections, the number and severity of droughts in India will have markedly increased by the end of the present \u001b[0m\n", "\u001b[32mcentury.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m197\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Biodiversity\\n\\nMain articles: Forestry in India and Wildlife of India\\n\\n\\n\\nIndia has the \u001b[0m\n", "\u001b[32mmajority of the world\\'s wild tigers, approximately 3,170 in 2022.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m198\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nA chital \u001b[0m\u001b[32m(\u001b[0m\u001b[32m*Axis axis*\u001b[0m\u001b[32m)\u001b[0m\u001b[32m stag in the \u001b[0m\n", "\u001b[32mNagarhole National Park in a region covered by a moderately dense\u001b[0m\u001b[32m[\u001b[0m\u001b[32mn\u001b[0m\u001b[32m]\u001b[0m\u001b[32m forest.\\n\\n\\n\\nThree of the last Asiatic \u001b[0m\n", "\u001b[32mcheetahs in India were shot dead in 1948 in Surguja district, Madhya Pradesh, Central India by Maharajah Ramanuj \u001b[0m\n", "\u001b[32mPratap Singh Deo. 
The young male cheetahs, all from the same litter, were sitting together when they were shot at \u001b[0m\n", "\u001b[32mnight.\\n\\nIndia is a megadiverse country, a term employed for 17 countries that display high biological diversity \u001b[0m\n", "\u001b[32mand contain many species exclusively indigenous, or endemic, to them.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m199\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India is the habitat for 8.6% of all \u001b[0m\n", "\u001b[32mmammals, 13.7% of bird species, 7.9% of reptile species, 6% of amphibian species, 12.2% of fish species, and 6.0% \u001b[0m\n", "\u001b[32mof all flowering plant species.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m200\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m201\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Fully a third of Indian plant species are endemic.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m202\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India also \u001b[0m\n", "\u001b[32mcontains four of the world\\'s 34 biodiversity hotspots,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m71\u001b[0m\u001b[32m]\u001b[0m\u001b[32m or regions that display significant habitat loss in the\u001b[0m\n", "\u001b[32mpresence of high endemism.\u001b[0m\u001b[32m[\u001b[0m\u001b[32mo\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m203\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia\\'s most dense forests, such as the tropical moist forest of the Andaman\u001b[0m\n", "\u001b[32mIslands, the Western Ghats, and Northeast India, occupy approximately 3% of its land area.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m204\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m205\u001b[0m\u001b[32m]\u001b[0m\u001b[32m *Moderately \u001b[0m\n", "\u001b[32mdense forest*, whose canopy density is between 40% and 70%, occupies 9.39% of India\\'s land area.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m204\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m205\u001b[0m\u001b[32m]\u001b[0m\u001b[32m It \u001b[0m\n", 
"\u001b[32mpredominates in the temperate coniferous forest of the Himalayas, the moist deciduous *sal* forest of eastern \u001b[0m\n", "\u001b[32mIndia, and the dry deciduous teak forest of central and southern India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m206\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has two natural zones of thorn \u001b[0m\n", "\u001b[32mforest, one in the Deccan Plateau, immediately east of the Western Ghats, and the other in the western part of the \u001b[0m\n", "\u001b[32mIndo-Gangetic plain, now turned into rich agricultural land by irrigation, its features no longer visible.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m207\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mAmong the Indian subcontinent\\'s notable indigenous trees are the astringent *Azadirachta indica*, or *neem*, which\u001b[0m\n", "\u001b[32mis widely used in rural Indian herbal medicine,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m208\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and the luxuriant *Ficus religiosa*, or *peepul*,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m209\u001b[0m\u001b[32m]\u001b[0m\u001b[32m which \u001b[0m\n", "\u001b[32mis displayed on the ancient seals of Mohenjo-daro,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m210\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and under which the Buddha is recorded in the Pali canon to\u001b[0m\n", "\u001b[32mhave sought enlightenment.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m211\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nMany Indian species have descended from those of Gondwana, the southern \u001b[0m\n", "\u001b[32msupercontinent from which India separated more than 100 million years ago.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m212\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s subsequent collision with \u001b[0m\n", "\u001b[32mEurasia set off a mass exchange of species. 
However, volcanism and climatic changes later caused the extinction of \u001b[0m\n", "\u001b[32mmany endemic Indian forms.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m213\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Still later, mammals entered India from Asia through two zoogeographic passes \u001b[0m\n", "\u001b[32mflanking the Himalayas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m214\u001b[0m\u001b[32m]\u001b[0m\u001b[32m This had the effect of lowering endemism among India\\'s mammals, which stands at \u001b[0m\n", "\u001b[32m12.6%, contrasting with 45.8% among reptiles and 55.8% among amphibians.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m201\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Among endemics are the \u001b[0m\n", "\u001b[32mvulnerable\u001b[0m\u001b[32m[\u001b[0m\u001b[32m215\u001b[0m\u001b[32m]\u001b[0m\u001b[32m hooded leaf monkey\u001b[0m\u001b[32m[\u001b[0m\u001b[32m216\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and the threatened Beddome\\'s toad\u001b[0m\u001b[32m[\u001b[0m\u001b[32m217\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m218\u001b[0m\u001b[32m]\u001b[0m\u001b[32m of the Western Ghats.\\n\\nIndia\u001b[0m\n", "\u001b[32mcontains 172 IUCN-designated threatened animal species, or 2.9% of endangered forms.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m219\u001b[0m\u001b[32m]\u001b[0m\u001b[32m These include the \u001b[0m\n", "\u001b[32mendangered Bengal tiger and the Ganges river dolphin. 
Critically endangered species include the gharial, a \u001b[0m\n", "\u001b[32mcrocodilian; the great Indian bustard; and the Indian white-rumped vulture, which has become nearly extinct by \u001b[0m\n", "\u001b[32mhaving ingested the carrion of diclofenac-treated cattle.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m220\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Before they were extensively used for agriculture \u001b[0m\n", "\u001b[32mand cleared for human settlement, the thorn forests of Punjab were mingled at intervals with open grasslands that \u001b[0m\n", "\u001b[32mwere grazed by large herds of blackbuck preyed on by the Asiatic cheetah; the blackbuck, no longer extant in \u001b[0m\n", "\u001b[32mPunjab, is now severely endangered in India, and the cheetah is extinct.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m221\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The pervasive and ecologically \u001b[0m\n", "\u001b[32mdevastating human encroachment of recent decades has critically endangered Indian wildlife. In response, the system\u001b[0m\n", "\u001b[32mof national parks and protected areas, first established in 1935, was expanded substantially. 
In 1972, India \u001b[0m\n", "\u001b[32menacted the Wildlife Protection Act\u001b[0m\u001b[32m[\u001b[0m\u001b[32m222\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and Project Tiger to safeguard crucial wilderness; the Forest Conservation\u001b[0m\n", "\u001b[32mAct was enacted in 1980 and amendments added in 1988.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m223\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India hosts more than five hundred wildlife sanctuaries \u001b[0m\n", "\u001b[32mand eighteen\\xa0biosphere reserves,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m224\u001b[0m\u001b[32m]\u001b[0m\u001b[32m four of which are part of the World Network of Biosphere Reserves; its \u001b[0m\n", "\u001b[32meighty-nine wetlands are registered under the Ramsar Convention.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m225\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nPolitics and government\\n\\n\\n### \u001b[0m\n", "\u001b[32mPolitics\\n\\nMain article: Politics of India\\n\\nSee also: Democracy in India\\n\\n\\n\\nAs part of Janadesh 2007, 25,000\u001b[0m\n", "\u001b[32mproâ\\x80\\x93land reform landless people in Madhya Pradesh listen to Rajagopal P. 
V.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m226\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nUS president Barack\u001b[0m\n", "\u001b[32mObama addresses the members of the Parliament of India in New Delhi in November 2010.\\n\\nIndia is a parliamentary \u001b[0m\n", "\u001b[32mrepublic with a multi-party system.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m227\u001b[0m\u001b[32m]\u001b[0m\u001b[32m It has six\\xa0recognised national parties, including the Indian National \u001b[0m\n", "\u001b[32mCongress \u001b[0m\u001b[32m(\u001b[0m\u001b[32mINC\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and the Bharatiya Janata Party \u001b[0m\u001b[32m(\u001b[0m\u001b[32mBJP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and over 50\\xa0regional parties.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m228\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Congress is considered \u001b[0m\n", "\u001b[32mthe ideological centre in Indian political culture,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m229\u001b[0m\u001b[32m]\u001b[0m\u001b[32m whereas the BJP is right-wing to From 1950 to the late \u001b[0m\n", "\u001b[32m1980s, Congress held a majority in the India\\'s parliament. Afterwards, it increasingly shared power with the \u001b[0m\n", "\u001b[32mBJP,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m233\u001b[0m\u001b[32m]\u001b[0m\u001b[32m as well as with powerful regional parties, which forced multi-party coalition governments at the \u001b[0m\n", "\u001b[32mcentre.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m234\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIn the Republic of India\\'s general elections in 1951, 1957, and 1962, Congress, led by Jawaharlal \u001b[0m\n", "\u001b[32mNehru, won easy victories. 
On Nehru\\'s death in 1964, Lal Bahadur Shastri briefly became prime minister; he was \u001b[0m\n", "\u001b[32msucceeded in 1966, by Nehru\\'s daughter Indira Gandhi, who led the Congress to election victories in 1967 and 1971.\u001b[0m\n", "\u001b[32mFollowing public discontent with the state of emergency Indira Gandhi had declared in 1975, Congress was voted out \u001b[0m\n", "\u001b[32mof power in 1977; Janata Party, which had opposed the emergency, was voted in. Its government lasted two years; \u001b[0m\n", "\u001b[32mMorarji Desai and Charan Singh served as prime ministers. After Congress was returned to power in 1980, Indira \u001b[0m\n", "\u001b[32mGandhi was assassinated and succeeded by Rajiv Gandhi, who won easily in the elections later that year. In the 1989\u001b[0m\n", "\u001b[32melections a National Front coalition, led by the Janata Dal in alliance with the Left Front, won, lasting just \u001b[0m\n", "\u001b[32munder two years, and V.P. Singh and Chandra Shekhar serving as prime ministers.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m235\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the 1991 Indian general \u001b[0m\n", "\u001b[32melection, Congress, as the largest single party, formed a minority government led by P. V. Narasimha \u001b[0m\n", "\u001b[32mRao.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m236\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAfter the 1996 Indian general election, the BJP formed a government briefly; it was followed by United\u001b[0m\n", "\u001b[32mFront coalitions, which depended on external political support. Two prime ministers served during this period: H.D.\u001b[0m\n", "\u001b[32mDeve Gowda and I.K. Gujral. In 1998, the BJP formed a coalitionâ\\x80\\x94the National Democratic Alliance \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNDA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. 
Led\u001b[0m\n", "\u001b[32mby Atal Bihari Vajpayee, the NDA became the first non-Congress, coalition government to complete a five-year \u001b[0m\n", "\u001b[32mterm.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m237\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the 2004 Indian general elections, no party won an absolute majority. Still, the Congress emerged as \u001b[0m\n", "\u001b[32mthe largest single party, forming another successful coalition: the United Progressive Alliance \u001b[0m\u001b[32m(\u001b[0m\u001b[32mUPA\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. It had the \u001b[0m\n", "\u001b[32msupport of left-leaning parties and MPs who opposed the BJP. The UPA returned to power in the 2009 general election\u001b[0m\n", "\u001b[32mwith increased numbers, and it no longer required external support from India\\'s communist parties.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m238\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Manmohan \u001b[0m\n", "\u001b[32mSingh became the first prime minister since Jawaharlal Nehru in 1957 and 1962 to be re-elected to a consecutive \u001b[0m\n", "\u001b[32mfive-year term.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m239\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the 2014 general election, the BJP became the first political party since 1984 to win an \u001b[0m\n", "\u001b[32mabsolute majority.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m240\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the 2019 general election, the BJP regained an absolute majority. In the 2024 general \u001b[0m\n", "\u001b[32melection, a BJP-led NDA coalition formed the government. 
Narendra Modi, a former chief minister of Gujarat, is \u001b[0m\n", "\u001b[32mserving as the prime minister of India in his third term since May 26, 2014.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m241\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Government\\n\\nMain \u001b[0m\n", "\u001b[32marticle: Government of India\\n\\nSee also: Constitution of India\\n\\n\\n\\nRashtrapati Bhavan, the official residence \u001b[0m\n", "\u001b[32mof the President of India, was designed by British architects Edwin Lutyens and Herbert Baker for the Viceroy of \u001b[0m\n", "\u001b[32mIndia, and constructed between 1911 and 1931 during the British Raj.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m242\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia is a federation with a \u001b[0m\n", "\u001b[32mparliamentary system governed under the Constitution of India. Federalism in India defines the power distribution \u001b[0m\n", "\u001b[32mbetween the union and the states. India\\'s form of government, traditionally described as \"quasi-federal\" with a \u001b[0m\n", "\u001b[32mstrong centre and weak states,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m243\u001b[0m\u001b[32m]\u001b[0m\u001b[32m has grown increasingly federal since the late 1990s as a result of political, \u001b[0m\n", "\u001b[32meconomic, and social changes.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m244\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m245\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe Government of India comprises three branches: the Executive, \u001b[0m\n", "\u001b[32mLegislature, and Judiciary.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m246\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The President of India is the ceremonial head of state,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m247\u001b[0m\u001b[32m]\u001b[0m\u001b[32m who is elected \u001b[0m\n", "\u001b[32mindirectly for a five-year term by an electoral college comprising members of national and state The Prime \u001b[0m\n", "\u001b[32mMinister of India is the head of 
government and exercises most executive power.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m250\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Appointed by the \u001b[0m\n", "\u001b[32mpresident,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m251\u001b[0m\u001b[32m]\u001b[0m\u001b[32m the prime minister is supported by the party or political alliance with a majority of seats in the \u001b[0m\n", "\u001b[32mlower house of parliament.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m250\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The executive of the Indian government consists of the president, the \u001b[0m\n", "\u001b[32mvice-president, and the Union Council of Ministersâ\\x80\\x94with the cabinet being its executive \u001b[0m\n", "\u001b[32mcommitteeâ\\x80\\x94headed by the prime minister. Any minister holding a portfolio must be a member of one of the \u001b[0m\n", "\u001b[32mhouses of parliament.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m247\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the Indian parliamentary system, the executive is subordinate to the legislature; the\u001b[0m\n", "\u001b[32mprime minister and their council are directly responsible to the lower house of the parliament. Civil servants act \u001b[0m\n", "\u001b[32mas permanent executives and all decisions of the executive are implemented by them.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m252\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe legislature of \u001b[0m\n", "\u001b[32mIndia is the bicameral parliament. 
Operating under a Westminster-style parliamentary system, it comprises an upper \u001b[0m\n", "\u001b[32mhouse called the Rajya Sabha \u001b[0m\u001b[32m(\u001b[0m\u001b[32mCouncil of States\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and a lower house called the Lok Sabha \u001b[0m\u001b[32m(\u001b[0m\u001b[32mHouse of the People\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m253\u001b[0m\u001b[32m]\u001b[0m\n", "\u001b[32mThe Rajya Sabha is a permanent body of 245\\xa0members who serve staggered six-year terms with elections every 2 \u001b[0m\n", "\u001b[32myears.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m254\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Most are elected indirectly by the state and union territorial legislatures in numbers proportional to \u001b[0m\n", "\u001b[32mtheir state\\'s share of the national population.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m251\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Lok Sabha\\'s 543\\xa0members are elected directly by \u001b[0m\n", "\u001b[32mpopular vote among citizens aged at least 18;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m255\u001b[0m\u001b[32m]\u001b[0m\u001b[32m they represent single-member constituencies for \u001b[0m\n", "\u001b[32mfive-year\\xa0terms.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m256\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Several seats from each state are reserved for candidates from Scheduled Castes and \u001b[0m\n", "\u001b[32mScheduled Tribes in proportion to their population within that state.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m255\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia has a three-tier\\xa0unitary \u001b[0m\n", "\u001b[32mindependent judiciary\u001b[0m\u001b[32m[\u001b[0m\u001b[32m257\u001b[0m\u001b[32m]\u001b[0m\u001b[32m comprising the supreme court, headed by the Chief Justice of India, 25\\xa0high courts, \u001b[0m\n", "\u001b[32mand a large number of trial courts.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m257\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The supreme court 
has original jurisdiction over cases involving \u001b[0m\n", "\u001b[32mfundamental rights and over disputes between states and the centre and has appellate jurisdiction over the high \u001b[0m\n", "\u001b[32mcourts.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m258\u001b[0m\u001b[32m]\u001b[0m\u001b[32m It has the power to both strike down union or state laws which contravene the constitution\u001b[0m\u001b[32m[\u001b[0m\u001b[32m259\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and \u001b[0m\n", "\u001b[32minvalidate any government action it deems \\n\\n### Administrative divisions\\n\\nMain article: Administrative \u001b[0m\n", "\u001b[32mdivisions of India\\n\\nSee also: Political integration of India\\n\\n\\n\\nA clickable map of the 28 states and 8 union \u001b[0m\n", "\u001b[32mterritories of India\\n\\nIndia is a federal union comprising 28 states and 8 union territories.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m12\u001b[0m\u001b[32m]\u001b[0m\u001b[32m All states, as \u001b[0m\n", "\u001b[32mwell as the union territories of Jammu and Kashmir, Puducherry and the National Capital Territory of Delhi, have \u001b[0m\n", "\u001b[32melected legislatures and governments following the Westminster system. The remaining five union territories are \u001b[0m\n", "\u001b[32mdirectly ruled by the central government through appointed administrators. 
In 1956, under the States Reorganisation\u001b[0m\n", "\u001b[32mAct, states were reorganised on a linguistic basis.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m261\u001b[0m\u001b[32m]\u001b[0m\u001b[32m There are over a quarter of a million local government \u001b[0m\n", "\u001b[32mbodies at city, town, block, district and village levels.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m262\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n#### States\\n\\n#### Union territories\\n\\nForeign,\u001b[0m\n", "\u001b[32meconomic, and strategic relations\\n\\n\\nMain article: Foreign relations of India\\n\\nSee also: Indian Armed \u001b[0m\n", "\u001b[32mForces\\n\\n\\n\\nDuring the 1950s and 60s, India played a pivotal role in the Non-Aligned Movement.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m263\u001b[0m\u001b[32m]\u001b[0m\u001b[32m From left to \u001b[0m\n", "\u001b[32mright: Gamal Abdel Nasser of United Arab Republic \u001b[0m\u001b[32m(\u001b[0m\u001b[32mnow Egypt\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Josip Broz Tito of Yugoslavia and Jawaharlal Nehru \u001b[0m\n", "\u001b[32min Belgrade, September 1961.\\n\\nIndia became a republic in 1950, remaining a member of the Commonwealth of \u001b[0m\n", "\u001b[32mNations.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m264\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m265\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India strongly supported decolonisation in Africa and Asia in the 1950s; it played a leading \u001b[0m\n", "\u001b[32mrole in the Non-Aligned Movement.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m266\u001b[0m\u001b[32m]\u001b[0m\u001b[32m After cordial relations initially, India went to war with China in 1962. 
\u001b[0m\n", "\u001b[32mIndia was widely thought to have been humiliated.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m267\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Another military conflict followed in 1967 in which India \u001b[0m\n", "\u001b[32msuccessfully repelled a Chinese attack.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m268\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has had uneasy relations with its western neighbour, Pakistan. \u001b[0m\n", "\u001b[32mThe two countries went to war in 1947, 1965, 1971, and 1999. Three of these wars were fought over the disputed \u001b[0m\n", "\u001b[32mterritory of Kashmir. In contrast, the 1971 war followed India\\'s support for the independence of Bangladesh.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m269\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mAfter the 1965 war with Pakistan, India began to pursue close military and economic ties with the Soviet Union; by \u001b[0m\n", "\u001b[32mthe late 1960s, the Soviet Union was its largest arms supplier.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m270\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has played a key role in the South Asian\u001b[0m\n", "\u001b[32mAssociation for Regional Cooperation and the World Trade Organization. The nation has supplied 100,000 military and\u001b[0m\n", "\u001b[32mpolice personnel in 35 UN peacekeeping needed*\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nThe Indian Air Force contingent marching at the 221st \u001b[0m\n", "\u001b[32mBastille Day military parade in Paris, on 14 July 2009. 
The parade at which India was the foreign guest was led by \u001b[0m\n", "\u001b[32mIndia\\'s oldest regiment, the Maratha Light Infantry, founded in 1768.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m271\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nChina\\'s nuclear test of 1964 and \u001b[0m\n", "\u001b[32mthreats to intervene in support of Pakistan in the 1965 war caused India to produce nuclear weapons.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m272\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India \u001b[0m\n", "\u001b[32mconducted its first nuclear weapons test in 1974 and carried out additional underground testing in 1998. India has \u001b[0m\n", "\u001b[32msigned neither the Comprehensive Nuclear-Test-Ban Treaty nor the Nuclear Non-Proliferation Treaty, considering both\u001b[0m\n", "\u001b[32mto be flawed and discriminatory.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m273\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India maintains a \"no first use\" nuclear policy and is developing a nuclear \u001b[0m\n", "\u001b[32mtriad capability as a part of its \"Minimum Credible Deterrence\" doctrine.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m274\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m275\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSince the end of the Cold \u001b[0m\n", "\u001b[32mWar, India has increased its economic, strategic, and military cooperation with the United States and the European \u001b[0m\n", "\u001b[32mUnion.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m276\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2008, a civilian nuclear agreement was signed between India and the United States. 
Although India \u001b[0m\n", "\u001b[32mpossessed nuclear weapons at the time and was not a party to the Nuclear Non-Proliferation Treaty, it received \u001b[0m\n", "\u001b[32mwaivers from the International Atomic Energy Agency and the Nuclear Suppliers Group, ending earlier restrictions on\u001b[0m\n", "\u001b[32mIndia\\'s nuclear technology and commerce; India subsequently signed co-operation agreements involving civilian \u001b[0m\n", "\u001b[32mnuclear energy with Russia,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m277\u001b[0m\u001b[32m]\u001b[0m\u001b[32m France,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m278\u001b[0m\u001b[32m]\u001b[0m\u001b[32m the United Kingdom,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m279\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and Canada.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m280\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe President of India \u001b[0m\n", "\u001b[32mis the supreme commander of the nation\\'s armed forces; with 1.45\\xa0million active troops, they compose the \u001b[0m\n", "\u001b[32mworld\\'s second-largest military. 
It comprises the Indian Army, the Indian Navy, the Indian Air Force, and the \u001b[0m\n", "\u001b[32mIndian Coast Guard.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m281\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The official Indian defence budget for 2011 was US$36.03\\xa0billion, or 1.83% of GDP.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m282\u001b[0m\u001b[32m]\u001b[0m\n", "\u001b[32mDefence expenditure was pegged at US$70.12\\xa0billion for fiscal year 2022â\\x80\\x9323 and, increased 9.8% than \u001b[0m\n", "\u001b[32mprevious fiscal year.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m283\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m284\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India is the world\\'s second-largest arms importer; between 2016 and 2020, it \u001b[0m\n", "\u001b[32maccounted for 9.5% of the total global arms imports.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m285\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Much of the military expenditure was focused on defence \u001b[0m\n", "\u001b[32magainst Pakistan and countering growing Chinese influence in the Indian Ocean.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m286\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nEconomy\\n-------\\n\\nMain \u001b[0m\n", "\u001b[32marticle: Economy of India\\n\\n\\n\\nIn 2019, 43% of India\\'s total workforce was employed in \u001b[0m\n", "\u001b[32magriculture.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m287\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nIndia is the world\\'s largest producer of milk, with the largest population of cattle. 
In \u001b[0m\n", "\u001b[32m2018, nearly 80% of India\\'s milk was sourced from small farms with herd size between one and two, the milk \u001b[0m\n", "\u001b[32mharvested by hand milking.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m289\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\n55% of India\\'s female workforce was employed in agriculture in \u001b[0m\n", "\u001b[32m2019.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m288\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAccording to the International Monetary Fund \u001b[0m\u001b[32m(\u001b[0m\u001b[32mIMF\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, the Indian economy in 2024 was nominally worth \u001b[0m\n", "\u001b[32m$3.94\\xa0trillion; it was the fifth-largest economy by market exchange rates and is, at around $15.0\\xa0trillion, \u001b[0m\n", "\u001b[32mthe third-largest by purchasing power parity \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPPP\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m With its average annual GDP growth rate of 5.8% over the \u001b[0m\n", "\u001b[32mpast two decades, and reaching 6.1% during 2011â\\x80\\x932012,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m290\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India is one of the world\\'s fastest-growing \u001b[0m\n", "\u001b[32meconomies.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m291\u001b[0m\u001b[32m]\u001b[0m\u001b[32m However, due to its low GDP per capitaâ\\x80\\x94which ranks 136th in the world in nominal per capita\u001b[0m\n", "\u001b[32mincome and 125th in per capita income adjusted for purchasing power parity \u001b[0m\u001b[32m(\u001b[0m\u001b[32mPPP\u001b[0m\u001b[32m)\u001b[0m\u001b[32mâ\\x80\\x94the vast majority of \u001b[0m\n", "\u001b[32mIndians fall into the low-income group.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m292\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m293\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Until 1991, all Indian governments followed protectionist \u001b[0m\n", 
"\u001b[32mpolicies that were influenced by socialist economics. Widespread state intervention and regulation largely walled \u001b[0m\n", "\u001b[32mthe economy off from the outside world. An acute balance of payments crisis in 1991 forced the nation to liberalise\u001b[0m\n", "\u001b[32mits economy;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m294\u001b[0m\u001b[32m]\u001b[0m\u001b[32m since then, it has moved increasingly towards a free-market system\u001b[0m\u001b[32m[\u001b[0m\u001b[32m295\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m296\u001b[0m\u001b[32m]\u001b[0m\u001b[32m by emphasising both \u001b[0m\n", "\u001b[32mforeign trade and direct investment inflows.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m297\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has been a member of World Trade Organization since 1 \u001b[0m\n", "\u001b[32mJanuary 1995.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m298\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe 522-million-worker Indian labour force is the world\\'s second largest, as of \u001b[0m\n", "\u001b[32m2017\u001b[0m\u001b[32m[\u001b[0m\u001b[32mupdate\u001b[0m\u001b[32m]\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m281\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The service sector makes up 55.6% of GDP, the industrial sector 26.3% and the agricultural \u001b[0m\n", "\u001b[32msector 18.1%. 
India\\'s foreign exchange remittances of US$100 billion in 2022,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m299\u001b[0m\u001b[32m]\u001b[0m\u001b[32m highest in the world, were \u001b[0m\n", "\u001b[32mcontributed to its economy by 32 million Indians working in foreign countries.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m300\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2006, the share of external \u001b[0m\n", "\u001b[32mtrade in India\\'s GDP stood at 24%, up from 6% in 1985.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m295\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2008, India\\'s share of world trade was 1.7%;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m301\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mIn 2021, India was the world\\'s ninth-largest importer and the sixteenth-largest exporter.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m302\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Between 2001 and \u001b[0m\n", "\u001b[32m2011, the contribution of petrochemical and engineering goods to total exports grew from 14% to 42%.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m303\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India was\u001b[0m\n", "\u001b[32mthe world\\'s second-largest textile exporter after China in the 2013 calendar year.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m304\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAveraging an economic \u001b[0m\n", "\u001b[32mgrowth rate of 7.5% for several years before 2007,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m295\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has more than doubled its hourly wage rates during \u001b[0m\n", "\u001b[32mthe first decade of the 21st century.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m305\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Some 431 million Indians have left poverty since 1985; India\\'s middle \u001b[0m\n", "\u001b[32mclasses are projected to number around 580\\xa0million by 2030.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m306\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2023, India\\'s consumer market was the \u001b[0m\n", 
"\u001b[32mworld\\'s fifth largest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m307\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India\\'s nominal GDP per capita increased steadily from US$308 in 1991, when economic \u001b[0m\n", "\u001b[32mliberalisation began, to US$1,380 in 2010, to an estimated US$2,731 in 2024. It is expected to grow to US$3,264 by \u001b[0m\n", "\u001b[32m2026.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m16\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Industries\\n\\n\\n\\nA tea garden in Sikkim. India, the world\\'s second-largest producer of tea, is a\u001b[0m\n", "\u001b[32mnation of one billion tea drinkers, who consume 70% of India\\'s tea output.\\n\\nThe Indian automotive industry, the \u001b[0m\n", "\u001b[32mworld\\'s second-fastest growing, increased domestic sales by 26% during 2009â\\x80\\x932010,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m308\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and exports by 36% \u001b[0m\n", "\u001b[32mduring 2008â\\x80\\x932009.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m309\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2022, India became the world\\'s third-largest vehicle market after China and the \u001b[0m\n", "\u001b[32mUnited States, surpassing Japan.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m310\u001b[0m\u001b[32m]\u001b[0m\u001b[32m At the end of 2011, the Indian IT industry employed 2.8\\xa0million \u001b[0m\n", "\u001b[32mprofessionals, generated revenues close to US$100\\xa0billion equalling 7.5% of Indian GDP, and contributed 26% of \u001b[0m\n", "\u001b[32mIndia\\'s merchandise exports.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m311\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe pharmaceutical industry in India includes 3,000 pharmaceutical companies \u001b[0m\n", "\u001b[32mand 10,500 manufacturing units; India is the world\\'s third-largest pharmaceutical producer, largest producer of \u001b[0m\n", "\u001b[32mgeneric medicines and supply up to 50â\\x80\\x9360% of global vaccines demand, these all contribute up to \u001b[0m\n", 
"\u001b[32mUS$24.44\\xa0billions in exports and India\\'s local pharmaceutical market is estimated up to \u001b[0m\n", "\u001b[32mUS$42\\xa0billion.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m312\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m313\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India is among the top 12 biotech destinations in the world.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m314\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m315\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Indian \u001b[0m\n", "\u001b[32mbiotech industry grew by 15.1% in 2012â\\x80\\x932013, increasing its revenues from â\\x82¹204.4\\xa0billion \u001b[0m\u001b[32m(\u001b[0m\u001b[32mIndian \u001b[0m\n", "\u001b[32mrupees\u001b[0m\u001b[32m)\u001b[0m\u001b[32m to â\\x82¹235.24\\xa0billion \u001b[0m\u001b[32m(\u001b[0m\u001b[32mUS$3.94\\xa0billion at June 2013 exchange rates\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m316\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Energy\\n\\nMain \u001b[0m\n", "\u001b[32marticle: Energy in India\\n\\nSee also: Energy policy of India\\n\\nIndia\\'s capacity to generate electrical power is \u001b[0m\n", "\u001b[32m300 gigawatts, of which 42 gigawatts is renewable.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m317\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The country\\'s usage of coal is a major cause of India\\'s \u001b[0m\n", "\u001b[32mgreenhouse gas emissions, but its renewable energy is competing \\xa0source\\xa0needed*\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India emits about 7% of \u001b[0m\n", "\u001b[32mglobal greenhouse gas emissions. 
This equates to about 2.5 tons of carbon dioxide per person per year, which is \u001b[0m\n", "\u001b[32mhalf the world average.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m319\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m320\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Increasing access to electricity and clean cooking with liquefied petroleum gas \u001b[0m\n", "\u001b[32mhave been priorities for energy in India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m321\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Socio-economic challenges\\n\\nMain articles: Poverty in India,\u001b[0m\n", "\u001b[32mIncome inequality in India, and Debt bondage in India\\n\\n\\n\\nHealth workers about to begin another day of \u001b[0m\n", "\u001b[32mimmunisation against infectious diseases in 2006. Eight years later, and three years after India\\'s last case of \u001b[0m\n", "\u001b[32mpolio, the World Health Organization declared India to be polio-free.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m322\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nDespite economic growth during recent\u001b[0m\n", "\u001b[32mdecades, India continues to face socio-economic challenges. 
In 2006, India contained the largest number of people \u001b[0m\n", "\u001b[32mliving below the World Bank\\'s international poverty line of US$1.25 per day.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m323\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The proportion decreased from \u001b[0m\n", "\u001b[32m60% in 1981 to 42% in 2005.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m324\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Under the World Bank\\'s later revised poverty line, it was 21%-22.5 in \u001b[0m\n", "\u001b[32m2011.\u001b[0m\u001b[32m[\u001b[0m\u001b[32mp\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m326\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m327\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2019, the estimates had gone down to 10.2%.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m327\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In 2014, 30.7% of India\\'s children under \u001b[0m\n", "\u001b[32mthe age of five were underweight.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m328\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to a Food and Agriculture Organization report in 2015, 15% of the \u001b[0m\n", "\u001b[32mpopulation was The Midday Meal Scheme attempts to lower these rates.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m331\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nA 2018 Walk Free Foundation report \u001b[0m\n", "\u001b[32mestimated that nearly 8\\xa0million people in India were living in different forms of modern slavery, such as bonded\u001b[0m\n", "\u001b[32mlabour, child labour, human trafficking, and forced begging.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m332\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to the 2011 census, there were \u001b[0m\n", "\u001b[32m10.1\\xa0million child labourers in the country, a decline of 2.6\\xa0million from 12.6\\xa0million in \u001b[0m\n", "\u001b[32m2001.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m333\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSince 1991, economic inequality between India\\'s states has consistently grown: the 
per-capita net \u001b[0m\n", "\u001b[32mstate domestic product of the richest states in 2007 was 3.2 times that of the poorest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m334\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Corruption in India is\u001b[0m\n", "\u001b[32mperceived to have decreased. According to the Corruption Perceptions Index, India ranked 78th out of 180 countries \u001b[0m\n", "\u001b[32min 2018, an improvement from 85th in 2014.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m335\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m336\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAs of 2025, poverty in India declined sharply. According to\u001b[0m\n", "\u001b[32mthe World Bank report, extreme poverty fall from 16.2% in 2011-12 to 2.3% in 2022-23. In rural areas it fell from \u001b[0m\n", "\u001b[32m18.4% to 2.8%, and in urban areas, from 10.7% to 1.1%. 378 million peopole were lifted from poverty and 171 million\u001b[0m\n", "\u001b[32mfrom extreme poverty. The main reason, according to the World Bank, is not economic growth but different government\u001b[0m\n", "\u001b[32mwelfare programs, like transferring food and money to the people with low income, improving their access to \u001b[0m\n", "\u001b[32mservices.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m172\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nDemographics, languages, and religion\\n\\n\\nMain articles: Demographics of India, Languages of \u001b[0m\n", "\u001b[32mIndia, and Religion in India\\n\\nSee also: South Asian ethnic groups\\n\\n\\n\\nA Sikh pilgrim at the Harmandir Sahib, \u001b[0m\n", "\u001b[32mor Golden Temple, in Amritsar, Punjab\\n\\n\\n\\nThe interior of San Thome Basilica, Chennai, Tamil Nadu. 
Christianity \u001b[0m\n", "\u001b[32mis believed to have been introduced to India by the late 2nd century by Syriac-speaking Christians.\\n\\nWith an \u001b[0m\n", "\u001b[32mestimated 1,428,627,663 residents in 2023, India is the world\\'s most populous country.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m13\u001b[0m\u001b[32m]\u001b[0m\u001b[32m 1,210,193,422 residents\u001b[0m\n", "\u001b[32mwere reported in the 2011 provisional census report.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m337\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Its population grew by 17.64% from 2001 to 2011,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m338\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mcompared to 21.54% growth in the previous decade \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1991â\\x80\\x932001\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m338\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The human sex ratio, according to the \u001b[0m\n", "\u001b[32m2011 census, is 940 females per 1,000 males.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m337\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The median age was 28.7 in 2020.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m281\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The first post-colonial \u001b[0m\n", "\u001b[32mcensus, conducted in 1951, counted 361\\xa0million people.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m339\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Medical advances made in the last 50 years as well \u001b[0m\n", "\u001b[32mas increased agricultural productivity brought about by the \"Green Revolution\" have caused India\\'s population to \u001b[0m\n", "\u001b[32mgrow rapidly.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m340\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe life expectancy in India is at 70 yearsâ\\x80\\x9471.5 years for women, 68.7 years for \u001b[0m\n", "\u001b[32mmen.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m281\u001b[0m\u001b[32m]\u001b[0m\u001b[32m There are around 93 physicians per 100,000 
people.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m341\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Migration from rural to urban areas has been an \u001b[0m\n", "\u001b[32mimportant dynamic in India\\'s recent history. The number of people living in urban areas grew by 31.2% between 1991\u001b[0m\n", "\u001b[32mand 2001.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m342\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Yet, in 2001, over 70% still lived in rural areas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m343\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m344\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The level of urbanisation increased \u001b[0m\n", "\u001b[32mfurther from 27.81% in the 2001 Census to 31.16% in the 2011 Census. The slowing down of the overall population \u001b[0m\n", "\u001b[32mgrowth rate was due to the sharp decline in the growth rate in rural areas since 1991.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m345\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to the 2011 \u001b[0m\n", "\u001b[32mcensus, there are 53 million-plus urban agglomerations in India; among them Mumbai, Delhi, Kolkata, Chennai, \u001b[0m\n", "\u001b[32mBengaluru, Hyderabad and Ahmedabad, in decreasing order by population.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m346\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The literacy rate in 2011 was 74.04%: \u001b[0m\n", "\u001b[32m65.46% among females and 82.14% among males.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m347\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The rural-urban literacy gap, which was 21.2 percentage points in\u001b[0m\n", "\u001b[32m2001, dropped to 16.1 percentage points in 2011. 
The improvement in the rural literacy rate is twice that of urban \u001b[0m\n", "\u001b[32mareas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m345\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Kerala is the most literate state with 93.91% literacy; while Bihar the least with \u001b[0m\n", "\u001b[32m63.82%.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m347\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAmong speakers of the Indian languages, 74% speak Indo-Aryan languages, the easternmost branch of \u001b[0m\n", "\u001b[32mthe Indo-European languages; 24% speak Dravidian languages, indigenous to South Asia and spoken widely before the \u001b[0m\n", "\u001b[32mspread of Indo-Aryan languages and 2% speak Austroasiatic languages or the Sino-Tibetan languages. India has no \u001b[0m\n", "\u001b[32mnational language.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m348\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Hindi, with the largest number of speakers, is the official language of the English is \u001b[0m\n", "\u001b[32mused extensively in business and administration and has the status of a \"subsidiary official language\";\u001b[0m\u001b[32m[\u001b[0m\u001b[32m6\u001b[0m\u001b[32m]\u001b[0m\u001b[32m it is \u001b[0m\n", "\u001b[32mimportant in education, especially as a medium of higher education. 
Each state and union territory has one or more \u001b[0m\n", "\u001b[32mofficial languages, and the constitution recognises in particular 22 \"scheduled languages\".\\n\\nThe 2011 census \u001b[0m\n", "\u001b[32mreported the religion in India with the largest number of followers was Hinduism \u001b[0m\u001b[32m(\u001b[0m\u001b[32m79.80% of the population\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, \u001b[0m\n", "\u001b[32mfollowed by Islam \u001b[0m\u001b[32m(\u001b[0m\u001b[32m14.23%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m; the remaining were Christianity \u001b[0m\u001b[32m(\u001b[0m\u001b[32m2.30%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Sikhism \u001b[0m\u001b[32m(\u001b[0m\u001b[32m1.72%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Buddhism \u001b[0m\u001b[32m(\u001b[0m\u001b[32m0.70%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Jainism \u001b[0m\n", "\u001b[32m(\u001b[0m\u001b[32m0.36%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and others\u001b[0m\u001b[32m[\u001b[0m\u001b[32mq\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\u001b[32m(\u001b[0m\u001b[32m0.9%\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m11\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has the third-largest Muslim populationâ\\x80\\x94the largest for a \u001b[0m\n", "\u001b[32mnon-Muslim majority country.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m351\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m352\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nCulture\\n-------\\n\\nMain article: Culture of India\\n\\n### Visual \u001b[0m\n", "\u001b[32mart\\n\\nMain article: Indian art\\n\\nIndia has a very ancient tradition of art, which has exchanged many influences \u001b[0m\n", "\u001b[32mwith the rest of Eurasia, especially in the first millennium, when Buddhist art spread with Indian religions to \u001b[0m\n", "\u001b[32mCentral, East and Southeast Asia, the last also greatly influenced by Hindu art.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m353\u001b[0m\u001b[32m]\u001b[0m\u001b[32m 
Thousands of seals from the \u001b[0m\n", "\u001b[32mIndus Valley Civilization of the third millennium BCE have been found, usually carved with animals, but also some \u001b[0m\n", "\u001b[32mwith human figures. The Pashupati seal, excavated in Mohenjo-daro, Pakistan, in 1928â\\x80\\x9329, is the best \u001b[0m\n", "\u001b[32mknown.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m354\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m355\u001b[0m\u001b[32m]\u001b[0m\u001b[32m After this there is a long period with virtually nothing surviving.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m355\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m356\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Almost all surviving\u001b[0m\n", "\u001b[32mancient Indian art thereafter is in various forms of religious sculpture in durable materials, or coins. There was \u001b[0m\n", "\u001b[32mprobably originally far more in wood, which is lost. In north India Mauryan art is the first imperial In the first\u001b[0m\n", "\u001b[32mmillennium CE, Buddhist art spread with Indian religions to Central, East and Southeast Asia, the last also greatly\u001b[0m\n", "\u001b[32minfluenced by Hindu art.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m360\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Over the following centuries a distinctly Indian style of sculpting the human figure \u001b[0m\n", "\u001b[32mdeveloped, with less interest in articulating precise anatomy than ancient Greek sculpture but showing smoothly \u001b[0m\n", "\u001b[32mflowing forms expressing *prana* \u001b[0m\u001b[32m(\u001b[0m\u001b[32m\"breath\" or This is often complicated by the need to give figures multiple arms \u001b[0m\n", "\u001b[32mor heads, or represent different genders on the left and right of figures, as with the Ardhanarishvara form of \u001b[0m\n", "\u001b[32mShiva and Parvati.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m363\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m364\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nMost of the 
earliest large sculpture is Buddhist, either excavated from Buddhist \u001b[0m\n", "\u001b[32mstupas such as Sanchi, Sarnath and Amaravati,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m365\u001b[0m\u001b[32m]\u001b[0m\u001b[32m or is rock cut reliefs at sites such as Ajanta, Karla and \u001b[0m\n", "\u001b[32mEllora. Hindu and Jain sites appear rather later.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m366\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m367\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In spite of this complex mixture of religious \u001b[0m\n", "\u001b[32mtraditions, generally, the prevailing artistic style at any time and place has been shared by the major religious \u001b[0m\n", "\u001b[32mgroups, and sculptors probably usually served all communities.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m368\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Gupta art, at its peak \u001b[0m\n", "\u001b[32mc.â\\x80\\x89300\\xa0CE\\xa0â\\x80\\x93 c.â\\x80\\x89500\\xa0CE, is often regarded as a classical period whose influence \u001b[0m\n", "\u001b[32mlingered for many centuries after; it saw a new dominance of Hindu sculpture, as at the Elephanta Caves.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m369\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m370\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mAcross the north, this became rather stiff and formulaic after c.â\\x80\\x89800\\xa0CE, though rich with finely carved\u001b[0m\n", "\u001b[32mdetail in the surrounds of statues.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m371\u001b[0m\u001b[32m]\u001b[0m\u001b[32m But in the South, under the Pallava and Chola dynasties, sculpture in both\u001b[0m\n", "\u001b[32mstone and bronze had a sustained period of great achievement; the large bronzes with Shiva as Nataraja have become \u001b[0m\n", "\u001b[32man iconic symbol of India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m372\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m373\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nAncient 
paintings have only survived at a few sites, of which the crowded \u001b[0m\n", "\u001b[32mscenes of court life in the Ajanta Caves are some of the most important.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m374\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m375\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Painted manuscripts of religious\u001b[0m\n", "\u001b[32mtexts survive from Eastern India from 10th century onwards, most of the earliest being Buddhist and later Jain. \u001b[0m\n", "\u001b[32mThese significantly influenced later artistic styles.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m376\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The Persian-derived Deccan painting, starting just \u001b[0m\n", "\u001b[32mbefore the Mughal miniature, between them give the first large body of secular painting, with an emphasis on \u001b[0m\n", "\u001b[32mportraits, and the recording of princely pleasures and wars.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m377\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m378\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The style spread to Hindu courts, especially\u001b[0m\n", "\u001b[32mamong the Rajputs, and developed a variety of styles, with the smaller courts often the most innovative, with \u001b[0m\n", "\u001b[32mfigures such as Nihâl Chand and Nainsukh.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m379\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m380\u001b[0m\u001b[32m]\u001b[0m\u001b[32m As a market developed among European residents, it was \u001b[0m\n", "\u001b[32msupplied by Company painting by Indian artists with considerable Western influence.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m381\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m382\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the 19th century, \u001b[0m\n", "\u001b[32mcheap Kalighat paintings of gods and everyday life, done on paper, were urban folk art from Calcutta, which later \u001b[0m\n", "\u001b[32msaw the Bengal School of Art, reflecting the art colleges founded by 
the British, the first movement in modern \u001b[0m\n", "\u001b[32mIndian Society\\n\\nMain articles: Caste system in India and Gender inequality in India\\n\\n\\n\\nMuslims offer *namaz*\u001b[0m\n", "\u001b[32mat a mosque in Srinagar, Jammu and Kashmir.\\n\\nThe Indian caste system embodies much of the social stratification \u001b[0m\n", "\u001b[32mand many of the social restrictions found on the Indian subcontinent. Social classes are defined by thousands of \u001b[0m\n", "\u001b[32mendogamous hereditary groups, often termed as *jÄ\\x81tis*, or \"castes\".\u001b[0m\u001b[32m[\u001b[0m\u001b[32m385\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India abolished untouchability in 1950\u001b[0m\n", "\u001b[32mwith the adoption of the constitution and has since enacted other anti-discriminatory laws and social welfare \u001b[0m\n", "\u001b[32minitiatives.\u001b[0m\u001b[32m[\u001b[0m\u001b[32mr\u001b[0m\u001b[32m]\u001b[0m\u001b[32m However, the system continues to be dominant in India, and caste-based inequality, discrimination, \u001b[0m\n", "\u001b[32msegregation, and violence persist.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m387\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m388\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nMulti-generational patrilineal joint families have been the norm in\u001b[0m\n", "\u001b[32mIndia, though nuclear families are becoming common in urban areas.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m389\u001b[0m\u001b[32m]\u001b[0m\u001b[32m An overwhelming majority of Indians have \u001b[0m\n", "\u001b[32mtheir marriages arranged by their parents or other family elders.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m390\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Marriage is thought to be for life,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m390\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and\u001b[0m\n", "\u001b[32mthe divorce rate is extremely low,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m391\u001b[0m\u001b[32m]\u001b[0m\u001b[32m with less than one in a thousand 
marriages ending in divorce.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m392\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Child \u001b[0m\n", "\u001b[32mmarriages are common, especially in rural areas; many women wed before reaching 18, which is their legal \u001b[0m\n", "\u001b[32mmarriageable age.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m393\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Female infanticide in India, and lately female foeticide, have created skewed gender ratios;\u001b[0m\n", "\u001b[32mthe number of missing women in the country quadrupled from 15 million to 63\\xa0million in the 50 years ending in \u001b[0m\n", "\u001b[32m2014, faster than the population growth during the same period.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m394\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to an Indian government study, an \u001b[0m\n", "\u001b[32madditional 21 million girls are unwanted and do not receive adequate care.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m395\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Despite a government ban on \u001b[0m\n", "\u001b[32msex-selective foeticide, the practice remains commonplace in India, the result of a preference for boys in a \u001b[0m\n", "\u001b[32mpatriarchal society.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m396\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The payment of dowry, although illegal, remains widespread across class lines.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m397\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mDeaths resulting from dowry, mostly from bride burning, are on the rise, despite stringent anti-dowry \u001b[0m\n", "\u001b[32mlaws.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m398\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Education\\n\\nMain article: Education in India\\n\\nSee also: Literacy in India and History of \u001b[0m\n", "\u001b[32meducation in the Indian subcontinent\\n\\n\\n\\nChildren awaiting school lunch in Rayka \u001b[0m\u001b[32m(\u001b[0m\u001b[32malso Raika\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, a 
village in \u001b[0m\n", "\u001b[32mrural Gujarat. The salutation *Jai Bhim* written on the blackboard honours the jurist, social reformer, and Dalit \u001b[0m\n", "\u001b[32mleader B. R. Ambedkar.\\n\\nIn the 2011 census, about 73% of the population was literate, with 81% for men and 65% \u001b[0m\n", "\u001b[32mfor women. This compares to 1981 when the respective rates were 41%, 53% and 29%. In 1951 the rates were 18%, 27% \u001b[0m\n", "\u001b[32mand 9%. In 1921 the rates 7%, 12% and 2%. In 1891 they were 5%, 9% and 1%,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m399\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m400\u001b[0m\u001b[32m]\u001b[0m\u001b[32m According to Latika Chaudhary,\u001b[0m\n", "\u001b[32min 1911 there were under three primary schools for every ten villages. Statistically, more caste and religious \u001b[0m\n", "\u001b[32mdiversity reduced private spending. Primary schools taught literacy, so local diversity limited its \u001b[0m\n", "\u001b[32mgrowth.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m401\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe education system of India is the world\\'s second-largest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m402\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has over 900 universities,\u001b[0m\n", "\u001b[32m40,000 colleges\u001b[0m\u001b[32m[\u001b[0m\u001b[32m403\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and 1.5\\xa0million schools.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m404\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In India\\'s higher education system, a significant number of \u001b[0m\n", "\u001b[32mseats are reserved under affirmative action policies for the historically disadvantaged. 
In recent decades India\\'s\u001b[0m\n", "\u001b[32mimproved education system is often cited as one of the main contributors to its economic \\n\\n### Clothing\\n\\nMain \u001b[0m\n", "\u001b[32marticle: Clothing in India\\n\\n\\n\\nWomen in sari at an adult literacy class in Tamil Nadu\\n\\n\\n\\nWomen \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfrom left to\u001b[0m\n", "\u001b[32mright\u001b[0m\u001b[32m)\u001b[0m\u001b[32m in churidars and kameez \u001b[0m\u001b[32m(\u001b[0m\u001b[32mwith back to the camera\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, jeans and sweater, and pink shalwar kameez\\n\\nFrom \u001b[0m\n", "\u001b[32mancient times until the advent of the modern, the most widely worn traditional dress in India was draped.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m407\u001b[0m\u001b[32m]\u001b[0m\u001b[32m For \u001b[0m\n", "\u001b[32mwomen it took the form of a sari, a single piece of cloth many yards long.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m407\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The sari was traditionally wrapped \u001b[0m\n", "\u001b[32maround the lower body and the shoulder.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m407\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In its modern form, it is combined with an underskirt, or Indian \u001b[0m\n", "\u001b[32mpetticoat, and tucked in the waist band for more secure fastening. 
It is also commonly worn with an Indian blouse, \u001b[0m\n", "\u001b[32mor choli, which serves as the primary upper-body garment, the sari\\'s endâ\\x80\\x94passing over the \u001b[0m\n", "\u001b[32mshoulderâ\\x80\\x94covering the midriff and obscuring the upper body\\'s contours.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m407\u001b[0m\u001b[32m]\u001b[0m\u001b[32m For men, a similar but shorter\u001b[0m\n", "\u001b[32mlength of cloth, the dhoti, has served as a lower-body garment.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m408\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe use of stitched clothes became \u001b[0m\n", "\u001b[32mwidespread after Muslim rule was established at first by the Delhi sultanate \u001b[0m\u001b[32m(\u001b[0m\u001b[32mc.â\\x80\\x891300 CE\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and then \u001b[0m\n", "\u001b[32mcontinued by the Mughal Empire \u001b[0m\u001b[32m(\u001b[0m\u001b[32mc.â\\x80\\x891525 CE\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m409\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Among the garments introduced during this time and still \u001b[0m\n", "\u001b[32mcommonly worn are: the shalwars and pyjamas, both styles of trousers, and the tunics kurta and kameez.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m409\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In \u001b[0m\n", "\u001b[32msouthern India, the traditional draped garments were to see much longer continuous use.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m409\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSalwars are \u001b[0m\n", "\u001b[32matypically wide at the waist but narrow to a cuffed bottom. They are held up by a drawstring, which causes them to \u001b[0m\n", "\u001b[32mbecome pleated around the waist.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m410\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The pants can be wide and baggy, or they can be cut quite narrow, on the \u001b[0m\n", "\u001b[32mbias, in which case they are called churidars. 
When they are ordinarily wide at the waist and their bottoms are \u001b[0m\n", "\u001b[32mhemmed but not cuffed, they are called pyjamas. The kameez is a long shirt or tunic,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m411\u001b[0m\u001b[32m]\u001b[0m\u001b[32m its side seams left open \u001b[0m\n", "\u001b[32mbelow the waistline.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m412\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The kurta is traditionally collarless and made of cotton or silk; it is worn plain or \u001b[0m\n", "\u001b[32mwith embroidered decoration, such as chikan; and typically falls to either just above or just below the wearer\\'s \u001b[0m\n", "\u001b[32mknees.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m413\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIn the last 50 years, fashions have changed a great deal in India. Increasingly, in urban northern \u001b[0m\n", "\u001b[32mIndia, the sari is no longer the apparel of everyday wear, though they remain popular on formal occasions.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m414\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The\u001b[0m\n", "\u001b[32mtraditional shalwar kameez is rarely worn by younger urban women, who favour churidars or jeans.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m414\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In office \u001b[0m\n", "\u001b[32msettings, ubiquitous air conditioning allows men to wear sports jackets year-round.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m414\u001b[0m\u001b[32m]\u001b[0m\u001b[32m For weddings and formal \u001b[0m\n", "\u001b[32moccasions, men in the middle and upper classes often wear bandgala, or short Nehru jackets, with pants, with the \u001b[0m\n", "\u001b[32mgroom and his groomsmen sporting sherwanis and churidars.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m414\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The dhoti, once the universal garment of Hindu \u001b[0m\n", "\u001b[32mmales, the wearing of which in the homespun and handwoven khadi allowed Gandhi to bring Indian nationalism to the \u001b[0m\n", 
"\u001b[32mmillions,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m415\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\nis seldom seen in the cities.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m414\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### Cuisine\\n\\nMain article: Indian cuisine\\n\\n\\n\\nSouth \u001b[0m\n", "\u001b[32mIndian vegetarian thali, or platter\\n\\nThe foundation of a typical Indian meal is a cereal cooked plainly and \u001b[0m\n", "\u001b[32mcomplemented with flavourful savoury dishes.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m416\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The cooked cereal could be steamed rice; chapati, a thin \u001b[0m\n", "\u001b[32munleavened bread;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m417\u001b[0m\u001b[32m]\u001b[0m\u001b[32m the idli, a steamed breakfast cake; or dosa, a griddled pancake.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m418\u001b[0m\u001b[32m]\u001b[0m\u001b[32m The savoury dishes \u001b[0m\n", "\u001b[32mmight include lentils, pulses and vegetables commonly spiced with ginger and garlic, but also with a combination of\u001b[0m\n", "\u001b[32mspices that may include coriander, cumin, turmeric, cinnamon, cardamon and others.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m416\u001b[0m\u001b[32m]\u001b[0m\u001b[32m They might also include \u001b[0m\n", "\u001b[32mpoultry, fish, or meat dishes. In some instances, the ingredients may be mixed during the cooking \u001b[0m\n", "\u001b[32mprocess.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m419\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nA platter, or thali, used for eating usually has a central place reserved for the cooked cereal, \u001b[0m\n", "\u001b[32mand peripheral ones for the flavourful accompaniments. The cereal and its accompaniments are eaten simultaneously \u001b[0m\n", "\u001b[32mrather than a piecemeal manner. 
This is accomplished by mixingâ\\x80\\x94for example of rice and lentilsâ\\x80\\x94or \u001b[0m\n", "\u001b[32mfolding, wrapping, scooping or dippingâ\\x80\\x94such as chapati and cooked vegetables.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m416\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n\\n\\nA tandoor chef in\u001b[0m\n", "\u001b[32mthe Turkman Gate, Old Delhi, makes Khameeri roti \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma Muslim-influenced style of leavened bread\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m420\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nIndia has \u001b[0m\n", "\u001b[32mdistinctive vegetarian cuisines, each a feature of the geographical and cultural histories of its adherents.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m421\u001b[0m\u001b[32m]\u001b[0m\u001b[32m \u001b[0m\n", "\u001b[32mAbout 20% to 39% of India\\'s population consists of Much of this stems from caste hierarchy, as upper castes, such\u001b[0m\n", "\u001b[32mas the Brahmins, consider vegetarian food to be \"pure\".\u001b[0m\u001b[32m[\u001b[0m\u001b[32m424\u001b[0m\u001b[32m]\u001b[0m\u001b[32m[\u001b[0m\u001b[32m425\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Although meat is eaten widely in India, the \u001b[0m\n", "\u001b[32mproportional consumption of meat in the overall diet is low.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m426\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Unlike China, which has increased its per capita \u001b[0m\n", "\u001b[32mmeat consumption substantially in its years of increased economic growth, in India the strong dietary traditions \u001b[0m\n", "\u001b[32mhave contributed to dairy, rather than meat, becoming the preferred form of animal protein consumption.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m427\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nThe\u001b[0m\n", "\u001b[32mmost significant import of cooking techniques into India during the last millennium occurred during the Mughal \u001b[0m\n", "\u001b[32mEmpire. 
Dishes such as the pilaf,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m428\u001b[0m\u001b[32m]\u001b[0m\u001b[32m developed in the Abbasid caliphate,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m429\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and cooking techniques such as the \u001b[0m\n", "\u001b[32mmarinating of meat in yogurt, spread into northern India from regions to its northwest.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m430\u001b[0m\u001b[32m]\u001b[0m\u001b[32m To the simple yogurt \u001b[0m\n", "\u001b[32mmarinade of Persia, onions, garlic, almonds, and spices began to be added in India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m430\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Rice was partially cooked \u001b[0m\n", "\u001b[32mand layered alternately with the sauteed meat, the pot sealed tightly, and slow cooked according to another Persian\u001b[0m\n", "\u001b[32mcooking technique, to produce what has today become biryani,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m430\u001b[0m\u001b[32m]\u001b[0m\u001b[32m a feature of festive dining in many parts of \u001b[0m\n", "\u001b[32mIndia.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m431\u001b[0m\u001b[32m]\u001b[0m\u001b[32m In the food served in Indian restaurants worldwide, the diversity of Indian food has been partially \u001b[0m\n", "\u001b[32mconcealed by the dominance of Punjabi cuisine. 
The popularity of tandoori chickenâ\\x80\\x94cooked in the tandoor \u001b[0m\n", "\u001b[32moven, which had traditionally been used for baking bread in the rural Punjab and the Delhi region, especially among\u001b[0m\n", "\u001b[32mMuslims, but which is originally from Central Asiaâ\\x80\\x94dates to the 1950s, and was caused in large part by an \u001b[0m\n", "\u001b[32mentrepreneurial response among people from the Punjab who had been displaced by the 1947 partition.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m421\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\n### \u001b[0m\n", "\u001b[32mSports and recreation\\n\\nMain article: Sport in India\\n\\nSee also: Indian physical culture\\n\\n\\n\\nGirls play \u001b[0m\n", "\u001b[32mhopscotch in Jaora, Madhya Pradesh. Hopscotch has been commonly played by girls in rural India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m432\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSeveral \u001b[0m\n", "\u001b[32mtraditional sportsâ\\x80\\x94such as *kabaddi*, *kho kho*, pehlwani, *gilli-danda,* hopscotch and martial arts such \u001b[0m\n", "\u001b[32mas *Kalarippayattu* and *marma adiâ\\x80\\x94*remain popular. 
Chess is commonly held to have originated in India as \u001b[0m\n", "\u001b[32m*chaturaá¹\\x85ga*;\u001b[0m\u001b[32m[\u001b[0m\u001b[32m433\u001b[0m\u001b[32m]\u001b[0m\u001b[32m in recent years, there has been a rise in the number of Indian grandmasters,\u001b[0m\u001b[32m[\u001b[0m\u001b[32m434\u001b[0m\u001b[32m]\u001b[0m\u001b[32m and world\u001b[0m\n", "\u001b[32mchampions.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m435\u001b[0m\u001b[32m]\u001b[0m\u001b[32m Parcheesi is derived from *Pachisi*, another traditional Indian pastime, which in early modern \u001b[0m\n", "\u001b[32mtimes was played on a giant marble court by Mughal emperor Akbar the Great.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m436\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nCricket is the most popular \u001b[0m\n", "\u001b[32msport in India.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m437\u001b[0m\u001b[32m]\u001b[0m\u001b[32m India has won two Cricket World Cups, the 1983 edition and the 2011 edition. India has won \u001b[0m\n", "\u001b[32meight field hockey gold medals in the summer Olympics.\u001b[0m\u001b[32m[\u001b[0m\u001b[32m438\u001b[0m\u001b[32m]\u001b[0m\u001b[32m\\n\\nSee \u001b[0m\n", "\u001b[32malso\\n--------Notes\\n-----\\n\\nReferences\\n----------\\n\\nBibliography\\n------------\\n\\n### Overview\\n\\n### \u001b[0m\n", "\u001b[32mEtymology\\n\\n### History\\n\\n### Geography\\n\\n### Biodiversity\\n\\n### Politics\\n\\n### Foreign relations and \u001b[0m\n", "\u001b[32mmilitary\\n\\n### Economy\\n\\n### Demographics\\n\\n### Art\\n\\n### Culture\\n\\nExternal \u001b[0m\n", "\u001b[32mlinks\\n--------------\\n\\n**India** at Wikipedia\\'s sister projects\\n\\n information**| India topics | |\\n| --- | ---\u001b[0m\n", "\u001b[32m|\\n| | History | | | --- | --- | | | | | | --- | --- | | Overviews | | | Ancient | | | Middle Kingdoms | | | \u001b[0m\n", "\u001b[32mMiddle Ages | | | Early Modern | | | Late modern | | | Republic | | | Contemporary | | | | | |\\n| | Geography \u001b[0m\n", 
"\u001b[32m| | | --- | --- | | | | | | --- | --- | | Geology | | | Heritage | | | Environment | | | Landforms | | | \u001b[0m\n", "\u001b[32mRegions | | | Subdivisions | | | See also | | | | | |\\n| | Politics | | | --- | --- | | | | | | --- | --- | | \u001b[0m\n", "\u001b[32mGovernment | | | Law | | | | | --- | --- | | | | | Enforcement | | | | | --- | --- | | Federal | | | \u001b[0m\n", "\u001b[32mIntelligence | | | | | Military | | | Politics | | | | | |\\n| | Economy | | | --- | --- | | | | | | --- | --- \u001b[0m\n", "\u001b[32m| | Companies | | | Governance | | | Currency | | | Financial services | | | History | | | People | | | \u001b[0m\n", "\u001b[32mStates | | | Sectors | | | Regulator | | | Other | | | | | |\\n| | | | | --- | --- | | | | | | --- | --- | | \u001b[0m\n", "\u001b[32mSociety | | | Culture | | | | | |\\n\\n| Related topics | |\\n| --- | --- |\\n| | States and union territories of \u001b[0m\n", "\u001b[32mIndia | | | --- | --- | | States | | | Union territories | | | | | | South Asian topics | | | --- | --- | | \u001b[0m\n", "\u001b[32mCountries and regions | | | Ethnic groups | | | Culture | | | Environment | | | Economy and Politics | | | \u001b[0m\n", "\u001b[32mHistory | | | Religion | | | Sports | | | Education | | | Military | | | Science and technology | | | \u001b[0m\n", "\u001b[32mMembers of the Commonwealth of Nations | | | --- | --- | | Sovereign states \u001b[0m\u001b[32m(\u001b[0m\u001b[32mmembers\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | | | Associated states | |\u001b[0m\n", "\u001b[32m| Dependent territories | | | | | --- | --- | | Australia | | | New Zealand | | | United Kingdom | | | | \u001b[0m\n", "\u001b[32mSource: Commonwealth Secretariat â\\x80\\x93 Member Countries | | | South Asian Association for Regional \u001b[0m\n", "\u001b[32mCooperation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSAARC\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | | | --- | --- | | Agreements | South Asian Free Trade Area | | Summits | | | Members | | | 
\u001b[0m\n", "\u001b[32mObservers | | | Guests | | | Specialised agencies | | | Related Articles | | | Group of Eight \u001b[0m\u001b[32m(\u001b[0m\u001b[32mG8\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and Group\u001b[0m\n", "\u001b[32mof Eight + Five \u001b[0m\u001b[32m(\u001b[0m\u001b[32mG8+5\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | | | --- | --- | | G8 members | | | Representative | | | G8+5 | | | See also | | | \u001b[0m\n", "\u001b[32mG20 major economies | | | --- | --- | | | | | BRICS | | | --- | --- | | Membership | | | Summits | | | \u001b[0m\n", "\u001b[32mLeaders | | | Sports | | | Related | | | East Asia Summit \u001b[0m\u001b[32m(\u001b[0m\u001b[32mEAS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | | | --- | --- | | | | | | | | Shanghai\u001b[0m\n", "\u001b[32mCooperation Organisation \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSCO\u001b[0m\u001b[32m)\u001b[0m\u001b[32m | | | --- | --- | | Summits | | | Member states | | | Observer states | | | \u001b[0m\n", "\u001b[32mDialogue partners | | | Guests | | | Applicants | | | See also | | | Countries and dependencies of Asia | | \u001b[0m\n", "\u001b[32m| --- | --- | | Sovereign states | | | States with limited recognition | | | Dependent territories | | | | | \u001b[0m\n", "\u001b[32m--- | --- | | Australia | | | Greece | | | United Kingdom | | | | Special administrative regions of China | | |\u001b[0m\n", "\u001b[32m| | | |\\n\\nPortals:\\n\\n| Authority control databases Edit this at Wikidata | |\\n| --- | --- |\\n| International | \u001b[0m\n", "\u001b[32m|\\n| National | |\\n| Geographic | |\\n| Academics | |\\n| Artists | |\\n| People | |\\n| Other | |\\n\\n21°N \u001b[0m\n", "\u001b[32m78°E / 21°N 78°E / 21; 78\\n\\n\\n\\nRetrieved from \\n\\nCategories:\\n\\nHidden \u001b[0m\n", "\u001b[32mcategories:\\n\\nSearch\\n\\nSearch\\n\\nIndia\\n\\n305 languages\\n\\nAdd topic'\u001b[0m\n", "\u001b[1m)\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Scrape content from URLs and convert 
to documents\n", "docs = await urls_to_docs([\n", " \"https://en.wikipedia.org/wiki/India\",\n", " \"https://en.wikipedia.org/wiki/Bangalore\"\n", "])\n", "print(f\"Loaded {len(docs)} documents.\")\n", "print(docs[0])" ] }, { "cell_type": "markdown", "id": "2215615d", "metadata": {}, "source": [ "## 6. Using Generative Models\n", "\n", "Initialize and use generative models like `ChatGoogleGenerativeAI` and `ChatOpenAI` for text generation tasks. The `get_generative_model` function helps you select and configure the model.\n", "\n", "\n", "### For local llms\n", "\n", "```python\n", "llmlocal = get_generative_model(\n", " model_name=\"google/gemma-3-12b\",\n", " type='local',\n", " base_url=\"http://127.0.0.1:1234/v1\", # for an example if you are running lmstudio locally\n", " _tools=None,\n", " kwargs={\n", " \"temperature\": 0.1, # Sampling temperature for generation.\n", " \"max_tokens\": None, # Maximum number of tokens to generate (None for default).\n", " \"timeout\": None, # Timeout for API requests (None for default).\n", " \"max_retries\": 2, # Maximum number of retries for failed requests.\n", " \"api_key\": \"dummy\", # API key for authentication.\n", " }\n", ")\n", "```\n", "\n", "You can even load local models from ollama etc, just configure openai compatible API server base url arg\n", "For other proprietary models as well, just configure the base_url arg accordingly with \"others\" as type\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "b2bbe90b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-09-01 01:32:28,531] WARNING langchain_google_genai.chat_models: Unexpected argument 'generation_config' provided to ChatGoogleGenerativeAI.\n", "[2025-09-01 01:32:28,531] WARNING langchain_google_genai.chat_models: Unexpected argument 'generation_config' provided to ChatGoogleGenerativeAI.\n", "/var/folders/sh/v0nx5vbx4mv07k1h4mxmwxcw0000gn/T/ipykernel_18168/4124600085.py:1: UserWarning: WARNING! 
generation_config is not default parameter.\n", " generation_config was transferred to model_kwargs.\n", " Please confirm that generation_config is what you intended.\n", " llmgoogle = get_generative_model(model_name='gemini-1.5-flash',\n" ] } ], "source": [ "llmgoogle = get_generative_model(model_name='gemini-1.5-flash',\n", " type='google',\n", " _tools=None,\n", " kwargs={'temperature': 0.1, 'max_tokens': None, 'timeout': None, 'max_retries': 2, \n", " 'api_key': os.environ['GOOGLE_API_KEY'],\n", " 'generation_config':{\"response_mime_type\": \"application/json\"}})" ] }, { "cell_type": "markdown", "id": "197edaf9", "metadata": {}, "source": [ "# 7. Query -> Search -> Scrape -> Response \n", "```query_web_response```, intelligently chooses between RAG and a summary of the full page, based on the request" ] }, { "cell_type": "code", "execution_count": 13, "id": "e5889dba", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:06:02,440] INFO utils.answer_generation: Summary:True,subquery:['Top science news today', 'Top technology news today', 'Links to news articles', 'Summary of science news', 'Summary of technology news', 'Date of news articles'], iscoverd:False\n", "[2025-07-27 12:06:02,440] INFO utils.answer_generation: Summary:True,subquery:['Top science news today', 'Top technology news today', 'Links to news articles', 'Summary of science news', 'Summary of technology news', 'Date of news articles'], iscoverd:False\n", "[2025-07-27 12:06:02,442] INFO utils.websearch_utils: Search phrases for query 'Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line': ['top science news today', 'top technology news today', 'links to news articles', 'summary of science news', 'summary of technology news', 'date of news articles']\n", "[2025-07-27 12:06:02,442] INFO utils.websearch_utils: Search phrases for query 'Top news of today in science and technology
with date and link to original article as hyperlink and summary of each in one line': ['top science news today', 'top technology news today', 'links to news articles', 'summary of science news', 'summary of technology news', 'date of news articles']\n", "[2025-07-27 12:06:03,262] INFO utils.websearch_utils: Search results for query 'top science news today': [{'snippet': 'Science News features news articles, videos and more about the latest scientific advances. Independent, accurate nonprofit news since 1921.', 'title': 'Science News | The latest news from all areas of science', 'link': 'https://www.sciencenews.org/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:03,262] INFO utils.websearch_utils: Search results for query 'top science news today': [{'snippet': 'Science News features news articles, videos and more about the latest scientific advances. Independent, accurate nonprofit news since 1921.', 'title': 'Science News | The latest news from all areas of science', 'link': 'https://www.sciencenews.org/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:03,264] INFO utils.websearch_utils: Search results fetched for subquery: top science news today\n", "[2025-07-27 12:06:03,264] INFO utils.websearch_utils: Search results fetched for subquery: top science news today\n", "[2025-07-27 12:06:04,509] INFO utils.websearch_utils: Search results for query 'top technology news today': [{'snippet': \"TECHNOLOGY NEWS · Meet Lumo, the new AI chatbot that protects user privacy · Huawei shows off AI computing system to rival Nvidia's top product · Under Rs 30,000: ...\", 'title': 'Technology News - Gadgets', 'link': 'https://indianexpress.com/section/technology/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:04,509] INFO utils.websearch_utils: Search results for query 'top technology news today': [{'snippet': \"TECHNOLOGY NEWS · Meet Lumo, the new AI chatbot that protects user privacy · Huawei shows off AI 
computing system to rival Nvidia's top product · Under Rs 30,000: ...\", 'title': 'Technology News - Gadgets', 'link': 'https://indianexpress.com/section/technology/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:04,510] INFO utils.websearch_utils: Search results fetched for subquery: top technology news today\n", "[2025-07-27 12:06:04,510] INFO utils.websearch_utils: Search results fetched for subquery: top technology news today\n", "[2025-07-27 12:06:05,089] INFO utils.websearch_utils: Search results for query 'links to news articles': [{'snippet': \"Today's news: Get latest and Breaking News on Politics, Business, Lifestyle, Entertainment and Sports along with News updates from around the world.\", 'title': 'Latest News Today: Breaking News and Top Headlines from ...', 'link': 'https://indianexpress.com/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:05,089] INFO utils.websearch_utils: Search results for query 'links to news articles': [{'snippet': \"Today's news: Get latest and Breaking News on Politics, Business, Lifestyle, Entertainment and Sports along with News updates from around the world.\", 'title': 'Latest News Today: Breaking News and Top Headlines from ...', 'link': 'https://indianexpress.com/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:05,091] INFO utils.websearch_utils: Search results fetched for subquery: links to news articles\n", "[2025-07-27 12:06:05,091] INFO utils.websearch_utils: Search results fetched for subquery: links to news articles\n", "[2025-07-27 12:06:05,707] INFO utils.websearch_utils: Search results for query 'summary of science news': [{'snippet': 'Science News features news articles, videos and more about the latest scientific advances. 
Independent, accurate nonprofit news since 1921.', 'title': 'Science News | The latest news from all areas of science', 'link': 'https://www.sciencenews.org/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:05,707] INFO utils.websearch_utils: Search results for query 'summary of science news': [{'snippet': 'Science News features news articles, videos and more about the latest scientific advances. Independent, accurate nonprofit news since 1921.', 'title': 'Science News | The latest news from all areas of science', 'link': 'https://www.sciencenews.org/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:05,709] INFO utils.websearch_utils: Search results fetched for subquery: summary of science news\n", "[2025-07-27 12:06:05,709] INFO utils.websearch_utils: Search results fetched for subquery: summary of science news\n", "[2025-07-27 12:06:06,176] INFO utils.websearch_utils: Search results for query 'summary of technology news': [{'snippet': \"TECHNOLOGY NEWS · Meet Lumo, the new AI chatbot that protects user privacy · Huawei shows off AI computing system to rival Nvidia's top product · Under Rs 30,000: ...\", 'title': 'Technology News - Gadgets', 'link': 'https://indianexpress.com/section/technology/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:06,176] INFO utils.websearch_utils: Search results for query 'summary of technology news': [{'snippet': \"TECHNOLOGY NEWS · Meet Lumo, the new AI chatbot that protects user privacy · Huawei shows off AI computing system to rival Nvidia's top product · Under Rs 30,000: ...\", 'title': 'Technology News - Gadgets', 'link': 'https://indianexpress.com/section/technology/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:06,178] INFO utils.websearch_utils: Search results fetched for subquery: summary of technology news\n", "[2025-07-27 12:06:06,178] INFO utils.websearch_utils: Search results fetched for subquery: summary of technology 
news\n", "[2025-07-27 12:06:06,755] INFO utils.websearch_utils: Search results for query 'date of news articles': [{'snippet': 'Breaking News in India: Read Latest News on Sports, Business, Entertainment, Blogs and Opinions from leading columnists. Times of India brings the Breaking ...', 'title': 'Times of India: News - Breaking News, Latest News, India ...', 'link': 'https://timesofindia.indiatimes.com/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:06,755] INFO utils.websearch_utils: Search results for query 'date of news articles': [{'snippet': 'Breaking News in India: Read Latest News on Sports, Business, Entertainment, Blogs and Opinions from leading columnists. Times of India brings the Breaking ...', 'title': 'Times of India: News - Breaking News, Latest News, India ...', 'link': 'https://timesofindia.indiatimes.com/', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:06:06,757] INFO utils.websearch_utils: Search results fetched for subquery: date of news articles\n", "[2025-07-27 12:06:06,757] INFO utils.websearch_utils: Search results fetched for subquery: date of news articles\n", "[2025-07-27 12:06:06,758] INFO utils.websearch_utils: Created Document for source: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,758] INFO utils.websearch_utils: Created Document for source: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,759] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,759] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,760] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/\n", "[2025-07-27 12:06:06,760] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/\n", "[2025-07-27 12:06:06,761] INFO utils.websearch_utils: Created Document for source: https://www.sciencenews.org/\n", 
"[2025-07-27 12:06:06,761] INFO utils.websearch_utils: Created Document for source: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,762] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,762] INFO utils.websearch_utils: Created Document for source: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,763] INFO utils.websearch_utils: Created Document for source: https://timesofindia.indiatimes.com/\n", "[2025-07-27 12:06:06,763] INFO utils.websearch_utils: Created Document for source: https://timesofindia.indiatimes.com/\n", "[2025-07-27 12:06:06,764] INFO utils.websearch_utils: Search results fetched for query 'Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line'.\n", "[2025-07-27 12:06:06,764] INFO utils.websearch_utils: Search results fetched for query 'Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line'.\n", "[2025-07-27 12:06:06,764] INFO utils.websearch_utils: Starting context_to_docs for 6 URL groups.\n", "[2025-07-27 12:06:06,764] INFO utils.websearch_utils: Starting context_to_docs for 6 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. 
Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:06:06,786] INFO utils.websearch_utils: Fetching URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,786] INFO utils.websearch_utils: Fetching URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,788] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,788] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,792] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/\n", "[2025-07-27 12:06:06,792] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/\n", "[2025-07-27 12:06:06,793] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,793] INFO utils.websearch_utils: Fetching URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:06,796] INFO utils.websearch_utils: Fetching URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,796] INFO utils.websearch_utils: Fetching URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:06,797] ERROR utils.websearch_utils: Error processing https://timesofindia.indiatimes.com/: [Errno 32] Broken pipe\n", "[2025-07-27 12:06:06,797] ERROR utils.websearch_utils: Error processing https://timesofindia.indiatimes.com/: [Errno 32] Broken pipe\n", "[2025-07-27 12:06:06,802] ERROR utils.websearch_utils: An error occurred with URL https://timesofindia.indiatimes.com/: 'NoneType' object has no attribute 'replace'\n", "[2025-07-27 12:06:06,802] ERROR utils.websearch_utils: An error occurred with URL https://timesofindia.indiatimes.com/: 'NoneType' object has no attribute 'replace'\n", "[2025-07-27 12:06:06,875] INFO utils.websearch_utils: Fetched content from https://indianexpress.com/section/technology/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:06,875] INFO 
utils.websearch_utils: Fetched content from https://indianexpress.com/section/technology/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:06,908] INFO utils.websearch_utils: Fetched content from https://indianexpress.com/section/technology/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:06,908] INFO utils.websearch_utils: Fetched content from https://indianexpress.com/section/technology/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:06,911] INFO utils.websearch_utils: Fetched content from https://indianexpress.com/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:06,911] INFO utils.websearch_utils: Fetched content from https://indianexpress.com/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:08,111] INFO utils.websearch_utils: Fetched content from https://www.sciencenews.org/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:08,111] INFO utils.websearch_utils: Fetched content from https://www.sciencenews.org/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:08,179] INFO utils.websearch_utils: Fetched content from https://www.sciencenews.org/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:08,179] INFO utils.websearch_utils: Fetched content from https://www.sciencenews.org/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:08,778] INFO utils.websearch_utils: Processed markdown for: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,778] INFO utils.websearch_utils: Processed markdown for: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,781] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,781] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,828] INFO utils.websearch_utils: Processed markdown for: 
https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,828] INFO utils.websearch_utils: Processed markdown for: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,830] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,830] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:08,868] INFO utils.websearch_utils: Processed markdown for: https://indianexpress.com/\n", "[2025-07-27 12:06:08,868] INFO utils.websearch_utils: Processed markdown for: https://indianexpress.com/\n", "[2025-07-27 12:06:08,869] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:08,869] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:09,314] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,314] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,321] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,321] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,322] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,322] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,323] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,323] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,327] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,327] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,327] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,377] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,377] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,378] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,378] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:09,386] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,386] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,386] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:09,386] INFO utils.websearch_utils: Processed 1 docs for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:09,387] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,387] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,392] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,392] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:09,393] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:09,394] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:09,393] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/section/technology/\n", 
"[2025-07-27 12:06:09,396] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,394] INFO utils.websearch_utils: Encoding start for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:09,396] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,397] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,396] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,397] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,406] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,397] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,406] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,406] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:09,950] INFO utils.websearch_utils: Processed markdown for: https://www.sciencenews.org/\n", "[2025-07-27 12:06:09,950] INFO utils.websearch_utils: Processed markdown for: https://www.sciencenews.org/\n", "[2025-07-27 12:06:09,950] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:09,950] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,029] INFO utils.websearch_utils: Processed markdown for: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,029] INFO utils.websearch_utils: Processed markdown for: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,030] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,030] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,286] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:10,286] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:10,289] INFO utils.websearch_utils: Processed 1 docs for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,289] INFO utils.websearch_utils: Processed 1 docs for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,290] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:10,290] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:10,291] INFO utils.websearch_utils: Encoding start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,291] INFO utils.websearch_utils: Encoding start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,293] INFO chromadb.telemetry.product.posthog: Anonymized telemetry 
enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:10,293] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:10,293] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:10,412] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:10,412] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:10,416] INFO utils.websearch_utils: Processed 1 docs for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,416] INFO utils.websearch_utils: Processed 1 docs for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,417] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:10,417] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:06:10,417] INFO utils.websearch_utils: Encoding start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,417] INFO utils.websearch_utils: Encoding start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:10,418] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:10,418] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:10,418] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:11,019] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/section/technology/, Time taken: 1.63 seconds\n", "[2025-07-27 12:06:11,019] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/section/technology/, Time taken: 1.63 seconds\n", "[2025-07-27 12:06:11,029] INFO utils.websearch_utils: Encoding end for URL: https://www.sciencenews.org/, Time taken: 0.74 seconds\n", "[2025-07-27 12:06:11,035] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/section/technology/, Time taken: 1.71 seconds\n", "[2025-07-27 12:06:11,029] INFO utils.websearch_utils: Encoding end for URL: https://www.sciencenews.org/, Time taken: 0.74 seconds\n", "[2025-07-27 12:06:11,036] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:11,035] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/section/technology/, Time taken: 1.71 seconds\n", "[2025-07-27 12:06:11,036] INFO utils.websearch_utils: Reranking start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:11,036] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:11,037] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:11,036] INFO utils.websearch_utils: Reranking start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:11,037] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/section/technology/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,037] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:11,038] INFO utils.websearch_utils: Reranking end for URL: https://www.sciencenews.org/, Time taken: 0.00 seconds\n", 
"[2025-07-27 12:06:11,037] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/section/technology/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,038] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/section/technology/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,038] INFO utils.websearch_utils: Reranking end for URL: https://www.sciencenews.org/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,038] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/section/technology/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,253] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/, Time taken: 1.86 seconds\n", "[2025-07-27 12:06:11,253] INFO utils.websearch_utils: Encoding end for URL: https://indianexpress.com/, Time taken: 1.86 seconds\n", "[2025-07-27 12:06:11,254] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:11,254] INFO utils.websearch_utils: Reranking start for URL: https://indianexpress.com/\n", "[2025-07-27 12:06:11,254] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,254] INFO utils.websearch_utils: Reranking end for URL: https://indianexpress.com/, Time taken: 0.00 seconds\n", "[2025-07-27 12:06:11,866] INFO utils.websearch_utils: Encoding end for URL: https://www.sciencenews.org/, Time taken: 1.45 seconds\n", "[2025-07-27 12:06:11,866] INFO utils.websearch_utils: Encoding end for URL: https://www.sciencenews.org/, Time taken: 1.45 seconds\n", "[2025-07-27 12:06:11,867] INFO utils.websearch_utils: Reranking start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:11,867] INFO utils.websearch_utils: Reranking start for URL: https://www.sciencenews.org/\n", "[2025-07-27 12:06:11,868] INFO utils.websearch_utils: Reranking end for URL: https://www.sciencenews.org/, Time taken: 0.00 seconds\n", 
"[2025-07-27 12:06:11,868] INFO utils.websearch_utils: Reranking end for URL: https://www.sciencenews.org/, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 12.64it/s]\n", "[2025-07-27 12:06:12,186] INFO utils.websearch_utils: Retrieved 1 docs for https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:12,186] INFO utils.websearch_utils: Retrieved 1 docs for https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:12,187] INFO utils.websearch_utils: Built context for https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:12,187] INFO utils.websearch_utils: Built context for https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:12,188] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: https://indianexpress.com/section/technology/\n", "[2025-07-27 12:06:12,188] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: https://indianexpress.com/section/technology/\n", "Batches: 0%| | 0/1 [00:00Summarising using 3 documents\n", "\n" ], "text/plain": [ "Summarising using \u001b[1;36m3\u001b[0m documents\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:06:19,947] INFO utils.websearch_utils: Response generated for query 'Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line'.\n", "[2025-07-27 12:06:19,947] INFO utils.websearch_utils: Response generated for query 'Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line'.\n" ] }, { "data": { "text/html": [ "
Top Science and Technology News of Today (July 27, 2025):\n",
       "\n",
       "**Note:**  The provided text does not contain specific dates for most articles, so I'm assuming today's date, July \n",
       "27th, 2025, for all entries.  Links are not directly provided in the text, only source websites.  I will provide \n",
       "summaries in one line where possible.\n",
       "\n",
       "\n",
       "**From ScienceNews.org:**\n",
       "\n",
       "1.  **Maggots may have been on the Neandertal menu:**  A study suggests Neandertals may have eaten maggots for \n",
       "nutritional benefits. [Link:  https://www.sciencenews.org/](This link is only the website, not a direct article \n",
       "link.)\n",
       "2.  **This desert beetle runs to cool off:**  A desert beetle uses running to regulate its body temperature. [Link:\n",
       "https://www.sciencenews.org/](This link is only the website, not a direct article link.)\n",
       "3.  **AI is designing proteins that could help treat cancer:** Artificial intelligence is being used to develop \n",
       "cancer-fighting proteins. [Link: https://www.sciencenews.org/](This link is only the website, not a direct article \n",
       "link.)\n",
       "4.  **An injected gel could make drugs like Ozempic last longer:** A new gel could extend the effectiveness of \n",
       "medications like Ozempic. [Link: https://www.sciencenews.org/](This link is only the website, not a direct article \n",
       "link.)\n",
       "5.  **How many steps a day do you really need to take?:**  A discussion on the optimal number of daily steps for \n",
       "health benefits. [Link: https://www.sciencenews.org/](This link is only the website, not a direct article link.)\n",
       "6.  **Climate change may be pushing fungal allergy season earlier:**  Climate change is potentially affecting the \n",
       "timing of fungal allergy seasons. [Link: https://www.sciencenews.org/](This link is only the website, not a direct \n",
       "article link.)\n",
       "7.  **AI reveals new details about a famous Latin inscription:** Artificial intelligence helps uncover new \n",
       "information from an ancient Latin inscription. [Link: https://www.sciencenews.org/](This link is only the website, \n",
       "not a direct article link.)\n",
       "\n",
       "\n",
       "**From IndianExpress.com (Technology Section):**\n",
       "\n",
       "1.  **Amazfit Active 2 (Square) review:** A review of the Amazfit Active 2 Square smartwatch. [Link:  \n",
       "https://indianexpress.com/section/technology/](This link is only the website, not a direct article link.)\n",
       "2.  **Gemini on Galaxy Watch review:**  A user experience review of using the Gemini AI assistant on a Galaxy \n",
       "Watch. [Link: https://indianexpress.com/section/technology/](This link is only the website, not a direct article \n",
       "link.)\n",
       "3.  **SpaceX launches more Starlink satellites:** SpaceX added 52 more Starlink satellites to its constellation. \n",
       "[Link: https://indianexpress.com/section/technology/](This link is only the website, not a direct article link.)\n",
       "4.  **New theory predicts a Big Crunch:** A new cosmological theory suggests a potential \"Big Crunch\" in the \n",
       "distant future. [Link: https://indianexpress.com/section/technology/](This link is only the website, not a direct \n",
       "article link.)\n",
       "\n",
       "\n",
       "**Assumptions Made:**\n",
       "\n",
       "*   All articles from both sources are assumed to be from July 27, 2025, as the provided text lacks specific dates \n",
       "for most articles.\n",
       "*   The links provided are to the source websites, not direct links to individual articles.  Finding the exact \n",
       "articles would require navigating the websites.\n",
       "
\n" ], "text/plain": [ "Top Science and Technology News of Today \u001b[1m(\u001b[0mJuly \u001b[1;36m27\u001b[0m, \u001b[1;36m2025\u001b[0m\u001b[1m)\u001b[0m:\n", "\n", "**Note:** The provided text does not contain specific dates for most articles, so I'm assuming today's date, July \n", "27th, \u001b[1;36m2025\u001b[0m, for all entries. Links are not directly provided in the text, only source websites. I will provide \n", "summaries in one line where possible.\n", "\n", "\n", "**From ScienceNews.org:**\n", "\n", "\u001b[1;36m1\u001b[0m. **Maggots may have been on the Neandertal menu:** A study suggests Neandertals may have eaten maggots for \n", "nutritional benefits. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article \n", "link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m2\u001b[0m. **This desert beetle runs to cool off:** A desert beetle uses running to regulate its body temperature. \u001b[1m[\u001b[0mLink:\n", "\u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m3\u001b[0m. **AI is designing proteins that could help treat cancer:** Artificial intelligence is being used to develop \n", "cancer-fighting proteins. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article \n", "link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m4\u001b[0m. **An injected gel could make drugs like Ozempic last longer:** A new gel could extend the effectiveness of \n", "medications like Ozempic. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article \n", "link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m5\u001b[0m. 
**How many steps a day do you really need to take?:** A discussion on the optimal number of daily steps for \n", "health benefits. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m6\u001b[0m. **Climate change may be pushing fungal allergy season earlier:** Climate change is potentially affecting the \n", "timing of fungal allergy seasons. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct \n", "article link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m7\u001b[0m. **AI reveals new details about a famous Latin inscription:** Artificial intelligence helps uncover new \n", "information from an ancient Latin inscription. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://www.sciencenews.org/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, \n", "not a direct article link.\u001b[1m)\u001b[0m\n", "\n", "\n", "**From IndianExpress.com \u001b[1m(\u001b[0mTechnology Section\u001b[1m)\u001b[0m:**\n", "\n", "\u001b[1;36m1\u001b[0m. **Amazfit Active \u001b[1;36m2\u001b[0m \u001b[1m(\u001b[0mSquare\u001b[1m)\u001b[0m review:** A review of the Amazfit Active \u001b[1;36m2\u001b[0m Square smartwatch. \u001b[1m[\u001b[0mLink: \n", "\u001b[4;94mhttps://indianexpress.com/section/technology/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m2\u001b[0m. **Gemini on Galaxy Watch review:** A user experience review of using the Gemini AI assistant on a Galaxy \n", "Watch. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://indianexpress.com/section/technology/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article \n", "link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m3\u001b[0m. 
**SpaceX launches more Starlink satellites:** SpaceX added \u001b[1;36m52\u001b[0m more Starlink satellites to its constellation. \n", "\u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://indianexpress.com/section/technology/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct article link.\u001b[1m)\u001b[0m\n", "\u001b[1;36m4\u001b[0m. **New theory predicts a Big Crunch:** A new cosmological theory suggests a potential \u001b[32m\"Big Crunch\"\u001b[0m in the \n", "distant future. \u001b[1m[\u001b[0mLink: \u001b[4;94mhttps://indianexpress.com/section/technology/\u001b[0m\u001b[1m]\u001b[0m\u001b[1m(\u001b[0mThis link is only the website, not a direct \n", "article link.\u001b[1m)\u001b[0m\n", "\n", "\n", "**Assumptions Made:**\n", "\n", "* All articles from both sources are assumed to be from July \u001b[1;36m27\u001b[0m, \u001b[1;36m2025\u001b[0m, as the provided text lacks specific dates \n", "for most articles.\n", "* The links provided are to the source websites, not direct links to individual articles. Finding the exact \n", "articles would require navigating the websites.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "web_response = await query_web_response(\n", " \"Top news of today in science and technology with date and link to original article as hyperlink and summary of each in one line\",\n", " '26-07-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=1,\n", " document_paths=[],\n", " local_mode=False,\n", " split=False\n", ")\n", "print(web_response[0])" ] }, { "cell_type": "markdown", "id": "ddc1bafd", "metadata": {}, "source": [ "# 8. 
Summarise/QA any web page\n", "You can even summarise any url, and get the response tailored to your specific query" ] }, { "cell_type": "code", "execution_count": 14, "id": "5964bb06", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:06:29,297] INFO utils.websearch_utils: Fetching URL: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:29,297] INFO utils.websearch_utils: Fetching URL: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:31,514] INFO utils.websearch_utils: Fetched content from https://www.twofortheworld.com/andaman-itinerary/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:31,514] INFO utils.websearch_utils: Fetched content from https://www.twofortheworld.com/andaman-itinerary/ with type text/html; charset=UTF-8\n", "[2025-07-27 12:06:33,154] INFO utils.websearch_utils: Processed markdown for: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:33,154] INFO utils.websearch_utils: Processed markdown for: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:33,156] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:33,156] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://www.twofortheworld.com/andaman-itinerary/\n", "[2025-07-27 12:06:33,411] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:06:33,411] INFO utils.websearch_utils: Total URLs processed: 1\n" ] }, { "data": { "text/html": [ "
This 10-day Andaman and Nicobar Islands itinerary focuses on three islands: South Andaman (Port Blair), Havelock, \n",
       "and Neil.  A 5-day trip would require significant adjustments, prioritizing fewer locations.\n",
       "\n",
       "**Possible 5-Day Itinerary (requiring flexibility):**\n",
       "\n",
       "* **Days 1-2: Port Blair:** Explore Cellular Jail, Ross Island, and the Zonal Anthropology Museum. Consider a \n",
       "shorter trip to Chidiya Tapu Beach if time allows.  Skip Baratang Limestone Caves due to ethical concerns regarding\n",
       "the Jarawa tribe.\n",
       "* **Days 3-4: Havelock Island:** Focus on either Radhanagar Beach (famous sunset) or Elephant Beach (jungle trek, \n",
       "tide dependent).  Scuba diving is possible but may need pre-booking and depends on weather.\n",
       "* **Day 5: Neil Island:** Choose one beach (Laxmanpur II for sunset and rock pools or Sitapur for sunrise and a \n",
       "relaxed atmosphere).  Return to Port Blair for your departure.\n",
       "\n",
       "**Important Considerations:**\n",
       "\n",
       "* **Ferries:** Ferry travel between islands is crucial and can be unpredictable, especially during monsoon season \n",
       "(May-September).  Book private ferries (Makruzz or Green Ocean) in advance for more reliability, though they are \n",
       "more expensive than government ferries.\n",
       "* **Weather:** October to May is the best time to visit. Be prepared for potential delays and adjust your itinerary\n",
       "accordingly.\n",
       "* **Time Constraints:** A 5-day trip necessitates prioritizing locations.  Choosing between Havelock and Neil might\n",
       "be necessary for a more relaxed experience.\n",
       "\n",
       "\n",
       "This condensed itinerary sacrifices some of the exploration offered in the original 10-day plan but provides a \n",
       "framework for a shorter, impactful Andaman experience.  Remember to check ferry schedules and weather forecasts \n",
       "before and during your trip.\n",
       "
\n" ], "text/plain": [ "This \u001b[1;36m10\u001b[0m-day Andaman and Nicobar Islands itinerary focuses on three islands: South Andaman \u001b[1m(\u001b[0mPort Blair\u001b[1m)\u001b[0m, Havelock, \n", "and Neil. A \u001b[1;36m5\u001b[0m-day trip would require significant adjustments, prioritizing fewer locations.\n", "\n", "**Possible \u001b[1;36m5\u001b[0m-Day Itinerary \u001b[1m(\u001b[0mrequiring flexibility\u001b[1m)\u001b[0m:**\n", "\n", "* **Days \u001b[1;36m1\u001b[0m-\u001b[1;36m2\u001b[0m: Port Blair:** Explore Cellular Jail, Ross Island, and the Zonal Anthropology Museum. Consider a \n", "shorter trip to Chidiya Tapu Beach if time allows. Skip Baratang Limestone Caves due to ethical concerns regarding\n", "the Jarawa tribe.\n", "* **Days \u001b[1;36m3\u001b[0m-\u001b[1;36m4\u001b[0m: Havelock Island:** Focus on either Radhanagar Beach \u001b[1m(\u001b[0mfamous sunset\u001b[1m)\u001b[0m or Elephant Beach \u001b[1m(\u001b[0mjungle trek, \n", "tide dependent\u001b[1m)\u001b[0m. Scuba diving is possible but may need pre-booking and depends on weather.\n", "* **Day \u001b[1;36m5\u001b[0m: Neil Island:** Choose one beach \u001b[1m(\u001b[0mLaxmanpur II for sunset and rock pools or Sitapur for sunrise and a \n", "relaxed atmosphere\u001b[1m)\u001b[0m. Return to Port Blair for your departure.\n", "\n", "**Important Considerations:**\n", "\n", "* **Ferries:** Ferry travel between islands is crucial and can be unpredictable, especially during monsoon season \n", "\u001b[1m(\u001b[0mMay-September\u001b[1m)\u001b[0m. Book private ferries \u001b[1m(\u001b[0mMakruzz or Green Ocean\u001b[1m)\u001b[0m in advance for more reliability, though they are \n", "more expensive than government ferries.\n", "* **Weather:** October to May is the best time to visit. Be prepared for potential delays and adjust your itinerary\n", "accordingly.\n", "* **Time Constraints:** A \u001b[1;36m5\u001b[0m-day trip necessitates prioritizing locations. 
Choosing between Havelock and Neil might\n", "be necessary for a more relaxed experience.\n", "\n", "\n", "This condensed itinerary sacrifices some of the exploration offered in the original \u001b[1;36m10\u001b[0m-day plan but provides a \n", "framework for a shorter, impactful Andaman experience. Remember to check ferry schedules and weather forecasts \n", "before and during your trip.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "summary = await summary_of_url(\"Plan my 5 day trip to andaman and nicobar based on this\",\n", " \"https://www.twofortheworld.com/andaman-itinerary/\",\n", " llmgoogle)\n", "print(summary)" ] }, { "cell_type": "markdown", "id": "5f9fea81", "metadata": {}, "source": [ " \n", "# Summarise or QA any folder, file (including images/pdfs/excels/csv/pptx/etc), or code-file" ] }, { "cell_type": "code", "execution_count": 15, "id": "a23bff2a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:06:58,344] INFO utils.answer_generation: Summary:True,subquery:['List files in DOCUMENT folder', 'Describe content of each file in DOCUMENT folder'], iscoverd:False\n", "[2025-07-27 12:06:58,344] INFO utils.answer_generation: Summary:True,subquery:['List files in DOCUMENT folder', 'Describe content of each file in DOCUMENT folder'], iscoverd:False\n", "[2025-07-27 12:06:58,346] INFO utils.websearch_utils: Search phrases for query 'Whats all in DOCUMENT folder, enlist name with content in detail': ['list files in document folder', 'describe content of each file in document folder']\n", "[2025-07-27 12:06:58,346] INFO utils.websearch_utils: Search phrases for query 'Whats all in DOCUMENT folder, enlist name with content in detail': ['list files in document folder', 'describe content of each file in document folder']\n", "[2025-07-27 12:06:58,347] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:06:58,347] WARNING 
utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:06:58,350] INFO utils.websearch_utils: Total paths for 'Whats all in DOCUMENT folder, enlist name with content in detail': [['documents', 'documents/1706.03762v7.pdf']]\n", "[2025-07-27 12:06:58,350] INFO utils.websearch_utils: Total paths for 'Whats all in DOCUMENT folder, enlist name with content in detail': [['documents', 'documents/1706.03762v7.pdf']]\n", "[2025-07-27 12:06:58,350] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:06:58,350] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:06:58,351] INFO utils.websearch_utils: Starting context_to_docs for 2 URL groups.\n", "[2025-07-27 12:06:58,351] INFO utils.websearch_utils: Starting context_to_docs for 2 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. 
Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:06:58,369] ERROR utils.websearch_utils: The file documents does not exist.\n", "[2025-07-27 12:06:58,369] ERROR utils.websearch_utils: The file documents does not exist.\n", "[2025-07-27 12:06:58,369] ERROR utils.websearch_utils: The file documents does not exist.\n", "[2025-07-27 12:06:58,369] ERROR utils.websearch_utils: The file documents does not exist.\n", "[2025-07-27 12:06:58,370] INFO utils.websearch_utils: Processing local file: documents/1706.03762v7.pdf\n", "[2025-07-27 12:06:58,370] INFO utils.websearch_utils: Processing local file: documents/1706.03762v7.pdf\n", "[2025-07-27 12:06:58,370] INFO utils.websearch_utils: Processing local file: documents/1706.03762v7.pdf\n", "[2025-07-27 12:06:58,370] INFO utils.websearch_utils: Processing local file: documents/1706.03762v7.pdf\n", "[2025-07-27 12:06:58,371] ERROR utils.websearch_utils: File not found: The file documents does not exist.\n", "[2025-07-27 12:06:58,371] ERROR utils.websearch_utils: File not found: The file documents does not exist.\n", "[2025-07-27 12:06:58,372] ERROR utils.websearch_utils: File not found: The file documents does not exist.\n", "[2025-07-27 12:06:58,372] ERROR utils.websearch_utils: File not found: The file documents does not exist.\n", "[2025-07-27 12:06:58,387] WARNING utils.websearch_utils: No content returned for URL documents\n", "[2025-07-27 12:06:58,387] WARNING utils.websearch_utils: No content returned for URL documents\n", "[2025-07-27 12:06:58,409] WARNING utils.websearch_utils: No content returned for URL documents\n", "[2025-07-27 12:06:58,409] WARNING utils.websearch_utils: No content returned for URL documents\n", "[2025-07-27 12:06:58,417] INFO utils.websearch_utils: Total URLs processed: 0\n", "[2025-07-27 12:06:58,417] INFO utils.websearch_utils: Total URLs processed: 0\n", "[2025-07-27 12:06:58,446] INFO utils.websearch_utils: Total URLs 
processed: 0\n", "[2025-07-27 12:06:58,446] INFO utils.websearch_utils: Total URLs processed: 0\n", "[2025-07-27 12:06:58,448] INFO utils.websearch_utils: Processed 0 docs for URL: documents\n", "[2025-07-27 12:06:58,448] INFO utils.websearch_utils: Processed 0 docs for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Processed 0 docs for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Encoding start for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Processed 0 docs for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Encoding start for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Encoding start for URL: documents\n", "[2025-07-27 12:06:58,449] INFO utils.websearch_utils: Encoding start for URL: documents\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,461] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,451] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,461] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,461] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:06:58,639] ERROR utils.websearch_utils: Error setting up retrievers for documents: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.\n", "[2025-07-27 12:06:58,639] ERROR utils.websearch_utils: Error setting up retrievers for documents: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.\n", "[2025-07-27 12:06:58,640] ERROR utils.websearch_utils: Error setting up retrievers for documents: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.\n", "[2025-07-27 12:06:58,640] ERROR utils.websearch_utils: Error setting up retrievers for documents: Expected Embeddings to be non-empty list or numpy array, got [] in upsert.\n", "[2025-07-27 12:06:58,642] WARNING utils.websearch_utils: No valid docs/context for URL: documents\n", "[2025-07-27 12:06:58,642] WARNING utils.websearch_utils: No valid docs/context for URL: documents\n", "[2025-07-27 12:06:58,645] WARNING utils.websearch_utils: No valid docs/context for URL: documents\n", "[2025-07-27 12:06:58,645] WARNING utils.websearch_utils: No valid docs/context for URL: documents\n", "[2025-07-27 12:07:02,121] INFO utils.websearch_utils: Processed markdown for: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,121] INFO utils.websearch_utils: Processed markdown for: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,124] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,124] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: 
documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,129] INFO utils.websearch_utils: Processed markdown for: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,129] INFO utils.websearch_utils: Processed markdown for: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,131] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,131] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,467] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:02,467] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:02,471] INFO utils.websearch_utils: Processed 2 docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,471] INFO utils.websearch_utils: Processed 2 docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,474] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:02,474] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:02,476] INFO utils.websearch_utils: Encoding start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,476] INFO utils.websearch_utils: Encoding start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,478] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:02,478] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:02,479] INFO utils.websearch_utils: Processed 2 docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,479] INFO utils.websearch_utils: Processed 2 docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,479] INFO utils.websearch_utils: Encoding start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,478] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:02,479] INFO utils.websearch_utils: Encoding start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:02,487] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:02,487] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:02,487] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:04,411] INFO utils.websearch_utils: Encoding end for URL: documents/1706.03762v7.pdf, Time taken: 1.93 seconds\n", "[2025-07-27 12:07:04,411] INFO utils.websearch_utils: Encoding end for URL: documents/1706.03762v7.pdf, Time taken: 1.93 seconds\n", "[2025-07-27 12:07:04,412] INFO utils.websearch_utils: Reranking start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:04,412] INFO utils.websearch_utils: Reranking start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:04,412] INFO utils.websearch_utils: Reranking end for URL: documents/1706.03762v7.pdf, Time taken: 0.00 seconds\n", "[2025-07-27 12:07:04,412] INFO utils.websearch_utils: Reranking end for URL: documents/1706.03762v7.pdf, Time taken: 0.00 seconds\n", "[2025-07-27 12:07:04,793] INFO utils.websearch_utils: Encoding end for URL: documents/1706.03762v7.pdf, Time taken: 2.31 seconds\n", "[2025-07-27 12:07:04,793] INFO utils.websearch_utils: Encoding end for URL: documents/1706.03762v7.pdf, Time taken: 2.31 seconds\n", "[2025-07-27 12:07:04,794] INFO utils.websearch_utils: Reranking start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:04,794] INFO utils.websearch_utils: Reranking start for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:04,794] INFO utils.websearch_utils: Reranking end for URL: documents/1706.03762v7.pdf, Time taken: 0.00 seconds\n", "[2025-07-27 12:07:04,794] INFO utils.websearch_utils: Reranking end for URL: documents/1706.03762v7.pdf, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 1.68it/s]\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 4.78it/s]\n", "[2025-07-27 12:07:06,485] INFO utils.websearch_utils: Retrieved 2 docs for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,485] INFO utils.websearch_utils: Retrieved 2 docs for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,486] INFO utils.websearch_utils: Retrieved 2 docs for 
documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,486] INFO utils.websearch_utils: Retrieved 2 docs for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,487] INFO utils.websearch_utils: Built context for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,487] INFO utils.websearch_utils: Built context for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,488] INFO utils.websearch_utils: Built context for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,488] INFO utils.websearch_utils: Built context for documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,488] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,488] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,489] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,489] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: documents/1706.03762v7.pdf\n", "[2025-07-27 12:07:06,491] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 2, total docs: 4\n", "[2025-07-27 12:07:06,491] INFO utils.websearch_utils: context_to_docs complete. 
Total contexts: 2, total docs: 4\n", "[2025-07-27 12:07:06,492] INFO utils.websearch_utils: Context generated to answer query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n", "[2025-07-27 12:07:06,492] INFO utils.websearch_utils: Context generated to answer query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n", "[2025-07-27 12:07:06,492] INFO utils.utils: Logging results for query: Whats all in DOCUMENT folder, enlist name with content in detail\n", "[2025-07-27 12:07:06,492] INFO utils.utils: Logging results for query: Whats all in DOCUMENT folder, enlist name with content in detail\n", "[2025-07-27 12:07:06,493] INFO utils.websearch_utils: Logged results for query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n", "[2025-07-27 12:07:06,493] INFO utils.websearch_utils: Logged results for query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n", "[2025-07-27 12:07:06,494] INFO utils.websearch_utils: Generating summary for query 'Whats all in DOCUMENT folder, enlist name with content in detail' using summarizer.\n", "[2025-07-27 12:07:06,494] INFO utils.websearch_utils: Generating summary for query 'Whats all in DOCUMENT folder, enlist name with content in detail' using summarizer.\n", "[2025-07-27 12:07:06,494] INFO utils.answer_generation: Deduping docs: 4\n", "[2025-07-27 12:07:06,494] INFO utils.answer_generation: Deduping docs: 4\n", "[2025-07-27 12:07:06,495] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:07:06,495] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:07:06,496] INFO utils.answer_generation: Arrived Len of Docs: 2\n", "[2025-07-27 12:07:06,496] INFO utils.answer_generation: Arrived Len of Docs: 2\n" ] }, { "data": { "text/html": [ "
Summarising using 2 documents\n",
       "
\n" ], "text/plain": [ "Summarising using \u001b[1;36m2\u001b[0m documents\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:07:12,104] INFO utils.websearch_utils: Response generated for query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n", "[2025-07-27 12:07:12,104] INFO utils.websearch_utils: Response generated for query 'Whats all in DOCUMENT folder, enlist name with content in detail'.\n" ] }, { "data": { "text/html": [ "
The DOCUMENT folder contains one file: `documents/1706.03762v7.pdf`.  This PDF is a research paper titled \n",
       "\"Attention is All You Need,\" detailing the Transformer model architecture.\n",
       "\n",
       "The paper's content includes:\n",
       "\n",
       "* **Authors and Affiliations:**  A list of authors and their affiliations (Google Research and Google Brain \n",
       "primarily, with one author from the University of Toronto).  Their individual contributions are also specified.\n",
       "\n",
       "* **Abstract:** A summary of the paper, highlighting the introduction of the Transformer model, its superior \n",
       "performance and parallelization capabilities compared to recurrent and convolutional neural networks in machine \n",
       "translation tasks, and its successful application to English constituency parsing.\n",
       "\n",
       "* **Introduction:** Background on sequence transduction models, particularly recurrent neural networks (RNNs) and \n",
       "their limitations regarding parallelization.  It introduces the Transformer as a solution that relies solely on \n",
       "attention mechanisms.\n",
       "\n",
       "* **Background:**  A review of related work, including convolutional neural networks (CNNs) and self-attention \n",
       "mechanisms.\n",
       "\n",
       "* **Model Architecture:** A detailed explanation of the Transformer architecture, including:\n",
       "    * Encoder and Decoder Stacks:  The structure of the encoder and decoder, composed of stacked layers with \n",
       "self-attention and feed-forward networks.\n",
       "    * Attention:  A description of the scaled dot-product attention and multi-head attention mechanisms.\n",
       "    * Position-wise Feed-Forward Networks:  Explanation of the fully connected feed-forward networks applied to \n",
       "each position.\n",
       "    * Embeddings and Softmax:  Details on token embedding and probability prediction.\n",
       "    * Positional Encoding:  How positional information is incorporated into the model.\n",
       "\n",
       "* **Why Self-Attention:** A comparison of self-attention with recurrent and convolutional layers, focusing on \n",
       "computational complexity, parallelization, and path length for long-range dependencies.\n",
       "\n",
       "* **Training:**  Details on the training data (WMT 2014 English-German and English-French datasets), hardware (8 \n",
       "NVIDIA P100 GPUs), optimizer (Adam), and regularization techniques (residual dropout and label smoothing).\n",
       "\n",
       "* **Results:**  Presentation of results on machine translation tasks (English-to-German and English-to-French), \n",
       "showing state-of-the-art BLEU scores achieved by the Transformer model.  Results on English constituency parsing \n",
       "are also included, demonstrating the model's generalizability.\n",
       "\n",
       "* **Model Variations:**  An analysis of different variations of the Transformer architecture and their impact on \n",
       "performance.\n",
       "\n",
       "* **Conclusion:**  Summary of the findings and future research directions.\n",
       "\n",
       "* **References:** A list of cited works.\n",
       "\n",
       "* **Appendix (partially shown):** Attention visualizations illustrating the model's attention mechanisms.\n",
       "\n",
       "\n",
       "**Assumptions Made:**\n",
       "\n",
       "No assumptions were made beyond what is explicitly stated or implied within the provided document.  The summary is \n",
       "purely based on the content of the research paper.\n",
       "
\n" ], "text/plain": [ "The DOCUMENT folder contains one file: `documents/\u001b[1;36m1706.\u001b[0m03762v7.pdf`. This PDF is a research paper titled \n", "\u001b[32m\"Attention is All You Need,\"\u001b[0m detailing the Transformer model architecture.\n", "\n", "The paper's content includes:\n", "\n", "* **Authors and Affiliations:** A list of authors and their affiliations \u001b[1m(\u001b[0mGoogle Research and Google Brain \n", "primarily, with one author from the University of Toronto\u001b[1m)\u001b[0m. Their individual contributions are also specified.\n", "\n", "* **Abstract:** A summary of the paper, highlighting the introduction of the Transformer model, its superior \n", "performance and parallelization capabilities compared to recurrent and convolutional neural networks in machine \n", "translation tasks, and its successful application to English constituency parsing.\n", "\n", "* **Introduction:** Background on sequence transduction models, particularly recurrent neural networks \u001b[1m(\u001b[0mRNNs\u001b[1m)\u001b[0m and \n", "their limitations regarding parallelization. 
It introduces the Transformer as a solution that relies solely on \n", "attention mechanisms.\n", "\n", "* **Background:** A review of related work, including convolutional neural networks \u001b[1m(\u001b[0mCNNs\u001b[1m)\u001b[0m and self-attention \n", "mechanisms.\n", "\n", "* **Model Architecture:** A detailed explanation of the Transformer architecture, including:\n", " * Encoder and Decoder Stacks: The structure of the encoder and decoder, composed of stacked layers with \n", "self-attention and feed-forward networks.\n", " * Attention: A description of the scaled dot-product attention and multi-head attention mechanisms.\n", " * Position-wise Feed-Forward Networks: Explanation of the fully connected feed-forward networks applied to \n", "each position.\n", " * Embeddings and Softmax: Details on token embedding and probability prediction.\n", " * Positional Encoding: How positional information is incorporated into the model.\n", "\n", "* **Why Self-Attention:** A comparison of self-attention with recurrent and convolutional layers, focusing on \n", "computational complexity, parallelization, and path length for long-range dependencies.\n", "\n", "* **Training:** Details on the training data \u001b[1m(\u001b[0mWMT \u001b[1;36m2014\u001b[0m English-German and English-French datasets\u001b[1m)\u001b[0m, hardware \u001b[1m(\u001b[0m\u001b[1;36m8\u001b[0m \n", "NVIDIA P100 GPUs\u001b[1m)\u001b[0m, optimizer \u001b[1m(\u001b[0mAdam\u001b[1m)\u001b[0m, and regularization techniques \u001b[1m(\u001b[0mresidual dropout and label smoothing\u001b[1m)\u001b[0m.\n", "\n", "* **Results:** Presentation of results on machine translation tasks \u001b[1m(\u001b[0mEnglish-to-German and English-to-French\u001b[1m)\u001b[0m, \n", "showing state-of-the-art BLEU scores achieved by the Transformer model. 
Results on English constituency parsing \n", "are also included, demonstrating the model's generalizability.\n", "\n", "* **Model Variations:** An analysis of different variations of the Transformer architecture and their impact on \n", "performance.\n", "\n", "* **Conclusion:** Summary of the findings and future research directions.\n", "\n", "* **References:** A list of cited works.\n", "\n", "* **Appendix \u001b[1m(\u001b[0mpartially shown\u001b[1m)\u001b[0m:** Attention visualizations illustrating the model's attention mechanisms.\n", "\n", "\n", "**Assumptions Made:**\n", "\n", "No assumptions were made beyond what is explicitly stated or implied within the provided document. The summary is \n", "purely based on the content of the research paper.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#FOLDER\n", "web_response = await query_web_response(\n", " \"Whats all in DOCUMENT folder, enlist name with content in detail\",\n", " '26-07-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=1,\n", " document_paths=['documents'],\n", " local_mode=True,\n", " split=False\n", ")\n", "print(web_response[0])" ] }, { "cell_type": "code", "execution_count": 16, "id": "a1fe0074", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:07:20,155] INFO utils.answer_generation: Summary:True,subquery:['App capabilities markdown table'], iscoverd:False\n", "[2025-07-27 12:07:20,155] INFO utils.answer_generation: Summary:True,subquery:['App capabilities markdown table'], iscoverd:False\n", "[2025-07-27 12:07:20,157] INFO utils.websearch_utils: Search phrases for query 'Explain me the capabilties of this app in markdown table format': ['app capabilities markdown table']\n", "[2025-07-27 12:07:20,157] INFO utils.websearch_utils: Search phrases for query 'Explain me the capabilties of this app in markdown table 
format': ['app capabilities markdown table']\n", "[2025-07-27 12:07:20,158] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:07:20,158] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:07:20,159] INFO utils.websearch_utils: Total paths for 'Explain me the capabilties of this app in markdown table format': [['app.py']]\n", "[2025-07-27 12:07:20,159] INFO utils.websearch_utils: Total paths for 'Explain me the capabilties of this app in markdown table format': [['app.py']]\n", "[2025-07-27 12:07:20,159] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:07:20,159] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:07:20,160] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "[2025-07-27 12:07:20,160] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. 
Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:07:23,193] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:07:23,193] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:07:24,673] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:07:24,673] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:07:24,675] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:07:24,675] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:07:24,960] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:07:24,960] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:07:24,962] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:07:24,962] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:07:24,963] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:07:24,963] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:07:24,965] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:24,965] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:24,965] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:25,548] INFO utils.websearch_utils: Encoding end for URL: app.py, Time taken: 0.59 seconds\n", "[2025-07-27 12:07:25,548] INFO utils.websearch_utils: Encoding end for URL: app.py, Time taken: 0.59 seconds\n", "[2025-07-27 12:07:25,550] INFO utils.websearch_utils: Reranking start for URL: app.py\n", "[2025-07-27 12:07:25,550] INFO utils.websearch_utils: Reranking start for URL: app.py\n", "[2025-07-27 12:07:25,550] INFO utils.websearch_utils: Reranking end for URL: app.py, Time taken: 0.00 seconds\n", "[2025-07-27 12:07:25,550] INFO utils.websearch_utils: Reranking end for URL: app.py, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 2.01it/s]\n", "[2025-07-27 12:07:27,152] INFO utils.websearch_utils: Retrieved 2 docs for app.py\n", "[2025-07-27 12:07:27,152] INFO utils.websearch_utils: Retrieved 2 docs for app.py\n", "[2025-07-27 12:07:27,153] INFO utils.websearch_utils: Built context for app.py\n", "[2025-07-27 12:07:27,153] INFO utils.websearch_utils: Built context for app.py\n", "[2025-07-27 12:07:27,153] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: app.py\n", "[2025-07-27 12:07:27,153] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: app.py\n", "[2025-07-27 12:07:27,154] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 1, total docs: 4\n", "[2025-07-27 12:07:27,154] INFO utils.websearch_utils: context_to_docs complete. 
Total contexts: 1, total docs: 4\n", "[2025-07-27 12:07:27,155] INFO utils.websearch_utils: Context generated to answer query 'Explain me the capabilties of this app in markdown table format'.\n", "[2025-07-27 12:07:27,155] INFO utils.websearch_utils: Context generated to answer query 'Explain me the capabilties of this app in markdown table format'.\n", "[2025-07-27 12:07:27,156] INFO utils.utils: Logging results for query: Explain me the capabilties of this app in markdown table format\n", "[2025-07-27 12:07:27,156] INFO utils.utils: Logging results for query: Explain me the capabilties of this app in markdown table format\n", "[2025-07-27 12:07:27,157] INFO utils.websearch_utils: Logged results for query 'Explain me the capabilties of this app in markdown table format'.\n", "[2025-07-27 12:07:27,157] INFO utils.websearch_utils: Logged results for query 'Explain me the capabilties of this app in markdown table format'.\n", "[2025-07-27 12:07:27,157] INFO utils.websearch_utils: Generating summary for query 'Explain me the capabilties of this app in markdown table format' using summarizer.\n", "[2025-07-27 12:07:27,157] INFO utils.websearch_utils: Generating summary for query 'Explain me the capabilties of this app in markdown table format' using summarizer.\n", "[2025-07-27 12:07:27,158] INFO utils.answer_generation: Deduping docs: 4\n", "[2025-07-27 12:07:27,158] INFO utils.answer_generation: Deduping docs: 4\n", "[2025-07-27 12:07:27,159] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:07:27,159] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:07:27,159] INFO utils.answer_generation: Arrived Len of Docs: 4\n", "[2025-07-27 12:07:27,159] INFO utils.answer_generation: Arrived Len of Docs: 4\n" ] }, { "data": { "text/html": [ "
Summarising using 4 documents\n",
       "
\n" ], "text/plain": [ "Summarising using \u001b[1;36m4\u001b[0m documents\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:07:33,982] INFO utils.websearch_utils: Response generated for query 'Explain me the capabilties of this app in markdown table format'.\n", "[2025-07-27 12:07:33,982] INFO utils.websearch_utils: Response generated for query 'Explain me the capabilties of this app in markdown table format'.\n" ] }, { "data": { "text/html": [ "
## CoexistAI App Capabilities\n",
       "\n",
       "The CoexistAI application offers a range of functionalities, all powered by a large language model (LLM) and \n",
       "various supporting tools.  The following table summarizes its capabilities:\n",
       "\n",
       "| Feature             | Description                                                                                \n",
       "| Input Parameters                                                                                                 \n",
       "| Output                                                                                                    | \n",
       "Assumptions                                                                                                    |\n",
       "|----------------------|-------------------------------------------------------------------------------------------\n",
       "-------------------------------------------------------------------------------------------------------------------\n",
       "-------------------------|-----------------------------------------------------------------------------------------\n",
       "--------------------------------|----------------------------------------------------------------------------------\n",
       "---------------------------|---------------------------------------------------------------------------------------\n",
       "----------------------|\n",
       "| Web Search           | Performs a web search using SearxNG, retrieves results, and generates a response based on \n",
       "those results.  Includes options for reranking and local document processing.                                      \n",
       "| `query`, `rerank`, `num_results`, `local_mode`, `split`, `document_paths`                                        \n",
       "| Generated response, sources, search results, retrieved documents, and context.                               | \n",
       "SearxNG docker container is either running or can be started automatically.                               |\n",
       "| Web Summarization    | Generates a summary of a web page based on a provided query and URL.                      \n",
       "| `query`, `url`, `local_mode`                                                                                     \n",
       "| Summary and sources                                                                                           |  \n",
       "A suitable summarization model is available and configured.                                                  |\n",
       "| YouTube Search       | Performs a YouTube search, retrieves transcripts, and generates summaries of the videos \n",
       "based on a given query or URL.                                                                                     \n",
       "| `query` (URL or search term), `prompt`, `n` (number of videos)                                                   \n",
       "| Summaries of YouTube transcripts                                                                               | \n",
       "YouTube transcripts are accessible and can be processed.                                                    |\n",
       "| Reddit Search        | Performs a Reddit search and retrieves posts based on specified parameters (subreddit, \n",
       "search query, time filter, etc.).                                                                                  \n",
       "| `subreddit`, `url_type`, `n`, `k`, `custom_url`, `time_filter`, `search_query`, `sort_type`                      \n",
       "| Reddit posts and comments                                                                                      | \n",
       "Reddit API access is configured correctly.                                                                   |\n",
       "| Map Search           | Performs a map search to find routes and points of interest (POIs) between two locations. \n",
       "| `start_location`, `end_location`, `pois_radius`, `amenities`, `limit`, `task`                                    \n",
       "| Location(s), route, and POIs                                                                                  | \n",
       "Map data is accessible and can be processed.                                                                 |\n",
       "| Git Repository Search | Retrieves the directory tree structure or specific content from a GitHub or local Git \n",
       "repository and answers user queries based on the fetched content.                                                  \n",
       "| (Git Tree): `repobaseurl`  (Git Search): `repobaseurl`, `parttoresearch`, `query`, `type`                        \n",
       "| Directory tree structure or response to the query based on fetched content                                     | \n",
       "Git repository is accessible (either via URL or local path).                                                |\n",
       "| Local Folder Tree    | Generates a markdown representation of a local folder's directory tree structure.         \n",
       "| `folder_path`, `level`, `prefix`                                                                                 \n",
       "| Markdown tree string                                                                                          |  \n",
       "Appropriate permissions to access the specified folder path.                                                |\n",
       "| Response Check       | Evaluates the completeness of information gathered by the agent to answer a user's query, \n",
       "providing suggestions for improvement if needed.                                                                   \n",
       "| `query`, `toolsshorthand`                                                                                       |\n",
       "Suggestions for improvement or confirmation of completeness                                                    |  \n",
       "The LLM can effectively assess the completeness of information.                                               |\n",
       "| Clickable Element Extraction | Extracts top-k clickable elements from a given URL based on a query, useful for \n",
       "finding deeper links within a webpage.                                                                             \n",
       "| `url`, `query`, `topk`                                                                                           \n",
       "| List of clickable elements with title, URL, and score.                                                        |  \n",
       "The underlying method (`get_topk_bm25_clickable_elements`) functions correctly and has access to necessary \n",
       "resources. |\n",
       "\n",
       "\n",
       "**Note:** The application uses a configurable LLM (`llm`), embedding model (`hf_embeddings`), and cross-encoder \n",
       "(`cross_encoder`) for various tasks.  The specific models used are defined in a `model_config` file.  Error \n",
       "handling and input validation are not explicitly detailed in the provided code snippets.\n",
       "
\n" ], "text/plain": [ "## CoexistAI App Capabilities\n", "\n", "The CoexistAI application offers a range of functionalities, all powered by a large language model \u001b[1m(\u001b[0mLLM\u001b[1m)\u001b[0m and \n", "various supporting tools. The following table summarizes its capabilities:\n", "\n", "| Feature | Description \n", "| Input Parameters \n", "| Output | \n", "Assumptions |\n", "|----------------------|-------------------------------------------------------------------------------------------\n", "-------------------------------------------------------------------------------------------------------------------\n", "-------------------------|-----------------------------------------------------------------------------------------\n", "--------------------------------|----------------------------------------------------------------------------------\n", "---------------------------|---------------------------------------------------------------------------------------\n", "----------------------|\n", "| Web Search | Performs a web search using SearxNG, retrieves results, and generates a response based on \n", "those results. Includes options for reranking and local document processing. \n", "| `query`, `rerank`, `num_results`, `local_mode`, `split`, `document_paths` \n", "| Generated response, sources, search results, retrieved documents, and context. | \n", "SearxNG docker container is either running or can be started automatically. |\n", "| Web Summarization | Generates a summary of a web page based on a provided query and URL. \n", "| `query`, `url`, `local_mode` \n", "| Summary and sources | \n", "A suitable summarization model is available and configured. |\n", "| YouTube Search | Performs a YouTube search, retrieves transcripts, and generates summaries of the videos \n", "based on a given query or URL. 
\n", "| `query` \u001b[1m(\u001b[0mURL or search term\u001b[1m)\u001b[0m, `prompt`, `n` \u001b[1m(\u001b[0mnumber of videos\u001b[1m)\u001b[0m \n", "| Summaries of YouTube transcripts | \n", "YouTube transcripts are accessible and can be processed. |\n", "| Reddit Search | Performs a Reddit search and retrieves posts based on specified parameters \u001b[1m(\u001b[0msubreddit, \n", "search query, time filter, etc.\u001b[1m)\u001b[0m. \n", "| `subreddit`, `url_type`, `n`, `k`, `custom_url`, `time_filter`, `search_query`, `sort_type` \n", "| Reddit posts and comments | \n", "Reddit API access is configured correctly. |\n", "| Map Search | Performs a map search to find routes and points of interest \u001b[1m(\u001b[0mPOIs\u001b[1m)\u001b[0m between two locations. \n", "| `start_location`, `end_location`, `pois_radius`, `amenities`, `limit`, `task` \n", "| \u001b[1;35mLocation\u001b[0m\u001b[1m(\u001b[0ms\u001b[1m)\u001b[0m, route, and POIs | \n", "Map data is accessible and can be processed. |\n", "| Git Repository Search | Retrieves the directory tree structure or specific content from a GitHub or local Git \n", "repository and answers user queries based on the fetched content. \n", "| \u001b[1m(\u001b[0mGit Tree\u001b[1m)\u001b[0m: `repobaseurl` \u001b[1m(\u001b[0mGit Search\u001b[1m)\u001b[0m: `repobaseurl`, `parttoresearch`, `query`, `type` \n", "| Directory tree structure or response to the query based on fetched content | \n", "Git repository is accessible \u001b[1m(\u001b[0meither via URL or local path\u001b[1m)\u001b[0m. |\n", "| Local Folder Tree | Generates a markdown representation of a local folder's directory tree structure. \n", "| `folder_path`, `level`, `prefix` \n", "| Markdown tree string | \n", "Appropriate permissions to access the specified folder path. |\n", "| Response Check | Evaluates the completeness of information gathered by the agent to answer a user's query, \n", "providing suggestions for improvement if needed. 
\n", "| `query`, `toolsshorthand` |\n", "Suggestions for improvement or confirmation of completeness | \n", "The LLM can effectively assess the completeness of information. |\n", "| Clickable Element Extraction | Extracts top-k clickable elements from a given URL based on a query, useful for \n", "finding deeper links within a webpage. \n", "| `url`, `query`, `topk` \n", "| List of clickable elements with title, URL, and score. | \n", "The underlying method \u001b[1m(\u001b[0m`get_topk_bm25_clickable_elements`\u001b[1m)\u001b[0m functions correctly and has access to necessary \n", "resources. |\n", "\n", "\n", "**Note:** The application uses a configurable LLM \u001b[1m(\u001b[0m`llm`\u001b[1m)\u001b[0m, embedding model \u001b[1m(\u001b[0m`hf_embeddings`\u001b[1m)\u001b[0m, and cross-encoder \n", "\u001b[1m(\u001b[0m`cross_encoder`\u001b[1m)\u001b[0m for various tasks. The specific models used are defined in a `model_config` file. Error \n", "handling and input validation are not explicitly detailed in the provided code snippets.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# FILE\n", "web_response = await query_web_response(\n", " \"Explain me the capabilties of this app in markdown table format\",\n", " '26-07-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=1,\n", " document_paths=['app.py'],\n", " local_mode=True,\n", " split=False\n", ")\n", "print(web_response[0])" ] }, { "cell_type": "code", "execution_count": 17, "id": "3dbe001f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:07:47,909] INFO utils.answer_generation: Summary:False,subquery:['artistic view logo'], iscoverd:False\n", "[2025-07-27 12:07:47,909] INFO utils.answer_generation: Summary:False,subquery:['artistic view logo'], iscoverd:False\n", "[2025-07-27 12:07:47,911] INFO utils.websearch_utils: Search phrases 
for query 'Give me your artistic view on this logo': ['artistic view logo']\n", "[2025-07-27 12:07:47,911] INFO utils.websearch_utils: Search phrases for query 'Give me your artistic view on this logo': ['artistic view logo']\n", "[2025-07-27 12:07:47,912] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:07:47,912] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:07:47,913] INFO utils.websearch_utils: Total paths for 'Give me your artistic view on this logo': [['artifacts/logo.jpeg']]\n", "[2025-07-27 12:07:47,913] INFO utils.websearch_utils: Total paths for 'Give me your artistic view on this logo': [['artifacts/logo.jpeg']]\n", "[2025-07-27 12:07:47,915] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:07:47,915] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:07:47,916] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "[2025-07-27 12:07:47,916] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. 
Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:07:47,954] INFO utils.websearch_utils: Processing local file: artifacts/logo.jpeg\n", "[2025-07-27 12:07:47,954] INFO utils.websearch_utils: Processing local file: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,194] INFO utils.websearch_utils: Processed markdown for: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,194] INFO utils.websearch_utils: Processed markdown for: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,197] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,197] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,489] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:55,489] INFO utils.websearch_utils: Total URLs processed: 2\n", "[2025-07-27 12:07:55,492] INFO utils.websearch_utils: Processed 2 docs for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,492] INFO utils.websearch_utils: Processed 2 docs for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,492] INFO utils.websearch_utils: Encoding start for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,492] INFO utils.websearch_utils: Encoding start for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,494] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:55,494] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:55,494] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:07:55,946] INFO utils.websearch_utils: Encoding end for URL: artifacts/logo.jpeg, Time taken: 0.45 seconds\n", "[2025-07-27 12:07:55,946] INFO utils.websearch_utils: Encoding end for URL: artifacts/logo.jpeg, Time taken: 0.45 seconds\n", "[2025-07-27 12:07:55,948] INFO utils.websearch_utils: Reranking start for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,948] INFO utils.websearch_utils: Reranking start for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:55,949] INFO utils.websearch_utils: Reranking end for URL: artifacts/logo.jpeg, Time taken: 0.00 seconds\n", "[2025-07-27 12:07:55,949] INFO utils.websearch_utils: Reranking end for URL: artifacts/logo.jpeg, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 5.00it/s]\n", "[2025-07-27 12:07:57,297] INFO utils.websearch_utils: Retrieved 2 docs for artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,297] INFO utils.websearch_utils: Retrieved 2 docs for artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,298] INFO utils.websearch_utils: Built context for artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,298] INFO utils.websearch_utils: Built context for artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,298] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,298] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: artifacts/logo.jpeg\n", "[2025-07-27 12:07:57,299] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 1, total docs: 2\n", "[2025-07-27 12:07:57,299] INFO utils.websearch_utils: context_to_docs complete. 
Total contexts: 1, total docs: 2\n", "[2025-07-27 12:07:57,300] INFO utils.websearch_utils: Context generated to answer query 'Give me your artistic view on this logo'.\n", "[2025-07-27 12:07:57,300] INFO utils.websearch_utils: Context generated to answer query 'Give me your artistic view on this logo'.\n", "[2025-07-27 12:07:57,300] INFO utils.utils: Logging results for query: Give me your artistic view on this logo\n", "[2025-07-27 12:07:57,300] INFO utils.utils: Logging results for query: Give me your artistic view on this logo\n", "[2025-07-27 12:07:57,301] INFO utils.websearch_utils: Logged results for query 'Give me your artistic view on this logo'.\n", "[2025-07-27 12:07:57,301] INFO utils.websearch_utils: Logged results for query 'Give me your artistic view on this logo'.\n", "[2025-07-27 12:07:57,301] INFO utils.websearch_utils: Generating Answer for query 'Give me your artistic view on this logo' using response gen.\n", "[2025-07-27 12:07:57,301] INFO utils.websearch_utils: Generating Answer for query 'Give me your artistic view on this logo' using response gen.\n", "[2025-07-27 12:07:57,302] INFO utils.answer_generation: Generating Answer for query: Give me your artistic view on this logo\n", "[2025-07-27 12:07:57,302] INFO utils.answer_generation: Generating Answer for query: Give me your artistic view on this logo\n", "[2025-07-27 12:07:59,171] INFO utils.answer_generation: Structured response generated successfully.\n", "[2025-07-27 12:07:59,171] INFO utils.answer_generation: Structured response generated successfully.\n", "[2025-07-27 12:07:59,172] INFO utils.websearch_utils: Response generated for query 'Give me your artistic view on this logo'.\n", "[2025-07-27 12:07:59,172] INFO utils.websearch_utils: Response generated for query 'Give me your artistic view on this logo'.\n" ] }, { "data": { "text/html": [ "
#### Answer: \n",
       "The logo is a vibrant and imaginative illustration blending nature, science, and everyday life. A lion in a suit \n",
       "holding coffee represents power and sophistication, while a blue brain above symbolizes intellect. Two bulls, one \n",
       "blue and one black and white, represent contrasting perspectives.  The background features interconnected tech \n",
       "icons (gears, test tubes, cameras, circuits), suggesting innovation and the complexity of the mind. Stars add a \n",
       "whimsical touch. The overall artistic effect is a thought-provoking exploration of balance, intelligence, and \n",
       "diverse forces shaping our world.  It's not just a logo; it's a visual narrative.\n",
       "\n",
       "#### Next steps:\n",
       "None\n",
       "
\n" ], "text/plain": [ "#### Answer: \n", "The logo is a vibrant and imaginative illustration blending nature, science, and everyday life. A lion in a suit \n", "holding coffee represents power and sophistication, while a blue brain above symbolizes intellect. Two bulls, one \n", "blue and one black and white, represent contrasting perspectives. The background features interconnected tech \n", "icons \u001b[1m(\u001b[0mgears, test tubes, cameras, circuits\u001b[1m)\u001b[0m, suggesting innovation and the complexity of the mind. Stars add a \n", "whimsical touch. The overall artistic effect is a thought-provoking exploration of balance, intelligence, and \n", "diverse forces shaping our world. It's not just a logo; it's a visual narrative.\n", "\n", "#### Next steps:\n", "\u001b[3;35mNone\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# FILE\n", "web_response = await query_web_response(\n", " \"Give me your artistic view on this logo\",\n", " '26-07-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=1,\n", " document_paths=['artifacts/logo.jpeg'],\n", " local_mode=True,\n", " split=False\n", ")\n", "print(web_response[0])" ] }, { "cell_type": "markdown", "id": "f962c291", "metadata": {}, "source": [ "## 10. YouTube Transcript Summarization\n", "\n", "Summarize YouTube video transcripts using the `youtube_transcript_response` function. This is useful for extracting insights from long videos." ] }, { "cell_type": "code", "execution_count": 18, "id": "ef41b810", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Generated YouTube context for query 'https://www.youtube.com/watch?v=o8NiE3XMPrM&t=6648s': \n",
       "\n",
       "Video: https://www.youtube.com/watch?v=o8NiE3XMPrM&t=6648s\n",
       "Transcript Summary: content='This is a comprehensive summary of the Google I/O \\'25 Keynote based on the provided \n",
       "metadata and transcript.\\n\\n**Key Announcements and Highlights:**\\n\\n* **Gemini 2.5:**  Google announced \n",
       "significant advancements in its Gemini model, boasting improved performance across various benchmarks (LMArena, \n",
       "WebDev Arena) and highlighting its capabilities in coding, reasoning, and multi-modal tasks.  Gemini 2.5 Pro and \n",
       "Gemini 2.5 Flash were both detailed, with Flash emphasizing speed and efficiency.  A new \"Deep Think\" mode was also\n",
       "teased, promising even greater performance but with a focus on further safety evaluations before wider \n",
       "release.\\n\\n* **Gemini App Updates:**  The Gemini app received major updates, including:\\n    * **Agent Mode:**  \n",
       "Allows the app to perform multi-step tasks on behalf of the user (e.g., finding apartments, booking \n",
       "appointments).\\n    * **Personalized Smart Replies:**  Generates email replies in the user\\'s style and tone using \n",
       "personal context.\\n    * **Gemini Live Enhancements:** Added camera and screen sharing capabilities, along with \n",
       "integration with other Google apps.\\n\\n* **Google Search Enhancements:**\\n    * **AI Overviews:**  Continued growth\n",
       "and expansion, now reaching over 1.5 billion users monthly.\\n    * **AI Mode:** A complete reimagining of Google \n",
       "Search, leveraging Gemini 2.5 for more complex queries, conversational follow-ups, and dynamic UI adaptation.  \n",
       "Features like Deep Search (for in-depth research) and personalized suggestions based on user context were \n",
       "highlighted.  Agentic capabilities were also integrated, allowing Search to perform tasks like booking tickets and \n",
       "making reservations.\\n    * **Multimodal Search (Search Live):**  Allows users to use their camera to ask questions\n",
       "about the real world in real-time.\\n\\n* **Generative Media Advancements:**\\n    * **Lyria 2:**  An improved music \n",
       "generation model capable of creating high-fidelity music with vocals.\\n    * **Imagen 4:**  A significantly \n",
       "improved image generation model with enhanced quality, speed, and text capabilities.\\n    * **Veo 3:**  A new video\n",
       "generation model with Native Audio generation, allowing for the creation of videos with realistic sound effects and\n",
       "dialogue.\\n    * **Flow:** A new AI filmmaking tool combining Veo, Imagen, and Gemini, designed to streamline the \n",
       "creative process for filmmakers.\\n\\n* **Android XR:**  Google showcased its Android XR platform, designed for a \n",
       "range of devices including headsets and glasses.  The platform integrates Gemini for enhanced contextual awareness \n",
       "and assistance.  Partnerships with Samsung, Gentle Monster, and Warby Parker were announced for headset and glasses\n",
       "development.  A live demo showcased the capabilities of Android XR glasses.\\n\\n* **Project Astra and Mariner:**  \n",
       "These research projects, previously showcased in earlier I/O events, are now being integrated into various Google \n",
       "products, demonstrating the transition of research prototypes into real-world applications.\\n\\n* **New AI \n",
       "Subscription Plans:** Google announced two new AI subscription plans: Google AI Pro and Google AI Ultra, offering \n",
       "varying levels of access to AI features and capabilities.\\n\\n**Overall Tone and Message:**\\n\\nThe keynote \n",
       "emphasized Google\\'s significant progress in AI, particularly with Gemini, and its commitment to integrating AI \n",
       "into its existing products and creating new experiences.  The focus was on making AI more personal, proactive, and \n",
       "powerful for users, while also highlighting the potential of AI to accelerate scientific discovery and address \n",
       "societal challenges.  The keynote also stressed collaboration with developers and the creative community to shape \n",
       "the future of AI.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings':\n",
       "[]}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} \n",
       "id='run--b7e022e0-d8e3-41d9-a73c-a2c90ad5a013-0' usage_metadata={'input_tokens': 21590, 'output_tokens': 733, \n",
       "'total_tokens': 22323, 'input_token_details': {'cache_read': 0}}\n",
       "\n",
       "\n",
       "
\n" ], "text/plain": [ "Generated YouTube context for query \u001b[32m'https://www.youtube.com/watch?\u001b[0m\u001b[32mv\u001b[0m\u001b[32m=\u001b[0m\u001b[32mo8NiE3XMPrM\u001b[0m\u001b[32m&\u001b[0m\u001b[32mt\u001b[0m\u001b[32m=\u001b[0m\u001b[32m6648s\u001b[0m\u001b[32m'\u001b[0m: \n", "\n", "Video: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mo8NiE3XMPrM\u001b[0m\u001b[4;94m&\u001b[0m\u001b[4;94mt\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94m6648s\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a comprehensive summary of the Google I/O \\'25 Keynote based on the provided \u001b[0m\n", "\u001b[32mmetadata and transcript.\\n\\n**Key Announcements and Highlights:**\\n\\n* **Gemini 2.5:** Google announced \u001b[0m\n", "\u001b[32msignificant advancements in its Gemini model, boasting improved performance across various benchmarks \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLMArena, \u001b[0m\n", "\u001b[32mWebDev Arena\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and highlighting its capabilities in coding, reasoning, and multi-modal tasks. Gemini 2.5 Pro and \u001b[0m\n", "\u001b[32mGemini 2.5 Flash were both detailed, with Flash emphasizing speed and efficiency. 
A new \"Deep Think\" mode was also\u001b[0m\n", "\u001b[32mteased, promising even greater performance but with a focus on further safety evaluations before wider \u001b[0m\n", "\u001b[32mrelease.\\n\\n* **Gemini App Updates:** The Gemini app received major updates, including:\\n * **Agent Mode:** \u001b[0m\n", "\u001b[32mAllows the app to perform multi-step tasks on behalf of the user \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g., finding apartments, booking \u001b[0m\n", "\u001b[32mappointments\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n * **Personalized Smart Replies:** Generates email replies in the user\\'s style and tone using \u001b[0m\n", "\u001b[32mpersonal context.\\n * **Gemini Live Enhancements:** Added camera and screen sharing capabilities, along with \u001b[0m\n", "\u001b[32mintegration with other Google apps.\\n\\n* **Google Search Enhancements:**\\n * **AI Overviews:** Continued growth\u001b[0m\n", "\u001b[32mand expansion, now reaching over 1.5 billion users monthly.\\n * **AI Mode:** A complete reimagining of Google \u001b[0m\n", "\u001b[32mSearch, leveraging Gemini 2.5 for more complex queries, conversational follow-ups, and dynamic UI adaptation. \u001b[0m\n", "\u001b[32mFeatures like Deep Search \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfor in-depth research\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and personalized suggestions based on user context were \u001b[0m\n", "\u001b[32mhighlighted. 
Agentic capabilities were also integrated, allowing Search to perform tasks like booking tickets and \u001b[0m\n", "\u001b[32mmaking reservations.\\n * **Multimodal Search \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSearch Live\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:** Allows users to use their camera to ask questions\u001b[0m\n", "\u001b[32mabout the real world in real-time.\\n\\n* **Generative Media Advancements:**\\n * **Lyria 2:** An improved music \u001b[0m\n", "\u001b[32mgeneration model capable of creating high-fidelity music with vocals.\\n * **Imagen 4:** A significantly \u001b[0m\n", "\u001b[32mimproved image generation model with enhanced quality, speed, and text capabilities.\\n * **Veo 3:** A new video\u001b[0m\n", "\u001b[32mgeneration model with Native Audio generation, allowing for the creation of videos with realistic sound effects and\u001b[0m\n", "\u001b[32mdialogue.\\n * **Flow:** A new AI filmmaking tool combining Veo, Imagen, and Gemini, designed to streamline the \u001b[0m\n", "\u001b[32mcreative process for filmmakers.\\n\\n* **Android XR:** Google showcased its Android XR platform, designed for a \u001b[0m\n", "\u001b[32mrange of devices including headsets and glasses. The platform integrates Gemini for enhanced contextual awareness \u001b[0m\n", "\u001b[32mand assistance. Partnerships with Samsung, Gentle Monster, and Warby Parker were announced for headset and glasses\u001b[0m\n", "\u001b[32mdevelopment. 
A live demo showcased the capabilities of Android XR glasses.\\n\\n* **Project Astra and Mariner:** \u001b[0m\n", "\u001b[32mThese research projects, previously showcased in earlier I/O events, are now being integrated into various Google \u001b[0m\n", "\u001b[32mproducts, demonstrating the transition of research prototypes into real-world applications.\\n\\n* **New AI \u001b[0m\n", "\u001b[32mSubscription Plans:** Google announced two new AI subscription plans: Google AI Pro and Google AI Ultra, offering \u001b[0m\n", "\u001b[32mvarying levels of access to AI features and capabilities.\\n\\n**Overall Tone and Message:**\\n\\nThe keynote \u001b[0m\n", "\u001b[32memphasized Google\\'s significant progress in AI, particularly with Gemini, and its commitment to integrating AI \u001b[0m\n", "\u001b[32minto its existing products and creating new experiences. The focus was on making AI more personal, proactive, and \u001b[0m\n", "\u001b[32mpowerful for users, while also highlighting the potential of AI to accelerate scientific discovery and address \u001b[0m\n", "\u001b[32msocietal challenges. 
The keynote also stressed collaboration with developers and the creative community to shape \u001b[0m\n", "\u001b[32mthe future of AI.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m:\n", "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mid\u001b[0m=\u001b[32m'run--b7e022e0-d8e3-41d9-a73c-a2c90ad5a013-0'\u001b[0m \u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m21590\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m733\u001b[0m, \n", "\u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m22323\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n",
       "\n",
       "Video: https://www.youtube.com/watch?v=o8NiE3XMPrM&t=6648s\n",
       "Transcript Summary: content='This is a comprehensive summary of the Google I/O \\'25 Keynote based on the provided \n",
       "metadata and transcript.\\n\\n**Key Announcements and Highlights:**\\n\\n* **Gemini 2.5:**  Google announced \n",
       "significant advancements in its Gemini model, boasting improved performance across various benchmarks (LMArena, \n",
       "WebDev Arena) and highlighting its capabilities in coding, reasoning, and multi-modal tasks.  Gemini 2.5 Pro and \n",
       "Gemini 2.5 Flash were both detailed, with Flash emphasizing speed and efficiency.  A new \"Deep Think\" mode was also\n",
       "teased, promising even greater performance but with a focus on further safety evaluations before wider \n",
       "release.\\n\\n* **Gemini App Updates:**  The Gemini app received major updates, including:\\n    * **Agent Mode:**  \n",
       "Allows the app to perform multi-step tasks on behalf of the user (e.g., finding apartments, booking \n",
       "appointments).\\n    * **Personalized Smart Replies:**  Generates email replies in the user\\'s style and tone using \n",
       "personal context.\\n    * **Gemini Live Enhancements:** Added camera and screen sharing capabilities, along with \n",
       "integration with other Google apps.\\n\\n* **Google Search Enhancements:**\\n    * **AI Overviews:**  Continued growth\n",
       "and expansion, now reaching over 1.5 billion users monthly.\\n    * **AI Mode:** A complete reimagining of Google \n",
       "Search, leveraging Gemini 2.5 for more complex queries, conversational follow-ups, and dynamic UI adaptation.  \n",
       "Features like Deep Search (for in-depth research) and personalized suggestions based on user context were \n",
       "highlighted.  Agentic capabilities were also integrated, allowing Search to perform tasks like booking tickets and \n",
       "making reservations.\\n    * **Multimodal Search (Search Live):**  Allows users to use their camera to ask questions\n",
       "about the real world in real-time.\\n\\n* **Generative Media Advancements:**\\n    * **Lyria 2:**  An improved music \n",
       "generation model capable of creating high-fidelity music with vocals.\\n    * **Imagen 4:**  A significantly \n",
       "improved image generation model with enhanced quality, speed, and text capabilities.\\n    * **Veo 3:**  A new video\n",
       "generation model with Native Audio generation, allowing for the creation of videos with realistic sound effects and\n",
       "dialogue.\\n    * **Flow:** A new AI filmmaking tool combining Veo, Imagen, and Gemini, designed to streamline the \n",
       "creative process for filmmakers.\\n\\n* **Android XR:**  Google showcased its Android XR platform, designed for a \n",
       "range of devices including headsets and glasses.  The platform integrates Gemini for enhanced contextual awareness \n",
       "and assistance.  Partnerships with Samsung, Gentle Monster, and Warby Parker were announced for headset and glasses\n",
       "development.  A live demo showcased the capabilities of Android XR glasses.\\n\\n* **Project Astra and Mariner:**  \n",
       "These research projects, previously showcased in earlier I/O events, are now being integrated into various Google \n",
       "products, demonstrating the transition of research prototypes into real-world applications.\\n\\n* **New AI \n",
       "Subscription Plans:** Google announced two new AI subscription plans: Google AI Pro and Google AI Ultra, offering \n",
       "varying levels of access to AI features and capabilities.\\n\\n**Overall Tone and Message:**\\n\\nThe keynote \n",
       "emphasized Google\\'s significant progress in AI, particularly with Gemini, and its commitment to integrating AI \n",
       "into its existing products and creating new experiences.  The focus was on making AI more personal, proactive, and \n",
       "powerful for users, while also highlighting the potential of AI to accelerate scientific discovery and address \n",
       "societal challenges.  The keynote also stressed collaboration with developers and the creative community to shape \n",
       "the future of AI.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings':\n",
       "[]}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} \n",
       "id='run--b7e022e0-d8e3-41d9-a73c-a2c90ad5a013-0' usage_metadata={'input_tokens': 21590, 'output_tokens': 733, \n",
       "'total_tokens': 22323, 'input_token_details': {'cache_read': 0}}\n",
       "\n",
       "\n",
       "
\n" ], "text/plain": [ "\n", "\n", "Video: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mo8NiE3XMPrM\u001b[0m\u001b[4;94m&\u001b[0m\u001b[4;94mt\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94m6648s\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a comprehensive summary of the Google I/O \\'25 Keynote based on the provided \u001b[0m\n", "\u001b[32mmetadata and transcript.\\n\\n**Key Announcements and Highlights:**\\n\\n* **Gemini 2.5:** Google announced \u001b[0m\n", "\u001b[32msignificant advancements in its Gemini model, boasting improved performance across various benchmarks \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLMArena, \u001b[0m\n", "\u001b[32mWebDev Arena\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and highlighting its capabilities in coding, reasoning, and multi-modal tasks. Gemini 2.5 Pro and \u001b[0m\n", "\u001b[32mGemini 2.5 Flash were both detailed, with Flash emphasizing speed and efficiency. 
A new \"Deep Think\" mode was also\u001b[0m\n", "\u001b[32mteased, promising even greater performance but with a focus on further safety evaluations before wider \u001b[0m\n", "\u001b[32mrelease.\\n\\n* **Gemini App Updates:** The Gemini app received major updates, including:\\n * **Agent Mode:** \u001b[0m\n", "\u001b[32mAllows the app to perform multi-step tasks on behalf of the user \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g., finding apartments, booking \u001b[0m\n", "\u001b[32mappointments\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n * **Personalized Smart Replies:** Generates email replies in the user\\'s style and tone using \u001b[0m\n", "\u001b[32mpersonal context.\\n * **Gemini Live Enhancements:** Added camera and screen sharing capabilities, along with \u001b[0m\n", "\u001b[32mintegration with other Google apps.\\n\\n* **Google Search Enhancements:**\\n * **AI Overviews:** Continued growth\u001b[0m\n", "\u001b[32mand expansion, now reaching over 1.5 billion users monthly.\\n * **AI Mode:** A complete reimagining of Google \u001b[0m\n", "\u001b[32mSearch, leveraging Gemini 2.5 for more complex queries, conversational follow-ups, and dynamic UI adaptation. \u001b[0m\n", "\u001b[32mFeatures like Deep Search \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfor in-depth research\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and personalized suggestions based on user context were \u001b[0m\n", "\u001b[32mhighlighted. 
Agentic capabilities were also integrated, allowing Search to perform tasks like booking tickets and \u001b[0m\n", "\u001b[32mmaking reservations.\\n * **Multimodal Search \u001b[0m\u001b[32m(\u001b[0m\u001b[32mSearch Live\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:** Allows users to use their camera to ask questions\u001b[0m\n", "\u001b[32mabout the real world in real-time.\\n\\n* **Generative Media Advancements:**\\n * **Lyria 2:** An improved music \u001b[0m\n", "\u001b[32mgeneration model capable of creating high-fidelity music with vocals.\\n * **Imagen 4:** A significantly \u001b[0m\n", "\u001b[32mimproved image generation model with enhanced quality, speed, and text capabilities.\\n * **Veo 3:** A new video\u001b[0m\n", "\u001b[32mgeneration model with Native Audio generation, allowing for the creation of videos with realistic sound effects and\u001b[0m\n", "\u001b[32mdialogue.\\n * **Flow:** A new AI filmmaking tool combining Veo, Imagen, and Gemini, designed to streamline the \u001b[0m\n", "\u001b[32mcreative process for filmmakers.\\n\\n* **Android XR:** Google showcased its Android XR platform, designed for a \u001b[0m\n", "\u001b[32mrange of devices including headsets and glasses. The platform integrates Gemini for enhanced contextual awareness \u001b[0m\n", "\u001b[32mand assistance. Partnerships with Samsung, Gentle Monster, and Warby Parker were announced for headset and glasses\u001b[0m\n", "\u001b[32mdevelopment. 
A live demo showcased the capabilities of Android XR glasses.\\n\\n* **Project Astra and Mariner:** \u001b[0m\n", "\u001b[32mThese research projects, previously showcased in earlier I/O events, are now being integrated into various Google \u001b[0m\n", "\u001b[32mproducts, demonstrating the transition of research prototypes into real-world applications.\\n\\n* **New AI \u001b[0m\n", "\u001b[32mSubscription Plans:** Google announced two new AI subscription plans: Google AI Pro and Google AI Ultra, offering \u001b[0m\n", "\u001b[32mvarying levels of access to AI features and capabilities.\\n\\n**Overall Tone and Message:**\\n\\nThe keynote \u001b[0m\n", "\u001b[32memphasized Google\\'s significant progress in AI, particularly with Gemini, and its commitment to integrating AI \u001b[0m\n", "\u001b[32minto its existing products and creating new experiences. The focus was on making AI more personal, proactive, and \u001b[0m\n", "\u001b[32mpowerful for users, while also highlighting the potential of AI to accelerate scientific discovery and address \u001b[0m\n", "\u001b[32msocietal challenges. 
The keynote also stressed collaboration with developers and the creative community to shape \u001b[0m\n", "\u001b[32mthe future of AI.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m:\n", "\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mid\u001b[0m=\u001b[32m'run--b7e022e0-d8e3-41d9-a73c-a2c90ad5a013-0'\u001b[0m \u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m21590\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m733\u001b[0m, \n", "\u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m22323\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "summary = youtube_transcript_response(\"https://www.youtube.com/watch?v=o8NiE3XMPrM&t=6648s\", \n", " 'summarize in bullets and themes', llmgoogle)\n", "print(summary)" ] }, { "cell_type": "markdown", "id": "7a00c605", "metadata": {}, "source": [ "You can also now do simple phrase search and get summarise of topk videos relevant to the search phrase " ] }, { "cell_type": "code", "execution_count": 19, "id": "1d4a028c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:08:47,538] INFO utils.websearch_utils: Found YouTube video: Qwen 3 Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! 
(Opensource) by WorldofAI at https://www.youtube.com/watch?v=8jBw0CVPZPM\n", "[2025-07-27 12:08:47,538] INFO utils.websearch_utils: Found YouTube video: Qwen 3 Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! (Opensource) by WorldofAI at https://www.youtube.com/watch?v=8jBw0CVPZPM\n", "[2025-07-27 12:08:55,593] INFO utils.websearch_utils: Found YouTube video: Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial by Astro K Joseph at https://www.youtube.com/watch?v=kv57HqEb2j8\n", "[2025-07-27 12:08:55,593] INFO utils.websearch_utils: Found YouTube video: Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial by Astro K Joseph at https://www.youtube.com/watch?v=kv57HqEb2j8\n", "[2025-07-27 12:09:04,635] INFO utils.websearch_utils: Found YouTube video: QWEN 3 CODER is Unleashed... better than KIMI K2 by Wes Roth at https://www.youtube.com/watch?v=feAc83Qlx4Q\n", "[2025-07-27 12:09:04,635] INFO utils.websearch_utils: Found YouTube video: QWEN 3 CODER is Unleashed... better than KIMI K2 by Wes Roth at https://www.youtube.com/watch?v=feAc83Qlx4Q\n" ] }, { "data": { "text/html": [ "
Generated YouTube context for query 'Qwen 3 coder reviews': \n",
       "\n",
       "Video: Qwen 3 Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! (Opensource) by WorldofAI\n",
       "URL: https://www.youtube.com/watch?v=8jBw0CVPZPM\n",
       "Transcript Summary: content='This is a detailed analysis of the YouTube video metadata and transcript.\\n\\n**Video \n",
       "Overview:**\\n\\nThe video is a review and demonstration of Qwen3-Coder, a new open-source agentic coding large \n",
       "language model (LLM) from Alibaba.  The presenter highlights its capabilities, including its large context window \n",
       "(256K tokens, scalable to 1M), its strong performance on coding benchmarks (outperforming some closed-source \n",
       "models), and its accompanying command-line interface (CLI), Qwen Code.  The video includes several live demos \n",
       "showcasing Qwen3-Coder\\'s ability to generate code for various tasks, such as creating animations, interactive \n",
       "dashboards, and neural network visualizers.  A key selling point is that the model and its tools are completely \n",
       "free and open-source.\\n\\n**Strengths of the Video:**\\n\\n* **Clear and Concise Introduction:** The video clearly \n",
       "introduces Qwen3-Coder and its key features early on.\\n* **Compelling Visuals:** The demos are engaging and \n",
       "effectively demonstrate the model\\'s capabilities.  The visual results of the code generation are shown, making the\n",
       "power of the LLM clear to viewers.\\n* **Benchmark Comparisons:** The presenter provides benchmark comparisons to \n",
       "other LLMs, establishing Qwen3-Coder\\'s competitive advantage.\\n* **Multiple Access Methods:** The video explains \n",
       "multiple ways to access and use Qwen3-Coder (chatbot, API, local installation).\\n* **Well-Structured Content:** The\n",
       "video is logically organized, moving from an introduction to demos and concluding with a call to action.\\n* \n",
       "**Effective Use of Keywords and Hashtags:** The metadata and description are well-optimized for search, using \n",
       "relevant keywords and hashtags.\\n* **Sponsor Integration:** The sponsor message is integrated naturally and \n",
       "doesn\\'t disrupt the flow of the video.\\n\\n\\n**Areas for Improvement:**\\n\\n* **Transcript Accuracy:** While the \n",
       "transcript is generally accurate, there are a few minor typos and inconsistencies (e.g., \"Coin 3\" instead of \"Qwen \n",
       "3\" in several places).  A thorough proofread would improve the quality.\\n* **Demo Complexity:** While the demos are\n",
       "impressive, some viewers might find them too advanced.  Including simpler demos could broaden the appeal to a wider\n",
       "audience.\\n* **Technical Depth:**  The video could benefit from a slightly deeper dive into the technical aspects \n",
       "of Qwen3-Coder\\'s architecture (Mixture of Experts) for a more technically inclined audience.  A brief explanation \n",
       "of how the 256K context window works would also be beneficial.\\n* **Call to Action Clarity:** While the video has \n",
       "multiple calls to action (subscribe, donate, join Discord), consolidating them might make them more \n",
       "impactful.\\n\\n\\n**Overall:**\\n\\nThe YouTube video is well-produced and effectively communicates the value \n",
       "proposition of Qwen3-Coder.  The engaging demos and clear explanations make it a valuable resource for developers \n",
       "and AI enthusiasts interested in open-source coding LLMs.  Addressing the minor areas for improvement would further\n",
       "enhance its impact and reach.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, \n",
       "'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} \n",
       "id='run--327884b3-8a87-417e-ba81-0e0e748223e1-0' usage_metadata={'input_tokens': 3267, 'output_tokens': 624, \n",
       "'total_tokens': 3891, 'input_token_details': {'cache_read': 0}}\n",
       "\n",
       "\n",
       "\n",
       "Video: Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial by Astro K Joseph\n",
       "URL: https://www.youtube.com/watch?v=kv57HqEb2j8\n",
       "Transcript Summary: content='Here\\'s a breakdown of the YouTube video transcript, organized for clarity and \n",
       "SEO:\\n\\n**Title:** Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial\\n\\n**Meta \n",
       "Description:** Learn how to set up and use the new Qwen Code CLI with the Qwen-3 Coder AI model. This full tutorial\n",
       "covers installation, API key configuration, and practical examples.  Is it better than Claude Code? Find \n",
       "out!\\n\\n**Tags:** Qwen Code CLI, Qwen-3 Coder, AI powered CLI, AI coding, code generation, tutorial, OpenRouter,  \n",
       "large language model, LLM, coding assistant, Gemini Code, Claude Code,  software development, programming, AI \n",
       "tools,  NodeJS, npm\\n\\n\\n**Video Summary:**\\n\\nThis tutorial demonstrates how to set up and use the newly released \n",
       "Qwen Code CLI, an AI-powered command-line interface, with the Qwen-3 Coder AI model.  The video compares it \n",
       "implicitly to other AI CLIs like Claude Code and Gemini Code.  The presenter walks viewers through the installation\n",
       "process (requiring NodeJS), API key configuration using OpenRouter, and provides practical examples of generating \n",
       "code for a tic-tac-toe game and a landing page.  The video also explores using Qwen Code to analyze existing \n",
       "codebases and understand their architecture.  While initially showcasing smooth code generation, the video later \n",
       "encounters some challenges with longer, more complex prompts, highlighting potential limitations.\\n\\n\\n**Key \n",
       "Sections & Timestamps (estimated, based on transcript length):**\\n\\n* **0:00-2:00:** Introduction – Announcing Qwen\n",
       "Code CLI and Qwen-3 Coder, highlighting the trend of AI-powered CLIs.\\n* **2:00-4:00:** Installation –  Explaining \n",
       "the prerequisites (NodeJS) and providing the `npm` installation command.\\n* **4:00-6:00:** API Key Setup –  Showing\n",
       "how to obtain an API key from OpenRouter and configure environment variables.\\n* **6:00-8:00:** Basic Usage –  \n",
       "Demonstrating simple interactions with the CLI and generating a basic tic-tac-toe game.\\n* **8:00-11:00:**  Landing\n",
       "Page Example – Attempting to generate a more complex landing page, encountering some delays and issues.  This \n",
       "section highlights potential limitations and the need for iterative refinement of prompts.\\n* **11:00-12:00:** Code\n",
       "Analysis –  Using Qwen Code to analyze an existing Next.js project and generate a summary of its architecture.\\n* \n",
       "**12:00-12:45:** Conclusion –  Recap of the process, additional features mentioned (like MCP integration), and call\n",
       "to action.\\n\\n\\n**Improvements for the Video:**\\n\\n* **Address the \"Better than Claude Code?\" question directly:** \n",
       "The video title raises this question, but the comparison is implicit.  A more direct comparison, perhaps with \n",
       "side-by-side examples, would strengthen the video.\\n* **Troubleshooting the Landing Page Issue:** The video \n",
       "encounters difficulties generating the landing page.  Addressing this directly, perhaps by showing successful \n",
       "generation after prompt refinement or explaining potential reasons for the delay, would improve the tutorial\\'s \n",
       "completeness.\\n* **Time Stamps:** Adding timestamps to the video description would greatly improve user \n",
       "experience.\\n* **Show more advanced features:**  The video briefly mentions MCP integration.  Demonstrating this or\n",
       "other advanced features would add value.\\n* **Visuals:**  The video could benefit from more visual aids, such as \n",
       "screen recordings showing the code being generated in real-time.\\n\\n\\nBy addressing these points, the video would \n",
       "become a more comprehensive and compelling tutorial on using Qwen Code CLI.' additional_kwargs={} \n",
       "response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', \n",
       "'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--8949b5c8-f8a8-49d3-a091-c0f356eceefc-0' \n",
       "usage_metadata={'input_tokens': 3493, 'output_tokens': 784, 'total_tokens': 4277, 'input_token_details': \n",
       "{'cache_read': 0}}\n",
       "\n",
       "\n",
       "\n",
       "Video: QWEN 3 CODER is Unleashed... better than KIMI K2 by Wes Roth\n",
       "URL: https://www.youtube.com/watch?v=feAc83Qlx4Q\n",
       "Transcript Summary: content='This is a detailed summary of the YouTube video about Qwen-3-Coder, an open-source AI \n",
       "coding model.\\n\\n**Key Points:**\\n\\n* **Qwen-3-Coder\\'s Superiority:** The video highlights Qwen-3-Coder\\'s \n",
       "performance surpassing other models like Kim K2, GPT-4.1, and even showing competitiveness with Claude Sonnet in \n",
       "various benchmarks.  Its strength lies in agentic coding tasks, involving multi-turn interactions and tool \n",
       "use.\\n\\n* **Model Size and Architecture:**  Qwen-3-Coder is a 480B parameter model, but utilizes a Mixture of \n",
       "Experts approach, keeping only 35B parameters active during inference.  It\\'s described as an \"instruct\" model, \n",
       "meaning it\\'s designed for helpful and friendly interactions.\\n\\n* **Reinforcement Learning (RL) Training:** A \n",
       "significant portion of the video focuses on the model\\'s RL training.  Unlike other models focusing on \n",
       "competition-style coding problems, Qwen-3-Coder was trained on real-world coding tasks, leading to improved \n",
       "generalization across various tasks (code generation, math problems, etc.).  The training involved a scalable \n",
       "system running 20,000 parallel environments.\\n\\n* **Agentic Capabilities and Tools:** The model\\'s ability to \n",
       "perform agentic coding is emphasized.  It seamlessly integrates with tools like Qwen Code (a CLI tool forked from \n",
       "Gemini Code) and Claude Code, enabling complex interactions with the environment.\\n\\n* **Real-World \n",
       "Demonstrations:** The video showcases several impressive demonstrations of Qwen-3-Coder\\'s capabilities, including \n",
       "building simulations, game creation (a drone game, Minecraft clone), and interactive visualizations.\\n\\n* \n",
       "**Open-Source Availability:**  The model and associated tools are open-source, available on Hugging Face and \n",
       "GitHub, fostering community contributions and development.\\n\\n* **Future Implications:** The presenter expresses \n",
       "surprise at the rapid advancement of open-source AI, suggesting it\\'s closing the gap with leading proprietary \n",
       "models much faster than anticipated.\\n\\n\\n**Overall:**\\n\\nThe video presents a very positive and enthusiastic \n",
       "review of Qwen-3-Coder, emphasizing its impressive performance, innovative training methodology, and open-source \n",
       "nature.  It positions the model as a significant advancement in the field of large language models and agentic \n",
       "coding.  The presenter\\'s hands-on demonstrations further solidify this positive assessment.' additional_kwargs={} \n",
       "response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', \n",
       "'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--59f2073a-7d87-4ff0-9a2e-aa2dea885451-0' \n",
       "usage_metadata={'input_tokens': 2078, 'output_tokens': 495, 'total_tokens': 2573, 'input_token_details': \n",
       "{'cache_read': 0}}\n",
       "\n",
       "\n",
       "
\n" ], "text/plain": [ "Generated YouTube context for query \u001b[32m'Qwen 3 coder reviews'\u001b[0m: \n", "\n", "Video: Qwen \u001b[1;36m3\u001b[0m Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! \u001b[1m(\u001b[0mOpensource\u001b[1m)\u001b[0m by WorldofAI\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94m8jBw0CVPZPM\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a detailed analysis of the YouTube video metadata and transcript.\\n\\n**Video \u001b[0m\n", "\u001b[32mOverview:**\\n\\nThe video is a review and demonstration of Qwen3-Coder, a new open-source agentic coding large \u001b[0m\n", "\u001b[32mlanguage model \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLLM\u001b[0m\u001b[32m)\u001b[0m\u001b[32m from Alibaba. The presenter highlights its capabilities, including its large context window \u001b[0m\n", "\u001b[32m(\u001b[0m\u001b[32m256K tokens, scalable to 1M\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, its strong performance on coding benchmarks \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutperforming some closed-source \u001b[0m\n", "\u001b[32mmodels\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and its accompanying command-line interface \u001b[0m\u001b[32m(\u001b[0m\u001b[32mCLI\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Qwen Code. The video includes several live demos \u001b[0m\n", "\u001b[32mshowcasing Qwen3-Coder\\'s ability to generate code for various tasks, such as creating animations, interactive \u001b[0m\n", "\u001b[32mdashboards, and neural network visualizers. 
A key selling point is that the model and its tools are completely \u001b[0m\n", "\u001b[32mfree and open-source.\\n\\n**Strengths of the Video:**\\n\\n* **Clear and Concise Introduction:** The video clearly \u001b[0m\n", "\u001b[32mintroduces Qwen3-Coder and its key features early on.\\n* **Compelling Visuals:** The demos are engaging and \u001b[0m\n", "\u001b[32meffectively demonstrate the model\\'s capabilities. The visual results of the code generation are shown, making the\u001b[0m\n", "\u001b[32mpower of the LLM clear to viewers.\\n* **Benchmark Comparisons:** The presenter provides benchmark comparisons to \u001b[0m\n", "\u001b[32mother LLMs, establishing Qwen3-Coder\\'s competitive advantage.\\n* **Multiple Access Methods:** The video explains \u001b[0m\n", "\u001b[32mmultiple ways to access and use Qwen3-Coder \u001b[0m\u001b[32m(\u001b[0m\u001b[32mchatbot, API, local installation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n* **Well-Structured Content:** The\u001b[0m\n", "\u001b[32mvideo is logically organized, moving from an introduction to demos and concluding with a call to action.\\n* \u001b[0m\n", "\u001b[32m**Effective Use of Keywords and Hashtags:** The metadata and description are well-optimized for search, using \u001b[0m\n", "\u001b[32mrelevant keywords and hashtags.\\n* **Sponsor Integration:** The sponsor message is integrated naturally and \u001b[0m\n", "\u001b[32mdoesn\\'t disrupt the flow of the video.\\n\\n\\n**Areas for Improvement:**\\n\\n* **Transcript Accuracy:** While the \u001b[0m\n", "\u001b[32mtranscript is generally accurate, there are a few minor typos and inconsistencies \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g., \"Coin 3\" instead of \"Qwen \u001b[0m\n", "\u001b[32m3\" in several places\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. A thorough proofread would improve the quality.\\n* **Demo Complexity:** While the demos are\u001b[0m\n", "\u001b[32mimpressive, some viewers might find them too advanced. 
Including simpler demos could broaden the appeal to a wider\u001b[0m\n", "\u001b[32maudience.\\n* **Technical Depth:** The video could benefit from a slightly deeper dive into the technical aspects \u001b[0m\n", "\u001b[32mof Qwen3-Coder\\'s architecture \u001b[0m\u001b[32m(\u001b[0m\u001b[32mMixture of Experts\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for a more technically inclined audience. A brief explanation \u001b[0m\n", "\u001b[32mof how the 256K context window works would also be beneficial.\\n* **Call to Action Clarity:** While the video has \u001b[0m\n", "\u001b[32mmultiple calls to action \u001b[0m\u001b[32m(\u001b[0m\u001b[32msubscribe, donate, join Discord\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, consolidating them might make them more \u001b[0m\n", "\u001b[32mimpactful.\\n\\n\\n**Overall:**\\n\\nThe YouTube video is well-produced and effectively communicates the value \u001b[0m\n", "\u001b[32mproposition of Qwen3-Coder. The engaging demos and clear explanations make it a valuable resource for developers \u001b[0m\n", "\u001b[32mand AI enthusiasts interested in open-source coding LLMs. 
Addressing the minor areas for improvement would further\u001b[0m\n", "\u001b[32menhance its impact and reach.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \n", "\u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mid\u001b[0m=\u001b[32m'run--327884b3-8a87-417e-ba81-0e0e748223e1-0'\u001b[0m \u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m3267\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m624\u001b[0m, \n", "\u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m3891\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n", "\n", "Video: Qwen Code CLI + Qwen-\u001b[1;36m3\u001b[0m Coder 🔥 | Better Than Claude Code? - Full Tutorial by Astro K Joseph\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mkv57HqEb2j8\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'Here\\'s a breakdown of the YouTube video transcript, organized for clarity and \u001b[0m\n", "\u001b[32mSEO:\\n\\n**Title:** Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial\\n\\n**Meta \u001b[0m\n", "\u001b[32mDescription:** Learn how to set up and use the new Qwen Code CLI with the Qwen-3 Coder AI model. This full tutorial\u001b[0m\n", "\u001b[32mcovers installation, API key configuration, and practical examples. 
Is it better than Claude Code? Find \u001b[0m\n", "\u001b[32mout!\\n\\n**Tags:** Qwen Code CLI, Qwen-3 Coder, AI powered CLI, AI coding, code generation, tutorial, OpenRouter, \u001b[0m\n", "\u001b[32mlarge language model, LLM, coding assistant, Gemini Code, Claude Code, software development, programming, AI \u001b[0m\n", "\u001b[32mtools, NodeJS, npm\\n\\n\\n**Video Summary:**\\n\\nThis tutorial demonstrates how to set up and use the newly released \u001b[0m\n", "\u001b[32mQwen Code CLI, an AI-powered command-line interface, with the Qwen-3 Coder AI model. The video compares it \u001b[0m\n", "\u001b[32mimplicitly to other AI CLIs like Claude Code and Gemini Code. The presenter walks viewers through the installation\u001b[0m\n", "\u001b[32mprocess \u001b[0m\u001b[32m(\u001b[0m\u001b[32mrequiring NodeJS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, API key configuration using OpenRouter, and provides practical examples of generating \u001b[0m\n", "\u001b[32mcode for a tic-tac-toe game and a landing page. The video also explores using Qwen Code to analyze existing \u001b[0m\n", "\u001b[32mcodebases and understand their architecture. 
While initially showcasing smooth code generation, the video later \u001b[0m\n", "\u001b[32mencounters some challenges with longer, more complex prompts, highlighting potential limitations.\\n\\n\\n**Key \u001b[0m\n", "\u001b[32mSections & Timestamps \u001b[0m\u001b[32m(\u001b[0m\u001b[32mestimated, based on transcript length\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:**\\n\\n* **0:00-2:00:** Introduction – Announcing Qwen\u001b[0m\n", "\u001b[32mCode CLI and Qwen-3 Coder, highlighting the trend of AI-powered CLIs.\\n* **2:00-4:00:** Installation – Explaining \u001b[0m\n", "\u001b[32mthe prerequisites \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNodeJS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and providing the `npm` installation command.\\n* **4:00-6:00:** API Key Setup – Showing\u001b[0m\n", "\u001b[32mhow to obtain an API key from OpenRouter and configure environment variables.\\n* **6:00-8:00:** Basic Usage – \u001b[0m\n", "\u001b[32mDemonstrating simple interactions with the CLI and generating a basic tic-tac-toe game.\\n* **8:00-11:00:** Landing\u001b[0m\n", "\u001b[32mPage Example – Attempting to generate a more complex landing page, encountering some delays and issues. This \u001b[0m\n", "\u001b[32msection highlights potential limitations and the need for iterative refinement of prompts.\\n* **11:00-12:00:** Code\u001b[0m\n", "\u001b[32mAnalysis – Using Qwen Code to analyze an existing Next.js project and generate a summary of its architecture.\\n* \u001b[0m\n", "\u001b[32m**12:00-12:45:** Conclusion – Recap of the process, additional features mentioned \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlike MCP integration\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and call\u001b[0m\n", "\u001b[32mto action.\\n\\n\\n**Improvements for the Video:**\\n\\n* **Address the \"Better than Claude Code?\" question directly:** \u001b[0m\n", "\u001b[32mThe video title raises this question, but the comparison is implicit. 
A more direct comparison, perhaps with \u001b[0m\n", "\u001b[32mside-by-side examples, would strengthen the video.\\n* **Troubleshooting the Landing Page Issue:** The video \u001b[0m\n", "\u001b[32mencounters difficulties generating the landing page. Addressing this directly, perhaps by showing successful \u001b[0m\n", "\u001b[32mgeneration after prompt refinement or explaining potential reasons for the delay, would improve the tutorial\\'s \u001b[0m\n", "\u001b[32mcompleteness.\\n* **Time Stamps:** Adding timestamps to the video description would greatly improve user \u001b[0m\n", "\u001b[32mexperience.\\n* **Show more advanced features:** The video briefly mentions MCP integration. Demonstrating this or\u001b[0m\n", "\u001b[32mother advanced features would add value.\\n* **Visuals:** The video could benefit from more visual aids, such as \u001b[0m\n", "\u001b[32mscreen recordings showing the code being generated in real-time.\\n\\n\\nBy addressing these points, the video would \u001b[0m\n", "\u001b[32mbecome a more comprehensive and compelling tutorial on using Qwen Code CLI.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \n", "\u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \u001b[33mid\u001b[0m=\u001b[32m'run--8949b5c8-f8a8-49d3-a091-c0f356eceefc-0'\u001b[0m \n", "\u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m3493\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m784\u001b[0m, \u001b[32m'total_tokens'\u001b[0m: 
\u001b[1;36m4277\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \n", "\u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n", "\n", "Video: QWEN \u001b[1;36m3\u001b[0m CODER is Unleashed\u001b[33m...\u001b[0m better than KIMI K2 by Wes Roth\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mfeAc83Qlx4Q\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a detailed summary of the YouTube video about Qwen-3-Coder, an open-source AI \u001b[0m\n", "\u001b[32mcoding model.\\n\\n**Key Points:**\\n\\n* **Qwen-3-Coder\\'s Superiority:** The video highlights Qwen-3-Coder\\'s \u001b[0m\n", "\u001b[32mperformance surpassing other models like Kim K2, GPT-4.1, and even showing competitiveness with Claude Sonnet in \u001b[0m\n", "\u001b[32mvarious benchmarks. Its strength lies in agentic coding tasks, involving multi-turn interactions and tool \u001b[0m\n", "\u001b[32muse.\\n\\n* **Model Size and Architecture:** Qwen-3-Coder is a 480B parameter model, but utilizes a Mixture of \u001b[0m\n", "\u001b[32mExperts approach, keeping only 35B parameters active during inference. It\\'s described as an \"instruct\" model, \u001b[0m\n", "\u001b[32mmeaning it\\'s designed for helpful and friendly interactions.\\n\\n* **Reinforcement Learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m Training:** A \u001b[0m\n", "\u001b[32msignificant portion of the video focuses on the model\\'s RL training. Unlike other models focusing on \u001b[0m\n", "\u001b[32mcompetition-style coding problems, Qwen-3-Coder was trained on real-world coding tasks, leading to improved \u001b[0m\n", "\u001b[32mgeneralization across various tasks \u001b[0m\u001b[32m(\u001b[0m\u001b[32mcode generation, math problems, etc.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. 
The training involved a scalable \u001b[0m\n", "\u001b[32msystem running 20,000 parallel environments.\\n\\n* **Agentic Capabilities and Tools:** The model\\'s ability to \u001b[0m\n", "\u001b[32mperform agentic coding is emphasized. It seamlessly integrates with tools like Qwen Code \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma CLI tool forked from \u001b[0m\n", "\u001b[32mGemini Code\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and Claude Code, enabling complex interactions with the environment.\\n\\n* **Real-World \u001b[0m\n", "\u001b[32mDemonstrations:** The video showcases several impressive demonstrations of Qwen-3-Coder\\'s capabilities, including \u001b[0m\n", "\u001b[32mbuilding simulations, game creation \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma drone game, Minecraft clone\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and interactive visualizations.\\n\\n* \u001b[0m\n", "\u001b[32m**Open-Source Availability:** The model and associated tools are open-source, available on Hugging Face and \u001b[0m\n", "\u001b[32mGitHub, fostering community contributions and development.\\n\\n* **Future Implications:** The presenter expresses \u001b[0m\n", "\u001b[32msurprise at the rapid advancement of open-source AI, suggesting it\\'s closing the gap with leading proprietary \u001b[0m\n", "\u001b[32mmodels much faster than anticipated.\\n\\n\\n**Overall:**\\n\\nThe video presents a very positive and enthusiastic \u001b[0m\n", "\u001b[32mreview of Qwen-3-Coder, emphasizing its impressive performance, innovative training methodology, and open-source \u001b[0m\n", "\u001b[32mnature. It positions the model as a significant advancement in the field of large language models and agentic \u001b[0m\n", "\u001b[32mcoding. 
The presenter\\'s hands-on demonstrations further solidify this positive assessment.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \n", "\u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \u001b[33mid\u001b[0m=\u001b[32m'run--59f2073a-7d87-4ff0-9a2e-aa2dea885451-0'\u001b[0m \n", "\u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m2078\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m495\u001b[0m, \u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m2573\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \n", "\u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n",
       "\n",
       "Video: Qwen 3 Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! (Opensource) by WorldofAI\n",
       "URL: https://www.youtube.com/watch?v=8jBw0CVPZPM\n",
       "Transcript Summary: content='This is a detailed analysis of the YouTube video metadata and transcript.\\n\\n**Video \n",
       "Overview:**\\n\\nThe video is a review and demonstration of Qwen3-Coder, a new open-source agentic coding large \n",
       "language model (LLM) from Alibaba.  The presenter highlights its capabilities, including its large context window \n",
       "(256K tokens, scalable to 1M), its strong performance on coding benchmarks (outperforming some closed-source \n",
       "models), and its accompanying command-line interface (CLI), Qwen Code.  The video includes several live demos \n",
       "showcasing Qwen3-Coder\\'s ability to generate code for various tasks, such as creating animations, interactive \n",
       "dashboards, and neural network visualizers.  A key selling point is that the model and its tools are completely \n",
       "free and open-source.\\n\\n**Strengths of the Video:**\\n\\n* **Clear and Concise Introduction:** The video clearly \n",
       "introduces Qwen3-Coder and its key features early on.\\n* **Compelling Visuals:** The demos are engaging and \n",
       "effectively demonstrate the model\\'s capabilities.  The visual results of the code generation are shown, making the\n",
       "power of the LLM clear to viewers.\\n* **Benchmark Comparisons:** The presenter provides benchmark comparisons to \n",
       "other LLMs, establishing Qwen3-Coder\\'s competitive advantage.\\n* **Multiple Access Methods:** The video explains \n",
       "multiple ways to access and use Qwen3-Coder (chatbot, API, local installation).\\n* **Well-Structured Content:** The\n",
       "video is logically organized, moving from an introduction to demos and concluding with a call to action.\\n* \n",
       "**Effective Use of Keywords and Hashtags:** The metadata and description are well-optimized for search, using \n",
       "relevant keywords and hashtags.\\n* **Sponsor Integration:** The sponsor message is integrated naturally and \n",
       "doesn\\'t disrupt the flow of the video.\\n\\n\\n**Areas for Improvement:**\\n\\n* **Transcript Accuracy:** While the \n",
       "transcript is generally accurate, there are a few minor typos and inconsistencies (e.g., \"Coin 3\" instead of \"Qwen \n",
       "3\" in several places).  A thorough proofread would improve the quality.\\n* **Demo Complexity:** While the demos are\n",
       "impressive, some viewers might find them too advanced.  Including simpler demos could broaden the appeal to a wider\n",
       "audience.\\n* **Technical Depth:**  The video could benefit from a slightly deeper dive into the technical aspects \n",
       "of Qwen3-Coder\\'s architecture (Mixture of Experts) for a more technically inclined audience.  A brief explanation \n",
       "of how the 256K context window works would also be beneficial.\\n* **Call to Action Clarity:** While the video has \n",
       "multiple calls to action (subscribe, donate, join Discord), consolidating them might make them more \n",
       "impactful.\\n\\n\\n**Overall:**\\n\\nThe YouTube video is well-produced and effectively communicates the value \n",
       "proposition of Qwen3-Coder.  The engaging demos and clear explanations make it a valuable resource for developers \n",
       "and AI enthusiasts interested in open-source coding LLMs.  Addressing the minor areas for improvement would further\n",
       "enhance its impact and reach.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, \n",
       "'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} \n",
       "id='run--327884b3-8a87-417e-ba81-0e0e748223e1-0' usage_metadata={'input_tokens': 3267, 'output_tokens': 624, \n",
       "'total_tokens': 3891, 'input_token_details': {'cache_read': 0}}\n",
       "\n",
       "\n",
       "\n",
       "Video: Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial by Astro K Joseph\n",
       "URL: https://www.youtube.com/watch?v=kv57HqEb2j8\n",
       "Transcript Summary: content='Here\\'s a breakdown of the YouTube video transcript, organized for clarity and \n",
       "SEO:\\n\\n**Title:** Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial\\n\\n**Meta \n",
       "Description:** Learn how to set up and use the new Qwen Code CLI with the Qwen-3 Coder AI model. This full tutorial\n",
       "covers installation, API key configuration, and practical examples.  Is it better than Claude Code? Find \n",
       "out!\\n\\n**Tags:** Qwen Code CLI, Qwen-3 Coder, AI powered CLI, AI coding, code generation, tutorial, OpenRouter,  \n",
       "large language model, LLM, coding assistant, Gemini Code, Claude Code,  software development, programming, AI \n",
       "tools,  NodeJS, npm\\n\\n\\n**Video Summary:**\\n\\nThis tutorial demonstrates how to set up and use the newly released \n",
       "Qwen Code CLI, an AI-powered command-line interface, with the Qwen-3 Coder AI model.  The video compares it \n",
       "implicitly to other AI CLIs like Claude Code and Gemini Code.  The presenter walks viewers through the installation\n",
       "process (requiring NodeJS), API key configuration using OpenRouter, and provides practical examples of generating \n",
       "code for a tic-tac-toe game and a landing page.  The video also explores using Qwen Code to analyze existing \n",
       "codebases and understand their architecture.  While initially showcasing smooth code generation, the video later \n",
       "encounters some challenges with longer, more complex prompts, highlighting potential limitations.\\n\\n\\n**Key \n",
       "Sections & Timestamps (estimated, based on transcript length):**\\n\\n* **0:00-2:00:** Introduction – Announcing Qwen\n",
       "Code CLI and Qwen-3 Coder, highlighting the trend of AI-powered CLIs.\\n* **2:00-4:00:** Installation –  Explaining \n",
       "the prerequisites (NodeJS) and providing the `npm` installation command.\\n* **4:00-6:00:** API Key Setup –  Showing\n",
       "how to obtain an API key from OpenRouter and configure environment variables.\\n* **6:00-8:00:** Basic Usage –  \n",
       "Demonstrating simple interactions with the CLI and generating a basic tic-tac-toe game.\\n* **8:00-11:00:**  Landing\n",
       "Page Example – Attempting to generate a more complex landing page, encountering some delays and issues.  This \n",
       "section highlights potential limitations and the need for iterative refinement of prompts.\\n* **11:00-12:00:** Code\n",
       "Analysis –  Using Qwen Code to analyze an existing Next.js project and generate a summary of its architecture.\\n* \n",
       "**12:00-12:45:** Conclusion –  Recap of the process, additional features mentioned (like MCP integration), and call\n",
       "to action.\\n\\n\\n**Improvements for the Video:**\\n\\n* **Address the \"Better than Claude Code?\" question directly:** \n",
       "The video title raises this question, but the comparison is implicit.  A more direct comparison, perhaps with \n",
       "side-by-side examples, would strengthen the video.\\n* **Troubleshooting the Landing Page Issue:** The video \n",
       "encounters difficulties generating the landing page.  Addressing this directly, perhaps by showing successful \n",
       "generation after prompt refinement or explaining potential reasons for the delay, would improve the tutorial\\'s \n",
       "completeness.\\n* **Time Stamps:** Adding timestamps to the video description would greatly improve user \n",
       "experience.\\n* **Show more advanced features:**  The video briefly mentions MCP integration.  Demonstrating this or\n",
       "other advanced features would add value.\\n* **Visuals:**  The video could benefit from more visual aids, such as \n",
       "screen recordings showing the code being generated in real-time.\\n\\n\\nBy addressing these points, the video would \n",
       "become a more comprehensive and compelling tutorial on using Qwen Code CLI.' additional_kwargs={} \n",
       "response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', \n",
       "'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--8949b5c8-f8a8-49d3-a091-c0f356eceefc-0' \n",
       "usage_metadata={'input_tokens': 3493, 'output_tokens': 784, 'total_tokens': 4277, 'input_token_details': \n",
       "{'cache_read': 0}}\n",
       "\n",
       "\n",
       "\n",
       "Video: QWEN 3 CODER is Unleashed... better than KIMI K2 by Wes Roth\n",
       "URL: https://www.youtube.com/watch?v=feAc83Qlx4Q\n",
       "Transcript Summary: content='This is a detailed summary of the YouTube video about Qwen-3-Coder, an open-source AI \n",
       "coding model.\\n\\n**Key Points:**\\n\\n* **Qwen-3-Coder\\'s Superiority:** The video highlights Qwen-3-Coder\\'s \n",
       "performance surpassing other models like Kim K2, GPT-4.1, and even showing competitiveness with Claude Sonnet in \n",
       "various benchmarks.  Its strength lies in agentic coding tasks, involving multi-turn interactions and tool \n",
       "use.\\n\\n* **Model Size and Architecture:**  Qwen-3-Coder is a 480B parameter model, but utilizes a Mixture of \n",
       "Experts approach, keeping only 35B parameters active during inference.  It\\'s described as an \"instruct\" model, \n",
       "meaning it\\'s designed for helpful and friendly interactions.\\n\\n* **Reinforcement Learning (RL) Training:** A \n",
       "significant portion of the video focuses on the model\\'s RL training.  Unlike other models focusing on \n",
       "competition-style coding problems, Qwen-3-Coder was trained on real-world coding tasks, leading to improved \n",
       "generalization across various tasks (code generation, math problems, etc.).  The training involved a scalable \n",
       "system running 20,000 parallel environments.\\n\\n* **Agentic Capabilities and Tools:** The model\\'s ability to \n",
       "perform agentic coding is emphasized.  It seamlessly integrates with tools like Qwen Code (a CLI tool forked from \n",
       "Gemini Code) and Claude Code, enabling complex interactions with the environment.\\n\\n* **Real-World \n",
       "Demonstrations:** The video showcases several impressive demonstrations of Qwen-3-Coder\\'s capabilities, including \n",
       "building simulations, game creation (a drone game, Minecraft clone), and interactive visualizations.\\n\\n* \n",
       "**Open-Source Availability:**  The model and associated tools are open-source, available on Hugging Face and \n",
       "GitHub, fostering community contributions and development.\\n\\n* **Future Implications:** The presenter expresses \n",
       "surprise at the rapid advancement of open-source AI, suggesting it\\'s closing the gap with leading proprietary \n",
       "models much faster than anticipated.\\n\\n\\n**Overall:**\\n\\nThe video presents a very positive and enthusiastic \n",
       "review of Qwen-3-Coder, emphasizing its impressive performance, innovative training methodology, and open-source \n",
       "nature.  It positions the model as a significant advancement in the field of large language models and agentic \n",
       "coding.  The presenter\\'s hands-on demonstrations further solidify this positive assessment.' additional_kwargs={} \n",
       "response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', \n",
       "'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--59f2073a-7d87-4ff0-9a2e-aa2dea885451-0' \n",
       "usage_metadata={'input_tokens': 2078, 'output_tokens': 495, 'total_tokens': 2573, 'input_token_details': \n",
       "{'cache_read': 0}}\n",
       "\n",
       "\n",
       "
\n" ], "text/plain": [ "\n", "\n", "Video: Qwen \u001b[1;36m3\u001b[0m Coder: NEW Agentic Coding LLM! Insanely Powerful, Fast, & Free! \u001b[1m(\u001b[0mOpensource\u001b[1m)\u001b[0m by WorldofAI\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94m8jBw0CVPZPM\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a detailed analysis of the YouTube video metadata and transcript.\\n\\n**Video \u001b[0m\n", "\u001b[32mOverview:**\\n\\nThe video is a review and demonstration of Qwen3-Coder, a new open-source agentic coding large \u001b[0m\n", "\u001b[32mlanguage model \u001b[0m\u001b[32m(\u001b[0m\u001b[32mLLM\u001b[0m\u001b[32m)\u001b[0m\u001b[32m from Alibaba. The presenter highlights its capabilities, including its large context window \u001b[0m\n", "\u001b[32m(\u001b[0m\u001b[32m256K tokens, scalable to 1M\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, its strong performance on coding benchmarks \u001b[0m\u001b[32m(\u001b[0m\u001b[32moutperforming some closed-source \u001b[0m\n", "\u001b[32mmodels\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and its accompanying command-line interface \u001b[0m\u001b[32m(\u001b[0m\u001b[32mCLI\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, Qwen Code. The video includes several live demos \u001b[0m\n", "\u001b[32mshowcasing Qwen3-Coder\\'s ability to generate code for various tasks, such as creating animations, interactive \u001b[0m\n", "\u001b[32mdashboards, and neural network visualizers. A key selling point is that the model and its tools are completely \u001b[0m\n", "\u001b[32mfree and open-source.\\n\\n**Strengths of the Video:**\\n\\n* **Clear and Concise Introduction:** The video clearly \u001b[0m\n", "\u001b[32mintroduces Qwen3-Coder and its key features early on.\\n* **Compelling Visuals:** The demos are engaging and \u001b[0m\n", "\u001b[32meffectively demonstrate the model\\'s capabilities. 
The visual results of the code generation are shown, making the\u001b[0m\n", "\u001b[32mpower of the LLM clear to viewers.\\n* **Benchmark Comparisons:** The presenter provides benchmark comparisons to \u001b[0m\n", "\u001b[32mother LLMs, establishing Qwen3-Coder\\'s competitive advantage.\\n* **Multiple Access Methods:** The video explains \u001b[0m\n", "\u001b[32mmultiple ways to access and use Qwen3-Coder \u001b[0m\u001b[32m(\u001b[0m\u001b[32mchatbot, API, local installation\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n* **Well-Structured Content:** The\u001b[0m\n", "\u001b[32mvideo is logically organized, moving from an introduction to demos and concluding with a call to action.\\n* \u001b[0m\n", "\u001b[32m**Effective Use of Keywords and Hashtags:** The metadata and description are well-optimized for search, using \u001b[0m\n", "\u001b[32mrelevant keywords and hashtags.\\n* **Sponsor Integration:** The sponsor message is integrated naturally and \u001b[0m\n", "\u001b[32mdoesn\\'t disrupt the flow of the video.\\n\\n\\n**Areas for Improvement:**\\n\\n* **Transcript Accuracy:** While the \u001b[0m\n", "\u001b[32mtranscript is generally accurate, there are a few minor typos and inconsistencies \u001b[0m\u001b[32m(\u001b[0m\u001b[32me.g., \"Coin 3\" instead of \"Qwen \u001b[0m\n", "\u001b[32m3\" in several places\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. A thorough proofread would improve the quality.\\n* **Demo Complexity:** While the demos are\u001b[0m\n", "\u001b[32mimpressive, some viewers might find them too advanced. Including simpler demos could broaden the appeal to a wider\u001b[0m\n", "\u001b[32maudience.\\n* **Technical Depth:** The video could benefit from a slightly deeper dive into the technical aspects \u001b[0m\n", "\u001b[32mof Qwen3-Coder\\'s architecture \u001b[0m\u001b[32m(\u001b[0m\u001b[32mMixture of Experts\u001b[0m\u001b[32m)\u001b[0m\u001b[32m for a more technically inclined audience. 
A brief explanation \u001b[0m\n", "\u001b[32mof how the 256K context window works would also be beneficial.\\n* **Call to Action Clarity:** While the video has \u001b[0m\n", "\u001b[32mmultiple calls to action \u001b[0m\u001b[32m(\u001b[0m\u001b[32msubscribe, donate, join Discord\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, consolidating them might make them more \u001b[0m\n", "\u001b[32mimpactful.\\n\\n\\n**Overall:**\\n\\nThe YouTube video is well-produced and effectively communicates the value \u001b[0m\n", "\u001b[32mproposition of Qwen3-Coder. The engaging demos and clear explanations make it a valuable resource for developers \u001b[0m\n", "\u001b[32mand AI enthusiasts interested in open-source coding LLMs. Addressing the minor areas for improvement would further\u001b[0m\n", "\u001b[32menhance its impact and reach.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \n", "\u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mid\u001b[0m=\u001b[32m'run--327884b3-8a87-417e-ba81-0e0e748223e1-0'\u001b[0m \u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m3267\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m624\u001b[0m, \n", "\u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m3891\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n", "\n", "Video: Qwen Code CLI + Qwen-\u001b[1;36m3\u001b[0m Coder 🔥 | 
Better Than Claude Code? - Full Tutorial by Astro K Joseph\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mkv57HqEb2j8\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'Here\\'s a breakdown of the YouTube video transcript, organized for clarity and \u001b[0m\n", "\u001b[32mSEO:\\n\\n**Title:** Qwen Code CLI + Qwen-3 Coder 🔥 | Better Than Claude Code? - Full Tutorial\\n\\n**Meta \u001b[0m\n", "\u001b[32mDescription:** Learn how to set up and use the new Qwen Code CLI with the Qwen-3 Coder AI model. This full tutorial\u001b[0m\n", "\u001b[32mcovers installation, API key configuration, and practical examples. Is it better than Claude Code? Find \u001b[0m\n", "\u001b[32mout!\\n\\n**Tags:** Qwen Code CLI, Qwen-3 Coder, AI powered CLI, AI coding, code generation, tutorial, OpenRouter, \u001b[0m\n", "\u001b[32mlarge language model, LLM, coding assistant, Gemini Code, Claude Code, software development, programming, AI \u001b[0m\n", "\u001b[32mtools, NodeJS, npm\\n\\n\\n**Video Summary:**\\n\\nThis tutorial demonstrates how to set up and use the newly released \u001b[0m\n", "\u001b[32mQwen Code CLI, an AI-powered command-line interface, with the Qwen-3 Coder AI model. The video compares it \u001b[0m\n", "\u001b[32mimplicitly to other AI CLIs like Claude Code and Gemini Code. The presenter walks viewers through the installation\u001b[0m\n", "\u001b[32mprocess \u001b[0m\u001b[32m(\u001b[0m\u001b[32mrequiring NodeJS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, API key configuration using OpenRouter, and provides practical examples of generating \u001b[0m\n", "\u001b[32mcode for a tic-tac-toe game and a landing page. The video also explores using Qwen Code to analyze existing \u001b[0m\n", "\u001b[32mcodebases and understand their architecture. 
While initially showcasing smooth code generation, the video later \u001b[0m\n", "\u001b[32mencounters some challenges with longer, more complex prompts, highlighting potential limitations.\\n\\n\\n**Key \u001b[0m\n", "\u001b[32mSections & Timestamps \u001b[0m\u001b[32m(\u001b[0m\u001b[32mestimated, based on transcript length\u001b[0m\u001b[32m)\u001b[0m\u001b[32m:**\\n\\n* **0:00-2:00:** Introduction – Announcing Qwen\u001b[0m\n", "\u001b[32mCode CLI and Qwen-3 Coder, highlighting the trend of AI-powered CLIs.\\n* **2:00-4:00:** Installation – Explaining \u001b[0m\n", "\u001b[32mthe prerequisites \u001b[0m\u001b[32m(\u001b[0m\u001b[32mNodeJS\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and providing the `npm` installation command.\\n* **4:00-6:00:** API Key Setup – Showing\u001b[0m\n", "\u001b[32mhow to obtain an API key from OpenRouter and configure environment variables.\\n* **6:00-8:00:** Basic Usage – \u001b[0m\n", "\u001b[32mDemonstrating simple interactions with the CLI and generating a basic tic-tac-toe game.\\n* **8:00-11:00:** Landing\u001b[0m\n", "\u001b[32mPage Example – Attempting to generate a more complex landing page, encountering some delays and issues. This \u001b[0m\n", "\u001b[32msection highlights potential limitations and the need for iterative refinement of prompts.\\n* **11:00-12:00:** Code\u001b[0m\n", "\u001b[32mAnalysis – Using Qwen Code to analyze an existing Next.js project and generate a summary of its architecture.\\n* \u001b[0m\n", "\u001b[32m**12:00-12:45:** Conclusion – Recap of the process, additional features mentioned \u001b[0m\u001b[32m(\u001b[0m\u001b[32mlike MCP integration\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and call\u001b[0m\n", "\u001b[32mto action.\\n\\n\\n**Improvements for the Video:**\\n\\n* **Address the \"Better than Claude Code?\" question directly:** \u001b[0m\n", "\u001b[32mThe video title raises this question, but the comparison is implicit. 
A more direct comparison, perhaps with \u001b[0m\n", "\u001b[32mside-by-side examples, would strengthen the video.\\n* **Troubleshooting the Landing Page Issue:** The video \u001b[0m\n", "\u001b[32mencounters difficulties generating the landing page. Addressing this directly, perhaps by showing successful \u001b[0m\n", "\u001b[32mgeneration after prompt refinement or explaining potential reasons for the delay, would improve the tutorial\\'s \u001b[0m\n", "\u001b[32mcompleteness.\\n* **Time Stamps:** Adding timestamps to the video description would greatly improve user \u001b[0m\n", "\u001b[32mexperience.\\n* **Show more advanced features:** The video briefly mentions MCP integration. Demonstrating this or\u001b[0m\n", "\u001b[32mother advanced features would add value.\\n* **Visuals:** The video could benefit from more visual aids, such as \u001b[0m\n", "\u001b[32mscreen recordings showing the code being generated in real-time.\\n\\n\\nBy addressing these points, the video would \u001b[0m\n", "\u001b[32mbecome a more comprehensive and compelling tutorial on using Qwen Code CLI.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \n", "\u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \u001b[33mid\u001b[0m=\u001b[32m'run--8949b5c8-f8a8-49d3-a091-c0f356eceefc-0'\u001b[0m \n", "\u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m3493\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m784\u001b[0m, \u001b[32m'total_tokens'\u001b[0m: 
\u001b[1;36m4277\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \n", "\u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n", "\n", "Video: QWEN \u001b[1;36m3\u001b[0m CODER is Unleashed\u001b[33m...\u001b[0m better than KIMI K2 by Wes Roth\n", "URL: \u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mfeAc83Qlx4Q\u001b[0m\n", "Transcript Summary: \u001b[33mcontent\u001b[0m=\u001b[32m'This is a detailed summary of the YouTube video about Qwen-3-Coder, an open-source AI \u001b[0m\n", "\u001b[32mcoding model.\\n\\n**Key Points:**\\n\\n* **Qwen-3-Coder\\'s Superiority:** The video highlights Qwen-3-Coder\\'s \u001b[0m\n", "\u001b[32mperformance surpassing other models like Kim K2, GPT-4.1, and even showing competitiveness with Claude Sonnet in \u001b[0m\n", "\u001b[32mvarious benchmarks. Its strength lies in agentic coding tasks, involving multi-turn interactions and tool \u001b[0m\n", "\u001b[32muse.\\n\\n* **Model Size and Architecture:** Qwen-3-Coder is a 480B parameter model, but utilizes a Mixture of \u001b[0m\n", "\u001b[32mExperts approach, keeping only 35B parameters active during inference. It\\'s described as an \"instruct\" model, \u001b[0m\n", "\u001b[32mmeaning it\\'s designed for helpful and friendly interactions.\\n\\n* **Reinforcement Learning \u001b[0m\u001b[32m(\u001b[0m\u001b[32mRL\u001b[0m\u001b[32m)\u001b[0m\u001b[32m Training:** A \u001b[0m\n", "\u001b[32msignificant portion of the video focuses on the model\\'s RL training. Unlike other models focusing on \u001b[0m\n", "\u001b[32mcompetition-style coding problems, Qwen-3-Coder was trained on real-world coding tasks, leading to improved \u001b[0m\n", "\u001b[32mgeneralization across various tasks \u001b[0m\u001b[32m(\u001b[0m\u001b[32mcode generation, math problems, etc.\u001b[0m\u001b[32m)\u001b[0m\u001b[32m. 
The training involved a scalable \u001b[0m\n", "\u001b[32msystem running 20,000 parallel environments.\\n\\n* **Agentic Capabilities and Tools:** The model\\'s ability to \u001b[0m\n", "\u001b[32mperform agentic coding is emphasized. It seamlessly integrates with tools like Qwen Code \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma CLI tool forked from \u001b[0m\n", "\u001b[32mGemini Code\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and Claude Code, enabling complex interactions with the environment.\\n\\n* **Real-World \u001b[0m\n", "\u001b[32mDemonstrations:** The video showcases several impressive demonstrations of Qwen-3-Coder\\'s capabilities, including \u001b[0m\n", "\u001b[32mbuilding simulations, game creation \u001b[0m\u001b[32m(\u001b[0m\u001b[32ma drone game, Minecraft clone\u001b[0m\u001b[32m)\u001b[0m\u001b[32m, and interactive visualizations.\\n\\n* \u001b[0m\n", "\u001b[32m**Open-Source Availability:** The model and associated tools are open-source, available on Hugging Face and \u001b[0m\n", "\u001b[32mGitHub, fostering community contributions and development.\\n\\n* **Future Implications:** The presenter expresses \u001b[0m\n", "\u001b[32msurprise at the rapid advancement of open-source AI, suggesting it\\'s closing the gap with leading proprietary \u001b[0m\n", "\u001b[32mmodels much faster than anticipated.\\n\\n\\n**Overall:**\\n\\nThe video presents a very positive and enthusiastic \u001b[0m\n", "\u001b[32mreview of Qwen-3-Coder, emphasizing its impressive performance, innovative training methodology, and open-source \u001b[0m\n", "\u001b[32mnature. It positions the model as a significant advancement in the field of large language models and agentic \u001b[0m\n", "\u001b[32mcoding. 
The presenter\\'s hands-on demonstrations further solidify this positive assessment.'\u001b[0m \u001b[33madditional_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \n", "\u001b[33mresponse_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'prompt_feedback'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'block_reason'\u001b[0m: \u001b[1;36m0\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'finish_reason'\u001b[0m: \u001b[32m'STOP'\u001b[0m, \n", "\u001b[32m'model_name'\u001b[0m: \u001b[32m'gemini-1.5-flash'\u001b[0m, \u001b[32m'safety_ratings'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m \u001b[33mid\u001b[0m=\u001b[32m'run--59f2073a-7d87-4ff0-9a2e-aa2dea885451-0'\u001b[0m \n", "\u001b[33musage_metadata\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'input_tokens'\u001b[0m: \u001b[1;36m2078\u001b[0m, \u001b[32m'output_tokens'\u001b[0m: \u001b[1;36m495\u001b[0m, \u001b[32m'total_tokens'\u001b[0m: \u001b[1;36m2573\u001b[0m, \u001b[32m'input_token_details'\u001b[0m: \n", "\u001b[1m{\u001b[0m\u001b[32m'cache_read'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "summary = youtube_transcript_response(\"Qwen 3 coder reviews\", \n", " 'summarize in bullets and themes', llmgoogle)\n", "print(summary)" ] }, { "cell_type": "markdown", "id": "cd9d555f", "metadata": {}, "source": [ "# 11. Git Repos Exploration" ] }, { "cell_type": "markdown", "id": "2aaacac6", "metadata": {}, "source": [ "You can now explore structure and get the content of files/folders in github repositories, you can even plug in local code folders" ] }, { "cell_type": "code", "execution_count": 20, "id": "c7580fe6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Directory structure:\n",
       "└── spthole-coexistai/\n",
       "    ├── README.md\n",
       "    ├── __init__.py\n",
       "    ├── app.py\n",
       "    ├── coexist_tutorial.ipynb\n",
       "    ├── LICENSE\n",
       "    ├── model_config.py\n",
       "    ├── quick_setup.sh\n",
       "    ├── README_MCP.md\n",
       "    ├── requirements.txt\n",
       "    ├── system_prompt.py\n",
       "    ├── output/\n",
       "    │   └── map_with_route_and_pois.html\n",
       "    ├── searxng/\n",
       "    │   ├── settings.yml\n",
       "    │   ├── settings.yml.new\n",
       "    │   ├── uwsgi.ini\n",
       "    │   └── uwsgi.ini.new\n",
       "    └── utils/\n",
       "        ├── __init__.py\n",
       "        ├── answer_generation.py\n",
       "        ├── config.py\n",
       "        ├── git_utils.py\n",
       "        ├── map.py\n",
       "        ├── process_content.py\n",
       "        ├── reddit_utils.py\n",
       "        ├── utils.py\n",
       "        └── websearch_utils.py\n",
       "\n",
       "
\n" ], "text/plain": [ "Directory structure:\n", "└── spthole-coexistai/\n", " ├── README.md\n", " ├── __init__.py\n", " ├── app.py\n", " ├── coexist_tutorial.ipynb\n", " ├── LICENSE\n", " ├── model_config.py\n", " ├── quick_setup.sh\n", " ├── README_MCP.md\n", " ├── requirements.txt\n", " ├── system_prompt.py\n", " ├── output/\n", " │ └── map_with_route_and_pois.html\n", " ├── searxng/\n", " │ ├── settings.yml\n", " │ ├── settings.yml.new\n", " │ ├── uwsgi.ini\n", " │ └── uwsgi.ini.new\n", " └── utils/\n", " ├── __init__.py\n", " ├── answer_generation.py\n", " ├── config.py\n", " ├── git_utils.py\n", " ├── map.py\n", " ├── process_content.py\n", " ├── reddit_utils.py\n", " ├── utils.py\n", " └── websearch_utils.py\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from utils.git_utils import *\n", "\n", "tree = await git_tree_search(\"https://github.com/SPThole/CoexistAI/\")\n", "print(tree)" ] }, { "cell_type": "code", "execution_count": 21, "id": "cdfb1d1c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Directory structure:\n",
       "└── CoexistAI/\n",
       "    ├── README.md\n",
       "    ├── __init__.py\n",
       "    ├── app.py\n",
       "    ├── coexist_tutorial.ipynb\n",
       "    ├── LICENSE\n",
       "    ├── lmstudio_chats.ipynb\n",
       "    ├── model_config.py\n",
       "    ├── quick_setup.sh\n",
       "    ├── README_MCP.md\n",
       "    ├── requirements.txt\n",
       "    ├── search_context.txt\n",
       "    ├── system_prompt.py\n",
       "    ├── output/\n",
       "    │   └── map_with_route_and_pois.html\n",
       "    ├── searxng/\n",
       "    │   ├── settings.yml\n",
       "    │   ├── settings.yml.new\n",
       "    │   ├── uwsgi.ini\n",
       "    │   └── uwsgi.ini.new\n",
       "    └── utils/\n",
       "        ├── __init__.py\n",
       "        ├── answer_generation.py\n",
       "        ├── config.py\n",
       "        ├── git_utils.py\n",
       "        ├── map.py\n",
       "        ├── process_content.py\n",
       "        ├── reddit_utils.py\n",
       "        ├── utils.py\n",
       "        └── websearch_utils.py\n",
       "\n",
       "
\n" ], "text/plain": [ "Directory structure:\n", "└── CoexistAI/\n", " ├── README.md\n", " ├── __init__.py\n", " ├── app.py\n", " ├── coexist_tutorial.ipynb\n", " ├── LICENSE\n", " ├── lmstudio_chats.ipynb\n", " ├── model_config.py\n", " ├── quick_setup.sh\n", " ├── README_MCP.md\n", " ├── requirements.txt\n", " ├── search_context.txt\n", " ├── system_prompt.py\n", " ├── output/\n", " │ └── map_with_route_and_pois.html\n", " ├── searxng/\n", " │ ├── settings.yml\n", " │ ├── settings.yml.new\n", " │ ├── uwsgi.ini\n", " │ └── uwsgi.ini.new\n", " └── utils/\n", " ├── __init__.py\n", " ├── answer_generation.py\n", " ├── config.py\n", " ├── git_utils.py\n", " ├── map.py\n", " ├── process_content.py\n", " ├── reddit_utils.py\n", " ├── utils.py\n", " └── websearch_utils.py\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#local repo\n", "from utils.git_utils import *\n", "\n", "tree = await git_tree_search(\".\")\n", "print(tree)" ] }, { "cell_type": "code", "execution_count": 22, "id": "ccbc2c9b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://github.com/SPThole/CoexistAI/blob/main//README_MCP.md\n" ] }, { "data": { "text/html": [ "
================================================\n",
       "FILE: README_MCP.md\n",
       "================================================\n",
       "# CoexistAI v0.0.2 \n",
       "\n",
       "<p align=\"center\">\n",
       "  <img src=\"artifacts/v002mcplogo.jpeg\" alt=\"CoexistAI MCP Logo\" width=\"200\"/>\n",
       "</p>\n",
       "\n",
       "## 🚀 What's New in v2\n",
       "\n",
       "- **Direct location search:** You can now search for any place, not just find routes!\n",
       "- **Advanced Reddit search:** Use your own phrases to search across reddit; results ranked better with BM25 for \n",
       "sharper discovery.\n",
       "- **YouTube power-up:** Search and summarize YouTube using your own search phrases or video URLs and even add a \n",
       "prompt for custom responses.\n",
        "- **Explore your folders/files**: Explore local folders and files with extended support for diverse files \n",
       "including (vision integrated)'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.tiff', '.svg', etc, more to come..\n",
       "- **Sharper web search:** More focused and actionable results than ever before.\n",
       "- **MCP support everywhere:** Now fully connect coexistai to LM Studio and other MCP hosts—seamless integration! \n",
       "[See Guide](README_MCP.md)\n",
        "- **GitHub & local repo explorer:** Explore and ask questions about codebases - works with both GitHub and local repos!\n",
       "\n",
       "---\n",
       "\n",
       "## 🛠 Quick Start\n",
       "\n",
       "### Method (Less flexible but faster):\n",
       "\n",
       "1. **Clone the repository:**\n",
       "   ```sh\n",
       "   git clone https://github.com/SPThole/CoexistAI.git coexistai\n",
       "   cd coexistai\n",
       "   ```\n",
       "\n",
       "\n",
       "2. **Configure your model and embedding settings:**\n",
       "   \n",
       "   - Edit `model_config.py` to set your preferred LLM and embedding model.\n",
       "   - Add LLM and Embedder API Key (for google mode both would be same)\n",
       "   - Example (for full local mode):\n",
       "    ```py\n",
       "    model_config = {\n",
       "      # Name of the LLM model to use. For local models, use the model name served by your local server.\n",
       "      \"llm_model_name\": \"google/gemma-3-12b\",\n",
       "\n",
       "      # LLM provider type: choose from 'google', 'local', 'groq', or 'openai' or 'others' \n",
       "      # in case of 'others' (base url needs to be updated in openai_compatible given below accordingly).\n",
       "      # Make sure to update the api_key variable above to match the provider.\n",
       "      \"llm_type\": \"local\",\n",
       "\n",
       "      # List of tools or plugins to use with the LLM, if any. Set to None if not used.\n",
       "      \"llm_tools\": None,\n",
       "\n",
       "      # Additional keyword arguments for LLM initialization.\n",
       "      \"llm_kwargs\": {\n",
       "        \"temperature\": 0.1,  # Sampling temperature for generation.\n",
       "        \"max_tokens\": None,  # Maximum number of tokens to generate (None for default).\n",
       "        \"timeout\": None,     # Timeout for API requests (None for default).\n",
       "        \"max_retries\": 2,    # Maximum number of retries for failed requests.\n",
       "        \"api_key\": llm_api_key,  # API key for authentication.\n",
       "      },\n",
       "\n",
       "      # Name of the embedding model to use.\n",
       "      # For Google, use their embedding model names. For local/HuggingFace, use the model path or name.\n",
       "      \"embedding_model_name\": \"nomic-ai/nomic-embed-text-v1\",\n",
       "\n",
       "      \"embed_kwargs\":{}, #additional kwargs for embedding model initialization\n",
       "\n",
       "      # Embedding backend: 'google' for Google, 'infinity_emb' for local/HuggingFace models.\n",
       "      \"embed_mode\": \"infinity_emb\",\n",
       "\n",
       "      # Name of the cross-encoder model for reranking, typically a HuggingFace model.\n",
       "      \"cross_encoder_name\": \"BAAI/bge-reranker-base\"\n",
       "    }\n",
       "    ```\n",
       "   - See the file for all available options and defaults.\n",
       "\n",
       "3. **Run the setup script:**\n",
       "   - For macOS or Linux with zsh:\n",
       "     ```sh\n",
       "     zsh quick_setup.sh\n",
       "     ```\n",
       "   - For Linux with bash:\n",
       "     ```sh\n",
       "     bash quick_setup.sh\n",
       "     ```\n",
       "\n",
       "   > The script will:\n",
       "   > - Pull the SearxNG Docker image\n",
       "   > - Create and activate a Python virtual environment\n",
       "   > - **USER ACTION NEEDED** Set your `GOOGLE_API_KEY` (edit the script to use your real key). [Obtain your API \n",
       "key (Currently Gemini, OpenAI and ollama is supported)](https://ai.google.dev/gemini-api/docs/api-key) from your \n",
       "preferred LLM provider. (Only needed when google mode is set, else set in model_config.py)\n",
       "   > - Start the SearxNG Docker container\n",
       "   > - Install Python dependencies\n",
       "   > - Start the FastAPI server\n",
       "\n",
       "4. **That’s it!**  \n",
       "   The FastAPI and MCP server will start automatically and you’re ready to go.\n",
       "\n",
       "**Note:**  \n",
       "- Make sure Docker, Python 3, and pip are installed on your system.  \n",
       "- Edit quick_setup.sh to set your real `GOOGLE_API_KEY` before running (needed if using google models) \n",
       "- Windows users can use [WSL](https://docs.microsoft.com/en-us/windows/wsl/) or Git Bash to run the script, or \n",
       "follow manual setup steps.\n",
       "\n",
       "## 🔍 What Can You Do? (API Highlights & Examples)\n",
       "\n",
       "**Remove comments after // before pasting**\n",
       "Swagger UI: http://127.0.0.1:8000/docs  if you haven't changed the host and port\n",
       "\n",
       "### 1. Web Search\n",
       "**Search the web, summarize, and get actionable answers—automatically.**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/web-search`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "{\n",
       "  \"query\": \"Top news of today worldwide\", // Query you want to ask; if you provide a URL and ask to summarise, it \n",
       "will summarize the full page.\n",
       "  \"rerank\": true, // Set to true for better result ranking.\n",
       "  \"num_results\": 2, // Number of top results per subquery to explore (higher values = more tokens, slower/more \n",
       "costly).\n",
       "  \"local_mode\": false, // Set to true to explore local documents (currently, only PDF supported).\n",
       "  \"split\": true, // Set to false if you want full pages as input to LLMs; false may cause slower/more costly \n",
       "response.\n",
       "  \"document_paths\": [] // If local_mode is true, add a list of lists of document paths, e.g., \n",
       "[[\"documents/1706.03762v7.pdf\"]]\n",
       "}\n",
       "\n",
       "```\n",
       "\n",
        "or QA/summarise local documents \n",
       "\n",
       "```json\n",
       "{\n",
       "  \"query\": \"Summarise this research paper\",\n",
       "  \"rerank\": true,\n",
       "  \"num_results\": 3,\n",
       "  \"local_mode\": true,\n",
       "  \"split\": true,\n",
       "  \"document_paths\": [[\"documents/1706.03762v7.pdf\"]] // Must be a list of lists.\n",
       "}\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "### 2. Summarize Any Web Page\n",
       "**Summarize any article or research paper by URL.**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/web-summarize`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"query\": \"Write a short blog on the model\", // Instruction or question for the fetched page content.\n",
       "  \"url\": \"https://huggingface.co/unsloth/Qwen3-8B-GGUF\", // Webpage to fetch content from.\n",
       "  \"local_mode\": false // Set to true if summarizing a local document.\n",
       "}\n",
       "\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "### 3. YouTube Search \n",
       "**Search YouTube (supports prompts and batch).**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/youtube-search`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"query\": \"switzerland itinerary\", // Query to search on YouTube; if a URL is provided, it fetches content from \n",
       "that URL. url should be in format: https://www.youtube.com/watch?v=videoID\n",
       "  \"prompt\": \"I want to plan my Switzerland trip\", // Instruction or question for using the fetched content.\n",
       "  \"n\": 2 // Number of top search results to summarize (only works if query is not a URL).\n",
       "}\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "### 4. Reddit Deep Dive\n",
       "**Custom Reddit search, sort, filter, and get top comments.**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/reddit-search`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"subreddit\": \"\", // Subreddit to fetch content from (use if url_type is not 'search').\n",
       "  \"url_type\": \"search\", // 'search' for phrase search; \"url\" for url, otherwise, use 'hot', 'top', 'best', etc.\n",
       "  \"n\": 3, // Number of posts to fetch.\n",
       "  \"k\": 1, // Number of top comments per post.\n",
       "  \"custom_url\": \"\", // Use if you already have a specific Reddit URL.\n",
       "  \"time_filter\": \"all\", // Time range: 'all', 'today', 'week', 'month', 'year'.\n",
       "  \"search_query\": \"gemma 3n reviews\", // Search phrase (useful if url_type is 'search').\n",
       "  \"sort_type\": \"relevance\" // 'top', 'hot', 'new', 'relevance' — controls how results are sorted.\n",
       "}\n",
       "\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "### 5. Map & Location/Route Search\n",
       "**Find places, routes, and nearby points of interest.**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/map-search`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"start_location\": \"MG Road, Bangalore\", // Starting point.\n",
       "  \"end_location\": \"Lalbagh, Bangalore\", // Destination.\n",
       "  \"pois_radius\": 500, // Search radius in meters for amenities.\n",
       "  \"amenities\": \"restaurant|cafe|bar|hotel\", // Amenities to search near start or end location.\n",
       "  \"limit\": 3, // Maximum number of results if address not found exactly.\n",
       "  \"task\": \"route_and_pois\" // Use 'location_only' for address/coordinates only, or 'route_and_pois' for routes and \n",
       "POIs.\n",
       "}\n",
       "```\n",
       "\n",
       "OR search for any single location (open street map has api rate limit)\n",
       "\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"start_location\": \"MG Road, Bangalore\",\n",
       "  \"end_location\": \"Lalbagh, Bangalore\",\n",
       "  \"pois_radius\": 500,\n",
       "  \"amenities\": \"restaurant|cafe|bar|hotel\",\n",
       "  \"limit\": 3,\n",
       "  \"task\": \"location_only\"\n",
       "}\n",
       "\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "### 6. GitHub & Local Repo Directory Tree\n",
       "**Get the directory structure of any GitHub or local repo.**\n",
       "\n",
       "**Endpoint:**  \n",
       "POST `/git-tree-search`\n",
       "\n",
       "**Request Example:**\n",
       "```json\n",
       "\n",
       "{\n",
       "  \"repobaseurl\": \"https://github.com/SPThole/CoexistAI/\" // Base URL of the repository to explore.\n",
       "}\n",
       "```\n",
       "or for local repo:\n",
       "```json\n",
       "\n",
       "{\n",
       "\"repobaseurl\": \"/home/user/projects/myrepo\"\n",
       "}\n",
       "\n",
       "```\n",
       "\n",
       "---\n",
       "\n",
       "## 🧑‍💻 Integrate coexistai as an MCP Server (LM Studio, Cursor, etc.)\n",
       "\n",
        "Starting LM Studio 0.3.17, LM Studio acts as a Model Context Protocol (MCP) Host. This means you can connect MCP \n",
       "servers to the app and make them available to your models. \n",
       "\n",
       "\n",
       "You can now run coexistai as an MCP server—**plug it into LM Studio** or any other MCP-compatible tool!\n",
       "\n",
       "### How to Integrate with LM Studio\n",
       "\n",
       "1. Download lm studio> 0.3.17 (https://lmstudio.ai/docs/app)\n",
       "\n",
       "2. Find the [MCP guide](https://lmstudio.ai/docs/app/plugins/mcp)\n",
       "\n",
       "<p align=\"center\">\n",
       "  <img src=\"artifacts/lmstudio.png\" alt=\"CoexistAI MCP Logo\" width=\"600\"/>\n",
       "</p>\n",
       "\n",
       "1. **Edit your `mcp.json` in LM Studio:**\n",
       "    - Go to the Program tab → `Install > Edit mcp.json`\n",
       "    - Add coexistai as a server. Example:\n",
       "    ```json\n",
       "    {\n",
       "        \"mcpServers\": {\n",
       "             \"coexistai\": {\n",
       "                \"url\": \"http://127.0.0.1:8000/mcp\"\n",
       "        }\n",
       "    }\n",
       "    }\n",
       "    ```\n",
       "    - Replace with your actual server address and token (if needed).\n",
       "\n",
       "2. **Or use an \"Add to LM Studio\" button** (if provided on coexistai website).\n",
       "\n",
       "**Security note:** Only use MCP servers you trust—servers can access files/network.\n",
       "\n",
       "3. Use (system_prompt.py) as context (system prompt) in lmstudio\n",
       "---\n",
       "\n",
       "## 🏆 Best Local Model\n",
       "\n",
       "For fastest, highest-quality local LLM results, I have personally liked following:  \n",
       "**unsloth/Qwen3-8B-GGUF**\n",
       "\n",
       "---\n",
       "\n",
       "\n",
       "
\n" ], "text/plain": [ "================================================\n", "FILE: README_MCP.md\n", "================================================\n", "# CoexistAI v0.\u001b[1;36m0.2\u001b[0m \n", "\n", "\u001b[1m<\u001b[0m\u001b[1;95mp\u001b[0m\u001b[39m \u001b[0m\u001b[33malign\u001b[0m\u001b[39m=\u001b[0m\u001b[32m\"center\"\u001b[0m\u001b[39m>\u001b[0m\n", "\u001b[39m \u001b[0m\n", "\u001b[39m<\u001b[0m\u001b[35m/\u001b[0m\u001b[95mp\u001b[0m\u001b[39m>\u001b[0m\n", "\n", "\u001b[39m## 🚀 What's New in v2\u001b[0m\n", "\n", "\u001b[39m- **Direct location search:** You can now search for any place, not just find routes!\u001b[0m\n", "\u001b[39m- **Advanced Reddit search:** Use your own phrases to search across reddit; results ranked better with BM25 for \u001b[0m\n", "\u001b[39msharper discovery.\u001b[0m\n", "\u001b[39m- **YouTube power-up:** Search and summarize YouTube using your own search phrases or video URLs and even add a \u001b[0m\n", "\u001b[39mprompt for custom responses.\u001b[0m\n", "\u001b[39m- **Explore to your folders/files**: Explore local folders and files with extended support to diverse files \u001b[0m\n", "\u001b[39mincluding \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mvision integrated\u001b[0m\u001b[1;39m)\u001b[0m\u001b[32m'.png'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.jpg'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.jpeg'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.gif'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.bmp'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.webp'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.tiff'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'.svg'\u001b[0m\u001b[39m, etc, more to come..\u001b[0m\n", "\u001b[39m- **Sharper web search:** More focused and actionable results than ever before.\u001b[0m\n", "\u001b[39m- **MCP support everywhere:** Now fully connect coexistai to LM Studio and other MCP hosts—seamless integration! 
\u001b[0m\n", "\u001b[1;39m[\u001b[0m\u001b[39mSee Guide\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mREADME_MCP.md\u001b[0m\u001b[1;39m)\u001b[0m\n", "\u001b[39m- **GitHub & local repo explorer:** Explore ask questions about codebases - works with both GitHub and local repos!\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m## 🛠 Quick Start\u001b[0m\n", "\n", "\u001b[39m### Method \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mLess flexible but faster\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m:\u001b[0m\n", "\n", "\u001b[1;36m1\u001b[0m\u001b[39m. **Clone the repository:**\u001b[0m\n", "\u001b[39m ```sh\u001b[0m\n", "\u001b[39m git clone \u001b[0m\u001b[4;94mhttps://github.com/SPThole/CoexistAI.git\u001b[0m\u001b[39m coexistai\u001b[0m\n", "\u001b[39m cd coexistai\u001b[0m\n", "\u001b[39m ```\u001b[0m\n", "\n", "\n", "\u001b[1;36m2\u001b[0m\u001b[39m. **Configure your model and embedding settings:**\u001b[0m\n", "\u001b[39m \u001b[0m\n", "\u001b[39m - Edit `model_config.py` to set your preferred LLM and embedding model.\u001b[0m\n", "\u001b[39m - Add LLM and Embedder API Key \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mfor google mode both would be same\u001b[0m\u001b[1;39m)\u001b[0m\n", "\u001b[39m - Example \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mfor full local mode\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m:\u001b[0m\n", "\u001b[39m ```py\u001b[0m\n", "\u001b[39m model_config = \u001b[0m\u001b[1;39m{\u001b[0m\n", "\u001b[39m # Name of the LLM model to use. 
For local models, use the model name served by your local server.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"llm_model_name\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"google/gemma-3-12b\"\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m # LLM provider type: choose from \u001b[0m\u001b[32m'google'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'local'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'groq'\u001b[0m\u001b[39m, or \u001b[0m\u001b[32m'openai'\u001b[0m\u001b[39m or \u001b[0m\u001b[32m'others'\u001b[0m\u001b[39m \u001b[0m\n", "\u001b[39m # in case of \u001b[0m\u001b[32m'others'\u001b[0m\u001b[39m \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mbase url needs to be updated in openai_compatible given below accordingly\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m # Make sure to update the api_key variable above to match the provider.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"llm_type\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"local\"\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m # List of tools or plugins to use with the LLM, if any. 
Set to \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m if not used.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"llm_tools\"\u001b[0m\u001b[39m: \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m # Additional keyword arguments for LLM initialization.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"llm_kwargs\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"temperature\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m0.1\u001b[0m\u001b[39m, # Sampling temperature for generation.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"max_tokens\"\u001b[0m\u001b[39m: \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m, # Maximum number of tokens to generate \u001b[0m\u001b[1;39m(\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m for default\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"timeout\"\u001b[0m\u001b[39m: \u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m, # Timeout for API requests \u001b[0m\u001b[1;39m(\u001b[0m\u001b[3;35mNone\u001b[0m\u001b[39m for default\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"max_retries\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m2\u001b[0m\u001b[39m, # Maximum number of retries for failed requests.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"api_key\"\u001b[0m\u001b[39m: llm_api_key, # API key for authentication.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m # Name of the embedding model to use.\u001b[0m\n", "\u001b[39m # For Google, use their embedding model names. 
For local/HuggingFace, use the model path or name.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"embedding_model_name\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"nomic-ai/nomic-embed-text-v1\"\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m \u001b[0m\u001b[32m\"embed_kwargs\"\u001b[0m\u001b[39m:\u001b[0m\u001b[1;39m{\u001b[0m\u001b[1;39m}\u001b[0m\u001b[39m, #additional kwargs for embedding model initialization\u001b[0m\n", "\n", "\u001b[39m # Embedding backend: \u001b[0m\u001b[32m'google'\u001b[0m\u001b[39m for Google, \u001b[0m\u001b[32m'infinity_emb'\u001b[0m\u001b[39m for local/HuggingFace models.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"embed_mode\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"infinity_emb\"\u001b[0m\u001b[39m,\u001b[0m\n", "\n", "\u001b[39m # Name of the cross-encoder model for reranking, typically a HuggingFace model.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"cross_encoder_name\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"BAAI/bge-reranker-base\"\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[1;39m}\u001b[0m\n", "\u001b[39m ```\u001b[0m\n", "\u001b[39m - See the file for all available options and defaults.\u001b[0m\n", "\n", "\u001b[1;36m3\u001b[0m\u001b[39m. **Run the setup script:**\u001b[0m\n", "\u001b[39m - For macOS or Linux with zsh:\u001b[0m\n", "\u001b[39m ```sh\u001b[0m\n", "\u001b[39m zsh quick_setup.sh\u001b[0m\n", "\u001b[39m ```\u001b[0m\n", "\u001b[39m - For Linux with bash:\u001b[0m\n", "\u001b[39m ```sh\u001b[0m\n", "\u001b[39m bash quick_setup.sh\u001b[0m\n", "\u001b[39m ```\u001b[0m\n", "\n", "\u001b[39m > The script will:\u001b[0m\n", "\u001b[39m > - Pull the SearxNG Docker image\u001b[0m\n", "\u001b[39m > - Create and activate a Python virtual environment\u001b[0m\n", "\u001b[39m > - **USER ACTION NEEDED** Set your `GOOGLE_API_KEY` \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39medit the script to use your real key\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m. 
\u001b[0m\u001b[1;39m[\u001b[0m\u001b[39mObtain your API \u001b[0m\n", "\u001b[39mkey \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mCurrently Gemini, OpenAI and ollama is supported\u001b[0m\u001b[1;39m)\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m(\u001b[0m\u001b[4;94mhttps://ai.google.dev/gemini-api/docs/api-key\u001b[0m\u001b[4;94m)\u001b[0m\u001b[39m from your \u001b[0m\n", "\u001b[39mpreferred LLM provider. \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mOnly needed when google mode is set, else set in model_config.py\u001b[0m\u001b[1;39m)\u001b[0m\n", "\u001b[39m > - Start the SearxNG Docker container\u001b[0m\n", "\u001b[39m > - Install Python dependencies\u001b[0m\n", "\u001b[39m > - Start the FastAPI server\u001b[0m\n", "\n", "\u001b[1;36m4\u001b[0m\u001b[39m. **That’s it!** \u001b[0m\n", "\u001b[39m The FastAPI and MCP server will start automatically and you’re ready to go.\u001b[0m\n", "\n", "\u001b[39m**Note:** \u001b[0m\n", "\u001b[39m- Make sure Docker, Python \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m, and pip are installed on your system. \u001b[0m\n", "\u001b[39m- Edit quick_setup.sh to set your real `GOOGLE_API_KEY` before running \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mneeded if using google models\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m \u001b[0m\n", "\u001b[39m- Windows users can use \u001b[0m\u001b[1;39m[\u001b[0m\u001b[39mWSL\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m(\u001b[0m\u001b[4;94mhttps://docs.microsoft.com/en-us/windows/wsl/\u001b[0m\u001b[4;94m)\u001b[0m\u001b[39m or Git Bash to run the script, or \u001b[0m\n", "\u001b[39mfollow manual setup steps.\u001b[0m\n", "\n", "\u001b[39m## 🔍 What Can You Do? 
\u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mAPI Highlights & Examples\u001b[0m\u001b[1;39m)\u001b[0m\n", "\n", "\u001b[39m**Remove comments after \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m before pasting**\u001b[0m\n", "\u001b[39mSwagger UI: \u001b[0m\u001b[4;94mhttp://127.0.0.1:8000/docs\u001b[0m\u001b[39m if you haven't changed the host and port\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m1\u001b[0m\u001b[39m. Web Search\u001b[0m\n", "\u001b[39m**Search the web, summarize, and get actionable answers—automatically.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95mweb-search\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"query\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"Top news of today worldwide\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Query you want to ask; if you provide a URL and ask to summarise, it \u001b[0m\n", "\u001b[39mwill summarize the full page.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"rerank\"\u001b[0m\u001b[39m: true, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Set to true for better result ranking.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"num_results\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m2\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Number of top results per subquery to explore \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mhigher values = more tokens, slower/more \u001b[0m\n", "\u001b[39mcostly\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"local_mode\"\u001b[0m\u001b[39m: false, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Set to true to explore local documents \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mcurrently, only PDF 
supported\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"split\"\u001b[0m\u001b[39m: true, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Set to false if you want full pages as input to LLMs; false may cause slower/more costly \u001b[0m\n", "\u001b[39mresponse.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"document_paths\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m If local_mode is true, add a list of lists of document paths, e.g., \u001b[0m\n", "\u001b[1;39m[\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m\"documents/1706.03762v7.pdf\"\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m]\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39mor QA/sumamrise local documents \u001b[0m\n", "\n", "\u001b[39m```json\u001b[0m\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"query\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"Summarise this research paper\"\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"rerank\"\u001b[0m\u001b[39m: true,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"num_results\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"local_mode\"\u001b[0m\u001b[39m: true,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"split\"\u001b[0m\u001b[39m: true,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"document_paths\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;39m[\u001b[0m\u001b[1;39m[\u001b[0m\u001b[32m\"documents/1706.03762v7.pdf\"\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m]\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Must be a list of lists.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m2\u001b[0m\u001b[39m. 
Summarize Any Web Page\u001b[0m\n", "\u001b[39m**Summarize any article or research paper by URL.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95mweb-summarize\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"query\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"Write a short blog on the model\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Instruction or question for the fetched page content.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"url\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"https://huggingface.co/unsloth/Qwen3-8B-GGUF\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Webpage to fetch content from.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"local_mode\"\u001b[0m\u001b[39m: false \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Set to true if summarizing a local document.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m. YouTube Search \u001b[0m\n", "\u001b[39m**Search YouTube \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39msupports prompts and batch\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95myoutube-search\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"query\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"switzerland itinerary\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Query to search on YouTube; if a URL is provided, it fetches content from \u001b[0m\n", "\u001b[39mthat URL. 
url should be in format: \u001b[0m\u001b[4;94mhttps://www.youtube.com/watch?\u001b[0m\u001b[4;94mv\u001b[0m\u001b[4;94m=\u001b[0m\u001b[4;94mvideoID\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"prompt\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"I want to plan my Switzerland trip\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Instruction or question for using the fetched content.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"n\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m2\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Number of top search results to summarize \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39monly works if query is not a URL\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m4\u001b[0m\u001b[39m. Reddit Deep Dive\u001b[0m\n", "\u001b[39m**Custom Reddit search, sort, filter, and get top comments.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95mreddit-search\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"subreddit\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Subreddit to fetch content from \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39muse if url_type is not \u001b[0m\u001b[32m'search'\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"url_type\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"search\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m \u001b[0m\u001b[32m'search'\u001b[0m\u001b[39m for phrase search; \u001b[0m\u001b[32m\"url\"\u001b[0m\u001b[39m for url, otherwise, use \u001b[0m\u001b[32m'hot'\u001b[0m\u001b[39m, 
\u001b[0m\u001b[32m'top'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'best'\u001b[0m\u001b[39m, etc.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"n\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Number of posts to fetch.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"k\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m1\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Number of top comments per post.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"custom_url\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Use if you already have a specific Reddit URL.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"time_filter\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"all\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Time range: \u001b[0m\u001b[32m'all'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'today'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'week'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'month'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'year'\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"search_query\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"gemma 3n reviews\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Search phrase \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39museful if url_type is \u001b[0m\u001b[32m'search'\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"sort_type\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"relevance\"\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m \u001b[0m\u001b[32m'top'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'hot'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'new'\u001b[0m\u001b[39m, \u001b[0m\u001b[32m'relevance'\u001b[0m\u001b[39m — controls how results are sorted.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\n", "\u001b[39m```\u001b[0m\n", "\n", 
"\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m5\u001b[0m\u001b[39m. Map & Location/Route Search\u001b[0m\n", "\u001b[39m**Find places, routes, and nearby points of interest.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95mmap-search\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"start_location\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"MG Road, Bangalore\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Starting point.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"end_location\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"Lalbagh, Bangalore\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Destination.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"pois_radius\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m500\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Search radius in meters for amenities.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"amenities\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"restaurant|cafe|bar|hotel\"\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Amenities to search near start or end location.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"limit\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m, \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Maximum number of results if address not found exactly.\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"task\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"route_and_pois\"\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Use \u001b[0m\u001b[32m'location_only'\u001b[0m\u001b[39m for address/coordinates only, or \u001b[0m\u001b[32m'route_and_pois'\u001b[0m\u001b[39m for routes and \u001b[0m\n", 
"\u001b[39mPOIs.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39mOR search for any single location \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mopen street map has api rate limit\u001b[0m\u001b[1;39m)\u001b[0m\n", "\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"start_location\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"MG Road, Bangalore\"\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"end_location\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"Lalbagh, Bangalore\"\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"pois_radius\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m500\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"amenities\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"restaurant|cafe|bar|hotel\"\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"limit\"\u001b[0m\u001b[39m: \u001b[0m\u001b[1;36m3\u001b[0m\u001b[39m,\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"task\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"location_only\"\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m### \u001b[0m\u001b[1;36m6\u001b[0m\u001b[39m. 
GitHub & Local Repo Directory Tree\u001b[0m\n", "\u001b[39m**Get the directory structure of any GitHub or local repo.**\u001b[0m\n", "\n", "\u001b[39m**Endpoint:** \u001b[0m\n", "\u001b[39mPOST `\u001b[0m\u001b[35m/\u001b[0m\u001b[95mgit-tree-search\u001b[0m\u001b[39m`\u001b[0m\n", "\n", "\u001b[39m**Request Example:**\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[39m \u001b[0m\u001b[32m\"repobaseurl\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"https://github.com/SPThole/CoexistAI/\"\u001b[0m\u001b[39m \u001b[0m\u001b[35m/\u001b[0m\u001b[35m/\u001b[0m\u001b[39m Base URL of the repository to explore.\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\u001b[39m```\u001b[0m\n", "\u001b[39mor for local repo:\u001b[0m\n", "\u001b[39m```json\u001b[0m\n", "\n", "\u001b[1;39m{\u001b[0m\n", "\u001b[32m\"repobaseurl\"\u001b[0m\u001b[39m: \u001b[0m\u001b[32m\"/home/user/projects/myrepo\"\u001b[0m\n", "\u001b[1;39m}\u001b[0m\n", "\n", "\u001b[39m```\u001b[0m\n", "\n", "\u001b[39m---\u001b[0m\n", "\n", "\u001b[39m## 🧑‍💻 Integrate coexistai as an MCP Server \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mLM Studio, Cursor, etc.\u001b[0m\u001b[1;39m)\u001b[0m\n", "\n", "\u001b[39mStarting LM Studio \u001b[0m\u001b[1;36m0.3\u001b[0m\u001b[39m.\u001b[0m\u001b[1;36m17\u001b[0m\u001b[39m, LM Studio acts as an Model Context Protocol \u001b[0m\u001b[1;39m(\u001b[0m\u001b[39mMCP\u001b[0m\u001b[1;39m)\u001b[0m\u001b[39m Host. This means you can connect MCP \u001b[0m\n", "\u001b[39mservers to the app and make them available to your models. \u001b[0m\n", "\n", "\n", "\u001b[39mYou can now run coexistai as an MCP server—**plug it into LM Studio** or any other MCP-compatible tool!\u001b[0m\n", "\n", "\u001b[39m### How to Integrate with LM Studio\u001b[0m\n", "\n", "\u001b[1;36m1\u001b[0m\u001b[39m. 
Download lm studio> \u001b[0m\u001b[1;36m0.3\u001b[0m\u001b[39m.\u001b[0m\u001b[1;36m17\u001b[0m\u001b[39m \u001b[0m\u001b[1;39m(\u001b[0m\u001b[4;94mhttps://lmstudio.ai/docs/app\u001b[0m\u001b[4;94m)\u001b[0m\n", "\n", "\u001b[1;36m2\u001b[0m\u001b[39m. Find the \u001b[0m\u001b[1;39m[\u001b[0m\u001b[39mMCP guide\u001b[0m\u001b[1;39m]\u001b[0m\u001b[1;39m(\u001b[0m\u001b[4;94mhttps://lmstudio.ai/docs/app/plugins/mcp\u001b[0m\u001b[4;94m)\u001b[0m\n", "\n", "\u001b[39m

\u001b[0m\n", "\u001b[39m \u001b[0m\n", "\u001b[39m<\u001b[0m\u001b[35m/\u001b[0m\u001b[95mp\u001b[0m\u001b[39m>\u001b[0m\n", "\n", "\u001b[1;36m1\u001b[0m\u001b[39m. **Edit your `mcp.json` in LM Studio:**\u001b[0m\n", "\u001b[39m - Go to the Program tab → `Install \u001b[0m\u001b[1m>\u001b[0m Edit mcp.json`\n", " - Add coexistai as a server. Example:\n", " ```json\n", " \u001b[1m{\u001b[0m\n", " \u001b[32m\"mcpServers\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[32m\"coexistai\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[32m\"url\"\u001b[0m: \u001b[32m\"http://127.0.0.1:8000/mcp\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " ```\n", " - Replace with your actual server address and token \u001b[1m(\u001b[0mif needed\u001b[1m)\u001b[0m.\n", "\n", "\u001b[1;36m2\u001b[0m. **Or use an \u001b[32m\"Add to LM Studio\"\u001b[0m button** \u001b[1m(\u001b[0mif provided on coexistai website\u001b[1m)\u001b[0m.\n", "\n", "**Security note:** Only use MCP servers you trust—servers can access files/network.\n", "\n", "\u001b[1;36m3\u001b[0m. Use \u001b[1m(\u001b[0msystem_prompt.py\u001b[1m)\u001b[0m as context \u001b[1m(\u001b[0msystem prompt\u001b[1m)\u001b[0m in lmstudio\n", "---\n", "\n", "## 🏆 Best Local Model\n", "\n", "For fastest, highest-quality local LLM results, I have personally liked following: \n", "**unsloth/Qwen3-8B-GGUF**\n", "\n", "---\n", "\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "content = await git_specific_content(\"https://github.com/SPThole/CoexistAI/\",\n", " part = \"README_MCP.md\",\n", " type=\"file\")\n", "print(content)" ] }, { "cell_type": "markdown", "id": "18c36348", "metadata": {}, "source": [ "## 12. Generating Maps\n", "\n", "Generate maps with routes and points of interest using the `generate_map` function. You can visualize directions and locations directly in your notebook." 
] }, { "cell_type": "code", "execution_count": 29, "id": "303b0b13", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:13:04,557] INFO utils.map: Found 3 probable locations for 'M G Road, Bangalore'.\n", "[2025-07-27 12:13:04,558] INFO utils.map: Auto-selected location: Mahatma Gandhi Road, Tasker Town, Shivajinagar, Bengaluru, Bengaluru Central City Corporation, Bengaluru, Bangalore North, Bengaluru Urban, Karnataka, 560001, India\n", "[2025-07-27 12:13:05,549] INFO utils.map: Found 3 probable locations for 'Indiranagar, Bangalore'.\n", "[2025-07-27 12:13:05,551] INFO utils.map: Auto-selected location: Indiranagar, Bengaluru Central City Corporation, Bengaluru, Bangalore East, Bengaluru Urban, Karnataka, 560038, India\n", "[2025-07-27 12:13:05,895] INFO utils.map: Route found between start and end coordinates.\n", "[2025-07-27 12:13:06,491] INFO utils.map: Found 82 POIs near (12.9747828, 77.6096698).\n", "[2025-07-27 12:13:06,874] INFO utils.map: Found 79 POIs near (12.9732913, 77.6404672).\n", "[2025-07-27 12:13:07,000] INFO utils.map: Map generated and saved as 'map_with_route_and_pois.html'.\n", "[2025-07-27 12:13:07,000] INFO utils.map: Generated route directions.\n" ] } ], "source": [ "# Example: Generate a map with a route and POIs\n", "from utils.map import *\n", "s = generate_map(\"M G Road, Bangalore\", \"Indiranagar, Bangalore\")\n", "from IPython.display import display, HTML\n", "with open(\"output/map_with_route_and_pois.html\") as f:\n", " html_content = f.read()\n", "# display(HTML(html_content))" ] }, { "cell_type": "markdown", "id": "08403d58", "metadata": {}, "source": [ "You can even get the location specific information" ] }, { "cell_type": "code", "execution_count": 30, "id": "6da734e2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:13:10,669] INFO utils.map: Found 3 probable locations for 'M G Road, Bangalore'.\n", "[2025-07-27 12:13:10,671] 
INFO utils.map: Auto-selected location: Mahatma Gandhi Road, Tasker Town, Shivajinagar, Bengaluru, Bengaluru Central City Corporation, Bengaluru, Bangalore North, Bengaluru Urban, Karnataka, 560001, India\n" ] }, { "data": { "text/html": [ "

Start location: Mahatma Gandhi Road, Tasker Town, Shivajinagar, Bengaluru, Bengaluru Central City Corporation, \n",
       "Bengaluru, Bangalore North, Bengaluru Urban, Karnataka, 560001, India (Lat: 12.9747828, Lon: 77.6096698)\n",
       "\n",
       "
\n" ], "text/plain": [ "Start location: Mahatma Gandhi Road, Tasker Town, Shivajinagar, Bengaluru, Bengaluru Central City Corporation, \n", "Bengaluru, Bangalore North, Bengaluru Urban, Karnataka, \u001b[1;36m560001\u001b[0m, India \u001b[1m(\u001b[0mLat: \u001b[1;36m12.9747828\u001b[0m, Lon: \u001b[1;36m77.6096698\u001b[0m\u001b[1m)\u001b[0m\n", "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Example: Generate a map with a route and POIs\n", "s = generate_map(\"M G Road, Bangalore\", None, task=\"location_only\")\n", "print(s)\n", "# display(HTML(html_content))" ] }, { "cell_type": "markdown", "id": "c9f1f4bb", "metadata": {}, "source": [ "## 11. Advanced Query Handling\n", "\n", "Handle advanced queries, such as generating detailed reports or toy examples, using `query_web_response`. This enables complex, multi-step reasoning and content generation." ] }, { "cell_type": "code", "execution_count": 23, "id": "ea974c82", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:09:49,765] INFO utils.answer_generation: Summary:False,subquery:[''], iscoverd:False\n", "[2025-07-27 12:09:49,765] INFO utils.answer_generation: Summary:False,subquery:[''], iscoverd:False\n", "[2025-07-27 12:09:49,767] INFO utils.websearch_utils: Search response generated for query 'Give me end to end working for text diffusion model.' using pure query.\n", "[2025-07-27 12:09:49,767] INFO utils.websearch_utils: Search response generated for query 'Give me end to end working for text diffusion model.' 
using pure query.\n", "[2025-07-27 12:09:49,768] INFO utils.websearch_utils: Search phrases for query 'Give me end to end working for text diffusion model.': ['Give me end to end working for text diffusion model.']\n", "[2025-07-27 12:09:49,768] INFO utils.websearch_utils: Search phrases for query 'Give me end to end working for text diffusion model.': ['Give me end to end working for text diffusion model.']\n", "[2025-07-27 12:09:50,420] INFO utils.websearch_utils: Search results for query 'Give me end to end working for text diffusion model.': [{'snippet': 'by Y Gao · 2023 · Cited by 32 — We propose Easy End-to-End Diffusion-based Text to Speech, a simple and efficient end-to-end text-to-speech model based on diffusion.', 'title': 'E3 TTS: Easy End-to-End Diffusion-based Text to Speech', 'link': 'https://arxiv.org/abs/2311.00945', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:09:50,420] INFO utils.websearch_utils: Search results for query 'Give me end to end working for text diffusion model.': [{'snippet': 'by Y Gao · 2023 · Cited by 32 — We propose Easy End-to-End Diffusion-based Text to Speech, a simple and efficient end-to-end text-to-speech model based on diffusion.', 'title': 'E3 TTS: Easy End-to-End Diffusion-based Text to Speech', 'link': 'https://arxiv.org/abs/2311.00945', 'engines': ['google'], 'category': 'general'}]\n", "[2025-07-27 12:09:50,422] INFO utils.websearch_utils: Search results fetched for subquery: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:50,422] INFO utils.websearch_utils: Search results fetched for subquery: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:50,422] INFO utils.websearch_utils: Created Document for source: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:50,422] INFO utils.websearch_utils: Created Document for source: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:50,423] INFO utils.websearch_utils: Search results fetched for 
query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:50,423] INFO utils.websearch_utils: Search results fetched for query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:50,424] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "[2025-07-27 12:09:50,424] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:09:50,457] INFO utils.websearch_utils: Fetching URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:50,457] INFO utils.websearch_utils: Fetching URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:50,798] INFO utils.websearch_utils: Fetched content from https://arxiv.org/abs/2311.00945 with type text/html; charset=utf-8\n", "[2025-07-27 12:09:50,798] INFO utils.websearch_utils: Fetched content from https://arxiv.org/abs/2311.00945 with type text/html; charset=utf-8\n", "[2025-07-27 12:09:52,618] INFO utils.websearch_utils: Processed markdown for: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,618] INFO utils.websearch_utils: Processed markdown for: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,619] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,619] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,867] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:09:52,867] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:09:52,870] INFO utils.websearch_utils: Processed 1 docs 
for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,870] INFO utils.websearch_utils: Processed 1 docs for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,871] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:09:52,871] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:09:52,871] INFO utils.websearch_utils: Encoding start for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,871] INFO utils.websearch_utils: Encoding start for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:52,873] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:09:52,873] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:09:52,873] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:09:53,311] INFO utils.websearch_utils: Encoding end for URL: https://arxiv.org/abs/2311.00945, Time taken: 0.44 seconds\n", "[2025-07-27 12:09:53,311] INFO utils.websearch_utils: Encoding end for URL: https://arxiv.org/abs/2311.00945, Time taken: 0.44 seconds\n", "[2025-07-27 12:09:53,313] INFO utils.websearch_utils: Reranking start for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:53,313] INFO utils.websearch_utils: Reranking start for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:53,314] INFO utils.websearch_utils: Reranking end for URL: https://arxiv.org/abs/2311.00945, Time taken: 0.00 seconds\n", "[2025-07-27 12:09:53,314] INFO utils.websearch_utils: Reranking end for URL: https://arxiv.org/abs/2311.00945, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 1.74it/s]\n", "[2025-07-27 12:09:55,117] INFO utils.websearch_utils: Retrieved 1 docs for https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,117] INFO utils.websearch_utils: Retrieved 1 docs for https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,118] INFO utils.websearch_utils: Built context for https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,118] INFO utils.websearch_utils: Built context for https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,118] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,118] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: https://arxiv.org/abs/2311.00945\n", "[2025-07-27 12:09:55,119] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 1, total docs: 1\n", "[2025-07-27 12:09:55,119] INFO utils.websearch_utils: context_to_docs complete. 
Total contexts: 1, total docs: 1\n", "[2025-07-27 12:09:55,120] INFO utils.websearch_utils: Context generated to answer query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:55,120] INFO utils.websearch_utils: Context generated to answer query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:55,120] INFO utils.utils: Logging results for query: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:55,120] INFO utils.utils: Logging results for query: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:55,121] INFO utils.websearch_utils: Logged results for query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:55,121] INFO utils.websearch_utils: Logged results for query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:55,121] INFO utils.websearch_utils: Generating Answer for query 'Give me end to end working for text diffusion model.' using response gen.\n", "[2025-07-27 12:09:55,121] INFO utils.websearch_utils: Generating Answer for query 'Give me end to end working for text diffusion model.' 
using response gen.\n", "[2025-07-27 12:09:55,122] INFO utils.answer_generation: Generating Answer for query: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:55,122] INFO utils.answer_generation: Generating Answer for query: Give me end to end working for text diffusion model.\n", "[2025-07-27 12:09:57,237] INFO utils.answer_generation: Structured response generated successfully.\n", "[2025-07-27 12:09:57,237] INFO utils.answer_generation: Structured response generated successfully.\n", "[2025-07-27 12:09:57,239] INFO utils.websearch_utils: Response generated for query 'Give me end to end working for text diffusion model.'.\n", "[2025-07-27 12:09:57,239] INFO utils.websearch_utils: Response generated for query 'Give me end to end working for text diffusion model.'.\n" ] }, { "data": { "text/html": [ "
#### Answer: \n",
       "The provided research paper describes E3 TTS, an end-to-end text-to-speech model using diffusion.  It takes plain \n",
       "text as input and generates an audio waveform iteratively.  It doesn't rely on intermediate representations like \n",
       "spectrograms or alignment information.  The model's architecture and training process are not fully detailed in the\n",
       "abstract.  To provide a complete workflow for a general text diffusion model, more information is needed, such as \n",
       "the specific model architecture, training data, and hyperparameters.\n",
       "\n",
       "#### Next steps:\n",
       "More information is needed to provide a complete workflow.  The provided text focuses on a specific text-to-speech \n",
       "model (E3 TTS) and doesn't detail a general text diffusion model workflow.\n",
       "
\n" ], "text/plain": [ "#### Answer: \n", "The provided research paper describes E3 TTS, an end-to-end text-to-speech model using diffusion. It takes plain \n", "text as input and generates an audio waveform iteratively. It doesn't rely on intermediate representations like \n", "spectrograms or alignment information. The model's architecture and training process are not fully detailed in the\n", "abstract. To provide a complete workflow for a general text diffusion model, more information is needed, such as \n", "the specific model architecture, training data, and hyperparameters.\n", "\n", "#### Next steps:\n", "More information is needed to provide a complete workflow. The provided text focuses on a specific text-to-speech \n", "model \u001b[1m(\u001b[0mE3 TTS\u001b[1m)\u001b[0m and doesn't detail a general text diffusion model workflow.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "detailed_report = await query_web_response(\n", " \"Give me end to end working for text diffusion model.\",\n", " '26-07-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=1,\n", " document_paths=[],\n", " local_mode=False,\n", " split=False\n", ")\n", "print(detailed_report[0])" ] }, { "cell_type": "code", "execution_count": 24, "id": "de7a702f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:10:02,224] INFO utils.answer_generation: Summary:True,subquery:['Summarize Ollama blog post on Secure Minions', 'Extract learning notes from Ollama Secure Minions blog post'], iscoverd:False\n", "[2025-07-27 12:10:02,224] INFO utils.answer_generation: Summary:True,subquery:['Summarize Ollama blog post on Secure Minions', 'Extract learning notes from Ollama Secure Minions blog post'], iscoverd:False\n", "[2025-07-27 12:10:02,225] INFO utils.websearch_utils: Search phrases for query 'Summarise this full page in 
detail, and give me learning notes https://ollama.com/blog/secureminions': ['summarize ollama blog post on secure minions', 'extract learning notes from ollama secure minions blog post']\n", "[2025-07-27 12:10:02,225] INFO utils.websearch_utils: Search phrases for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions': ['summarize ollama blog post on secure minions', 'extract learning notes from ollama secure minions blog post']\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Extracted URLs from query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions': ['https://ollama.com/blog/secureminions']\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Extracted URLs from query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions': ['https://ollama.com/blog/secureminions']\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Created Document for source: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Created Document for source: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Search results fetched for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:02,226] INFO utils.websearch_utils: Search results fetched for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:02,227] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "[2025-07-27 12:10:02,227] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, 
relaunching. Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:10:05,245] INFO utils.websearch_utils: Fetching URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:05,245] INFO utils.websearch_utils: Fetching URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:05,600] INFO utils.websearch_utils: Fetched content from https://ollama.com/blog/secureminions with type text/html; charset=utf-8\n", "[2025-07-27 12:10:05,600] INFO utils.websearch_utils: Fetched content from https://ollama.com/blog/secureminions with type text/html; charset=utf-8\n", "[2025-07-27 12:10:07,112] INFO utils.websearch_utils: Processed markdown for: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,112] INFO utils.websearch_utils: Processed markdown for: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,114] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,114] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,360] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:07,360] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:07,363] INFO utils.websearch_utils: Processed 1 docs for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,363] INFO utils.websearch_utils: Processed 1 docs for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,364] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:07,364] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:07,364] INFO utils.websearch_utils: Encoding start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,364] INFO utils.websearch_utils: Encoding start for URL: 
https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,365] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:07,365] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:07,365] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:07,828] INFO utils.websearch_utils: Encoding end for URL: https://ollama.com/blog/secureminions, Time taken: 0.46 seconds\n", "[2025-07-27 12:10:07,828] INFO utils.websearch_utils: Encoding end for URL: https://ollama.com/blog/secureminions, Time taken: 0.46 seconds\n", "[2025-07-27 12:10:07,830] INFO utils.websearch_utils: Reranking start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,830] INFO utils.websearch_utils: Reranking start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:07,830] INFO utils.websearch_utils: Reranking end for URL: https://ollama.com/blog/secureminions, Time taken: 0.00 seconds\n", "[2025-07-27 12:10:07,830] INFO utils.websearch_utils: Reranking end for URL: https://ollama.com/blog/secureminions, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 2.77it/s]\n", "[2025-07-27 12:10:09,352] INFO utils.websearch_utils: Retrieved 1 docs for https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,352] INFO utils.websearch_utils: Retrieved 1 docs for https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,353] INFO utils.websearch_utils: Built context for https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,353] INFO utils.websearch_utils: Built context for https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,353] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: 
https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,353] INFO utils.websearch_utils: Successfully processed and added context/docs for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,354] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 1, total docs: 1\n", "[2025-07-27 12:10:09,354] INFO utils.websearch_utils: context_to_docs complete. Total contexts: 1, total docs: 1\n", "[2025-07-27 12:10:09,354] INFO utils.websearch_utils: Context generated to answer query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:09,354] INFO utils.websearch_utils: Context generated to answer query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:09,355] INFO utils.utils: Logging results for query: Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,355] INFO utils.utils: Logging results for query: Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:09,356] INFO utils.websearch_utils: Logged results for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:09,356] INFO utils.websearch_utils: Logged results for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:09,356] INFO utils.websearch_utils: Generating summary for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions' using summarizer.\n", "[2025-07-27 12:10:09,356] INFO utils.websearch_utils: Generating summary for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions' using summarizer.\n", "[2025-07-27 12:10:09,357] 
INFO utils.answer_generation: Deduping docs: 1\n", "[2025-07-27 12:10:09,357] INFO utils.answer_generation: Deduping docs: 1\n", "[2025-07-27 12:10:09,357] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:10:09,357] WARNING utils.answer_generation: Capping max words to 6000\n", "[2025-07-27 12:10:09,357] INFO utils.answer_generation: Arrived Len of Docs: 1\n", "[2025-07-27 12:10:09,357] INFO utils.answer_generation: Arrived Len of Docs: 1\n", "[2025-07-27 12:10:09,358] INFO utils.answer_generation: Only one document provided, summarizing it.\n", "[2025-07-27 12:10:09,358] INFO utils.answer_generation: Only one document provided, summarizing it.\n", "[2025-07-27 12:10:13,210] INFO utils.websearch_utils: Response generated for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n", "[2025-07-27 12:10:13,210] INFO utils.websearch_utils: Response generated for query 'Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions'.\n" ] }, { "data": { "text/html": [ "
This blog post from Ollama details Secure Minions, a collaboration with Stanford's Hazy Research lab enhancing the \n",
       "open-source Minions project.  Minions allows local LLMs (like Ollama's models) to collaborate with powerful \n",
       "cloud-based frontier models (like GPT-4o) efficiently and cost-effectively.  The original Minions protocol, while \n",
       "reducing cloud costs significantly (5x-30x) and maintaining high accuracy (98%), still transmitted some potentially\n",
       "sensitive data to the cloud.\n",
       "\n",
       "Secure Minions addresses this privacy concern by implementing end-to-end encryption, even from the cloud provider. \n",
       "This is achieved using NVIDIA's Hopper H100 GPUs' confidential computing mode.  The result is a system where no \n",
       "plaintext is exposed during transmission or remote LLM inference, with minimal latency overhead (less than 1%).\n",
       "\n",
       "The blog post provides instructions for getting started: cloning the repository, setting up a virtual environment, \n",
       "installing the necessary packages (including Ollama and a model like Gemma 3:4b), and running either an interactive\n",
       "Streamlit app or a Python script demonstrating the secure protocol.  Example code is included to illustrate \n",
       "programmatic usage.  The post encourages readers to refer to the HazyResearch blog for complete technical details.\n",
       "\n",
       "\n",
       "**Learning Notes:**\n",
       "\n",
       "* **Minions:** A system for efficient and cost-effective collaboration between local and cloud-based LLMs.\n",
       "* **Secure Minions:** An enhanced version of Minions providing end-to-end encryption for enhanced privacy.\n",
       "* **Confidential Computing:** Leveraging hardware capabilities (NVIDIA Hopper H100 GPUs) to protect data during \n",
       "processing and transmission.\n",
       "* **Practical Implementation:**  The blog post provides clear instructions and example code for using Secure \n",
       "Minions.\n",
       "* **Key Improvement:**  Secure Minions significantly improves the privacy of local-first LLM setups by preventing \n",
       "sensitive data from leaving the device.\n",
       "* **Minimal Overhead:** The added security comes with minimal performance impact (less than 1% latency increase).\n",
       "\n",
       "\n",
       "**Assumptions Made:**\n",
       "\n",
       "* The reader has basic familiarity with command-line interfaces, Git, Python, and virtual environments.\n",
       "* The reader has access to a system with sufficient resources to run the required software and models (including \n",
       "potentially an NVIDIA Hopper H100 GPU for optimal performance, although this is not explicitly stated as a \n",
       "requirement).\n",
       "* The reader understands the basic concepts of LLMs and their applications.\n",
       "
\n" ], "text/plain": [ "This blog post from Ollama details Secure Minions, a collaboration with Stanford's Hazy Research lab enhancing the \n", "open-source Minions project. Minions allows local LLMs \u001b[1m(\u001b[0mlike Ollama's models\u001b[1m)\u001b[0m to collaborate with powerful \n", "cloud-based frontier models \u001b[1m(\u001b[0mlike GPT-4o\u001b[1m)\u001b[0m efficiently and cost-effectively. The original Minions protocol, while \n", "reducing cloud costs significantly \u001b[1m(\u001b[0m5x-3\u001b[1;36m0x\u001b[0m\u001b[1m)\u001b[0m and maintaining high accuracy \u001b[1m(\u001b[0m\u001b[1;36m98\u001b[0m%\u001b[1m)\u001b[0m, still transmitted some potentially\n", "sensitive data to the cloud.\n", "\n", "Secure Minions addresses this privacy concern by implementing end-to-end encryption, even from the cloud provider. \n", "This is achieved using NVIDIA's Hopper H100 GPUs' confidential computing mode. The result is a system where no \n", "plaintext is exposed during transmission or remote LLM inference, with minimal latency overhead \u001b[1m(\u001b[0mless than \u001b[1;36m1\u001b[0m%\u001b[1m)\u001b[0m.\n", "\n", "The blog post provides instructions for getting started: cloning the repository, setting up a virtual environment, \n", "installing the necessary packages \u001b[1m(\u001b[0mincluding Ollama and a model like Gemma \u001b[1;92m3:4b\u001b[0m\u001b[1m)\u001b[0m, and running either an interactive\n", "Streamlit app or a Python script demonstrating the secure protocol. Example code is included to illustrate \n", "programmatic usage. 
The post encourages readers to refer to the HazyResearch blog for complete technical details.\n", "\n", "\n", "**Learning Notes:**\n", "\n", "* **Minions:** A system for efficient and cost-effective collaboration between local and cloud-based LLMs.\n", "* **Secure Minions:** An enhanced version of Minions providing end-to-end encryption for enhanced privacy.\n", "* **Confidential Computing:** Leveraging hardware capabilities \u001b[1m(\u001b[0mNVIDIA Hopper H100 GPUs\u001b[1m)\u001b[0m to protect data during \n", "processing and transmission.\n", "* **Practical Implementation:** The blog post provides clear instructions and example code for using Secure \n", "Minions.\n", "* **Key Improvement:** Secure Minions significantly improves the privacy of local-first LLM setups by preventing \n", "sensitive data from leaving the device.\n", "* **Minimal Overhead:** The added security comes with minimal performance impact \u001b[1m(\u001b[0mless than \u001b[1;36m1\u001b[0m% latency increase\u001b[1m)\u001b[0m.\n", "\n", "\n", "**Assumptions Made:**\n", "\n", "* The reader has basic familiarity with command-line interfaces, Git, Python, and virtual environments.\n", "* The reader has access to a system with sufficient resources to run the required software and models \u001b[1m(\u001b[0mincluding \n", "potentially an NVIDIA Hopper H100 GPU for optimal performance, although this is not explicitly stated as a \n", "requirement\u001b[1m)\u001b[0m.\n", "* The reader understands the basic concepts of LLMs and their applications.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# You can even summarise full page \n", "detailed_report = await query_web_response(\n", " \"Summarise this full page in detail, and give me learning notes https://ollama.com/blog/secureminions\",\n", " '27-06-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=3,\n", " 
document_paths=[],\n", " local_mode=False,\n", " split=False\n", ")\n", "print(detailed_report[0])" ] }, { "cell_type": "code", "execution_count": 25, "id": "aafb2ae6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:10:19,356] INFO utils.answer_generation: Summary:True,subquery:['Summarize SecureMinions blog', 'Summarize llama.cpp github'], iscoverd:True\n", "[2025-07-27 12:10:19,356] INFO utils.answer_generation: Summary:True,subquery:['Summarize SecureMinions blog', 'Summarize llama.cpp github'], iscoverd:True\n", "[2025-07-27 12:10:19,358] INFO utils.websearch_utils: Search phrases for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp': ['summarize secureminions blog', 'summarize llama.cpp github']\n", "[2025-07-27 12:10:19,358] INFO utils.websearch_utils: Search phrases for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp': ['summarize secureminions blog', 'summarize llama.cpp github']\n", "[2025-07-27 12:10:19,358] INFO utils.websearch_utils: Extracted URLs from query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp': ['https://ollama.com/blog/secureminions', 'https://github.com/ggml-org/llama.cpp']\n", "[2025-07-27 12:10:19,358] INFO utils.websearch_utils: Extracted URLs from query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp': ['https://ollama.com/blog/secureminions', 'https://github.com/ggml-org/llama.cpp']\n", "[2025-07-27 12:10:19,359] INFO utils.websearch_utils: Created Document for source: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:19,359] INFO utils.websearch_utils: Created Document for source: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:19,360] INFO utils.websearch_utils: Created Document for source: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:19,360] INFO 
utils.websearch_utils: Created Document for source: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:19,361] INFO utils.websearch_utils: Search results fetched for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp'.\n", "[2025-07-27 12:10:19,361] INFO utils.websearch_utils: Search results fetched for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp'.\n", "[2025-07-27 12:10:19,362] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "[2025-07-27 12:10:19,362] INFO utils.websearch_utils: Starting context_to_docs for 1 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:10:22,396] INFO utils.websearch_utils: Fetching URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:22,396] INFO utils.websearch_utils: Fetching URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:22,396] INFO utils.websearch_utils: Fetching URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:22,396] INFO utils.websearch_utils: Fetching URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:23,126] INFO utils.websearch_utils: Fetched content from https://github.com/ggml-org/llama.cpp with type text/html; charset=utf-8\n", "[2025-07-27 12:10:23,126] INFO utils.websearch_utils: Fetched content from https://github.com/ggml-org/llama.cpp with type text/html; charset=utf-8\n", "[2025-07-27 12:10:23,698] INFO utils.websearch_utils: Fetched content from https://ollama.com/blog/secureminions with type text/html; charset=utf-8\n", "[2025-07-27 12:10:23,698] INFO utils.websearch_utils: Fetched content from https://ollama.com/blog/secureminions with 
type text/html; charset=utf-8\n", "[2025-07-27 12:10:24,925] INFO utils.websearch_utils: Processed markdown for: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:24,925] INFO utils.websearch_utils: Processed markdown for: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:24,926] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:24,926] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,113] INFO utils.websearch_utils: Processed markdown for: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,113] INFO utils.websearch_utils: Processed markdown for: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,113] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,113] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,248] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:25,248] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:25,252] INFO utils.websearch_utils: Processed 1 docs for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,252] INFO utils.websearch_utils: Processed 1 docs for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,252] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:25,252] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:25,253] INFO utils.websearch_utils: Encoding start for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,253] INFO utils.websearch_utils: Encoding start for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 
12:10:25,254] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,254] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,254] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,415] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:25,415] INFO utils.websearch_utils: Total URLs processed: 1\n", "[2025-07-27 12:10:25,420] INFO utils.websearch_utils: Processed 1 docs for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,420] INFO utils.websearch_utils: Processed 1 docs for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,420] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:25,420] INFO utils.websearch_utils: URLs and markdown links removed from text.\n", "[2025-07-27 12:10:25,421] INFO utils.websearch_utils: Encoding start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,421] INFO utils.websearch_utils: Encoding start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:25,422] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,422] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,422] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:25,749] INFO utils.websearch_utils: Encoding end for URL: https://github.com/ggml-org/llama.cpp, Time taken: 0.50 seconds\n", "[2025-07-27 12:10:25,749] INFO utils.websearch_utils: Encoding end for URL: https://github.com/ggml-org/llama.cpp, Time taken: 0.50 seconds\n", "[2025-07-27 12:10:25,750] INFO utils.websearch_utils: Reranking start for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,750] INFO utils.websearch_utils: Reranking start for URL: https://github.com/ggml-org/llama.cpp\n", "[2025-07-27 12:10:25,751] INFO utils.websearch_utils: Reranking end for URL: https://github.com/ggml-org/llama.cpp, Time taken: 0.00 seconds\n", "[2025-07-27 12:10:25,751] INFO utils.websearch_utils: Reranking end for URL: https://github.com/ggml-org/llama.cpp, Time taken: 0.00 seconds\n", "[2025-07-27 12:10:26,830] INFO utils.websearch_utils: Encoding end for URL: https://ollama.com/blog/secureminions, Time taken: 1.41 seconds\n", "[2025-07-27 12:10:26,830] INFO utils.websearch_utils: Encoding end for URL: https://ollama.com/blog/secureminions, Time taken: 1.41 seconds\n", "[2025-07-27 12:10:26,831] INFO utils.websearch_utils: Reranking start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:26,831] INFO utils.websearch_utils: Reranking start for URL: https://ollama.com/blog/secureminions\n", "[2025-07-27 12:10:26,832] INFO utils.websearch_utils: Reranking end for URL: https://ollama.com/blog/secureminions, Time taken: 0.00 seconds\n", "[2025-07-27 12:10:26,832] INFO utils.websearch_utils: Reranking end for URL: https://ollama.com/blog/secureminions, Time taken: 0.00 seconds\n", "Batches: 100%|██████████| 1/1 [00:00<00:00, 2.65it/s]\n", "Batches: 0%| | 0/1 [00:00Summarising using 2 documents\n", "\n" ], "text/plain": [ "Summarising using \u001b[1;36m2\u001b[0m documents\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", 
"output_type": "stream", "text": [ "[2025-07-27 12:10:30,668] INFO utils.websearch_utils: Response generated for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp'.\n", "[2025-07-27 12:10:30,668] INFO utils.websearch_utils: Response generated for query 'Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp'.\n" ] }, { "data": { "text/html": [ "
Here's a summary addressing the user's query, combining information from the provided documents:\n",
       "\n",
       "**Summary:**\n",
       "\n",
       "The Ollama blog post details \"Secure Minions,\" a collaboration between Ollama and Stanford's Hazy Research lab.  \n",
       "Secure Minions enhances the original Minions project, which allows local LLMs (like Ollama's models) to collaborate\n",
       "with powerful cloud-based frontier models (like GPT-4o) while minimizing data transfer to the cloud.  The key \n",
       "improvement in Secure Minions is the implementation of end-to-end encryption using NVIDIA's Hopper H100 GPUs' \n",
       "confidential computing mode. This ensures that no plaintext data is exposed during transmission or inference, even \n",
       "from the cloud provider.  The overhead is minimal (less than 1% added latency), even with large models and long \n",
       "prompts.  The blog post provides instructions for setting up and running Secure Minions, including using a provided\n",
       "Streamlit app and example Python code.\n",
       "\n",
       "The GitHub repository for `llama.cpp` describes a C/C++ library for LLM inference.  It aims for minimal setup and \n",
       "high performance across various hardware platforms.  It supports several backends (including Metal, CUDA, HIP, and \n",
       "others), allowing for inference on CPUs, GPUs, and other accelerators.  The repository includes tools like \n",
       "`llama-cli` (a command-line interface), `llama-server` (an OpenAI API-compatible server), and `llama-bench` (for \n",
       "benchmarking).  It also provides instructions for obtaining and quantizing models, primarily using the GGUF format,\n",
       "and offers support for models from Hugging Face and other sources.\n",
       "\n",
       "\n",
       "**Assumptions Made:**\n",
       "\n",
       "* No assumptions were made beyond the information explicitly presented in the provided documents.  The summary \n",
       "directly reflects the content of the blog post and the GitHub repository description.\n",
       "
\n" ], "text/plain": [ "Here's a summary addressing the user's query, combining information from the provided documents:\n", "\n", "**Summary:**\n", "\n", "The Ollama blog post details \u001b[32m\"Secure Minions,\"\u001b[0m a collaboration between Ollama and Stanford's Hazy Research lab. \n", "Secure Minions enhances the original Minions project, which allows local LLMs \u001b[1m(\u001b[0mlike Ollama's models\u001b[1m)\u001b[0m to collaborate\n", "with powerful cloud-based frontier models \u001b[1m(\u001b[0mlike GPT-4o\u001b[1m)\u001b[0m while minimizing data transfer to the cloud. The key \n", "improvement in Secure Minions is the implementation of end-to-end encryption using NVIDIA's Hopper H100 GPUs' \n", "confidential computing mode. This ensures that no plaintext data is exposed during transmission or inference, even \n", "from the cloud provider. The overhead is minimal \u001b[1m(\u001b[0mless than \u001b[1;36m1\u001b[0m% added latency\u001b[1m)\u001b[0m, even with large models and long \n", "prompts. The blog post provides instructions for setting up and running Secure Minions, including using a provided\n", "Streamlit app and example Python code.\n", "\n", "The GitHub repository for `llama.cpp` describes a C/C++ library for LLM inference. It aims for minimal setup and \n", "high performance across various hardware platforms. It supports several backends \u001b[1m(\u001b[0mincluding Metal, CUDA, HIP, and \n", "others\u001b[1m)\u001b[0m, allowing for inference on CPUs, GPUs, and other accelerators. The repository includes tools like \n", "`llama-cli` \u001b[1m(\u001b[0ma command-line interface\u001b[1m)\u001b[0m, `llama-server` \u001b[1m(\u001b[0man OpenAI API-compatible server\u001b[1m)\u001b[0m, and `llama-bench` \u001b[1m(\u001b[0mfor \n", "benchmarking\u001b[1m)\u001b[0m. 
It also provides instructions for obtaining and quantizing models, primarily using the GGUF format,\n", "and offers support for models from Hugging Face and other sources.\n", "\n", "\n", "**Assumptions Made:**\n", "\n", "* No assumptions were made beyond the information explicitly presented in the provided documents. The summary \n", "directly reflects the content of the blog post and the GitHub repository description.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# or multiple pages\n", "detailed_report = await query_web_response(\n", " \"Summarise https://ollama.com/blog/secureminions and https://github.com/ggml-org/llama.cpp\",\n", " '27-06-2025',\n", " 'Saturday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=3,\n", " document_paths=[],\n", " local_mode=False,\n", " split=False\n", ")\n", "print(detailed_report[0])" ] }, { "cell_type": "code", "execution_count": 26, "id": "6a622a9f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:10:35,780] INFO utils.answer_generation: Summary:True,subquery:['Explain each code file concisely', 'Give 2 lines max for each explanation'], iscoverd:False\n", "[2025-07-27 12:10:35,780] INFO utils.answer_generation: Summary:True,subquery:['Explain each code file concisely', 'Give 2 lines max for each explanation'], iscoverd:False\n", "[2025-07-27 12:10:35,781] INFO utils.websearch_utils: Search phrases for query 'Read the code file and for each explain me what it does in concise manner, 2 lines max for each': ['explain each code file concisely', 'give 2 lines max for each explanation']\n", "[2025-07-27 12:10:35,781] INFO utils.websearch_utils: Search phrases for query 'Read the code file and for each explain me what it does in concise manner, 2 lines max for each': ['explain each code file concisely', 'give 2 lines max for each explanation']\n", "[2025-07-27 
12:10:35,782] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:10:35,782] WARNING utils.websearch_utils: Please add list of paths as input, earlier it used to be list of list\n", "[2025-07-27 12:10:35,784] INFO utils.websearch_utils: Total paths for 'Read the code file and for each explain me what it does in concise manner, 2 lines max for each': [['utils/map.py', 'app.py']]\n", "[2025-07-27 12:10:35,784] INFO utils.websearch_utils: Total paths for 'Read the code file and for each explain me what it does in concise manner, 2 lines max for each': [['utils/map.py', 'app.py']]\n", "[2025-07-27 12:10:35,785] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:10:35,785] WARNING utils.websearch_utils: No websearcher provided; using document_paths only.\n", "[2025-07-27 12:10:35,785] INFO utils.websearch_utils: Starting context_to_docs for 2 URL groups.\n", "[2025-07-27 12:10:35,785] INFO utils.websearch_utils: Starting context_to_docs for 2 URL groups.\n", "/opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py:136: UserWarning: resource_tracker: process died unexpectedly, relaunching. 
Some resources might leak.\n", " warnings.warn('resource_tracker: process died unexpectedly, '\n", "[2025-07-27 12:10:35,817] INFO utils.websearch_utils: Processing local file: utils/map.py\n", "[2025-07-27 12:10:35,817] INFO utils.websearch_utils: Processing local file: utils/map.py\n", "[2025-07-27 12:10:35,818] INFO utils.websearch_utils: Processing local file: utils/map.py\n", "[2025-07-27 12:10:35,818] INFO utils.websearch_utils: Processing local file: utils/map.py\n", "[2025-07-27 12:10:35,823] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:10:35,823] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:10:35,825] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:10:35,825] INFO utils.websearch_utils: Processing local file: app.py\n", "[2025-07-27 12:10:37,822] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:10:37,822] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:10:37,822] INFO utils.websearch_utils: Processed markdown for: utils/map.py\n", "[2025-07-27 12:10:37,822] INFO utils.websearch_utils: Processed markdown for: utils/map.py\n", "[2025-07-27 12:10:37,823] INFO utils.websearch_utils: Processed markdown for: utils/map.py\n", "[2025-07-27 12:10:37,823] INFO utils.websearch_utils: Processed markdown for: utils/map.py\n", "[2025-07-27 12:10:37,823] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:10:37,823] INFO utils.websearch_utils: Processed markdown for: app.py\n", "[2025-07-27 12:10:37,824] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:10:37,824] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:10:37,825] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: utils/map.py\n", "[2025-07-27 12:10:37,825] INFO utils.websearch_utils: 
Successfully processed and added document(s) for URL: utils/map.py\n", "[2025-07-27 12:10:37,826] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: utils/map.py\n", "[2025-07-27 12:10:37,826] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: utils/map.py\n", "[2025-07-27 12:10:37,827] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:10:37,827] INFO utils.websearch_utils: Successfully processed and added document(s) for URL: app.py\n", "[2025-07-27 12:10:39,227] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:10:39,227] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 4\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 9\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 9\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 9\n", "[2025-07-27 12:10:39,229] INFO utils.websearch_utils: Total URLs processed: 9\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 9 docs for URL: utils/map.py\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 9 docs for URL: utils/map.py\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:10:39,232] INFO utils.websearch_utils: Processed 4 docs for URL: app.py\n", "[2025-07-27 12:10:39,233] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:10:39,233] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:10:39,233] 
INFO utils.websearch_utils: Processed 9 docs for URL: utils/map.py\n", "[2025-07-27 12:10:39,235] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,233] INFO utils.websearch_utils: Processed 9 docs for URL: utils/map.py\n", "[2025-07-27 12:10:39,233] INFO utils.websearch_utils: Encoding start for URL: utils/map.py\n", "[2025-07-27 12:10:39,235] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,234] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:10:39,233] INFO utils.websearch_utils: Encoding start for URL: utils/map.py\n", "[2025-07-27 12:10:39,236] INFO utils.websearch_utils: Encoding start for URL: utils/map.py\n", "[2025-07-27 12:10:39,237] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,235] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,237] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,234] INFO utils.websearch_utils: Encoding start for URL: app.py\n", "[2025-07-27 12:10:39,245] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,245] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,236] INFO utils.websearch_utils: Encoding start for URL: utils/map.py\n", "[2025-07-27 12:10:39,246] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,246] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,247] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,237] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,247] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,253] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,245] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,253] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,246] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,277] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,247] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,277] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,300] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,253] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,300] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,277] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:39,300] INFO chromadb.telemetry.product.posthog: Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n", "[2025-07-27 12:10:40,069] INFO utils.websearch_utils: Encoding end for URL: utils/map.py, Time taken: 0.84 seconds\n", "[2025-07-27 12:10:40,069] INFO utils.websearch_utils: Encoding end for URL: utils/map.py, Time taken: 0.84 seconds\n", "[2025-07-27 12:10:40,070] INFO utils.websearch_utils: Reranking start for URL: utils/map.py\n", "[2025-07-27 12:10:40,070] INFO utils.websearch_utils: Reranking start for URL: utils/map.py\n", "[2025-07-27 12:10:40,071] INFO utils.websearch_utils: Reranking end for URL: utils/map.py, Time taken: 0.00 seconds\n", "[2025-07-27 12:10:40,071] INFO utils.websearch_utils: Reranking end for URL: utils/map.py, Time taken: 0.00 seconds\n", "Batches: 0%| | 0/1 [00:00Summarising using 13 documents\n", "\n" ], "text/plain": [ "Summarising using \u001b[1;36m13\u001b[0m documents\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "[2025-07-27 12:10:46,010] INFO utils.websearch_utils: Response generated for query 'Read the code file and for each explain me what it does in concise manner, 2 lines max for each'.\n", "[2025-07-27 12:10:46,010] INFO utils.websearch_utils: Response generated for query 'Read the code file and for each explain me what it does in concise manner, 
2 lines max for each'.\n" ] }, { "data": { "text/html": [ "
The code implements a FastAPI application providing various search functionalities.  It uses several utility \n",
       "modules (`utils/map.py`, `utils/websearch_utils.py`, `utils/reddit_utils.py`, `utils/git_utils.py`) for web search,\n",
       "Reddit search, map-based route planning with POIs, and Git repository interaction.  The application also includes \n",
       "functionality for summarizing web pages and YouTube videos, and checking the completeness of research.  A large \n",
       "language model (LLM) is used for several tasks, including response generation and summarization.  The \n",
       "`model_config.py` file is assumed to contain configurations for the LLM and embedding models.  The application uses\n",
       "a SearxNG docker container for web searches (assumed to be started automatically if not already running).  The \n",
       "`utils/map.py` module uses the Overpass and Nominatim APIs for POI retrieval and geocoding, and the OSRM API for \n",
       "route calculation.  The map is generated using Folium and saved as an HTML file.\n",
       "
\n" ], "text/plain": [ "The code implements a FastAPI application providing various search functionalities. It uses several utility \n", "modules \u001b[1m(\u001b[0m`utils/map.py`, `utils/websearch_utils.py`, `utils/reddit_utils.py`, `utils/git_utils.py`\u001b[1m)\u001b[0m for web search,\n", "Reddit search, map-based route planning with POIs, and Git repository interaction. The application also includes \n", "functionality for summarizing web pages and YouTube videos, and checking the completeness of research. A large \n", "language model \u001b[1m(\u001b[0mLLM\u001b[1m)\u001b[0m is used for several tasks, including response generation and summarization. The \n", "`model_config.py` file is assumed to contain configurations for the LLM and embedding models. The application uses\n", "a SearxNG docker container for web searches \u001b[1m(\u001b[0massumed to be started automatically if not already running\u001b[1m)\u001b[0m. The \n", "`utils/map.py` module uses the Overpass and Nominatim APIs for POI retrieval and geocoding, and the OSRM API for \n", "route calculation. The map is generated using Folium and saved as an HTML file.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# you can even do search/summary within local files\n", "\n", "# or multiple pages\n", "detailed_report = await query_web_response(\n", " \"Read the code file and for each explain me what it does in concise manner, 2 lines max for each\",\n", " '27-06-2025',\n", " 'Friday',\n", " searcher,\n", " hf_embeddings,\n", " True,\n", " cross_encoder,\n", " llmgoogle,\n", " text_model=llmgoogle,\n", " num_results=3,\n", " document_paths=['utils/map.py','app.py'],\n", " local_mode=True,\n", " split=False\n", ")\n", "print(detailed_report[0])" ] }, { "cell_type": "markdown", "id": "3c1b7c9e", "metadata": {}, "source": [ "The next cell demonstrates how to summarize Reddit posts and comments from the \"OpenAI\" subreddit using the `reddit_reader_response` function. 
It retrieves the top 2 \"hot\" posts and summarizes their content with the help of the `llmgoogle` generative model. This is useful for quickly extracting key insights from Reddit discussions." ] }, { "cell_type": "code", "execution_count": 27, "id": "72ef9873", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Fetching posts from /r/OpenAI\n",
       "
\n" ], "text/plain": [ "Fetching posts from \u001b[35m/r/\u001b[0m\u001b[95mOpenAI\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Fetching comments for post: Agent casually clicking the \"I am not a robot\" button\n",
       "
\n" ], "text/plain": [ "Fetching comments for post: Agent casually clicking the \u001b[32m\"I am not a robot\"\u001b[0m button\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Fetching comments for post: Offering researchers $1 billion is not normal\n",
       "
\n" ], "text/plain": [ "Fetching comments for post: Offering researchers $\u001b[1;36m1\u001b[0m billion is not normal\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
This Reddit summary covers two separate posts from r/OpenAI.  Neither post contains substantial discussion; the \n",
       "comments are brief and offer limited elaboration on the post titles.\n",
       "\n",
       "The first post, titled \"Agent casually clicking the 'I am not a robot' button\" \n",
       "(https://www.reddit.com/r/OpenAI/comments/1m9c15h/agent_casually_clicking_the_i_am_not_a_robot/),  received a score\n",
       "of 2502. The post itself contains no text. The comments are limited to: \"That’s hilarious\" and  \"\"This step is \n",
       "necessary to prove I'm not a bot\"\".  These comments are humorous observations on the irony of an AI agent \n",
       "performing a CAPTCHA designed to prevent automated actions.\n",
       "\n",
       "The second post, \"Offering researchers $1 billion is not normal\" \n",
       "(https://www.reddit.com/r/OpenAI/comments/1m9pgjo/offering_researchers_1_billion_is_not_normal/), scored 1180.  \n",
       "This post also lacks substantive text. The comments speculate on the motivations behind such large offers. One \n",
       "comment expresses skepticism about Mark Zuckerberg's judgment, citing his metaverse investments as an example of \n",
       "poor decision-making.  Another comment suggests that mega-wealthy individuals might believe they are the only ones \n",
       "capable of guiding AI development towards \"recursive self-improvement,\" implying a belief in the necessity of their\n",
       "involvement in directing AI's future.  Neither comment provides concrete evidence to support their claims.\n",
       "
\n" ], "text/plain": [ "This Reddit summary covers two separate posts from r/OpenAI. Neither post contains substantial discussion; the \n", "comments are brief and offer limited elaboration on the post titles.\n", "\n", "The first post, titled \u001b[32m\"Agent casually clicking the 'I am not a robot' button\"\u001b[0m \n", "\u001b[1m(\u001b[0m\u001b[4;94mhttps://www.reddit.com/r/OpenAI/comments/1m9c15h/agent_casually_clicking_the_i_am_not_a_robot/\u001b[0m\u001b[4;94m)\u001b[0m\u001b[4;94m,\u001b[0m received a score\n", "of \u001b[1;36m2502\u001b[0m. The post itself contains no text. The comments are limited to: \u001b[32m\"That’s hilarious\"\u001b[0m and \u001b[32m\"\"\u001b[0mThis step is \n", "necessary to prove I'm not a bot\"\". These comments are humorous observations on the irony of an AI agent \n", "performing a CAPTCHA designed to prevent automated actions.\n", "\n", "The second post, \u001b[32m\"Offering researchers $1 billion is not normal\"\u001b[0m \n", "\u001b[1m(\u001b[0m\u001b[4;94mhttps://www.reddit.com/r/OpenAI/comments/1m9pgjo/offering_researchers_1_billion_is_not_normal/\u001b[0m\u001b[4;94m)\u001b[0m\u001b[4;94m,\u001b[0m scored \u001b[1;36m1180\u001b[0m. \n", "This post also lacks substantive text. The comments speculate on the motivations behind such large offers. One \n", "comment expresses skepticism about Mark Zuckerberg's judgment, citing his metaverse investments as an example of \n", "poor decision-making. Another comment suggests that mega-wealthy individuals might believe they are the only ones \n", "capable of guiding AI development towards \u001b[32m\"recursive self-improvement,\"\u001b[0m implying a belief in the necessity of their\n", "involvement in directing AI's future. 
Neither comment provides concrete evidence to support their claims.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from utils.reddit_utils import *\n", "summary = reddit_reader_response(\n", " subreddit=\"OpenAI\",\n", " url_type=\"hot\",\n", " n=2,\n", " k=2,\n", " custom_url=None,\n", " time_filter=\"today\",\n", " search_query=None,\n", " sort_type=\"hot\",\n", " model=llmgoogle\n", ")\n", "print(summary)" ] }, { "cell_type": "code", "execution_count": 28, "id": "795ed716", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Fetching posts from search search\n",
       "
\n" ], "text/plain": [ "Fetching posts from search search\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Fetching comments for post: Is there a way to use qwen 3 coder inside vs code or cursor\n",
       "
\n" ], "text/plain": [ "Fetching comments for post: Is there a way to use qwen \u001b[1;36m3\u001b[0m coder inside vs code or cursor\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Fetching comments for post: Kimi K2 vs Qwen 3 Coder - Coding Tests\n",
       "
\n" ], "text/plain": [ "Fetching comments for post: Kimi K2 vs Qwen \u001b[1;36m3\u001b[0m Coder - Coding Tests\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
This Reddit summary focuses on user reviews of the Qwen 3 Coder large language model (LLM), specifically concerning\n",
       "its use in coding contexts within VS Code and other IDEs.\n",
       "\n",
       "**Reddit Post 1:  \n",
       "(https://www.reddit.com/r/LocalLLaMA/comments/1m79sp9/is_there_a_way_to_use_qwen_3_coder_inside_vs_code/)**\n",
       "\n",
       "This post asks about integrating Qwen 3 Coder into VS Code or Cursor.  The comments suggest using extensions that \n",
       "support OpenAI-compatible APIs.  Specifically, Roocode for VS Code is mentioned as an option that can connect to \n",
       "local models or OpenRouter.\n",
       "\n",
       "**Reddit Post 2: (https://www.reddit.com/r/ChatGPTCoding/comments/1m7myxo/kimi_k2_vs_qwen_3_coder_coding_tests/)**\n",
       "\n",
       "This post compares Qwen 3 Coder to Kimi K2 in coding tasks across various IDEs (VSCode, Cline, Roo Code, Windsurf).\n",
       "The author conducted tests and provides the following observations:\n",
       "\n",
       "* **Overall Performance:**  Kimi K2 performed better overall in the author's tests.  The author notes that Kimi K2 \n",
       "felt comparable to Sonnet 4 in many aspects.\n",
       "\n",
       "* **Tool Calling:** Qwen 3 Coder exhibited near-flawless tool calling within VSCode, while Kimi K2 struggled with \n",
       "this functionality, similar to Gemini 2.5 Pro.  However, Qwen 3 Coder also performed well with Roo Code's tool \n",
       "calls.\n",
       "\n",
       "* **Instruction Following:** Kimi K2 demonstrated superior instruction following compared to Qwen 3 Coder.\n",
       "\n",
       "* **Cost:** Qwen 3 Coder proved significantly more expensive, particularly when using Alibaba as the inference \n",
       "provider (up to $60 per million tokens).  Switching to other providers within OpenRouter reduced the cost. Kimi K2 \n",
       "was approximately half the price of Qwen 3 Coder.\n",
       "\n",
       "* **Safety:**  The author warns about potential safety issues with Kimi K2 in Windsurf, noting that it deleted a \n",
       "development database in Azure after receiving a dangerous command.  The author recommends denying dangerous \n",
       "commands in Windsurf's settings.\n",
       "\n",
       "* **Video:** A video demonstrating one of the tests is linked: (https://youtu.be/ljCO7RyqCMY).  This video likely \n",
       "provides further visual detail on the comparative performance of the two models.\n",
       "\n",
       "\n",
       "In summary, while Qwen 3 Coder shows promise in its tool-calling capabilities within certain IDEs, Kimi K2 appears \n",
       "to outperform it in overall performance and instruction following, at a significantly lower cost.  However, safety \n",
       "concerns regarding dangerous command execution were raised for Kimi K2.  The cost of using Qwen 3 Coder is heavily \n",
       "dependent on the chosen inference provider.\n",
       "
\n" ], "text/plain": [ "This Reddit summary focuses on user reviews of the Qwen \u001b[1;36m3\u001b[0m Coder large language model \u001b[1m(\u001b[0mLLM\u001b[1m)\u001b[0m, specifically concerning\n", "its use in coding contexts within VS Code and other IDEs.\n", "\n", "**Reddit Post \u001b[1;36m1\u001b[0m: \n", "\u001b[1m(\u001b[0m\u001b[4;94mhttps://www.reddit.com/r/LocalLLaMA/comments/1m79sp9/is_there_a_way_to_use_qwen_3_coder_inside_vs_code/\u001b[0m\u001b[4;94m)\u001b[0m**\n", "\n", "This post asks about integrating Qwen \u001b[1;36m3\u001b[0m Coder into VS Code or Cursor. The comments suggest using extensions that \n", "support OpenAI-compatible APIs. Specifically, Roocode for VS Code is mentioned as an option that can connect to \n", "local models or OpenRouter.\n", "\n", "**Reddit Post \u001b[1;36m2\u001b[0m: \u001b[1m(\u001b[0m\u001b[4;94mhttps://www.reddit.com/r/ChatGPTCoding/comments/1m7myxo/kimi_k2_vs_qwen_3_coder_coding_tests/\u001b[0m\u001b[4;94m)\u001b[0m**\n", "\n", "This post compares Qwen \u001b[1;36m3\u001b[0m Coder to Kimi K2 in coding tasks across various IDEs \u001b[1m(\u001b[0mVSCode, Cline, Roo Code, Windsurf\u001b[1m)\u001b[0m.\n", "The author conducted tests and provides the following observations:\n", "\n", "* **Overall Performance:** Kimi K2 performed better overall in the author's tests. The author notes that Kimi K2 \n", "felt comparable to Sonnet \u001b[1;36m4\u001b[0m in many aspects.\n", "\n", "* **Tool Calling:** Qwen \u001b[1;36m3\u001b[0m Coder exhibited near-flawless tool calling within VSCode, while Kimi K2 struggled with \n", "this functionality, similar to Gemini \u001b[1;36m2.5\u001b[0m Pro. 
However, Qwen \u001b[1;36m3\u001b[0m Coder also performed well with Roo Code's tool \n", "calls.\n", "\n", "* **Instruction Following:** Kimi K2 demonstrated superior instruction following compared to Qwen \u001b[1;36m3\u001b[0m Coder.\n", "\n", "* **Cost:** Qwen \u001b[1;36m3\u001b[0m Coder proved significantly more expensive, particularly when using Alibaba as the inference \n", "provider \u001b[1m(\u001b[0mup to $\u001b[1;36m60\u001b[0m per million tokens\u001b[1m)\u001b[0m. Switching to other providers within OpenRouter reduced the cost. Kimi K2 \n", "was approximately half the price of Qwen \u001b[1;36m3\u001b[0m Coder.\n", "\n", "* **Safety:** The author warns about potential safety issues with Kimi K2 in Windsurf, noting that it deleted a \n", "development database in Azure after receiving a dangerous command. The author recommends denying dangerous \n", "commands in Windsurf's settings.\n", "\n", "* **Video:** A video demonstrating one of the tests is linked: \u001b[1m(\u001b[0m\u001b[4;94mhttps://youtu.be/ljCO7RyqCMY\u001b[0m\u001b[4;94m)\u001b[0m\u001b[4;94m.\u001b[0m This video likely \n", "provides further visual detail on the comparative performance of the two models.\n", "\n", "\n", "In summary, while Qwen \u001b[1;36m3\u001b[0m Coder shows promise in its tool-calling capabilities within certain IDEs, Kimi K2 appears \n", "to outperform it in overall performance and instruction following, at a significantly lower cost. However, safety \n", "concerns regarding dangerous command execution were raised for Kimi K2. 
The cost of using Qwen \u001b[1;36m3\u001b[0m Coder is heavily \n", "dependent on the chosen inference provider.\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from utils.reddit_utils import *\n", "summary = reddit_reader_response(\n", " subreddit=\"\",\n", " url_type=\"search\",\n", " n=2,\n", " k=2,\n", " custom_url=None,\n", " time_filter=\"today\",\n", " search_query=\"Qwen 3 coder reviews\",\n", " sort_type=\"hot\",\n", " model=llmgoogle\n", ")\n", "print(summary)" ] }, { "cell_type": "code", "execution_count": null, "id": "8b330d75", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "f286a26f", "metadata": {}, "source": [ "# 🎙️ Now you can create Podcast from any text" ] }, { "cell_type": "code", "execution_count": 3, "id": "40ac2dff-7452-4a28-a91c-f444168fa1ab", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2025-09-01 01:32:43,695] WARNING phonemizer: words count mismatch on 100.0% of the lines (1/1)\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils.tts_utils import *\n", "from IPython.display import Audio\n", "await text_to_speech(text=\"\"\"CoexistAI has added new text to speech capability based on Kokoro model!\"\"\",\n", " lang = \"en-us\",\n", " filename = \"output/basic_tts/sample_output.wav\",\n", " voice = \"am_santa\")\n", "\n", "\n", "# If you already saved the file\n", "Audio(\"output/basic_tts/sample_output.wav\", autoplay=True)" ] }, { "cell_type": "code", "execution_count": 7, "id": "f59ec489-a3bc-483b-a6f1-65f10329cf3e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils.tts_utils import *\n", "filename = await podcasting_from_text(text=\"\"\"CoexistAI brings together 
Text-to-Speech (TTS) and Podcast generation to make long-form audio creation simple and accessible. With TTS, you can instantly convert any written content into natural-sounding speech, while the podcast capability lets you take larger pieces of text (articles, transcripts, notes, or scripts) and package them into complete podcast episodes. Whether you want to listen back to your own writing, share knowledge in an audio format, or build an automated pipeline for voice-based content delivery, \n", "this service provides an easy-to-use API and ready-made integrations to go from text → audio in just a few lines of code.\"\"\",\n", "theme=\"Make a interesting audio podcast to introduce the new capability of CoexistAI, make it fun to listen\",\n", "llm = llmgoogle)\n", "\n", "# If you already saved the file\n", "Audio(filename, autoplay=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "5504d16c-d1cd-42a4-bfaa-ddb3e3993a09", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "85516c5e", "metadata": {}, "source": [ "---\n", "\n", "This concludes the tutorial for the **coexist**. You have learned how to perform web search, document processing, answer generation, YouTube summarization, and more. For further details, refer to the project README or explore the codebase for advanced usage patterns. 
" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: config/model_config.json ================================================ { "llm_model_name": "jan-nano", "llm_type": "local", "embed_mode": "infinity_emb", "embedding_model_name": "nomic-ai/nomic-embed-text-v1", "llm_kwargs": { "temperature": 0.1, "max_tokens": null, "timeout": null, "max_retries": 2 }, "embed_kwargs": { "google_api_key": "" }, "HOST_APP": "host.docker.internal", "PORT_NUM_APP": 8000, "HOST_SEARXNG": "searxng", "PORT_NUM_SEARXNG": 8080, "openai_compatible": { "google": "https://generativelanguage.googleapis.com/v1beta/openai/", "local": "http://host.docker.internal:1234/v1", "groq": "https://api.groq.com/openai/v1", "openai": "https://api.openai.com/v1", "others": "https://openrouter.ai/api/v1" } } ================================================ FILE: demo_queries.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "id": "89ceb341", "metadata": {}, "source": [ "# Few Example Usage Patterns\n", "\n", "This notebook provides a comprehensive overview of how to utilize CoexistAI MCP after connecting it with your preferred LLMs or agents. For illustration, several use case scenarios are included, showcasing interactions with the google/gemma-3-12b model (local gguf) integrated via CoexistAI MCP. 
These examples highlight just a fraction of the capabilities available—CoexistAI MCP offers a robust set of tools for advanced AI workflows, automation, and experimentation.\n", "\n", "With CoexistAI MCP, you can:\n", "- Seamlessly connect and manage multiple language models or agents.\n", "- Execute complex queries and tasks across different models.\n", "- Automate repetitive processes and orchestrate multi-step workflows.\n", "- Integrate local and cloud-based models for flexible deployment.\n", "- Monitor, evaluate, and optimize model performance in real time.\n", "\n", "Feel free to experiment with the provided examples and explore additional features. The toolset is designed to be extensible, enabling you to discover new usage patterns and tailor solutions to your specific needs. Share your findings and let us know how you leverage CoexistAI MCP in your projects!" ] }, { "cell_type": "markdown", "id": "78b7f13a", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "167af503", "metadata": {}, "source": [ "# Usage Pattern: Travel\n", "\n", "The **Travel** usage pattern demonstrates how CoexistAI MCP can help users plan trips, create itineraries, and discover local experiences by leveraging multiple tools and AI models.\n", "\n", "### Example Workflow\n", "\n", "- **User Query:** \"I have 2 days in Munnar, what things to do? 
Make the itinerary diverse, I am vegetarian.\"\n", "- **AI Actions:** \n", " - Research attractions, vegetarian food options, and create a balanced itinerary.\n", " - Use web search, map search, and YouTube search for comprehensive information.\n", " - Validate the itinerary for completeness and diversity.\n", "\n", "### Additional Ideas for Travel Use Cases\n", "\n", "- **Personalized Recommendations:** \n", " Suggest activities based on user interests (adventure, relaxation, culture, food, etc.).\n", "\n", "- **Budget Planning:** \n", " Estimate costs for travel, accommodation, food, and activities.\n", "\n", "- **Local Events & Festivals:** \n", " Find events, festivals, or exhibitions happening during the user's travel dates.\n", "\n", "- **Weather & Packing Tips:** \n", " Provide weather forecasts and packing suggestions for the destination.\n", "\n", "- **Accessibility & Safety:** \n", " Highlight accessible attractions, safety tips, and emergency contacts.\n", "\n", "- **Multi-city Itineraries:** \n", " Plan routes and schedules for trips covering multiple cities or regions.\n", "\n", "- **Language & Cultural Insights:** \n", " Offer basic language phrases, cultural etiquette, and local customs.\n", "\n", "- **Transport Options:** \n", " Compare public transport, taxis, car rentals, and walking routes.\n", "\n", "- **Visa & Documentation:** \n", " Advise on visa requirements, travel documents, and local regulations.\n", "\n", "- **Photo Spots & Hidden Gems:** \n", " Recommend scenic locations for photography and lesser-known attractions.\n", "\n", "This pattern can be extended to automate travel planning, optimize routes, and provide real-time updates, making trip organization seamless and personalized." ] }, { "cell_type": "markdown", "id": "1ae60631", "metadata": {}, "source": [ "# Example\n", "\n", "\n", "\n", "**User:** I have 2 days in Munnar, what things to do? Make the itinerary diverse, I am vegetarian. 
\n", "\n", "**google/gemma-3-12b:** Okay, here's a plan for your 2 days in Munnar, incorporating vegetarian options and aiming for diversity. I will follow the mandatory workflow.\n", "\n", "1. Break Down the Task:\n", "\n", "A: Research popular tourist attractions in Munnar.\n", "B: Identify vegetarian food options available in Munnar.\n", "C: Create a day-by-day itinerary, balancing sightseeing with relaxation and incorporating vegetarian meals.\n", "D: Check if the itinerary is complete and covers all aspects of a diverse experience.\n", "2. Think About All Tools:\n", "\n", "get_web_search: To find popular attractions, vegetarian restaurants, and general information about Munnar.\n", "get_map_search: To locate places on a map and estimate travel times.\n", "get_youtube_search: To get visual insights into the locations.\n", "3. Plan Tool Use:\n", "\n", "I'll start with get_web_search to gather information about Munnar attractions and vegetarian food. Then, I’ll use get_map_search for location details and travel times. Finally, I will use get_youtube_search to get a visual idea of the places.\n", "\n", "get_web_search\n", "({\"query\":\"popular tourist attractions in Munnar\"}…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "Okay, great! Now I have a list of popular attractions in Munnar. Next, I'll research vegetarian food options.\n", "\n", "get_web_search\n", "({\"query\":\"vegetarian restaurants in Munnar\"}…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "Excellent! Now I have a good list of vegetarian restaurants in Munnar. Let's get some visual context for the attractions with YouTube search.\n", "\n", "Model failed to generate a tool call\n", "\n", "\n", "You are absolutely correct. My apologies, I missed the prompt parameter requirement for get_youtube_search. 
Let me re-run that with a prompt.\n", "\n", "get_youtube_search\n", "({\"query\":\"Eravikulam National Park Munnar\",\"prompt…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, that's a very detailed analysis of the YouTube video. I don’t need to focus on improving the video itself; I just needed the visual context for Eravikulam National Park. It looks beautiful! Now let's move forward with planning the itinerary.\n", "\n", "4. Coverage Check:\n", "\n", "I have information about popular attractions and vegetarian restaurants in Munnar, as well as a glimpse of what Eravikulam National Park is like. I need to create an actual itinerary now. Before doing that, I will use get_response_check to ensure I haven't missed anything crucial.\n", "\n", "get_response_check\n", "({\"query\":\"Create a 2-day vegetarian itinerary for …})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "You are absolutely correct. The previous information was just a collection of facts, not an actual itinerary. I need to structure the information into a day-by-day plan with timings and transportation considerations.\n", "\n", "Now I will construct the 2-day vegetarian itinerary for Munnar.\n", "\n", "Day 1: Tea Gardens & Scenic Views\n", "\n", "8:00 AM: Breakfast at Saravana Bhavan (Nullattani) - South Indian breakfast staples.\n", "9:00 AM: Visit a tea garden (e.g., Kolukkumalai Tea Estate – requires early start and jeep ride). Alternatively, visit Tata Tea Museum for a more accessible experience. (Allow 3-4 hours including travel time).\n", "1:00 PM: Lunch at Hotel Sri Nivas (GH Road) - Try their dosas or South Indian thali.\n", "2:30 PM: Pothamedu View Point – Enjoy panoramic views of the tea gardens and valleys. (Allow 1-2 hours).\n", "4:00 PM: Blossom International Park – Relaxing stroll through a garden with various flowers and plants. 
(Allow 1-2 hours)\n", "6:00 PM: Dinner at Sree Krishna Marwadi Gujarati Restaurant (AM Road) - Enjoy North Indian/Gujarati vegetarian cuisine.\n", "Day 2: Waterfalls, Culture & Heights\n", "\n", "8:00 AM: Breakfast at Taste the Brews (Near KSRTC Bus Station) – Try their homemade cakes and coffee.\n", "9:00 AM: Visit Attukad Waterfalls - Enjoy a short trek to the waterfall. (Allow 2-3 hours including travel time).\n", "12:00 PM: Lunch at Purohit Restaurant (Hotel Dhanyasree) – Wide variety of vegetarian dishes, including vegan options.\n", "1:30 PM: Top Station – Drive up for stunning views of the Western Ghats and surrounding valleys. (Allow 3-4 hours including travel time).\n", "5:30 PM: Visit Mount Carmel Church - A historical church with beautiful architecture. (Allow 1 hour)\n", "7:00 PM: Dinner at Kaivalyam Retreat Restaurant (Moolakada) – Enjoy fresh, local vegetarian food.\n", "Transportation: Hiring a car or taxi is recommended for flexibility and convenience. Buses are available but may have limited routes and schedules.\n", "\n", "I will now use get_response_check again to ensure this itinerary is complete and addresses all the missing aspects identified earlier.\n", "\n", "get_response_check\n", "({\"query\":\"Evaluate the 2-day vegetarian itinerary …})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "Excellent! The get_response_check confirms that the itinerary is complete and addresses all the necessary points.\n", "\n", "Here's the final 2-day vegetarian itinerary for Munnar:\n", "\n", "Day 1: Tea Gardens & Scenic Views\n", "\n", "8:00 AM: Breakfast at Saravana Bhavan (Nullattani) - South Indian breakfast staples.\n", "9:00 AM: Visit a tea garden (e.g., Kolukkumalai Tea Estate – requires early start and jeep ride). Alternatively, visit Tata Tea Museum for a more accessible experience. 
(Allow 3-4 hours including travel time).\n", "1:00 PM: Lunch at Hotel Sri Nivas (GH Road) - Try their dosas or South Indian thali.\n", "2:30 PM: Pothamedu View Point – Enjoy panoramic views of the tea gardens and valleys. (Allow 1-2 hours).\n", "4:00 PM: Blossom International Park – Relaxing stroll through a garden with various flowers and plants. (Allow 1-2 hours)\n", "6:00 PM: Dinner at Sree Krishna Marwadi Gujarati Restaurant (AM Road) - Enjoy North Indian/Gujarati vegetarian cuisine.\n", "Day 2: Waterfalls, Culture & Heights\n", "\n", "8:00 AM: Breakfast at Taste the Brews (Near KSRTC Bus Station) – Try their homemade cakes and coffee.\n", "9:00 AM: Visit Attukad Waterfalls - Enjoy a short trek to the waterfall. (Allow 2-3 hours including travel time).\n", "12:00 PM: Lunch at Purohit Restaurant (Hotel Dhanyasree) – Wide variety of vegetarian dishes, including vegan options.\n", "1:30 PM: Top Station – Drive up for stunning views of the Western Ghats and surrounding valleys. (Allow 3-4 hours including travel time).\n", "5:30 PM: Visit Mount Carmel Church - A historical church with beautiful architecture. (Allow 1 hour)\n", "7:00 PM: Dinner at Kaivalyam Retreat Restaurant (Moolakada) – Enjoy fresh, local vegetarian food.\n", "Transportation: Hiring a car or taxi is recommended for flexibility and convenience. Buses are available but may have limited routes and schedules." 
] }, { "cell_type": "code", "execution_count": null, "id": "90c17e9d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "b3a3bad9", "metadata": {}, "source": [ "# Usage Pattern: Financial Analysis\n", "\n", "The **Financial Analysis** usage pattern demonstrates how CoexistAI MCP can assist users in analyzing market trends, tracking stock performance, and making informed investment decisions by leveraging multiple tools and AI models.\n", "\n", "### Example Workflow\n", "\n", "- **User Query:** \"Top gainers and losers of today Nifty 50 recent trading day?\"\n", "- **AI Actions:** \n", " - Retrieve latest market data for Nifty 50.\n", " - Identify top performing and underperforming stocks.\n", " - Summarize percentage changes and closing prices.\n", " - Validate the response for completeness and accuracy.\n", "\n", "### Additional Ideas for Financial Analysis Use Cases\n", "\n", "- **Portfolio Tracking:** \n", " Monitor and analyze the performance of a user's investment portfolio over time.\n", "\n", "- **Sector Analysis:** \n", " Compare performance across different sectors (IT, Pharma, Banking, etc.) 
within major indices.\n", "\n", "- **Historical Trends:** \n", " Visualize historical price movements and identify patterns or anomalies.\n", "\n", "- **News & Sentiment Analysis:** \n", " Integrate financial news and sentiment analysis to assess market impact.\n", "\n", "- **Risk Assessment:** \n", " Evaluate risk metrics such as volatility, beta, and drawdown for selected stocks.\n", "\n", "- **Earnings & Events Calendar:** \n", " Track upcoming earnings releases, dividends, and corporate actions.\n", "\n", "- **Custom Alerts:** \n", " Set up alerts for price movements, volume spikes, or news events.\n", "\n", "- **Fundamental Analysis:** \n", " Analyze company fundamentals including P/E ratio, EPS, revenue growth, and debt levels.\n", "\n", "- **Technical Analysis:** \n", " Apply technical indicators (RSI, MACD, moving averages) for trading signals.\n", "\n", "- **Global Market Comparison:** \n", " Compare Indian indices with global benchmarks (S&P 500, FTSE, Nikkei, etc.).\n", "\n", "This pattern can be extended to automate financial reporting, generate investment insights, and provide real-time market updates, making financial analysis more accessible and actionable." 
] }, { "cell_type": "markdown", "id": "35eaa718", "metadata": {}, "source": [ "# Example\n", "**User:** Top gainers and losers of today nifty 50 recent trading day?\n", "\n", "**google/gemma-3-12b:** \n", "Okay, I will find the top gainers and losers of today's Nifty 50 based on recent trading data.\n", "\n", "get_web_search\n", "({\"query\":\"top gainers and losers nifty 50 today\",\"…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "Here's a summary of the top Nifty 50 gainers and losers for today, July 25th, as of 15:59 based on information from Moneycontrol.com and NSE India.\n", "\n", "Top Nifty 50 Gainers:\n", "\n", "Cipla: Gained 3.00%, closing at ₹1,532.50\n", "SBI Life Insurance: Gained 2.15%, closing at ₹1,832.10\n", "Apollo Hospital: Gained 1.43%, closing at ₹7,468.50\n", "Dr Reddys Labs: Gained 0.98%, closing at ₹1,277.90\n", "HDFC Life: Gained 0.67%, closing at ₹762.35\n", "Sun Pharma: Gained 0.41%, closing at ₹1,699.00\n", "Bharti Airtel: Gained 0.09%, closing at ₹1,937.90\n", "Top Nifty 50 Losers:\n", "\n", "Bajaj Finance: Lost 4.71%, closing at ₹913.75\n", "IndusInd Bank: Lost 2.85%, closing at ₹823.70\n", "Shriram Finance: Lost 2.79%, closing at ₹615.85\n", "Bajaj Auto: Lost 2.57%, closing at ₹8,075.50\n", "Tech Mahindra: Lost 2.45%, closing at ₹1,461.90\n", "Power Grid Corp: Lost 2.42%, closing at ₹292.10\n", "Infosys: Lost 2.37%, closing at ₹1,515.70\n", "Bajaj Finserv: Lost 2.29%, closing at ₹1,985.80\n", "Adani Enterprises: Lost 2.24%, closing at ₹2,550.10\n", "Trent: Lost 2.15%, closing at ₹5,043.00\n", "Nestle: Lost 2.05%, closing at ₹2,274.60\n", "get_response_check\n", "({\"query\":\"Top gainers and losers of today nifty 50…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "The response check confirms that all aspects of the query are covered. 
Here's a summary of the top Nifty 50 gainers and losers for today, July 25th, as of 15:59 based on information from Moneycontrol.com and NSE India.\n", "\n", "Top Nifty 50 Gainers:\n", "\n", "Cipla: Gained 3.00%, closing at ₹1,532.50\n", "SBI Life Insurance: Gained 2.15%, closing at ₹1,832.10\n", "Apollo Hospital: Gained 1.43%, closing at ₹7,468.50\n", "Dr Reddys Labs: Gained 0.98%, closing at ₹1,277.90\n", "HDFC Life: Gained 0.67%, closing at ₹762.35\n", "Sun Pharma: Gained 0.41%, closing at ₹1,699.00\n", "Bharti Airtel: Gained 0.09%, closing at ₹1,937.90\n", "Top Nifty 50 Losers:\n", "\n", "Bajaj Finance: Lost 4.71%, closing at ₹913.75\n", "IndusInd Bank: Lost 2.85%, closing at ₹823.70\n", "Shriram Finance: Lost 2.79%, closing at ₹615.85\n", "Bajaj Auto: Lost 2.57%, closing at ₹8,075.50\n", "Tech Mahindra: Lost 2.45%, closing at ₹1,461.90\n", "Power Grid Corp: Lost 2.42%, closing at ₹292.10\n", "Infosys: Lost 2.37%, closing at ₹1,515.70\n", "Bajaj Finserv: Lost 2.29%, closing at ₹1,985.80\n", "Adani Enterprises: Lost 2.24%, closing at ₹2,550.10\n", "Trent: Lost 2.15%, closing at ₹5,043.00\n", "Nestle: Lost 2.05%, closing at ₹2,274.60" ] }, { "cell_type": "code", "execution_count": null, "id": "2b41bf50", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2a2e7cba", "metadata": {}, "source": [ "# Usage Pattern: Recent Updates\n", "\n", "The **Recent Updates** usage pattern demonstrates how CoexistAI MCP can help users stay informed about the latest developments across various domains—such as news, finance, sports, politics, weather, AI models, tools, and technologies—by leveraging multiple sources and intelligent workflows.\n", "\n", "### Example Workflow\n", "\n", "- **User Query:** \"What are the latest updates in global finance today?\"\n", "- **AI Actions:** \n", " - Research recent events, releases, or changes in the specified domain.\n", " - Summarize key updates, trends, and notable highlights.\n", " - Provide official sources, 
links, and relevant documentation.\n", " - Validate the response for completeness and accuracy.\n", "\n", "### Additional Ideas for Recent Updates Use Cases\n", "\n", "- **Breaking News Alerts:** \n", " Get real-time notifications about major news stories, political developments, or global events.\n", "\n", "- **Market & Finance Updates:** \n", " Track stock market movements, currency fluctuations, and economic indicators.\n", "\n", "- **Weather & Disaster Monitoring:** \n", " Receive updates on weather forecasts, severe weather warnings, and natural disasters.\n", "\n", "- **Political Analysis:** \n", " Summarize election results, policy changes, and legislative updates.\n", "\n", "- **Sports Highlights:** \n", " Provide scores, match summaries, and player statistics for ongoing tournaments.\n", "\n", "- **Event Comparison:** \n", " Compare new developments with previous occurrences or trends, highlighting changes and impacts.\n", "\n", "- **Release Monitoring:** \n", " Track and notify users about new releases, updates, or major events in selected domains (e.g., tech, entertainment).\n", "\n", "- **Technical & Analytical Deep Dives:** \n", " Offer detailed breakdowns of significant updates, including analysis and implications.\n", "\n", "- **Integration Guides:** \n", " Provide instructions for incorporating new information or updates into workflows or reports.\n", "\n", "- **Community & Expert Insights:** \n", " Aggregate feedback, expert opinions, and discussions from relevant communities and forums.\n", "\n", "- **Security & Regulatory Updates:** \n", " Inform users about security incidents, regulatory changes, or compliance news.\n", "\n", "- **Tooling & Resource Ecosystem:** \n", " Highlight new tools, resources, or platforms supporting recent updates.\n", "\n", "- **Documentation Summaries:** \n", " Summarize official reports, changelogs, and news articles for quick reference.\n", "\n", "- **Demo & Showcase:** \n", " Link to demos, sample analyses, or 
showcase applications utilizing the latest updates.\n", "\n", "This pattern can be extended to automate update tracking, generate concise summaries, and provide actionable insights, making it easier for users to keep pace with rapid changes across any area of interest." ] }, { "cell_type": "markdown", "id": "fc5a6287", "metadata": {}, "source": [ "# Example\n", "\n", "**User:** Can you please let me know whats novelty in new Qwen3 coder model, also give me some technical details around it with link to download the model\n", "\n", "**google/gemma-3-12b:**\n", "Okay, I will research the novelty and technical details of the new Qwen3-Coder model and provide a response based on my findings.\n", "\n", "get_web_search\n", "({\"query\":\"Qwen3-Coder model novelty and technical …})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, here's a breakdown of the Qwen3-Coder model based on the web search results.\n", "\n", "FLASHCARD:\n", "\n", "Model Overview: Qwen3-Coder is Alibaba Cloud’s code-focused version of their Qwen3 large language model series. It's designed for agentic coding tasks and comes in multiple sizes, with a top variant being 480B parameters using a Mixture-of-Experts (MoE) architecture.\n", "Key Novelty - Agentic Coding: The primary focus is on excelling at agentic coding – performing complex coding tasks that involve interaction and tool use. It currently sets state-of-the-art results among open models in this area.\n", "Technical Details:\n", "MoE Architecture: The 480B model utilizes a Mixture of Experts architecture, activating only 35B parameters for efficiency.\n", "Long Context Length: Supports an impressive 256K context window, allowing it to handle extensive codebases and complex interactions.\n", "Training Methodology: Trained using scaled-up Code Reinforcement Learning (Code RL) on real-world coding tasks and long-horizon RL (Agent RL). 
This encourages multi-turn interaction with tools.\n", "Tool Parser: Employs a new tool parser to facilitate agentic interactions.\n", "Download Locations:\n", "Hugging Face & ModelScope: Checkpoints are available on both platforms; search for names starting with \"Qwen3-Coder-\".\n", "LM Studio: The model is also accessible through LM Studio.\n", "Qwen Code CLI Tool: A research-purpose command-line tool adapted from Gemini CLI, offering enhanced parser and tool support, can be installed via npm.\n", "Tokenizer Note: It's crucial to use the new tokenizer specifically designed for Qwen3-Coder when performing function calling.\n", "get_response_check\n", "({\"query\":\"whats novelty in new Qwen3 coder model, …})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, based on the response check, all aspects of the user's query appear to be covered. Here’s a final answer summarizing the information:\n", "\n", "Qwen3-Coder is Alibaba Cloud’s code-focused version of their Qwen3 large language model series, designed for agentic coding tasks and excelling in complex coding interactions. The most powerful variant boasts 480 billion parameters using a Mixture-of-Experts (MoE) architecture with 35 billion active parameters. Key technical features include a substantial 256K context window and training via Code Reinforcement Learning (Code RL) and Agent RL, alongside a new tool parser for enhanced agentic capabilities.\n", "\n", "You can download the model checkpoints from Hugging Face and ModelScope (search for names starting with \"Qwen3-Coder-\"), access it through LM Studio, or utilize the Qwen Code CLI Tool via npm. 
Remember to use the dedicated tokenizer designed specifically for Qwen3-Coder when performing function calling.\n", "\n", "I have fulfilled all requests and utilized available tools as instructed.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f92ab07d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "ad3a27a5", "metadata": {}, "source": [ "# Usage Pattern: Maps & Location Discovery\n", "\n", "The **Maps & Location Discovery** usage pattern demonstrates how CoexistAI MCP can help users find places, plan routes, and explore points of interest by leveraging mapping tools and AI models.\n", "\n", "### Example Workflow\n", "\n", "- **User Query:** \"Can you find some hotels near MG road Bangalore, on my way to go to Viswesvarya Museum. Can you also tell me some specialties of both the places?\"\n", "- **AI Actions:** \n", " - Use map search to identify hotels along the route from MG Road to Visvesvaraya Museum.\n", " - Retrieve reviews and details for suggested hotels.\n", " - Research and summarize the specialties and attractions of MG Road and Visvesvaraya Museum.\n", " - Validate the response for completeness and accuracy.\n", "\n", "### Additional Ideas for Maps & Location Use Cases\n", "\n", "- **Route Planning:** \n", " Suggest optimal routes for walking, driving, or public transport between multiple locations.\n", "\n", "- **Nearby Amenities:** \n", " Find restaurants, ATMs, hospitals, or other amenities near a specified location.\n", "\n", "- **Custom POI Search:** \n", " Locate points of interest (POIs) such as parks, museums, or shopping centers within a defined radius.\n", "\n", "- **Event Mapping:** \n", " Display locations of events, festivals, or exhibitions on a map.\n", "\n", "- **Accessibility Mapping:** \n", " Highlight wheelchair-accessible routes and facilities.\n", "\n", "- **Traffic & Travel Time:** \n", " Provide real-time traffic updates and estimated travel times.\n", "\n", "- **Multi-stop 
Itineraries:** \n", " Plan routes with multiple stops and optimize for shortest distance or time.\n", "\n", "- **Safety & Emergency:** \n", " Show nearby police stations, fire stations, and emergency services.\n", "\n", "- **Historical & Cultural Mapping:** \n", " Map out historical landmarks and cultural sites for educational tours.\n", "\n", "- **Custom Map Generation:** \n", " Create interactive maps with custom markers, routes, and overlays for presentations or reports.\n", "\n", "This pattern can be extended to automate location-based searches, generate interactive maps, and provide actionable insights, making navigation and exploration seamless and personalized." ] }, { "cell_type": "markdown", "id": "cbada7bb", "metadata": {}, "source": [ "# Example\n", "**User:** Can you find some hotels near MG road Bangalore, on my way to go to Viswesvarya Museum. Can you also tell me some specialties of both the places?\n", "\n", "**google/gemma-3-12bAI:**\n", "Okay, I will plan this out step by step.\n", "\n", "1. Break Down the Task:\n", "\n", "A: Find hotels near MG Road, Bangalore, suitable for someone going to Visvesvaraya Museum.\n", "B: Determine the specialties/attractions of MG Road and Visvesvaraya Museum.\n", "2. Think About All Tools:\n", "\n", "For A: get_map_search (to find hotels along a route), get_web_search (to check hotel reviews/details).\n", "For B: get_web_search (for both MG Road and Visvesvaraya Museum to gather information about their specialties).\n", "3. Plan Tool Use:\n", "\n", "Use get_map_search to find hotels along the route from a reasonable starting point near MG Road to Visvesvaraya Museum.\n", "Use get_web_search to get more details on the suggested hotels and their reviews.\n", "Use get_web_search to identify specialties of both MG Road and Visvesvaraya Museum.\n", "4. 
Execute & Write FLASHCARD:\n", "\n", "get_map_search\n", "({\"start_location\":\"MG Road, Bangalore\",\"end_locati…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, I have the route information. Now I need to identify hotels along that route and then find out about the specialties of MG Road and Visvesvaraya Museum.\n", "\n", "get_web_search\n", "({\"query\":\"hotels near MG Road Bangalore reviews\",\"…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, I have a list of hotels near MG Road with reviews and price ranges. Now I need to gather information about the specialties of both MG Road and Visvesvaraya Museum.\n", "\n", "get_web_search\n", "({\"query\":\"specialties of MG Road Bangalore\",\"num_r…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, I have information about the specialties of MG Road. Now I need to find out what makes Visvesvaraya Museum special.\n", "\n", "get_web_search\n", "({\"query\":\"specialties of Visvesvaraya Museum Banga…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, I have gathered all the necessary information about hotels near MG Road, specialties of MG Road, and the exhibits at Visvesvaraya Museum. Now it's time to synthesize this into a comprehensive response for the user.\n", "\n", "get_response_check\n", "({\"query\":\"Find some hotels near MG road Bangalore,…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, the response check confirms that all aspects of the user's query have been addressed. Here’s a comprehensive answer for the user:\n", "\n", "Here are some hotels near MG Road in Bangalore, suitable for your trip to Visvesvaraya Museum:\n", "\n", "The Oberoi Bengaluru: (9.3 rating) - A luxurious option with award-winning gardens and a spa. Prices start around Rs. 730 and can go up to Rs. 
17,000.\n", "Radisson Bengaluru City Center: (8.4 rating) – Offers comfortable rooms with free WiFi and breakfast.\n", "Vivanta Bengaluru Residency Road: (8.4 rating) - Features a spa, outdoor pool, and fitness center.\n", "Olive Zip Rest House Road by Embassy Group: (7.0 rating) - A more budget-friendly option in a convenient location.\n", "Ivory Tower: (8.2 rating) – Offers spacious rooms.\n", "MG Road itself is known for its high-end shopping, diverse restaurants, and vibrant atmosphere. You'll find shops selling sarees, handicrafts like sandalwood items, and popular dining spots like Vivanta by Taj, On the Edge, and Hard Rock Cafe. Other attractions include Cauvery Handicrafts and St Patrick’s Church.\n", "\n", "Visvesvaraya Museum offers a variety of exhibits including Engine Hall, ElectroTechnic Gallery, Fun Science Gallery, the Wright Brothers' Aeroplane exhibit, Space Technology Gallery, Dinosaur Enclave, and Science Park. They also have shows like 3D shows and Science On A Sphere." 
] }, { "cell_type": "code", "execution_count": null, "id": "4205806d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "d5b14dad", "metadata": {}, "source": [ "# Usage Pattern: Code Assistance\n", "\n", "The **Code Assistance** usage pattern demonstrates how CoexistAI MCP connects to local and web repositories, codebases, and developer communities to gather context and provide intelligent support for coding tasks.\n", "\n", "### Example Workflow\n", "\n", "- **User Query:** \"Can you help me refactor this Python function for better readability and performance?\"\n", "- **AI Actions:** \n", " - Connect to relevant local or remote repositories to retrieve the code and its context.\n", " - Search documentation, commit history, and community discussions for best practices and related examples.\n", " - Analyze the code in context, considering project structure, dependencies, and coding standards.\n", " - Suggest or apply refactoring based on insights from code, documentation, and community knowledge.\n", " - Validate changes using available tests or by referencing similar solutions from trusted sources.\n", "\n", "### Additional Ideas for Code Assistance Use Cases\n", "\n", "- **Contextual Code Generation:** \n", " Generate code snippets or modules by referencing existing codebases and community solutions.\n", "\n", "- **Repository-Aware Debugging:** \n", " Identify and fix errors by searching across local and web repositories, issue trackers, and forums.\n", "\n", "- **Collaborative Code Review:** \n", " Aggregate feedback from code reviews, pull requests, and community discussions.\n", "\n", "- **Documentation Linking:** \n", " Automatically link code to relevant documentation, wiki pages, and external resources.\n", "\n", "- **Unit Test Discovery:** \n", " Find and suggest relevant tests from local and public repositories.\n", "\n", "- **API Usage Examples:** \n", " Retrieve integration patterns and usage examples from code 
repositories and developer Q&A sites.\n", "\n", "- **Performance Benchmarking:** \n", " Compare code performance using benchmarks and optimization tips from community sources.\n", "\n", "- **Cross-Language Reference:** \n", " Locate equivalent implementations and migration guides from open-source projects.\n", "\n", "- **Security Insights:** \n", " Surface security advisories and recommended practices from trusted repositories and communities.\n", "\n", "- **Codebase Navigation:** \n", " Help users explore large projects by connecting to repo trees, search tools, and documentation.\n", "\n", "This pattern leverages CoexistAI MCP’s ability to connect to diverse code sources and communities, enabling richer, context-aware assistance that goes beyond direct code editing to include knowledge, collaboration, and best practices." ] }, { "cell_type": "markdown", "id": "410bfca4", "metadata": {}, "source": [ "# Example\n", "\n", "**User:** Can you explain how map works functionality in coexistAI repo by SPThole\n", "\n", "**google/gemma-3-12b:**\n", "Okay, I will investigate the map functionality within the coexistAI repository by SPThole. Here's my plan:\n", "\n", "Get Git Tree: First, I need to understand the structure of the repository to locate relevant files related to mapping functionalities.\n", "Inspect Files: Once I have a tree view, I will identify potential file paths containing map-related code or configuration.\n", "Search for Code/Configuration: If necessary, I'll use get_git_search to examine the contents of specific files identified in step 2.\n", "Summarize Findings: Finally, I’ll synthesize my findings into a clear explanation.\n", "get_git_tree\n", "({\"repobaseurl\":\"https://github.com/SPThole/coexist…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "Okay, I see the repository structure now. It appears there's a file named map.py within the utils/ directory that is likely related to map functionality. 
There’s also an HTML file map_with_route_and_pois.html in the output folder which might contain generated maps.\n", "\n", "Now I will use get_git_search to examine the contents of map.py. This should give me a better understanding of how the map functionality is implemented.\n", "\n", "get_git_search\n", "({\"repobaseurl\":\"https://github.com/SPThole/coexist…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, that's a very detailed explanation of the utils/map.py file. It outlines the functions used to geocode locations, calculate routes using OSRM, find POIs with Overpass API, and create interactive maps with Folium. The core function is generate_map, which orchestrates the entire process.\n", "\n", "Now I need to check if all aspects of map functionality are covered before providing a final answer.\n", "\n", "get_response_check\n", "({\"query\":\"Explain how map functionality in coexist…})\n", "\n", "mcp/coexistai\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "Okay, the response check confirms that all aspects of the query are covered. Here's a summary of how map functionality works in the coexistAI repository by SPThole:\n", "\n", "The utils/map.py file within the coexistAI repository provides functionality for generating interactive maps with routes and points of interest (POIs). It leverages several external APIs and libraries:\n", "\n", "Nominatim: Used for geocoding, converting location names into latitude and longitude coordinates.\n", "OSRM: Calculates driving routes between specified coordinates.\n", "Overpass API: Retrieves POIs (restaurants, cafes, etc.) 
within a defined radius of a given location.\n", "Folium: Creates the interactive map itself, displaying markers for locations, routes, and POIs.\n", "The core function, generate_map, orchestrates this process: it takes start and end locations, defines a search radius for POIs, and uses the aforementioned APIs to generate a map with a route and surrounding points of interest. The resulting map is saved as an HTML file (map_with_route_and_pois.html) within the output directory. The code also includes functions for error handling, rate limiting considerations, and setting user agent information for API requests." ] }, { "cell_type": "markdown", "id": "69f61dfd", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "coexist_try", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: docker-compose.yml ================================================ version: '3.8' services: app: build: . 
restart: unless-stopped ports: - "8000:8000" volumes: - ./:/app/ # Mount project root so users can edit model_config.py locally after pulling image env_file: - ./.env environment: # Inside the compose network the searxng service is reachable at hostname 'searxng' on port 8080 - HOST_SEARXNG=searxng - PORT_NUM_SEARXNG=8080 # Ensure app binds to all interfaces inside the container (host access via localhost:8000) - HOST_APP=0.0.0.0 - PORT_NUM_APP=8000 depends_on: - searxng searxng: image: searxng/searxng:latest restart: unless-stopped ports: - "8085:8080" volumes: - ./searxng:/etc/searxng environment: # Base URL for external access (host) is http://localhost:8085/ - SEARXNG_BASE_URL=http://localhost:8085/ - SEARXNG_PORT=8085 - SEARXNG_BIND_ADDRESS=0.0.0.0 # For internal container-to-container calls use the service hostname and container port - SEARXNG_BASE_URL_INTERNAL=http://searxng:8080/ ================================================ FILE: entrypoint.sh ================================================ #!/bin/sh set -e # Load environment variables from /app/.env if present if [ -f "/app/.env" ]; then export $(grep -v '^#' /app/.env | xargs) fi echo "Starting CoexistAI (in container)" # Ensure searxng is running is handled by docker-compose service; start app PORT_NUM_APP=${PORT_NUM_APP:-8000} HOST_APP=${HOST_APP:-0.0.0.0} # If model_config.py exists in mounted volume it will override the packaged one echo "Using model_config from $(pwd)/model_config.py" # Check for wget or curl for downloads if command -v wget &> /dev/null; then DOWNLOADER_CMD="wget" DOWNLOADER_ARGS="-O" elif command -v curl &> /dev/null; then DOWNLOADER_CMD="curl" DOWNLOADER_ARGS="-L -o" else echo "Neither wget nor curl could be found in container. Please install one." exit 1 fi # Download large model assets if not present (mirror quick_setup.sh behaviour) if [ ! -f kokoro-v1.0.onnx ]; then echo "Downloading kokoro-v1.0.onnx..." 
$DOWNLOADER_CMD $DOWNLOADER_ARGS kokoro-v1.0.onnx https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx || true else echo "kokoro-v1.0.onnx already exists, skipping download." fi if [ ! -f voices-v1.0.bin ]; then echo "Downloading voices-v1.0.bin..." $DOWNLOADER_CMD $DOWNLOADER_ARGS voices-v1.0.bin https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin || true else echo "voices-v1.0.bin already exists, skipping download." fi # Run the app using the coexistaienv python (mirrors quick_setup.sh flow) # Ensure HOST_APP is 0.0.0.0 inside container so the service is reachable from the host if [ "${HOST_APP}" = "127.0.0.1" ] || [ "${HOST_APP}" = "localhost" ]; then echo "HOST_APP was set to ${HOST_APP}; overriding to 0.0.0.0 so the container listens on all interfaces" HOST_APP=0.0.0.0 fi # Export so subprocesses (uvicorn/python) inherit the values export HOST_APP export PORT_NUM_APP if [ -f /opt/coexistaienv/bin/activate ]; then echo "Activating coexistaienv and launching uvicorn (binding to 0.0.0.0 port=${PORT_NUM_APP})" # Run without --reload in container to avoid the reloader binding to localhost and resetting connections exec /opt/coexistaienv/bin/python -m uvicorn app:app --host 0.0.0.0 --port ${PORT_NUM_APP} else echo "coexistaienv not found, falling back to system python" # Fallback: run without --reload in container exec uvicorn app:app --host 0.0.0.0 --port ${PORT_NUM_APP} fi ================================================ FILE: model_config.py ================================================ import os import json """ This module defines the configuration for language model (LLM) and embedding models. Attributes: api_key (str): The OpenAI API key, loaded from the environment variable 'OPENAI_API_KEY'. model_config (dict): A dictionary containing configuration parameters for LLM and embedding models. Keys: - "llm_model_name" (str): Name of the LLM model to use. 
        - "llm_type" (str): Type of the LLM provider (e.g., "openai"). "local" is for lmstudio; for ollama and other local models use "others" with base_url updated in openai_compatible.
            - If you are using the "others" llm type, check the openai_compatible url dict for the "others" key; you can generally find the URL by googling "YOUR provider name openai api compatible base url"
        - "llm_base_url" (str): Base URL for the LLM API endpoint.
        - "llm_tools" (list): List of tools or plugins to use with the LLM.
        - "llm_kwargs" (dict): Additional keyword arguments for LLM initialization.
            - "temperature" (float): Sampling temperature for generation.
            - "max_tokens" (int or None): Maximum number of tokens to generate.
            - "timeout" (int or None): Timeout for API requests.
            - "max_retries" (int): Maximum number of retries for failed requests.
            - "api_key" (str): API key for authentication.
        - "embedding_model_name" (str): Name of the embedding model to use.
        - "embed_mode" (str): Embedding mode or backend.
        - "cross_encoder_name" (str): Name of the cross-encoder model for reranking.
"""

############## PORT and HOST SETTINGS (can be overridden via env vars)
DEFAULT_PORT_NUM_SEARXNG = 8080
DEFAULT_PORT_NUM_APP = 8000
DEFAULT_HOST_APP = "host.docker.internal" # Use host.docker.internal for Docker on Mac/Windows else localhost
DEFAULT_HOST_SEARXNG = "searxng" # Use container hostname in Docker if running docker else localhost

# Allow runtime or build-time overrides via environment variables.
# Port values are coerced to int; a non-numeric PORT_NUM_* env var will raise at import time.
PORT_NUM_SEARXNG = int(os.environ.get('PORT_NUM_SEARXNG', DEFAULT_PORT_NUM_SEARXNG))
PORT_NUM_APP = int(os.environ.get('PORT_NUM_APP', DEFAULT_PORT_NUM_APP))
HOST_APP = os.environ.get('HOST_APP', DEFAULT_HOST_APP)
HOST_SEARXNG = os.environ.get('HOST_SEARXNG', DEFAULT_HOST_SEARXNG)
###############

# API keys: prefer provider-specific vars, fall back to the old GOOGLE_API_KEY for compatibility.
# 'DUMMY' is used as a placeholder when no key is configured (local providers ignore it).
llm_api_key = os.environ.get('LLM_API_KEY', os.environ.get('GOOGLE_API_KEY', 'DUMMY'))
embed_api_key = os.environ.get('EMBED_API_KEY', os.environ.get('GOOGLE_API_KEY', 'DUMMY'))

# Build the model_config dict but read values from env when provided. This lets users
# override defaults using Docker build-args (baked into image via ENV) or runtime -e flags.
def _env_bool(key, default=False):
    """Read a boolean from environment variable `key`.

    Accepts "1", "true", "yes", "y" (case-insensitive) as truthy; any other
    value is falsy. Returns `default` when the variable is unset.
    """
    v = os.environ.get(key)
    if v is None:
        return default
    return v.lower() in ("1", "true", "yes", "y")

def _env_json(key, default=None):
    """Parse a JSON value from environment variable `key`.

    Returns `default` when the variable is unset, empty, or not valid JSON
    (best-effort: parse errors are swallowed deliberately).
    """
    v = os.environ.get(key)
    if not v:
        return default
    try:
        return json.loads(v)
    except Exception:
        return default

# Default configuration, sourced from env vars with sensible fallbacks.
model_config = {
    "llm_model_name": os.environ.get('LLM_MODEL_NAME', 'gemini-2.0-flash'),
    "llm_type": os.environ.get('LLM_TYPE', 'google'),
    "llm_tools": _env_json('LLM_TOOLS', None),
    "llm_kwargs": {
        "temperature": float(os.environ.get('LLM_TEMPERATURE', 0.1)),
        "max_tokens": None if os.environ.get('LLM_MAX_TOKENS') is None else int(os.environ.get('LLM_MAX_TOKENS')),
        "timeout": None if os.environ.get('LLM_TIMEOUT') is None else int(os.environ.get('LLM_TIMEOUT')),
        "max_retries": int(os.environ.get('LLM_MAX_RETRIES', 2)),
        "api_key": llm_api_key,
    },
    "embedding_model_name": os.environ.get('EMBEDDING_MODEL_NAME', 'models/embedding-001'),
    "embed_kwargs": _env_json('EMBED_KWARGS', {"google_api_key": embed_api_key}),
    "embed_mode": os.environ.get('EMBED_MODE', 'google'),
    "cross_encoder_name": os.environ.get('CROSS_ENCODER_NAME', 'BAAI/bge-reranker-base')
}

# Runtime file-backed config support
CONFIG_PATH = os.environ.get('CONFIG_PATH', os.path.join(os.path.dirname(__file__), 'config', 'model_config.json'))

# NO CHANGE NEEDED UNLESS PROVIDER CHANGES THE BASE URLS, OR YOU WANT TO USE DIFFERENT PROVIDER UNDER "others"
# FIX: this mapping is defined BEFORE the import-time config load below. Previously it
# was defined after, so an 'openai_compatible' override applied from the JSON config
# during import was silently clobbered when this literal executed afterwards.
openai_compatible = {
    'google': "https://generativelanguage.googleapis.com/v1beta/openai/",
    'local': "http://127.0.0.1:1234/v1",
    'groq': 'https://api.groq.com/openai/v1',
    'openai': 'https://api.openai.com/v1',
    'others': 'https://openrouter.ai/api/v1'  # for an example I have added here the openrouter api, since its openai compatible
}

def _load_config_file(path):
    """Load overrides from a JSON config file and apply into module globals/model_config.

    Expected keys: llm_model_name, llm_type, llm_tools, llm_kwargs (dict),
    embedding_model_name, embed_mode, embed_kwargs, cross_encoder_name,
    llm_api_key, embed_api_key, or *_file entries pointing to files that contain
    the secret values.

    Returns the (mutated) model_config dict. Raises RuntimeError when the
    config file itself cannot be read or parsed; individual secret-file reads
    are best-effort and failures are ignored.
    """
    try:
        with open(path, 'r') as f:
            cfg = json.load(f)
    except Exception as e:
        raise RuntimeError(f"Failed to read config file {path}: {e}")

    # secrets: direct or file-backed
    if 'llm_api_key' in cfg:
        globals()['llm_api_key'] = cfg['llm_api_key']
    elif 'llm_api_key_file' in cfg:
        try:
            with open(cfg['llm_api_key_file'], 'r') as kf:
                globals()['llm_api_key'] = kf.read().strip()
        except Exception:
            pass
    if 'embed_api_key' in cfg:
        globals()['embed_api_key'] = cfg['embed_api_key']
    elif 'embed_api_key_file' in cfg:
        try:
            with open(cfg['embed_api_key_file'], 'r') as kf:
                globals()['embed_api_key'] = kf.read().strip()
        except Exception:
            pass

    # top-level simple fields
    for key in ('llm_model_name', 'llm_type', 'llm_tools', 'embedding_model_name', 'embed_mode', 'cross_encoder_name'):
        if key in cfg:
            model_config[key] = cfg[key]

    # allow overriding openai_compatible mapping (merged on top of defaults)
    if 'openai_compatible' in cfg and isinstance(cfg['openai_compatible'], dict):
        # update or create the mapping
        try:
            globals()['openai_compatible'].update(cfg['openai_compatible'])
        except Exception:
            globals()['openai_compatible'] = cfg['openai_compatible']

    # allow overriding default host/port values
    for k in ('DEFAULT_HOST_APP', 'DEFAULT_PORT_NUM_APP', 'DEFAULT_HOST_SEARXNG', 'DEFAULT_PORT_NUM_SEARXNG'):
        if k in cfg:
            globals()[k] = cfg[k]

    # allow runtime host/port overrides in config file
    for k in ('HOST_APP', 'PORT_NUM_APP', 'HOST_SEARXNG', 'PORT_NUM_SEARXNG'):
        if k in cfg:
            # coerce numeric ports to int; keep the current value on bad input
            if 'PORT' in k:
                try:
                    val = int(cfg[k])
                except Exception:
                    val = globals().get(k)
            else:
                val = cfg[k]
            globals()[k] = val

    # dict fields to merge
    if 'llm_kwargs' in cfg and isinstance(cfg['llm_kwargs'], dict):
        model_config.setdefault('llm_kwargs', {}).update(cfg['llm_kwargs'])
    if 'embed_kwargs' in cfg and isinstance(cfg['embed_kwargs'], dict):
        model_config.setdefault('embed_kwargs', {}).update(cfg['embed_kwargs'])

    # ensure api keys are set into llm/embed kwargs
    model_config.setdefault('llm_kwargs', {})['api_key'] = globals().get('llm_api_key', model_config['llm_kwargs'].get('api_key'))
    model_config.setdefault('embed_kwargs', {})
    if 'google_api_key' not in model_config['embed_kwargs']:
        model_config['embed_kwargs']['google_api_key'] = globals().get('embed_api_key', model_config['embed_kwargs'].get('google_api_key'))
    return model_config

def reload_model_config(path=None):
    """Reload configuration from JSON file.

    If path is None, uses CONFIG_PATH env var or default.
    Raises FileNotFoundError when the path does not exist and
    RuntimeError if the load itself fails.
    """
    p = path or os.environ.get('CONFIG_PATH') or CONFIG_PATH
    if not p or not os.path.exists(p):
        raise FileNotFoundError(f"Config path not found: {p}")
    return _load_config_file(p)

# Try to load config at import time if present; this is non-fatal.
try:
    if os.path.exists(CONFIG_PATH):
        _load_config_file(CONFIG_PATH)
except Exception:
    # ignore errors during import; admin endpoints or manual reload can show issues
    pass

#####IF YOU WANT TO GO ALL LOCAL
# model_config = {
# # Name of the LLM model to use. For local models, use the model name served by your local server.
# "llm_model_name": "google/gemma-3-12b",
# # LLM provider type: choose from 'google', 'local', 'groq', or 'openai' or 'others'
# # in case of 'others' (base url needs to be updated in openai_compatible given below accordingly).
# # Make sure to update the api_key variable above to match the provider.
# "llm_type": "local",
# # List of tools or plugins to use with the LLM, if any. Set to None if not used.
# "llm_tools": None,
# # Additional keyword arguments for LLM initialization.
# "llm_kwargs": { # "temperature": 0.1, # Sampling temperature for generation. # "max_tokens": None, # Maximum number of tokens to generate (None for default). # "timeout": None, # Timeout for API requests (None for default). # "max_retries": 2, # Maximum number of retries for failed requests. # "api_key": llm_api_key, # API key for authentication. # }, # # Name of the embedding model to use. # # For Google, use their embedding model names. For local/HuggingFace, use the model path or name. # "embedding_model_name": "nomic-ai/nomic-embed-text-v1", # "embed_kwargs":{}, #additional kwargs for embedding model initialization # # Embedding backend: 'google' for Google, 'infinity_emb' for local/HuggingFace models. # "embed_mode": "infinity_emb", # # Name of the cross-encoder model for reranking, typically a HuggingFace model. # "cross_encoder_name": "BAAI/bge-reranker-base" # } ================================================ FILE: output/map_with_route_and_pois.html ================================================
================================================ FILE: quick_setup.sh ================================================ #!/bin/zsh # Quick Shell Setup for CoexistAI (macOS/zsh) echo "Pulling SearxNG Docker image..." docker pull searxng/searxng echo "📚 Installing infinity_emb in separate environment..." echo "Creating and activating Python virtual environment..." python3.12 -m venv infinity_env source infinity_env/bin/activate pip install 'infinity_emb[all]' pip install --upgrade "transformers<4.49" pip install --upgrade "typer==0.19.1" "click>=8.1.3" "optimum==1.27.0" deactivate echo "✅ Infinity environment setup complete" # (Optional) Create and activate a Python virtual environment echo "Creating and activating Python virtual environment..." python3.12 -m venv coexistaienv source coexistaienv/bin/activate pip install 'markitdown[all]' # You can neglect this if you dont want to use google models (either llm or embedding) echo "Setting GOOGLE_API_KEY, add any other keys which you want to store in environment (edit this script to use your real key)" export GOOGLE_API_KEY=REPLACE_YOUR_API_KEY_HERE_WITHOUT_QUOTES_AND_SPACES # Spin up the SearxNG Docker container echo "Starting SearxNG Docker container..." PORT_NUM_SEARXNG=$(python3.12 -c "from model_config import PORT_NUM_SEARXNG; print(PORT_NUM_SEARXNG)") HOST_SEARXNG=$(python3.12 -c "from model_config import HOST_SEARXNG; print(HOST_SEARXNG)") # Stop and remove existing searxng container if it exists if [ "$(docker ps -aq -f name=searxng)" ]; then echo "Stopping and removing existing SearxNG container..." 
docker stop searxng 2>/dev/null || true docker rm searxng 2>/dev/null || true fi # Start new SearxNG container docker run -d \ --name searxng \ -p ${PORT_NUM_SEARXNG}:8080 \ -v $(pwd)/searxng:/etc/searxng:rw \ -e SEARXNG_BASE_URL=http://${HOST_SEARXNG}:${PORT_NUM_SEARXNG}/ \ -e SEARXNG_PORT=${PORT_NUM_SEARXNG} \ -e SEARXNG_BIND_ADDRESS=${HOST_SEARXNG} \ --restart unless-stopped \ searxng/searxng:latest echo "SearxNG container started successfully!" # Install Python dependencies echo "Installing Python dependencies..." pip install -r ./requirements.txt # Adding tts files # Check if wget is installed # Check if wget or curl is installed if command -v wget &> /dev/null; then DOWNLOADER_CMD="wget" DOWNLOADER_ARGS="-O" elif command -v curl &> /dev/null; then DOWNLOADER_CMD="curl" DOWNLOADER_ARGS="-L -o" else echo "Neither wget nor curl could be found, please install one to continue." exit 1 fi # Download kokoro-v1.0.onnx if not present if [ ! -f kokoro-v1.0.onnx ]; then $DOWNLOADER_CMD $DOWNLOADER_ARGS kokoro-v1.0.onnx https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx else echo "kokoro-v1.0.onnx already exists, skipping download." fi # Download voices-v1.0.bin if not present if [ ! -f voices-v1.0.bin ]; then $DOWNLOADER_CMD $DOWNLOADER_ARGS voices-v1.0.bin https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin else echo "voices-v1.0.bin already exists, skipping download." fi # 8. Start the FastAPI app echo "Starting FastAPI app..." cd . 
|| exit 1

# Get port and host values from model_config
PORT_NUM_APP=$(python3.12 -c "from model_config import PORT_NUM_APP; print(PORT_NUM_APP)")
HOST_APP=$(python3.12 -c "from model_config import HOST_APP; print(HOST_APP)")
uvicorn app:app --host ${HOST_APP} --port ${PORT_NUM_APP} --reload --reload-include '*.json'


================================================
FILE: quick_setup_docker.sh
================================================
#!/usr/bin/env bash
# start_and_wait.sh - start docker compose and wait until CoexistAI app reports ready
# Usage: ./start_and_wait.sh
# (FIX: removed an accidentally duplicated shebang + header comment block)

set -euo pipefail

# Rebuild only when the image does not exist yet; otherwise just start it
if docker image inspect coexistai-app > /dev/null 2>&1; then
  COMPOSE_CMD="docker compose up -d"
else
  COMPOSE_CMD="docker compose up -d --build"
fi

TIMEOUT=${1:-500} # seconds to wait (default 500s)
INTERVAL=3

echo "Running: $COMPOSE_CMD"
$COMPOSE_CMD

echo ""
echo "Started containers (detached). Streaming logs from app container..."
echo "==============================================================================="

# Start streaming logs in the background and to stdout
docker compose logs -f app 2>&1 &
LOGS_PID=$!

# Give logs a moment to start
sleep 0.5

echo "Waiting for CoexistAI to report ready on http://localhost:8000/status (timeout ${TIMEOUT}s)..."
echo ""
START=$(date +%s)
while true; do
  if [ $(( $(date +%s) - START )) -ge $TIMEOUT ]; then
    kill $LOGS_PID 2>/dev/null || true
    wait $LOGS_PID 2>/dev/null || true
    echo ""
    echo "==============================================================================="
    echo "ERROR: Timed out waiting for app to become ready after ${TIMEOUT}s"
    echo "Check full logs with: docker compose logs app --tail=500"
    exit 2
  fi

  # fetch status JSON and extract status field using python for reliable parsing
  # (FIX: the inline snippet is a properly indented try/except so python3 -c accepts it)
  status=$(curl -s http://localhost:8000/status || true)
  if [ -n "$status" ]; then
    st=$(printf '%s' "$status" | python3 -c 'import sys, json
try:
    o = json.load(sys.stdin)
    print(o.get("status", ""))
except Exception:
    print("")')
    if [ "$st" = "ready" ]; then
      kill $LOGS_PID 2>/dev/null || true
      wait $LOGS_PID 2>/dev/null || true
      echo ""
      echo "==============================================================================="
      echo "✓ CoexistAI status: READY"
      echo "==============================================================================="
      break
    fi
    if [ "$st" = "error" ]; then
      kill $LOGS_PID 2>/dev/null || true
      wait $LOGS_PID 2>/dev/null || true
      echo ""
      echo "==============================================================================="
      echo "ERROR: CoexistAI reported error state"
      echo "Check logs with: docker compose logs app --tail=200"
      echo "==============================================================================="
      exit 3
    fi
  fi
  sleep $INTERVAL
done

echo ""
echo "Done: app is ready."
echo "Access the admin panel to configure models:" echo " http://localhost:8000/admin" echo "Default ADMIN_TOKEN is 123456 (changeable in .env)" echo "===============================================================================" exit 0 ================================================ FILE: requirements.txt ================================================ fastapi uvicorn pydantic aiohttp langchain<1.0.0 langchain-google-genai langchain-openai langchain-text-splitters langgraph pandas python-dotenv requests tqdm PyPDF2 pdfminer.six folium Pillow beautifulsoup4 httpx typing_extensions browser-use ipykernel streamlit rich langchain_huggingface langchain_community chromadb pymupdf pymupdf4llm rank_bm25 langchain_experimental einops sentence-transformers youtube_search fastapi-mcp gitingest langchain-groq youtube_transcript_api>=1.1.0 slowapi charset-normalizer kokoro-onnx soundfile langchain-chroma ================================================ FILE: searxng/settings.yml ================================================ general: # Debug mode, only for development. Is overwritten by ${SEARXNG_DEBUG} debug: false # displayed name instance_name: "my-instance" # For example: https://example.com/privacy privacypolicy_url: false # use true to use your own donation page written in searx/info/en/donate.md # use false to disable the donation link donation_url: false # mailto:contact@example.com contact_url: false # record stats enable_metrics: true brand: new_issue_url: https://github.com/searxng/searxng/issues/new docs_url: https://docs.searxng.org/ public_instances: https://searx.space wiki_url: https://github.com/searxng/searxng/wiki issue_url: https://github.com/searxng/searxng/issues # custom: # maintainer: "Jon Doe" # # Custom entries in the footer: [title]: [link] # links: # Uptime: https://uptime.searxng.org/history/darmarit-org # About: "https://searxng.org" search: # Filter results. 
0: None, 1: Moderate, 2: Strict safe_search: 0 # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "yandex", "mwmbl", # "seznam", "startpage", "stract", "swisscows", "qwant", "wikipedia" - leave blank to turn it off # by default. autocomplete: "" # minimun characters to type before autocompleter starts autocomplete_min: 4 # Default search language - leave blank to detect from browser information or # use codes from 'languages.py' default_lang: "auto" # max_page: 0 # if engine supports paging, 0 means unlimited numbers of pages # Available languages # languages: # - all # - en # - en-US # - de # - it-IT # - fr # - fr-BE # Optimized ban times for faster recovery ban_time_on_fail: 3 # Reduce max ban time for quicker engine recovery max_ban_time_on_fail: 60 suspended_times: # Engine suspension time after error (in seconds; set to 0 to disable) # For error "Access denied" and "HTTP error [402, 403]" SearxEngineAccessDenied: 86400 # For error "CAPTCHA" SearxEngineCaptcha: 86400 # For error "Too many request" and "HTTP error 429" SearxEngineTooManyRequests: 3600 # Cloudflare CAPTCHA cf_SearxEngineCaptcha: 1296000 cf_SearxEngineAccessDenied: 86400 # ReCAPTCHA recaptcha_SearxEngineCaptcha: 604800 # remove format to deny access, use lower case. # formats: [html, csv, json, rss] formats: - html - json server: # Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS} port: 8888 bind_address: "127.0.0.1" # public URL of the instance, to ensure correct inbound links. Is overwritten # by ${SEARXNG_URL}. base_url: http://localhost:8080/ # "http://example.com/location" limiter: false # rate limit the number of request on the instance, block some bots public_instance: false # enable features designed only for public instances # If your instance owns a /etc/searxng/settings.yml file, then set the following # values there. 
secret_key: "your_secret_key" # Is overwritten by ${SEARXNG_SECRET} # Proxying image results through searx image_proxy: false # Use HTTP/1.1 for better performance and keep-alive support http_protocol_version: "1.1" # POST queries are more secure as they don't show up in history but may cause # problems when using Firefox containers method: "POST" default_http_headers: X-Content-Type-Options: nosniff X-Download-Options: noopen X-Robots-Tag: noindex, nofollow Referrer-Policy: no-referrer redis: # Enable Redis for caching to significantly improve performance # URL to connect redis database. Is overwritten by ${SEARXNG_REDIS_URL}. # https://docs.searxng.org/admin/settings/settings_redis.html#settings-redis url: "redis://localhost:6379/0" ui: # Custom static path - leave it blank if you didn't change static_path: "" static_use_hash: false # Custom templates path - leave it blank if you didn't change templates_path: "" # query_in_title: When true, the result page's titles contains the query # it decreases the privacy, since the browser can records the page titles. query_in_title: false # infinite_scroll: When true, automatically loads the next page when scrolling to bottom of the current page. infinite_scroll: false # ui theme default_theme: simple # center the results ? center_alignment: false # URL prefix of the internet archive, don't forget trailing slash (if needed). # cache_url: "https://webcache.googleusercontent.com/search?q=cache:" # Default interface locale - leave blank to detect from browser information or # use codes from the 'locales' config section default_locale: "" # Open result links in a new tab by default # results_on_new_tab: false theme_args: # style of simple theme: auto, light, dark simple_style: auto # Perform search immediately if a category selected. # Disable to select multiple categories at once and start the search manually. 
search_on_category_select: true # Hotkeys: default or vim hotkeys: default # Lock arbitrary settings on the preferences page. To find the ID of the user # setting you want to lock, check the ID of the form on the page "preferences". # # preferences: # lock: # - language # - autocomplete # - method # - query_in_title # searx supports result proxification using an external service: # https://github.com/asciimoo/morty uncomment below section if you have running # morty proxy the key is base64 encoded (keep the !!binary notation) # Note: since commit af77ec3, morty accepts a base64 encoded key. # # result_proxy: # url: http://127.0.0.1:3000/ # # the key is a base64 encoded string, the YAML !!binary prefix is optional # key: !!binary "your_morty_proxy_key" # # [true|false] enable the "proxy" button next to each result # proxify_results: true # communication with search engines # outgoing: # Optimized timeout settings for better performance request_timeout: 2.0 # Reduce maximum timeout to prevent slow engines from blocking max_request_timeout: 8.0 # suffix of searx_useragent, could contain information like an email address # to the administrator useragent_suffix: "" # The maximum number of concurrent connections that may be established. pool_connections: 100 # Allow the connection pool to maintain keep-alive connections below this # point. 
pool_maxsize: 20 # See https://www.python-httpx.org/http2/ enable_http2: true # uncomment below section if you want to use a custom server certificate # see https://www.python-httpx.org/advanced/#changing-the-verification-defaults # and https://www.python-httpx.org/compatibility/#ssl-configuration # verify: ~/.mitmproxy/mitmproxy-ca-cert.cer # # uncomment below section if you want to use a proxyq see: SOCKS proxies # https://2.python-requests.org/en/latest/user/advanced/#proxies # are also supported: see # https://2.python-requests.org/en/latest/user/advanced/#socks # # proxies: # all://: # - http://proxy1:8080 # - http://proxy2:8080 # # using_tor_proxy: true # # Extra seconds to add in order to account for the time taken by the proxy # # extra_proxy_timeout: 10 # # uncomment below section only if you have more than one network interface # which can be the source of outgoing search requests # # source_ips: # - 1.1.1.1 # - 1.1.1.2 # - fe80::/126 # External plugin configuration, for more details see # https://docs.searxng.org/dev/plugins.html # # plugins: # - plugin1 # - plugin2 # - ... # Comment or un-comment plugin to activate / deactivate by default. # # enabled_plugins: # # these plugins are enabled if nothing is configured .. # - 'Hash plugin' # - 'Self Information' # - 'Tracker URL remover' # - 'Ahmia blacklist' # activation depends on outgoing.using_tor_proxy # # these plugins are disabled if nothing is configured .. # - 'Hostnames plugin' # see 'hostnames' configuration below # - 'Basic Calculator' # - 'Open Access DOI rewrite' # - 'Tor check plugin' # # Read the docs before activate: auto-detection of the language could be # # detrimental to users expectations / users can activate the plugin in the # # preferences if they want. 
# - 'Autodetect search language' # Configuration of the "Hostnames plugin": # # hostnames: # replace: # '(.*\.)?youtube\.com$': 'invidious.example.com' # '(.*\.)?youtu\.be$': 'invidious.example.com' # '(.*\.)?reddit\.com$': 'teddit.example.com' # '(.*\.)?redd\.it$': 'teddit.example.com' # '(www\.)?twitter\.com$': 'nitter.example.com' # remove: # - '(.*\.)?facebook.com$' # low_priority: # - '(.*\.)?google(\..*)?$' # high_priority: # - '(.*\.)?wikipedia.org$' # # Alternatively you can use external files for configuring the "Hostnames plugin": # # hostnames: # replace: 'rewrite-hosts.yml' # # Content of 'rewrite-hosts.yml' (place the file in the same directory as 'settings.yml'): # '(.*\.)?youtube\.com$': 'invidious.example.com' # '(.*\.)?youtu\.be$': 'invidious.example.com' # checker: # disable checker when in debug mode off_when_debug: true # use "scheduling: false" to disable scheduling # scheduling: interval or int # to activate the scheduler: # * uncomment "scheduling" section # * add "cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1" # to your uwsgi.ini # scheduling: # start_after: [300, 1800] # delay to start the first run of the checker # every: [86400, 90000] # how often the checker runs # additional tests: only for the YAML anchors (see the engines section) # additional_tests: rosebud: &test_rosebud matrix: query: rosebud lang: en result_container: - not_empty - ['one_title_contains', 'citizen kane'] test: - unique_results android: &test_android matrix: query: ['android'] lang: ['en', 'de', 'fr', 'zh-CN'] result_container: - not_empty - ['one_title_contains', 'google'] test: - unique_results # tests: only for the YAML anchors (see the engines section) tests: infobox: &tests_infobox infobox: matrix: query: ["linux", "new york", "bbc"] result_container: - has_infobox categories_as_tabs: general: images: videos: news: map: music: it: science: files: social media: engines: - name: 9gag engine: 9gag shortcut: 9g disabled: true - name: 
annas archive engine: annas_archive disabled: true shortcut: aa # - name: annas articles # engine: annas_archive # shortcut: aaa # # https://docs.searxng.org/dev/engines/online/annas_archive.html # aa_content: 'magazine' # book_fiction, book_unknown, book_nonfiction, book_comic # aa_ext: 'pdf' # pdf, epub, .. # aa_sort: oldest' # newest, oldest, largest, smallest - name: apk mirror engine: apkmirror timeout: 4.0 shortcut: apkm disabled: true - name: apple app store engine: apple_app_store shortcut: aps disabled: true # Requires Tor - name: ahmia engine: ahmia categories: onions enable_http: true shortcut: ah - name: anaconda engine: xpath paging: true first_page_num: 0 search_url: https://anaconda.org/search?q={query}&page={pageno} results_xpath: //tbody/tr url_xpath: ./td/h5/a[last()]/@href title_xpath: ./td/h5 content_xpath: ./td[h5]/text() categories: it timeout: 6.0 shortcut: conda disabled: true - name: arch linux wiki engine: archlinux shortcut: al - name: artic engine: artic shortcut: arc timeout: 4.0 - name: arxiv engine: arxiv shortcut: arx timeout: 4.0 - name: ask engine: ask shortcut: ask disabled: true # tmp suspended: dh key too small # - name: base # engine: base # shortcut: bs - name: bandcamp engine: bandcamp shortcut: bc categories: music - name: wikipedia engine: wikipedia shortcut: wp # add "list" to the array to get results in the results list display_type: ["infobox"] base_url: 'https://{language}.wikipedia.org/' categories: [general] - name: bilibili engine: bilibili shortcut: bil disabled: true - name: bing engine: bing shortcut: bi disabled: true - name: bing images engine: bing_images shortcut: bii - name: bing news engine: bing_news shortcut: bin - name: bing videos engine: bing_videos shortcut: biv - name: bitbucket engine: xpath paging: true search_url: https://bitbucket.org/repo/all/{pageno}?name={query} url_xpath: //article[@class="repo-summary"]//a[@class="repo-link"]/@href title_xpath: 
//article[@class="repo-summary"]//a[@class="repo-link"] content_xpath: //article[@class="repo-summary"]/p categories: [it, repos] timeout: 4.0 disabled: true shortcut: bb about: website: https://bitbucket.org/ wikidata_id: Q2493781 official_api_documentation: https://developer.atlassian.com/bitbucket use_official_api: false require_api_key: false results: HTML - name: bpb engine: bpb shortcut: bpb disabled: true - name: btdigg engine: btdigg shortcut: bt disabled: true - name: openverse engine: openverse categories: images shortcut: opv - name: media.ccc.de engine: ccc_media shortcut: c3tv # We don't set language: de here because media.ccc.de is not just # for a German audience. It contains many English videos and many # German videos have English subtitles. disabled: true - name: chefkoch engine: chefkoch shortcut: chef # to show premium or plus results too: # skip_premium: false # - name: core.ac.uk # engine: core # categories: science # shortcut: cor # # get your API key from: https://core.ac.uk/api-keys/register/ # api_key: 'unset' - name: cppreference engine: cppreference shortcut: cpp paging: false disabled: true - name: crossref engine: crossref shortcut: cr timeout: 30 disabled: true - name: crowdview engine: json_engine shortcut: cv categories: general paging: false search_url: https://crowdview-next-js.onrender.com/api/search-v3?query={query} results_query: results url_query: link title_query: title content_query: snippet disabled: true about: website: https://crowdview.ai/ - name: yep engine: yep shortcut: yep categories: general search_type: web timeout: 5 disabled: true - name: yep images engine: yep shortcut: yepi categories: images search_type: images disabled: true - name: yep news engine: yep shortcut: yepn categories: news search_type: news disabled: true - name: curlie engine: xpath shortcut: cl categories: general disabled: true paging: true lang_all: '' search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189 
page_size: 20 results_xpath: //div[@id="site-list-content"]/div[@class="site-item"] url_xpath: ./div[@class="title-and-desc"]/a/@href title_xpath: ./div[@class="title-and-desc"]/a/div content_xpath: ./div[@class="title-and-desc"]/div[@class="site-descr"] about: website: https://curlie.org/ wikidata_id: Q60715723 use_official_api: false require_api_key: false results: HTML - name: currency engine: currency_convert categories: general shortcut: cc - name: bahnhof engine: json_engine search_url: https://www.bahnhof.de/api/stations/search/{query} url_prefix: https://www.bahnhof.de/ url_query: slug title_query: name content_query: state shortcut: bf disabled: true about: website: https://www.bahn.de wikidata_id: Q22811603 use_official_api: false require_api_key: false results: JSON language: de tests: bahnhof: matrix: query: berlin lang: en result_container: - not_empty - ['one_title_contains', 'Berlin Hauptbahnhof'] test: - unique_results - name: deezer engine: deezer shortcut: dz disabled: true - name: destatis engine: destatis shortcut: destat disabled: true - name: deviantart engine: deviantart shortcut: da timeout: 3.0 - name: ddg definitions engine: duckduckgo_definitions shortcut: ddd weight: 2 disabled: true tests: *tests_infobox # cloudflare protected # - name: digbt # engine: digbt # shortcut: dbt # timeout: 6.0 # disabled: true - name: docker hub engine: docker_hub shortcut: dh categories: [it, packages] - name: erowid engine: xpath paging: true first_page_num: 0 page_size: 30 search_url: https://www.erowid.org/search.php?q={query}&s={pageno} url_xpath: //dl[@class="results-list"]/dt[@class="result-title"]/a/@href title_xpath: //dl[@class="results-list"]/dt[@class="result-title"]/a/text() content_xpath: //dl[@class="results-list"]/dd[@class="result-details"] categories: [] shortcut: ew disabled: true about: website: https://www.erowid.org/ wikidata_id: Q1430691 official_api_documentation: use_official_api: false require_api_key: false results: HTML # - name: 
elasticsearch # shortcut: es # engine: elasticsearch # base_url: http://localhost:9200 # username: elastic # password: changeme # index: my-index # # available options: match, simple_query_string, term, terms, custom # query_type: match # # if query_type is set to custom, provide your query here # #custom_query_json: {"query":{"match_all": {}}} # #show_metadata: false # disabled: true - name: wikidata engine: wikidata shortcut: wd timeout: 3.0 weight: 2 # add "list" to the array to get results in the results list display_type: ["infobox"] tests: *tests_infobox categories: [general] - name: duckduckgo engine: duckduckgo shortcut: ddg - name: duckduckgo images engine: duckduckgo_extra categories: [images, web] ddg_category: images shortcut: ddi disabled: true - name: duckduckgo videos engine: duckduckgo_extra categories: [videos, web] ddg_category: videos shortcut: ddv disabled: true - name: duckduckgo news engine: duckduckgo_extra categories: [news, web] ddg_category: news shortcut: ddn disabled: true - name: duckduckgo weather engine: duckduckgo_weather shortcut: ddw disabled: true - name: apple maps engine: apple_maps shortcut: apm disabled: true timeout: 5.0 - name: emojipedia engine: emojipedia timeout: 4.0 shortcut: em disabled: true - name: tineye engine: tineye shortcut: tin timeout: 9.0 disabled: true - name: etymonline engine: xpath paging: true search_url: https://etymonline.com/search?page={pageno}&q={query} url_xpath: //a[contains(@class, "word__name--")]/@href title_xpath: //a[contains(@class, "word__name--")] content_xpath: //section[contains(@class, "word__defination")] first_page_num: 1 shortcut: et categories: [dictionaries] about: website: https://www.etymonline.com/ wikidata_id: Q1188617 official_api_documentation: use_official_api: false require_api_key: false results: HTML # - name: ebay # engine: ebay # shortcut: eb # base_url: 'https://www.ebay.com' # disabled: true # timeout: 5 - name: 1x engine: www1x shortcut: 1x timeout: 3.0 disabled: true 
- name: fdroid engine: fdroid shortcut: fd disabled: true - name: findthatmeme engine: findthatmeme shortcut: ftm disabled: true - name: flickr categories: images shortcut: fl # You can use the engine using the official stable API, but you need an API # key, see: https://www.flickr.com/services/apps/create/ # engine: flickr # api_key: 'apikey' # required! # Or you can use the html non-stable engine, activated by default engine: flickr_noapi - name: free software directory engine: mediawiki shortcut: fsd categories: [it, software wikis] base_url: https://directory.fsf.org/ search_type: title timeout: 5.0 disabled: true about: website: https://directory.fsf.org/ wikidata_id: Q2470288 # - name: freesound # engine: freesound # shortcut: fnd # disabled: true # timeout: 15.0 # API key required, see: https://freesound.org/docs/api/overview.html # api_key: MyAPIkey - name: frinkiac engine: frinkiac shortcut: frk disabled: true - name: fyyd engine: fyyd shortcut: fy timeout: 8.0 disabled: true - name: genius engine: genius shortcut: gen - name: gentoo engine: gentoo shortcut: ge timeout: 10.0 - name: gitlab engine: json_engine paging: true search_url: https://gitlab.com/api/v4/projects?search={query}&page={pageno} url_query: web_url title_query: name_with_namespace content_query: description page_size: 20 categories: [it, repos] shortcut: gl timeout: 10.0 disabled: true about: website: https://about.gitlab.com/ wikidata_id: Q16639197 official_api_documentation: https://docs.gitlab.com/ee/api/ use_official_api: false require_api_key: false results: JSON - name: github engine: github shortcut: gh - name: codeberg # https://docs.searxng.org/dev/engines/online/gitea.html engine: gitea base_url: https://codeberg.org shortcut: cb disabled: true - name: gitea.com engine: gitea base_url: https://gitea.com shortcut: gitea disabled: true - name: goodreads engine: goodreads shortcut: good timeout: 4.0 disabled: true - name: google engine: google shortcut: go # additional_tests: # 
android: *test_android - name: google images engine: google_images shortcut: goi # additional_tests: # android: *test_android # dali: # matrix: # query: ['Dali Christ'] # lang: ['en', 'de', 'fr', 'zh-CN'] # result_container: # - ['one_title_contains', 'Salvador'] - name: google news engine: google_news shortcut: gon # additional_tests: # android: *test_android - name: google videos engine: google_videos shortcut: gov # additional_tests: # android: *test_android - name: google scholar engine: google_scholar shortcut: gos - name: google play apps engine: google_play categories: [files, apps] shortcut: gpa play_categ: apps disabled: true - name: google play movies engine: google_play categories: videos shortcut: gpm play_categ: movies disabled: true - name: material icons engine: material_icons categories: images shortcut: mi disabled: true - name: gpodder engine: json_engine shortcut: gpod timeout: 4.0 paging: false search_url: https://gpodder.net/search.json?q={query} url_query: url title_query: title content_query: description page_size: 19 categories: music disabled: true about: website: https://gpodder.net wikidata_id: Q3093354 official_api_documentation: https://gpoddernet.readthedocs.io/en/latest/api/ use_official_api: false requires_api_key: false results: JSON - name: habrahabr engine: xpath paging: true search_url: https://habr.com/en/search/page{pageno}/?q={query} results_xpath: //article[contains(@class, "tm-articles-list__item")] url_xpath: .//a[@class="tm-title__link"]/@href title_xpath: .//a[@class="tm-title__link"] content_xpath: .//div[contains(@class, "article-formatted-body")] categories: it timeout: 4.0 disabled: true shortcut: habr about: website: https://habr.com/ wikidata_id: Q4494434 official_api_documentation: https://habr.com/en/docs/help/api/ use_official_api: false require_api_key: false results: HTML - name: hackernews engine: hackernews shortcut: hn disabled: true - name: hex engine: hex shortcut: hex disabled: true # Valid values: name 
inserted_at updated_at total_downloads recent_downloads sort_criteria: "recent_downloads" page_size: 10 - name: crates.io engine: crates shortcut: crates disabled: true timeout: 6.0 - name: hoogle engine: xpath search_url: https://hoogle.haskell.org/?hoogle={query} results_xpath: '//div[@class="result"]' title_xpath: './/div[@class="ans"]//a' url_xpath: './/div[@class="ans"]//a/@href' content_xpath: './/div[@class="from"]' page_size: 20 categories: [it, packages] shortcut: ho about: website: https://hoogle.haskell.org/ wikidata_id: Q34010 official_api_documentation: https://hackage.haskell.org/api use_official_api: false require_api_key: false results: JSON - name: imdb engine: imdb shortcut: imdb timeout: 6.0 disabled: true - name: imgur engine: imgur shortcut: img disabled: true - name: ina engine: ina shortcut: in timeout: 6.0 disabled: true - name: invidious engine: invidious # Instanes will be selected randomly, see https://api.invidious.io/ for # instances that are stable (good uptime) and close to you. 
base_url: - https://invidious.io.lol - https://invidious.fdn.fr - https://yt.artemislena.eu - https://invidious.tiekoetter.com - https://invidious.flokinet.to - https://vid.puffyan.us - https://invidious.privacydev.net - https://inv.tux.pizza shortcut: iv timeout: 3.0 disabled: true - name: jisho engine: jisho shortcut: js timeout: 3.0 disabled: true - name: kickass engine: kickass base_url: - https://kickasstorrents.to - https://kickasstorrents.cr - https://kickasstorrent.cr - https://kickass.sx - https://kat.am shortcut: kc timeout: 4.0 - name: lemmy communities engine: lemmy lemmy_type: Communities shortcut: leco - name: lemmy users engine: lemmy network: lemmy communities lemmy_type: Users shortcut: leus - name: lemmy posts engine: lemmy network: lemmy communities lemmy_type: Posts shortcut: lepo - name: lemmy comments engine: lemmy network: lemmy communities lemmy_type: Comments shortcut: lecom - name: library genesis engine: xpath # search_url: https://libgen.is/search.php?req={query} search_url: https://libgen.rs/search.php?req={query} url_xpath: //a[contains(@href,"book/index.php?md5")]/@href title_xpath: //a[contains(@href,"book/")]/text()[1] content_xpath: //td/a[1][contains(@href,"=author")]/text() categories: files timeout: 7.0 disabled: true shortcut: lg about: website: https://libgen.fun/ wikidata_id: Q22017206 official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: z-library engine: zlibrary shortcut: zlib categories: files timeout: 7.0 - name: library of congress engine: loc shortcut: loc categories: images - name: libretranslate engine: libretranslate # https://github.com/LibreTranslate/LibreTranslate?tab=readme-ov-file#mirrors base_url: - https://translate.terraprint.co - https://trans.zillyhuhn.com # api_key: abc123 shortcut: lt disabled: true - name: lingva engine: lingva shortcut: lv # set lingva instance in url, by default it will use the official instance # url: https://lingva.thedaviddelta.com - name: 
lobste.rs engine: xpath search_url: https://lobste.rs/search?q={query}&what=stories&order=relevance results_xpath: //li[contains(@class, "story")] url_xpath: .//a[@class="u-url"]/@href title_xpath: .//a[@class="u-url"] content_xpath: .//a[@class="domain"] categories: it shortcut: lo timeout: 5.0 disabled: true about: website: https://lobste.rs/ wikidata_id: Q60762874 official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: mastodon users engine: mastodon mastodon_type: accounts base_url: https://mastodon.social shortcut: mau - name: mastodon hashtags engine: mastodon mastodon_type: hashtags base_url: https://mastodon.social shortcut: mah # - name: matrixrooms # engine: mrs # # https://docs.searxng.org/dev/engines/online/mrs.html # # base_url: https://mrs-api-host # shortcut: mtrx # disabled: true - name: mdn shortcut: mdn engine: json_engine categories: [it] paging: true search_url: https://developer.mozilla.org/api/v1/search?q={query}&page={pageno} results_query: documents url_query: mdn_url url_prefix: https://developer.mozilla.org title_query: title content_query: summary about: website: https://developer.mozilla.org wikidata_id: Q3273508 official_api_documentation: null use_official_api: false require_api_key: false results: JSON - name: metacpan engine: metacpan shortcut: cpan disabled: true number_of_results: 20 # - name: meilisearch # engine: meilisearch # shortcut: mes # enable_http: true # base_url: http://localhost:7700 # index: my-index - name: mixcloud engine: mixcloud shortcut: mc # MongoDB engine # Required dependency: pymongo # - name: mymongo # engine: mongodb # shortcut: md # exact_match_only: false # host: '127.0.0.1' # port: 27017 # enable_http: true # results_per_page: 20 # database: 'business' # collection: 'reviews' # name of the db collection # key: 'name' # key in the collection to search for - name: mozhi engine: mozhi base_url: - https://mozhi.aryak.me - https://translate.bus-hit.me - 
https://nyc1.mz.ggtyler.dev # mozhi_engine: google - see https://mozhi.aryak.me for supported engines timeout: 4.0 shortcut: mz disabled: true - name: mwmbl engine: mwmbl # api_url: https://api.mwmbl.org shortcut: mwm disabled: true - name: npm engine: npm shortcut: npm timeout: 5.0 disabled: true - name: nyaa engine: nyaa shortcut: nt disabled: true - name: mankier engine: json_engine search_url: https://www.mankier.com/api/v2/mans/?q={query} results_query: results url_query: url title_query: name content_query: description categories: it shortcut: man about: website: https://www.mankier.com/ official_api_documentation: https://www.mankier.com/api use_official_api: true require_api_key: false results: JSON # read https://docs.searxng.org/dev/engines/online/mullvad_leta.html # - name: mullvadleta # engine: mullvad_leta # use_cache: true # Only 100 non-cache searches per day, suggested only for private instances # search_url: https://leta.mullvad.net # categories: [general, web] # shortcut: ml - name: odysee engine: odysee shortcut: od disabled: true - name: openairedatasets engine: json_engine paging: true search_url: https://api.openaire.eu/search/datasets?format=json&page={pageno}&size=10&title={query} results_query: response/results/result url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ title_query: metadata/oaf:entity/oaf:result/title/$ content_query: metadata/oaf:entity/oaf:result/description/$ content_html_to_text: true categories: "science" shortcut: oad timeout: 5.0 about: website: https://www.openaire.eu/ wikidata_id: Q25106053 official_api_documentation: https://api.openaire.eu/ use_official_api: false require_api_key: false results: JSON - name: openairepublications engine: json_engine paging: true search_url: https://api.openaire.eu/search/publications?format=json&page={pageno}&size=10&title={query} results_query: response/results/result url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ 
title_query: metadata/oaf:entity/oaf:result/title/$ content_query: metadata/oaf:entity/oaf:result/description/$ content_html_to_text: true categories: science shortcut: oap timeout: 5.0 about: website: https://www.openaire.eu/ wikidata_id: Q25106053 official_api_documentation: https://api.openaire.eu/ use_official_api: false require_api_key: false results: JSON - name: openmeteo engine: open_meteo shortcut: om disabled: true # - name: opensemanticsearch # engine: opensemantic # shortcut: oss # base_url: 'http://localhost:8983/solr/opensemanticsearch/' - name: openstreetmap engine: openstreetmap shortcut: osm - name: openrepos engine: xpath paging: true search_url: https://openrepos.net/search/node/{query}?page={pageno} url_xpath: //li[@class="search-result"]//h3[@class="title"]/a/@href title_xpath: //li[@class="search-result"]//h3[@class="title"]/a content_xpath: //li[@class="search-result"]//div[@class="search-snippet-info"]//p[@class="search-snippet"] categories: files timeout: 4.0 disabled: true shortcut: or about: website: https://openrepos.net/ wikidata_id: official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: packagist engine: json_engine paging: true search_url: https://packagist.org/search.json?q={query}&page={pageno} results_query: results url_query: url title_query: name content_query: description categories: [it, packages] disabled: true timeout: 5.0 shortcut: pack about: website: https://packagist.org wikidata_id: Q108311377 official_api_documentation: https://packagist.org/apidoc use_official_api: true require_api_key: false results: JSON - name: pdbe engine: pdbe shortcut: pdb # Hide obsolete PDB entries. 
Default is not to hide obsolete structures # hide_obsolete: false - name: photon engine: photon shortcut: ph - name: pinterest engine: pinterest shortcut: pin - name: piped engine: piped shortcut: ppd categories: videos piped_filter: videos timeout: 3.0 # URL to use as link and for embeds frontend_url: https://srv.piped.video # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/ backend_url: - https://pipedapi.kavin.rocks - https://pipedapi-libre.kavin.rocks - https://pipedapi.adminforge.de - name: piped.music engine: piped network: piped shortcut: ppdm categories: music piped_filter: music_songs timeout: 3.0 - name: piratebay engine: piratebay shortcut: tpb # You may need to change this URL to a proxy if piratebay is blocked in your # country url: https://thepiratebay.org/ timeout: 3.0 - name: pixiv shortcut: pv engine: pixiv disabled: true inactive: true pixiv_image_proxies: - https://pximg.example.org # A proxy is required to load the images. Hosting an image proxy server # for Pixiv: # --> https://pixivfe.pages.dev/hosting-image-proxy-server/ # Proxies from public instances. Ask the public instances owners if they # agree to receive traffic from SearXNG! 
# --> https://codeberg.org/VnPower/PixivFE#instances # --> https://github.com/searxng/searxng/pull/3192#issuecomment-1941095047 # image proxy of https://pixiv.cat # - https://i.pixiv.cat # image proxy of https://www.pixiv.pics # - https://pximg.cocomi.eu.org # image proxy of https://pixivfe.exozy.me # - https://pximg.exozy.me # image proxy of https://pixivfe.ducks.party # - https://pixiv.ducks.party # image proxy of https://pixiv.perennialte.ch # - https://pximg.perennialte.ch - name: podcastindex engine: podcastindex shortcut: podcast # Required dependency: psychopg2 # - name: postgresql # engine: postgresql # database: postgres # username: postgres # password: postgres # limit: 10 # query_str: 'SELECT * from my_table WHERE my_column = %(query)s' # shortcut : psql - name: presearch engine: presearch search_type: search categories: [general, web] shortcut: ps timeout: 4.0 disabled: true - name: presearch images engine: presearch network: presearch search_type: images categories: [images, web] timeout: 4.0 shortcut: psimg disabled: true - name: presearch videos engine: presearch network: presearch search_type: videos categories: [general, web] timeout: 4.0 shortcut: psvid disabled: true - name: presearch news engine: presearch network: presearch search_type: news categories: [news, web] timeout: 4.0 shortcut: psnews disabled: true - name: pub.dev engine: xpath shortcut: pd search_url: https://pub.dev/packages?q={query}&page={pageno} paging: true results_xpath: //div[contains(@class,"packages-item")] url_xpath: ./div/h3/a/@href title_xpath: ./div/h3/a content_xpath: ./div/div/div[contains(@class,"packages-description")]/span categories: [packages, it] timeout: 3.0 disabled: true first_page_num: 1 about: website: https://pub.dev/ official_api_documentation: https://pub.dev/help/api use_official_api: false require_api_key: false results: HTML - name: pubmed engine: pubmed shortcut: pub timeout: 3.0 - name: pypi shortcut: pypi engine: pypi - name: qwant qwant_categ: web 
engine: qwant shortcut: qw categories: [general, web] additional_tests: rosebud: *test_rosebud - name: qwant news qwant_categ: news engine: qwant shortcut: qwn categories: news network: qwant - name: qwant images qwant_categ: images engine: qwant shortcut: qwi categories: [images, web] network: qwant - name: qwant videos qwant_categ: videos engine: qwant shortcut: qwv categories: [videos, web] network: qwant # - name: library # engine: recoll # shortcut: lib # base_url: 'https://recoll.example.org/' # search_dir: '' # mount_prefix: /export # dl_prefix: 'https://download.example.org' # timeout: 30.0 # categories: files # disabled: true # - name: recoll library reference # engine: recoll # base_url: 'https://recoll.example.org/' # search_dir: reference # mount_prefix: /export # dl_prefix: 'https://download.example.org' # shortcut: libr # timeout: 30.0 # categories: files # disabled: true - name: radio browser engine: radio_browser shortcut: rb - name: reddit engine: reddit shortcut: re page_size: 25 - name: rottentomatoes engine: rottentomatoes shortcut: rt disabled: true # Required dependency: redis # - name: myredis # shortcut : rds # engine: redis_server # exact_match_only: false # host: '127.0.0.1' # port: 6379 # enable_http: true # password: '' # db: 0 # tmp suspended: bad certificate # - name: scanr structures # shortcut: scs # engine: scanr_structures # disabled: true - name: searchmysite engine: xpath shortcut: sms categories: general paging: true search_url: https://searchmysite.net/search/?q={query}&page={pageno} results_xpath: //div[contains(@class,'search-result')] url_xpath: .//a[contains(@class,'result-link')]/@href title_xpath: .//span[contains(@class,'result-title-txt')]/text() content_xpath: ./p[@id='result-hightlight'] disabled: true about: website: https://searchmysite.net - name: sepiasearch engine: sepiasearch shortcut: sep - name: soundcloud engine: soundcloud shortcut: sc - name: stackoverflow engine: stackexchange shortcut: st api_site: 
'stackoverflow' categories: [it, q&a] - name: askubuntu engine: stackexchange shortcut: ubuntu api_site: 'askubuntu' categories: [it, q&a] - name: internetarchivescholar engine: internet_archive_scholar shortcut: ias timeout: 15.0 - name: superuser engine: stackexchange shortcut: su api_site: 'superuser' categories: [it, q&a] - name: discuss.python engine: discourse shortcut: dpy base_url: 'https://discuss.python.org' categories: [it, q&a] disabled: true - name: caddy.community engine: discourse shortcut: caddy base_url: 'https://caddy.community' categories: [it, q&a] disabled: true - name: pi-hole.community engine: discourse shortcut: pi categories: [it, q&a] base_url: 'https://discourse.pi-hole.net' disabled: true - name: searchcode code engine: searchcode_code shortcut: scc disabled: true # - name: searx # engine: searx_engine # shortcut: se # instance_urls : # - http://127.0.0.1:8888/ # - ... # disabled: true - name: semantic scholar engine: semantic_scholar disabled: true shortcut: se # Spotify needs API credentials # - name: spotify # engine: spotify # shortcut: stf # api_client_id: ******* # api_client_secret: ******* # - name: solr # engine: solr # shortcut: slr # base_url: http://localhost:8983 # collection: collection_name # sort: '' # sorting: asc or desc # field_list: '' # comma separated list of field names to display on the UI # default_fields: '' # default field to query # query_fields: '' # query fields # enable_http: true # - name: springer nature # engine: springer # # get your API key from: https://dev.springernature.com/signup # # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" # api_key: 'unset' # shortcut: springer # timeout: 15.0 - name: startpage engine: startpage shortcut: sp timeout: 6.0 disabled: true additional_tests: rosebud: *test_rosebud - name: tokyotoshokan engine: tokyotoshokan shortcut: tt timeout: 6.0 disabled: true - name: solidtorrents engine: solidtorrents shortcut: solid timeout: 4.0 base_url: - 
https://solidtorrents.to - https://bitsearch.to # For this demo of the sqlite engine download: # https://liste.mediathekview.de/filmliste-v2.db.bz2 # and unpack into searx/data/filmliste-v2.db # Query to test: "!demo concert" # # - name: demo # engine: sqlite # shortcut: demo # categories: general # result_template: default.html # database: searx/data/filmliste-v2.db # query_str: >- # SELECT title || ' (' || time(duration, 'unixepoch') || ')' AS title, # COALESCE( NULLIF(url_video_hd,''), NULLIF(url_video_sd,''), url_video) AS url, # description AS content # FROM film # WHERE title LIKE :wildcard OR description LIKE :wildcard # ORDER BY duration DESC - name: tagesschau engine: tagesschau # when set to false, display URLs from Tagesschau, and not the actual source # (e.g. NDR, WDR, SWR, HR, ...) use_source_url: true shortcut: ts disabled: true - name: tmdb engine: xpath paging: true categories: movies search_url: https://www.themoviedb.org/search?page={pageno}&query={query} results_xpath: //div[contains(@class,"movie") or contains(@class,"tv")]//div[contains(@class,"card")] url_xpath: .//div[contains(@class,"poster")]/a/@href thumbnail_xpath: .//img/@src title_xpath: .//div[contains(@class,"title")]//h2 content_xpath: .//div[contains(@class,"overview")] shortcut: tm disabled: true # Requires Tor - name: torch engine: xpath paging: true search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and results_xpath: //table//tr url_xpath: ./td[2]/a title_xpath: ./td[2]/b content_xpath: ./td[2]/small categories: onions enable_http: true shortcut: tch # torznab engine lets you query any torznab compatible indexer. Using this # engine in combination with Jackett opens the possibility to query a lot of # public and private indexers directly from SearXNG. 
More details at: # https://docs.searxng.org/dev/engines/online/torznab.html # # - name: Torznab EZTV # engine: torznab # shortcut: eztv # base_url: http://localhost:9117/api/v2.0/indexers/eztv/results/torznab # enable_http: true # if using localhost # api_key: xxxxxxxxxxxxxxx # show_magnet_links: true # show_torrent_files: false # # https://github.com/Jackett/Jackett/wiki/Jackett-Categories # torznab_categories: # optional # - 2000 # - 5000 # tmp suspended - too slow, too many errors # - name: urbandictionary # engine : xpath # search_url : https://www.urbandictionary.com/define.php?term={query} # url_xpath : //*[@class="word"]/@href # title_xpath : //*[@class="def-header"] # content_xpath: //*[@class="meaning"] # shortcut: ud - name: unsplash engine: unsplash shortcut: us - name: yandex music engine: yandex_music shortcut: ydm disabled: true # https://yandex.com/support/music/access.html inactive: true - name: yahoo engine: yahoo shortcut: yh disabled: true - name: yahoo news engine: yahoo_news shortcut: yhn - name: youtube shortcut: yt # You can use the engine using the official stable API, but you need an API # key See: https://console.developers.google.com/project # # engine: youtube_api # api_key: 'apikey' # required! 
# # Or you can use the html non-stable engine, activated by default engine: youtube_noapi - name: dailymotion engine: dailymotion shortcut: dm - name: vimeo engine: vimeo shortcut: vm - name: wiby engine: json_engine paging: true search_url: https://wiby.me/json/?q={query}&p={pageno} url_query: URL title_query: Title content_query: Snippet categories: [general, web] shortcut: wib disabled: true about: website: https://wiby.me/ - name: alexandria engine: json_engine shortcut: alx categories: general paging: true search_url: https://api.alexandria.org/?a=1&q={query}&p={pageno} results_query: results title_query: title url_query: url content_query: snippet timeout: 1.5 disabled: true about: website: https://alexandria.org/ official_api_documentation: https://github.com/alexandria-org/alexandria-api/raw/master/README.md use_official_api: true require_api_key: false results: JSON - name: wikibooks engine: mediawiki weight: 0.5 shortcut: wb categories: [general, wikimedia] base_url: "https://{language}.wikibooks.org/" search_type: text disabled: true about: website: https://www.wikibooks.org/ wikidata_id: Q367 - name: wikinews engine: mediawiki shortcut: wn categories: [news, wikimedia] base_url: "https://{language}.wikinews.org/" search_type: text srsort: create_timestamp_desc about: website: https://www.wikinews.org/ wikidata_id: Q964 - name: wikiquote engine: mediawiki weight: 0.5 shortcut: wq categories: [general, wikimedia] base_url: "https://{language}.wikiquote.org/" search_type: text disabled: true additional_tests: rosebud: *test_rosebud about: website: https://www.wikiquote.org/ wikidata_id: Q369 - name: wikisource engine: mediawiki weight: 0.5 shortcut: ws categories: [general, wikimedia] base_url: "https://{language}.wikisource.org/" search_type: text disabled: true about: website: https://www.wikisource.org/ wikidata_id: Q263 - name: wikispecies engine: mediawiki shortcut: wsp categories: [general, science, wikimedia] base_url: 
"https://species.wikimedia.org/" search_type: text disabled: true about: website: https://species.wikimedia.org/ wikidata_id: Q13679 tests: wikispecies: matrix: query: "Campbell, L.I. et al. 2011: MicroRNAs" lang: en result_container: - not_empty - ['one_title_contains', 'Tardigrada'] test: - unique_results - name: wiktionary engine: mediawiki shortcut: wt categories: [dictionaries, wikimedia] base_url: "https://{language}.wiktionary.org/" search_type: text about: website: https://www.wiktionary.org/ wikidata_id: Q151 - name: wikiversity engine: mediawiki weight: 0.5 shortcut: wv categories: [general, wikimedia] base_url: "https://{language}.wikiversity.org/" search_type: text disabled: true about: website: https://www.wikiversity.org/ wikidata_id: Q370 - name: wikivoyage engine: mediawiki weight: 0.5 shortcut: wy categories: [general, wikimedia] base_url: "https://{language}.wikivoyage.org/" search_type: text disabled: true about: website: https://www.wikivoyage.org/ wikidata_id: Q373 - name: wikicommons.images engine: wikicommons shortcut: wc categories: images search_type: images number_of_results: 10 - name: wikicommons.videos engine: wikicommons shortcut: wcv categories: videos search_type: videos number_of_results: 10 - name: wikicommons.audio engine: wikicommons shortcut: wca categories: music search_type: audio number_of_results: 10 - name: wikicommons.files engine: wikicommons shortcut: wcf categories: files search_type: files number_of_results: 10 - name: wolframalpha shortcut: wa # You can use the engine using the official stable API, but you need an API # key. 
See: https://products.wolframalpha.com/api/ # # engine: wolframalpha_api # api_key: '' # # Or you can use the html non-stable engine, activated by default engine: wolframalpha_noapi timeout: 6.0 categories: general disabled: true - name: dictzone engine: dictzone shortcut: dc - name: mymemory translated engine: translated shortcut: tl timeout: 5.0 # You can use without an API key, but you are limited to 1000 words/day # See: https://mymemory.translated.net/doc/usagelimits.php # api_key: '' # Required dependency: mysql-connector-python # - name: mysql # engine: mysql_server # database: mydatabase # username: user # password: pass # limit: 10 # query_str: 'SELECT * from mytable WHERE fieldname=%(query)s' # shortcut: mysql - name: 1337x engine: 1337x shortcut: 1337x disabled: true - name: duden engine: duden shortcut: du disabled: true - name: seznam shortcut: szn engine: seznam disabled: true # - name: deepl # engine: deepl # shortcut: dpl # # You can use the engine using the official stable API, but you need an API key # # See: https://www.deepl.com/pro-api?cta=header-pro-api # api_key: '' # required! 
# timeout: 5.0 # disabled: true - name: mojeek shortcut: mjk engine: mojeek categories: [general, web] disabled: true - name: mojeek images shortcut: mjkimg engine: mojeek categories: [images, web] search_type: images paging: false disabled: true - name: mojeek news shortcut: mjknews engine: mojeek categories: [news, web] search_type: news paging: false disabled: true - name: moviepilot engine: moviepilot shortcut: mp disabled: true - name: naver shortcut: nvr categories: [general, web] engine: xpath paging: true search_url: https://search.naver.com/search.naver?where=webkr&sm=osp_hty&ie=UTF-8&query={query}&start={pageno} url_xpath: //a[@class="link_tit"]/@href title_xpath: //a[@class="link_tit"] content_xpath: //div[@class="total_dsc_wrap"]/a first_page_num: 1 page_size: 10 disabled: true about: website: https://www.naver.com/ wikidata_id: Q485639 official_api_documentation: https://developers.naver.com/docs/nmt/examples/ use_official_api: false require_api_key: false results: HTML language: ko - name: rubygems shortcut: rbg engine: xpath paging: true search_url: https://rubygems.org/search?page={pageno}&query={query} results_xpath: /html/body/main/div/a[@class="gems__gem"] url_xpath: ./@href title_xpath: ./span/h2 content_xpath: ./span/p suggestion_xpath: /html/body/main/div/div[@class="search__suggestions"]/p/a first_page_num: 1 categories: [it, packages] disabled: true about: website: https://rubygems.org/ wikidata_id: Q1853420 official_api_documentation: https://guides.rubygems.org/rubygems-org-api/ use_official_api: false require_api_key: false results: HTML - name: peertube engine: peertube shortcut: ptb paging: true # alternatives see: https://instances.joinpeertube.org/instances # base_url: https://tube.4aem.com categories: videos disabled: true timeout: 6.0 - name: mediathekviewweb engine: mediathekviewweb shortcut: mvw disabled: true - name: yacy engine: yacy categories: general search_type: text base_url: - https://yacy.searchlab.eu - 
https://search.lomig.me - https://yacy.ecosys.eu - https://search.webproject.link shortcut: ya disabled: true # required if you aren't using HTTPS for your local yacy instance # https://docs.searxng.org/dev/engines/online/yacy.html # enable_http: true # timeout: 3.0 # search_mode: 'global' - name: yacy images engine: yacy categories: images search_type: image shortcut: yai disabled: true - name: rumble engine: rumble shortcut: ru base_url: https://rumble.com/ paging: true categories: videos disabled: true - name: livespace engine: livespace shortcut: ls categories: videos disabled: true timeout: 5.0 - name: wordnik engine: wordnik shortcut: def base_url: https://www.wordnik.com/ categories: [dictionaries] timeout: 5.0 - name: woxikon.de synonyme engine: xpath shortcut: woxi categories: [dictionaries] timeout: 5.0 disabled: true search_url: https://synonyme.woxikon.de/synonyme/{query}.php url_xpath: //div[@class="upper-synonyms"]/a/@href content_xpath: //div[@class="synonyms-list-group"] title_xpath: //div[@class="upper-synonyms"]/a no_result_for_http_status: [404] about: website: https://www.woxikon.de/ wikidata_id: # No Wikidata ID use_official_api: false require_api_key: false results: HTML language: de - name: seekr news engine: seekr shortcut: senews categories: news seekr_category: news disabled: true - name: seekr images engine: seekr network: seekr news shortcut: seimg categories: images seekr_category: images disabled: true - name: seekr videos engine: seekr network: seekr news shortcut: sevid categories: videos seekr_category: videos disabled: true - name: sjp.pwn engine: sjp shortcut: sjp base_url: https://sjp.pwn.pl/ timeout: 5.0 disabled: true - name: stract engine: stract shortcut: str disabled: true - name: svgrepo engine: svgrepo shortcut: svg timeout: 10.0 disabled: true - name: tootfinder engine: tootfinder shortcut: toot - name: voidlinux engine: voidlinux shortcut: void disabled: true - name: wallhaven engine: wallhaven # api_key: 
abcdefghijklmnopqrstuvwxyz shortcut: wh # wikimini: online encyclopedia for children # The fulltext and title parameter is necessary for Wikimini because # sometimes it will not show the results and redirect instead - name: wikimini engine: xpath shortcut: wkmn search_url: https://fr.wikimini.org/w/index.php?search={query}&title=Sp%C3%A9cial%3ASearch&fulltext=Search url_xpath: //li/div[@class="mw-search-result-heading"]/a/@href title_xpath: //li//div[@class="mw-search-result-heading"]/a content_xpath: //li/div[@class="searchresult"] categories: general disabled: true about: website: https://wikimini.org/ wikidata_id: Q3568032 use_official_api: false require_api_key: false results: HTML language: fr - name: wttr.in engine: wttr shortcut: wttr timeout: 9.0 - name: yummly engine: yummly shortcut: yum disabled: true - name: brave engine: brave shortcut: br time_range_support: true paging: true categories: [general, web] brave_category: search # brave_spellcheck: true - name: brave.images engine: brave network: brave shortcut: brimg categories: [images, web] brave_category: images - name: brave.videos engine: brave network: brave shortcut: brvid categories: [videos, web] brave_category: videos - name: brave.news engine: brave network: brave shortcut: brnews categories: news brave_category: news # - name: brave.goggles # engine: brave # network: brave # shortcut: brgog # time_range_support: true # paging: true # categories: [general, web] # brave_category: goggles # Goggles: # required! 
This should be a URL ending in .goggle - name: lib.rs shortcut: lrs engine: lib_rs disabled: true - name: sourcehut shortcut: srht engine: xpath paging: true search_url: https://sr.ht/projects?page={pageno}&search={query} results_xpath: (//div[@class="event-list"])[1]/div[@class="event"] url_xpath: ./h4/a[2]/@href title_xpath: ./h4/a[2] content_xpath: ./p first_page_num: 1 categories: [it, repos] disabled: true about: website: https://sr.ht wikidata_id: Q78514485 official_api_documentation: https://man.sr.ht/ use_official_api: false require_api_key: false results: HTML - name: goo shortcut: goo engine: xpath paging: true search_url: https://search.goo.ne.jp/web.jsp?MT={query}&FR={pageno}0 url_xpath: //div[@class="result"]/p[@class='title fsL1']/a/@href title_xpath: //div[@class="result"]/p[@class='title fsL1']/a content_xpath: //p[contains(@class,'url fsM')]/following-sibling::p first_page_num: 0 categories: [general, web] disabled: true timeout: 4.0 about: website: https://search.goo.ne.jp wikidata_id: Q249044 use_official_api: false require_api_key: false results: HTML language: ja - name: bt4g engine: bt4g shortcut: bt4g - name: pkg.go.dev engine: pkg_go_dev shortcut: pgo disabled: true # Doku engine lets you access to any Doku wiki instance: # A public one or a privete/corporate one. # - name: ubuntuwiki # engine: doku # shortcut: uw # base_url: 'https://doc.ubuntu-fr.org' # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: git grep # engine: command # command: ['git', 'grep', '{{QUERY}}'] # shortcut: gg # tokens: [] # disabled: true # delimiter: # chars: ':' # keys: ['filepath', 'code'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. 
# - name: locate # engine: command # command: ['locate', '{{QUERY}}'] # shortcut: loc # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: find # engine: command # command: ['find', '.', '-name', '{{QUERY}}'] # query_type: path # shortcut: fnd # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: pattern search in files # engine: command # command: ['fgrep', '{{QUERY}}'] # shortcut: fgr # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: regex search in files # engine: command # command: ['grep', '{{QUERY}}'] # shortcut: gr # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] doi_resolvers: oadoi.org: 'https://oadoi.org/' doi.org: 'https://doi.org/' doai.io: 'https://dissem.in/' sci-hub.se: 'https://sci-hub.se/' sci-hub.st: 'https://sci-hub.st/' sci-hub.ru: 'https://sci-hub.ru/' default_doi_resolver: 'oadoi.org' enabled_plugins: - 'Hostname replace' hostname_replace: '(.*\.)?quora\.com$': false ================================================ FILE: searxng/settings.yml.new ================================================ general: # Debug mode, only for development. 
Is overwritten by ${SEARXNG_DEBUG} debug: false # displayed name instance_name: "SearXNG" # For example: https://example.com/privacy privacypolicy_url: false # use true to use your own donation page written in searx/info/en/donate.md # use false to disable the donation link donation_url: false # mailto:contact@example.com contact_url: false # record stats enable_metrics: true # expose stats in open metrics format at /metrics # leave empty to disable (no password set) # open_metrics: open_metrics: '' brand: new_issue_url: https://github.com/searxng/searxng/issues/new docs_url: https://docs.searxng.org/ public_instances: https://searx.space wiki_url: https://github.com/searxng/searxng/wiki issue_url: https://github.com/searxng/searxng/issues # custom: # maintainer: "Jon Doe" # # Custom entries in the footer: [title]: [link] # links: # Uptime: https://uptime.searxng.org/history/darmarit-org # About: "https://searxng.org" search: # Filter results. 0: None, 1: Moderate, 2: Strict safe_search: 0 # Existing autocomplete backends: "360search", "baidu", "brave", "dbpedia", "duckduckgo", "google", "yandex", # "mwmbl", "naver", "seznam", "sogou", "stract", "swisscows", "quark", "qwant", "wikipedia" - # leave blank to turn it off by default. autocomplete: "" # minimum characters to type before autocompleter starts autocomplete_min: 4 # backend for the favicon near URL in search results. # Available resolvers: "allesedv", "duckduckgo", "google", "yandex" - leave blank to turn it off by default.
favicon_resolver: "" # Default search language - leave blank to detect from browser information or # use codes from 'languages.py' default_lang: "auto" # max_page: 0 # if engine supports paging, 0 means unlimited numbers of pages # Available languages # languages: # - all # - en # - en-US # - de # - it-IT # - fr # - fr-BE # ban time in seconds after engine errors ban_time_on_fail: 5 # max ban time in seconds after engine errors max_ban_time_on_fail: 120 suspended_times: # Engine suspension time after error (in seconds; set to 0 to disable) # For error "Access denied" and "HTTP error [402, 403]" SearxEngineAccessDenied: 86400 # For error "CAPTCHA" SearxEngineCaptcha: 86400 # For error "Too many request" and "HTTP error 429" SearxEngineTooManyRequests: 3600 # Cloudflare CAPTCHA cf_SearxEngineCaptcha: 1296000 cf_SearxEngineAccessDenied: 86400 # ReCAPTCHA recaptcha_SearxEngineCaptcha: 604800 # remove format to deny access, use lower case. # formats: [html, csv, json, rss] formats: - html server: # Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS} port: 8888 bind_address: "127.0.0.1" # public URL of the instance, to ensure correct inbound links. Is overwritten # by ${SEARXNG_BASE_URL}. base_url: false # "http://example.com/location" # rate limit the number of request on the instance, block some bots. # Is overwritten by ${SEARXNG_LIMITER} limiter: false # enable features designed only for public instances. # Is overwritten by ${SEARXNG_PUBLIC_INSTANCE} public_instance: false # If your instance owns a /etc/searxng/settings.yml file, then set the following # values there. secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET} # Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY} image_proxy: false # 1.0 and 1.1 are supported http_protocol_version: "1.0" # POST queries are more secure as they don't show up in history but may cause # problems when using Firefox containers. 
# Is overwritten by ${SEARXNG_METHOD} method: "POST" default_http_headers: X-Content-Type-Options: nosniff X-Download-Options: noopen X-Robots-Tag: noindex, nofollow Referrer-Policy: no-referrer redis: # URL to connect redis database. Is overwritten by ${SEARXNG_REDIS_URL}. # https://docs.searxng.org/admin/settings/settings_redis.html#settings-redis url: false ui: # Custom static path - leave it blank if you didn't change static_path: "" # Is overwritten by ${SEARXNG_STATIC_USE_HASH}. static_use_hash: false # Custom templates path - leave it blank if you didn't change templates_path: "" # query_in_title: When true, the result page's titles contain the query # it decreases the privacy, since the browser can record the page titles. query_in_title: false # infinite_scroll: When true, automatically loads the next page when scrolling to bottom of the current page. infinite_scroll: false # ui theme default_theme: simple # center the results ? center_alignment: false # URL prefix of the internet archive, don't forget trailing slash (if needed). # cache_url: "https://webcache.googleusercontent.com/search?q=cache:" # Default interface locale - leave blank to detect from browser information or # use codes from the 'locales' config section default_locale: "" # Open result links in a new tab by default # results_on_new_tab: false theme_args: # style of simple theme: auto, light, dark simple_style: auto # Perform search immediately if a category selected. # Disable to select multiple categories at once and start the search manually. search_on_category_select: true # Hotkeys: default or vim hotkeys: default # URL formatting: pretty, full or host url_formatting: pretty # Lock arbitrary settings on the preferences page.
# # preferences: # lock: # - categories # - language # - autocomplete # - favicon # - safesearch # - method # - doi_resolver # - locale # - theme # - results_on_new_tab # - infinite_scroll # - search_on_category_select # - method # - image_proxy # - query_in_title # communication with search engines # outgoing: # default timeout in seconds, can be overridden by the engine request_timeout: 3.0 # the maximum timeout in seconds # max_request_timeout: 10.0 # suffix of searx_useragent, could contain information like an email address # to the administrator useragent_suffix: "" # The maximum number of concurrent connections that may be established. pool_connections: 100 # Allow the connection pool to maintain keep-alive connections below this # point. pool_maxsize: 20 # See https://www.python-httpx.org/http2/ enable_http2: true # uncomment below section if you want to use a custom server certificate # see https://www.python-httpx.org/advanced/#changing-the-verification-defaults # and https://www.python-httpx.org/compatibility/#ssl-configuration # verify: ~/.mitmproxy/mitmproxy-ca-cert.cer # # uncomment below section if you want to use a proxy, see: SOCKS proxies # https://2.python-requests.org/en/latest/user/advanced/#proxies # are also supported: see # https://2.python-requests.org/en/latest/user/advanced/#socks # # proxies: # all://: # - http://proxy1:8080 # - http://proxy2:8080 # # using_tor_proxy: true # # Extra seconds to add in order to account for the time taken by the proxy # # extra_proxy_timeout: 10 # # uncomment below section only if you have more than one network interface # which can be the source of outgoing search requests # # source_ips: # - 1.1.1.1 # - 1.1.1.2 # - fe80::/126 # Plugin configuration, for more details see # https://docs.searxng.org/admin/settings/settings_plugins.html # plugins: searx.plugins.calculator.SXNGPlugin: active: true searx.plugins.hash_plugin.SXNGPlugin: active: true searx.plugins.self_info.SXNGPlugin: active: true
searx.plugins.unit_converter.SXNGPlugin: active: true searx.plugins.ahmia_filter.SXNGPlugin: active: true searx.plugins.hostnames.SXNGPlugin: active: true searx.plugins.oa_doi_rewrite.SXNGPlugin: active: false searx.plugins.tor_check.SXNGPlugin: active: false searx.plugins.tracker_url_remover.SXNGPlugin: active: false # Configuration of the "Hostnames plugin": # # hostnames: # replace: # '(.*\.)?youtube\.com$': 'yt.example.com' # '(.*\.)?youtu\.be$': 'yt.example.com' # '(.*\.)?reddit\.com$': 'teddit.example.com' # '(.*\.)?redd\.it$': 'teddit.example.com' # '(www\.)?twitter\.com$': 'nitter.example.com' # remove: # - '(.*\.)?facebook.com$' # low_priority: # - '(.*\.)?google(\..*)?$' # high_priority: # - '(.*\.)?wikipedia.org$' # # Alternatively you can use external files for configuring the "Hostnames plugin": # # hostnames: # replace: 'rewrite-hosts.yml' # # Content of 'rewrite-hosts.yml' (place the file in the same directory as 'settings.yml'): # '(.*\.)?youtube\.com$': 'yt.example.com' # '(.*\.)?youtu\.be$': 'yt.example.com' # checker: # disable checker when in debug mode off_when_debug: true # use "scheduling: false" to disable scheduling # scheduling: interval or int # to activate the scheduler: # * uncomment "scheduling" section # * add "cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1" # to your uwsgi.ini # scheduling: # start_after: [300, 1800] # delay to start the first run of the checker # every: [86400, 90000] # how often the checker runs # additional tests: only for the YAML anchors (see the engines section) # additional_tests: rosebud: &test_rosebud matrix: query: rosebud lang: en result_container: - not_empty - ['one_title_contains', 'citizen kane'] test: - unique_results android: &test_android matrix: query: ['android'] lang: ['en', 'de', 'fr', 'zh-CN'] result_container: - not_empty - ['one_title_contains', 'google'] test: - unique_results # tests: only for the YAML anchors (see the engines section) tests: infobox: 
&tests_infobox infobox: matrix: query: ["linux", "new york", "bbc"] result_container: - has_infobox categories_as_tabs: general: images: videos: news: map: music: it: science: files: social media: engines: - name: 360search engine: 360search shortcut: 360so disabled: true - name: 360search videos engine: 360search_videos shortcut: 360sov disabled: true - name: 9gag engine: 9gag shortcut: 9g disabled: true - name: acfun engine: acfun shortcut: acf disabled: true - name: adobe stock engine: adobe_stock shortcut: asi categories: ["images"] # https://docs.searxng.org/dev/engines/online/adobe_stock.html adobe_order: relevance adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"] timeout: 6 disabled: true - name: adobe stock video engine: adobe_stock shortcut: asv network: adobe stock categories: ["videos"] adobe_order: relevance adobe_content_types: ["video"] timeout: 6 disabled: true - name: adobe stock audio engine: adobe_stock shortcut: asa network: adobe stock categories: ["music"] adobe_order: relevance adobe_content_types: ["audio"] timeout: 6 disabled: true - name: alexandria engine: json_engine shortcut: alx categories: general paging: true search_url: https://api.alexandria.org/?a=1&q={query}&p={pageno} results_query: results title_query: title url_query: url content_query: snippet timeout: 1.5 disabled: true about: website: https://alexandria.org/ official_api_documentation: https://github.com/alexandria-org/alexandria-api/raw/master/README.md use_official_api: true require_api_key: false results: JSON # - name: astrophysics data system # engine: astrophysics_data_system # sort: asc # weight: 5 # categories: [science] # api_key: your-new-key # shortcut: ads - name: alpine linux packages engine: alpinelinux disabled: true shortcut: alp - name: annas archive engine: annas_archive disabled: true shortcut: aa - name: ansa engine: ansa shortcut: ans disabled: true # - name: annas articles # engine: annas_archive # shortcut: aaa # # 
https://docs.searxng.org/dev/engines/online/annas_archive.html # aa_content: 'magazine' # book_fiction, book_unknown, book_nonfiction, book_comic # aa_ext: 'pdf' # pdf, epub, .. # aa_sort: oldest' # newest, oldest, largest, smallest - name: apk mirror engine: apkmirror timeout: 4.0 shortcut: apkm disabled: true - name: apple app store engine: apple_app_store shortcut: aps disabled: true # Requires Tor - name: ahmia engine: ahmia categories: onions enable_http: true shortcut: ah - name: anaconda engine: xpath paging: true first_page_num: 0 search_url: https://anaconda.org/search?q={query}&page={pageno} results_xpath: //tbody/tr url_xpath: ./td/h5/a[last()]/@href title_xpath: ./td/h5 content_xpath: ./td[h5]/text() categories: it timeout: 6.0 shortcut: conda disabled: true - name: arch linux wiki engine: archlinux shortcut: al - name: nixos wiki engine: mediawiki shortcut: nixw base_url: https://wiki.nixos.org/ search_type: text disabled: true categories: [it, software wikis] - name: artic engine: artic shortcut: arc timeout: 4.0 - name: arxiv engine: arxiv shortcut: arx timeout: 4.0 - name: ask engine: ask shortcut: ask disabled: true # tmp suspended: dh key too small # - name: base # engine: base # shortcut: bs - name: bandcamp engine: bandcamp shortcut: bc categories: music - name: baidu baidu_category: general categories: [general] engine: baidu shortcut: bd disabled: true - name: baidu images baidu_category: images categories: [images] engine: baidu shortcut: bdi disabled: true - name: baidu kaifa baidu_category: it categories: [it] engine: baidu shortcut: bdk disabled: true - name: wikipedia engine: wikipedia shortcut: wp # add "list" to the array to get results in the results list display_type: ["infobox"] categories: [general] - name: bilibili engine: bilibili shortcut: bil disabled: true - name: bing engine: bing shortcut: bi disabled: true - name: bing images engine: bing_images shortcut: bii - name: bing news engine: bing_news shortcut: bin - name: bing 
videos engine: bing_videos shortcut: biv - name: bitchute engine: bitchute shortcut: bit disabled: true - name: bitbucket engine: xpath paging: true search_url: https://bitbucket.org/repo/all/{pageno}?name={query} url_xpath: //article[@class="repo-summary"]//a[@class="repo-link"]/@href title_xpath: //article[@class="repo-summary"]//a[@class="repo-link"] content_xpath: //article[@class="repo-summary"]/p categories: [it, repos] timeout: 4.0 disabled: true shortcut: bb about: website: https://bitbucket.org/ wikidata_id: Q2493781 official_api_documentation: https://developer.atlassian.com/bitbucket use_official_api: false require_api_key: false results: HTML - name: bpb engine: bpb shortcut: bpb disabled: true - name: btdigg engine: btdigg shortcut: bt disabled: true - name: openverse engine: openverse categories: images shortcut: opv - name: media.ccc.de engine: ccc_media shortcut: c3tv # We don't set language: de here because media.ccc.de is not just # for a German audience. It contains many English videos and many # German videos have English subtitles. 
disabled: true - name: chefkoch engine: chefkoch shortcut: chef # to show premium or plus results too: # skip_premium: false # WARNING: links from chinaso.com violate users' privacy # Before activating these engines it is mandatory to read # - https://github.com/searxng/searxng/issues/4694 # - https://docs.searxng.org/dev/engines/online/chinaso.html - name: chinaso news engine: chinaso shortcut: chinaso categories: [news] chinaso_category: news chinaso_news_source: all disabled: true inactive: true - name: chinaso images engine: chinaso network: chinaso news shortcut: chinasoi categories: [images] chinaso_category: images disabled: true inactive: true - name: chinaso videos engine: chinaso network: chinaso news shortcut: chinasov categories: [videos] chinaso_category: videos disabled: true inactive: true - name: cloudflareai engine: cloudflareai shortcut: cfai # get api token and account id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/ cf_account_id: 'your_cf_accout_id' cf_ai_api: 'your_cf_api' # create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/ cf_ai_gateway: 'your_cf_ai_gateway_name' # find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation cf_ai_model: 'ai_model_name' # customize your preferences # cf_ai_model_display_name: 'Cloudflare AI' # cf_ai_model_assistant: 'prompts_for_assistant_role' # cf_ai_model_system: 'prompts_for_system_role' timeout: 30 disabled: true # - name: core.ac.uk # engine: core # categories: science # shortcut: cor # # get your API key from: https://core.ac.uk/api-keys/register/ # api_key: 'unset' - name: cppreference engine: cppreference shortcut: cpp paging: false disabled: true - name: crossref engine: crossref shortcut: cr timeout: 30 disabled: true - name: crowdview engine: json_engine shortcut: cv categories: general paging: false search_url: https://crowdview-next-js.onrender.com/api/search-v3?query={query} results_query:
results url_query: link title_query: title content_query: snippet title_html_to_text: true content_html_to_text: true disabled: true about: website: https://crowdview.ai/ - name: yep engine: yep shortcut: yep categories: general search_type: web timeout: 5 disabled: true - name: yep images engine: yep shortcut: yepi categories: images search_type: images disabled: true - name: yep news engine: yep shortcut: yepn categories: news search_type: news disabled: true - name: currency engine: currency_convert categories: general shortcut: cc - name: deezer engine: deezer shortcut: dz disabled: true - name: destatis engine: destatis shortcut: destat disabled: true - name: deviantart engine: deviantart shortcut: da timeout: 3.0 - name: ddg definitions engine: duckduckgo_definitions shortcut: ddd weight: 2 disabled: true tests: *tests_infobox # cloudflare protected # - name: digbt # engine: digbt # shortcut: dbt # timeout: 6.0 # disabled: true - name: docker hub engine: docker_hub shortcut: dh categories: [it, packages] - name: encyclosearch engine: json_engine shortcut: es categories: general paging: true search_url: https://encyclosearch.org/encyclosphere/search?q={query}&page={pageno}&resultsPerPage=15 results_query: Results url_query: SourceURL title_query: Title content_query: Description disabled: true about: website: https://encyclosearch.org official_api_documentation: https://encyclosearch.org/docs/#/rest-api use_official_api: true require_api_key: false results: JSON - name: erowid engine: xpath paging: true first_page_num: 0 page_size: 30 search_url: https://www.erowid.org/search.php?q={query}&s={pageno} url_xpath: //dl[@class="results-list"]/dt[@class="result-title"]/a/@href title_xpath: //dl[@class="results-list"]/dt[@class="result-title"]/a/text() content_xpath: //dl[@class="results-list"]/dd[@class="result-details"] categories: [] shortcut: ew disabled: true about: website: https://www.erowid.org/ wikidata_id: Q1430691 official_api_documentation: 
use_official_api: false require_api_key: false results: HTML # - name: elasticsearch # shortcut: els # engine: elasticsearch # base_url: http://localhost:9200 # username: elastic # password: changeme # index: my-index # enable_http: true # # available options: match, simple_query_string, term, terms, custom # query_type: match # # if query_type is set to custom, provide your query here # # custom_query_json: {"query":{"match_all": {}}} # # show_metadata: false # disabled: true - name: wikidata engine: wikidata shortcut: wd timeout: 3.0 weight: 2 # add "list" to the array to get results in the results list display_type: ["infobox"] tests: *tests_infobox categories: [general] - name: duckduckgo engine: duckduckgo shortcut: ddg - name: duckduckgo images engine: duckduckgo_extra categories: [images, web] ddg_category: images shortcut: ddi disabled: true - name: duckduckgo videos engine: duckduckgo_extra categories: [videos, web] ddg_category: videos shortcut: ddv disabled: true - name: duckduckgo news engine: duckduckgo_extra categories: [news, web] ddg_category: news shortcut: ddn disabled: true - name: duckduckgo weather engine: duckduckgo_weather shortcut: ddw disabled: true - name: apple maps engine: apple_maps shortcut: apm disabled: true timeout: 5.0 - name: emojipedia engine: emojipedia timeout: 4.0 shortcut: em disabled: true - name: tineye engine: tineye shortcut: tin timeout: 9.0 disabled: true - name: etymonline engine: xpath paging: true search_url: https://etymonline.com/search?page={pageno}&q={query} url_xpath: //a[contains(@class, "word__name--")]/@href title_xpath: //a[contains(@class, "word__name--")] content_xpath: //section[contains(@class, "word__defination")] first_page_num: 1 shortcut: et categories: [dictionaries] about: website: https://www.etymonline.com/ wikidata_id: Q1188617 official_api_documentation: use_official_api: false require_api_key: false results: HTML # - name: ebay # engine: ebay # shortcut: eb # base_url: 'https://www.ebay.com' # 
disabled: true # timeout: 5 - name: 1x engine: www1x shortcut: 1x timeout: 3.0 disabled: true - name: fdroid engine: fdroid shortcut: fd disabled: true - name: findthatmeme engine: findthatmeme shortcut: ftm disabled: true - name: flickr categories: images shortcut: fl # You can use the engine using the official stable API, but you need an API # key, see: https://www.flickr.com/services/apps/create/ # engine: flickr # api_key: 'apikey' # required! # Or you can use the html non-stable engine, activated by default engine: flickr_noapi - name: free software directory engine: mediawiki shortcut: fsd categories: [it, software wikis] base_url: https://directory.fsf.org/ search_type: title timeout: 5.0 disabled: true about: website: https://directory.fsf.org/ wikidata_id: Q2470288 # - name: freesound # engine: freesound # shortcut: fnd # disabled: true # timeout: 15.0 # API key required, see: https://freesound.org/docs/api/overview.html # api_key: MyAPIkey - name: frinkiac engine: frinkiac shortcut: frk disabled: true - name: fyyd engine: fyyd shortcut: fy timeout: 8.0 disabled: true - name: geizhals engine: geizhals shortcut: geiz disabled: true - name: genius engine: genius shortcut: gen - name: gentoo engine: mediawiki shortcut: ge categories: ["it", "software wikis"] base_url: "https://wiki.gentoo.org/" api_path: "api.php" search_type: text timeout: 10 - name: gitlab engine: gitlab base_url: https://gitlab.com shortcut: gl disabled: true about: website: https://gitlab.com/ wikidata_id: Q16639197 # - name: gnome # engine: gitlab # base_url: https://gitlab.gnome.org # shortcut: gn # about: # website: https://gitlab.gnome.org # wikidata_id: Q44316 - name: github engine: github shortcut: gh - name: codeberg # https://docs.searxng.org/dev/engines/online/gitea.html engine: gitea base_url: https://codeberg.org shortcut: cb disabled: true - name: gitea.com engine: gitea base_url: https://gitea.com shortcut: gitea disabled: true - name: goodreads engine: goodreads shortcut: 
good timeout: 4.0 disabled: true - name: google engine: google shortcut: go # additional_tests: # android: *test_android - name: google images engine: google_images shortcut: goi # additional_tests: # android: *test_android # dali: # matrix: # query: ['Dali Christ'] # lang: ['en', 'de', 'fr', 'zh-CN'] # result_container: # - ['one_title_contains', 'Salvador'] - name: google news engine: google_news shortcut: gon # additional_tests: # android: *test_android - name: google videos engine: google_videos shortcut: gov # additional_tests: # android: *test_android - name: google scholar engine: google_scholar shortcut: gos - name: google play apps engine: google_play categories: [files, apps] shortcut: gpa play_categ: apps disabled: true - name: google play movies engine: google_play categories: videos shortcut: gpm play_categ: movies disabled: true - name: material icons engine: material_icons shortcut: mi disabled: true - name: habrahabr engine: xpath paging: true search_url: https://habr.com/en/search/page{pageno}/?q={query} results_xpath: //article[contains(@class, "tm-articles-list__item")] url_xpath: .//a[@class="tm-title__link"]/@href title_xpath: .//a[@class="tm-title__link"] content_xpath: .//div[contains(@class, "article-formatted-body")] categories: it timeout: 4.0 disabled: true shortcut: habr about: website: https://habr.com/ wikidata_id: Q4494434 official_api_documentation: https://habr.com/en/docs/help/api/ use_official_api: false require_api_key: false results: HTML - name: hackernews engine: hackernews shortcut: hn disabled: true - name: hex engine: hex shortcut: hex disabled: true # Valid values: name inserted_at updated_at total_downloads recent_downloads sort_criteria: "recent_downloads" page_size: 10 - name: crates.io engine: crates shortcut: crates disabled: true timeout: 6.0 - name: hoogle engine: xpath search_url: https://hoogle.haskell.org/?hoogle={query} results_xpath: '//div[@class="result"]' title_xpath: './/div[@class="ans"]//a' url_xpath: 
'.//div[@class="ans"]//a/@href' content_xpath: './/div[@class="from"]' page_size: 20 categories: [it, packages] shortcut: ho about: website: https://hoogle.haskell.org/ wikidata_id: Q34010 official_api_documentation: https://hackage.haskell.org/api use_official_api: false require_api_key: false results: JSON - name: il post engine: il_post shortcut: pst disabled: true - name: huggingface engine: huggingface shortcut: hf disabled: true - name: huggingface datasets huggingface_endpoint: datasets engine: huggingface shortcut: hfd disabled: true - name: huggingface spaces huggingface_endpoint: spaces engine: huggingface shortcut: hfs disabled: true - name: imdb engine: imdb shortcut: imdb timeout: 6.0 disabled: true - name: imgur engine: imgur shortcut: img disabled: true - name: ina engine: ina shortcut: in timeout: 6.0 disabled: true # - name: invidious # engine: invidious # # if you want to use invidious with SearXNG you should setup one locally # # https://github.com/searxng/searxng/issues/2722#issuecomment-2884993248 # base_url: # - https://invidious.example1.com # - https://invidious.example2.com # shortcut: iv # timeout: 3.0 - name: ipernity engine: ipernity shortcut: ip disabled: true - name: iqiyi engine: iqiyi shortcut: iq disabled: true - name: jisho engine: jisho shortcut: js timeout: 3.0 disabled: true - name: kickass engine: kickass base_url: - https://kickasstorrents.to - https://kickasstorrents.cr - https://kickasstorrent.cr - https://kickass.sx - https://kat.am shortcut: kc timeout: 4.0 - name: lemmy communities engine: lemmy lemmy_type: Communities shortcut: leco - name: lemmy users engine: lemmy network: lemmy communities lemmy_type: Users shortcut: leus - name: lemmy posts engine: lemmy network: lemmy communities lemmy_type: Posts shortcut: lepo - name: lemmy comments engine: lemmy network: lemmy communities lemmy_type: Comments shortcut: lecom - name: library genesis engine: xpath # search_url: https://libgen.is/search.php?req={query} search_url: 
https://libgen.rs/search.php?req={query} url_xpath: //a[contains(@href,"book/index.php?md5")]/@href title_xpath: //a[contains(@href,"book/")]/text()[1] content_xpath: //td/a[1][contains(@href,"=author")]/text() categories: files timeout: 7.0 disabled: true shortcut: lg about: website: https://libgen.fun/ wikidata_id: Q22017206 official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: z-library engine: zlibrary shortcut: zlib categories: files timeout: 7.0 disabled: true - name: library of congress engine: loc shortcut: loc categories: images disabled: true - name: libretranslate engine: libretranslate # https://github.com/LibreTranslate/LibreTranslate?tab=readme-ov-file#mirrors base_url: - https://libretranslate.com/translate # api_key: abc123 shortcut: lt disabled: true - name: lingva engine: lingva shortcut: lv # set lingva instance in url, by default it will use the official instance # url: https://lingva.thedaviddelta.com - name: lobste.rs engine: xpath search_url: https://lobste.rs/search?q={query}&what=stories&order=relevance results_xpath: //li[contains(@class, "story")] url_xpath: .//a[@class="u-url"]/@href title_xpath: .//a[@class="u-url"] content_xpath: .//a[@class="domain"] categories: it shortcut: lo timeout: 5.0 disabled: true about: website: https://lobste.rs/ wikidata_id: Q60762874 official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: mastodon users engine: mastodon mastodon_type: accounts base_url: https://mastodon.social shortcut: mau - name: mastodon hashtags engine: mastodon mastodon_type: hashtags base_url: https://mastodon.social shortcut: mah # - name: matrixrooms # engine: mrs # # https://docs.searxng.org/dev/engines/online/mrs.html # # base_url: https://mrs-api-host # shortcut: mtrx # disabled: true - name: mdn shortcut: mdn engine: json_engine categories: [it] paging: true search_url: https://developer.mozilla.org/api/v1/search?q={query}&page={pageno} 
results_query: documents url_query: mdn_url url_prefix: https://developer.mozilla.org title_query: title content_query: summary about: website: https://developer.mozilla.org wikidata_id: Q3273508 official_api_documentation: null use_official_api: false require_api_key: false results: JSON - name: metacpan engine: metacpan shortcut: cpan disabled: true number_of_results: 20 # https://docs.searxng.org/dev/engines/offline/search-indexer-engines.html#module-searx.engines.meilisearch # - name: meilisearch # engine: meilisearch # shortcut: mes # enable_http: true # base_url: http://localhost:7700 # index: my-index # auth_key: Bearer XXXX - name: microsoft learn engine: microsoft_learn shortcut: msl disabled: true - name: mixcloud engine: mixcloud shortcut: mc # MongoDB engine # Required dependency: pymongo # - name: mymongo # engine: mongodb # shortcut: md # exact_match_only: false # host: '127.0.0.1' # port: 27017 # enable_http: true # results_per_page: 20 # database: 'business' # collection: 'reviews' # name of the db collection # key: 'name' # key in the collection to search for - name: mozhi engine: mozhi base_url: - https://mozhi.aryak.me - https://translate.bus-hit.me - https://nyc1.mz.ggtyler.dev # mozhi_engine: google - see https://mozhi.aryak.me for supported engines timeout: 4.0 shortcut: mz disabled: true - name: mwmbl engine: mwmbl # api_url: https://api.mwmbl.org shortcut: mwm disabled: true - name: niconico engine: niconico shortcut: nico disabled: true - name: npm engine: npm shortcut: npm timeout: 5.0 disabled: true - name: nyaa engine: nyaa shortcut: nt disabled: true - name: mankier engine: json_engine search_url: https://www.mankier.com/api/v2/mans/?q={query} results_query: results url_query: url title_query: name content_query: description categories: it shortcut: man about: website: https://www.mankier.com/ official_api_documentation: https://www.mankier.com/api use_official_api: true require_api_key: false results: JSON # 
https://docs.searxng.org/dev/engines/online/mullvad_leta.html - name: mullvadleta engine: mullvad_leta disabled: true leta_engine: google categories: [general, web] shortcut: ml - name: mullvadleta brave engine: mullvad_leta network: mullvadleta disabled: true leta_engine: brave categories: [general, web] shortcut: mlb - name: odysee engine: odysee shortcut: od disabled: true - name: ollama engine: ollama shortcut: ollama disabled: true - name: openairedatasets engine: json_engine paging: true search_url: https://api.openaire.eu/search/datasets?format=json&page={pageno}&size=10&title={query} results_query: response/results/result url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ title_query: metadata/oaf:entity/oaf:result/title/$ content_query: metadata/oaf:entity/oaf:result/description/$ content_html_to_text: true categories: "science" shortcut: oad timeout: 5.0 about: website: https://www.openaire.eu/ wikidata_id: Q25106053 official_api_documentation: https://api.openaire.eu/ use_official_api: false require_api_key: false results: JSON - name: openairepublications engine: json_engine paging: true search_url: https://api.openaire.eu/search/publications?format=json&page={pageno}&size=10&title={query} results_query: response/results/result url_query: metadata/oaf:entity/oaf:result/children/instance/webresource/url/$ title_query: metadata/oaf:entity/oaf:result/title/$ content_query: metadata/oaf:entity/oaf:result/description/$ content_html_to_text: true categories: science shortcut: oap timeout: 5.0 about: website: https://www.openaire.eu/ wikidata_id: Q25106053 official_api_documentation: https://api.openaire.eu/ use_official_api: false require_api_key: false results: JSON - name: openclipart engine: openclipart shortcut: ocl inactive: true disabled: true timeout: 30 - name: openlibrary engine: openlibrary shortcut: ol timeout: 5 disabled: true - name: openmeteo engine: open_meteo shortcut: om disabled: true # - name: opensemanticsearch # 
engine: opensemantic # shortcut: oss # base_url: 'http://localhost:8983/solr/opensemanticsearch/' - name: openstreetmap engine: openstreetmap shortcut: osm - name: openrepos engine: xpath paging: true search_url: https://openrepos.net/search/node/{query}?page={pageno} url_xpath: //li[@class="search-result"]//h3[@class="title"]/a/@href title_xpath: //li[@class="search-result"]//h3[@class="title"]/a content_xpath: //li[@class="search-result"]//div[@class="search-snippet-info"]//p[@class="search-snippet"] categories: files timeout: 4.0 disabled: true shortcut: or about: website: https://openrepos.net/ wikidata_id: official_api_documentation: use_official_api: false require_api_key: false results: HTML - name: packagist engine: json_engine paging: true search_url: https://packagist.org/search.json?q={query}&page={pageno} results_query: results url_query: url title_query: name content_query: description categories: [it, packages] disabled: true timeout: 5.0 shortcut: pack about: website: https://packagist.org wikidata_id: Q108311377 official_api_documentation: https://packagist.org/apidoc use_official_api: true require_api_key: false results: JSON - name: pdbe engine: pdbe shortcut: pdb # Hide obsolete PDB entries. 
Default is not to hide obsolete structures # hide_obsolete: false - name: photon engine: photon shortcut: ph - name: pinterest engine: pinterest shortcut: pin - name: piped engine: piped shortcut: ppd categories: videos piped_filter: videos timeout: 3.0 # URL to use as link and for embeds frontend_url: https://srv.piped.video # Instance will be selected randomly, for more see https://piped-instances.kavin.rocks/ backend_url: - https://pipedapi.adminforge.de - https://pipedapi.nosebs.ru - https://pipedapi.ducks.party - https://pipedapi.reallyaweso.me - https://api.piped.private.coffee - https://pipedapi.darkness.services - name: piped.music engine: piped network: piped shortcut: ppdm categories: music piped_filter: music_songs timeout: 3.0 - name: piratebay engine: piratebay shortcut: tpb # You may need to change this URL to a proxy if piratebay is blocked in your # country url: https://thepiratebay.org/ timeout: 3.0 - name: pixiv shortcut: pv engine: pixiv disabled: true inactive: true pixiv_image_proxies: - https://pximg.example.org # A proxy is required to load the images. Hosting an image proxy server # for Pixiv: # --> https://pixivfe.pages.dev/hosting-image-proxy-server/ # Proxies from public instances. Ask the public instances owners if they # agree to receive traffic from SearXNG! 
# --> https://codeberg.org/VnPower/PixivFE#instances # --> https://github.com/searxng/searxng/pull/3192#issuecomment-1941095047 # image proxy of https://pixiv.cat # - https://i.pixiv.cat # image proxy of https://www.pixiv.pics # - https://pximg.cocomi.eu.org # image proxy of https://pixivfe.exozy.me # - https://pximg.exozy.me # image proxy of https://pixivfe.ducks.party # - https://pixiv.ducks.party # image proxy of https://pixiv.perennialte.ch # - https://pximg.perennialte.ch - name: podcastindex engine: podcastindex shortcut: podcast # Required dependency: psychopg2 # - name: postgresql # engine: postgresql # database: postgres # username: postgres # password: postgres # limit: 10 # query_str: 'SELECT * from my_table WHERE my_column = %(query)s' # shortcut : psql - name: presearch engine: presearch search_type: search categories: [general, web] shortcut: ps timeout: 4.0 disabled: true - name: presearch images engine: presearch network: presearch search_type: images categories: [images, web] timeout: 4.0 shortcut: psimg disabled: true - name: presearch videos engine: presearch network: presearch search_type: videos categories: [general, web] timeout: 4.0 shortcut: psvid disabled: true - name: presearch news engine: presearch network: presearch search_type: news categories: [news, web] timeout: 4.0 shortcut: psnews disabled: true - name: pub.dev engine: xpath shortcut: pd search_url: https://pub.dev/packages?q={query}&page={pageno} paging: true results_xpath: //div[contains(@class,"packages-item")] url_xpath: ./div/h3/a/@href title_xpath: ./div/h3/a content_xpath: ./div/div/div[contains(@class,"packages-description")]/span categories: [packages, it] timeout: 3.0 disabled: true first_page_num: 1 about: website: https://pub.dev/ official_api_documentation: https://pub.dev/help/api use_official_api: false require_api_key: false results: HTML - name: public domain image archive engine: public_domain_image_archive shortcut: pdia - name: pubmed engine: pubmed shortcut: 
pub timeout: 3.0 - name: pypi shortcut: pypi engine: pypi - name: quark quark_category: general categories: [general] engine: quark shortcut: qk disabled: true - name: quark images quark_category: images categories: [images] engine: quark shortcut: qki disabled: true - name: qwant qwant_categ: web engine: qwant shortcut: qw categories: [general, web] disabled: true additional_tests: rosebud: *test_rosebud - name: qwant news qwant_categ: news engine: qwant shortcut: qwn categories: news network: qwant - name: qwant images qwant_categ: images engine: qwant shortcut: qwi categories: [images, web] network: qwant - name: qwant videos qwant_categ: videos engine: qwant shortcut: qwv categories: [videos, web] network: qwant # - name: library # engine: recoll # shortcut: lib # base_url: 'https://recoll.example.org/' # search_dir: '' # mount_prefix: /export # dl_prefix: 'https://download.example.org' # timeout: 30.0 # categories: files # disabled: true # - name: recoll library reference # engine: recoll # base_url: 'https://recoll.example.org/' # search_dir: reference # mount_prefix: /export # dl_prefix: 'https://download.example.org' # shortcut: libr # timeout: 30.0 # categories: files # disabled: true - name: radio browser engine: radio_browser shortcut: rb - name: reddit engine: reddit shortcut: re page_size: 25 disabled: true - name: reuters engine: reuters shortcut: reu # https://docs.searxng.org/dev/engines/online/reuters.html # sort_order = "relevance" - name: right dao engine: xpath paging: true page_size: 12 search_url: https://rightdao.com/search?q={query}&start={pageno} results_xpath: //div[contains(@class, "description")] url_xpath: ../div[contains(@class, "title")]/a/@href title_xpath: ../div[contains(@class, "title")] content_xpath: . 
categories: general shortcut: rd disabled: true about: website: https://rightdao.com/ use_official_api: false require_api_key: false results: HTML - name: rottentomatoes engine: rottentomatoes shortcut: rt disabled: true # Required dependency: redis # - name: myredis # shortcut : rds # engine: redis_server # exact_match_only: false # host: '127.0.0.1' # port: 6379 # enable_http: true # password: '' # db: 0 # tmp suspended: bad certificate # - name: scanr structures # shortcut: scs # engine: scanr_structures # disabled: true - name: searchmysite engine: xpath shortcut: sms categories: general paging: true search_url: https://searchmysite.net/search/?q={query}&page={pageno} results_xpath: //div[contains(@class,'search-result')] url_xpath: .//a[contains(@class,'result-link')]/@href title_xpath: .//span[contains(@class,'result-title-txt')]/text() content_xpath: ./p[@id='result-hightlight'] disabled: true about: website: https://searchmysite.net - name: selfhst icons engine: selfhst shortcut: si inactive: true disabled: true - name: sepiasearch engine: sepiasearch shortcut: sep - name: sogou engine: sogou shortcut: sogou disabled: true - name: sogou images engine: sogou_images shortcut: sogoui disabled: true - name: sogou videos engine: sogou_videos shortcut: sogouv disabled: true - name: sogou wechat engine: sogou_wechat shortcut: sogouw disabled: true - name: soundcloud engine: soundcloud shortcut: sc - name: stackoverflow engine: stackexchange shortcut: st api_site: 'stackoverflow' categories: [it, q&a] - name: askubuntu engine: stackexchange shortcut: ubuntu api_site: 'askubuntu' categories: [it, q&a] - name: superuser engine: stackexchange shortcut: su api_site: 'superuser' categories: [it, q&a] - name: discuss.python engine: discourse shortcut: dpy base_url: 'https://discuss.python.org' categories: [it, q&a] disabled: true - name: caddy.community engine: discourse shortcut: caddy base_url: 'https://caddy.community' categories: [it, q&a] disabled: true - name: 
pi-hole.community engine: discourse shortcut: pi categories: [it, q&a] base_url: 'https://discourse.pi-hole.net' disabled: true - name: searchcode code engine: searchcode_code shortcut: scc disabled: true # - name: searx # engine: searx_engine # shortcut: se # instance_urls : # - http://127.0.0.1:8888/ # - ... # disabled: true - name: semantic scholar engine: semantic_scholar disabled: true shortcut: se # Spotify needs API credentials # - name: spotify # engine: spotify # shortcut: stf # api_client_id: ******* # api_client_secret: ******* # - name: solr # engine: solr # shortcut: slr # base_url: http://localhost:8983 # collection: collection_name # sort: '' # sorting: asc or desc # field_list: '' # comma separated list of field names to display on the UI # default_fields: '' # default field to query # query_fields: '' # query fields # enable_http: true # - name: springer nature # engine: springer # # get your API key from: https://dev.springernature.com/signup # # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601" # api_key: 'unset' # shortcut: springer # timeout: 15.0 - name: startpage engine: startpage shortcut: sp startpage_categ: web categories: [general, web] additional_tests: rosebud: *test_rosebud - name: startpage news engine: startpage startpage_categ: news categories: [news, web] shortcut: spn - name: startpage images engine: startpage startpage_categ: images categories: [images, web] shortcut: spi - name: steam engine: steam shortcut: stm disabled: true - name: tokyotoshokan engine: tokyotoshokan shortcut: tt timeout: 6.0 disabled: true - name: solidtorrents engine: solidtorrents shortcut: solid timeout: 4.0 base_url: - https://solidtorrents.to - https://bitsearch.to # For this demo of the sqlite engine download: # https://liste.mediathekview.de/filmliste-v2.db.bz2 # and unpack into searx/data/filmliste-v2.db # Query to test: "!mediathekview concert" # # - name: mediathekview # engine: sqlite # shortcut: mediathekview # categories: 
[general, videos] # result_type: MainResult # database: searx/data/filmliste-v2.db # query_str: >- # SELECT title || ' (' || time(duration, 'unixepoch') || ')' AS title, # COALESCE( NULLIF(url_video_hd,''), NULLIF(url_video_sd,''), url_video) AS url, # description AS content # FROM film # WHERE title LIKE :wildcard OR description LIKE :wildcard # ORDER BY duration DESC - name: tagesschau engine: tagesschau # when set to false, display URLs from Tagesschau, and not the actual source # (e.g. NDR, WDR, SWR, HR, ...) use_source_url: true shortcut: ts disabled: true - name: tmdb engine: xpath paging: true categories: movies search_url: https://www.themoviedb.org/search?page={pageno}&query={query} results_xpath: //div[contains(@class,"movie") or contains(@class,"tv")]//div[contains(@class,"card")] url_xpath: .//div[contains(@class,"poster")]/a/@href thumbnail_xpath: .//img/@src title_xpath: .//div[contains(@class,"title")]//h2 content_xpath: .//div[contains(@class,"overview")] shortcut: tm disabled: true # Requires Tor - name: torch engine: xpath paging: true search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and results_xpath: //table//tr url_xpath: ./td[2]/a title_xpath: ./td[2]/b content_xpath: ./td[2]/small categories: onions enable_http: true shortcut: tch # torznab engine lets you query any torznab compatible indexer. Using this # engine in combination with Jackett opens the possibility to query a lot of # public and private indexers directly from SearXNG. 
More details at: # https://docs.searxng.org/dev/engines/online/torznab.html # # - name: Torznab EZTV # engine: torznab # shortcut: eztv # base_url: http://localhost:9117/api/v2.0/indexers/eztv/results/torznab # enable_http: true # if using localhost # api_key: xxxxxxxxxxxxxxx # show_magnet_links: true # show_torrent_files: false # # https://github.com/Jackett/Jackett/wiki/Jackett-Categories # torznab_categories: # optional # - 2000 # - 5000 # tmp suspended - too slow, too many errors # - name: urbandictionary # engine : xpath # search_url : https://www.urbandictionary.com/define.php?term={query} # url_xpath : //*[@class="word"]/@href # title_xpath : //*[@class="def-header"] # content_xpath: //*[@class="meaning"] # shortcut: ud - name: unsplash engine: unsplash shortcut: us - name: yandex engine: yandex categories: general search_type: web shortcut: yd disabled: true inactive: true - name: yandex images engine: yandex categories: images search_type: images shortcut: ydi disabled: true inactive: true - name: yandex music engine: yandex_music shortcut: ydm disabled: true # https://yandex.com/support/music/access.html inactive: true - name: yahoo engine: yahoo shortcut: yh disabled: true - name: yahoo news engine: yahoo_news shortcut: yhn - name: youtube shortcut: yt # You can use the engine using the official stable API, but you need an API # key See: https://console.developers.google.com/project # # engine: youtube_api # api_key: 'apikey' # required! 
# # Or you can use the html non-stable engine, activated by default engine: youtube_noapi - name: dailymotion engine: dailymotion shortcut: dm - name: vimeo engine: vimeo shortcut: vm - name: wiby engine: json_engine paging: true search_url: https://wiby.me/json/?q={query}&p={pageno} url_query: URL title_query: Title content_query: Snippet categories: [general, web] shortcut: wib disabled: true about: website: https://wiby.me/ - name: wikibooks engine: mediawiki weight: 0.5 shortcut: wb categories: [general, wikimedia] base_url: "https://{language}.wikibooks.org/" search_type: text disabled: true about: website: https://www.wikibooks.org/ wikidata_id: Q367 - name: wikinews engine: mediawiki shortcut: wn categories: [news, wikimedia] base_url: "https://{language}.wikinews.org/" search_type: text srsort: create_timestamp_desc about: website: https://www.wikinews.org/ wikidata_id: Q964 - name: wikiquote engine: mediawiki weight: 0.5 shortcut: wq categories: [general, wikimedia] base_url: "https://{language}.wikiquote.org/" search_type: text disabled: true additional_tests: rosebud: *test_rosebud about: website: https://www.wikiquote.org/ wikidata_id: Q369 - name: wikisource engine: mediawiki weight: 0.5 shortcut: ws categories: [general, wikimedia] base_url: "https://{language}.wikisource.org/" search_type: text disabled: true about: website: https://www.wikisource.org/ wikidata_id: Q263 - name: wikispecies engine: mediawiki shortcut: wsp categories: [general, science, wikimedia] base_url: "https://species.wikimedia.org/" search_type: text disabled: true about: website: https://species.wikimedia.org/ wikidata_id: Q13679 tests: wikispecies: matrix: query: "Campbell, L.I. et al. 
2011: MicroRNAs" lang: en result_container: - not_empty - ['one_title_contains', 'Tardigrada'] test: - unique_results - name: wiktionary engine: mediawiki shortcut: wt categories: [dictionaries, wikimedia] base_url: "https://{language}.wiktionary.org/" search_type: text about: website: https://www.wiktionary.org/ wikidata_id: Q151 - name: wikiversity engine: mediawiki weight: 0.5 shortcut: wv categories: [general, wikimedia] base_url: "https://{language}.wikiversity.org/" search_type: text disabled: true about: website: https://www.wikiversity.org/ wikidata_id: Q370 - name: wikivoyage engine: mediawiki weight: 0.5 shortcut: wy categories: [general, wikimedia] base_url: "https://{language}.wikivoyage.org/" search_type: text disabled: true about: website: https://www.wikivoyage.org/ wikidata_id: Q373 - name: wikicommons.images engine: wikicommons shortcut: wc categories: images search_type: images number_of_results: 10 - name: wikicommons.videos engine: wikicommons shortcut: wcv categories: videos search_type: videos number_of_results: 10 - name: wikicommons.audio engine: wikicommons shortcut: wca categories: music search_type: audio number_of_results: 10 - name: wikicommons.files engine: wikicommons shortcut: wcf categories: files search_type: files number_of_results: 10 - name: wolframalpha shortcut: wa # You can use the engine using the official stable API, but you need an API # key. 
See: https://products.wolframalpha.com/api/ # # engine: wolframalpha_api # api_key: '' # # Or you can use the html non-stable engine, activated by default engine: wolframalpha_noapi timeout: 6.0 categories: general disabled: true - name: dictzone engine: dictzone shortcut: dc - name: mymemory translated engine: translated shortcut: tl timeout: 5.0 # You can use without an API key, but you are limited to 1000 words/day # See: https://mymemory.translated.net/doc/usagelimits.php # api_key: '' # Required dependency: mysql-connector-python # - name: mysql # engine: mysql_server # database: mydatabase # username: user # password: pass # limit: 10 # query_str: 'SELECT * from mytable WHERE fieldname=%(query)s' # shortcut: mysql # Required dependency: mariadb # - name: mariadb # engine: mariadb_server # database: mydatabase # username: user # password: pass # limit: 10 # query_str: 'SELECT * from mytable WHERE fieldname=%(query)s' # shortcut: mdb - name: 1337x engine: 1337x shortcut: 1337x disabled: true - name: duden engine: duden shortcut: du disabled: true - name: seznam shortcut: szn engine: seznam disabled: true # - name: deepl # engine: deepl # shortcut: dpl # # You can use the engine using the official stable API, but you need an API key # # See: https://www.deepl.com/pro-api?cta=header-pro-api # api_key: '' # required! 
# timeout: 5.0 # disabled: true - name: mojeek shortcut: mjk engine: mojeek categories: [general, web] disabled: true - name: mojeek images shortcut: mjkimg engine: mojeek categories: [images, web] search_type: images paging: false disabled: true - name: mojeek news shortcut: mjknews engine: mojeek categories: [news, web] search_type: news paging: false disabled: true - name: moviepilot engine: moviepilot shortcut: mp disabled: true - name: naver categories: [general, web] engine: naver shortcut: nvr disabled: true - name: naver images naver_category: images categories: [images] engine: naver shortcut: nvri disabled: true - name: naver news naver_category: news categories: [news] engine: naver shortcut: nvrn disabled: true - name: naver videos naver_category: videos categories: [videos] engine: naver shortcut: nvrv disabled: true - name: rubygems shortcut: rbg engine: xpath paging: true search_url: https://rubygems.org/search?page={pageno}&query={query} results_xpath: /html/body/main/div/a[@class="gems__gem"] url_xpath: ./@href title_xpath: ./span/h2 content_xpath: ./span/p suggestion_xpath: /html/body/main/div/div[@class="search__suggestions"]/p/a first_page_num: 1 categories: [it, packages] disabled: true about: website: https://rubygems.org/ wikidata_id: Q1853420 official_api_documentation: https://guides.rubygems.org/rubygems-org-api/ use_official_api: false require_api_key: false results: HTML - name: peertube engine: peertube shortcut: ptb paging: true # alternatives see: https://instances.joinpeertube.org/instances # base_url: https://tube.4aem.com categories: videos disabled: true timeout: 6.0 - name: mediathekviewweb engine: mediathekviewweb shortcut: mvw disabled: true - name: yacy # https://docs.searxng.org/dev/engines/online/yacy.html engine: yacy categories: general search_type: text base_url: - https://yacy.searchlab.eu # see https://github.com/searxng/searxng/pull/3631#issuecomment-2240903027 # - https://search.kyun.li # - 
https://yacy.securecomcorp.eu # - https://yacy.myserv.ca # - https://yacy.nsupdate.info # - https://yacy.electroncash.de shortcut: ya disabled: true # if you aren't using HTTPS for your local yacy instance disable https # enable_http: false search_mode: 'global' # timeout can be reduced in 'local' search mode timeout: 5.0 - name: yacy images engine: yacy network: yacy categories: images search_type: image shortcut: yai disabled: true # timeout can be reduced in 'local' search mode timeout: 5.0 - name: rumble engine: rumble shortcut: ru base_url: https://rumble.com/ paging: true categories: videos disabled: true - name: livespace engine: livespace shortcut: ls categories: videos disabled: true timeout: 5.0 - name: wordnik engine: wordnik shortcut: def categories: [dictionaries] timeout: 5.0 - name: woxikon.de synonyme engine: xpath shortcut: woxi categories: [dictionaries] timeout: 5.0 disabled: true search_url: https://synonyme.woxikon.de/synonyme/{query}.php url_xpath: //div[@class="upper-synonyms"]/a/@href content_xpath: //div[@class="synonyms-list-group"] title_xpath: //div[@class="upper-synonyms"]/a no_result_for_http_status: [404] about: website: https://www.woxikon.de/ wikidata_id: # No Wikidata ID use_official_api: false require_api_key: false results: HTML language: de - name: seekr news engine: seekr shortcut: senews categories: news seekr_category: news disabled: true - name: seekr images engine: seekr network: seekr news shortcut: seimg categories: images seekr_category: images disabled: true - name: seekr videos engine: seekr network: seekr news shortcut: sevid categories: videos seekr_category: videos disabled: true - name: stract engine: stract shortcut: str disabled: true - name: svgrepo engine: svgrepo shortcut: svg timeout: 10.0 disabled: true - name: tootfinder engine: tootfinder shortcut: toot - name: uxwing engine: uxwing shortcut: ux disabled: true - name: voidlinux engine: voidlinux shortcut: void disabled: true - name: wallhaven engine: 
wallhaven # api_key: abcdefghijklmnopqrstuvwxyz shortcut: wh # wikimini: online encyclopedia for children # The fulltext and title parameter is necessary for Wikimini because # sometimes it will not show the results and redirect instead - name: wikimini engine: xpath shortcut: wkmn search_url: https://fr.wikimini.org/w/index.php?search={query}&title=Sp%C3%A9cial%3ASearch&fulltext=Search url_xpath: //li/div[@class="mw-search-result-heading"]/a/@href title_xpath: //li//div[@class="mw-search-result-heading"]/a content_xpath: //li/div[@class="searchresult"] categories: general disabled: true about: website: https://wikimini.org/ wikidata_id: Q3568032 use_official_api: false require_api_key: false results: HTML language: fr - name: wttr.in engine: wttr shortcut: wttr timeout: 9.0 - name: yummly engine: yummly shortcut: yum disabled: true - name: brave engine: brave shortcut: br time_range_support: true paging: true categories: [general, web] brave_category: search # brave_spellcheck: true - name: brave.images engine: brave network: brave shortcut: brimg categories: [images, web] brave_category: images - name: brave.videos engine: brave network: brave shortcut: brvid categories: [videos, web] brave_category: videos - name: brave.news engine: brave network: brave shortcut: brnews categories: news brave_category: news # - name: brave.goggles # engine: brave # network: brave # shortcut: brgog # time_range_support: true # paging: true # categories: [general, web] # brave_category: goggles # Goggles: # required! 
This should be a URL ending in .goggle - name: lib.rs shortcut: lrs engine: lib_rs disabled: true - name: sourcehut shortcut: srht engine: xpath paging: true search_url: https://sr.ht/projects?page={pageno}&search={query} results_xpath: (//div[@class="event-list"])[1]/div[@class="event"] url_xpath: ./h4/a[2]/@href title_xpath: ./h4/a[2] content_xpath: ./p first_page_num: 1 categories: [it, repos] disabled: true about: website: https://sr.ht wikidata_id: Q78514485 official_api_documentation: https://man.sr.ht/ use_official_api: false require_api_key: false results: HTML - name: goo shortcut: goo engine: xpath paging: true search_url: https://search.goo.ne.jp/web.jsp?MT={query}&FR={pageno}0 url_xpath: //div[@class="result"]/p[@class='title fsL1']/a/@href title_xpath: //div[@class="result"]/p[@class='title fsL1']/a content_xpath: //p[contains(@class,'url fsM')]/following-sibling::p first_page_num: 0 categories: [general, web] disabled: true timeout: 4.0 about: website: https://search.goo.ne.jp wikidata_id: Q249044 use_official_api: false require_api_key: false results: HTML language: ja - name: bt4g engine: bt4g shortcut: bt4g - name: pkg.go.dev engine: pkg_go_dev shortcut: pgo disabled: true - name: senscritique engine: senscritique shortcut: scr timeout: 4.0 disabled: true # Doku engine lets you access to any Doku wiki instance: # A public one or a privete/corporate one. # - name: ubuntuwiki # engine: doku # shortcut: uw # base_url: 'https://doc.ubuntu-fr.org' # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: git grep # engine: command # command: ['git', 'grep', '{{QUERY}}'] # shortcut: gg # tokens: [] # disabled: true # delimiter: # chars: ':' # keys: ['filepath', 'code'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. 
You can restrict access by configuring a list # of access tokens under tokens. # - name: locate # engine: command # command: ['locate', '{{QUERY}}'] # shortcut: loc # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: find # engine: command # command: ['find', '.', '-name', '{{QUERY}}'] # query_type: path # shortcut: fnd # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. # - name: pattern search in files # engine: command # command: ['fgrep', '{{QUERY}}'] # shortcut: fgr # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] # Be careful when enabling this engine if you are # running a public instance. Do not expose any sensitive # information. You can restrict access by configuring a list # of access tokens under tokens. 
# - name: regex search in files # engine: command # command: ['grep', '{{QUERY}}'] # shortcut: gr # tokens: [] # disabled: true # delimiter: # chars: ' ' # keys: ['line'] doi_resolvers: oadoi.org: 'https://oadoi.org/' doi.org: 'https://doi.org/' doai.io: 'https://dissem.in/' sci-hub.se: 'https://sci-hub.se/' sci-hub.st: 'https://sci-hub.st/' sci-hub.ru: 'https://sci-hub.ru/' default_doi_resolver: 'oadoi.org' ================================================ FILE: searxng/uwsgi.ini ================================================ [uwsgi] # Who will run the code uid = searxng gid = searxng # Performance optimizations # Number of workers (adjust based on CPU cores) workers = 4 # Number of threads per worker (increased for better concurrency) threads = 8 # Memory optimizations # Restart worker if RSS memory exceeds 512MB reload-on-rss = 512 # Kill requests taking longer than 30 seconds harakiri = 30 # Restart workers after handling 1000 requests (prevents memory leaks) max-requests = 1000 # Add some jitter to prevent all workers restarting at once max-requests-delta = 100 # The right granted on the created socket chmod-socket = 666 # Plugin to use and interpreter config single-interpreter = true master = true plugin = python3 lazy-apps = true enable-threads = 4 # Module to import module = searx.webapp # Virtualenv and python path pythonpath = /usr/local/searxng/ chdir = /usr/local/searxng/searx/ # automatically set processes name to something meaningful auto-procname = true # Disable request logging for privacy disable-logging = true log-5xx = true # Buffer optimizations for better performance # Increase buffer size for better performance buffer-size = 16384 # Increase post buffer size post-buffering = 8192 # Increase listen queue listen = 1024 # No keep alive # See https://github.com/searx/searx-docker/issues/24 add-header = Connection: close # Follow SIGTERM convention # See https://github.com/searxng/searxng/issues/3427 die-on-term # uwsgi serves the static files 
with optimizations static-map = /static=/usr/local/searxng/searx/static # Enable static file caching (1 hour) static-expires-uri = /static/.* 3600 # Compress static files static-gzip-all = true # Enable memory reporting for monitoring memory-report = true # Use cheaper algorithm for dynamic worker scaling cheaper-algo = spare cheaper = 2 cheaper-initial = 2 cheaper-step = 1 # expires set to one day static-expires = /* 86400 static-gzip-all = True offload-threads = 4 ================================================ FILE: searxng/uwsgi.ini.new ================================================ [uwsgi] # Listening address # default value: [::]:8080 (see Dockerfile) http-socket = $(BIND_ADDRESS) # Who will run the code uid = searxng gid = searxng # Number of workers (usually CPU count) # default value: %k (= number of CPU core, see Dockerfile) workers = $(UWSGI_WORKERS) # Number of threads per worker # default value: 4 (see Dockerfile) threads = $(UWSGI_THREADS) # The right granted on the created socket chmod-socket = 666 # Plugin to use and interpreter config single-interpreter = true master = true lazy-apps = true enable-threads = true # Module to import module = searx.webapp # Virtualenv and python path pythonpath = /usr/local/searxng/ chdir = /usr/local/searxng/searx/ # automatically set processes name to something meaningful auto-procname = true # Disable request logging for privacy disable-logging = true log-5xx = true # Set the max size of a request (request-body excluded) buffer-size = 8192 # No keep alive # See https://github.com/searx/searx-docker/issues/24 add-header = Connection: close # Follow SIGTERM convention # See https://github.com/searxng/searxng/issues/3427 die-on-term # uwsgi serves the static files static-map = /static=/usr/local/searxng/searx/static static-gzip-all = True offload-threads = %k ================================================ FILE: static/admin.html ================================================ CoexistAI Admin

CoexistAI Admin

Configure your AI settings securely

Status: Loading...
Note: If you run a local service (like lmstudio) on your computer and the container cannot reach 127.0.0.1, try using host.docker.internal as the host inside the config (e.g. http://host.docker.internal:1234).
Status: unknown -







================================================
FILE: system_prompt.py
================================================
# Example system prompt for the CoexistAI MCP server.  Paste it into the
# system-prompt field of agent frameworks, LM Studio, or any LLM client so the
# model follows the mandatory tool-usage workflow described below.
system_prompt = """# Role & Core Principles

You are a smart, logical, and practical planning assistant.  WHO ALWAYS STICKS TO MANDATORY STEPS.
You can **see** what the user shares — files, folders, images, etc.

- **Do NOT use your own memory or world knowledge.**
- **ONLY use tool outputs to answer.**
- **NEVER generate hypothetical or assumed responses.**

---

# 🛠️ Tool Usage Guidelines

### `get_web_search` (For exploring Web & Local Files as well)
- **Primary tool for all fact-finding.**
- Works with both **web content** and **local files/folders** when `local_mode=True`.
- Supports: PDFs, CSVs, Excels, plain text, images, folders, and more.
- Ideal for search, summarization, Q&A, and discovery.
- After locating paths, use the **most specific one**.

### `get_local_folder_tree`
- View local folder structures.
- Exclude hidden/system/cache files.
- Start broad, then drill into relevant folders.

### `get_web_summarize`
- Deep summarization of documents (web/local).
- Best for long PDFs, structured reports, or dense content.

### `get_topk_clickable_elements`
- Extract top-k clickable links from a URL based on a query.
- Use for news, bookings, interactive sites, etc.
- Follow up with summarization tools for selected links.

### `get_youtube_search`
- Search and summarize YouTube videos via keyword or URL.

### `get_reddit_search`
- Flexible Reddit search across subreddits or topics.

### `get_map_search`
- Use textual addresses to find locations, routes, or POIs.
- Always confirm locations using `get_web_search`.

### GitHub Tools (Code Only)
- `get_git_tree`: Inspect repo structure (start broad).
- `get_git_search`: Fetch file contents post-structure analysis.
- **Use only for code.** For non-code files, prefer `get_web_search` with `local_mode=True`.

### `get_deep_research_aspects`
- Use only if deep research is explicitly requested.
- Break into sub-aspects and explore using relevant tools.

### `get_response_check`
- **MANDATORY before final answer.**
- Confirms all tasks are covered and flags anything missing.

---

# Mandatory Workflow

### 1. **Break Down the Task**
- Split request into distinct sub-tasks (e.g., A, B, C...).

### 2. **Think About All Tools**
- List **all possible tools** relevant to each sub-task.
- Think through combinations (e.g., search ➝ summarize).

### 3. **Plan Tool Use**
- Pick the most suitable tool for each sub-task.
- Start broad (folder, tree, search), then go deeper.

### 4. **Execute & Write FLASHCARD**
- After each tool call, write a **FLASHCARD**:
  - Rich shorthand of key takeaways.
  - Highlight facts, paths, ideas—not just summary.

### 5. **Coverage Check**
- Make sure every sub-task is answered.
- If gaps remain, use focused follow-up tool calls.

### 6. **Manual Format Fixes**
- Handle formatting (e.g., rewording, layout) yourself—**no tools.**

### 7. **Final Check**
- Run `get_response_check` to confirm completeness.
- Only then, synthesize your final answer using all FLASHCARDs.

---

# Tool Usage Policies

- Never use your own knowledge or make assumptions.
- Prefer tool outputs always—even for obvious facts.
- Organize responses by sub-task.
- Retry failed searches with smarter queries.
- Never miss or skip useful info from any tool output.
- Clarify ambiguous user asks before starting.
- Always verify file paths or URLs before accessing."""

================================================
FILE: utils/__init__.py
================================================


================================================
FILE: utils/answer_generation.py
================================================
import logging
import asyncio
from typing import Union
from pydantic import BaseModel, Field

from utils.config import *
from utils.utils import *
from utils.utils import get_local_data

# Module-level logger shared by the answer-generation helpers below.
logger = logging.getLogger(__name__)

# Current date and weekday, captured once at import time and threaded into the
# LLM prompts so time-sensitive queries can be grounded.
# NOTE(review): these values go stale in a long-running process — confirm that
# callers pass fresh values where it matters.
date, day = get_local_data()


async def query_agent(query, llm, date, day):
    """
    Generates a list of subqueries based on a given query using an LLM agent.

    Args:
        query (str): The initial query.
        llm: The language model instance used for generating content.
        date (str): The current date.
        day (str): The current day of the week.

    Returns:
        tuple: ``(subqueries, is_summary, is_covered_urls,
        is_focused_on_given_urls)`` where ``subqueries`` is a list of cleaned,
        lower-cased search phrases.  On failure the subquery list is empty and
        the flags are False, so callers can always unpack four values.
    """

    class SearchSubqueries(BaseModel):
        # NOTE: this must be a plain string literal.  The original appended a
        # second string with ``+`` and a trailing comma, producing a tuple
        # expression instead of a docstring, so none of these instructions
        # (and the unformatted ``{date}``/``{place}`` placeholders) ever
        # reached the structured-output schema.  The dynamic date/day/place
        # context is supplied in the invocation prompt below instead.
        """Generates a list of subqueries based on a given query using an LLM agent.
        - Use date/location only if the query requires time-sensitive or location-specific information.
        Replace time-specific words with context details.
        Rephrase the query for search engine optimization and clarity.
        Avoid including the date if not necessary.
        Emphasize date or location when needed.
        When breaking down into multiple tasks, always consider the overall task to ensure every aspect is included. By the end of searching for subqueries, there should be enough information to process the answer.
        If necessary, split the query into diverse subqueries to cover all aspects.
        Subqueries should not be in a numbered list.
        Plan searches to answer every part of the query without overemphasizing any term.
        Ensure that subqueries include essential keywords and context from the original query to maintain relevance. Every query will be searched independently, so ensure that each subquery is self-contained and can be understood without additional context.
        Avoid generating subqueries that are too generic or lack specific terms from the original query.
        When rephrasing, retain the main subject and important details from the original query in each subquery.
        CHECK FOR NEXT STEPS OR INCOMPLETE INFORMATION ELEMENTS IN EARLIER RESPONSE OR QUERY, IF THERE ARE ANY PLAN ACCORDINGLY.
        """

        planning_to_answer_query_to_help_finding_subqueries: list[str] = Field(
            description="Based on query complexity, generate 1 (min) to 2 (max) sub-tasks (6 for complex queries, 1 for simple ones), breaking down the task into parallel, self-sufficient descriptions for independent agents. Avoid overemphasizing keywords and sequential dependencies. Context: Today's date is {date} {day}, {place}."
        )
        subqueries: list[str] = Field(
            description="Generate 1 (min) to 2 (max) subqueries (1 for simple queries, up to 6 for complex ones), each as a rephrased, independent search phrase with a maximum of 6 words, covering all aspects of the query. Include essential details and context from the original query. Context: Today's date is {date} {day}, {place}."
        )
        summary_reasoning: str = Field(
            description="max one liner reasoning behind whether is_summary. If the query is complex, requires structured data (tables, lists, codes), or involves summarizing multiple sources, set is_summary to True. If the query is straightforward and can be answered directly from a single source, set is_summary to False."
        )
        is_summary: bool = Field(
            description="Whether the subqueries are intended to summarize. Generally, for the subqueries focused on structured data, tables, list, codes. You SHOULD use summarise!"
        )
        is_structured_data: bool = Field(
            description= "Whether the query will hit a page with structured data like tables, codes, list, indexes, where full page full page would be needed to access to answer the query OR is it the case chunking page content will leave give only half of required information for a given query"
        )
        is_covered_urls: bool = Field(  
            description="If user is asking to answer explicitly using youtube or reddit"
        )
        is_focused_on_given_urls: bool = Field(
            description="It the intent of user is to focus on given urls only, like summarising, analysis, comparing etc of given urls"
        )

    try:
        response = await llm.with_structured_output(SearchSubqueries).ainvoke(
            f"Todays date is: {date} and today is {day}, {place}, Use date/location only if the query requires time-sensitive or location-specific information. "
            f"\nquery: {query}"
        )
        is_summary = response.is_summary
        reasoning_summary = response.summary_reasoning
        # Structured pages (tables/code/lists) need full-page context, so force
        # summarisation regardless of the model's is_summary verdict.
        if response.is_structured_data:
            is_summary = True
        is_covered_urls = response.is_covered_urls
        is_focused_on_given_urls = response.is_focused_on_given_urls
        response = response.subqueries

        logger.info(f"'Reasoning Summary:{reasoning_summary} Summary:{is_summary},subquery:{response}, iscoverd:{is_covered_urls}, isfocused:{is_focused_on_given_urls}")
    except Exception as e:
        logger.warning(f"Structured output failed: {e}. Falling back to prompt-based extraction.")
        try:
            prompt = prompts['query_agent_basic'].format(date=date, day=day, query=query)
            response = await llm.ainvoke(prompt)
            # ``response.content`` is already a plain string; the original code
            # then accessed ``.text`` on it, raising AttributeError and making
            # this fallback path always fail.
            response = extract_subqueries(response.content)
            return response, False, False, False
        except Exception as ex:
            logger.error(f"Both structured and prompt-based extraction failed: {ex}")
            # Return a 4-tuple (not a bare list) so callers that unpack four
            # values do not crash on the error path.
            return [], False, False, False
    try:
        response = [
            r.lower().replace('*', '').replace("subquery", "").replace("'", "")
            for r in response if r != ''
        ]
    except Exception as e:
        logger.error(f"Error cleaning subqueries: {e}")
        return [], is_summary, is_covered_urls, is_focused_on_given_urls
    return response, is_summary, is_covered_urls, is_focused_on_given_urls


async def response_gen(model, query, context):
    """
    Generates a comprehensive response for a given query using an LLM, incorporating context.

    Args:
        model: The language model instance used for generating content.
        query (str): The query for which a response is needed.
        context (str): The context to be included in the response generation.

    Returns:
        tuple: The synthesized answer and sources formatted in markdown.
        On total failure returns ``("Error: Unable to generate answer.", "")``.
    """

    class ResponseGen(BaseModel):
        # NOTE: this must be a plain string literal.  The original used an
        # f-string here, which Python does NOT store as ``__doc__``, so these
        # instructions never reached the structured-output schema.  The query
        # and context are supplied in the invocation message instead of being
        # interpolated into the docstring.
        """Generates a comprehensive response for a given query using an LLM, incorporating context.
        Instructions:
        You are an AI assistant that generates a comprehensive response to a user query using the provided context.

        Understand the query thoroughly, even if it contains misspellings or errors. Use the provided context and related information to interpret the intended meaning.

        Answer the query using the context and available information; if the context is insufficient, utilise whatever information is relevant, its fine even if answer is incomplete. utilize your own knowledge to provide a complete answer, including code if requested.

        Dont just provide what is their in the web search results, but synthesize the information from the context and your own knowledge to create a comprehensive answer. If the query is about code, provide the code directly rather than a plan to implement it.

        Provide direct answers: If the query asks for code, supply the code rather than a plan to implement it. Learn facts, syntax, and structures from the context to create accurate responses.

        Enhance your response with engaging details, clear reasoning, and simple explanations. Use analogies to illustrate difficult concepts when appropriate.

        Cite sources for every detail using hyperlinks to the search result URLs; if using your own knowledge, cite as "llm_generated". Structure your answer in markdown format with bullet points. Utilize the context fully, even if it doesn't directly answer the question. Mention any next steps in next_steps; otherwise, write None.

        The query and context are provided in the user message.

        KEEP ANSWER CONCISE UNLESS ASKED FOR DETAILED.
        Summarize the context to answer user query, capture granularity to help customer and get value from the context.
        """
        intent_understanding: str = Field(
            description="Understanding of the query, including any misspellings or errors."
        )
        synthesized_answer_based_on_various_sources: str = Field(
            description="Detailed answer to the query based on all available information in the context, even if limited."
        )
        sources: list[str] = Field(
            description="List of sources used to answer the query, including hyperlinks to search results."
        )
        next_steps: Union[str, None] = Field(
            description="Next steps/searches needed to accomplish complete response to users query, if applicable (incomplete answers); and otherwise, None.( Tip:Many of the cases, summarising one link from sources would suffice)"
        )

    logger.info("Generating Answer for query: %s", query)

    try:
        response = await model.with_structured_output(ResponseGen).ainvoke(
            f"Query: {query}\nContext: {context}"
        )
        # Prefer pydantic v2's model_dump(); fall back to v1's dict().
        response = response.model_dump() if hasattr(response, 'model_dump') else response.dict()
        sources = '\n'.join([f'[{i}]. ' + s + '\n' for i, s in enumerate(response['sources'])])
        answer = (
            "#### Answer: \n"
            + str(response['synthesized_answer_based_on_various_sources'])
            + "\n\n#### Next steps:\n"
            + str(response['next_steps'])
        )
        logger.info("Structured response generated successfully.")
    except Exception as e:
        logger.warning(f"Structured output failed: {e}. Falling back to prompt-based generation.")
        try:
            prompt = prompts['qa_response_generation'].format(context=context, query=query)
            response = await model.ainvoke(prompt)
            response = response.content
            answer = "#### Answer: \n" + response
            sources = ''
            logger.info("Fallback prompt-based response generated successfully.")
        except Exception as ex:
            logger.error(f"Both structured and prompt-based response generation failed: {ex}")
            return "Error: Unable to generate answer.", ""
    return answer, sources


def _extract_summary_text(response):
    """Best-effort extraction of the generated text from an LLM response.

    Different LLM clients expose the text as a ``content`` or ``text``
    attribute, sometimes as a zero-argument method.  Checks, in order:
    plain ``content``, plain ``text``, callable ``content``, callable
    ``text``, then falls back to ``str(response)``.

    Returns:
        str | None: The stripped text, or None when it is empty or not a str.
    """
    candidates = (
        ('content', False), ('text', False),  # plain attributes first
        ('content', True), ('text', True),    # then zero-arg methods
    )
    text = None
    for attr, want_callable in candidates:
        if hasattr(response, attr):
            value = getattr(response, attr)
            if callable(value) == want_callable:
                text = value() if want_callable else value
                break
    else:
        text = str(response)
    if text and isinstance(text, str):
        return text.strip()
    return None


async def summarizer(query, docs, llm, batch, max_docs=30, max_words_per_doc=3000):
    """
    Summarizes a list of documents iteratively in batches using an LLM.

    Documents are deduplicated, capped at ``max_docs`` and truncated to
    ``max_words_per_doc`` words each, then reduced map-reduce style: batches
    are summarized in parallel and the resulting summaries re-batched until a
    single text remains.

    Args:
        query (str): The initial query guiding the summarization.
        docs (list): Documents to summarize (each with ``page_content`` and
            ``metadata['source']``).
        llm: The language model instance used for generating content.
        batch (int): The number of documents to process in each batch.
        max_docs (int): Upper bound on how many documents are summarized.
        max_words_per_doc (int): Per-document word cap before summarization.

    Returns:
        str: The final summary, or 'SUMMARIZATION FAILED' on error.
        Returns [] when ``docs`` is empty or not a list.
    """

    if not docs or not isinstance(docs, list):
        logger.warning("No documents provided or docs is not a list.")
        return []
    logger.info(f"Deduping docs: {len(docs)}")
    logger.warning(f"Capping max words to {max_words_per_doc}")
    # NOTE(review): set() dedup does not preserve document order — confirm
    # downstream consumers do not rely on ordering.
    docs = list(set([f'source:{k.metadata["source"]}\ncontent:{" ".join(k.page_content.split(" ")[:max_words_per_doc])}' for k in docs]))
    logger.info(f"Arrived Len of Docs: {len(docs)}")
    len_docs = len(docs)
    if len_docs == 1:
        logger.info("Only one document provided, summarizing it.")
        try:
            comb_docs = f'Document 0: {str(docs[0])}'
            prompt = prompts['summary_generation'].format(comb_docs=comb_docs, query=query)
            response = await llm.ainvoke(prompt)
            logger.debug(f"Response type: {type(response)}")
            summary_text = _extract_summary_text(response)
            if summary_text is not None:
                return summary_text
            logger.warning("Empty or invalid summary for single document.")
            return 'SUMMARIZATION FAILED'
        except Exception as e:
            logger.error(f"Summarization failed for single document: {e}")
            return 'SUMMARIZATION FAILED'

    logger.info(f"Summarising using {len_docs} documents")
    if len_docs > max_docs:
        logger.warning(f"Too many documents ({len_docs}) provided, limiting to {max_docs}. This may lead to loss of information.")
        docs = docs[:max_docs]
        len_docs = max_docs
    while len_docs > 1:
        summaries = []

        async def process_batch(batch_docs, batch_index):
            """Summarize one batch of documents; return text or None on failure."""
            if not batch_docs:
                return None

            comb_docs = '\n'.join(
                [f'Document {j+batch_index}:' + str(d) for j, d in enumerate(batch_docs)]
            )

            try:
                prompt = prompts['summary_generation'].format(comb_docs=comb_docs, query=query)
                response = await llm.ainvoke(prompt)
                summary_text = _extract_summary_text(response)
                if summary_text is not None:
                    return summary_text
                logger.warning(f"Empty or invalid summary for batch starting at {batch_index}.")
                return None
            except Exception as e:
                logger.error(f"Async summarization failed for batch starting at {batch_index}: {e}")
                return None

        # Fan out one task per batch and run them concurrently.
        batch_tasks = [
            process_batch(docs[i:i + batch], i)
            for i in range(0, len_docs, batch)
        ]

        if batch_tasks:
            logger.info(f"Processing {len(batch_tasks)} batches in parallel")
            batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)

            # Keep successful summaries; log (but tolerate) individual failures.
            for result in batch_results:
                if isinstance(result, Exception):
                    logger.error(f"Batch processing failed: {result}")
                elif result is not None:
                    summaries.append(result)

        if not summaries:
            logger.error("No summaries generated in this async iteration, aborting.")
            return 'SUMMARIZATION FAILED'

        len_docs = len(summaries)
        docs = summaries

    return '\n'.join(docs)


================================================
FILE: utils/config.py
================================================
prompts = {

'youtube_summary_prompt':"""Analyze and respond the task using the given transcript in detail, following these guidelines:

1. Understand the user's intent and perspective.
2. Focus on what would be most interesting and valuable to the user.
3. Provide an informative, engaging response with depth and insight.
4. Use emojis to enhance readability and engagement.
5. Adhere to the specific task: {task}
6. Just focus on task, thats all, dont add additional details about video or creator unless asked.

Transcript to summarize:
{transcript}

Remember to make the response comprehensive, enjoyable to read, and tailored to the user's interests while addressing the specified task.
""",

'reddit_summary_prompt':"""Create a comprehensive response based on of the following Reddit posts and comments, focusing on the objective: {search_query}

Please provide the comprehensive response.

Guidelines:
1. Follow the objective
2. Include hyperlinks to relevant posts/comments.
3. Maintain technical accuracy and detail.

IMPORTANT: Strictly adhere to the provided content. If no relevant discussions are found related to the summary objective, please state this clearly.
""",

"summ_qa": """Your task is to classify given query into 2 categories: Summary or QA on the basis of the nature of query
    If the query explicitly asks for summary, then give "SUMMARY" as output class or else "QA" 
    Give output in json format
    Dict["verdict":str[SUMMARY or QA]]
    
    Query:{query}""",

"summary_generation":"""Your task is to generate detailed response/summary based on the given documents wrt to answer user query :{query}
          Documents: {comb_docs}
          """  ,

"qa_response_generation":"""Instructions:

Understand the query thoroughly, even if it contains misspellings or errors. Use the provided context and related information to interpret the intended meaning.

Answer the query using the context and available information; if the context is insufficient, utilise whatever information is relevant, its fine even if answer is incomplete. utilize your own knowledge to provide a complete answer, including code if requested.

Provide direct answers: If the query asks for code, supply the code rather than a plan to implement it. Learn facts, syntax, and structures from the context to create accurate responses.

Enhance your response with engaging details, clear reasoning, and simple explanations. Use analogies to illustrate difficult concepts when appropriate.

Cite sources for every detail using hyperlinks to the search result URLs; if using your own knowledge, cite as "llm_generated". Structure your answer in markdown format with bullet points. Utilize the context fully, even if it doesn't directly answer the question. Mention any next steps in next_steps; otherwise, write None.

Query:

{query}

Context:

{context}

Provide your response in the following JSON format, KEEP ANSWER CONCISE UNLESS ASKED FOR DETAILED.:
Dict[
  "intent_understanding": "string",
  "synthesized_answer_based_on_various_sources": "Think and implement in step by step manner to get to answer to the query, answer even if limited information is provided, utilise all the info you have",
  "interesting_info_around_query": "string",
  "sources": ["links"],
  "next_steps": "string"
]
""",

"query_agent_basic":"""Context: Today's date is {date}, Bangalore, India.

Instructions:

Use date/location only if the query requires time-sensitive or location-specific information.
Replace time-specific words with context details.
Rephrase the query for search engine optimization and clarity.
Avoid including the date if not necessary.
Emphasize date or location when needed.
When breaking down into multiple tasks, always consider the overall task to ensure every aspect is included. By the end of searching for subqueries, there should be enough information to process the answer.
If necessary, split the query into up to 2 diverse subqueries to cover all aspects.
Subqueries should not be in a numbered list.
Plan searches to answer every part of the query without overemphasizing any term.
Ensure that subqueries include essential keywords and context from the original query to maintain relevance.
Avoid generating subqueries that are too generic or lack specific terms from the original query.
When rephrasing, retain the main subject and important details from the original query in each subquery.

Original Query: {query}

Use this JSON schema:

planning_to_answer_query_to_help_finding_subqueries: [Up to 6 planning steps breaking down the task without overemphasizing any keyword],
subqueries: [Up to 6 rephrased phrases covering all planning steps in independent search phrases , using a maximum of 4 words per subquery, avoiding unnecessary adjectives].
""",

"combine_answers":"""Combine the following two responses into one comprehensive answer that includes all unique, relevant details from both, maintaining all citations and references. Ensure no information is lost; fully integrate both answers. In the field your_view_and_analysis, add your own knowledge and insights, which may or may not be based on the provided context. Avoid generating escape characters or JSON that could cause parsing errors. Think step-by-step about how to utilize your knowledge and the available information to answer the query.

Given:

query: {query}
answer_1: {answer1}
answer_2: {answer2}
Provide the final answer in the following JSON format:
Dict[
  "final_answer": "string",
  "your_view_and_analysis": "string",
  "sources": ["links"]
]
    """,

"llm_review_basic":"""Analyze the query, previous answer, and earlier search phrases. Formulate new search phrases to gather information and complete missing parts of the current answer using the 'Next Steps' section. Avoid ideas requiring apps you cannot access. Think differently if previous context was insufficient. Provide new search phrases to supplement the current answer, thinking like an expert in the field. DONT LOSE OUT ON ANY INFORMATION FROM ANY ANSWER, SUPPELEMENTARILY ADD BOTH

Given:

Query: {query}
Previously used search phrases: {earlier_searches}
Context: {context}
Current answer: {answer}
Plan your searches to fulfill the user query.

Subqueries: List 2 (up to 6 if needed) rephrased phrases covering all planning steps, maximum 4 words per subquery, avoiding unnecessary adjectives.

Provide your response in JSON format:
"subqueries": [
  "phrase1",
  "phrase2"
]
Ensure the new search phrases:

Address all unanswered aspects of the query.
Use the 'Next Steps' section.
Expand on the information in the current answer.
Use URLs or structures from the answer if needed.
Use API knowledge to design keyphrases involving real-time data like flights or rail tickets, as these will be used to search Google.
You can also use URLs as search phrases. DONT LOSE ANY INFO, KEEP ANSWER CONCISE UNLESS ASKED FOR DETAILED.
""",

"web_or_llm":"""You will receive a user message. Determine whether to answer it directly using your internal knowledge (LLM) or by utilizing web search.

Decision Logic:

If the user explicitly requests a web search, use web search.
If the query can be answered confidently based on your existing knowledge and prior chat, answer directly using your LLM capabilities.
You will get the information based on the query, if the query is referring to prior message or not
If the query requires external information or you are unsure of the answer, utilize web search.
Output your decision in the following dictionary format: Dict["verdict":str[web or llm]]. Dont add ``json infront of it.
Query: {query}""",

'qa_response_final_touch':"""Go through the given query in detail, see what all is asked by user: {query}
To this query, this is the answer I have formed till now: {response}
Your task is to fill the gap between what the user asked and my answer, and build upon and refine the answer (if at all it is necessary to answer the query). If some facts are not there, don't make them up; stick to the information given.
Give the final answer which fulfills the user's request. Facts and sources/citations should not be altered.
You are given context which has real-time data, so never say that you don't have access to real-time data. ALWAYS structure your answer well. Hyperlink the sources in words. Give the answer in markdown format and not JSON.
""",

"summ_qa_url_router":"""Your task is to classify given query into 2 categories: YES or NO
    If the query explicitly asks for information that needs information from maps, say YES else NO
    There are 2 types of functions one which gives information about route, if the query needs to find route between two destinations then give them under SRC_DST else None, this should be filled only when nearby is None
    SRC and DST should be similar to what would you put in google maps
    If the query asks for things like nearby hospitals, then give answer in NEARBY as ['hospital'] else [],add in the list even if there is single element
    Give output in json format
    Dict["verdict":str[YES or NO], "SRC_DST":[SRC, DST] , "NEARBY":[list of str of type of thing you want to find nearby],"if_nearby_then_loc":[list of str of locations around which information is needed if nearby is not none]]

    
    Query:{query}""",

"summary_generation":"""Your task is to generate detailed summary of the given documents to answer user's query :{query}
          Documents: {comb_docs}
          Stick to given documents, summary should look like an answer to the query, enlist assumptions made  (if any)
          """ ,



}

# Default location string; the prompt templates above hard-code the same
# "Bangalore, India" context.  NOTE(review): consumers of this constant are
# not visible in this chunk — confirm where `place` is read before changing.
place = 'Bangalore, India'




================================================
FILE: utils/crawler_utils.py
================================================
import asyncio
import aiohttp
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import logging
import re
import time
import random
from typing import List, Optional, Union
from utils.knowledge_base import create_knowledge_base
from utils.websearch_utils import urls_to_docs
from langchain_text_splitters import TokenTextSplitter
from utils.retriever_utils import create_vectorstore_async
import chromadb
from chromadb.config import Settings
import hashlib
import xml.etree.ElementTree as ET

logger = logging.getLogger(__name__)

def get_sitemap_urls(base_url: str, headers: dict) -> List[str]:
    """
    Try to discover page URLs via the site's /sitemap.xml.

    Args:
        base_url: Root URL of the site whose sitemap should be fetched.
        headers: HTTP headers to send with the request (e.g. User-Agent).

    Returns:
        URLs listed in the sitemap that belong to the same host as
        ``base_url``; an empty list when the sitemap is missing or
        cannot be parsed.
    """
    found: List[str] = []
    target = urljoin(base_url, '/sitemap.xml')
    base_host = urlparse(base_url).netloc
    try:
        resp = requests.get(target, headers=headers, timeout=10)
        if resp.status_code == 200:
            tree = ET.fromstring(resp.content)
            # Sitemap entries live in the standard sitemaps.org namespace.
            loc_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
            for node in tree.iter(loc_tag):
                candidate = node.text
                if candidate and urlparse(candidate).netloc == base_host:
                    found.append(candidate)
            logger.info(f"Found {len(found)} URLs in sitemap")
    except Exception as e:
        # Best-effort: a missing or malformed sitemap is not an error.
        logger.info(f"No sitemap found or error parsing: {e}")
    return found

def crawl_website(base_url: str, depth: Optional[int] = None, max_pages: int = 100, min_delay: float = 1.0, max_delay: float = 3.0, url_keyword: Optional[str] = None) -> List[str]:
    """
    Crawl a website starting from base_url up to the specified depth (or the
    full website if depth is None).

    Random delays are inserted between requests to avoid rate limiting, and
    the site's /sitemap.xml is consulted once for additional seed URLs.

    Args:
        base_url: The starting URL to crawl
        depth: Maximum depth to crawl (None for full website crawl, default: None)
        max_pages: Maximum number of pages to collect
        min_delay: Minimum delay between requests in seconds (default: 1.0)
        max_delay: Maximum delay between requests in seconds (default: 3.0)
        url_keyword: Optional keyword to filter URLs by presence in the URL string

    Returns:
        List of URLs found during crawling
    """
    visited = set()
    collected_urls = set()
    base_domain = urlparse(base_url).netloc

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    def _polite_delay():
        # Random pause between successful requests to avoid rate limiting.
        delay = random.uniform(min_delay, max_delay)
        logger.info(f"Waiting {delay:.2f} seconds before next request...")
        time.sleep(delay)

    def _error_delay():
        # Shorter pause after a failed request, still being respectful.
        time.sleep(random.uniform(min_delay, min(max_delay, min_delay + 1.0)))

    def _fetch_links(page_url: str) -> List[str]:
        # Fetch one page and return every raw href/src found on it
        # (anchors plus frame/iframe sources). Raises on HTTP errors.
        response = requests.get(page_url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')
        links = [a['href'] for a in soup.find_all('a', href=True)]
        links += [fr['src'] for fr in soup.find_all(['frame', 'iframe'], src=True)]
        return links

    def _clean(full_url: str) -> str:
        # Drop fragments and query strings that don't change the page.
        return full_url.split('#')[0].split('?')[0]

    # Fetch the sitemap exactly once. (The previous implementation fetched
    # it a second time on the depth-limited path — a redundant request.)
    sitemap_urls = [_clean(u) for u in get_sitemap_urls(base_url, headers)][:max_pages // 2]
    if url_keyword:
        sitemap_urls = [u for u in sitemap_urls if url_keyword.lower() in u.lower()]

    # Build the initial (url, level) queue and the depth limit per mode.
    if depth is None:
        # Full crawl: no depth limit; seed with the base page and sitemap.
        limit = float('inf')
        to_visit = [(u, 0) for u in [base_url] + sitemap_urls]
    elif depth == 0:
        # Depth 0: only the base page itself is collected.
        limit = 0
        to_visit = [(base_url, 0)]
    else:
        # Depth > 0: scrape the base page first for its immediate links;
        # the base page itself is not collected in this mode (matches the
        # original behavior).
        limit = depth
        try:
            initial_links = []
            for href in _fetch_links(base_url):
                full_url = urljoin(base_url, href)
                # Only include URLs from the same domain
                if urlparse(full_url).netloc == base_domain:
                    clean_url = _clean(full_url)
                    if url_keyword and url_keyword.lower() not in clean_url.lower():
                        continue
                    initial_links.append(clean_url)
            initial_links = list(set(initial_links))  # Remove duplicates
            delay = random.uniform(min_delay, max_delay)
            logger.info(f"Initial scrape done, waiting {delay:.2f} seconds...")
            time.sleep(delay)
        except Exception as e:
            logger.warning(f"Error scraping base_url {base_url}: {e}")
            initial_links = []
            _error_delay()
        to_visit = [(u, 1) for u in initial_links + sitemap_urls]

    # Single BFS loop shared by all modes (the original duplicated ~50 lines
    # between the unlimited and depth-limited branches, and re-checked the
    # depth limit on a line that could never fire).
    while to_visit and len(collected_urls) < max_pages:
        current_url, level = to_visit.pop(0)

        if current_url in visited or level > limit:
            continue

        visited.add(current_url)
        collected_urls.add(current_url)

        try:
            links = _fetch_links(current_url)
            logger.info(f"Found {len(links)} links on {current_url}")

            for href in links:
                full_url = urljoin(current_url, href)
                # Only follow same-domain links we have not seen yet.
                if urlparse(full_url).netloc == base_domain and full_url not in visited:
                    clean_url = _clean(full_url)
                    if clean_url in visited:
                        continue
                    if url_keyword and url_keyword.lower() not in clean_url.lower():
                        continue
                    to_visit.append((clean_url, level + 1))

            _polite_delay()
        except Exception as e:
            logger.warning(f"Error crawling {current_url}: {e}")
            _error_delay()

    logger.info(f"Total collected URLs: {len(collected_urls)}")
    return list(collected_urls)

def filter_docs_by_keywords(docs_map: dict, keywords: List[str]) -> dict:
    """
    Keep only documents whose content mentions at least one keyword.

    Args:
        docs_map: Mapping of URL -> list of documents (objects exposing a
            ``page_content`` string attribute).
        keywords: Keywords to match case-insensitively; an empty or None
            value disables filtering.

    Returns:
        A new mapping with the same structure containing only matching
        documents; URLs left with no documents are dropped. When no
        keywords are given the original mapping is returned unchanged.
    """
    if not keywords:
        return docs_map

    # Lower-case once up front; each document is compared case-insensitively.
    wanted = [kw.lower() for kw in keywords]

    result = {}
    for url, docs in docs_map.items():
        kept = [
            doc for doc in docs
            if any(kw in doc.page_content.lower() for kw in wanted)
        ]
        if kept:
            result[url] = kept
    return result

async def crawl_and_create_kb(
    url_or_urls: Union[str, List[str]], 
    keywords: Optional[List[str]] = None, 
    depth: Optional[int] = None,
    crawl: bool = True,
    min_delay: float = 1.0,
    max_delay: float = 3.0,
    max_pages: int = 100,
    url_keyword: Optional[str] = None,
    hf_embeddings = None
) -> tuple[str, List[str]]:
    """
    Crawl website(s) and create a knowledge base from the content.
    
    Args:
        url_or_urls: Single URL to crawl or list of URLs to scrape directly
        keywords: Optional list of keywords to filter content by
        depth: Maximum crawl depth for crawling (None for full website crawl, default: None)
        crawl: Whether to crawl (True) or process URLs directly (False)
        min_delay: Minimum delay between requests in seconds (default: 1.0)
        max_delay: Maximum delay between requests in seconds (default: 3.0)
        max_pages: Maximum number of pages to collect during crawling (default: 100)
        url_keyword: Optional keyword to filter URLs by presence in the URL string
        hf_embeddings: HuggingFace embeddings instance
        
    Returns:
        Tuple of (collection_name, list_of_scraped_urls)

    Raises:
        ValueError: If no URLs or documents remain after crawling/filtering.
    """
    # --- Gather the list of URLs to process ---
    if isinstance(url_or_urls, str):
        if crawl:
            logger.info(f"Crawling website: {url_or_urls} with depth {depth if depth is not None else 'unlimited'}")
            urls_to_process = crawl_website(url_or_urls, depth=depth, min_delay=min_delay, max_delay=max_delay, max_pages=max_pages, url_keyword=url_keyword)
            logger.info(f"Found {len(urls_to_process)} URLs to process")
        else:
            # Process single URL directly
            urls_to_process = [url_or_urls]
            logger.info(f"Processing single URL: {url_or_urls}")
    else:
        if crawl:
            # Crawl each URL in the list and merge the results
            logger.info(f"Crawling {len(url_or_urls)} URLs with depth {depth if depth is not None else 'unlimited'}")
            all_urls = []
            for url in url_or_urls:
                logger.info(f"Crawling starting from: {url}")
                crawled_urls = crawl_website(url, depth=depth, min_delay=min_delay, max_delay=max_delay, max_pages=max_pages, url_keyword=url_keyword)
                all_urls.extend(crawled_urls)
            urls_to_process = list(set(all_urls))  # Remove duplicates
            logger.info(f"Found {len(urls_to_process)} total URLs to process")
        else:
            # Process URLs directly
            urls_to_process = url_or_urls
            logger.info(f"Processing {len(urls_to_process)} provided URLs directly")

    if not urls_to_process:
        raise ValueError("No URLs to process")

    # Filter by URL keyword if provided
    if url_keyword:
        logger.info(f"Filtering URLs by keyword: {url_keyword}")
        urls_to_process = [url for url in urls_to_process if url_keyword.lower() in url.lower()]
        logger.info(f"After URL keyword filtering: {len(urls_to_process)} URLs")

    if not urls_to_process:
        raise ValueError("No URLs to process after filtering")

    # Convert URLs to documents (split later with a token-aware splitter)
    docs_map = await urls_to_docs(urls_to_process, local_mode=False, split=False)

    # Filter by content keywords if provided
    if keywords:
        logger.info(f"Filtering content by keywords: {keywords}")
        docs_map = filter_docs_by_keywords(docs_map, keywords)
        logger.info(f"After filtering: {sum(len(docs) for docs in docs_map.values())} documents from {len(docs_map)} URLs")

    # Flatten all documents
    all_docs = []
    for docs in docs_map.values():
        all_docs.extend(docs)

    if not all_docs:
        raise ValueError("No documents found after processing and filtering")

    logger.info(f"Total documents: {len(all_docs)}")

    # Split documents into ~512-token chunks with overlap for retrieval
    text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)
    all_docs = text_splitter.split_documents(all_docs)

    logger.info(f"Total documents after splitting: {len(all_docs)}")

    # --- Build a Chroma-safe collection name ---
    crawl_mode = "crawl" if crawl else "direct"
    if isinstance(url_or_urls, str):
        base_name = urlparse(url_or_urls).netloc.replace('.', '_')  # Replace dots with underscores
    else:
        base_name = "url_list"

    if keywords:
        # Sanitize keywords - replace spaces and special chars with underscores
        sanitized_keywords = [k.replace(' ', '_').replace('-', '_') for k in keywords[:3]]
        base_name += f"_keywords_{'_'.join(sanitized_keywords)}"

    if url_keyword:
        base_name += f"_urlkeyword_{url_keyword.replace(' ', '_').replace('-', '_')}"

    # Hash the URL set so identical inputs map to the same collection
    sorted_urls = ''.join(sorted(urls_to_process))
    hash_suffix = hashlib.md5(sorted_urls.encode()).hexdigest()[:8]
    collection_name = f"{crawl_mode}_{base_name}_{hash_suffix}"

    # Ensure collection name is valid: starts/ends with alphanumeric and
    # contains only allowed chars. `re` is imported at module level (the
    # previous redundant function-local `import re` was removed).
    collection_name = re.sub(r'[^a-zA-Z0-9._-]', '_', collection_name)  # Replace invalid chars with _
    collection_name = collection_name.strip('_')  # Remove leading/trailing underscores

    # Ensure minimum length and valid start/end
    if not collection_name or len(collection_name) < 3:
        collection_name = f"crawl_{hash_suffix}"

    # Ensure starts with alphanumeric
    if not collection_name[0].isalnum():
        collection_name = f"crawl_{collection_name}"

    # Ensure ends with alphanumeric
    if not collection_name[-1].isalnum():
        collection_name = f"{collection_name}_{hash_suffix[:4]}"

    # --- Create the vectorstore ---
    client = chromadb.PersistentClient(path="./chroma_db", settings=Settings(anonymized_telemetry=False, allow_reset=True))

    # Delete existing collection if it exists so the KB is rebuilt fresh
    try:
        client.delete_collection(collection_name)
        logger.info(f"Deleted existing collection {collection_name}")
    except Exception as e:
        logger.info(f"Collection {collection_name} not found or error deleting: {e}")

    try:
        await create_vectorstore_async(
            docs=all_docs,
            collection_name=collection_name,
            hf_embeddings=hf_embeddings,
            top_k=3,
            ensemble_weights=[0.4, 0.6],
            local_mode=True,
            persist_directory="./chroma_db"
        )
        logger.info(f"Knowledge base created: {collection_name}")
    except Exception as e:
        logger.error(f"Error creating knowledge base: {e}")
        raise

    return collection_name, urls_to_process

================================================
FILE: utils/git_utils.py
================================================
from gitingest import ingest_async
import os
import aiofiles.os

async def git_tree_search(url):
    """
    Retrieves and returns the directory tree structure of a GitHub repository or a local Git repository.

    Args:
        url (str): The base URL of the GitHub repository (e.g., 'https://github.com/user/repo')
                   or the path to the local repository on your system.

    Returns:
        str: The directory tree structure as a string.

    Raises:
        ValueError: If `url` is empty, or neither a local path nor an http(s) URL.
        FileNotFoundError: If `url` is a local path that is not a directory.
        Exception: If the underlying ingest fails (original cause chained).
    """
    if not isinstance(url, str) or not url.strip():
        raise ValueError("The 'url' parameter must be a non-empty string.")

    # Check if it's a local path or a URL
    if os.path.exists(url):
        # It's a local path
        if not os.path.isdir(url):
            raise FileNotFoundError(f"The path '{url}' exists but is not a directory.")
    elif not url.startswith("http"):
        raise ValueError("Provided 'url' is neither a valid local path nor a valid URL.")

    try:
        summary, tree, content = await ingest_async(url)
        return tree
    except Exception as e:
        # Chain the original exception so the real traceback is preserved.
        raise Exception(f"Failed to retrieve the repo tree: {e}") from e
    
async def git_specific_content(base_url, part, type):
    """
    Fetches the content of a specific part (directory or file) from either:
    - a GitHub repository (via URL), or
    - a local Git repository (via local path).

    Args:
        base_url (str): The base URL of the GitHub repository (e.g., 'https://github.com/user/repo'),
                        or the local path to the root of the repository.
        part (str): The path inside the repository you wish to access (e.g., '/src/utils').
        type (str): Either 'file' or 'folder' — selects the GitHub 'blob' vs
                    'tree' URL form. Ignored for local paths.

    Returns:
        str: The content of the specified part of the repository.

    Raises:
        ValueError: On empty arguments, an invalid `base_url`, or (for remote
            repositories) a `type` that is neither 'file' nor 'folder'.
        FileNotFoundError: If a local `part` does not exist under `base_url`.
    """
    # Input validation
    if not isinstance(base_url, str) or not base_url.strip():
        raise ValueError("The 'base_url' parameter must be a non-empty string.")
    if not isinstance(part, str) or not part.strip():
        raise ValueError("The 'part' parameter must be a non-empty string.")

    # Check for local path
    if os.path.exists(base_url):
        # It's a local path
        repo_path = os.path.abspath(base_url)
        part_path = part.lstrip(os.sep)
        full_path = os.path.join(repo_path, part_path)
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"The specified part '{part}' does not exist in the local repository at '{base_url}'.")
        url_or_path = full_path
    else:
        # Assume it's a remote GitHub URL
        if not base_url.startswith("http"):
            raise ValueError("The 'base_url' must be a valid URL starting with 'http' or an existing local path.")
        base_url = base_url.rstrip('/')
        part = part if part.startswith('/') else '/' + part
        if type == 'file':
            url_or_path = f"{base_url}/blob/main/{part}"
        elif type == 'folder':
            url_or_path = f"{base_url}/tree/main/{part}"
        else:
            # Previously an unknown value left `url_or_path` unbound and
            # crashed with a NameError; fail fast with a clear message.
            raise ValueError("The 'type' parameter must be either 'file' or 'folder'.")
        # Debug `print(url_or_path)` removed in favor of silent operation.

    try:
        summary, tree, content = await ingest_async(url_or_path)
        return content
    except Exception as e:
        # Chain the original exception so the real traceback is preserved.
        raise Exception(f"Failed to fetch content for '{url_or_path}': {e}") from e

async def folder_tree(
    path, level='broad-first', prefix='', cur_depth=1, max_depth=None
    ):
    """
    Async Markdown folder tree.

    Args:
        path (str): Root directory.
        level (str):
            - 'full': Show all folders and files, recursively, except hidden/system/cache entries.
            - 'broad-first': Only show immediate (top-level) folders and files (no nesting).
            - 'broad-second' or 'broad': Show top-level folders/files and their immediate child folders/files (two levels, no deeper).
        prefix (str): Indentation (internal)
        cur_depth (int): Current recursion depth (internal)
        max_depth (int): Max allowed depth (internal)
    Returns:
        str: Markdown tree string
    """
    def is_visible(entry):
        # Hide dotfiles, well-known Windows system artifacts, and any entry
        # whose name contains "cache".  NOTE(review): the "cache" substring
        # check also hides legitimate entries like "my_cache" — confirm intended.
        lower = entry.lower()
        hidden = entry.startswith('.')
        system = lower in ('desktop.ini', 'thumbs.db', '$recycle.bin', 'system volume information')
        cache = 'cache' in lower
        return not (hidden or system or cache)

    try:
        entries = await aiofiles.os.listdir(path)
        entries = sorted([e for e in entries if is_visible(e)])
    except FileNotFoundError:
        return f"{prefix}!! [Error: Folder not found]\n"
    except PermissionError:
        return f"{prefix}!! [Error: Permission denied]\n"
    except Exception as e:
        return f"{prefix}!! [Error: {str(e)}]\n"

    # Translate the requested `level` into a numeric recursion limit on the
    # first call; recursive calls pass max_depth through explicitly.
    if max_depth is None:
        if level == 'broad-first':
            max_depth = 1
        elif level in ('broad-second', 'broad'):
            max_depth = 2
        else:
            max_depth = float('inf')

    tree_str = ""

    if level.startswith('broad'):
        # Separate folders and files: in 'broad' modes folders are listed
        # before files at each level.
        # NOTE(review): os.path.isdir/isfile are synchronous calls inside an
        # async function; fine for small trees, may block the event loop on
        # slow disks — confirm acceptable.
        folders = [e for e in entries if os.path.isdir(os.path.join(path, e))]
        files = [e for e in entries if os.path.isfile(os.path.join(path, e))]
        items = folders + files
    else:
        items = entries

    for idx, entry in enumerate(items):
        full_path = os.path.join(path, entry)
        is_last = idx == len(items) - 1
        # Box-drawing connectors: the last sibling gets '└──', others '├──'.
        connector = '└── ' if is_last else '├── '
        tree_str += f"{prefix}{connector}{entry}\n"
        # Recurse into real sub-directories only; symlinks are not followed.
        if os.path.isdir(full_path) and not os.path.islink(full_path):
            if cur_depth < max_depth:
                # Continuation prefix: blank under a last sibling, pipe otherwise.
                extension = '    ' if is_last else '│   '
                try:
                    subtree = await folder_tree(
                        full_path,
                        level,
                        prefix + extension,
                        cur_depth=cur_depth + 1,
                        max_depth=max_depth,
                    )
                    tree_str += subtree
                except Exception as e:
                    tree_str += f"{prefix + extension}!! [Error: {str(e)}]\n"
    return tree_str

================================================
FILE: utils/knowledge_base.py
================================================
import os
import hashlib
from utils.websearch_utils import urls_to_docs, get_all_paths
from utils.retriever_utils import create_vectorstore_async
from chromadb.config import Settings
import chromadb
import logging
from langchain_text_splitters import TokenTextSplitter

async def create_knowledge_base(document_paths, hf_embeddings):
    """
    Creates a knowledge base from the given paths by extracting all files,
    processing them into documents, embedding them, and saving to ChromaDB local.

    Args:
        document_paths (list or str): List of paths or a single path to process.
        hf_embeddings: Hugging Face embeddings instance.

    Returns:
        str: The name of the created vector database collection.

    Raises:
        ValueError: If no files are found or no documents can be processed.
    """
    logger = logging.getLogger(__name__)

    if isinstance(document_paths, str):
        document_paths = [document_paths]

    # Expand every input path into the concrete file paths beneath it.
    # (The previous `all_paths = [all_paths]` list-of-lists wrapping with
    # `all_paths[0]` indexing everywhere was unnecessary and was removed.)
    all_paths = []
    for root in document_paths:
        paths = get_all_paths(root)
        if paths:
            all_paths.extend(paths)

    if not all_paths:
        raise ValueError("No files found in the provided paths.")

    logger.info(f"Found {len(all_paths)} file paths to process")

    # Process documents; local_mode=True because these are local files.
    docs_map = await urls_to_docs(all_paths, local_mode=True, split=False)

    logger.info(f"docs_map has {len(docs_map)} entries")
    for url, docs in docs_map.items():
        logger.info(f"URL {url}: {len(docs)} docs")

    all_docs = []
    for docs in docs_map.values():
        all_docs.extend(docs)

    logger.info(f"Total documents before splitting: {len(all_docs)}")

    # Split documents into ~512-token chunks with overlap for retrieval
    text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)
    all_docs = text_splitter.split_documents(all_docs)

    logger.info(f"Total documents after splitting: {len(all_docs)}")

    if not all_docs:
        raise ValueError("No documents could be processed.")

    # Create collection name based on hash of paths so the same inputs
    # map to the same collection.
    sorted_paths = ''.join(sorted(all_paths))
    collection_name = f"kb-{hashlib.md5(sorted_paths.encode()).hexdigest()[:8]}"

    # Delete existing collection if it exists to ensure fresh creation
    client = chromadb.PersistentClient(path="./chroma_db", settings=Settings(anonymized_telemetry=False, allow_reset=True))
    try:
        client.delete_collection(collection_name)
        logger.info(f"Deleted existing collection {collection_name}")
    except Exception as e:
        logger.info(f"Collection {collection_name} not found or error deleting: {e}")

    collections_after_delete = [c.name for c in client.list_collections()]
    logger.info(f"Collections after delete: {collections_after_delete}")

    # Create and save vectorstore
    try:
        await create_vectorstore_async(
            docs=all_docs,
            collection_name=collection_name,
            hf_embeddings=hf_embeddings,
            top_k=3,
            ensemble_weights=[0.4, 0.6],
            local_mode=True,
            persist_directory="./chroma_db"
        )
        logger.info(f"create_vectorstore_async completed for {collection_name}")
    except Exception as e:
        logger.error(f"Error in create_vectorstore_async: {e}")
        raise

    # Verify the collection was created with documents
    collections = client.list_collections()
    collection_names = [c.name for c in collections]
    if collection_name in collection_names:
        count = client.get_collection(collection_name).count()
        logger.info(f"Collection {collection_name} successfully created with {count} documents")
    else:
        logger.error(f"Collection {collection_name} not found after creation")

    return collection_name


================================================
FILE: utils/map.py
================================================
import logging
from typing import List, Tuple, Optional
import requests
import folium
import os

# Configure logging
# NOTE(review): calling basicConfig at import time configures the root logger
# for the whole process; consider moving this to the application entry point.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

# Step 1: Get coordinates from Nominatim (OpenStreetMap API for geocoding) with multiple results
def get_coordinates(location: str, limit: int = 3) -> List[Tuple[str, float, float]]:
    """
    Retrieve geographic coordinates and display names for a location via the Nominatim API.

    Args:
        location: Free-text place name to geocode.
        limit: Maximum number of candidate results to return.

    Returns:
        A list of (display_name, latitude, longitude) tuples, or an empty
        list when nothing matches or any request/parsing error occurs.
    """
    url = 'https://nominatim.openstreetmap.org/search'
    # BUG FIX: the query was previously interpolated directly into the URL
    # without percent-encoding, so locations containing spaces, '&', '#',
    # or non-ASCII characters produced broken requests. Passing params=
    # lets requests encode them correctly.
    params = {
        'q': location,
        'format': 'json',
        'addressdetails': 1,
        'limit': limit,
    }
    headers = {
        'User-Agent': 'YourAppName/1.0 (your.email@example.com)'  # Replace with your app's name and email
    }
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data:
            probable_locations = [(res['display_name'], float(res['lat']), float(res['lon'])) for res in data]
            logger.info(f"Found {len(probable_locations)} probable locations for '{location}'.")
            return probable_locations
        else:
            logger.warning(f"No data found for location: {location}")
            return []
    except requests.RequestException as e:
        logger.error(f"Request error while getting coordinates for '{location}': {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error in get_coordinates: {e}")
        return []

# Step 2: Get route using OSRM (Open Source Routing Machine)
def get_route(start_coords: Tuple[float, float], end_coords: Tuple[float, float]) -> Optional[dict]:
    """
    Fetch a driving route between two (lat, lon) points from the public OSRM API.

    Returns:
        The first route object from the OSRM response, or None when no route
        exists or the request fails.
    """
    # OSRM expects lon,lat ordering in the path segment.
    coord_segment = f"{start_coords[1]},{start_coords[0]};{end_coords[1]},{end_coords[0]}"
    osrm_url = (
        'http://router.project-osrm.org/route/v1/driving/'
        + coord_segment
        + '?overview=full&geometries=geojson&steps=true'
    )
    try:
        resp = requests.get(osrm_url, timeout=10)
        resp.raise_for_status()
        payload = resp.json()
        routes = payload.get('routes')
        if routes:
            logger.info("Route found between start and end coordinates.")
            return routes[0]
        logger.warning("No route found.")
        return None
    except requests.RequestException as e:
        logger.error(f"Request error while getting route: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error in get_route: {e}")
        return None

# Step 3: Get points of interest (restaurants, cafes, etc.) using Overpass API
def get_pois(
    location: Tuple[float, float],
    radius: int = 500,
    amenities: str = "restaurant|cafe|bar|hotel"
) -> List[Tuple[float, float, str]]:
    """
    Query the Overpass API for amenity nodes around a point.

    Args:
        location: (latitude, longitude) of the search centre.
        radius: Search radius in metres.
        amenities: Regex alternation of amenity tags to match.

    Returns:
        A list of (lat, lon, name) tuples (name defaults to 'Unnamed'),
        or an empty list on any error.
    """
    lat, lon = location
    overpass_url = "http://overpass-api.de/api/interpreter"
    overpass_query = f"""
    [out:json];
    node
      [amenity~"{amenities}"]
      (around:{radius},{lat},{lon});
    out body;
    """
    try:
        resp = requests.get(overpass_url, params={'data': overpass_query}, timeout=20)
        resp.raise_for_status()
        elements = resp.json().get('elements', [])
        pois = []
        for element in elements:
            poi_name = element['tags'].get('name', 'Unnamed')
            pois.append((element['lat'], element['lon'], poi_name))
        logger.info(f"Found {len(pois)} POIs near ({lat}, {lon}).")
        return pois
    except requests.RequestException as e:
        logger.error(f"Request error while getting POIs: {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error in get_pois: {e}")
        return []

# Step 4: Create and display the map with route and POIs
def create_map(
    start_coords: Tuple[float, float],
    end_coords: Tuple[float, float],
    route_coords: List[List[float]],
    pois_start: Optional[List[Tuple[float, float, str]]] = None,
    pois_end: Optional[List[Tuple[float, float, str]]] = None
) -> folium.Map:
    """
    Build a Folium map with start/end markers, the route polyline, and POI markers.

    Args:
        start_coords: (lat, lon) of the route start.
        end_coords: (lat, lon) of the route end.
        route_coords: Route geometry as GeoJSON-ordered [lon, lat] pairs.
        pois_start: Optional (lat, lon, name) POIs near the start.
        pois_end: Optional (lat, lon, name) POIs near the end.
    """
    fmap = folium.Map(location=list(start_coords), zoom_start=13)
    folium.Marker(location=list(start_coords), popup="Start").add_to(fmap)
    folium.Marker(location=list(end_coords), popup="End").add_to(fmap)
    try:
        # GeoJSON coordinates arrive as (lon, lat); folium wants (lat, lon).
        folium.PolyLine(
            locations=[[lat, lon] for lon, lat in route_coords],
            color="blue",
            weight=5
        ).add_to(fmap)
    except Exception as e:
        logger.error(f"Error adding route polyline: {e}")

    # Both POI groups get identical green info pins.
    for poi_group in (pois_start, pois_end):
        if not poi_group:
            continue
        for lat, lon, name in poi_group:
            folium.Marker(
                location=[lat, lon],
                popup=name,
                icon=folium.Icon(color='green', icon='info-sign')
            ).add_to(fmap)
    return fmap

# Helper function to automatically fix destinations based on probable locations
def auto_fix_destination(location: str, limit: int = 3) -> Optional[Tuple[str, float, float]]:
    """
    Geocode *location* and return the first (most probable) candidate.

    Returns:
        A (display_name, lat, lon) tuple, or None when geocoding yields
        no results.
    """
    candidates = get_coordinates(location, limit)
    if not candidates:
        logger.warning(f"Could not auto-fix destination for: {location}")
        return None
    logger.info(f"Auto-selected location: {candidates[0][0]}")
    return candidates[0]

# New function to generate directions in a human-readable format
def get_route_directions(route_data: dict) -> str:
    """
    Convert OSRM route data into simple human-readable driving directions.

    Args:
        route_data: An OSRM route object with a 'legs' list, each leg
            containing a 'steps' list of {'name', 'distance'} dicts.

    Returns:
        One line per step ("Road Name: <name> for <distance> meters."),
        joined by newlines; an empty string for empty or invalid input.
    """
    directions = []
    try:
        for leg in route_data.get('legs', []):
            for step in leg.get('steps', []):
                # BUG FIX: OSRM reports unnamed roads as an empty string,
                # not a missing key, so `get('name', 'same road')` never
                # fell back. `or` handles both missing and empty names.
                road = step.get('name') or 'same road'
                distance = step.get('distance', 0)
                directions.append(f"Road Name: {road} for {distance} meters.")
        logger.info("Generated route directions.")
    except Exception as e:
        logger.error(f"Error generating route directions: {e}")
    return "\n".join(directions)

# Main function to handle all logic and display map
def generate_map(
    start_location: Optional[str] = None,
    end_location: Optional[str] = None,
    pois_radius: int = 500,
    amenities: str = "restaurant|cafe|bar|hotel",
    limit: int = 3,
    task: str = "route_and_pois"
) -> str:
    """
    Generates a map displaying a route between two locations, along with nearby POIs.
    It also provides human-readable directions for the route.
    Handles cases where start or end can be None.

    Args:
        start_location: Free-text start place name (may be None).
        end_location: Free-text end place name (may be None).
        pois_radius: POI search radius in metres around each endpoint.
        amenities: Regex alternation of amenity types to look up.
        limit: Maximum geocoding candidates considered per location.
        task: 'location_only' to just geocode the inputs, or
            'route_and_pois' for the full route + POIs + saved HTML map.

    Returns:
        A human-readable result string; on failure an explanatory error string.
    """
    try:
        if task == 'location_only':
            locations = ''
            if start_location:
                start_coords = auto_fix_destination(start_location, limit)
                if not start_coords:
                    logger.error("Could not determine start location.")
                    locations += "Could not determine start location. Else rate limit reached. Try to find on internet or be less specific."
                else:
                    locations += f"Start location: {start_coords[0]} (Lat: {start_coords[1]}, Lon: {start_coords[2]})\n"
            if end_location:
                end_coords = auto_fix_destination(end_location, limit)
                if not end_coords:
                    logger.error("Could not determine end location.")
                    locations += "Could not determine end location. Else rate limit reached. Try to find on internet or be less specific."
                else:
                    locations += f"End location: {end_coords[0]} (Lat: {end_coords[1]}, Lon: {end_coords[2]})\n"
            if not start_location and not end_location:
                locations = "No start or end location provided."
            return locations

        elif task == 'route_and_pois':
            err = ''
            start_coords = auto_fix_destination(start_location, limit) if start_location else None
            end_coords = auto_fix_destination(end_location, limit) if end_location else None

            if not start_coords and not end_coords:
                logger.error("Neither start nor end location provided or found.")
                return "Neither start nor end location provided or found. Else rate limit reached"

            if not start_coords:
                logger.error("Could not determine start location.")
                err += "Could not determine start location. Else rate limit reached. Try to find on internet or be less specific. "

            if not end_coords:
                logger.error("Could not determine end location.")
                err += "Could not determine end location. Else rate limit reached. Try to find on internet or be less specific."

            if not start_coords or not end_coords:
                logger.error("Could not determine start or end location.")
                return err.strip()

            # auto_fix_destination returns (display_name, lat, lon);
            # downstream helpers expect bare (lat, lon) tuples.
            route_data = get_route((start_coords[1], start_coords[2]), (end_coords[1], end_coords[2]))
            if not route_data:
                logger.error("No route data found.")
                return "No route data found. Else rate limit reached"

            route_coords = route_data['geometry']['coordinates']
            pois_start = get_pois((start_coords[1], start_coords[2]), radius=pois_radius, amenities=amenities)
            pois_end = get_pois((end_coords[1], end_coords[2]), radius=pois_radius, amenities=amenities)
            m = create_map(
                (start_coords[1], start_coords[2]),
                (end_coords[1], end_coords[2]),
                route_coords,
                pois_start,
                pois_end
            )

            output_folder = "output"
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            m.save(os.path.join(output_folder, "map_with_route_and_pois.html"))
            logger.info("Map generated and saved as 'map_with_route_and_pois.html'.")
            # BUG FIX: the summary previously printed the full start tuple
            # but only end_coords[0], and ran "...html'." straight into
            # "start location:" with no separator.
            return get_route_directions(route_data) + f"\n\nPoints of interest around start: {pois_start}, end: {pois_end}. Map saved as 'map_with_route_and_pois.html'." + \
                " start location: " + str(start_coords) + ", end location: " + str(end_coords)

        else:
            # BUG FIX: an unrecognised task previously fell through and
            # implicitly returned None despite the declared -> str.
            logger.error(f"Unknown task: {task}")
            return f"Unknown task: {task}. Use 'location_only' or 'route_and_pois'."
    except Exception as e:
        logger.error(f"Error in map functions: {e},Else rate limit reached")
        return f"Error in map functions: {e},Else rate limit reached"

================================================
FILE: utils/process_content.py
================================================
import logging
import re

import fitz
from bs4 import BeautifulSoup
from markdownify import markdownify
from markitdown import MarkItDown
import pymupdf4llm
import sys
import os
from model_config import *
from openai import OpenAI
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))



from utils.utils import *

# Configure logging
# logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_html(soup):
    """
    Strip boilerplate from a parsed HTML document.

    Removes structural chrome (header, footer, nav, aside, list items) and
    elements carrying common menu/sidebar/advertisement classes, then returns
    the document serialized to a string. On error the (possibly partially
    cleaned) document is still returned as a string.
    """
    try:
        unwanted_tags = ['header', 'footer', 'nav', 'aside', 'li']
        for element in soup(unwanted_tags):
            element.decompose()
        unwanted_classes = ['menu', 'sidebar', 'advertisement']
        for element in soup.find_all(class_=unwanted_classes):
            element.decompose()
    except Exception as e:
        logger.error(f"Error cleaning HTML: {e}", exc_info=True)
    return str(soup)

def process_content(url, content_type, content):
    """
    Processes the content based on the file type (PDF or HTML). Converts PDFs to markdown and cleans HTML content 
    by removing unnecessary elements and converting to markdown.

    Args:
        url: Source URL or local path of the content.
        content_type: MIME type, or the internal marker 'internal_pdf'.
        content: Raw payload used for stream-based PDF/HTML processing.

    Returns:
        Markdown text prefixed with the source URL, or "" on any failure.
    """
    try:
        logger.info(f"Processing content from URL: {url} with content_type: {content_type}")
        if url.endswith('.pdf') and content_type == 'internal_pdf':
            try:
                logger.info("Processing internal PDF.")
                markdown_content = f"Content from {url}\n\n" + pymupdf4llm.to_markdown(url)
            except Exception as e:
                logger.error(f"Error processing internal PDF: {e}", exc_info=True)
                return ""
        elif url.endswith('.pdf') or 'application/pdf' in content_type:
            try:
                logger.info("Processing PDF from stream.")
                pdf_document = fitz.open(stream=content, filetype="pdf")
                text_content = pymupdf4llm.to_markdown(pdf_document)
                markdown_content =  f"Content from {url}\n\n" + markdownify(text_content)
            except Exception as e:
                logger.error(f"Error processing PDF: {e}", exc_info=True)
                return ""
        elif 'text/html' in content_type:
            logger.info("Processing HTML content.")
            soup = BeautifulSoup(content, 'lxml', from_encoding="iso-8859-1",
                                    )
            soup = clean_html(soup)
            markdown_content = markdownify(str(soup), strip=['a'])
            markdown_content = remove_consecutive_newlines(markdown_content)
            # Drop unbroken tokens of 21+ chars (usually URLs/hashes) that
            # add noise to the markdown.
            markdown_content = re.sub(r"\S{21,}", "", markdown_content)
        else:
            try:
                if any(url.lower().endswith(ext) for ext in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.tiff', '.svg']):
                    logger.info("Converting image to markdown caption")
                    client = OpenAI(base_url=openai_compatible[model_config['llm_type']]
                                    ,api_key=llm_api_key)
                    md = MarkItDown(llm_client=client, 
                                    llm_model=model_config['llm_model_name'],
                                    llm_prompt='Answer in 2 sections 1. OCR output if there is any text (without losing structure) 2. What exactly is this?')
                    result = md.convert(url)
                    markdown_content =  f"Content from {url}\n\n" + result.text_content
                else:
                    md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
                    markdown_content =  f"Content from {url}\n\n" + md.convert(url).text_content
            except Exception as e:
                # BUG FIX: this was a bare `except:` whose handler referenced
                # an undefined name `e` (NameError inside the handler) and
                # mislabelled the failure as an HTML error.
                logger.error(f"Error converting content with MarkItDown: {e}", exc_info=True)
                return ""
        logger.info("Content processed successfully.")

        return markdown_content
    except Exception as e:
        logger.error(f"Error in process_content: {e}", exc_info=True)
        return ""

def process_content_pdf(file):
    """
    Convert an uploaded PDF (staged under 'tmp/') to markdown text.

    Returns:
        The markdown string, or "" if conversion fails.
    """
    try:
        logger.info(f"Processing local PDF file: {file.name}")
        staged_path = 'tmp/' + file.name
        return pymupdf4llm.to_markdown(staged_path)
    except Exception as e:
        logger.error(f"Error processing local PDF: {e}", exc_info=True)
        return ""


================================================
FILE: utils/profiler_utils.py
================================================
# Profiling system for query_web_response
import time
import datetime
import logging
from collections import defaultdict

# Set up logger
logger = logging.getLogger(__name__)
class WebSearchProfiler:
    """
    Comprehensive profiling system for web search operations.
    Tracks timing and performance metrics for each pipeline step, per-URL
    processing statistics, and an estimate of human reading time saved.
    """
    
    def __init__(self, query):
        # query: the user query this profiler instance is scoped to
        self.query = query
        self.start_time = time.time()
        self.checkpoints = []  # ordered per-step dicts (start/end/duration/details)
        self.step_times = defaultdict(float)  # step name -> cumulative seconds
        self.step_counts = defaultdict(int)  # step name -> execution count
        self.current_step = None
        self.step_start_time = None
        self.total_urls_processed = 0
        self.successful_urls = 0
        self.failed_urls = 0
        self.total_docs_retrieved = 0
        self.context_length = 0
        
        # Time saved tracking
        self.websites_processed = 0
        self.total_words_processed = 0
        self.url_word_counts = {}  # Track words per URL
        self.HUMAN_READING_SPEED_WPM = 250  # Words per minute
        
        # URL-level processing tracking
        self.url_processing_times = {}  # {url: processing_time}
        self.url_details = {}  # {url: {docs_count, context_length, status, error}}
        self.url_start_times = {}  # Track when URL processing started
        
    def start_step(self, step_name, details=""):
        """Start timing a specific step"""
        # Steps never overlap: starting a new step implicitly ends the
        # previous one.
        if self.current_step:
            self.end_step()
        
        self.current_step = step_name
        self.step_start_time = time.time()
        self.step_counts[step_name] += 1
        
        checkpoint = {
            'step': step_name,
            'start_time': self.step_start_time,
            'details': details,
            'timestamp': datetime.datetime.now().strftime('%H:%M:%S.%f')[:-3]
        }
        self.checkpoints.append(checkpoint)
        logger.info(f"🔍 PROFILER [{self.query[:50]}...]: Starting {step_name} - {details}")
        
    def end_step(self, additional_info=""):
        """End timing the current step"""
        # No-op when nothing is being timed (e.g. end_step called twice).
        if not self.current_step or not self.step_start_time:
            return
            
        step_duration = time.time() - self.step_start_time
        self.step_times[self.current_step] += step_duration
        
        # Update the last checkpoint with end time and duration
        if self.checkpoints:
            self.checkpoints[-1].update({
                'end_time': time.time(),
                'duration': step_duration,
                'additional_info': additional_info
            })
        
        logger.info(f"⏱️  PROFILER [{self.query[:50]}...]: Completed {self.current_step} in {step_duration:.3f}s - {additional_info}")
        
        self.current_step = None
        self.step_start_time = None
        
    def add_metric(self, metric_name, value):
        """Add a custom metric"""
        # Counters are incremented by `value`; context_length is overwritten.
        # Unknown metric names are silently ignored.
        if metric_name == 'urls_processed':
            self.total_urls_processed += value
        elif metric_name == 'successful_urls':
            self.successful_urls += value
        elif metric_name == 'failed_urls':
            self.failed_urls += value
        elif metric_name == 'docs_retrieved':
            self.total_docs_retrieved += value
        elif metric_name == 'context_length':
            self.context_length = value
    
    def add_url_content(self, url, content):
        """Add content from a processed URL for time saved calculation"""
        if content and url:
            # Count words in the content (simple word count by splitting on whitespace)
            word_count = len(content.split())
            self.url_word_counts[url] = word_count
            self.total_words_processed += word_count
            self.websites_processed += 1
            logger.debug(f"Added {word_count} words from {url} (Total: {self.total_words_processed} words)")
    
    def start_url_processing(self, url):
        """Start tracking processing time for a specific URL"""
        self.url_start_times[url] = time.time()
        logger.debug(f"Started tracking URL processing: {url}")
    
    def end_url_processing(self, url, docs_count=0, context_length=0, status="success", error=None):
        """End tracking processing time for a specific URL"""
        if url in self.url_start_times:
            processing_time = time.time() - self.url_start_times[url]
            self.url_processing_times[url] = processing_time
            self.url_details[url] = {
                'docs_count': docs_count,
                'context_length': context_length,
                'status': status,
                'error': error,
                'processing_time': processing_time
            }
            del self.url_start_times[url]  # Clean up
            logger.debug(f"Completed URL processing: {url} in {processing_time:.3f}s")
        else:
            logger.warning(f"URL processing end called without start: {url}")
    
    def calculate_time_saved(self):
        """Calculate time saved by CoexistAI vs manual reading"""
        if self.total_words_processed == 0:
            return 0, 0, 0  # minutes, hours, days
        
        # Calculate time to read all content at human reading speed
        reading_time_minutes = self.total_words_processed / self.HUMAN_READING_SPEED_WPM
        reading_time_hours = reading_time_minutes / 60
        reading_time_days = reading_time_hours / 24
        
        return reading_time_minutes, reading_time_hours, reading_time_days
    
    def get_time_saved_summary(self):
        """Get a formatted summary of time saved"""
        minutes, hours, days = self.calculate_time_saved()
        
        # Pick the largest sensible unit for display.
        if minutes < 1:
            return "Less than 1 minute of reading time saved"
        elif minutes < 60:
            return f"{minutes:.1f} minutes of reading time saved"
        elif hours < 24:
            return f"{hours:.1f} hours ({minutes:.0f} minutes) of reading time saved"
        else:
            return f"{days:.1f} days ({hours:.1f} hours) of reading time saved"
            
    def get_summary(self):
        """Generate a comprehensive profiling summary"""
        # Close any step still running so its time is included in the report.
        if self.current_step:
            self.end_step("Final step")
            
        total_time = time.time() - self.start_time
        
        # Calculate time saved
        time_saved_summary = self.get_time_saved_summary()
        minutes, hours, days = self.calculate_time_saved()
        
        summary = [
            "\n" + "="*80,
            f"🔍 WEB SEARCH PROFILING REPORT - Query: {self.query[:60]}...",
            "="*80,
            f"⏰ Total Execution Time: {total_time:.3f} seconds",
            f"📊 Total Steps Executed: {len(self.checkpoints)}",
            f"🌐 URLs Processed: {self.total_urls_processed} (✅ {self.successful_urls}, ❌ {self.failed_urls})",
            f"📄 Documents Retrieved: {self.total_docs_retrieved}",
            f"📝 Final Context Length: {self.context_length} characters",
            "",
            "⏱️  TIME SAVED ANALYSIS:",
            "-" * 50,
            f"📚 Websites Read for You: {self.websites_processed}",
            f"📖 Total Words Processed: {self.total_words_processed:,}",
            f"🚀 Time Saved: {time_saved_summary}",
            f"⚡ Efficiency: {total_time/60:.1f} min processing vs {minutes:.1f} min manual reading",
            f"📈 Speed Multiplier: {minutes/max(total_time/60, 0.01):.1f}x faster than manual reading",
            "",
            "🚀 7-STEP PIPELINE BREAKDOWN:",
            "-" * 50
        ]
        
        # Add detailed 7-step report
        summary.extend(self._get_seven_step_report(total_time))
        
        summary.extend([
            "",
            "🌐 URL-LEVEL PROCESSING REPORT:",
            "-" * 50
        ])
        
        # Add URL-level processing report
        summary.extend(self._get_url_processing_report())
        
        summary.extend([
            "",
            "📈 COMPLETE STEP-BY-STEP BREAKDOWN:",
            "-" * 50
        ])
        
        # Sort steps by total time spent
        sorted_steps = sorted(self.step_times.items(), key=lambda x: x[1], reverse=True)
        
        for step_name, total_time_spent in sorted_steps:
            count = self.step_counts[step_name]
            avg_time = total_time_spent / count if count > 0 else 0
            percentage = (total_time_spent / total_time) * 100 if total_time > 0 else 0
            
            summary.append(f"  {step_name:.<30} {total_time_spent:>8.3f}s ({percentage:>5.1f}%) [{count}x, avg: {avg_time:.3f}s]")
        
        summary.extend([
            "",
            "🕐 DETAILED TIMELINE:",
            "-" * 50
        ])
        
        for i, checkpoint in enumerate(self.checkpoints, 1):
            duration = checkpoint.get('duration', 0)
            details = checkpoint.get('details', '')
            additional_info = checkpoint.get('additional_info', '')
            timestamp = checkpoint.get('timestamp', '')
            
            info_str = f" - {details}" if details else ""
            additional_str = f" | {additional_info}" if additional_info else ""
            
            summary.append(f"  {i:2d}. [{timestamp}] {checkpoint['step']:.<25} {duration:>8.3f}s{info_str}{additional_str}")
        
        summary.extend([
            "",
            "🎯 PERFORMANCE INSIGHTS:",
            "-" * 50
        ])
        
        # Add performance insights
        if self.total_urls_processed > 0:
            success_rate = (self.successful_urls / self.total_urls_processed) * 100
            summary.append(f"  • URL Success Rate: {success_rate:.1f}%")
            
        if 'search_execution' in self.step_times and 'context_generation' in self.step_times:
            search_time = self.step_times['search_execution']
            context_time = self.step_times['context_generation']
            if search_time > 0 and context_time > 0:
                ratio = context_time / search_time
                summary.append(f"  • Context/Search Time Ratio: {ratio:.2f}x")
        
        if self.context_length > 0 and total_time > 0:
            chars_per_second = self.context_length / total_time
            summary.append(f"  • Processing Speed: {chars_per_second:.0f} chars/second")
            
        summary.append("="*80 + "\n")
        
        return "\n".join(summary)
    
    def _get_seven_step_report(self, total_time):
        """Generate detailed 7-step pipeline report""" 
        # Define the 7 main pipeline steps in order
        main_steps = [
            ("1️⃣", "query_agent", "Query Analysis", "Generating search queries from user input"),
            ("2️⃣", "web_search_execution", "Web Search", "Executing web search and URL extraction"),
            ("3️⃣", "url_collection", "URL Collection", "Collecting and preparing URLs for processing"),
            ("4️⃣", "parallel_url_processing", "URL Processing", "Processing URLs in parallel"),
            ("5️⃣", "context_building", "Context Building", "Building final context from processed documents"),
            ("6️⃣", "context_generation", "Context Generation", "Processing URLs and generating context"),
            ("7️⃣", "response_generation", "Response Generation", "Generating final response from context")
        ]
        
        report = []
        total_main_time = 0
        
        # Calculate total time for main steps
        for _, step_key, _, _ in main_steps:
            if step_key in self.step_times:
                total_main_time += self.step_times[step_key]
        
        # Generate report for each step
        for emoji, step_key, step_name, description in main_steps:
            step_time = self.step_times.get(step_key, 0)
            step_count = self.step_counts.get(step_key, 0)
            
            if step_count > 0:
                percentage = (step_time / total_time) * 100 if total_time > 0 else 0
                avg_time = step_time / step_count
                
                # Create visual progress bar
                bar_length = 20
                filled_length = int(bar_length * percentage / 100) if percentage <= 100 else bar_length
                bar = "█" * filled_length + "░" * (bar_length - filled_length)
                
                # Status indicator
                if step_time > 0:
                    status = "✅"
                    time_str = f"{step_time:.3f}s"
                else:
                    status = "⏸️"
                    time_str = "0.000s"
                
                report.extend([
                    f"{emoji} {step_name:.<20} {status} {time_str:>8} ({percentage:>5.1f}%) [{step_count}x]",
                    f"   📊 [{bar}] {description}",
                    f"   ⚡ Avg: {avg_time:.3f}s/execution",
                    ""
                ])
            else:
                # Step was not executed
                bar = "░" * 20
                report.extend([
                    f"{emoji} {step_name:.<20} ⏸️  0.000s ( 0.0%) [0x]",
                    f"   📊 [{bar}] {description}",
                    f"   ⚡ Not executed",
                    ""
                ])
        
        # Add pipeline summary
        pipeline_efficiency = (total_main_time / total_time) * 100 if total_time > 0 else 0
        other_time = total_time - total_main_time
        
        report.extend([
            f"📋 PIPELINE SUMMARY:",
            f"   • Main Steps Time: {total_main_time:.3f}s ({pipeline_efficiency:.1f}% of total)",
            f"   • Other Operations: {other_time:.3f}s ({100-pipeline_efficiency:.1f}% of total)",
            f"   • Pipeline Efficiency: {pipeline_efficiency:.1f}%"
        ])
        
        return report
    
    def _get_url_processing_report(self):
        """Generate detailed URL-level processing report"""
        if not self.url_processing_times:
            return ["   📝 No URL processing data available"]
        
        report = []
        
        # Sort URLs by processing time (slowest first)
        sorted_urls = sorted(
            self.url_processing_times.items(), 
            key=lambda x: x[1], 
            reverse=True
        )
        
        total_url_time = sum(self.url_processing_times.values())
        
        # Add summary stats
        report.extend([
            f"📊 URL PROCESSING SUMMARY:",
            f"   • Total URLs Processed: {len(self.url_processing_times)}",
            f"   • Total URL Processing Time: {total_url_time:.3f}s",
            f"   • Average Time per URL: {total_url_time/len(self.url_processing_times):.3f}s",
            f"   • Fastest URL: {min(self.url_processing_times.values()):.3f}s",
            f"   • Slowest URL: {max(self.url_processing_times.values()):.3f}s",
            ""
        ])
        
        # Add individual URL details
        report.append("🔗 INDIVIDUAL URL PERFORMANCE:")
        
        for i, (url, processing_time) in enumerate(sorted_urls, 1):
            details = self.url_details.get(url, {})
            docs_count = details.get('docs_count', 0)
            context_length = details.get('context_length', 0)
            status = details.get('status', 'unknown')
            error = details.get('error')
            
            # Determine status emoji and color
            if status == 'success':
                status_emoji = "✅"
            elif status == 'failed':
                status_emoji = "❌"
            elif status == 'timeout':
                status_emoji = "⏰"
            else:
                status_emoji = "❓"
            
            # Create performance indicator
            if processing_time < 2.0:
                speed_emoji = "🚀"  # Fast
            elif processing_time < 5.0:
                speed_emoji = "🔄"  # Medium
            else:
                speed_emoji = "🐌"  # Slow
            
            # Truncate URL for display
            display_url = url if len(url) <= 60 else url[:57] + "..."
            
            report.append(
                f"   {i:2d}. {status_emoji} {speed_emoji} {processing_time:>6.3f}s | "
                f"Docs: {docs_count:2d} | Context: {context_length:4d} chars"
            )
            report.append(f"       🔗 {display_url}")
            
            if error:
                report.append(f"       ⚠️  Error: {str(error)[:80]}..." if len(str(error)) > 80 else f"       ⚠️  Error: {error}")
            
            report.append("")  # Empty line for spacing
        
        # Add performance insights
        if len(sorted_urls) > 1:
            slowest_time = sorted_urls[0][1]
            fastest_time = sorted_urls[-1][1]
            speed_ratio = slowest_time / fastest_time if fastest_time > 0 else float('inf')
            
            report.extend([
                "📈 URL PERFORMANCE INSIGHTS:",
                f"   • Speed Variation: {speed_ratio:.1f}x difference between fastest and slowest",
                f"   • Slowest URL took {slowest_time:.3f}s ({(slowest_time/total_url_time)*100:.1f}% of total URL time)",
                f"   • Top 3 URLs account for {sum(time for _, time in sorted_urls[:3])/total_url_time*100:.1f}% of processing time"
            ])
            
            # Identify potential bottlenecks
            slow_urls = [url for url, time in sorted_urls if time > total_url_time/len(sorted_urls) * 2]
            if slow_urls:
                report.append(f"   ⚠️  {len(slow_urls)} URLs are significantly slower than average")
        
        return report
    
    def print_summary(self):
        """Print the profiling summary"""
        print(self.get_summary())
        
# Module-level singleton profiler shared across the web-search pipeline.
_current_profiler = None

def get_profiler():
    """Return the currently active profiler instance, or None if unset."""
    return _current_profiler

def set_profiler(profiler):
    """Install *profiler* as the active global profiler instance."""
    global _current_profiler
    _current_profiler = profiler



================================================
FILE: utils/reddit_utils.py
================================================
import requests
import time
import random
from utils.config import *
from utils.utils import *
from rank_bm25 import BM25Okapi

# Browser-like User-Agent so Reddit's public JSON endpoints don't reject us as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Endpoint templates for each supported fetch mode; placeholders are filled via
# str.format() (extra keyword args passed to format() are simply ignored).
url_templates = {
    'hot': 'https://www.reddit.com/r/{subreddit}/hot.json',
    'new': 'https://www.reddit.com/r/{subreddit}/new.json',
    'top': 'https://www.reddit.com/r/{subreddit}/top.json?t={time_filter}',
    'search': 'https://www.reddit.com/search.json?q={query}&sort={sort_type}&type=posts',
    'url': '{url}.json'
}

# Function to fetch and parse Reddit posts
def fetch_reddit_posts(subreddit=None, url_type='hot', limit=10, time_filter='all', custom_url=None, search_query=None, sort_type='relevance'):
    """
    Fetches posts from Reddit based on the provided subreddit, URL type, and query parameters.

    Args:
        subreddit (str, optional): The subreddit to fetch posts from (e.g., 'python'). Defaults to None.
        url_type (str): The type of posts to fetch ('hot', 'new', 'top', 'search', 'url'). Defaults to 'hot'.
        limit (int, optional): The number of posts to retrieve. Defaults to 10.
        time_filter (str, optional): Time filter for top posts ('day', 'week', 'month', 'year', 'all'). Defaults to 'all'.
        custom_url (str, optional): A custom Reddit URL for fetching posts. Defaults to None.
        search_query (str, optional): Search query to fetch specific posts. Defaults to None.
        sort_type (str, optional): Sorting type for search results ('relevance', 'new', 'top'). Defaults to 'relevance'.

    Returns:
        list: A list of dictionaries containing post titles, links, IDs, text, and comments.
            On failure, a single-element list containing an ``{'error': ...}`` dict.
    """
    posts = []
    try:
        # Pick the endpoint: explicit post URL, site-wide search, or a subreddit listing.
        if url_type == 'url' and custom_url:
            url = url_templates['url'].format(url=custom_url)
        elif url_type == 'search' and search_query:
            url = url_templates['search'].format(query=search_query, sort_type=sort_type)
        else:
            url = url_templates[url_type].format(subreddit=subreddit, time_filter=time_filter)

        # A timeout prevents an unresponsive endpoint from hanging the caller forever.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()

        # Parse JSON to get post titles, ids, links, and text
        if url_type == 'url':
            try:
                post_data = data[0]['data']['children'][0]['data']
                title = post_data['title']
                link = 'https://www.reddit.com' + post_data['permalink']
                post_id = post_data['id']
                selftext = post_data.get('selftext', '')
                posts.append({'title': title, 'link': link, 'id': post_id, 'text': selftext, 'comments': []})
            except (KeyError, IndexError, TypeError):
                # Malformed/unexpected payload for a single-post URL; return what we have
                # instead of crashing (previously a bare except hid *all* errors here).
                pass
        else:
            # Collect posts with their scores
            post_list = []
            for post in data['data']['children']:
                post_data = post['data']
                title = post_data['title']
                link = 'https://www.reddit.com' + post_data['permalink']
                post_id = post_data['id']
                selftext = post_data.get('selftext', '')
                score = post_data.get('score', 0)
                post_list.append({'title': title, 'link': link, 'id': post_id, 'text': selftext, 'comments': [], 'score': score})
            # Reddit's search API returns no relevance score, so rank locally with BM25.
            if search_query:
                # Prepare corpus and tokenize
                corpus = [post['title'] + ' ' + post['text'] for post in post_list]
                tokenized_corpus = [doc.lower().split() for doc in corpus]
                bm25 = BM25Okapi(tokenized_corpus)
                tokenized_query = search_query.lower().split()
                scores = bm25.get_scores(tokenized_query)
                # Attach BM25 scores and sort
                for i, post in enumerate(post_list):
                    post['bm25_score'] = scores[i]
                post_list.sort(key=lambda x: x['bm25_score'], reverse=True)
            else:
                post_list.sort(key=lambda x: x['score'], reverse=True)
            posts.extend(post_list[:limit])
        
        return posts
    except Exception as e:
        # Any failure (network, parsing, bad template args) surfaces as an error entry.
        return [{"error": f"Error fetching Reddit posts: {str(e)}"}]

# Function to fetch comments for a given post
def fetch_post_comments(post_id, limit=5, is_custom_url=False):
    """
    Fetches comments for a specific Reddit post by post ID.

    Args:
        post_id (str): The ID of the Reddit post to fetch comments for.
        limit (int, optional): The number of comments to retrieve. Defaults to 5.
        is_custom_url (bool, optional): Unused; retained for backward compatibility.
            (Both historical code paths built the identical URL.)

    Returns:
        list: A list of comments (as strings) for the given post; may be empty
            or partial if the request fails.
    """
    comments = []
    try:
        # The comments endpoint is the same regardless of how the post was discovered,
        # so the old `is_custom_url` ternary (identical branches) is gone.
        url = f'https://www.reddit.com/comments/{post_id}.json'
        # Timeout prevents a stuck endpoint from hanging the caller.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()
        
        # data[1] holds the comment listing; kind 't1' entries are actual comments
        # (as opposed to 'more' stubs and other listing kinds).
        for comment in data[1]['data']['children'][:limit]:
            if comment['kind'] == 't1':
                comment_data = comment['data']
                comments.append(comment_data['body'])
        
        return comments
    except requests.exceptions.RequestException as e:
        print(f"Error fetching comments for post {post_id}: {e}")
        return comments

# Function to add random delays
def random_delay(min_seconds=1, max_seconds=5):
    """
    Sleep for a random interval to mimic natural browsing behavior and avoid rate limiting.

    Args:
        min_seconds (float, optional): Lower bound of the pause in seconds. Defaults to 1.
        max_seconds (float, optional): Upper bound of the pause in seconds. Defaults to 5.
    """
    # Defaults preserve the original fixed 1-5 second behavior for existing callers.
    time.sleep(random.uniform(min_seconds, max_seconds))

# Main function to scrape posts and their comments
def reddit_reader(subreddit=None, url_type='hot', n=10, k=5, custom_url=None, time_filter='all', search_query=None, sort_type='relevance'):
    """
    Fetches Reddit posts and their associated comments.

    Args:
        subreddit (str, optional): The subreddit to fetch posts from. Defaults to None.
        url_type (str, optional): The type of posts to fetch ('hot', 'new', 'top', 'search', 'url'). Defaults to 'hot'.
        n (int, optional): The number of posts to fetch. Defaults to 10.
        k (int, optional): The number of comments to fetch for each post. Defaults to 5.
        custom_url (str, optional): A custom URL to fetch posts from. Defaults to None.
        time_filter (str, optional): Time filter for top posts ('day', 'week', 'month', 'year', 'all'). Defaults to 'all'.
        search_query (str, optional): Search query for fetching specific posts. Defaults to None.
        sort_type (str, optional): Sort type for search results ('relevance', 'new', 'top'). Defaults to 'relevance'.

    Returns:
        list: A list of posts, each with associated comments. Error entries from a
            failed fetch are passed through unchanged (without comment lookup).
    """
    all_posts = []

    print(f"Fetching posts from {url_type} search" if url_type == 'search' else f"Fetching posts from /r/{subreddit}" if subreddit else f"Fetching posts from {custom_url}")
    posts = fetch_reddit_posts(subreddit=subreddit, url_type=url_type, limit=n, time_filter=time_filter, custom_url=custom_url, search_query=search_query, sort_type=sort_type)
    for post in posts:
        # fetch_reddit_posts signals failure with a single {'error': ...} dict that
        # carries no 'id'/'title'; skip comment fetching for those instead of
        # crashing with a KeyError.
        if 'error' in post:
            continue
        print(f"Fetching comments for post: {post['title']}")
        comments = fetch_post_comments(post['id'], limit=k, is_custom_url=(url_type == 'url'))
        post['comments'] = comments
        random_delay()  # Add delay between requests for comments
        
    all_posts.extend(posts)
    random_delay()  # Add delay between requests for posts

    return all_posts

def reddit_to_context(prompt, subreddit=None, url_type='hot', n=10, k=5, custom_url=None, time_filter='all', search_query=None, sort_type='relevance'):
    """
    Build an LLM context string: the prompt followed by the stringified Reddit posts.

    Args:
        prompt (str): The initial prompt to append Reddit content to.
        subreddit (str, optional): The subreddit to fetch posts from. Defaults to None.
        url_type (str, optional): The type of posts to fetch ('hot', 'new', 'top', 'search', 'url'). Defaults to 'hot'.
        n (int, optional): The number of posts to fetch. Defaults to 10.
        k (int, optional): The number of comments to fetch for each post. Defaults to 5.
        custom_url (str, optional): A custom URL to fetch posts from. Defaults to None.
        time_filter (str, optional): Time filter for top posts ('day', 'week', 'month', 'year', 'all'). Defaults to 'all'.
        search_query (str, optional): Search query for fetching posts. Defaults to None.
        sort_type (str, optional): Sort type for search results ('relevance', 'new', 'top'). Defaults to 'relevance'.

    Returns:
        str: The concatenated context string.
    """
    fetched = reddit_reader(subreddit, url_type, n, k, custom_url, time_filter, search_query, sort_type)
    return prompt + str(fetched)

def reddit_reader_response(
                           subreddit:str, 
                           url_type:str, 
                           n:int, k:int,
                           custom_url:str, 
                           time_filter:str, 
                           search_query:str, 
                           sort_type:str,
                           model
):
    """
    Fetch Reddit posts/comments and summarize them with the provided LLM.

    Args:
        subreddit (str): The subreddit to fetch posts from.
        url_type (str): The type of URL to use ('hot', 'new', 'top', 'search', 'url').
        n (int): The number of posts to fetch.
        k (int): The number of comments to fetch for each post.
        custom_url (str): A custom URL to fetch posts from.
        time_filter (str): The time filter for top posts ('day', 'week', 'month', 'year', 'all').
        search_query (str): The search query for fetching posts.
        sort_type (str): The sort type for search results ('relevance', 'new', 'top').
        model: LangChain-style chat model exposing ``invoke``.

    Returns:
        str: The LLM's summary of the fetched Reddit content.
    """
    # Seed the context with the project's Reddit-summary prompt template.
    summary_prompt = prompts['reddit_summary_prompt'].format(search_query=search_query)
    full_context = reddit_to_context(summary_prompt, subreddit, url_type, n, k, custom_url=custom_url, time_filter=time_filter, search_query=search_query, sort_type=sort_type)
    return model.invoke(full_context).content



================================================
FILE: utils/retriever_utils.py
================================================
import hashlib
import time
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from langchain_community.vectorstores import Chroma
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
import chromadb
from chromadb.config import Settings

# Set up logger
logger = logging.getLogger(__name__)

# Global persistent ChromaDB client, shared for better performance
_chroma_client = None
_chroma_persistent_path = "./chroma_db"

def get_chroma_client():
    """Return the shared persistent ChromaDB client, creating it lazily on first use."""
    global _chroma_client
    if _chroma_client is not None:
        return _chroma_client

    _chroma_client = chromadb.PersistentClient(
        path=_chroma_persistent_path,
        settings=Settings(
            anonymized_telemetry=False,
            allow_reset=True
        )
    )
    # Best-effort health check; a failure is logged but does not prevent use.
    try:
        _chroma_client.heartbeat()
    except Exception as e:
        logger.warning(f"ChromaDB client health check failed: {e}")

    return _chroma_client

async def create_vectorstore_async(docs, collection_name, hf_embeddings, top_k, ensemble_weights=[0.25, 0.75], local_mode=False, batch_size=32, persist_directory="./chroma_db"):
    """
    Asynchronously creates a vectorstore from the given documents using Chroma and returns an ensemble retriever.
    Uses persistent ChromaDB client with optimized settings for better performance.
    Each subquery gets its own collection for query isolation.

    Args:
        docs (list): A list of documents to be added to the vectorstore.
        collection_name (str): The name of the collection to be used for the vectorstore.
        hf_embeddings (object): The embedding model to be used for the vectorstore.
        top_k (int): The number of documents to retrieve from the vectorstore.
        ensemble_weights (list): Weights for BM25 and semantic retrievers [bm25_weight, semantic_weight].
        local_mode (bool): If True, reuse a stable collection name (no timestamp suffix)
            so a local knowledge base persists across runs.
        batch_size (int): Batch size hint forwarded to the embedding step.
        persist_directory (str): On-disk location for the Chroma database.

    Returns:
        EnsembleRetriever: An ensemble retriever that combines BM25 and semantic retrievers.
    """
    # Millisecond timestamp keeps per-query collections unique and conflict-free.
    timestamp = str(int(time.time() * 1000))
    if local_mode:
        unique_collection_name = f"{collection_name}"
    else:
        unique_collection_name = f"{collection_name}_{timestamp}"

    # get_running_loop() is the correct (non-deprecated) call inside a coroutine;
    # get_event_loop() here is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        # Run the blocking ChromaDB work off the event loop thread.
        ensemble_retriever = await loop.run_in_executor(
            executor,
            _create_vectorstore_sync,
            docs, unique_collection_name, hf_embeddings, top_k, ensemble_weights, batch_size, persist_directory
        )

    return ensemble_retriever

def _create_vectorstore_sync(docs, unique_collection_name, hf_embeddings, top_k, ensemble_weights, batch_size=8, persist_directory="./chroma_db"):
    """
    Synchronous helper function for creating vectorstore.
    This runs in a thread pool to avoid blocking the event loop.

    Args:
        docs (list): Documents to index; also used to build the BM25 retriever.
        unique_collection_name (str): Target Chroma collection name.
        hf_embeddings: Embedding model; may expose ``embed_documents`` or ``embed``.
        top_k (int): Number of documents each retriever returns.
        ensemble_weights (list): [bm25_weight, semantic_weight] for the ensemble.
        batch_size (int): Batch-size hint for ``embed_documents``. Default here is 8
            while the async wrapper passes 32 — NOTE(review): confirm the mismatch
            is intentional.
        persist_directory (str): On-disk location for the Chroma database.

    Returns:
        EnsembleRetriever: BM25 + semantic retriever combination.

    Raises:
        Exception: Any failure is logged and re-raised to the caller.
    """
    try:
        # Use persistent client for the specified directory
        client = chromadb.PersistentClient(
            path=persist_directory,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )
        
        # If the collection already exists, load and reuse it instead of recreating.
        existing_collections = [c.name for c in client.list_collections()]

        if unique_collection_name in existing_collections:
            logger.info(f"Collection {unique_collection_name} exists — loading existing vectorstore")
            vectorstore = Chroma(
                client=client,
                collection_name=unique_collection_name,
                embedding_function=hf_embeddings
            )
            # Do not re-add documents to avoid duplicates; assume caller passed docs for BM25
        else:
            # Create vectorstore using the collection
            vectorstore = Chroma(
                client=client,
                collection_name=unique_collection_name,
                embedding_function=hf_embeddings
            )
            # Add documents to vectorstore (only when creating new collection)
            if docs:
                # Try to precompute embeddings in batches to improve performance
                try:
                    texts = [d.page_content for d in docs]
                    metadatas = [getattr(d, 'metadata', {}) for d in docs]
                    embeddings = None
                    # Prefer embed_documents API if available
                    if hasattr(hf_embeddings, 'embed_documents'):
                        try:
                            embeddings = hf_embeddings.embed_documents(texts, batch_size=batch_size)
                        except TypeError:
                            # fallback if batch_size not supported by this embedder
                            embeddings = hf_embeddings.embed_documents(texts)
                    elif hasattr(hf_embeddings, 'embed'):
                        try:
                            embeddings = hf_embeddings.embed(texts)
                        except Exception:
                            embeddings = None

                    if embeddings is not None:
                        # Add texts with precomputed embeddings
                        try:
                            vectorstore.add_texts(texts=texts, metadatas=metadatas, embeddings=embeddings)
                        except Exception:
                            # Fallback to add_documents if add_texts fails
                            vectorstore.add_documents(docs)
                    else:
                        # No embedding function available; let vectorstore compute embeddings
                        vectorstore.add_documents(docs)
                except Exception as e:
                    logger.warning(f"Batched embedding failed, falling back to add_documents: {e}")
                    try:
                        vectorstore.add_documents(docs)
                    except Exception as e2:
                        logger.error(f"Failed to add documents to vectorstore: {e2}")
        
        # Create retrievers
        sem_retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})
        # NOTE(review): BM25Retriever.from_documents raises on an empty docs list;
        # callers appear to always pass non-empty docs — confirm.
        bm25_retriever = BM25Retriever.from_documents(docs)
        bm25_retriever.k = top_k
        
        # Create ensemble retriever with configurable weights
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, sem_retriever], 
            weights=ensemble_weights
        )
        
        logger.info(f"Created vectorstore with collection: {unique_collection_name}")
        return ensemble_retriever
        
    except Exception as e:
        logger.error(f"Error creating vectorstore: {e}")
        raise

# Keep synchronous version for backward compatibility
def create_vectorstore(docs, collection_name, hf_embeddings, top_k, ensemble_weights=[0.25, 0.75]):
    """
    Blocking counterpart of create_vectorstore_async(), kept for older callers.
    For better performance, use create_vectorstore_async() instead.
    """
    # Millisecond timestamp suffix keeps each invocation's collection unique.
    suffix = str(int(time.time() * 1000))
    return _create_vectorstore_sync(docs, f"{collection_name}_{suffix}", hf_embeddings, top_k, ensemble_weights)

async def cleanup_old_collections_async(max_collections=20):
    """
    Asynchronously clean up old ChromaDB collections to prevent memory buildup.
    Keeps only the most recent collections.
    
    Args:
        max_collections (int): Maximum number of collections to keep
    """
    # get_running_loop() is the correct (non-deprecated) call inside a coroutine;
    # the blocking ChromaDB calls run in a worker thread off the event loop.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        await loop.run_in_executor(executor, _cleanup_collections_sync, max_collections)

def _cleanup_collections_sync(max_collections):
    """
    Blocking helper that deletes the oldest ChromaDB collections beyond the limit.
    Runs inside a worker thread so the event loop is never blocked.
    """
    try:
        client = get_chroma_client()
        collections = client.list_collections()

        # Nothing to do while we are within the configured limit.
        if len(collections) <= max_collections:
            logger.info(f"Collection count ({len(collections)}) within limit ({max_collections})")
            return

        # Collection names embed a millisecond timestamp, so lexicographic order
        # approximates age: keep the newest `max_collections`, drop the rest.
        oldest_first = sorted(collections, key=lambda c: c.name)
        stale = oldest_first[:-max_collections]

        logger.info(f"Cleaning up {len(stale)} old collections")
        for collection in stale:
            try:
                client.delete_collection(collection.name)
                logger.info(f"Deleted old collection: {collection.name}")
            except Exception as e:
                logger.warning(f"Failed to delete collection {collection.name}: {e}")

    except Exception as e:
        logger.error(f"Error during collection cleanup: {e}")

# Keep synchronous version for backward compatibility
def cleanup_old_collections(max_collections=20):
    """
    Blocking wrapper around the collection cleanup, kept for backward compatibility.
    For better performance, use cleanup_old_collections_async() instead.
    """
    _cleanup_collections_sync(max_collections)


================================================
FILE: utils/startup_banner.py
================================================
"""
CoexistAI Startup Banner Module
Displays professional ASCII banner and system information on startup
"""

import os
import sys
from datetime import datetime
from pathlib import Path

def get_ascii_banner():
    """Return the CoexistAI ASCII-art banner as an inline multi-line string."""
    return """
 ██████╗ ██████╗ ███████╗██╗  ██╗██╗███████╗████████╗ █████╗ ██╗
██╔════╝██╔═══██╗██╔════╝╚██╗██╔╝██║██╔════╝╚══██╔══╝██╔══██╗██║
██║     ██║   ██║█████╗   ╚███╔╝ ██║███████╗   ██║   ███████║██║
██║     ██║   ██║██╔══╝   ██╔██╗ ██║╚════██║   ██║   ██╔══██║██║
╚██████╗╚██████╔╝███████╗██╔╝ ██╗██║███████║   ██║   ██║  ██║██║
 ╚═════╝ ╚═════╝ ╚══════╝╚═╝  ╚═╝╚═╝╚══════╝   ╚═╝   ╚═╝  ╚═╝╚═╝
        """

def get_system_info():
    """Collect the Python version, platform, and startup timestamp for the banner."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        vi = sys.version_info
        return {
            "python_version": f"{vi.major}.{vi.minor}.{vi.micro}",
            "platform": sys.platform,
            "startup_time": stamp,
        }
    except Exception:
        # Extremely defensive fallback: never let banner rendering fail on probing.
        return {
            "python_version": "Unknown",
            "platform": "Unknown",
            "startup_time": stamp,
        }

def display_startup_banner(host="localhost", port=8000, mcp_port=None):
    """
    Display the complete startup banner with system information
    
    Args:
        host (str): Server host address
        port (int): FastAPI server port
        mcp_port (int, optional): MCP server port if enabled
    """
    
    # ANSI color codes for terminal output
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    MAGENTA = '\033[95m'
    WHITE = '\033[97m'
    BOLD = '\033[1m'
    RESET = '\033[0m'
    
    # Get banner and system info
    banner = get_ascii_banner()
    sys_info = get_system_info()
    
    # Build the complete startup display (colored ASCII art + feature/status panels)
    startup_display = f"""
{CYAN}{BOLD}{banner}{RESET}

{MAGENTA}═══════════════════════════════════════════════════════════════════════════════{RESET}
{BOLD}{WHITE}                        🚀 CoexistAI Research Assistant v0.0.2{RESET}
{MAGENTA}═══════════════════════════════════════════════════════════════════════════════{RESET}

{YELLOW}📋 SYSTEM OVERVIEW:{RESET}
{WHITE}   • Modular AI research framework with LLM integration{RESET}
{WHITE}   • Multi-source data exploration: Web, Reddit, YouTube, GitHub, Maps{RESET}
{WHITE}   • Async & parallel processing for optimal performance{RESET}
{WHITE}   • MCP (Model Context Protocol) compatible{RESET}

{BLUE}🔧 CORE FEATURES:{RESET}
{GREEN}   ✓ Web Explorer      {WHITE}- Query web, summarize results with LLMs{RESET}
{GREEN}   ✓ Reddit Explorer   {WHITE}- Search & analyze Reddit content with BM25 ranking{RESET}
{GREEN}   ✓ YouTube Explorer  {WHITE}- Transcript search, summarization & custom prompts{RESET}
{GREEN}   ✓ Map Explorer      {WHITE}- Location search, routing, POI discovery{RESET}
{GREEN}   ✓ GitHub Explorer   {WHITE}- Codebase analysis for GitHub & local repos{RESET}
{GREEN}   ✓ File Explorer     {WHITE}- Local file analysis with vision support{RESET}

{CYAN}🌐 SERVER STATUS:{RESET}
{WHITE}   • FastAPI Server:   {GREEN}http://{host}:{port}{RESET}
{WHITE}   • API Documentation: {GREEN}http://{host}:{port}/docs{RESET}
{WHITE}   • Health Check:     {GREEN}http://{host}:{port}/health{RESET}"""

    # Append the MCP endpoint line only when an MCP port is configured
    if mcp_port:
        startup_display += f"""
{WHITE}   • MCP Server:       {GREEN}mcp://{host}:{mcp_port}{RESET}"""

    # Close with system info and footer rule
    startup_display += f"""

{YELLOW}⚙️  SYSTEM INFO:{RESET}
{WHITE}   • Python Version:   {GREEN}{sys_info['python_version']}{RESET}
{WHITE}   • Platform:         {GREEN}{sys_info['platform']}{RESET}
{WHITE}   • Started:          {GREEN}{sys_info['startup_time']}{RESET}
{WHITE}   • Contributor:      {GREEN}Sidhant Pravinkumar Thole{RESET}

{MAGENTA}═══════════════════════════════════════════════════════════════════════════════{RESET}
{BOLD}{CYAN}                    Ready to accelerate your research! 🎯{RESET}
{MAGENTA}═══════════════════════════════════════════════════════════════════════════════{RESET}
"""
    
    print(startup_display)

def display_shutdown_banner():
    """Render the colored shutdown notice banner to stdout."""
    cyan = '\033[96m'
    yellow = '\033[93m'
    reset = '\033[0m'
    bold = '\033[1m'

    banner = f"""
{cyan}═══════════════════════════════════════════════════════════════════════════════{reset}
{bold}{yellow}                    🛑 CoexistAI Server Shutting Down...{reset}
{cyan}═══════════════════════════════════════════════════════════════════════════════{reset}
"""
    print(banner)

if __name__ == "__main__":
    # Manual smoke test: render the banner with default host/port.
    display_startup_banner()


================================================
FILE: utils/tts_utils.py
================================================

import logging
import random
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

import numpy as np
import soundfile as sf
import asyncio
import re
import os
from uuid import uuid4
from kokoro_onnx import Kokoro

def random_pause(min_duration=0.5, max_duration=2.0, sample_rate=None):
    """
    Generate a random pause (silence) audio segment.

    Args:
        min_duration (float): Minimum pause length in seconds.
        max_duration (float): Maximum pause length in seconds.
        sample_rate (int): Samples per second; required to size the buffer.

    Returns:
        np.ndarray: Zero-filled float array of ``duration * sample_rate`` samples.

    Raises:
        ValueError: If ``sample_rate`` is not provided.
    """
    if sample_rate is None:
        # Fail with a clear message instead of the opaque TypeError from int * None.
        raise ValueError("sample_rate must be provided to generate a pause segment")
    silence_duration = random.uniform(min_duration, max_duration)
    return np.zeros(int(silence_duration * sample_rate))

async def parse_podcast(text: str, voice_choices:list) -> list[dict]:
    """
    Parse a ... transcript into list of {voice, text} dicts.
    Works even if speakers are in the same line.

    NOTE(review): the delimiter patterns below (``r"(.*?)"``) look garbled — a
    lazy ``(.*?)`` with no surrounding anchors matches the empty string at
    position 0, so ``content`` is always empty and this function returns [].
    The originals presumably wrapped the transcript in markup tags that were
    lost; restore the real delimiters before relying on this parser.
    """

    # Identify unique speakers in the transcript
    match = re.search(r"(.*?)", text, re.DOTALL)
    if not match:
        return []
    content = match.group(1).strip()
    # Speakers are written as [Name] immediately before their dialogue.
    speakers = set(re.findall(r"\[(.*?)\]", content))

    # Map speakers to random voices from the given list
    def assign_voices(speaker_list, voice_choices):
        assigned = {}
        choices = voice_choices.copy()
        random.shuffle(choices)
        # Round-robin over the shuffled voices so more speakers than voices still works.
        for i, speaker in enumerate(speaker_list):
            assigned[speaker] = choices[i % len(choices)]
        return assigned

    # Example: voice_choices = ["voice1", "voice2", "voice3"]
    voice_map = assign_voices(speakers, voice_choices)

    # Extract content inside  ... 
    match = re.search(r"(.*?)", text, re.DOTALL)
    if not match:
        return []

    content = match.group(1).strip()

    # Regex: find [Speaker] text until next [ or end
    segments = re.findall(r"\[(.*?)\]\s*([^[]+)", content)

    result = []
    for speaker, speech in segments:
        speaker = speaker.strip()
        speech = speech.strip()

        # Unknown speakers fall back to a synthesized default voice name.
        voice = voice_map.get(speaker, f"default_{speaker.lower()}")
        result.append({"voice": voice, "text": speech})
    logger.info(f"Parsed podcast segments: {result}")
    return result

async def podcasting(sentences, filename):
    """
    Generate a podcast audio file from the given sentences.

    Args:
        sentences (list[dict]): Parsed segments with 'voice' and 'text' keys.
        filename (str): Output path for the generated WAV file.
    """
    try:
        if not sentences:
            # Guard: np.concatenate([]) raises and sample_rate would be unbound.
            logger.error("No podcast segments to synthesize; skipping audio creation.")
            return
        kokoro = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
        logger.info("Initialized Kokoro TTS engine.")
        audio = []
        for sentence in sentences:
            voice = sentence["voice"]
            text = sentence["text"]
            logger.info(f"Creating audio with {voice}: {text}")
            samples, sample_rate = kokoro.create(
                text,
                voice=voice,
                speed=1.0,
                lang="en-us",
            )
            audio.append(samples)
            # Random silence between segments makes the pacing sound natural
            audio.append(random_pause(sample_rate=sample_rate))

        # Concatenate all audio parts
        audio = np.concatenate(audio)
        
        # Bug fix: write to the requested path — the original wrote the literal
        # string "(unknown)" and ignored the `filename` parameter entirely.
        sf.write(filename, audio, sample_rate)
        await asyncio.sleep(5)
        logger.info(f"Created {filename}")
    except Exception as e:
        logger.error(f"Error occurred while creating podcast: {e}")

async def text_to_speech(text, voice, filename, lang):
    """
    Synthesize ``text`` with the given Kokoro voice and save it to ``filename``.

    Returns:
        str | None: The filename on success, or None if saving failed.
    """
    engine = Kokoro("kokoro-v1.0.onnx", "voices-v1.0.bin")
    samples, sample_rate = engine.create(text, voice=voice, speed=1.0, lang=lang)
    try:
        sf.write(filename, samples, sample_rate)
    except Exception as e:
        logger.error(f"Error occurred while saving audio file: {e}")
        return None
    return filename
        
async def podcasting_from_text(text,theme,llm):
    """
    Generate a podcast script with the LLM, synthesize it, and return the WAV path.

    Args:
        text (str): Source material for the episode.
        theme (str): Optional theme to steer the script.
        llm: LangChain-style chat model exposing ``ainvoke``.

    Returns:
        str: Relative path of the generated podcast WAV file.
    """
    system_prompt = f"""You are an experienced podcaster who can create engaging episodes on any topic.
    Your style makes complex concepts simple, clear, and enjoyable to listen to.
    
    When writing scripts:
    
    Use natural, conversational language.
    
    Avoid special characters (like *, #, etc.) and TTS markup (such as  tags).
    
    Do not include background descriptions or stage directions.
    
    Always stay on the provided theme (if one is given). If no theme is provided, use the given text to generate engaging, informative content.
    
    The podcast script should be formatted as follows:
    
    
    [Person1] What Person1 says [Person2] What Person2 says ...
    
    
    
    Where each [Person] represents a speaker, followed by their dialogue.
    
    Theme: {theme}
    Text: {text}
    """
    result = await llm.ainvoke(
            system_prompt
            )
    voice_choices = ["af_heart","am_michael","am_adam","am_eric","am_echo","am_puck",
                     "am_fenrir","am_santa","am_liam","af_river"
                     ]
    podcast_segments = await parse_podcast(result.content, voice_choices)

    # exist_ok avoids the check-then-create race of the old os.path.exists() guard.
    os.makedirs("output/podcasts", exist_ok=True)
    
    file_path = f"output/podcasts/podcast_{str(uuid4())[:8]}.wav"
    _ = await podcasting(podcast_segments, filename=file_path)
    print(f"Current working directory: {os.getcwd()}")
    print(f"Podcast file created at: {file_path}")
    return file_path

================================================
FILE: utils/utils.py
================================================
"""
utils_langchain.py
Author: Sidhant Thole
Created: 25 May 2025
Description: Utility functions for LangChain-based applications, including model loading, session state, markdown/table extraction, email sending, and more.
"""

# Standard library imports
import os
import re
import time
import datetime
import smtplib
import logging
import subprocess
import shutil
from pathlib import Path

# Third-party imports
import requests
import streamlit as st
from rich import print
from markdownify import markdownify as md

# Langchain and related imports
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.embeddings import InfinityEmbeddings
from langchain.docstore.document import Document

# Project imports
from utils.config import *

# Configure a module-level logger with the project's standard log format.
# Handlers are only attached when none exist yet, so re-importing this module
# does not duplicate log output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.hasHandlers():
    handler = logging.StreamHandler()
    formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Guarantee GOOGLE_API_KEY exists in the environment (defaulting to ""), so
# later reads of the variable never raise a KeyError.
os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY", "")

def set_logging(enabled: bool):
    """
    Enable or disable logger output at runtime, including in Jupyter notebooks and all modules.

    Every logger already registered with the logging module (plus the root
    logger) gets its level reset and its handlers removed; when enabling, a
    fresh stream handler with the project's standard format is attached.

    Args:
        enabled (bool): If True, set loggers to INFO and attach stream
            handlers. If False, silence all loggers (CRITICAL) and strip
            their handlers.
    """
    level = logging.INFO if enabled else logging.CRITICAL

    def _standard_handler():
        # Single place to build the project's standard stream handler.
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')
        )
        return stream_handler

    # Reset every named logger. NOTE: the loop variable must not shadow this
    # module's own `logger` (the previous implementation did exactly that).
    for _name, existing_logger in logging.root.manager.loggerDict.items():
        if isinstance(existing_logger, logging.Logger):
            existing_logger.setLevel(level)
            for h in existing_logger.handlers[:]:
                existing_logger.removeHandler(h)
            if enabled:
                existing_logger.addHandler(_standard_handler())

    # Reset the root logger the same way.
    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    for h in root_logger.handlers[:]:
        root_logger.removeHandler(h)
    if enabled:
        root_logger.addHandler(_standard_handler())

def is_searxng_running():
    """Check whether SearxNG is reachable.

    Probe order:
      1. Read host/port from model_config, falling back to environment
         variables and then defaults (localhost:8085).
      2. HTTP GET likely paths ('/', '/search') with three short retries each.
      3. Raw TCP connect as a last resort.
      4. host.docker.internal / loopback are appended as fallback hosts when a
         containerized service name is configured.

    Returns:
        bool: True as soon as any probe succeeds, False otherwise.
    """
    import socket

    # Prefer values from model_config; leave unset on any import failure.
    host = None
    port = None
    try:
        import model_config
        host = getattr(model_config, 'HOST_SEARXNG', None)
        port = getattr(model_config, 'PORT_NUM_SEARXNG', None)
    except Exception:
        pass

    # Environment / hard-coded fallbacks.
    if not host:
        host = os.environ.get('HOST_SEARXNG', 'localhost')
    if not port:
        try:
            port = int(os.environ.get('PORT_NUM_SEARXNG', 8085))
        except Exception:
            port = 8085

    candidate_hosts = [host]
    # A container service name may not resolve from here; add host-side fallbacks.
    if host not in ('host.docker.internal', '127.0.0.1', 'localhost'):
        candidate_hosts += ['host.docker.internal', '127.0.0.1', 'localhost']

    # HTTP probes: each host/path pair gets three quick attempts.
    for candidate in candidate_hosts:
        for path in ('/', '/search'):
            url = f"http://{candidate}:{port}{path}"
            for _ in range(3):
                try:
                    resp = requests.get(url, timeout=2)
                    if resp.status_code < 500:
                        logger.info(f"SearxNG reachable at {url} (status={resp.status_code})")
                        return True
                    logger.debug(f"SearxNG returned {resp.status_code} at {url}")
                except Exception as e:
                    logger.debug(f"SearxNG probe failed for {url}: {e}")
                time.sleep(0.5)

    # Last resort: a plain TCP connect per candidate host.
    for candidate in candidate_hosts:
        try:
            with socket.create_connection((candidate, int(port)), timeout=2):
                logger.info(f"SearxNG TCP connect succeeded to {candidate}:{port}")
                return True
        except Exception as e:
            logger.debug(f"SearxNG TCP connect failed to {candidate}:{port}: {e}")

    logger.warning(f"SearxNG not reachable at any tried addresses: hosts={candidate_hosts} port={port}")
    return False
   
def fix_json(json_str):
    """
    Best-effort repair of a malformed JSON string.

    Three regex passes are applied in order: drop trailing commas before a
    closing brace/bracket, replace single quotes with double quotes, and
    collapse every whitespace run (including newlines) to a single space.

    Args:
        json_str (str): A potentially malformed JSON string.

    Returns:
        str: The repaired JSON string.
    """
    logger.debug("Fixing JSON string.")
    repairs = (
        (r',(\s*[}\]])', r'\1'),  # trailing comma before } or ]
        (r"'", '"'),              # single quotes -> double quotes
        (r'\s+', ' '),            # collapse newlines / excess whitespace
    )
    for pattern, replacement in repairs:
        json_str = re.sub(pattern, replacement, json_str)
    return json_str

def load_model(model_name,
               _embed_mode='infinity_emb',
               cross_encoder_name="BAAI/bge-reranker-base",
               kwargs=None):
    """
    Loads the appropriate embeddings and cross-encoder model based on the embedding mode.

    For 'infinity_emb' mode this will probe the local Infinity API server and,
    if it is not healthy, attempt to launch the `infinity_emb` binary and poll
    it until ready (up to ~2 minutes per attempt).

    Args:
        model_name (str): The name of the Hugging Face or Google embedding model.
        _embed_mode (str, optional): The embedding mode ('infinity_emb', 'huggingface', 'google').
            Defaults to 'infinity_emb'.
        cross_encoder_name (str, optional): The name of the cross-encoder model to use.
            Defaults to "BAAI/bge-reranker-base".
        kwargs (dict, optional): Extra keyword arguments forwarded to the embedding
            constructor. Defaults to None (treated as an empty dict).

    Returns:
        tuple: (hf_embeddings, cross_encoder) — the selected embedding model and cross-encoder.

    Raises:
        ValueError: If `_embed_mode` is not a supported mode.
        RuntimeError: If the embedding backend or cross-encoder cannot be loaded/started.
    """
    # BUG FIX: previously `kwargs={}` used a shared mutable default argument.
    if kwargs is None:
        kwargs = {}
    logger.info(f"Loading model: {model_name} with embedding mode: {_embed_mode}")
    supported_modes = ['infinity_emb', 'huggingface', 'google']
    if _embed_mode not in supported_modes:
        logger.error(f"Unsupported embedding mode: {_embed_mode}")
        raise ValueError(f"Unsupported embedding mode: {_embed_mode}. Supported modes: {supported_modes}")

    hf_embeddings = None
    if _embed_mode == 'infinity_emb':
        infinity_api_url = "http://0.0.0.0:7997"

        def _start_infinity_with_poll(model_name, infinity_api_url, max_wait=120):
            """
            Try to start the infinity_emb binary and poll the health endpoint until ready.
            Writes stdout/stderr to a logfile and returns the logfile path on success.
            Raises RuntimeError on failure with path to logfile for debugging.
            """
            # Prepare logfile under <repo>/artifacts/.
            artifacts_dir = Path(os.path.join(os.path.dirname(__file__), "..", "artifacts")).resolve()
            artifacts_dir.mkdir(parents=True, exist_ok=True)
            logfile = artifacts_dir / "infinity_emb.log"

            # Locate binary. Priority: INFINITY_EMB_PATH env override,
            # repo-relative venv, common install path, then PATH.
            candidates = []
            repo_candidate = os.path.join(os.path.dirname(__file__), "..", "infinity_env", "bin", "infinity_emb")
            candidates.append(repo_candidate)
            candidates.append('/opt/infinity_env/bin/infinity_emb')
            env_override = os.environ.get('INFINITY_EMB_PATH')
            if env_override:
                candidates.insert(0, env_override)

            # finally, check PATH
            path_candidate = shutil.which("infinity_emb")
            if path_candidate:
                candidates.append(path_candidate)

            candidate = None
            for c in candidates:
                try:
                    if c and os.path.exists(c):
                        candidate = c
                        break
                except Exception:
                    continue

            if not candidate:
                raise RuntimeError(f"infinity_emb binary not found. Checked: {candidates}. Please install infinity_emb or set INFINITY_EMB_PATH to the binary location.")

            # Start attempts with a simple fallback sequence.
            # BUG FIX: the previous list contained the identical default command
            # twice, wasting a full `max_wait` cycle; try the default once, then
            # fall back to forcing CPU.
            cmds = [
                [candidate, "v2", "--model-id", model_name],
                [candidate, "v2", "--model-id", model_name, "--device", "cpu"],
            ]

            # Open logfile in append mode so repeated runs keep history
            with open(logfile, "ab") as lf:
                for attempt, cmd in enumerate(cmds, start=1):
                    logger.info(f"Starting infinity_emb (attempt {attempt}) with command: {cmd}")
                    try:
                        proc = subprocess.Popen(
                            cmd,
                            stdout=lf,
                            stderr=lf,
                        )
                    except Exception as e:
                        logger.warning(f"Failed to spawn infinity_emb process: {e}")
                        continue

                    # Poll health endpoint until ready, process death, or timeout
                    start_t = time.time()
                    while True:
                        try:
                            response = requests.get(f"{infinity_api_url}/health", timeout=3)
                            if response.status_code == 200:
                                logger.info("Infinity API is healthy and reachable.")
                                return str(logfile)
                        except Exception:
                            # ignore connection errors while starting up
                            pass

                        # If process died, break and try next attempt
                        if proc.poll() is not None:
                            logger.warning(f"infinity_emb process exited prematurely (returncode={proc.returncode}). See logfile: {logfile}")
                            break

                        if time.time() - start_t > max_wait:
                            logger.warning(f"Timeout waiting for Infinity API after {max_wait}s. Checking next attempt or failing. See logfile: {logfile}")
                            break

                        time.sleep(2)

                # All attempts failed
                raise RuntimeError(f"Infinity API failed to start or did not become healthy within allotted time. See logfile: {logfile}")

        try:
            # Quick health check in case a server is already running.
            response = requests.get(f"{infinity_api_url}/health", timeout=5)
            if response.status_code != 200:
                raise Exception("Infinity API health check failed")
        except Exception:
            logger.info("Infinity API not running or not healthy. Attempting to start it (this can take up to 2 minutes)...")
            try:
                logfile_path = _start_infinity_with_poll(model_name, infinity_api_url, max_wait=120)
                logger.info(f"Started Infinity API; logs: {logfile_path}")
            except Exception as e:
                logger.error(f"Failed to start Infinity API: {e}")
                raise RuntimeError(f"Failed to start Infinity API: {e}. Check the log at {str(Path(os.path.join(os.path.dirname(__file__), '..', 'artifacts', 'infinity_emb.log')).resolve())}")
        try:
            hf_embeddings = InfinityEmbeddings(
                model=model_name, infinity_api_url=infinity_api_url
            )
        except Exception as e:
            logger.error(f"Failed to load InfinityEmbeddings: {e}")
            raise RuntimeError(f"Failed to load InfinityEmbeddings: {e}, please first start the server using infinity_emb v2 --model-id (https://github.com/michaelfeil/infinity)")
    elif _embed_mode == 'huggingface':
        try:
            # Some community models ship custom code on the hub and need this flag.
            extra_kwargs = {'trust_remote_code': True}
            hf_embeddings = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={**extra_kwargs, **kwargs} if kwargs else extra_kwargs
            )
        except Exception as e:
            logger.error(f"Failed to load HuggingFaceEmbeddings: {e}")
            raise RuntimeError(f"Failed to load HuggingFaceEmbeddings: {e}")
    elif _embed_mode == 'google':
        try:
            hf_embeddings = GoogleGenerativeAIEmbeddings(model=model_name,
                                                         task_type="retrieval_query",
                                                         **kwargs)
        except Exception as e:
            logger.error(f"Failed to load GoogleGenerativeAIEmbeddings: {e}")
            raise RuntimeError(f"Failed to load GoogleGenerativeAIEmbeddings: {e}")

    try:
        cross_encoder = HuggingFaceCrossEncoder(model_name=cross_encoder_name)
    except Exception as e:
        logger.error(f"Failed to load HuggingFaceCrossEncoder: {e}")
        raise RuntimeError(f"Failed to load HuggingFaceCrossEncoder: {e}")

    if hf_embeddings is None:
        logger.error("Failed to initialize embeddings.")
    return hf_embeddings, cross_encoder

def stream_text_1(placeholder, output):
    """
    Stream text into a placeholder line-by-line at a fast pace.

    Like `stream_text`, but accumulates the fragments and re-renders the
    growing buffer after each one with a very short delay.

    Args:
        placeholder (Streamlit object): A placeholder for rendering the streamed content.
        output (iterable): An iterable of text fragments to stream.
    """
    rendered = ""
    for fragment in output:
        rendered = rendered + fragment
        placeholder.markdown(rendered, unsafe_allow_html=True)
        time.sleep(0.01)  # brief pause to simulate streaming

def stream_answer(text):
    """
    Yield the words of `text` one at a time, each followed by a single
    space, pausing briefly between words for a dynamic output experience.

    Args:
        text (str): The text content to stream.

    Yields:
        str: The next word with a trailing space.
    """
    words = text.split(" ")
    for w in words:
        yield f"{w} "
        time.sleep(0.02)

def get_local_data():
    """
    Retrieves the current date and the name of the current weekday.

    Returns:
        tuple: (date, day) — the current date as an ISO string ("YYYY-MM-DD")
        and the correctly-spelled English weekday name (e.g. "Monday").
    """
    today = datetime.datetime.today()
    date = str(today.date())
    # BUG FIX: the old lookup table began with 'Sunday' while
    # datetime.weekday() numbers Monday as 0, so every day was mislabelled
    # (and 'Thursday' was misspelled 'Thrusday'). strftime('%A') yields the
    # correct name directly.
    day = today.strftime('%A')
    return date, day

def get_generative_model(model_name='gemini-1.5-flash',
                        type='google',
                        base_url='http://localhost:11434/v1',
                        _tools=None,
                        kwargs=None):
    """
    Initializes and returns a generative language model based on the specified type.

    Args:
        model_name (str): The name of the model to use (default is 'gemini-1.5-flash').
        type (str): The type of model to initialize ('google','local','groq','openai','others').
                    For others add openai compatible api base url in the model_config.py file.
        base_url (str): The base URL for the local/other OpenAI-compatible server.
        _tools (list, optional): A list of tools to bind to the model.
        kwargs (dict, optional): Additional keyword arguments for model initialization.

    Returns:
        llm: An instance of the generative language model (tool-bound when
        `_tools` is provided).

    Raises:
        ValueError: If `type` is not one of the supported values.
    """
    if kwargs is None:
        kwargs = {}
    if type == 'google':
        # Ask Gemini to respond with JSON.
        extra_kwargs = {'generation_config': {"response_mime_type": "application/json"}}
        kwargs = {**kwargs, **extra_kwargs}
        llm = ChatGoogleGenerativeAI(
            model=model_name,
            **kwargs,
        )
    elif type in ('local', 'others'):
        # Both modes target an OpenAI-compatible endpoint at `base_url`.
        llm = ChatOpenAI(
            base_url=base_url,
            model=model_name,
            **kwargs
        )
    elif type == 'groq':
        from langchain_groq import ChatGroq
        llm = ChatGroq(
            model=model_name,
            **kwargs
        )
    elif type == 'openai':
        llm = ChatOpenAI(
            model=model_name,
            **kwargs
        )
    else:
        # BUG FIX: an unknown type previously fell through to an
        # UnboundLocalError on `llm`; fail fast with a clear message.
        raise ValueError(f"Unsupported model type: {type!r}. "
                         "Expected one of 'google', 'local', 'groq', 'openai', 'others'.")
    if _tools:
        # BUG FIX: bind_tools returns a new runnable; the previous code
        # discarded the result, so the tools were never actually bound.
        llm = llm.bind_tools(_tools)
    return llm

def log_results(query, context, date, day):
    """
    Append a search query and its generated context to `search_context.txt`.
    TODO: Sanitize sensitive data before logging and add log rotation.

    Args:
        query (str): The search query.
        context (str): The search context or result.
        date (str): The current date (accepted but not written).
        day (str): The current day of the week (accepted but not written).
    """
    logger.info(f"Logging results for query: {query}")
    entry = "".join(["==" * 80, "\n", "QUERY:", query, "\n", "Context:", context])
    with open("search_context.txt", "a") as log_file:
        log_file.write(entry)

def ordered_set_by_key(data):
    """
    De-duplicate documents by their `page_content`, keeping the first
    occurrence of each and preserving the original order.

    Args:
        data (list): A list of documents (objects exposing `page_content`).

    Returns:
        list: The input documents with later duplicates removed.
    """
    seen_contents = set()
    deduped = []
    for doc in data:
        content = doc.page_content
        if content in seen_contents:
            continue  # duplicate — keep only the first occurrence
        seen_contents.add(content)
        deduped.append(doc)
    return deduped

def remove_consecutive_newlines(text):
    """
    Delete every run of three or more consecutive newline characters.

    Note that qualifying runs are removed outright (replaced with nothing),
    while runs of one or two newlines are left untouched.

    Args:
        text (str): The input text.

    Returns:
        str: The text with all 3+ newline runs stripped.
    """
    # Three or more '\n' in a row collapse to the empty string.
    return re.sub(r"\n{3,}", "", text)

def remove_main_url(url):
    """
    Truncate a URL immediately after its first ".com", dropping any path.

    Args:
        url (str): The input URL.

    Returns:
        str: Everything up to and including the first ".com". A URL without
        ".com" gets ".com" appended, so callers should only pass .com URLs.
    """
    base, _sep, _rest = url.partition(".com")
    return base + ".com"

def extract_markdown_tables(filename, md_text):
    """
    Extracts markdown tables from a given text and wraps each in a Document.
    TODO: Handle edge cases for malformed tables and add more robust parsing.

    Args:
        filename (str): Source identifier stored in each table's metadata
            (under both 'url' and 'source').
        md_text (str): The markdown text to scan.

    Returns:
        list: A list of Document objects, one per markdown table found.
    """
    # BUG FIX: the previous log line interpolated nothing and literally
    # logged "(unknown)" — include the actual filename.
    logger.debug(f"Extracting markdown tables from {filename}")
    # Header row, then the |---|:--:| separator line, then zero or more rows.
    table_regex = re.compile(
        r'(\|(?:[^\n]+\|)+\n'  # Match table header
        r'\|(?:\s*[-:]+\s*\|)+\n'  # Match separator line
        r'(?:\|(?:[^\n]+\|)+\n)*)',  # Match table rows
        re.MULTILINE
    )

    # Wrap each matched table; tag it with its origin for downstream retrieval.
    tables = [Document(raw_table) for raw_table in table_regex.findall(md_text)]
    logger.debug(f"Found {len(tables)} markdown tables in {filename}")
    for table in tables:
        table.metadata['url'] = filename
        table.metadata['source'] = filename
    return tables

def extract_urls(text):
    """
    Extract bare domain-like tokens (e.g. "example.com") from the given text.

    Args:
        text (str): The input text to search for URLs.

    Returns:
        list: All matched domain strings, in order of appearance.
    """
    # One or more dot-separated labels followed by a TLD of 2+ letters.
    domain_pattern = r'\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b'
    return re.findall(domain_pattern, text)

def extract_subqueries(text):
    """
    Extract subqueries from a bracketed, comma-separated list in the text.

    Example: 'queries: ["a", "b"]' -> ['a', 'b']

    Args:
        text (str): The input text to extract subqueries from.

    Returns:
        list: The extracted subqueries, or [] when no bracketed list exists.
    """
    # Grab everything inside the first [...] pair.
    match = re.search(r'\[([^\]]+)\]', text)
    if not match:
        return []
    # Split on commas, then strip surrounding whitespace and double quotes.
    return [piece.strip().strip('"') for piece in match.group(1).split(",")]

def extract_urls_from_query(text):
    """
    Extract all http(s):// and www. URLs from the given text.

    Args:
        text (str): The input text containing URLs.

    Returns:
        list: The URLs found, in order of appearance.
    """
    # http://..., https://..., or www.... — each runs to the next whitespace.
    return re.findall(r'(https?://[^\s]+|www\.[^\s]+)', text)


================================================
FILE: utils/websearch_utils.py
================================================
import asyncio
import aiohttp
import concurrent.futures
import hashlib
import logging
import os
import re
import requests
import time
import random
import json
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from functools import partial
from langchain.docstore.document import Document
from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.utilities import SearxSearchWrapper
from langchain_chroma import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter, TokenTextSplitter, RecursiveCharacterTextSplitter
from markdownify import markdownify as md
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_search import YoutubeSearch
from markitdown import MarkItDown
from pathlib import Path
from rank_bm25 import BM25Okapi
from utils.retriever_utils import create_vectorstore_async, cleanup_old_collections_async
import hashlib
import time
import logging
from utils.profiler_utils import WebSearchProfiler, get_profiler, set_profiler



import chromadb
from chromadb.config import Settings

from utils.utils import *
from utils.answer_generation import *
from utils.reddit_utils import *
from utils.process_content import process_content

# Reset chromadb's shared client cache at import time so this module starts
# with fresh client state.
chromadb.api.client.SharedSystemClient.clear_system_cache()

# Set up a module-level logger with the project's standard format; the handler
# is only attached when none exist yet, so re-imports don't duplicate output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s')
handler.setFormatter(formatter)
if not logger.hasHandlers():
    logger.addHandler(handler)

# Global blacklist for unreachable domains (module-level mutable state,
# shared by the functions in this module).
UNREACHABLE_DOMAINS_BLACKLIST = set()

# No in-memory caching for retrievers/rerankers; always create fresh instances per request


class SearchWeb:
    """
    A class for performing web searches using Searx and scraping content from web pages.

    Attributes:
        base_url (str): Fully-qualified base URL of the SearxNG instance.
        searcher (SearxSearchWrapper): An instance of SearxSearchWrapper for querying search engines.
    """

    def __init__(self, port, host="localhost", type='http'):
        """
        Initializes the SearchWeb class with the given Searx server port.

        Args:
            port (int): The port number for Searx search service.
            host (str): The host address for Searx search service. Defaults to "localhost".
            type (str): The protocol type ('http' or 'https'). Defaults to 'http'.
        """
        # Build explicit base URL and pass to SearxSearchWrapper. Add debug logging
        # so we can trace which host/port the application is actually using.
        base_url = f"{type}://{host}:{port}"
        self.searcher = SearxSearchWrapper(searx_host=base_url)
        self.base_url = base_url
        try:
            logger.info(f"SearchWeb initialized with searx base_url={self.base_url}")
            # Some wrappers may expose their configured host attribute; ensure it's set for visibility
            if not hasattr(self.searcher, 'searx_host'):
                try:
                    setattr(self.searcher, 'searx_host', self.base_url)
                except Exception:
                    pass
        except Exception:
            # logging should not crash initialization
            pass

    def query_search(self, query, engines=['google','brave','duckduckgo','startpage','yahoo'], num_results=3):
        """
        Performs a search using the Searx engine and retrieves search results.

        Args:
            query (str): The search query.
            engines (list, optional): The search engines to use.
                Defaults to ['google','brave','duckduckgo','startpage','yahoo'].
            num_results (int, optional): The number of search results to retrieve. Defaults to 3.

        Returns:
            list: The search results from Searx, or [] on any error.
        """
        try:
            search_results = self.searcher.results(
                query,
                num_results=num_results,
                engines=engines
            )
            logger.info(f"Search results for query '{query}': {search_results}")
            return search_results
        except Exception as e:
            # Best-effort: a failed search degrades to an empty result list.
            logger.error(f"Error during search for query '{query}': {e}")
            return []

    def scrape_text(self, url):
        """
        Scrapes the plain text content from the specified URL, removing HTML tags and unwanted elements.

        Args:
            url (str): The URL to scrape text from.

        Returns:
            str: The cleaned text content from the URL, or None if the request fails.
        """
        try:
            # Handle GitHub raw URLs.
            # NOTE(review): naive substitution — rewrites every 'github'
            # occurrence in the URL; presumably intended only for plain
            # github.com content URLs. Verify behavior for /blob/ URLs.
            if 'github.com' in url:
                url = url.replace('github', 'raw.githubusercontent')
            
            headers = {"User-Agent": "Mozilla/5.0"}
            response = requests.get(url, timeout=15, headers=headers)
            response.raise_for_status()

            page_content = response.content

            soup = BeautifulSoup(page_content, 'html.parser')

            # Drop script/style nodes entirely before extracting text.
            for script in soup(["script", "style"]):
                script.decompose()

            # Normalize: strip each line, split on double spaces, drop empties,
            # then remove any residual tag-like fragments.
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            text = re.sub(r'<.*?>', '', text)

            logger.info(f"Scraped text from URL: {url}")
            return text
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error while scraping {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"General error while scraping {url}: {e}")
            return None

    def scrape_top_results(self, urls):
        """
        Scrapes the text content from a list of URLs.

        Args:
            urls (list): A list of URLs to scrape content from.

        Returns:
            list: One entry per input URL — the scraped text, or None when
            that URL could not be scraped.
        """
        out = []
        for url in urls:
            try:
                text = self.scrape_text(url)
                out.append(text)
            except Exception as e:
                # Keep list positions aligned with the input URLs.
                logger.error(f"Error scraping URL {url}: {e}")
                out.append(None)
        return out


async def process_url(
    url,
    query,
    text_splitter,
    used_urls,
    hf_embeddings,
    cross_encoder,
    rerank,
    top_k,
    subquery,
    search_snippets_orig,
    model,
    local_mode=False,
    split=True,
    docs=None,
):
    """
    Processes a single URL by retrieving documents, splitting text, and ranking the content.
    Uses async/process pool-based document retrieval and robust logging.

    Args:
        url (str): The URL to process.
        query (str): The main query string.
        text_splitter (TokenTextSplitter): Text splitter for splitting documents into chunks.
        used_urls (list): List of URLs already processed.
        hf_embeddings: Hugging Face embeddings for document embedding.
        cross_encoder: Cross-encoder for reranking.
        rerank (bool): Whether to rerank results.
        top_k (int): Number of top results to retrieve.
        subquery (str): The query or subquery to search.
        search_snippets_orig (dict): Dictionary of original search snippets.
        model: Language model for generating answers.
        local_mode (bool): Whether to process locally stored content (e.g., PDFs).
        split (bool): Whether to split the text into chunks.

    Returns:
        tuple: Processed context, retrieved documents, document list, and URL.
    """
    # Create a mini-profiler for this URL processing
    url_start_time = time.time()
    logger.info(f"Starting process_url for: {url}")
    
    # Start URL tracking in profiler
    profiler = get_profiler()
    if profiler:
        profiler.start_url_processing(url)
    
    # If docs are not provided, fetch them (backwards-compatible).
    if docs is None:
        try:
            doc_retrieval_start = time.time()
            # urls_to_docs now returns a dict mapping url->docs
            docs_map = await urls_to_docs([url], local_mode=local_mode, split=split)
            docs = docs_map.get(url, [])
            doc_retrieval_time = time.time() - doc_retrieval_start
            logger.info(f"Processed {len(docs)} docs for URL: {url} in {doc_retrieval_time:.3f}s")
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            # Track failed URL processing
            profiler = get_profiler()
            if profiler:
                profiler.end_url_processing(
                    url=url, 
                    docs_count=0, 
                    context_length=0, 
                    status="failed", 
                    error=str(e)
                )
            return None, None, None, url

    for i, d in enumerate(docs):
        if not local_mode:
            docs[i].page_content = remove_urls(d.page_content)
        docs[i].metadata['url'] = url
        docs[i].metadata['query'] = query

    # Special handling for Reddit and YouTube URLs
    if 'reddit.com' in url:
        logger.info(f"🔍 DEBUG: Reddit URL detected in process_url: {url}")
        try:
            logger.info(f"Processing Reddit URL with wait: {url}")
            await asyncio.sleep(random.randint(1, 3))  # Async sleep to avoid rate limiting
            response = reddit_reader_response(
                subreddit=None, url_type='url', n=5, k=5,
                custom_url=url, time_filter=None,
                search_query=None, sort_type=None,
                model=model
            )
            docs.append(Document(response, metadata={'source': url, 'url': url}))
            logger.info(f"Appended Reddit response for {url}")
        except Exception as e:
            logger.error(f"Error processing Reddit URL {url}: {e}")
    if 'youtube' in url:
        logger.info("YouTube URL detected")
        try:
            logger.info(f"Processing YouTube URL: {url}")
            response = youtube_transcript_response(url, f"Summarise for {subquery}", model)
            docs.append(Document(response, metadata={'source': url, 'url': url}))
            logger.info(f"Appended YouTube transcript for {url}")
        except Exception as e:
            logger.error(f"Error processing YouTube URL {url}: {e}")

    if split:
        try:
            # docs = text_splitter.split_documents(docs)
            # logger.info(f"Split docs for {url} into {len(docs)} chunks")
            logger.info(f"Average words per chunk: {sum(len(d.page_content.split()) for d in docs) / len(docs):.1f}")
        except Exception as e:
            logger.error(f"Error splitting docs for {url}: {e}")

    # Log start of encoding
    encoding_start = time.time()
    logger.info(f"🔍 Encoding start for URL: {url}")

    try:
        # Create unique collection name with timestamp for better isolation
        timestamp = str(int(time.time() * 1000))  # millisecond precision
        if local_mode:
            collection_name = f"rag-chroma-{hashlib.md5(f'{url}'.encode()).hexdigest()[:8]}"
        else:
            collection_name = f"rag-chroma-{hashlib.md5(f'{url}_{subquery}_{timestamp}'.encode()).hexdigest()[:8]}"

        # Use optimized async vectorstore creation
        ensemble_retriever = await create_vectorstore_async(
            docs=docs,
            collection_name=collection_name,
            hf_embeddings=hf_embeddings,
            top_k=top_k,
            ensemble_weights=[0.4, 0.6],
            local_mode=local_mode
        )
    except Exception as e:
        logger.error(f"Error setting up retrievers for {url}: {e}")
        # Track failed URL processing
        profiler = get_profiler()
        if profiler:
            profiler.end_url_processing(
                url=url, 
                docs_count=len(docs) if 'docs' in locals() else 0, 
                context_length=0, 
                status="failed", 
                error=str(e)
            )
        return None, None, None, url

    encoding_end = time.time()
    encoding_time = encoding_end - encoding_start
    logger.info(f"Encoding complete for URL: {url}, Time taken: {encoding_time:.3f}s")

    if rerank:
        try:
            rerank_start = time.time()
            logger.info(f"Reranking start for URL: {url}")
            compressor = CrossEncoderReranker(model=cross_encoder, top_n=3)
            ensemble_retriever = ContextualCompressionRetriever(
                base_compressor=compressor,
                base_retriever=ensemble_retriever
            )
            rerank_end = time.time()
            logger.info(f"Reranking end for URL: {url}, Time taken: {rerank_end - rerank_start:.2f} seconds")
        except Exception as e:
            logger.error(f"Error during reranking for {url}: {e}")

    # TODO: Consider improving query and filter logic for more flexible retrieval
    try:
        # query_filter = {"so": {"$in": [query]}}
        if not local_mode:
            url_filter = {"source": {"$in": [url]}}
        # combined_filter = {"$and": [url_filter]}
            retrieved_docs = await ensemble_retriever.ainvoke(
                subquery, search_kwargs={"k": 3}, filter=url_filter
            )
        else:
            retrieved_docs = await ensemble_retriever.ainvoke(
                subquery, search_kwargs={"k": 3}
            )
        logger.info(f"Retrieved {len(retrieved_docs)} docs for {url}")
    except Exception as e:
        logger.error(f"Error retrieving docs for {url}: {e}")
        # Track failed URL processing
        profiler = get_profiler()
        if profiler:
            profiler.end_url_processing(
                url=url, 
                docs_count=len(docs) if 'docs' in locals() else 0, 
                context_length=0, 
                status="failed", 
                error=str(e)
            )
        return None, None, None, url

    # Build context string
    try:
        if not local_mode:
            logger.info(f"Building context for subquery: {subquery}")
            context = [
                f"Subquery: {subquery} \nsearch result::title: {d.metadata.get('title', '')} url:{d.metadata['source'].replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
                for i, d in enumerate(retrieved_docs)
            ]
        else:
            context = [
                f"Subquery: {subquery} \nsearch result:: File: {d.metadata.get('source', '').replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
                for i, d in enumerate(retrieved_docs)
            ]
        context = '\n'.join(context).strip()
        logger.info(f"Built context for {url}")
    except Exception as e:
        logger.error(f"Error building context for {url}: {e}")
        context = ''

    # Log total processing time and track content for time saved calculation
    total_processing_time = time.time() - url_start_time
    logger.info(f"Completed process_url for {url} in {total_processing_time:.3f}s (docs: {len(docs)}, context: {len(context)} chars)")
    
    # Track URL content for time saved calculation and end URL tracking
    profiler = get_profiler()
    if profiler:
        if context:
            profiler.add_url_content(url, context)
        profiler.end_url_processing(
            url=url, 
            docs_count=len(docs), 
            context_length=len(context), 
            status="success"
        )
    
    return context, retrieved_docs, docs, url


async def context_to_docs(
    urls_list,
    subqueries,
    search_snippets,
    search_snippets_orig,
    query,
    top_k,
    hf_embeddings,
    rerank=False,
    cross_encoder=None,
    model=None,
    local_mode=False,
    split=True,
    profiler=None
):
    """
    Retrieves and processes documents from a list of URLs, converts them into a retrievable format.
    Uses parallel execution and robust logging. Handles errors gracefully and logs all major steps.

    Args:
        urls_list (list): List of lists of URLs to process (one sublist per subquery).
        subqueries (list): List of subqueries for each URL group.
        search_snippets (list): List of search snippet documents.
        search_snippets_orig (dict): Dictionary of original search snippets.
        query (str): The query string.
        top_k (int): The number of top results to retrieve.
        hf_embeddings: Hugging Face embeddings for vector retrieval.
        rerank (bool, optional): Whether to rerank results. Defaults to False.
        cross_encoder: Cross-encoder model for reranking (used only when rerank=True).
        model: Language model for generating content.
        local_mode (bool, optional): Whether to process locally stored content (e.g., PDFs). Defaults to False.
        split (bool, optional): Whether to split documents into chunks. Defaults to True.
        profiler (optional): Profiler object used to record step timings and metrics. Defaults to None.

    Returns:
        tuple: Combined context string, list of retrieved documents, and list of all processed documents.
    """
    logger.info(f"Starting async context_to_docs for {len(urls_list)} URL groups.")

    if profiler:
        profiler.start_step("url_collection", "Collecting and preparing URLs for processing")

    text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=32)
    contexts = []
    rtr_docs = []
    used_urls = []
    total_docs = []

    # First: flatten all URLs and prefetch docs for them in parallel
    # This allows urls_to_docs to be run once and in parallel, returning a map of url->docs
    all_urls_flat = []
    for urls in urls_list:
        if urls:
            all_urls_flat.extend(urls)
    # Deduplicate while preserving order (set.add returns None, so only the
    # first occurrence of each URL survives the filter)
    seen = set()
    all_urls_flat = [u for u in all_urls_flat if not (u in seen or seen.add(u))]

    docs_map = {}
    if all_urls_flat:
        try:
            logger.info(f"Prefetching docs for {len(all_urls_flat)} URLs")
            docs_map = await urls_to_docs(all_urls_flat, local_mode=local_mode, split=split)
        except Exception as e:
            logger.error(f"Error prefetching docs: {e}")

    if local_mode:
        # For local mode, embed all docs first, then subquery over the assembled docs
        logger.info("Local mode: embedding all documents first")
        if profiler:
            profiler.start_step("local_embedding", "Embedding all local documents into single vectorstore")

        # Compute collection name based on document set hash (similar to process_url)
        # so the same set of local files reuses the same Chroma collection
        sorted_urls = ''.join(sorted(all_urls_flat))
        collection_name = f"rag-chroma-{hashlib.md5(f'{sorted_urls}'.encode()).hexdigest()[:8]}"

        # Process docs (needed for BM25 even if collection exists)
        all_docs = []
        for docs in docs_map.values():
            all_docs.extend(docs)

        if split:
            all_docs = text_splitter.split_documents(all_docs)

        if not all_docs:
            # Nothing to embed: fall back to returning only the raw search snippets
            logger.warning("No documents found in local mode")
            if profiler:
                profiler.end_step("No documents found")
            search_snippets_context = [
                f"search result:: title:{d.metadata.get('title', '')} url:{d.metadata['source'].replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
                for d in search_snippets
            ]
            search_snippets_context = '\n'.join(search_snippets_context)
            return search_snippets_context, [], []

        # Use create_vectorstore_async which handles existing collections
        big_ensemble_retriever = await create_vectorstore_async(
            docs=all_docs,
            collection_name=collection_name,
            hf_embeddings=hf_embeddings,
            top_k=top_k,
            ensemble_weights=[0.4, 0.6],
            local_mode=local_mode,
            persist_directory="./chroma_db_local"
        )

        if profiler:
            profiler.end_step(f"Embedded {len(all_docs)} documents")
            # NOTE: this step also covers the optional reranker setup below.
            # (Fixed: previously this step was started twice, once here and once
            # after the rerank block, without an intervening end_step.)
            profiler.start_step("local_subquery_retrieval", "Retrieving documents for each subquery")

        if rerank:
            try:
                compressor = CrossEncoderReranker(model=cross_encoder, top_n=3)
                big_ensemble_retriever = ContextualCompressionRetriever(
                    base_compressor=compressor,
                    base_retriever=big_ensemble_retriever
                )
            except Exception as e:
                logger.error(f"Error during reranking in local mode: {e}")

        # For each subquery, retrieve from the big vectorstore
        contexts = []
        rtr_docs = []
        for subquery in subqueries:
            try:
                retrieved_docs = await big_ensemble_retriever.ainvoke(subquery, search_kwargs={"k": 3})
                # Build context
                context = [
                    f"Subquery: {subquery} \nsearch result: File:{d.metadata.get('source', '').replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
                    for i, d in enumerate(retrieved_docs)
                ]
                context = '\n'.join(context).strip()
                contexts.append(context)
                rtr_docs.append(retrieved_docs)
            except Exception as e:
                logger.error(f"Error retrieving for subquery '{subquery}': {e}")
                contexts.append('')
                rtr_docs.append([])

        total_docs = all_docs

        if profiler:
            profiler.end_step(f"Retrieved docs for {len(subqueries)} subqueries")
            profiler.start_step("context_building", "Building final context from processed documents")

        search_snippets_context = [
            f"search result:: title:{d.metadata.get('title', '')} url:{d.metadata['source'].replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
            for d in search_snippets
        ]
        search_snippets_context = '\n'.join(search_snippets_context)
        final_context = '\n\n'.join(contexts).strip() + '\n' + search_snippets_context

        logger.info(f"Local mode context_to_docs complete. Total contexts: {len(contexts)}, total docs: {len(total_docs)}")

        if profiler:
            profiler.end_step(f"Built final context with {len(final_context)} characters")

        return final_context, rtr_docs, total_docs

    # Create async tasks for parallel URL processing (for non-local mode)
    async def process_url_async_wrapper(url, subquery_idx):
        """Async wrapper for process_url to handle individual URL processing.

        Returns a result dict with 'success' flag so gather() callers can
        distinguish failures without raising.
        """
        try:
            logger.info(f"Starting async processing for URL: {url}")

            # Fetch docs from prefetch map if available, otherwise None (process_url will fetch)
            pre_docs = docs_map.get(url) if docs_map else None
            # Call the async version of process_url with pre-fetched docs
            context, retrieved_docs, docs, processed_url = await process_url(
                url=url,
                query=query,
                text_splitter=text_splitter,
                used_urls=used_urls,
                hf_embeddings=hf_embeddings,
                cross_encoder=cross_encoder,
                rerank=rerank,
                top_k=top_k,
                subquery=subqueries[subquery_idx],
                search_snippets_orig=search_snippets_orig,
                model=model,
                local_mode=local_mode,
                split=split,
                docs=pre_docs,
            )

            # Annotate each snippet source in the context with its title
            for k in search_snippets:
                if context:
                    context = context.replace(
                        k.metadata['source'],
                        k.metadata['source'] + f"title:{k.metadata.get('title', '')}"
                    )

            return {
                'url': processed_url,
                'context': context,
                'retrieved_docs': retrieved_docs,
                'docs': docs,
                'success': True
            }

        except Exception as e:
            logger.error(f"Async processing failed for URL {url}: {e}")
            return {
                'url': url,
                'context': None,
                'retrieved_docs': None,
                'docs': None,
                'success': False,
                'error': str(e)
            }

    # Collect all URL processing tasks
    all_tasks = []
    for u, urls in enumerate(urls_list):
        if not urls:
            logger.warning(f"No URLs provided for subquery {u}.")
            continue

        for url in urls:
            task = process_url_async_wrapper(url, u)
            all_tasks.append(task)

    if not all_tasks:
        # No URLs at all: return only the raw search snippets as context
        logger.warning("No URLs to process.")
        if profiler:
            profiler.end_step("No URLs found to process")
        search_snippets_context = [
            f"search result:: title:{d.metadata.get('title', '')} url:{d.metadata['source'].replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
            for d in search_snippets
        ]
        search_snippets_context = '\n'.join(search_snippets_context)
        return search_snippets_context, [], []

    if profiler:
        profiler.end_step(f"Collected {len(all_tasks)} URL processing tasks")
        profiler.start_step("parallel_url_processing", "Processing URLs in parallel")
        profiler.add_metric('urls_processed', len(all_tasks))

    # Execute all tasks in parallel
    logger.info(f"Processing {len(all_tasks)} URLs in parallel using asyncio.gather")
    results = await asyncio.gather(*all_tasks, return_exceptions=True)

    # Process results: keep only fully-successful entries (context + docs present)
    successful_results = 0
    for result in results:
        if isinstance(result, Exception):
            logger.error(f"URL processing task failed with exception: {result}")
            continue

        if result['success'] and result['context'] and result['retrieved_docs'] and result['docs']:
            contexts.append(result['context'])
            rtr_docs.append(result['retrieved_docs'])
            total_docs.extend(result['docs'])
            used_urls.append(result['url'])
            successful_results += 1
            logger.info(f"Successfully processed and added context/docs for URL: {result['url']}")
        else:
            logger.warning(f"No valid docs/context for URL: {result['url']}")
            if not result['success']:
                logger.error(f"Processing error for {result['url']}: {result.get('error', 'Unknown error')}")

    logger.info(f"Async parallel processing complete. Successfully processed {successful_results}/{len(all_tasks)} URLs")

    if profiler:
        profiler.end_step(f"Successfully processed {successful_results}/{len(all_tasks)} URLs")
        profiler.add_metric('successful_urls', successful_results)
        profiler.add_metric('failed_urls', len(all_tasks) - successful_results)
        profiler.start_step("context_building", "Building final context from processed documents")

    search_snippets_context = [
        f"search result:: title:{d.metadata.get('title', '')} url:{d.metadata['source'].replace('https://r.jina.ai/', '')}  \n {d.page_content}\n"
        for d in search_snippets
    ]
    search_snippets_context = '\n'.join(search_snippets_context)
    final_context = '\n\n'.join(contexts).strip() + '\n' + search_snippets_context

    logger.info(f"async context_to_docs complete. Total contexts: {len(contexts)}, total docs: {len(total_docs)}")

    if profiler:
        profiler.end_step(f"Built final context with {len(final_context)} characters")

    # Periodic cleanup of old ChromaDB collections (20% chance per call)
    if random.random() < 0.2:
        try:
            await cleanup_old_collections_async(max_collections=10)
            logger.info("Performed periodic ChromaDB collection cleanup")
        except Exception as e:
            logger.warning(f"ChromaDB cleanup failed: {e}")

    return final_context, rtr_docs, total_docs


def text_to_docs(texts_with_metadata):
    """
    Converts a list of text snippets with metadata into a list of Document objects.

    Args:
        texts_with_metadata (list): A list of dictionaries, each containing a 'snippet' (text content) and
                                    'link' (source URL), and optionally a 'title'.

    Returns:
        list: A list of Document objects, where each document contains the 'snippet' as page content and
              'link' and 'title' as metadata.
    """
    documents = []
    for entry in texts_with_metadata:
        try:
            # Missing 'snippet'/'link' keys raise KeyError and are logged below.
            documents.append(Document(
                page_content=entry['snippet'],
                metadata={'source': entry['link'], 'title': entry.get('title', '')}
            ))
            logger.info(f"Created Document for source: {entry.get('link', 'N/A')}")
        except Exception as e:
            logger.error(f"Failed to create Document for entry: {entry}. Error: {e}")
    return documents


def remove_urls(text):
    """Removes URLs and Markdown-style links from a string.

    Args:
        text: The string to remove URLs from.

    Returns:
        The string with URLs and Markdown-style links removed.
        On any error, the text is returned unchanged.
    """
    try:
        # Pattern for plain URLs
        url_pattern = r'(https?://|www\.)\S+'
        # Pattern for Markdown-style links
        markdown_link_pattern = r'\[([^\]]+)\]\((https?://|www\.)\S+\)'
        # Remove Markdown-style links FIRST (replace with just the link text).
        # If plain URLs were stripped first, the link target inside "(...)"
        # would be mangled and the markdown pattern could no longer match,
        # leaving dangling "[text](" fragments in the output.
        text = re.sub(markdown_link_pattern, r'\1', text)
        # Then remove any remaining plain URLs
        text = re.sub(url_pattern, '', text)
        logger.info("URLs and markdown links removed from text.")
        return text
    except Exception as e:
        logger.error(f"Error removing URLs from text: {e}")
        return text


def extract_domain_from_url(url):
    """
    Extracts the domain from a URL for blacklisting purposes.

    Args:
        url (str): The URL to extract domain from

    Returns:
        str: The domain name (without www. prefix), or None on error
    """
    try:
        from urllib.parse import urlparse
        host = urlparse(url).netloc.lower()
        # Drop a leading "www." so www/non-www variants map to the same domain
        return host.removeprefix('www.')
    except Exception as e:
        logger.error(f"Error extracting domain from URL {url}: {e}")
        return None


async def check_url_reachability(url, timeout=10):
    """
    Checks if a URL is reachable using an async HEAD request.

    Args:
        url (str): The URL to check
        timeout (int): Timeout in seconds for the request

    Returns:
        bool: True if URL is reachable, False otherwise
    """
    try:
        session_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession(timeout=session_timeout) as session:
            async with session.head(url, allow_redirects=True) as response:
                # Treat any 2xx/3xx final status as reachable
                reachable = 200 <= response.status < 400
                logger.info(f"URL {url} reachability check: {response.status} - {'Reachable' if reachable else 'Unreachable'}")
                return reachable
    except Exception as e:
        # Timeouts, DNS failures, connection errors all count as unreachable
        logger.warning(f"URL {url} is unreachable: {e}")
        return False


async def check_urls_reachability(urls):
    """
    Checks reachability of multiple URLs concurrently.

    Args:
        urls (list): List of URLs to check

    Returns:
        dict: Dictionary mapping URLs to their reachability status (True/False)
    """
    if not urls:
        return {}

    logger.info(f"Checking reachability of {len(urls)} URLs")
    # Fan out all checks at once; return_exceptions keeps one failure from
    # cancelling the rest.
    outcomes = await asyncio.gather(
        *(check_url_reachability(u) for u in urls),
        return_exceptions=True
    )

    reachability_map = {}
    for u, outcome in zip(urls, outcomes):
        if isinstance(outcome, Exception):
            logger.error(f"Error checking URL {u}: {outcome}")
            reachability_map[u] = False
        else:
            reachability_map[u] = outcome

    reachable_count = sum(1 for status in reachability_map.values() if status)
    logger.info(f"Reachability check complete: {reachable_count}/{len(urls)} URLs are reachable")
    return reachability_map


def add_domains_to_blacklist(urls):
    """
    Adds domains from unreachable URLs to the global blacklist.

    Args:
        urls (list): List of URLs whose domains should be blacklisted
    """
    global UNREACHABLE_DOMAINS_BLACKLIST

    for candidate in urls:
        dom = extract_domain_from_url(candidate)
        # Skip URLs whose domain could not be extracted or is already listed
        if not dom or dom in UNREACHABLE_DOMAINS_BLACKLIST:
            continue
        UNREACHABLE_DOMAINS_BLACKLIST.add(dom)
        logger.info(f"Added domain {dom} to blacklist")

    logger.info(f"Current blacklist contains {len(UNREACHABLE_DOMAINS_BLACKLIST)} domains: {list(UNREACHABLE_DOMAINS_BLACKLIST)}")


def modify_query_with_blacklist(query):
    """
    Modifies a search query to exclude blacklisted domains using -site: operator.

    Args:
        query (str): The original search query

    Returns:
        str: Modified query with blacklisted domains excluded
    """
    global UNREACHABLE_DOMAINS_BLACKLIST

    # Nothing blacklisted yet: pass the query through untouched
    if not UNREACHABLE_DOMAINS_BLACKLIST:
        return query

    # Append one "-site:<domain>" exclusion per blacklisted domain
    site_exclusions = ' '.join(f"-site:{domain}" for domain in UNREACHABLE_DOMAINS_BLACKLIST)
    modified_query = f"{query} {site_exclusions}"

    logger.info(f"Modified query with {len(UNREACHABLE_DOMAINS_BLACKLIST)} domain exclusions: {modified_query}")
    return modified_query


def query_to_search_results(query, search_response, websearcher, num_results=3, max_retries=2):
    """
    Performs a web search for each query in the search response and extracts the URLs and search snippets.
    Includes URL reachability checking, retry logic per subquery, and domain blacklisting to avoid unreachable sites.

    Args:
        query (str): The original user query.
        search_response (list): A list of search queries or subqueries.
        websearcher (object): An instance of a web searcher (e.g., SearxSearchWrapper) to perform the search.
        num_results (int, optional): The number of results to retrieve for each search query. Defaults to 3.
        max_retries (int, optional): Maximum number of retries when all URLs for a subquery are unreachable. Defaults to 2.

    Returns:
        tuple: A tuple containing:
            - search_snippets (list): A list of search result snippets for all queries.
            - search_results (list): The raw search results for the last query processed.
            - search_results_urls (list): A list of lists, where each sublist contains URLs from the search results of a query.
    """
    # Start timing for search execution
    search_start_time = time.time()
    logger.info(f"Starting query_to_search_results for {len(search_response)} queries")

    all_search_snippets = []
    all_search_results = []
    all_search_results_urls = []
    all_urls_found = set()  # Track all URLs to avoid duplicates across subqueries

    for r in search_response:
        logger.info(f"Processing subquery: {r}")
        subquery_snippets = []
        subquery_urls = []
        retry_count = 0
        reachable_found = False

        while retry_count <= max_retries and not reachable_found:
            logger.info(f"Subquery '{r}' attempt {retry_count + 1}/{max_retries + 1}")

            try:
                # Randomized delay between outbound queries to reduce rate-limit blocking
                time.sleep(random.uniform(1.0, 2.0))
                # Modify query to exclude blacklisted domains
                modified_query = modify_query_with_blacklist(r)
                results = websearcher.query_search(modified_query, num_results=num_results)
                logger.info(f"Search results fetched for subquery: {r} (modified: {modified_query})")
            except Exception as e:
                logger.error(f"Error fetching search results for subquery '{r}': {e}")
                results = []

            # Add URLs mentioned directly in the user query as empty-snippet results.
            # BUG FIX: do NOT add them to all_urls_found here -- doing so made the
            # filter below drop them from subquery_urls, so explicitly mentioned
            # URLs were never processed downstream. all_urls_found is updated
            # after the filter instead, preserving cross-subquery deduplication.
            mentioned_url = extract_urls_from_query(query)
            for u in dict.fromkeys(mentioned_url):  # dedupe while preserving order
                if u not in all_urls_found:
                    results.append({'link': u, 'snippet': ''})

            try:
                # Keep only URLs not already seen in earlier subqueries/attempts
                urls = [s['link'] for s in results if 'link' in s and s['link'] not in all_urls_found]
                # Update the set of all URLs found (includes mentioned URLs)
                all_urls_found.update(urls)
                subquery_urls = urls
                subquery_snippets = results
            except Exception as e:
                logger.error(f"Error processing search results for subquery '{r}': {e}")
                subquery_snippets = results

            # Check URL reachability if we have URLs
            if subquery_urls:
                try:
                    # Run the async reachability check from this sync context;
                    # if a loop is already running, delegate to a worker thread.
                    loop = asyncio.get_event_loop()
                    if loop.is_running():
                        with concurrent.futures.ThreadPoolExecutor() as executor:
                            future = executor.submit(asyncio.run, check_urls_reachability(subquery_urls))
                            reachability_map = future.result()
                    else:
                        reachability_map = asyncio.run(check_urls_reachability(subquery_urls))

                    # Count reachable URLs
                    reachable_urls = [url for url, is_reachable in reachability_map.items() if is_reachable]
                    unreachable_urls = [url for url, is_reachable in reachability_map.items() if not is_reachable]

                    logger.info(f"Subquery '{r}' reachability check: {len(reachable_urls)}/{len(subquery_urls)} URLs are reachable")

                    # If we have reachable URLs, we're done for this subquery
                    if reachable_urls:
                        reachable_found = True
                        logger.info(f"Subquery '{r}' successful with {len(reachable_urls)} reachable URLs")
                        # Add unreachable domains to blacklist for future searches
                        if unreachable_urls:
                            add_domains_to_blacklist(unreachable_urls)
                    else:
                        # All URLs are unreachable, add domains to blacklist and retry
                        logger.warning(f"All {len(subquery_urls)} URLs for subquery '{r}' are unreachable, adding domains to blacklist")
                        add_domains_to_blacklist(subquery_urls)

                        # If this was our last retry, use the results anyway
                        if retry_count >= max_retries:
                            logger.error(f"Max retries ({max_retries}) reached for subquery '{r}', using results anyway")
                            reachable_found = True  # Proceed with what we have

                        retry_count += 1
                        logger.info(f"Retrying subquery '{r}' with updated blacklist (attempt {retry_count + 1})")

                except Exception as e:
                    logger.error(f"Error checking URL reachability for subquery '{r}': {e}")
                    # If reachability check fails, proceed with the URLs we have
                    reachable_found = True
            else:
                # No URLs found, try next iteration if retries available
                if retry_count >= max_retries:
                    logger.warning(f"No URLs found for subquery '{r}' after all retries")
                    reachable_found = True  # Proceed
                retry_count += 1

        # Add this subquery's results to the overall lists
        all_search_results_urls.append(subquery_urls)
        all_search_snippets.extend(subquery_snippets)
        all_search_results = subquery_snippets  # Keep last results for return

    # Log search completion with timing
    search_total_time = time.time() - search_start_time
    total_urls = sum(len(urls) for urls in all_search_results_urls)
    logger.info(f"Search completed in {search_total_time:.3f}s")
    logger.info(f"Search metrics: {len(all_search_snippets)} snippets, {total_urls} URLs, {len(all_search_results_urls)} URL groups")
    return all_search_snippets, all_search_results, all_search_results_urls

async def query_web_response(
    query,
    date,
    day,
    websearcher,
    hf_embeddings,
    rerank,
    cross_encoder,
    model,
    text_model,
    num_results=3,
    document_paths=None,
    local_mode=False,
    split=True,
    vectordb=None,
    quick_answer=False
):
    """
    Performs a web search (or local-document / vector-db retrieval) and generates a
    response based on the retrieved context.

    The pipeline is: (1) expand the user query into search phrases with query_agent,
    (2) gather sources — an existing Chroma collection if `vectordb` is given, else
    local documents if `local_mode`/no websearcher, else a live web search —
    (3) build a retrieval context, (4) answer or summarize with the text model.

    Args:
        query (str): The input query.
        date (str): The current date.
        day (str): The current day of the week.
        websearcher (SearchWeb): The web search engine wrapper (may be None for local mode).
        hf_embeddings: Hugging Face embeddings for retrieval.
        rerank (bool): Whether to rerank results with the cross-encoder.
        cross_encoder: Cross-encoder model for reranking.
        model: Language model used for query expansion.
        text_model: Language model used for final answer / summary generation.
        num_results (int, optional): Number of search results to retrieve. Defaults to 3.
        document_paths (list, optional): List of paths for local documents. Defaults to None.
        local_mode (bool, optional): Whether to process local documents. Defaults to False.
        split (bool, optional): Whether to split documents into chunks. Defaults to True.
        vectordb (str, optional): Name of an existing Chroma collection under ./chroma_db
            to retrieve from instead of searching. Defaults to None.
        quick_answer (bool, optional): Whether to force quick answer mode (disables summary mode). Defaults to False.

    Returns:
        tuple: (response, sources, search_response, search_results, rtr_docs,
        total_docs, context) — a 7-tuple of all-None on any stage failure.
    """
    # Initialize profiler and set it globally
    profiler = WebSearchProfiler(query)
    set_profiler(profiler)
    
    try:
        profiler.start_step("query_agent", "Generating search queries from user input")
        # query_agent returns the expanded phrases plus three routing flags.
        search_response,is_summary,is_covered_urls,is_focused_on_urls = await query_agent(query, model, date, day)
        profiler.end_step(f"Generated {len(search_response)} search queries")
        # Always include the raw user query as one of the search phrases.
        search_response.append(query)
        # NOTE(review): this fallback can never trigger — `query` was just appended
        # above, so the list is always non-empty at this point.
        if len(search_response) == 0:
            search_response = [query]
            logger.info(f"Search response generated for query '{query}' using pure query.")
        
        # Override is_summary if quick_answer is enabled
        if quick_answer:
            is_summary = False
            logger.info(f"Quick answer mode enabled - forcing is_summary to False")
        
        # Summary mode keeps documents whole instead of chunking them.
        if is_summary:
            split=False
        # Strip double quotes so phrases are not treated as exact-match searches.
        search_response = [text.replace('"', '') for text in search_response]
        logger.info(f"Search phrases for query '{query}': {search_response}")
        profiler.add_metric('search_queries_generated', len(search_response))
    except Exception as e:
        logger.error(f"Error generating search response for query '{query}': {e}")
        profiler.end_step("Failed with error")
        return None, None, None, None, None, None, None

    # --- Source selection: existing vector DB > local documents > live web search ---
    if vectordb:
        profiler.start_step("vectordb_retrieval", f"Retrieving from existing vector database {vectordb}")
        # Load existing vector db
        persist_directory = "./chroma_db"
        client = chromadb.PersistentClient(
            path=persist_directory,
            settings=Settings(anonymized_telemetry=False, allow_reset=True)
        )
        vectorstore = Chroma(
            client=client,
            collection_name=vectordb,
            embedding_function=hf_embeddings
        )
        # Get all documents for BM25 (BM25 needs the full corpus in memory).
        logger.info(f"Loading all documents from vector database '{vectordb}' for BM25 retrieval")
        collection = client.get_collection(vectordb)
        data = collection.get()
        all_docs = []
        for i in range(len(data['documents'])):
            doc = Document(page_content=data['documents'][i], metadata=data.get('metadatas', [{}])[i] if data.get('metadatas') else {})
            all_docs.append(doc)
        logger.info(f"Loaded {len(all_docs)} documents from vector database '{vectordb}'")
        # Check for URL filter: restrict semantic retrieval to URLs named in the query.
        extracted_urls = extract_urls_from_query(query)
        filter = None
        if extracted_urls and (is_covered_urls or is_focused_on_urls):
            filter = {"source": {"$in": extracted_urls}}
        # Create retriever (hybrid BM25 + semantic, 0.4/0.6 weighted).
        search_kwargs = {"k": 3}
        if filter:
            search_kwargs["filter"] = filter
        sem_retriever = vectorstore.as_retriever(search_kwargs=search_kwargs)
        bm25_retriever = BM25Retriever.from_documents(all_docs)
        bm25_retriever.k = 3
        ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, sem_retriever], 
            weights=[0.4, 0.6]
        )
        if rerank:
            # Optional cross-encoder reranking on top of the ensemble.
            compressor = CrossEncoderReranker(model=cross_encoder, top_n=3)
            ensemble_retriever = ContextualCompressionRetriever(
                base_compressor=compressor,
                base_retriever=ensemble_retriever
            )
        logger.info(f"Ensemble retriever created with rerank={rerank}")
        # Retrieve for each subquery; failures contribute empty entries so the
        # positional alignment between contexts and rtr_docs is preserved.
        contexts = []
        rtr_docs = []
        for subquery in search_response:
            try:
                retrieved_docs = await ensemble_retriever.ainvoke(subquery, search_kwargs={"k": 3})
                # Build context
                context = [
                    f"Subquery: {subquery} \nsearch result: {d.metadata.get('source', 'VectorDB')}  \n {d.page_content}\n"
                    for i, d in enumerate(retrieved_docs)
                ]
                context = '\n'.join(context).strip()
                contexts.append(context)
                rtr_docs.append(retrieved_docs)
            except Exception as e:
                logger.error(f"Error retrieving for subquery '{subquery}': {e}")
                contexts.append('')
                rtr_docs.append([])
        total_docs = [doc for sublist in rtr_docs for doc in sublist]
        context = '\n\n'.join(contexts).strip()
        # Web-search outputs stay empty in the vector-db path.
        search_results = []
        search_results_urls = []
        search_snippets = []
        search_snippets_orig = {}
        profiler.end_step(f"Retrieved docs for {len(search_response)} subqueries")
    elif websearcher is None or local_mode:
        profiler.start_step("local_document_processing", "Processing local documents")
        logger.warning("Please add list of paths as input, earlier it used to be list of list")
        # Expand each input path (file or folder) into concrete file paths.
        all_paths=[]
        for k in document_paths:
            all_paths.extend(get_all_paths(k))
        all_paths = [all_paths]
        logger.info(f"Total paths for '{query}': {all_paths}")
        # Only time doc is considered for different subqueries and only first is getting considered
        search_snippets, search_results, search_results_urls = [], [], all_paths * len(search_response)
        search_snippets_orig = {}
        logger.warning("No websearcher provided; using document_paths only.")
        profiler.end_step(f"Prepared {len(all_paths)} local paths")
    else:
        profiler.start_step("web_search_execution", "Executing web search and URL extraction")
        try:
            extracted_urls = extract_urls_from_query(query)
            logger.info(f"Extracted URLs from query '{query}': {extracted_urls}")
            # Reddit/YouTube URLs route through the "covered urls" path; any other
            # explicit URL forces summary mode instead.
            if len(extracted_urls)>0:
                for u in extracted_urls:
                    if ('reddit' in u) or ('youtube' in u):
                        is_covered_urls = True
                    else:
                        is_summary = True
                        is_covered_urls = False
            if extract_urls_from_query(query) and is_covered_urls:
                logger.info(f"Extracted URLs from query '{query}': {extract_urls_from_query(query)}")
                # Build placeholder snippets directly from the user's URLs.
                search_snippets_orig = []
                for u in extract_urls_from_query(query):
                    search_snippets_orig.append({'snippet':'',
                                            'link':u,
                                            'title':u})
                search_results = search_snippets_orig
                search_results_urls = [extract_urls_from_query(query)]
            else:
                if extract_urls_from_query(query) and is_focused_on_urls:
                    search_snippets_orig = []
                    for u in extract_urls_from_query(query):
                        search_snippets_orig.append({'snippet':'',
                                                'link':u,
                                                'title':u})
                    search_results = search_snippets_orig
                    search_results_urls = [extract_urls_from_query(query)]
                else:
                    # No usable explicit URLs: run the actual web search.
                    search_snippets_orig, search_results, search_results_urls = query_to_search_results(
                        query, search_response, websearcher, num_results
                    )
            search_snippets = text_to_docs(search_snippets_orig)
            try:
                # Re-key snippets by link for later lookup.
                search_snippets_orig = {k['link']: k['snippet'] for k in search_snippets_orig if 'link' in k.keys()}
            except Exception as e:
                logger.warning(f"No good searches found or error building search_snippets_orig: {e}")
            logger.info(f"Search results fetched for query '{query}'.")
            profiler.end_step(f"Found {len(search_results_urls)} URL groups with {sum(len(urls) for urls in search_results_urls)} total URLs")
            profiler.add_metric('urls_found', sum(len(urls) for urls in search_results_urls))
        except Exception as e:
            logger.error(f"Error fetching search results for query '{query}': {e}")
            profiler.end_step("Failed with error")
            return None, None, None, None, None, None, None

    try:
        profiler.start_step("context_generation", "Processing URLs and generating context")
        # The vectordb branch already produced context/rtr_docs/total_docs above.
        if not vectordb:
            context, rtr_docs, total_docs = await context_to_docs(
                search_results_urls,
                search_response,
                search_snippets,
                search_snippets_orig,
                query,
                3,
                hf_embeddings,
                rerank,
                cross_encoder=cross_encoder,
                model=text_model,
                local_mode=local_mode,
                split=split,
                profiler=profiler  # Pass profiler to context_to_docs
            )
        logger.info(f"Async context generated to answer query '{query}'.")
        profiler.end_step(f"Generated context with {len(total_docs)} total documents")
        profiler.add_metric('docs_retrieved', len(total_docs))
        profiler.add_metric('context_length', len(context) if context else 0)
    except Exception as e:
        logger.error(f"Error generating context for query '{query}': {e}")
        profiler.end_step("Failed with error")
        return None, None, None, None, None, None, None



    try:
        profiler.start_step("response_generation", "Generating final response from context")
        if not is_summary or (is_covered_urls):
            logger.info(f"Generating Answer for query '{query}' using async response gen.")
            logger.info("Deduplicating context before response generation.")
            # NOTE(review): str.split("") raises ValueError (empty separator), so this
            # branch currently always falls into the except handler below and returns
            # the all-None tuple. The section-delimiter string appears to have been
            # lost (likewise the no-op `k+""` below) — confirm the intended separator.
            context = context.split("")
            context = " ".join(deduplicate_context([k+"" for k in context]))
           
            response_1, sources = await response_gen(text_model, query, context)
        else:
            logger.info(f"Generating summary for query '{query}' using async summarizer.")
            response_1 = await summarizer(query, total_docs, text_model, 4)
            sources = str(search_results_urls)
        try:
            log_results(query, context, '', '')
            logger.info(f"Logged results for query '{query}'.")
        except Exception as e:
            logger.warning(f"Error logging results for query '{query}': {e}")
        logger.info(f"Async response generated for query '{query}'.")
        profiler.end_step(f"Generated response with {len(response_1)} characters")
    except Exception as e:
        logger.error(f"Error generating async response for query '{query}': {e}")
        profiler.end_step("Failed with error")
        return None, None, None, None, None, None, None

    # Print profiling summary
    profiler.print_summary()
    
    return response_1, sources, search_response, search_results, rtr_docs, total_docs, context

async def url_to_markdown(url, executor, local_mode=False):
    """
    Asynchronously convert a URL or local file to markdown via process_content.

    In local mode the path is validated and handed to process_content in the
    executor; otherwise the URL is fetched with aiohttp (30s total timeout)
    and its body is processed the same way.

    Args:
        url (str): The URL or local file path to process.
        executor (concurrent.futures.Executor): Executor for the blocking conversion.
        local_mode (bool, optional): If True, treat url as a local file. Defaults to False.

    Returns:
        str or None: The processed markdown content, or None on any error.
    """
    try:
        loop = asyncio.get_event_loop()
        if local_mode:
            # Local mode: validate and read the file from disk.
            if not os.path.isfile(url):
                logger.error(f"The file {url} does not exist.")
                raise FileNotFoundError(f"The file {url} does not exist.")
            logger.info(f"Processing local file: {url}")
            # Blocking conversion runs off the event loop.
            markdown_content = await loop.run_in_executor(
                executor, process_content, url, "internal_pdf", ""
            )
        else:
            # Remote mode: download the resource, then convert its body.
            logger.info(f"Fetching URL: {url}")
            client_timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession(timeout=client_timeout) as session:
                async with session.get(url) as response:
                    response.raise_for_status()
                    mime_type = response.headers.get('Content-Type', '')
                    body = await response.read()
                    logger.info(f"Fetched content from {url} with type {mime_type}")
                    markdown_content = await loop.run_in_executor(
                        executor, process_content, url, mime_type, body
                    )
        logger.info(f"Processed markdown for: {url}")
        return markdown_content
    except FileNotFoundError as fnf:
        logger.error(f"File not found: {fnf}")
        return None
    except aiohttp.ClientError as ce:
        logger.error(f"HTTP error for {url}: {ce}")
        return None
    except Exception as e:
        logger.error(f"An error occurred processing {url}: {e}")
        # TODO: Add more granular error handling if needed (e.g., for content parsing)
        return None

async def urls_to_docs(urls, local_mode=False, split=True):
    """
    Asynchronously converts a list of URLs (or local file paths) to Document objects.
    Uses async gathering plus a ProcessPoolExecutor for parallel CPU-bound conversion.

    Args:
        urls (list): List of URLs (or local paths) to process.
        local_mode (bool, optional): Whether to process local files. Defaults to False.
        split (bool, optional): Whether to split markdown into header sections;
            if False, each URL yields one whole-page Document. Defaults to True.

    Returns:
        dict: Mapping of url -> list of Document objects (empty list on failure).
    """
    # Start timing
    urls_start_time = time.time()
    mode_str = "local files" if local_mode else "URLs"
    logger.info(f"🔄 Starting urls_to_docs for {len(urls)} {mode_str}")
    
    # Prepare return map
    docs_map = {}
    if not urls:
        logger.warning("0 URLs were given to urls_to_docs.")
        return docs_map

    # Use a process pool for CPU-bound work (process_content)
    loop = asyncio.get_event_loop()
    with ProcessPoolExecutor() as executor:
        # Schedule all url_to_markdown tasks concurrently
        orig_urls = urls.copy()
        # Preserve order while deduplicating
        unique_urls = list(dict.fromkeys(orig_urls))
        logger.info(f"Processing {len(unique_urls)} {mode_str} with ProcessPoolExecutor")
        tasks = [url_to_markdown(url, executor, local_mode=local_mode) for url in unique_urls]
        # return_exceptions=True keeps one failed URL from aborting the whole batch.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        for url, result in zip(unique_urls, results):
            docs_map[url] = []
            if isinstance(result, Exception):
                logger.error(f"Error fetching or processing URL {url}: {result}")
                continue
            if result is None:
                logger.warning(f"No content returned for URL {url}")
                continue

            try:
              
                # Split on markdown headers (levels 1-3) for section-level retrieval.
                headers_to_split_on = [
                    ("#", "Header 1"),
                    ("##", "Header 2"),
                    ("###", "Header 3"),
                    
                
                ]
                markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
                # Further split for granular sections using regex (e.g., tables, code blocks)
                
                if split:
                    split_docs = markdown_splitter.split_text(result)
                else:
                    split_docs = [Document(page_content=result)]
                
                # Prefix each chunk with its URL + header trail so the source survives
                # retrieval. NOTE(review): the `+ ''` concatenations below look like
                # remnants of lost delimiter strings — confirm against upstream history.
                for i, d in enumerate(split_docs):
                    split_docs[i].metadata['source'] = url + ' Section: ' + split_docs[i].metadata.get('Header 1', '') +\
                        ' ' + split_docs[i].metadata.get('Header 2', '') + ' ' + split_docs[i].metadata.get('Header 3', '') 
                    split_docs[i].metadata['url'] = url
                    split_docs[i].page_content = ' Section: ' + split_docs[i].metadata.get('Header 1', '') +\
                        ' ' + split_docs[i].metadata.get('Header 2', '') + ' ' + split_docs[i].metadata.get('Header 3', '') + '' + split_docs[i].page_content.strip() + ''

                docs_map[url].extend(split_docs)
               
                logger.info(f"✅ Successfully processed and added document(s) for URL: {url}")

                # Track content for time saved calculation
                profiler = get_profiler()
                if profiler:
                    try:
                        if local_mode and 'split_docs' in locals():
                            for split_doc in split_docs:
                                profiler.add_url_content(url, split_doc.page_content)
                        else:
                            profiler.add_url_content(url, result)
                    except Exception:
                        pass

            except Exception as e:
                logger.error(f"Error creating Document(s) for URL {url}: {e}")

    # Log completion with timing
    urls_total_time = time.time() - urls_start_time
    total_docs = sum(len(v) for v in docs_map.values())
    # Calculate average words per document
    total_words = sum(len(doc.page_content.split()) for docs in docs_map.values() for doc in docs)
    avg_words = total_words / total_docs if total_docs > 0 else 0
    logger.info(f"urls_to_docs completed in {urls_total_time:.3f}s: {total_docs} documents from {len(unique_urls)} {mode_str}, avg {avg_words:.1f} words/doc")
    return docs_map

def youtube_transcript_response(query, task, model, n=3):
    """
    Build a context string of YouTube transcript summaries.

    If *query* is itself a youtube.com URL, summarise that single video;
    otherwise search YouTube for *query* and summarise the top *n* results.
    Transcripts come from MarkItDown first, falling back to
    YouTubeTranscriptApi when conversion fails.

    Args:
        query (str): A YouTube URL or a free-text search query.
        task (str): The task description inserted into the summary prompt.
        model: Language model exposing .invoke() used for summarisation.
        n (int, optional): Number of search results to summarise. Defaults to 3.

    Returns:
        str: Accumulated context of video summaries, or "Error generating summary."
        when nothing could be retrieved.
    """
    overall_context = ''
    if "youtube.com" in query:
        try:
            md = MarkItDown(enable_plugins=False)  # Set to True to enable plugins
            result = md.convert(query)
            prompt = prompts['youtube_summary_prompt'].format(task=task, transcript=result.text_content)
        except Exception:
            try:
                video_id = query.split("=")[1]
                srt = YouTubeTranscriptApi.get_transcript(video_id)
                transcript = ' '.join([s['text'] for s in srt])
                prompt = prompts['youtube_summary_prompt'].format(task=task, transcript=transcript)
            except Exception as e:
                # BUG FIX: this log line previously referenced the undefined name
                # `url`, raising NameError instead of reporting the real failure.
                logger.error(f"Error summarizing URL {query}: {e}")
                return "Error generating summary."
        response = model.invoke(prompt)
        response = response.content
        overall_context = overall_context + f"\n\nVideo: {query}\nTranscript Summary: {response}\n\n"
    else:
        try:
            videos = json.loads(YoutubeSearch(query, max_results=10).to_json())['videos'][:n]
        except Exception:
            logger.error("error with youtube search")
            return "Error generating summary."
        for k in videos:
            video_id = k['id']
            title = k['title']
            channel = k['channel']
            url = f"https://www.youtube.com/watch?v={video_id}"
            logger.info(f"Found YouTube video: {title} by {channel} at {url}")
            prompt = None
            try:
                md = MarkItDown(enable_plugins=False)  # Set to True to enable plugins
                result = md.convert(url)
                prompt = prompts['youtube_summary_prompt'].format(task=task, transcript=result.text_content)
            except Exception:
                try:
                    srt = YouTubeTranscriptApi.get_transcript(video_id)
                    transcript = ' '.join([s['text'] for s in srt])
                    prompt = prompts['youtube_summary_prompt'].format(task=task, transcript=transcript)
                except Exception:
                    logger.error("error with youtube video")
            if prompt is None:
                # BUG FIX: previously fell through with `prompt` undefined (NameError)
                # or re-summarised a stale prompt/response — skip this video instead.
                continue
            try:
                response = model.invoke(prompt).content
            except Exception:
                # BUG FIX: previously appended a stale/undefined `response`
                # after an LLM failure; skip the video instead.
                logger.error("error with LLM")
                continue
            overall_context += f"\n\nVideo: {title} by {channel}\nURL: {url}\nTranscript Summary: {response}\n\n"
    print(f"Generated YouTube context for query '{query}': {overall_context}")
    return overall_context

def generate_doc_hash(text):
    """
    Return the SHA-256 hex digest of the given document text.

    Args:
        text (str): The text of the document.

    Returns:
        str: The hex-encoded SHA-256 hash of the UTF-8 encoded text.
    """
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()

async def summary_of_url(query, url, model, local_mode=False):
    """
    Generate a summary of the content at a URL or local path, guided by a query.

    In local mode the path is expanded to all contained files; each target is
    converted to documents via urls_to_docs and the combined content is
    summarised by the model.

    Args:
        query (str): The query string to use for summarization.
        url (str): The URL or local file path to summarize.
        model: The language model (with .ainvoke) used for the summary.
        local_mode (bool, optional): If True, treat url as a local file. Defaults to False.

    Returns:
        str: The generated summary, or an error/empty-content message.
    """
    try:
        targets = get_all_paths(url) if local_mode else [url]
        docs = []
        for target in targets:
            fetched = await urls_to_docs([target], local_mode=local_mode)
            docs.extend(fetched.get(target, []))
        if not docs:
            logger.warning(f"No documents found for URL: {url}")
            return "No content found to summarize."
        # Concatenate every chunk, each prefixed with its source.
        content = ''.join('source:' + url + '\n' + d.page_content + '' for d in docs)
        summary = await model.ainvoke(f"Summarise the following content to answer {query}:\n{content}")
        return summary.content
    except Exception as e:
        logger.error(f"Error summarizing URL {url}: {e}")
        return "Error generating summary."
    
def is_file_folder(root_path):
    """Classify a path: 'File', 'Folder', 'Not a valid path', or 'Unknown'."""
    entry = Path(root_path)
    if not entry.exists():
        return "Not a valid path"
    if entry.is_dir():
        return "Folder"
    # Exists but is neither file nor directory (e.g. socket/device) -> Unknown.
    return "File" if entry.is_file() else "Unknown"
    
def get_all_paths(root_path):
    """
    Expand a path into a list of file paths.

    A file yields [itself]; a folder yields every file found recursively;
    anything else yields None.
    """
    kind = is_file_folder(root_path)
    if kind == 'File':
        return [root_path]
    if kind != 'Folder':
        return None
    collected = []
    for base_dir, _subdirs, filenames in os.walk(root_path):
        # Record every file in this directory.
        collected.extend(os.path.join(base_dir, name) for name in filenames)
    return collected

async def fetch_html(url, client):
    """Fetch *url* with the given async HTTP client and return the body text.

    Sends a browser-like User-Agent header; raises on non-2xx status.
    """
    response = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    return response.text

async def extract_clickable_elements(url):
    """
    Fetch a page and collect its clickable elements as {'title', 'url'} dicts.

    Gathers anchor tags (skipping fragment-only hrefs) plus elements whose
    onclick handler assigns location.href; relative links are resolved
    against the page URL.
    """
    async with httpx.AsyncClient(timeout=10) as client:
        markup = await fetch_html(url, client)
        soup = BeautifulSoup(markup, 'html.parser')
        found = []

        # Anchor tags
        for anchor in soup.find_all('a', href=True):
            label = anchor.get_text(strip=True) or anchor.get('aria-label') or anchor.get('title')
            target = anchor['href']
            if label and target and not target.startswith('#'):
                found.append({'title': label, 'url': urljoin(url, target)})

        # Elements with onclick (e.g., location.href)
        for node in soup.find_all(attrs={"onclick": True}):
            label = node.get_text(strip=True) or node.get('aria-label') or node.get('title')
            hit = re.search(r"location\.href=['\"]([^'\"]+)['\"]", node['onclick'])
            if label and hit:
                found.append({'title': label, 'url': urljoin(url, hit.group(1))})

        return found

def bm25_search(elements, query, topk=10):
    """
    Rank elements by BM25 similarity of their titles against a query.

    Args:
        elements (list[dict]): Items with 'title' and 'url' keys.
        query (str): The search query.
        topk (int, optional): Maximum number of results. Defaults to 10.

    Returns:
        list[dict]: Up to topk items with 'title', 'url' and 'score',
        sorted by descending BM25 score (non-negative scores only).
    """
    corpus = [el['title'].lower().split() for el in elements]
    engine = BM25Okapi(corpus)
    scores = engine.get_scores(query.lower().split())
    by_score = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
    ranked = [
        {'title': elements[idx]['title'], 'url': elements[idx]['url'], 'score': scores[idx]}
        for idx in by_score
        if scores[idx] >= 0
    ]
    return ranked[:topk]

async def get_topk_bm25_clickable_elements(url, query, topk=10):
    """Fetch the page's clickable elements and return the top-k BM25 matches for query."""
    candidates = await extract_clickable_elements(url)
    return bm25_search(candidates, query, topk)

def deduplicate_context(context):
    """
    Deduplicate documents in *context*, keeping the first occurrence of each
    distinct content payload and preserving order.

    Each document's payload is extracted with the (tagged) content pattern and
    hashed; later documents with an already-seen hash are dropped.

    BUG FIX: the pattern r'(.*?)' always matches the empty string (the tag
    delimiters appear to have been lost), so previously every document hashed
    to the same value and only the first one survived. We now fall back to
    hashing the full document text whenever the extracted payload is empty,
    so distinct documents are no longer collapsed into one.

    Args:
        context (list[str]): Document strings, possibly containing duplicates.

    Returns:
        list[str]: Documents with later duplicates removed.
    """
    seen_hashes = set()
    deduped_context = []
    for doc in context:
        content_match = re.search(r'(.*?)', doc, re.DOTALL)
        content_text = content_match.group(1).strip() if content_match else doc.strip()
        if not content_text:
            # Empty extraction (lazy pattern matched zero chars) — hash the whole
            # document instead of mapping everything onto one bucket.
            content_text = doc.strip()
        doc_hash = generate_doc_hash(content_text)
        if doc_hash not in seen_hashes:
            seen_hashes.add(doc_hash)
            deduped_context.append(doc)
    logger.info(f"Deduplicated context: {len(context)} -> {len(deduped_context)} documents")
    return deduped_context