Repository: PromtEngineer/localGPT Branch: main Commit: 4d41c7d1713b Files: 134 Total size: 878.8 KB Directory structure: gitextract_pt0n86zf/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ └── pull_request_template.md ├── .gitignore ├── CONTRIBUTING.md ├── DOCKER_README.md ├── DOCKER_TROUBLESHOOTING.md ├── Dockerfile.backend ├── Dockerfile.frontend ├── Dockerfile.rag-api ├── Documentation/ │ ├── api_reference.md │ ├── architecture_overview.md │ ├── deployment_guide.md │ ├── docker_usage.md │ ├── improvement_plan.md │ ├── indexing_pipeline.md │ ├── installation_guide.md │ ├── prompt_inventory.md │ ├── quick_start.md │ ├── retrieval_pipeline.md │ ├── system_overview.md │ ├── triage_system.md │ └── verifier.md ├── LICENSE ├── README.md ├── WATSONX_README.md ├── backend/ │ ├── README.md │ ├── database.py │ ├── ollama_client.py │ ├── requirements.txt │ ├── server.py │ ├── simple_pdf_processor.py │ ├── test_backend.py │ └── test_ollama_connectivity.py ├── batch_indexing_config.json ├── create_index_script.py ├── demo_batch_indexing.py ├── docker-compose.local-ollama.yml ├── docker-compose.yml ├── docker.env ├── env.example.watsonx ├── eslint.config.mjs ├── next.config.ts ├── package.json ├── postcss.config.mjs ├── rag_system/ │ ├── DOCUMENTATION.md │ ├── README.md │ ├── __init__.py │ ├── agent/ │ │ ├── __init__.py │ │ ├── loop.py │ │ └── verifier.py │ ├── api_server.py │ ├── api_server_with_progress.py │ ├── factory.py │ ├── indexing/ │ │ ├── __init__.py │ │ ├── contextualizer.py │ │ ├── embedders.py │ │ ├── graph_extractor.py │ │ ├── latechunk.py │ │ ├── multimodal.py │ │ ├── overview_builder.py │ │ └── representations.py │ ├── ingestion/ │ │ ├── __init__.py │ │ ├── chunking.py │ │ ├── docling_chunker.py │ │ └── document_converter.py │ ├── main.py │ ├── pipelines/ │ │ ├── __init__.py │ │ ├── indexing_pipeline.py │ │ └── retrieval_pipeline.py │ ├── requirements.txt │ ├── rerankers/ │ │ ├── __init__.py │ │ ├── reranker.py │ │ └── 
sentence_pruner.py │ ├── retrieval/ │ │ ├── __init__.py │ │ ├── query_transformer.py │ │ └── retrievers.py │ └── utils/ │ ├── batch_processor.py │ ├── logging_utils.py │ ├── ollama_client.py │ ├── validate_model_config.py │ └── watsonx_client.py ├── requirements-docker.txt ├── requirements.txt ├── run_system.py ├── setup_rag_system.sh ├── simple_create_index.sh ├── src/ │ ├── app/ │ │ ├── globals.css │ │ ├── layout.tsx │ │ └── page.tsx │ ├── components/ │ │ ├── IndexForm.tsx │ │ ├── IndexPicker.tsx │ │ ├── IndexWizard.tsx │ │ ├── LandingMenu.tsx │ │ ├── Markdown.tsx │ │ ├── ModelSelect.tsx │ │ ├── SessionIndexInfo.tsx │ │ ├── demo.tsx │ │ └── ui/ │ │ ├── AccordionGroup.tsx │ │ ├── GlassInput.tsx │ │ ├── GlassSelect.tsx │ │ ├── GlassToggle.tsx │ │ ├── InfoTooltip.tsx │ │ ├── avatar.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── chat-bubble-demo.tsx │ │ ├── chat-bubble.tsx │ │ ├── chat-input.tsx │ │ ├── chat-settings-modal.tsx │ │ ├── conversation-page.tsx │ │ ├── dropdown-menu.tsx │ │ ├── empty-chat-state.tsx │ │ ├── localgpt-chat.tsx │ │ ├── message-loading.tsx │ │ ├── quick-chat.tsx │ │ ├── scroll-area.tsx │ │ ├── separator.tsx │ │ ├── session-chat.tsx │ │ ├── session-sidebar.tsx │ │ ├── sidebar.tsx │ │ ├── skeleton.tsx │ │ └── textarea.tsx │ ├── lib/ │ │ ├── api.ts │ │ ├── types.ts │ │ └── utils.ts │ ├── test-upload.html │ └── utils/ │ └── textNormalization.ts ├── start-docker.sh ├── system_health_check.py ├── tailwind.config.js ├── test_docker_build.sh ├── test_markdown_streaming.js └── tsconfig.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve LocalGPT title: '[BUG] ' labels: 'bug' assignees: '' --- ## 🐛 Bug Description A clear and concise description of what the bug is. 
## 🔄 Steps to Reproduce 1. Go to '...' 2. Click on '...' 3. Scroll down to '...' 4. See error ## ✅ Expected Behavior A clear and concise description of what you expected to happen. ## ❌ Actual Behavior A clear and concise description of what actually happened. ## 📸 Screenshots If applicable, add screenshots to help explain your problem. ## 🖥️ Environment Information **Desktop/Server:** - OS: [e.g. macOS 13.4, Ubuntu 20.04, Windows 11] - Python Version: [e.g. 3.11.5] - Node.js Version: [e.g. 23.10.0] - Ollama Version: [e.g. 0.9.5] - Docker Version: [e.g. 24.0.6] (if using Docker) **Browser (if web interface issue):** - Browser: [e.g. Chrome, Safari, Firefox] - Version: [e.g. 118.0.0.0] ## 📋 System Health Check Please run `python system_health_check.py` and paste the output: ``` [Paste system health check output here] ``` ## 📝 Error Logs Please include relevant error messages or logs: ``` [Paste error logs here] ``` ## 🔧 Configuration - Deployment method: [Docker / Direct Python] - Models used: [e.g. qwen3:0.6b, qwen3:8b] - Document types: [e.g. PDF, DOCX, TXT] ## 📎 Additional Context Add any other context about the problem here. ## 🤔 Possible Solution If you have ideas for fixing the issue, please share them here. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for LocalGPT title: '[FEATURE] ' labels: 'enhancement' assignees: '' --- ## 🚀 Feature Request ### 📝 Is your feature request related to a problem? Please describe. A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] ### 💡 Describe the solution you'd like A clear and concise description of what you want to happen. ### 🔄 Describe alternatives you've considered A clear and concise description of any alternative solutions or features you've considered. 
### 🎯 Use Case Describe the specific use case or scenario where this feature would be valuable: - Who would use this feature? - When would they use it? - How would it improve their workflow? ### 📋 Acceptance Criteria What would need to be implemented for this feature to be considered complete? - [ ] Criterion 1 - [ ] Criterion 2 - [ ] Criterion 3 ### 🏗️ Implementation Ideas If you have ideas about how this could be implemented, please share: - Which components would be affected? - Any technical considerations? - Potential challenges? ### 📊 Priority How important is this feature to you? - [ ] Critical - Blocking my use case - [ ] High - Would significantly improve my workflow - [ ] Medium - Nice to have - [ ] Low - Minor improvement ### 📎 Additional Context Add any other context, screenshots, mockups, or examples about the feature request here. ### 🔗 Related Issues Link any related issues or discussions: ================================================ FILE: .github/pull_request_template.md ================================================ ## 📝 Description Brief description of what this PR does. 
Fixes #(issue number) ## 🎯 Type of Change - [ ] 🐛 Bug fix (non-breaking change which fixes an issue) - [ ] ✨ New feature (non-breaking change which adds functionality) - [ ] 💥 Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] 📚 Documentation update - [ ] 🧪 Test improvements - [ ] 🔧 Code refactoring - [ ] 🎨 UI/UX improvements ## 🧪 Testing ### Test Environment - [ ] Tested with Docker deployment - [ ] Tested with direct Python deployment - [ ] Tested on macOS - [ ] Tested on Linux - [ ] Tested on Windows ### Test Cases - [ ] All existing tests pass - [ ] New tests added for new functionality - [ ] Manual testing completed - [ ] System health check passes ```bash # Commands used for testing python system_health_check.py python run_system.py --health # Add any specific test commands here ``` ## 📋 Checklist ### Code Quality - [ ] Code follows the project's coding standards - [ ] Self-review of the code completed - [ ] Code is properly commented - [ ] Type hints added (Python) - [ ] No console.log statements left in production code ### Documentation - [ ] Documentation updated (if applicable) - [ ] API documentation updated (if applicable) - [ ] README updated (if applicable) - [ ] CONTRIBUTING.md guidelines followed ### Dependencies - [ ] No new dependencies added, or new dependencies are justified - [ ] requirements.txt updated (if applicable) - [ ] package.json updated (if applicable) ## 🖥️ Screenshots (if applicable) Add screenshots to help reviewers understand the changes. ## 📊 Performance Impact Describe any performance implications: - [ ] No performance impact - [ ] Performance improved - [ ] Performance may be affected (explain below) ## 🔄 Migration Notes If this is a breaking change, describe what users need to do: - [ ] No migration needed - [ ] Migration steps documented below ## 📎 Additional Notes Any additional information that reviewers should know. 
================================================ FILE: .gitignore ================================================ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. # dependencies /node_modules /.pnp .pnp.* .yarn/* !.yarn/patches !.yarn/plugins !.yarn/releases !.yarn/versions # testing /coverage # next.js /.next/ /out/ # production /build # misc .DS_Store *.pem # debug npm-debug.log* yarn-debug.log* yarn-error.log* .pnpm-debug.log* # env files (can opt-in for committing if needed) .env* # vercel .vercel # typescript *.tsbuildinfo next-env.d.ts # Python __pycache__/ *.pyc # Local Data /index_store /shared_uploads chat_history.db *.pkl # Backend generated files backend/shared_uploads/ # Vector DB artefacts lancedb/ index_store/overviews/ # Logs and runtime output logs/ *.log # SQLite or other database files *.db #backend/*.db # backend/chat_history.db backend/chroma_db/ backend/chroma_db/** # Document and user-uploaded files (PDFs, images, etc.) rag_system/documents/ *.pdf # Ensure docker.env remains tracked !docker.env !backend/chat_data.db ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to LocalGPT Thank you for your interest in contributing to LocalGPT! This guide will help you get started with contributing to our private document intelligence platform. ## 🚀 Quick Start for Contributors ### Prerequisites - Python 3.8+ (we test with 3.11.5) - Node.js 16+ (we test with 23.10.0) - Git - Ollama (for local AI models) ### Development Setup 1. **Fork and Clone** ```bash # Fork the repository on GitHub, then clone your fork git clone https://github.com/YOUR_USERNAME/localGPT.git cd localGPT # Add upstream remote git remote add upstream https://github.com/PromtEngineer/localGPT.git ``` 2. 
**Set Up Development Environment** ```bash # Install Python dependencies pip install -r requirements.txt # Install Node.js dependencies npm install # Install Ollama and models curl -fsSL https://ollama.ai/install.sh | sh ollama pull qwen3:0.6b ollama pull qwen3:8b ``` 3. **Verify Setup** ```bash # Run health check python system_health_check.py # Start development system python run_system.py --mode dev ``` ## 📋 Development Workflow ### Branch Strategy We use a feature branch workflow: - `main` - Production-ready code - `docker` - Docker deployment features and documentation - `feature/*` - New features - `fix/*` - Bug fixes - `docs/*` - Documentation updates ### Making Changes 1. **Create a Feature Branch** ```bash # Update your main branch git checkout main git pull upstream main # Create feature branch git checkout -b feature/your-feature-name ``` 2. **Make Your Changes** - Follow our [coding standards](#coding-standards) - Write tests for new functionality - Update documentation as needed 3. **Test Your Changes** ```bash # Run health checks python system_health_check.py # Test specific components python -m pytest tests/ -v # Test system integration python run_system.py --health ``` 4. **Commit Your Changes** ```bash git add . git commit -m "feat: add new feature description" ``` 5. 
**Push and Create PR** ```bash git push origin feature/your-feature-name # Create pull request on GitHub ``` ## 🎯 Types of Contributions ### 🐛 Bug Fixes - Check existing issues first - Include reproduction steps - Add tests to prevent regression ### ✨ New Features - Discuss in issues before implementing - Follow existing architecture patterns - Include comprehensive tests - Update documentation ### 📚 Documentation - Fix typos and improve clarity - Add examples and use cases - Update API documentation - Improve setup guides ### 🧪 Testing - Add unit tests - Improve integration tests - Add performance benchmarks - Test edge cases ## 📝 Coding Standards ### Python Code Style We follow PEP 8 with some modifications: ```python # Use type hints def process_document(file_path: str, config: Dict[str, Any]) -> ProcessingResult: """Process a document with the given configuration. Args: file_path: Path to the document file config: Processing configuration dictionary Returns: ProcessingResult object with metadata and chunks """ pass # Use descriptive variable names embedding_model_name = "Qwen/Qwen3-Embedding-0.6B" retrieval_results = retriever.search(query, top_k=20) # Use dataclasses for structured data @dataclass class IndexingConfig: embedding_batch_size: int = 50 enable_late_chunking: bool = True chunk_size: int = 512 ``` ### TypeScript/React Code Style ```typescript // Use TypeScript interfaces interface ChatMessage { id: string; content: string; role: 'user' | 'assistant'; timestamp: Date; sources?: DocumentSource[]; } // Use functional components with hooks const ChatInterface: React.FC<{ sessionId: string }> = ({ sessionId }) => { const [messages, setMessages] = useState<ChatMessage[]>([]); const handleSendMessage = useCallback(async (content: string) => { // Implementation }, [sessionId]); return (
<div>{/* Component JSX */}</div>
); }; ``` ### File Organization ``` rag_system/ ├── agent/ # ReAct agent implementation ├── indexing/ # Document processing and indexing ├── retrieval/ # Search and retrieval components ├── pipelines/ # End-to-end processing pipelines ├── rerankers/ # Result reranking implementations └── utils/ # Shared utilities src/ ├── components/ # React components ├── lib/ # Utility functions and API clients └── app/ # Next.js app router pages ``` ## 🧪 Testing Guidelines ### Unit Tests ```python # Test file: tests/test_embeddings.py import numpy as np import pytest from rag_system.indexing.embedders import HuggingFaceEmbedder def test_embedding_generation(): embedder = HuggingFaceEmbedder("sentence-transformers/all-MiniLM-L6-v2") embeddings = embedder.create_embeddings(["test text"]) assert embeddings.shape[0] == 1 assert embeddings.shape[1] == 384 # Model dimension assert embeddings.dtype == np.float32 ``` ### Integration Tests ```python # Test file: tests/test_integration.py from rag_system.main import get_agent def test_end_to_end_indexing(): """Test complete document indexing pipeline.""" agent = get_agent("test") result = agent.index_documents(["test_document.pdf"]) assert result.success assert len(result.indexed_chunks) > 0 ``` ### Frontend Tests ```typescript // Test file: src/components/__tests__/ChatInterface.test.tsx import { render, screen, fireEvent } from '@testing-library/react'; import { ChatInterface } from '../ChatInterface'; test('sends message when form is submitted', async () => { render(<ChatInterface sessionId="test-session" />); const input = screen.getByPlaceholderText('Type your message...'); const button = screen.getByRole('button', { name: /send/i }); fireEvent.change(input, { target: { value: 'test message' } }); fireEvent.click(button); expect(screen.getByText('test message')).toBeInTheDocument(); }); ``` ## 📖 Documentation Standards ### Code Documentation ```python def create_index( documents: List[str], config: IndexingConfig, progress_callback: Optional[Callable[[float], None]] = None ) -> IndexingResult: """Create a searchable index from 
documents. This function processes documents through the complete indexing pipeline: 1. Text extraction and chunking 2. Embedding generation 3. Vector database storage 4. BM25 index creation Args: documents: List of document file paths to index config: Indexing configuration with model settings and parameters progress_callback: Optional callback function for progress updates Returns: IndexingResult containing success status, metrics, and any errors Raises: IndexingError: If document processing fails ModelLoadError: If embedding model cannot be loaded Example: >>> config = IndexingConfig(embedding_batch_size=32) >>> result = create_index(["doc1.pdf", "doc2.pdf"], config) >>> print(f"Indexed {result.chunk_count} chunks") """ ``` ### API Documentation ```python # Use OpenAPI/FastAPI documentation @app.post("/chat", response_model=ChatResponse) async def chat_endpoint(request: ChatRequest) -> ChatResponse: """Chat with indexed documents. Send a natural language query and receive an AI-generated response based on the indexed document collection. - **query**: The user's question or prompt - **session_id**: Chat session identifier - **search_type**: Type of search (vector, hybrid, bm25) - **retrieval_k**: Number of documents to retrieve Returns a response with the AI-generated answer and source documents. 
""" ``` ## 🔧 Development Tools ### Recommended VS Code Extensions ```json { "recommendations": [ "ms-python.python", "ms-python.pylint", "ms-python.black-formatter", "bradlc.vscode-tailwindcss", "esbenp.prettier-vscode", "ms-vscode.vscode-typescript-next" ] } ``` ### Pre-commit Hooks ```bash # Install pre-commit pip install pre-commit # Set up hooks pre-commit install # Run manually pre-commit run --all-files ``` ### Development Scripts ```bash # Lint Python code python -m pylint rag_system/ # Format Python code python -m black rag_system/ # Type check python -m mypy rag_system/ # Lint TypeScript npm run lint # Format TypeScript npm run format ``` ## 🐛 Issue Reporting ### Bug Reports When reporting bugs, please include: 1. **Environment Information** ``` - OS: macOS 13.4 - Python: 3.11.5 - Node.js: 23.10.0 - Ollama: 0.9.5 ``` 2. **Steps to Reproduce** ``` 1. Start system with `python run_system.py` 2. Upload document via web interface 3. Ask question "What is this document about?" 4. Error occurs during response generation ``` 3. **Expected vs Actual Behavior** 4. **Error Messages and Logs** 5. **Screenshots (if applicable)** ### Feature Requests Include: - **Use Case**: Why is this feature needed? - **Proposed Solution**: How should it work? - **Alternatives**: What other approaches were considered? 
- **Additional Context**: Any relevant examples or references ## 📦 Release Process ### Version Numbering We use semantic versioning (semver): - `MAJOR.MINOR.PATCH` - Major: Breaking changes - Minor: New features (backward compatible) - Patch: Bug fixes ### Release Checklist - [ ] All tests pass - [ ] Documentation updated - [ ] Version bumped in relevant files - [ ] Changelog updated - [ ] Docker images built and tested - [ ] Release notes prepared ## 🤝 Community Guidelines ### Code of Conduct - Be respectful and inclusive - Focus on constructive feedback - Help others learn and grow - Maintain professional communication ### Getting Help - **GitHub Issues**: For bugs and feature requests - **GitHub Discussions**: For questions and general discussion - **Documentation**: Check existing docs first - **Code Review**: Provide thoughtful, actionable feedback ## 🎯 Project Priorities ### Current Focus Areas 1. **Performance Optimization**: Improving indexing and retrieval speed 2. **Model Support**: Adding more embedding and generation models 3. **User Experience**: Enhancing the web interface 4. **Documentation**: Improving setup and usage guides 5. 
**Testing**: Expanding test coverage ### Architecture Goals - **Modularity**: Components should be loosely coupled - **Extensibility**: Easy to add new models and features - **Performance**: Optimize for speed and memory usage - **Reliability**: Robust error handling and recovery - **Privacy**: Keep user data secure and local ## 📚 Additional Resources ### Learning Resources - [RAG System Architecture Overview](Documentation/architecture_overview.md) - [API Reference](Documentation/api_reference.md) - [Deployment Guide](Documentation/deployment_guide.md) - [Troubleshooting Guide](DOCKER_TROUBLESHOOTING.md) ### External References - [LangChain Documentation](https://python.langchain.com/) - [Ollama Documentation](https://ollama.ai/docs) - [Next.js Documentation](https://nextjs.org/docs) - [FastAPI Documentation](https://fastapi.tiangolo.com/) --- ## 🙏 Thank You! Thank you for contributing to LocalGPT! Your contributions help make private document intelligence accessible to everyone. For questions about contributing, please: 1. Check existing documentation 2. Search existing issues 3. Create a new issue with the `question` label 4. Join our community discussions Happy coding! 🚀 ================================================ FILE: DOCKER_README.md ================================================ # 🐳 LocalGPT Docker Deployment Guide This guide covers running LocalGPT using Docker containers with local Ollama for optimal performance. ## 🚀 Quick Start ### Complete Setup (5 Minutes) ```bash # 1. Install Ollama locally curl -fsSL https://ollama.ai/install.sh | sh # 2. Start Ollama server ollama serve # 3. Install required models (in another terminal) ollama pull qwen3:0.6b ollama pull qwen3:8b # 4. Clone and start LocalGPT git clone https://github.com/PromtEngineer/localGPT.git cd localGPT ./start-docker.sh # 5. 
Access the application open http://localhost:3000 ``` ## 📋 Prerequisites - **Docker Desktop** installed and running - **Ollama** installed locally (required for best performance) - **8GB+ RAM** (16GB recommended for larger models) - **10GB+ free disk space** ## 🏗️ Architecture ### Current Setup (Local Ollama + Docker Containers) ``` ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ Frontend │────│ Backend │────│ RAG API │ │ (Container) │ │ (Container) │ │ (Container) │ │ Port: 3000 │ │ Port: 8000 │ │ Port: 8001 │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ API calls ▼ ┌─────────────────┐ │ Ollama │ │ (Local/Host) │ │ Port: 11434 │ └─────────────────┘ ``` **Why Local Ollama?** - ✅ Better performance (direct GPU access) - ✅ Simpler setup (one less container) - ✅ Easier model management - ✅ More reliable connection ## 🛠️ Container Details ### Frontend Container (rag-frontend) - **Image**: Custom Node.js 18 build - **Port**: 3000 - **Purpose**: Next.js web interface - **Health Check**: HTTP GET to / - **Memory**: ~500MB ### Backend Container (rag-backend) - **Image**: Custom Python 3.11 build - **Port**: 8000 - **Purpose**: Session management, chat history, API gateway - **Health Check**: HTTP GET to /health - **Memory**: ~300MB ### RAG API Container (rag-api) - **Image**: Custom Python 3.11 build - **Port**: 8001 - **Purpose**: Document indexing, retrieval, AI processing - **Health Check**: HTTP GET to /models - **Memory**: ~2GB (varies with model usage) ## 📂 Volume Mounts & Data ### Persistent Data - `./lancedb/` → Vector database storage - `./index_store/` → Document indexes and metadata - `./shared_uploads/` → Uploaded document files - `./backend/chat_data.db` → SQLite chat history database ### Shared Between Containers All containers share access to document storage and databases through bind mounts. 
## 🔧 Configuration ### Environment Variables (docker.env) ```bash # Ollama Configuration OLLAMA_HOST=http://host.docker.internal:11434 # Service Configuration NODE_ENV=production RAG_API_URL=http://rag-api:8001 NEXT_PUBLIC_API_URL=http://localhost:8000 # Database Paths (inside containers) DATABASE_PATH=/app/backend/chat_data.db LANCEDB_PATH=/app/lancedb UPLOADS_PATH=/app/shared_uploads ``` ### Model Configuration The system uses these models by default: - **Embedding**: `Qwen/Qwen3-Embedding-0.6B` (1024 dimensions) - **Generation**: `qwen3:0.6b` (fast) or `qwen3:8b` (high quality) - **Reranking**: Built-in cross-encoder ## 🎯 Management Commands ### Start/Stop Services ```bash # Start all services ./start-docker.sh # Stop all services ./start-docker.sh stop # Restart services ./start-docker.sh stop && ./start-docker.sh ``` ### Monitor Services ```bash # Check container status ./start-docker.sh status docker compose ps # View live logs ./start-docker.sh logs docker compose logs -f # View specific service logs docker compose logs -f rag-api docker compose logs -f backend docker compose logs -f frontend ``` ### Manual Docker Compose ```bash # Start manually docker compose --env-file docker.env up --build -d # Stop manually docker compose down # Rebuild specific service docker compose build --no-cache rag-api docker compose up -d rag-api ``` ### Health Checks ```bash # Test all endpoints curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" ``` ## 🐞 Debugging ### Access Container Shells ```bash # RAG API container (most debugging happens here) docker compose exec rag-api bash # Backend container docker compose exec backend bash # Frontend container docker compose exec frontend sh ``` ### Common Debug Commands ```bash # Test RAG system initialization docker compose exec rag-api python -c " 
from rag_system.main import get_agent agent = get_agent('default') print('✅ RAG System OK') " # Test Ollama connection from container docker compose exec rag-api curl http://host.docker.internal:11434/api/tags # Check environment variables docker compose exec rag-api env | grep OLLAMA # View Python packages docker compose exec rag-api pip list | grep -E "(torch|transformers|lancedb)" ``` ### Resource Monitoring ```bash # Monitor container resources docker stats # Check disk usage docker system df df -h ./lancedb ./shared_uploads # Check memory usage by service docker stats --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}" ``` ## 🚨 Troubleshooting ### Common Issues #### Container Won't Start ```bash # Check logs for specific error docker compose logs [service-name] # Rebuild from scratch ./start-docker.sh stop docker system prune -f ./start-docker.sh # Check for port conflicts lsof -i :3000 -i :8000 -i :8001 ``` #### Can't Connect to Ollama ```bash # Verify Ollama is running curl http://localhost:11434/api/tags # Restart Ollama pkill ollama ollama serve # Test from container docker compose exec rag-api curl http://host.docker.internal:11434/api/tags ``` #### Memory Issues ```bash # Check memory usage docker stats --no-stream free -h # On host # Increase Docker memory limit # Docker Desktop → Settings → Resources → Memory → 8GB+ # Use smaller models ollama pull qwen3:0.6b # Instead of qwen3:8b ``` #### Frontend Build Errors ```bash # Clean build docker compose build --no-cache frontend docker compose up -d frontend # Check frontend logs docker compose logs frontend ``` #### Database/Storage Issues ```bash # Check file permissions ls -la backend/chat_data.db ls -la lancedb/ # Reset permissions chmod 664 backend/chat_data.db chmod -R 755 lancedb/ shared_uploads/ # Test database access docker compose exec backend sqlite3 /app/backend/chat_data.db ".tables" ``` ### Performance Issues #### Slow Response Times - Use faster models: `qwen3:0.6b` instead 
of `qwen3:8b` - Increase Docker memory allocation - Ensure SSD storage for databases - Monitor with `docker stats` #### High Memory Usage - Reduce batch sizes in configuration - Use smaller embedding models - Clear unused Docker resources: `docker system prune` ### Complete Reset ```bash # Nuclear option - reset everything ./start-docker.sh stop docker system prune -a --volumes rm -rf lancedb/* shared_uploads/* backend/chat_data.db ./start-docker.sh ``` ## 🏆 Success Criteria Your Docker deployment is successful when: - ✅ `./start-docker.sh status` shows all containers healthy - ✅ All health checks pass (see commands above) - ✅ You can access http://localhost:3000 - ✅ You can upload documents and create indexes - ✅ You can chat with your documents - ✅ No errors in container logs ### Performance Benchmarks **Good Performance:** - Container startup: < 2 minutes - Index creation: < 2 min per 100MB document - Query response: < 30 seconds - Memory usage: < 4GB total containers **Optimal Performance:** - Container startup: < 1 minute - Index creation: < 1 min per 100MB document - Query response: < 10 seconds - Memory usage: < 2GB total containers ## 📚 Additional Resources - **Detailed Troubleshooting**: See `DOCKER_TROUBLESHOOTING.md` - **Complete Documentation**: See `Documentation/docker_usage.md` - **System Architecture**: See `Documentation/architecture_overview.md` - **Direct Development**: See main `README.md` for non-Docker setup --- **Happy Dockerizing! 🐳** Need help? Check the troubleshooting guide or open an issue. ================================================ FILE: DOCKER_TROUBLESHOOTING.md ================================================ # 🐳 Docker Troubleshooting Guide - LocalGPT _Last updated: 2025-01-07_ This guide helps diagnose and fix Docker-related issues with LocalGPT's containerized deployment. 
--- ## 🏁 Quick Health Check ### System Status Check ```bash # Check Docker daemon docker version # Check Ollama status curl http://localhost:11434/api/tags # Check containers ./start-docker.sh status # Test all endpoints curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" ``` ### Expected Success Output ``` ✅ Frontend OK ✅ Backend OK ✅ RAG API OK ✅ Ollama OK ``` --- ## 🚨 Common Issues & Solutions ### 1. Docker Daemon Issues #### Problem: "Cannot connect to Docker daemon" ``` Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running? ``` #### Solution A: Restart Docker Desktop (macOS/Windows) ```bash # Quit Docker Desktop completely # macOS: Click Docker icon → "Quit Docker Desktop" # Windows: Right-click Docker icon → "Quit Docker Desktop" # Wait for it to fully shut down sleep 10 # Start Docker Desktop open -a Docker # macOS # Windows: Click Docker Desktop from Start menu # Wait for Docker to be ready (2-3 minutes) docker version ``` #### Solution B: Linux Docker Service ```bash # Check Docker service status sudo systemctl status docker # Restart Docker service sudo systemctl restart docker # Enable auto-start sudo systemctl enable docker # Test connection docker version ``` #### Solution C: Hard Reset ```bash # Kill all Docker processes sudo pkill -f docker # Remove socket files sudo rm -f /var/run/docker.sock sudo rm -f ~/.docker/run/docker.sock # macOS (per-user Docker Desktop socket) # Restart Docker Desktop open -a Docker # macOS ``` ### 2.
Ollama Connection Issues #### Problem: RAG API can't connect to Ollama ``` ConnectionError: Failed to connect to Ollama at http://host.docker.internal:11434 ``` #### Solution A: Verify Ollama is Running ```bash # Check if Ollama is running curl http://localhost:11434/api/tags # If not running, start it ollama serve # Install required models ollama pull qwen3:0.6b ollama pull qwen3:8b ``` #### Solution B: Test from Container ```bash # Test Ollama connection from RAG API container docker compose exec rag-api curl http://host.docker.internal:11434/api/tags # If this fails, check Docker network settings docker network ls docker network inspect rag_system_old_default ``` #### Solution C: Alternative Ollama Host ```bash # Edit docker.env to use different host echo "OLLAMA_HOST=http://172.17.0.1:11434" >> docker.env # Or use IP address echo "OLLAMA_HOST=http://$(ipconfig getifaddr en0):11434" >> docker.env # macOS ``` ### 3. Container Build Failures #### Problem: Frontend build fails ``` ERROR: Failed to build frontend container ``` #### Solution: Clean Build ```bash # Stop containers ./start-docker.sh stop # Clean Docker cache docker system prune -f docker builder prune -f # Rebuild frontend only docker compose build --no-cache frontend docker compose up -d frontend # Check logs docker compose logs frontend ``` #### Problem: Python package installation fails ``` ERROR: Could not install packages due to an EnvironmentError ``` #### Solution: Update Dependencies ```bash # Check requirements file exists ls -la requirements-docker.txt # Test package installation locally pip install -r requirements-docker.txt --dry-run # Rebuild with updated base image docker compose build --no-cache --pull rag-api ``` ### 4. 
Port Conflicts #### Problem: "Port already in use" ``` Error starting userland proxy: listen tcp4 0.0.0.0:3000: bind: address already in use ``` #### Solution: Find and Kill Conflicting Processes ```bash # Check what's using the ports lsof -i :3000 -i :8000 -i :8001 # Kill specific processes pkill -f "npm run dev" # Frontend pkill -f "server.py" # Backend pkill -f "api_server" # RAG API # Or kill by port sudo kill -9 $(lsof -t -i:3000) sudo kill -9 $(lsof -t -i:8000) sudo kill -9 $(lsof -t -i:8001) # Restart containers ./start-docker.sh ``` ### 5. Memory Issues #### Problem: Containers crash due to OOM (Out of Memory) ``` Container killed due to memory limit ``` #### Solution: Increase Docker Memory ```bash # Check current memory usage docker stats --no-stream # Increase Docker Desktop memory allocation # Docker Desktop → Settings → Resources → Memory → 8GB+ # Monitor memory usage docker stats # Use smaller models if needed ollama pull qwen3:0.6b # Instead of qwen3:8b ``` #### Problem: System running slow ```bash # Check host memory free -h # Linux vm_stat # macOS # Clean up Docker resources docker system prune -f docker volume prune -f ``` ### 6. 
Volume Mount Issues #### Problem: Permission denied accessing files ``` Permission denied: /app/lancedb ``` #### Solution: Fix Permissions ```bash # Create directories if they don't exist mkdir -p lancedb index_store shared_uploads backend # Fix permissions chmod -R 755 lancedb index_store shared_uploads chmod 664 backend/chat_data.db # Check ownership ls -la lancedb/ shared_uploads/ backend/ # Reset permissions if needed sudo chown -R $USER:$USER lancedb shared_uploads backend ``` #### Problem: Database file not found ``` No such file or directory: '/app/backend/chat_data.db' ``` #### Solution: Initialize Database ```bash # Create empty database file touch backend/chat_data.db # Or initialize with schema python -c " from backend.database import ChatDatabase db = ChatDatabase() db.init_database() print('Database initialized') " # Restart containers ./start-docker.sh stop ./start-docker.sh ``` --- ## 🔍 Advanced Debugging ### Container-Level Debugging #### Access Container Shells ```bash # RAG API container (most issues happen here) docker compose exec rag-api bash # Check environment variables docker compose exec rag-api env | grep -E "(OLLAMA|RAG|NODE)" # Test Python imports docker compose exec rag-api python -c " import sys print('Python version:', sys.version) from rag_system.main import get_agent print('✅ RAG system imports work') " # Backend container docker compose exec backend bash python -c " from backend.database import ChatDatabase print('✅ Database imports work') " # Frontend container docker compose exec frontend sh npm --version node --version ``` #### Check Container Resources ```bash # Monitor real-time resource usage docker stats # Check individual container health docker compose ps docker inspect rag-api --format='{{.State.Health.Status}}' # View container configurations docker compose config ``` #### Network Debugging ```bash # Check network connectivity docker compose exec rag-api ping backend docker compose exec backend ping rag-api docker 
compose exec rag-api ping host.docker.internal # Check DNS resolution docker compose exec rag-api nslookup host.docker.internal # Test HTTP connections docker compose exec rag-api curl -v http://backend:8000/health docker compose exec rag-api curl -v http://host.docker.internal:11434/api/tags ``` ### Log Analysis #### Container Logs ```bash # View all logs ./start-docker.sh logs # Follow specific service logs docker compose logs -f rag-api docker compose logs -f backend docker compose logs -f frontend # Search for errors docker compose logs rag-api 2>&1 | grep -i error docker compose logs backend 2>&1 | grep -i "traceback\|error" # Save logs to file docker compose logs > docker-debug.log 2>&1 ``` #### System Logs ```bash # Docker daemon logs (Linux) journalctl -u docker.service -f # macOS: Check Console app for Docker logs # Windows: Check Event Viewer ``` --- ## 🧪 Testing & Validation ### Manual Container Testing #### Test Individual Containers ```bash # Test RAG API alone docker build -f Dockerfile.rag-api -t test-rag-api . docker run --rm -p 8001:8001 -e OLLAMA_HOST=http://host.docker.internal:11434 test-rag-api & sleep 30 curl http://localhost:8001/models pkill -f test-rag-api # Test Backend alone docker build -f Dockerfile.backend -t test-backend . docker run --rm -p 8000:8000 test-backend & sleep 30 curl http://localhost:8000/health pkill -f test-backend ``` #### Integration Testing ```bash # Full system test ./start-docker.sh # Wait for all services to be ready sleep 60 # Test complete workflow curl -X POST http://localhost:8000/sessions \ -H "Content-Type: application/json" \ -d '{"title": "Test Session"}' # Test document upload (if you have a test PDF) # curl -X POST http://localhost:8000/upload -F "file=@test.pdf" # Clean up ./start-docker.sh stop ``` ### Automated Testing Script Create `test-docker-health.sh`: ```bash #!/bin/bash set -e echo "🐳 Docker Health Test Starting..." 
# Start containers ./start-docker.sh # Wait for services echo "⏳ Waiting for services to start..." sleep 60 # Test endpoints echo "🔍 Testing endpoints..." curl -f http://localhost:3000 && echo "✅ Frontend OK" || echo "❌ Frontend FAIL" curl -f http://localhost:8000/health && echo "✅ Backend OK" || echo "❌ Backend FAIL" curl -f http://localhost:8001/models && echo "✅ RAG API OK" || echo "❌ RAG API FAIL" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" || echo "❌ Ollama FAIL" # Test container health echo "🔍 Checking container health..." docker compose ps echo "🎉 Health test complete!" ``` --- ## 🔄 Recovery Procedures ### Complete System Reset #### Soft Reset ```bash # Stop containers ./start-docker.sh stop # Clean up Docker resources docker system prune -f # Restart containers ./start-docker.sh ``` #### Hard Reset (⚠️ Deletes all data) ```bash # Stop everything ./start-docker.sh stop # Remove all containers, images, and volumes docker system prune -a --volumes # Remove local data (CAUTION: This deletes all your documents and chat history) rm -rf lancedb/* shared_uploads/* backend/chat_data.db # Rebuild from scratch ./start-docker.sh ``` #### Selective Reset Reset only specific components: ```bash # Reset just the database ./start-docker.sh stop rm backend/chat_data.db ./start-docker.sh # Reset just vector storage ./start-docker.sh stop rm -rf lancedb/* ./start-docker.sh # Reset just uploaded documents rm -rf shared_uploads/* ``` --- ## 📊 Performance Optimization ### Resource Monitoring ```bash # Monitor containers continuously watch -n 5 'docker stats --no-stream' # Check disk usage docker system df du -sh lancedb shared_uploads backend # Monitor host resources htop # Linux top # macOS/Windows ``` ### Performance Tuning ```bash # Use smaller models for better performance ollama pull qwen3:0.6b # Instead of qwen3:8b # Reduce Docker memory if needed # Docker Desktop → Settings → Resources → Memory # Clean up regularly docker system prune -f docker volume 
prune -f ``` --- ## 🆘 When All Else Fails ### Alternative Deployment Options #### 1. Direct Development (No Docker) ```bash # Stop Docker containers ./start-docker.sh stop # Use direct development instead python run_system.py ``` #### 2. Minimal Docker (RAG API only) ```bash # Run only RAG API in Docker docker build -f Dockerfile.rag-api -t rag-api . docker run -p 8001:8001 rag-api # Run other components directly cd backend && python server.py & npm run dev ``` #### 3. Hybrid Approach ```bash # Run some services in Docker, others directly docker compose up -d rag-api cd backend && python server.py & npm run dev ``` ### Getting Help #### Diagnostic Information to Collect ```bash # System information docker version docker compose version uname -a # Container information docker compose ps docker compose config # Resource information docker stats --no-stream docker system df # Error logs docker compose logs > docker-errors.log 2>&1 ``` #### Support Channels 1. **Check GitHub Issues**: Search existing issues for similar problems 2. **Documentation**: Review the complete documentation in `Documentation/` 3. **Create Issue**: Include diagnostic information above --- ## ✅ Success Checklist Your Docker deployment is working correctly when: - ✅ `docker version` shows Docker is running - ✅ `curl http://localhost:11434/api/tags` shows Ollama is accessible - ✅ `./start-docker.sh status` shows all containers healthy - ✅ All health check URLs return 200 OK - ✅ You can access the frontend at http://localhost:3000 - ✅ You can create document indexes successfully - ✅ You can chat with your documents - ✅ No error messages in container logs **If all boxes are checked, your Docker deployment is successful! 🎉** --- **Still having issues?** Check the main `DOCKER_README.md` or create an issue with your diagnostic information. 
================================================ FILE: Dockerfile.backend ================================================ FROM python:3.11-slim # Set working directory WORKDIR /app # Install system dependencies RUN apt-get update && apt-get install -y \ curl \ && rm -rf /var/lib/apt/lists/* # Copy requirements and install Python dependencies (using Docker-specific requirements) COPY requirements-docker.txt ./requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Copy backend code and dependencies COPY backend/ ./backend/ COPY rag_system/ ./rag_system/ # Create necessary directories and initialize database RUN mkdir -p shared_uploads logs backend # Expose port EXPOSE 8000 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \ CMD curl -f http://localhost:8000/health || exit 1 # Run the backend server WORKDIR /app/backend CMD ["python", "server.py"] ================================================ FILE: Dockerfile.frontend ================================================ FROM node:18-alpine # Set working directory WORKDIR /app # Install dependencies (including dev dependencies for build) COPY package.json package-lock.json ./ RUN npm ci # Copy source code and configuration files COPY src/ ./src/ COPY public/ ./public/ COPY next.config.ts ./ COPY tsconfig.json ./ COPY tailwind.config.js ./ COPY postcss.config.mjs ./ COPY eslint.config.mjs ./ # Build the application (skip linting for Docker) ENV NEXT_LINT=false RUN npm run build # Expose port EXPOSE 3000 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ CMD curl -f http://localhost:3000 || exit 1 # Start the application CMD ["npm", "start"] ================================================ FILE: Dockerfile.rag-api ================================================ FROM python:3.11-slim # Set working directory WORKDIR /app # Install system dependencies RUN apt-get update && apt-get install -y \ curl \ build-essential \ && rm -rf 
/var/lib/apt/lists/* # Copy requirements and install Python dependencies (using Docker-specific requirements) COPY requirements-docker.txt ./requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Copy RAG system code and backend dependencies COPY rag_system/ ./rag_system/ COPY backend/ ./backend/ # Create necessary directories RUN mkdir -p lancedb index_store shared_uploads logs # Expose port EXPOSE 8001 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ CMD curl -f http://localhost:8001/models || exit 1 # Run the RAG API server CMD ["python", "-m", "rag_system.api_server"] ================================================ FILE: Documentation/api_reference.md ================================================ # 📚 API Reference (Backend & RAG API) _Last updated: 2025-01-07_ --- ## Backend HTTP API (Python `backend/server.py`) **Base URL**: `http://localhost:8000` | Endpoint | Method | Description | Request Body | Success Response | |----------|--------|-------------|--------------|------------------| | `/health` | GET | Health probe incl. 
Ollama status & DB stats | – | 200 JSON `{ status, ollama_running, available_models, database_stats }` | | `/chat` | POST | Stateless chat (no session) | `{ message:str, model?:str, conversation_history?:[{role,content}]}` | 200 `{ response:str, model:str, message_count:int }` | | `/sessions` | GET | List all sessions | – | `{ sessions:ChatSession[], total:int }` | | `/sessions` | POST | Create session | `{ title?:str, model?:str }` | 201 `{ session:ChatSession, session_id }` | | `/sessions/<session_id>` | GET | Get session + msgs | – | `{ session, messages }` | | `/sessions/<session_id>` | DELETE | Delete session | – | `{ message, deleted_session_id }` | | `/sessions/<session_id>/rename` | POST | Rename session | `{ title:str }` | `{ message, session }` | | `/sessions/<session_id>/messages` | POST | Session chat (builds history) | See ChatRequest + retrieval opts ▼ | `{ response, session, user_message_id, ai_message_id }` | | `/sessions/<session_id>/documents` | GET | List uploaded docs | – | `{ files:string[], file_count:int, session }` | | `/sessions/<session_id>/upload` | POST multipart | Upload docs to session | field `files[]` | `{ message, uploaded_files, processing_results?, session_documents?, total_session_documents? }` | | `/sessions/<session_id>/index` | POST | Trigger RAG indexing for session | `{ latechunk?, doclingChunk?, chunkSize?, ...
}` | `{ message }` | | `/sessions/<session_id>/indexes` | GET | List indexes linked to session | – | `{ indexes, total }` | | `/sessions/<session_id>/indexes/<index_id>` | POST | Link index to session | – | `{ message }` | | `/sessions/cleanup` | GET | Remove empty sessions | – | `{ message, cleanup_count }` | | `/models` | GET | List generation / embedding models | – | `{ generation_models:str[], embedding_models:str[] }` | | `/indexes` | GET | List all indexes | – | `{ indexes, total }` | | `/indexes` | POST | Create index | `{ name:str, description?:str, metadata?:dict }` | `{ index_id }` | | `/indexes/<index_id>` | GET | Get single index | – | `{ index }` | | `/indexes/<index_id>` | DELETE | Delete index | – | `{ message, index_id }` | | `/indexes/<index_id>/upload` | POST multipart | Upload docs to index | field `files[]` | `{ message, uploaded_files }` | | `/indexes/<index_id>/build` | POST | Build / rebuild index (RAG) | `{ latechunk?, doclingChunk?, ...}` | 200 `{ response?, message?}` (idempotent) | --- ## RAG API (Python `rag_system/api_server.py`) **Base URL**: `http://localhost:8001` | Endpoint | Method | Description | Request Body | Success Response | |----------|--------|-------------|--------------|------------------| | `/chat` | POST | Run RAG query with full pipeline | See RAG ChatRequest ▼ | `{ answer:str, source_documents:[], reasoning?:str, confidence?:float }` | | `/chat/stream` | POST | Run RAG query with SSE streaming | Same as /chat | Server-Sent Events stream | | `/index` | POST | Index documents with full configuration | See Index Request ▼ | `{ message:str, indexed_files:[], table_name:str }` | | `/models` | GET | List available models | – | `{ generation_models:str[], embedding_models:str[] }` | ### RAG ChatRequest (Advanced Options) ```jsonc { "query": "string", // Required – user question "session_id": "string", // Optional – for session context "table_name": "string", // Optional – specific index table "compose_sub_answers": true, // Optional – compose sub-answers "query_decompose": true, // Optional – 
decompose complex queries "ai_rerank": false, // Optional – AI-powered reranking "context_expand": false, // Optional – context expansion "verify": true, // Optional – answer verification "retrieval_k": 20, // Optional – number of chunks to retrieve "context_window_size": 1, // Optional – context window size "reranker_top_k": 10, // Optional – top-k after reranking "search_type": "hybrid", // Optional – "hybrid|dense|fts" "dense_weight": 0.7, // Optional – dense search weight (0-1) "force_rag": false, // Optional – bypass triage, force RAG "provence_prune": false, // Optional – sentence-level pruning "provence_threshold": 0.8, // Optional – pruning threshold "model": "qwen3:8b" // Optional – generation model override } ``` ### Index Request (Document Indexing) ```jsonc { "file_paths": ["path1.pdf", "path2.pdf"], // Required – files to index "session_id": "string", // Required – session identifier "chunk_size": 512, // Optional – chunk size (default: 512) "chunk_overlap": 64, // Optional – chunk overlap (default: 64) "enable_latechunk": true, // Optional – enable late chunking "enable_docling_chunk": false, // Optional – enable DocLing chunking "retrieval_mode": "hybrid", // Optional – "hybrid|dense|fts" "window_size": 2, // Optional – context window "enable_enrich": true, // Optional – enable enrichment "embedding_model": "Qwen/Qwen3-Embedding-0.6B", // Optional – embedding model "enrich_model": "qwen3:0.6b", // Optional – enrichment model "overview_model_name": "qwen3:0.6b", // Optional – overview model "batch_size_embed": 50, // Optional – embedding batch size "batch_size_enrich": 25 // Optional – enrichment batch size } ``` > **Note on CORS** – All endpoints include `Access-Control-Allow-Origin: *` header. --- ## Frontend Wrapper (`src/lib/api.ts`) The React/Next.js frontend calls the backend via a typed wrapper. 
Important methods & payloads: | Method | Backend Endpoint | Payload Shape | |--------|------------------|---------------| | `checkHealth()` | `/health` | – | | `sendMessage({ message, model?, conversation_history? })` | `/chat` | ChatRequest | | `getSessions()` | `/sessions` | – | | `createSession(title?, model?)` | `/sessions` | – | | `getSession(sessionId)` | `/sessions/<session_id>` | – | | `sendSessionMessage(sessionId, message, opts)` | `/sessions/<session_id>/messages` | `ChatRequest + retrieval opts` | | `uploadFiles(sessionId, files[])` | `/sessions/<session_id>/upload` | multipart | | `indexDocuments(sessionId)` | `/sessions/<session_id>/index` | opts similar to buildIndex | | `buildIndex(indexId, opts)` | `/indexes/<index_id>/build` | Index build options | | `linkIndexToSession` | `/sessions/<session_id>/indexes/<index_id>` | – | --- ## Payload Definitions (Canonical) ### ChatRequest (frontend ⇄ backend) ```jsonc { "message": "string", // Required – raw user text "model": "string", // Optional – generation model id "conversation_history": [ // Optional – prior turn list { "role": "user|assistant", "content": "string" } ] } ``` ### Session Chat Extended Options ```jsonc { "composeSubAnswers": true, "decompose": true, "aiRerank": false, "contextExpand": false, "verify": true, "retrievalK": 10, "contextWindowSize": 5, "rerankerTopK": 20, "searchType": "fts|hybrid|dense", "denseWeight": 0.75, "force_rag": false } ``` ### Index Build Options ```jsonc { "latechunk": true, "doclingChunk": false, "chunkSize": 512, "chunkOverlap": 64, "retrievalMode": "hybrid|dense|fts", "windowSize": 2, "enableEnrich": true, "embeddingModel": "Qwen/Qwen3-Embedding-0.6B", "enrichModel": "qwen3:0.6b", "overviewModel": "qwen3:0.6b", "batchSizeEmbed": 64, "batchSizeEnrich": 32 } ``` --- _This reference is derived from static code analysis of `backend/server.py`, `rag_system/api_server.py`, and `src/lib/api.ts`. 
Keep it in sync with route or type changes._ ================================================ FILE: Documentation/architecture_overview.md ================================================ # 🏗️ System Architecture Overview _Last updated: 2025-07-06_ This document explains how data and control flow through the Advanced **RAG System** — from a user's browser all the way to model inference and back. It is intended as the **ground-truth reference** for engineers and integrators. --- ## 1. Bird's-Eye Diagram ```mermaid flowchart LR subgraph Client U["👤 User (Browser)"] FE["Next.js Front-end\nReact Components"] U --> FE end subgraph Network FE -->|HTTP/JSON| BE["Python HTTP Server\nbackend/server.py"] end subgraph Core["rag_system core package"] BE --> LOOP["Agent Loop\n(rag_system/agent/loop.py)"] BE --> IDX["Indexing Pipeline\n(pipelines/indexing_pipeline.py)"] LOOP --> RP["Retrieval Pipeline\n(pipelines/retrieval_pipeline.py)"] LOOP --> VER["Verifier (Grounding Check)"] RP --> RET["Retrievers\nBM25 | Dense | Hybrid"] RP --> RER["AI Reranker"] RP --> SYNT["Answer Synthesiser"] end subgraph Storage LDB[("LanceDB Vector Tables")] SQL[("SQLite – chat & metadata")] end subgraph Models OLLAMA["Ollama Server\n(qwen3, etc.)"] HF["HuggingFace Hosted\nEmbedding/Reranker Models"] end %% data edges IDX -->|chunks & embeddings| LDB RET -->|vector search| LDB LOOP -->|LLM calls| OLLAMA RP -->|LLM calls| OLLAMA VER -->|LLM calls| OLLAMA RP -->|rerank| HF BE -->|CRUD| SQL ``` --- ### Data-flow Narrative 1. **User** interacts with the Next.js UI; messages are posted via `src/lib/api.ts`. 2. **backend/server.py** receives JSON over HTTP, applies CORS, and proxies the request into `rag_system`. 3. **Agent Loop** decides (via _Triage_) whether to perform Retrieval-Augmented Generation (RAG) or direct LLM answering. 4. If RAG is chosen: 1. **Retrieval Pipeline** fetches candidates from **LanceDB** using BM25 + dense vectors. 2. **AI Reranker** (HF model) sorts snippets. 3. 
**Answer Synthesiser** calls **Ollama** to write the final answer. 5. Answers can be **Verified** for grounding (optional flag). 6. Index-building is an offline path triggered from the UI — PDF/📄 files are chunked, embedded and stored in LanceDB. --- ## 2. Component Documents The table below links to deep-dives for each major component. | **Component** | **Documentation** | |---------------|-------------------| | Agent Loop | [`system_overview.md`](system_overview.md) | | Indexing Pipeline | [`indexing_pipeline.md`](indexing_pipeline.md) | | Retrieval Pipeline | [`retrieval_pipeline.md`](retrieval_pipeline.md) | | Verifier | [`verifier.md`](verifier.md) | | Triage System | [`triage_system.md`](triage_system.md) | --- > **Change-management**: whenever architecture changes (new micro-service, different DB, etc.) update this overview diagram first, then individual component docs. ================================================ FILE: Documentation/deployment_guide.md ================================================ # 🚀 RAG System Deployment Guide _Last updated: 2025-01-07_ This guide provides comprehensive instructions for deploying the RAG system using both Docker and direct development approaches. --- ## 🎯 Deployment Options ### Option 1: Docker Deployment (Production) 🐳 - **Best for**: Production environments, containerized deployments, scaling - **Pros**: Isolated, reproducible, easy to manage - **Cons**: Slightly more complex setup, resource overhead ### Option 2: Direct Development (Development) 💻 - **Best for**: Development, debugging, customization - **Pros**: Direct access to code, faster iteration, easier debugging - **Cons**: More dependencies to manage --- ## 1. 
Prerequisites ### 1.1 System Requirements #### **Minimum Requirements** - **CPU**: 4 cores, 2.5GHz+ - **RAM**: 8GB (16GB recommended) - **Storage**: 50GB free space - **OS**: Linux, macOS, or Windows with WSL2 #### **Recommended Requirements** - **CPU**: 8+ cores, 3.0GHz+ - **RAM**: 32GB+ (for large models) - **Storage**: 200GB+ SSD - **GPU**: NVIDIA GPU with 8GB+ VRAM (optional, for acceleration) ### 1.2 Common Dependencies **Both deployment methods require:** ```bash # Ollama (required for both approaches) curl -fsSL https://ollama.ai/install.sh | sh # Git for cloning git 2.30+ ``` ### 1.3 Docker-Specific Dependencies **For Docker deployment:** ```bash # Docker & Docker Compose Docker Engine 24.0+ Docker Compose 2.20+ ``` ### 1.4 Direct Development Dependencies **For direct development:** ```bash # Python & Node.js Python 3.8+ Node.js 16+ npm 8+ ``` --- ## 2. 🐳 Docker Deployment ### 2.1 Installation #### **Step 1: Install Docker** **Ubuntu/Debian:** ```bash # Install Docker curl -fsSL https://get.docker.com -o get-docker.sh sudo sh get-docker.sh sudo usermod -aG docker $USER newgrp docker # Install Docker Compose V2 sudo apt-get update sudo apt-get install docker-compose-plugin ``` **macOS:** ```bash # Install Docker Desktop brew install --cask docker # Or download from: https://www.docker.com/products/docker-desktop ``` **Windows:** ```bash # Install Docker Desktop with WSL2 backend # Download from: https://www.docker.com/products/docker-desktop ``` #### **Step 2: Clone Repository** ```bash git clone https://github.com/PromtEngineer/localGPT.git cd localGPT ``` #### **Step 3: Install Ollama** ```bash # Install Ollama (runs locally even with Docker) curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama ollama serve # In another terminal, install models ollama pull qwen3:0.6b ollama pull qwen3:8b ``` #### **Step 4: Launch Docker System** ```bash # Start all containers using the convenience script ./start-docker.sh # Or manually: docker compose --env-file 
docker.env up --build -d ``` #### **Step 5: Verify Deployment** ```bash # Check container status docker compose ps # Test all endpoints curl http://localhost:3000 # Frontend curl http://localhost:8000/health # Backend curl http://localhost:8001/models # RAG API curl http://localhost:11434/api/tags # Ollama ``` ### 2.2 Docker Management #### **Container Operations** ```bash # Start system ./start-docker.sh # Stop system ./start-docker.sh stop # View logs ./start-docker.sh logs # Check status ./start-docker.sh status # Manual Docker Compose commands docker compose ps # Check status docker compose logs -f # Follow logs docker compose down # Stop all containers docker compose up --build -d # Rebuild and restart ``` #### **Individual Container Management** ```bash # Restart specific service docker compose restart rag-api # View specific service logs docker compose logs -f backend # Execute commands in container docker compose exec rag-api python -c "print('Hello')" ``` --- ## 3. 💻 Direct Development ### 3.1 Installation #### **Step 1: Install Dependencies** **Python Dependencies:** ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Create virtual environment (recommended) python -m venv venv source venv/bin/activate # On Windows: venv\Scripts\activate # Install Python packages pip install -r requirements.txt ``` **Node.js Dependencies:** ```bash # Install Node.js dependencies npm install ``` #### **Step 2: Install and Configure Ollama** ```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama ollama serve # In another terminal, install models ollama pull qwen3:0.6b ollama pull qwen3:8b ``` #### **Step 3: Launch System** **Option A: Integrated Launcher (Recommended)** ```bash # Start all components with one command python run_system.py ``` **Option B: Manual Component Startup** ```bash # Terminal 1: RAG API python -m rag_system.api_server # Terminal 2: Backend cd backend && python server.py # 
Terminal 3: Frontend npm run dev # Access at http://localhost:3000 ``` #### **Step 4: Verify Installation** ```bash # Check system health python system_health_check.py # Test endpoints curl http://localhost:3000 # Frontend curl http://localhost:8000/health # Backend curl http://localhost:8001/models # RAG API ``` ### 3.2 Direct Development Management #### **System Operations** ```bash # Start system python run_system.py # Check system health python system_health_check.py # Stop system # Press Ctrl+C in terminal running run_system.py ``` #### **Individual Component Management** ```bash # Start components individually python -m rag_system.api_server # RAG API on port 8001 cd backend && python server.py # Backend on port 8000 npm run dev # Frontend on port 3000 # Development tools npm run build # Build frontend for production pip install -r requirements.txt --upgrade # Update Python packages ``` --- ## 4. Architecture Comparison ### 4.1 Docker Architecture ```mermaid graph TB subgraph "Docker Containers" Frontend[Frontend Container<br/>Next.js<br/>Port 3000] Backend[Backend Container<br/>Python API<br/>Port 8000] RAG[RAG API Container<br/>Document Processing<br/>Port 8001] end subgraph "Local System" Ollama[Ollama Server<br/>Port 11434] end Frontend --> Backend Backend --> RAG RAG --> Ollama ``` ### 4.2 Direct Development Architecture ```mermaid graph TB subgraph "Local Processes" Frontend[Next.js Dev Server<br/>Port 3000] Backend[Python Backend<br/>Port 8000] RAG[RAG API<br/>Port 8001] Ollama[Ollama Server<br/>Port 11434] end Frontend --> Backend Backend --> RAG RAG --> Ollama ``` --- ## 5. Configuration ### 5.1 Environment Variables #### **Docker Configuration (`docker.env`)** ```bash # Ollama Configuration OLLAMA_HOST=http://host.docker.internal:11434 # Service Configuration NODE_ENV=production RAG_API_URL=http://rag-api:8001 NEXT_PUBLIC_API_URL=http://localhost:8000 ``` #### **Direct Development Configuration** ```bash # Environment variables are set automatically by run_system.py # Override in environment if needed: export OLLAMA_HOST=http://localhost:11434 export RAG_API_URL=http://localhost:8001 ``` ### 5.2 Model Configuration #### **Default Models** ```python # Embedding Models EMBEDDING_MODELS = [ "Qwen/Qwen3-Embedding-0.6B", # Fast, 1024 dimensions "Qwen/Qwen3-Embedding-4B", # High quality, 2048 dimensions ] # Generation Models GENERATION_MODELS = [ "qwen3:0.6b", # Fast responses "qwen3:8b", # High quality ] ``` ### 5.3 Performance Tuning #### **Memory Settings** ```bash # For Docker: Increase memory allocation # Docker Desktop → Settings → Resources → Memory → 16GB+ # For Direct Development: Monitor with htop # or top on macOS ``` #### **Model Settings** ```python # Batch sizes (adjust based on available RAM) EMBEDDING_BATCH_SIZE = 50 # Reduce if OOM ENRICHMENT_BATCH_SIZE = 25 # Reduce if OOM # Chunk settings CHUNK_SIZE = 512 # Text chunk size CHUNK_OVERLAP = 64 # Overlap between chunks ``` --- ## 6.
Operational Procedures ### 6.1 System Monitoring #### **Health Checks** ```bash # Comprehensive system check curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" ``` #### **Performance Monitoring** ```bash # Docker monitoring docker stats # Direct development monitoring htop # Overall system nvidia-smi # GPU usage (if available) ``` ### 6.2 Log Management #### **Docker Logs** ```bash # All services docker compose logs -f # Specific service docker compose logs -f rag-api # Save logs to file docker compose logs > system.log 2>&1 ``` #### **Direct Development Logs** ```bash # Logs are printed to terminal # Redirect to file if needed: python run_system.py > system.log 2>&1 ``` ### 6.3 Backup and Restore #### **Data Backup** ```bash # Create backup directory mkdir -p backups/$(date +%Y%m%d) # Backup databases and indexes cp -r backend/chat_data.db backups/$(date +%Y%m%d)/ cp -r lancedb backups/$(date +%Y%m%d)/ cp -r index_store backups/$(date +%Y%m%d)/ # For Docker: also backup volumes docker compose down docker run --rm -v rag_system_old_ollama_data:/data -v $(pwd)/backups:/backup alpine tar czf /backup/ollama_models_$(date +%Y%m%d).tar.gz -C /data . ``` #### **Data Restore** ```bash # Stop system ./start-docker.sh stop # Docker # Or Ctrl+C for direct development # Restore files cp -r backups/YYYYMMDD/* ./ # Restart system ./start-docker.sh # Docker python run_system.py # Direct development ``` --- ## 7. 
Troubleshooting ### 7.1 Common Issues #### **Port Conflicts** ```bash # Check what's using ports lsof -i :3000 -i :8000 -i :8001 -i :11434 # For Docker: Stop conflicting containers ./start-docker.sh stop # For Direct: Kill processes pkill -f "npm run dev" pkill -f "server.py" pkill -f "api_server" ``` #### **Docker Issues** ```bash # Docker daemon not running docker version # Check if daemon responds # Restart Docker Desktop (macOS/Windows) # Or restart docker service (Linux) sudo systemctl restart docker # Clear Docker cache docker system prune -f ``` #### **Ollama Issues** ```bash # Check Ollama status curl http://localhost:11434/api/tags # Restart Ollama pkill ollama ollama serve # Reinstall models ollama pull qwen3:0.6b ollama pull qwen3:8b ``` ### 7.2 Performance Issues #### **Memory Problems** ```bash # Check memory usage free -h # Linux vm_stat # macOS docker stats # Docker containers # Solutions: # 1. Increase system RAM # 2. Reduce batch sizes in configuration # 3. Use smaller models (qwen3:0.6b instead of qwen3:8b) ``` #### **Slow Response Times** ```bash # Check model loading curl http://localhost:11434/api/tags # Monitor component response times time curl http://localhost:8001/models # Solutions: # 1. Use SSD storage # 2. Increase CPU cores # 3. Use GPU acceleration (if available) ``` --- ## 8. Production Considerations ### 8.1 Security #### **Network Security** ```bash # Use reverse proxy (nginx/traefik) for production # Enable HTTPS/TLS # Restrict port access with firewall ``` #### **Data Security** ```bash # Enable authentication in production # Encrypt sensitive data # Regular security updates ``` ### 8.2 Scaling #### **Horizontal Scaling** ```bash # Use Docker Swarm or Kubernetes # Load balance frontend and backend # Scale RAG API instances based on load ``` #### **Resource Optimization** ```bash # Use dedicated GPU nodes for AI workloads # Implement model caching # Optimize batch processing ``` --- ## 9. 
Success Criteria ### 9.1 Deployment Verification Your deployment is successful when: - ✅ All health checks pass - ✅ Frontend loads at http://localhost:3000 - ✅ You can create document indexes - ✅ You can chat with uploaded documents - ✅ No error messages in logs ### 9.2 Performance Benchmarks **Acceptable Performance:** - Index creation: < 2 minutes per 100MB document - Query response: < 30 seconds for complex questions - Memory usage: < 8GB total system memory **Optimal Performance:** - Index creation: < 1 minute per 100MB document - Query response: < 10 seconds for complex questions - Memory usage: < 16GB total system memory --- **Happy Deploying! 🚀** ================================================ FILE: Documentation/docker_usage.md ================================================ # 🐳 Docker Usage Guide - RAG System _Last updated: 2025-01-07_ This guide provides practical Docker commands and procedures for running the RAG system in containerized environments with local Ollama. --- ## 📋 Prerequisites ### Required Setup - Docker Desktop installed and running - Ollama installed locally (even for Docker deployment) - 8GB+ RAM available ### Architecture Overview ``` ┌─────────────────────────────────────┐ │ Docker Containers │ ├─────────────────────────────────────┤ │ Frontend (Port 3000) │ │ Backend (Port 8000) │ │ RAG API (Port 8001) │ └─────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────┐ │ Local System │ ├─────────────────────────────────────┤ │ Ollama Server (Port 11434) │ └─────────────────────────────────────┘ ``` --- ## 1. Quick Start Commands ### Step 1: Clone and Setup ```bash # Clone repository git clone <repository-url> cd rag_system_old # Verify Docker is running docker version ``` ### Step 2: Install and Configure Ollama (Required) **⚠️ Important**: Even with Docker, Ollama must be installed locally for optimal performance.
```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama (in one terminal) ollama serve # Install required models (in another terminal) ollama pull qwen3:0.6b # Fast model (650MB) ollama pull qwen3:8b # High-quality model (4.7GB) # Verify models are installed ollama list # Test Ollama connection curl http://localhost:11434/api/tags ``` ### Step 3: Start Docker Containers ```bash # Start all containers ./start-docker.sh # Stop all containers ./start-docker.sh stop # View logs ./start-docker.sh logs # Check status ./start-docker.sh status # Restart containers ./start-docker.sh stop ./start-docker.sh ``` ### 1.2 Service Access Once running, access the system at: - **Frontend**: http://localhost:3000 - **Backend API**: http://localhost:8000 - **RAG API**: http://localhost:8001 - **Ollama**: http://localhost:11434 --- ## 2. Container Management ### 2.1 Using the Convenience Script ```bash # Start all containers ./start-docker.sh # Stop all containers ./start-docker.sh stop # View logs ./start-docker.sh logs # Check status ./start-docker.sh status # Restart containers ./start-docker.sh stop ./start-docker.sh ``` ### 2.2 Manual Docker Compose Commands ```bash # Start all services docker compose --env-file docker.env up --build -d # Check status docker compose ps # View logs docker compose logs -f # Stop all services docker compose down # Force rebuild docker compose build --no-cache docker compose up --build -d ``` ### 2.3 Individual Service Management ```bash # Start specific service docker compose up -d frontend docker compose up -d backend docker compose up -d rag-api # Restart specific service docker compose restart rag-api # Stop specific service docker compose stop backend # View specific service logs docker compose logs -f rag-api ``` --- ## 3. 
Development Workflow ### 3.1 Code Changes ```bash # After frontend changes docker compose restart frontend # After backend changes docker compose restart backend # After RAG system changes docker compose restart rag-api # Rebuild after dependency changes docker compose build --no-cache rag-api docker compose up -d rag-api ``` ### 3.2 Debugging Containers ```bash # Access container shell docker compose exec frontend sh docker compose exec backend bash docker compose exec rag-api bash # Run commands in container docker compose exec rag-api python -c "from rag_system.main import get_agent; print('✅ RAG System OK')" docker compose exec backend curl http://localhost:8000/health # Check environment variables docker compose exec rag-api env | grep OLLAMA ``` ### 3.3 Development vs Production ```bash # Development mode (if docker-compose.dev.yml exists) docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d # Production mode (default) docker compose --env-file docker.env up -d ``` --- ## 4. Logging & Monitoring ### 4.1 Log Management ```bash # View all logs docker compose logs # View specific service logs docker compose logs frontend docker compose logs backend docker compose logs rag-api # Follow logs in real-time docker compose logs -f # View last N lines docker compose logs --tail=100 # View logs with timestamps docker compose logs -t # Save logs to file docker compose logs > system.log 2>&1 # View logs since specific time docker compose logs --since=2h docker compose logs --since=2025-01-01T00:00:00 ``` ### 4.2 System Monitoring ```bash # Monitor resource usage docker stats # Monitor specific containers docker stats rag-frontend rag-backend rag-api # Check container health docker compose ps # System information docker system info docker system df ``` --- ## 5. 
Ollama Integration ### 5.1 Ollama Setup ```bash # Install Ollama (one-time setup) curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama server ollama serve # Check Ollama status curl http://localhost:11434/api/tags # Install models ollama pull qwen3:0.6b # Fast model ollama pull qwen3:8b # High-quality model # List installed models ollama list ``` ### 5.2 Ollama Management ```bash # Check model status from container docker compose exec rag-api curl http://host.docker.internal:11434/api/tags # Test Ollama connection curl -X POST http://localhost:11434/api/generate \ -H "Content-Type: application/json" \ -d '{"model": "qwen3:0.6b", "prompt": "Hello", "stream": false}' # Monitor Ollama logs (if running with logs) # Ollama logs appear in the terminal where you ran 'ollama serve' ``` ### 5.3 Model Management ```bash # Update models ollama pull qwen3:0.6b ollama pull qwen3:8b # Remove unused models ollama rm old-model-name # Check model information ollama show qwen3:0.6b ``` --- ## 6. Data Management ### 6.1 Volume Management ```bash # List volumes docker volume ls # View volume usage docker system df -v # Backup volumes docker run --rm -v rag_system_old_lancedb:/data -v $(pwd)/backup:/backup alpine tar czf /backup/lancedb_backup.tar.gz -C /data . 
# Clean unused volumes docker volume prune ``` ### 6.2 Database Management ```bash # Access SQLite database docker compose exec backend sqlite3 /app/backend/chat_data.db # Backup database cp backend/chat_data.db backup/chat_data_$(date +%Y%m%d).db # Check LanceDB tables from container docker compose exec rag-api python -c " import lancedb db = lancedb.connect('/app/lancedb') print('Tables:', db.table_names()) " ``` ### 6.3 File Management ```bash # Access shared files docker compose exec rag-api ls -la /app/shared_uploads # Copy files to/from containers docker cp local_file.pdf rag-api:/app/shared_uploads/ docker cp rag-api:/app/shared_uploads/file.pdf ./local_file.pdf # Check disk usage docker compose exec rag-api df -h ``` --- ## 7. Troubleshooting ### 7.1 Common Issues #### Container Won't Start ```bash # Check Docker daemon docker version # Check for port conflicts lsof -i :3000 -i :8000 -i :8001 # Check container logs docker compose logs [service-name] # Restart Docker Desktop # macOS/Windows: Restart Docker Desktop # Linux: sudo systemctl restart docker ``` #### Ollama Connection Issues ```bash # Check Ollama is running curl http://localhost:11434/api/tags # Restart Ollama pkill ollama ollama serve # Check from container docker compose exec rag-api curl http://host.docker.internal:11434/api/tags ``` #### Performance Issues ```bash # Check resource usage docker stats # Increase Docker memory (Docker Desktop Settings) # Recommended: 8GB+ for Docker # Check container health docker compose ps ``` ### 7.2 Reset and Clean ```bash # Stop everything ./start-docker.sh stop # Clean containers and images docker system prune -a # Clean volumes (⚠️ deletes data) docker volume prune # Complete reset (⚠️ deletes everything) docker compose down -v docker system prune -a --volumes ``` ### 7.3 Health Checks ```bash # Comprehensive health check curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f 
http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" # Check all container status docker compose ps # Test model loading docker compose exec rag-api python -c " from rag_system.main import get_agent agent = get_agent('default') print('✅ RAG System initialized successfully') " ``` --- ## 8. Advanced Usage ### 8.1 Production Deployment ```bash # Use production environment export NODE_ENV=production # Start with resource limits docker compose --env-file docker.env up -d # Enable automatic restarts docker update --restart unless-stopped $(docker ps -q) ``` ### 8.2 Scaling ```bash # Scale specific services docker compose up -d --scale backend=2 --scale rag-api=2 # Use Docker Swarm for clustering docker swarm init docker stack deploy -c docker-compose.yml rag-system ``` ### 8.3 Security ```bash # Scan images for vulnerabilities docker scout cves rag-frontend docker scout cves rag-backend docker scout cves rag-api # Update base images docker compose build --no-cache --pull ``` --- ## 9. Configuration ### 9.1 Environment Variables The system uses `docker.env` for configuration: ```bash # Ollama configuration OLLAMA_HOST=http://host.docker.internal:11434 # Service configuration NODE_ENV=production RAG_API_URL=http://rag-api:8001 NEXT_PUBLIC_API_URL=http://localhost:8000 ``` ### 9.2 Custom Configuration ```bash # Create custom environment file cp docker.env docker.custom.env # Edit custom configuration nano docker.custom.env # Use custom configuration docker compose --env-file docker.custom.env up -d ``` --- ## 10. 
Success Checklist Your Docker deployment is successful when: - ✅ All containers are running: `docker compose ps` - ✅ Ollama is accessible: `curl http://localhost:11434/api/tags` - ✅ Frontend loads: `curl http://localhost:3000` - ✅ Backend responds: `curl http://localhost:8000/health` - ✅ RAG API works: `curl http://localhost:8001/models` - ✅ You can create indexes and chat with documents ### Performance Expectations **Acceptable Performance:** - Container startup: < 2 minutes - Memory usage: < 4GB Docker containers + Ollama - Response time: < 30 seconds for complex queries **Optimal Performance:** - Container startup: < 1 minute - Memory usage: < 2GB Docker containers + Ollama - Response time: < 10 seconds for complex queries --- **Happy Containerizing! 🐳** ================================================ FILE: Documentation/improvement_plan.md ================================================ # RAG System – Improvement Road-map _Revision: 2025-07-05_ This document captures high-impact enhancements identified during the July 2025 code-review. Items are grouped by theme and include a short rationale plus suggested implementation notes. **No code has been changed – this file is planning only.** --- ## 1. Retrieval Accuracy & Speed | ID | Item | Rationale | Notes | |----|------|-----------|-------| | 1.1 | Late-chunk result merging | Returned snippets can be single late-chunks → fragmented. | After retrieval, gather sibling chunks (±1) and concatenate before reranking / display. | | 1.2 | Tiered retrieval (ANN pre-filter) | Large indexes → LanceDB full scan can be slow. | Use in-memory FAISS/HNSW to narrow to top-N, then exact LanceDB search. | | 1.3 | Dynamic fusion weights | Different corpora favour dense vs BM25 differently. | Learn weight on small validation set; store in index `metadata`. | | 1.4 | Query expansion via KG | Use extracted entities to enrich queries. | Requires Graph-RAG path clean-up first. | ## 2. 
Routing / Triage | ID | Item | Rationale | |----|------|-----------| | 2.1 | Embed + cache document overviews | LLM router costs tokens; cosine-similarity pre-check is cheaper. | | 2.2 | Session-level routing memo | Avoid repeated LLM triage for follow-up queries. | | 2.3 | Remove legacy pattern rules | Simplifies maintenance once overview & ML routing mature. | ## 3. Indexing Pipeline | ID | Item | Rationale | |----|------|-----------| | 3.1 | Parallel document conversion | PDF→MD + chunking is serial today; speed gains possible. | | 3.2 | Incremental indexing | Re-embedding whole corpus wastes time. | | 3.3 | Auto GPU dtype selection | Use FP16 on CUDA / MPS for memory and speed. | | 3.4 | Post-build health check | Catch broken indexes (dim mismatch etc.) early. | ## 4. Embedding Model Management * **Registry file** mapping tag → dims/source/license. UI & backend validate against it. * **Embedder pool** caches loaded HF/Ollama weights per model to save RAM. ## 5. Database & Storage * LanceDB table GC for orphaned tables. * Scheduled SQLite `VACUUM` when fragmentation > X %. ## 6. Observability & Ops * JSON structured logging. * `/metrics` endpoint for Prometheus. * Deep health-probe (`/health/deep`) exercising end-to-end query. ## 7. Front-end UX * SSE-driven progress bar for indexing. * Matched-term highlighting in retrieved snippets. * Preset buttons (Fast / Balanced / High-Recall) for retrieval settings. ## 8. Testing & CI * Replace deleted BM25 tests with LanceDB hybrid tests. * Integration test: build → query → assert ≥1 doc. * GitHub Action that spins up Ollama, pulls small embedding model, runs smoke test. ## 9. Codebase Hygiene * Graph-RAG integration (currently disabled, can be implemented if needed). * Consolidate duplicate config keys (`embedding_model_name`, etc.). * Run `mypy --strict`, pylint, and black in CI. --- ### 🧹 System Cleanup (Priority: **HIGH**) Reduce complexity and improve maintainability. 
* **✅ COMPLETED**: Remove experimental DSPy integration and unused modules (35+ files removed) * **✅ COMPLETED**: Clean up duplicate or obsolete documentation files * **✅ COMPLETED**: Remove unused import statements and dependencies * **✅ COMPLETED**: Consolidate similar configuration files * **✅ COMPLETED**: Remove broken or non-functional ReAct agent implementation ### Priority Matrix (suggested order) 1. **Critical reliability**: 3.4, 5.1, 9.2 2. **User-visible wins**: 1.1, 7.1, 7.2 3. **Performance**: 1.2, 3.1, 3.3 4. **Long-term maintainability**: 2.3, 9.1, 9.3 Feel free to rearrange based on team objectives and resource availability. ================================================ FILE: Documentation/indexing_pipeline.md ================================================ # 🗂️ Indexing Pipeline _Implementation entry-point: `rag_system/pipelines/indexing_pipeline.py` + helpers in `indexing/` & `ingestion/`._ ## Overview Transforms raw documents (PDF, TXT, etc.) into search-ready **chunks** with embeddings, storing them in LanceDB and generating auxiliary assets (overviews, context summaries). ## High-Level Diagram ```mermaid flowchart TD A["Uploaded Files"] --> B{Converter} B -->|PDF→text| C["Plain Text"] C --> D{Chunker} D -->|docling| D1[DocLing Chunking] D -->|latechunk| D2[Late Chunking] D -->|standard| D3[Fixed-size] D1 & D2 & D3 --> E["Contextual Enricher"] E -->|local ctx summary| F["Embedding Generator"] F -->|vectors| G[(LanceDB Table)] E --> H["Overview Builder"] H -->|JSONL| OVR[[`index_store/overviews/.jsonl`]] ``` ## Steps in Detail | Step | Module | Key Classes | Notes | |------|--------|------------|-------| | Conversion | `ingestion/pdf_converter.py` | `PDFConverter` | Uses `Docling` library to extract text with structure preservation. 
| | Chunking | `ingestion/chunking.py`, `indexing/latechunk.py`, `ingestion/docling_chunker.py` | `MarkdownRecursiveChunker`, `DoclingChunker` | Controlled by flags `latechunk`, `doclingChunk`, `chunkSize`, `chunkOverlap`. | | Contextual Enrichment | `indexing/contextualizer.py` | `ContextualEnricher` | Generates per-chunk summaries (LLM call). | | Embedding | `indexing/embedders.py`, `indexing/representations.py` | `QwenEmbedder`, `EmbeddingGenerator` | Batch size tunable (`batchSizeEmbed`). Uses Qwen3-Embedding models. | | LanceDB Ingest | `index_store/lancedb/…` | – | Each index has a dedicated table `text_pages_<index_id>`. | | Overview | `indexing/overview_builder.py` | `OverviewBuilder` | First-N chunks summarised for triage routing. | ### Control Flow (Code) 1. **backend/server.py → handle_build_index()** collects files + opts and POSTs to `/index` endpoint on advanced RAG API (local process). 2. **indexing_pipeline.IndexingPipeline.run()** orchestrates conversion → chunking → enrichment → embedding → storage. 3. Metadata (chunk_size, models, etc.) stored in SQLite `indexes` table. ## Configuration Flags | Flag | Description | Default | |------|-------------|---------| | `latechunk` | Merge k adjacent sibling chunks at query time | false | | `doclingChunk` | Use DocLing structural chunking | false | | `chunkSize` / `chunkOverlap` | Standard fixed slicing | 512 / 64 | | `enableEnrich` | Run contextual summaries | true | | `embeddingModel` | Override embedder | `Qwen/Qwen3-Embedding-0.6B` | | `overviewModel` | Model used in `OverviewBuilder` | `qwen3:0.6b` | | `batchSizeEmbed / Enrich` | Batch sizes | 50 / 25 | ## Error Handling * Duplicate LanceDB table ➟ now idempotent (commit `af99b38`). * Failed PDF parse ➟ chunker skips file, logs warning. ## Extension Ideas * Add OCR layer before PDF conversion. * Store embeddings in Remote LanceDB instance (update URL in config).
## Detailed Implementation Analysis ### Pipeline Architecture Pattern The `IndexingPipeline` uses a **sequential processing pattern** with parallel batch operations. Each stage processes all documents before moving to the next stage, enabling efficient memory usage and progress tracking. ```python def run(self, file_paths: List[str]): with timer("Complete Indexing Pipeline"): # Stage 1: Document Processing & Chunking all_chunks = [] doc_chunks_map = {} # Stage 2: Contextual Enrichment (optional) if self.contextual_enricher: all_chunks = self.contextual_enricher.enrich_batch(all_chunks) # Stage 3: Dense Indexing (embedding + storage) if self.vector_indexer: self.vector_indexer.index_chunks(all_chunks, table_name) # Stage 4: Graph Extraction (optional) if self.graph_extractor: self.graph_extractor.extract_and_store(all_chunks) ``` ### Document Processing Deep-Dive #### PDF Conversion Strategy ```python # PDFConverter uses Docling for robust text extraction with structure def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict, Any]]: # Quick heuristic: if PDF has text layer, skip OCR for speed use_ocr = not self._pdf_has_text(file_path) converter = self.converter_ocr if use_ocr else self.converter_no_ocr result = converter.convert(file_path) markdown_content = result.document.export_to_markdown() metadata = {"source": file_path} # Return DoclingDocument object for advanced chunkers return [(markdown_content, metadata, result.document)] ``` **Benefits**: - Preserves document structure (headings, lists, tables) - Automatic OCR fallback for image-based PDFs - Maintains page-level metadata for source attribution - Structured output supports advanced chunking strategies #### Chunking Strategy Selection ```python # Dynamic chunker selection based on config chunker_mode = config.get("chunker_mode", "legacy") if chunker_mode == "docling": self.chunker = DoclingChunker( max_tokens=chunk_size, overlap=overlap_sentences, tokenizer_model="Qwen/Qwen3-Embedding-0.6B" 
) else: self.chunker = MarkdownRecursiveChunker( max_chunk_size=chunk_size, min_chunk_size=min(chunk_overlap, chunk_size // 4) ) ``` #### Recursive Markdown Chunking Algorithm ```python def chunk(self, text: str, document_id: str, metadata: Dict) -> List[Dict]: # Priority hierarchy for splitting separators = [ "\n\n# ", # H1 headers (highest priority) "\n\n## ", # H2 headers "\n\n### ", # H3 headers "\n\n", # Paragraph breaks "\n", # Line breaks ". ", # Sentence boundaries " " # Word boundaries (last resort) ] chunks = [] current_chunk = "" for separator in separators: if len(current_chunk) <= self.max_chunk_size: continue # Split on current separator parts = current_chunk.split(separator) # Reassemble with overlap for i, part in enumerate(parts): if len(part) > self.max_chunk_size: # Recursively split large parts continue # Add overlap from previous chunk if i > 0 and len(chunks) > 0: overlap_text = chunks[-1]["text"][-self.chunk_overlap:] part = overlap_text + separator + part chunks.append({ "text": part, "document_id": document_id, "metadata": {**metadata, "chunk_index": len(chunks)} }) ``` ### DocLing Chunking Implementation #### Token-Aware Sentence Packing ```python class DoclingChunker: def __init__(self, max_tokens: int = 512, overlap: int = 1, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"): self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model) self.max_tokens = max_tokens self.overlap = overlap # sentences of overlap def split_markdown(self, markdown: str, document_id: str, metadata: Dict): sentences = self._sentence_split(markdown) chunks = [] window = [] while sentences: # Add sentences until token limit while (sentences and self._token_len(" ".join(window + [sentences[0]])) <= self.max_tokens): window.append(sentences.pop(0)) if not window: # Single sentence > limit window.append(sentences.pop(0)) # Create chunk chunk_text = " ".join(window) chunks.append({ "chunk_id": f"{document_id}_{len(chunks)}", "text": chunk_text, "metadata": { 
**metadata, "chunk_index": len(chunks), "heading_path": metadata.get("heading_path", []), "block_type": metadata.get("block_type", "paragraph") } }) # Add overlap for next chunk if self.overlap and sentences: overlap_sentences = window[-self.overlap:] sentences = overlap_sentences + sentences window = [] return chunks ``` #### Document Structure Preservation ```python def chunk_document(self, doc, document_id: str, metadata: Dict): """Walk DoclingDocument tree and emit structured chunks.""" chunks = [] current_heading_path = [] buffer = [] # Process document elements in reading order for txt_item in doc.texts: role = getattr(txt_item, "role", None) if role == "heading": self._flush_buffer(buffer, chunks, current_heading_path) level = getattr(txt_item, "level", 1) # Update heading hierarchy current_heading_path = current_heading_path[:level-1] current_heading_path.append(txt_item.text.strip()) continue # Accumulate text in token-aware buffer text_piece = txt_item.text if self._buffer_would_exceed_limit(buffer, text_piece): self._flush_buffer(buffer, chunks, current_heading_path) buffer.append(text_piece) self._flush_buffer(buffer, chunks, current_heading_path) return chunks ``` ### Contextual Enrichment Implementation #### Batch Processing Pattern ```python class ContextualEnricher: def enrich_batch(self, chunks: List[Dict]) -> List[Dict]: enriched_chunks = [] # Process in batches to manage memory for i in range(0, len(chunks), self.batch_size): batch = chunks[i:i + self.batch_size] # Parallel enrichment within batch with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [ executor.submit(self._enrich_single_chunk, chunk) for chunk in batch ] for future in concurrent.futures.as_completed(futures): enriched_chunks.append(future.result()) return enriched_chunks ``` #### Contextual Prompt Engineering ```python def _generate_context_summary(self, chunk_text: str, surrounding_context: str) -> str: prompt = f""" Analyze this text chunk and 
provide a concise summary that captures: 1. Main topics and key information 2. Context within the broader document 3. Relevance for search and retrieval Document Context: {surrounding_context} Chunk to Analyze: {chunk_text} Summary (max 2 sentences): """ response = self.llm_client.complete( prompt=prompt, model=self.ollama_config["enrichment_model"] # qwen3:0.6b ) return response.strip() ``` ### Embedding Generation Pipeline #### Model Selection Strategy ```python def select_embedder(model_name: str, ollama_host: str = None): """Select appropriate embedder based on model name.""" if "Qwen3-Embedding" in model_name: return QwenEmbedder(model_name=model_name) elif "bge-" in model_name: return BGEEmbedder(model_name=model_name) elif ollama_host and model_name in ["nomic-embed-text"]: return OllamaEmbedder(model_name=model_name, host=ollama_host) else: # Default to Qwen embedder return QwenEmbedder(model_name="Qwen/Qwen3-Embedding-0.6B") ``` #### Batch Embedding Generation ```python class QwenEmbedder: def create_embeddings(self, texts: List[str]) -> np.ndarray: """Generate embeddings in batches for efficiency.""" embeddings = [] for i in range(0, len(texts), self.batch_size): batch = texts[i:i + self.batch_size] # Tokenize and encode inputs = self.tokenizer( batch, padding=True, truncation=True, max_length=512, return_tensors='pt' ) with torch.no_grad(): outputs = self.model(**inputs) # Mean pooling over token embeddings batch_embeddings = outputs.last_hidden_state.mean(dim=1) embeddings.append(batch_embeddings.cpu().numpy()) return np.vstack(embeddings) ``` ### LanceDB Storage Implementation #### Table Management Strategy ```python class LanceDBManager: def create_table_if_not_exists(self, table_name: str, schema: Schema): """Create LanceDB table with proper schema.""" try: table = self.db.open_table(table_name) print(f"Table {table_name} already exists") return table except FileNotFoundError: # Table doesn't exist, create it table = self.db.create_table( table_name, 
schema=schema, mode="create" ) print(f"Created new table: {table_name}") return table def index_chunks(self, chunks: List[Dict], table_name: str): """Store chunks with embeddings in LanceDB.""" table = self.get_table(table_name) # Prepare data for insertion records = [] for chunk in chunks: record = { "chunk_id": chunk["chunk_id"], "text": chunk["text"], "vector": chunk["embedding"].tolist(), "metadata": json.dumps(chunk["metadata"]), "document_id": chunk["metadata"]["document_id"], "chunk_index": chunk["metadata"]["chunk_index"] } records.append(record) # Batch insert table.add(records) # Create vector index for fast similarity search table.create_index("vector", config=IvfPq(num_partitions=256)) ``` ### Overview Building for Query Routing #### Document Summarization Strategy ```python class OverviewBuilder: def build_overview(self, chunks: List[Dict], document_id: str) -> Dict: """Generate document overview for query routing.""" # Take first N chunks for overview (usually most important) sample_chunks = chunks[:self.max_chunks_for_overview] combined_text = "\n\n".join([c["text"] for c in sample_chunks]) overview_prompt = f""" Analyze this document and create a brief overview that includes: 1. Main topic and purpose 2. Key themes and concepts 3. Document type and domain 4. 
Relevant search keywords Document text: {combined_text} Overview (max 3 sentences): """ overview = self.llm_client.complete( prompt=overview_prompt, model=self.overview_model # qwen3:0.6b for speed ) return { "document_id": document_id, "overview": overview.strip(), "chunk_count": len(chunks), "keywords": self._extract_keywords(combined_text), "created_at": datetime.now().isoformat() } def save_overview(self, overview: Dict): """Save overview to JSONL file for query routing.""" overview_path = f"./index_store/overviews/{overview['document_id']}.jsonl" with open(overview_path, 'w') as f: json.dump(overview, f) ``` ### Performance Optimizations #### Memory Management ```python class IndexingPipeline: def __init__(self, config: Dict, ollama_client: OllamaClient, ollama_config: Dict): # Lazy initialization to save memory self._pdf_converter = None self._chunker = None self._embedder = None def _get_embedder(self): """Lazy load embedder to avoid memory overhead.""" if self._embedder is None: model_name = self.config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B") self._embedder = select_embedder(model_name) return self._embedder def process_document_batch(self, file_paths: List[str]): """Process documents in batches to manage memory.""" for batch_start in range(0, len(file_paths), self.batch_size): batch = file_paths[batch_start:batch_start + self.batch_size] # Process batch self._process_batch(batch) # Cleanup to free memory if hasattr(self, '_embedder') and self._embedder: self._embedder.cleanup() ``` #### Parallel Processing ```python def run_parallel_processing(self, file_paths: List[str]): """Process multiple documents in parallel.""" with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor: futures = [] for file_path in file_paths: future = executor.submit(self._process_single_file, file_path) futures.append(future) # Collect results results = [] for future in concurrent.futures.as_completed(futures): try: result = future.result(timeout=300) 
# 5 minute timeout results.append(result) except Exception as e: print(f"Error processing file: {e}") return results ``` ### Error Handling and Recovery #### Graceful Degradation ```python def run(self, file_paths: List[str], table_name: str): """Main pipeline with comprehensive error handling.""" processed_files = [] failed_files = [] for file_path in file_paths: try: # Attempt processing chunks = self._process_single_file(file_path) if chunks: # Store successfully processed chunks self._store_chunks(chunks, table_name) processed_files.append(file_path) else: print(f"⚠️ No chunks generated from {file_path}") failed_files.append((file_path, "No chunks generated")) except Exception as e: print(f"❌ Error processing {file_path}: {e}") failed_files.append((file_path, str(e))) continue # Continue with other files # Return summary return { "processed": len(processed_files), "failed": len(failed_files), "processed_files": processed_files, "failed_files": failed_files } ``` #### Recovery Mechanisms ```python def recover_from_partial_failure(self, table_name: str, document_id: str): """Recover from partial indexing failures.""" try: # Check what was already processed table = self.db_manager.get_table(table_name) existing_chunks = table.search().where(f"document_id = '{document_id}'").to_list() if existing_chunks: print(f"Found {len(existing_chunks)} existing chunks for {document_id}") return True # Cleanup partial data self._cleanup_partial_data(table_name, document_id) return False except Exception as e: print(f"Recovery failed: {e}") return False ``` ### Configuration and Customization #### Pipeline Configuration Options ```python DEFAULT_CONFIG = { "chunking": { "strategy": "docling", # "docling", "recursive", "fixed" "max_tokens": 512, "overlap": 64, "min_chunk_size": 100 }, "embedding": { "model_name": "Qwen/Qwen3-Embedding-0.6B", "batch_size": 32, "max_length": 512 }, "enrichment": { "enabled": True, "model": "qwen3:0.6b", "batch_size": 16 }, "overview": { "enabled": 
True, "max_chunks": 5, "model": "qwen3:0.6b" }, "storage": { "create_index": True, "index_type": "IvfPq", "num_partitions": 256 } } ``` #### Custom Processing Hooks ```python class IndexingPipeline: def __init__(self, config: Dict, hooks: Dict = None): self.hooks = hooks or {} def _run_hook(self, hook_name: str, *args, **kwargs): """Execute custom processing hooks.""" if hook_name in self.hooks: return self.hooks[hook_name](*args, **kwargs) return None def process_chunk(self, chunk: Dict) -> Dict: """Process single chunk with custom hooks.""" # Pre-processing hook chunk = self._run_hook("pre_chunk_process", chunk) or chunk # Standard processing if self.contextual_enricher: chunk = self.contextual_enricher.enrich_chunk(chunk) # Post-processing hook chunk = self._run_hook("post_chunk_process", chunk) or chunk return chunk ``` --- ## Current Implementation Status ### Completed Features ✅ - DocLing-based PDF processing with OCR fallback - Multiple chunking strategies (DocLing, Recursive, Fixed-size) - Qwen3-Embedding-0.6B integration - Contextual enrichment with qwen3:0.6b - LanceDB storage with vector indexing - Overview generation for query routing - Batch processing and parallel execution - Comprehensive error handling ### In Development 🚧 - Graph extraction and knowledge graph building - Multimodal processing for images and tables - Advanced late-chunking optimization - Distributed processing support ### Planned Features 📋 - Custom model fine-tuning pipeline - Real-time incremental indexing - Cross-document relationship extraction - Advanced metadata enrichment --- ## Performance Benchmarks | Document Type | Processing Speed | Memory Usage | Storage Efficiency | |---------------|------------------|--------------|-------------------| | Text PDFs | 2-5 pages/sec | 2-4GB | 1MB/100 pages | | Image PDFs | 0.5-1 page/sec | 4-8GB | 2MB/100 pages | | Technical Docs | 1-3 pages/sec | 3-6GB | 1.5MB/100 pages | | Research Papers | 2-4 pages/sec | 2-4GB | 1.2MB/100 pages | ## 
Extension Points ### Custom Chunkers ```python class CustomChunker(BaseChunker): def chunk(self, text: str, document_id: str, metadata: Dict) -> List[Dict]: # Implement custom chunking logic pass ``` ### Custom Embedders ```python class CustomEmbedder(BaseEmbedder): def create_embeddings(self, texts: List[str]) -> np.ndarray: # Implement custom embedding generation pass ``` ### Custom Enrichers ```python class CustomEnricher(BaseEnricher): def enrich_chunk(self, chunk: Dict) -> Dict: # Implement custom enrichment logic pass ``` ================================================ FILE: Documentation/installation_guide.md ================================================ # 📦 RAG System Installation Guide _Last updated: 2025-01-07_ This guide provides step-by-step instructions for installing and setting up the RAG system using either Docker or direct development approaches. --- ## 🎯 Installation Options ### Option 1: Docker Deployment (Production Ready) 🐳 - **Best for**: Production environments, isolated setups, easy management - **Requirements**: Docker Desktop + Local Ollama - **Setup time**: ~10 minutes ### Option 2: Direct Development (Developer Friendly) 💻 - **Best for**: Development, customization, debugging - **Requirements**: Python + Node.js + Ollama - **Setup time**: ~15 minutes --- ## 1. 
Prerequisites ### 1.1 System Requirements #### **Minimum Requirements** - **CPU**: 4 cores, 2.5GHz+ - **RAM**: 8GB (16GB recommended) - **Storage**: 50GB free space - **OS**: macOS 10.15+, Ubuntu 20.04+, Windows 10+ #### **Recommended Requirements** - **CPU**: 8+ cores, 3.0GHz+ - **RAM**: 32GB+ (for large models) - **Storage**: 200GB+ SSD - **GPU**: NVIDIA GPU with 8GB+ VRAM (optional) ### 1.2 Common Dependencies **Required for both approaches:** - **Ollama**: AI model runtime (always required) - **Git**: 2.30+ for cloning repository **Docker-specific:** - **Docker Desktop**: 24.0+ with Docker Compose **Direct Development-specific:** - **Python**: 3.8+ - **Node.js**: 16+ with npm --- ## 2. Ollama Installation (Required for Both) ### 2.1 Install Ollama #### **macOS/Linux:** ```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Verify installation ollama --version ``` #### **Windows:** ```bash # Download from: https://ollama.ai/download # Run the installer and follow setup wizard ``` ### 2.2 Configure Ollama ```bash # Start Ollama server ollama serve # In another terminal, install required models ollama pull qwen3:0.6b # Fast model (650MB) ollama pull qwen3:8b # High-quality model (4.7GB) # Verify models are installed ollama list # Test Ollama ollama run qwen3:0.6b "Hello, how are you?" ``` **⚠️ Important**: Keep Ollama running (`ollama serve`) for the entire setup process. --- ## 3. 
🐳 Docker Installation & Setup ### 3.1 Install Docker #### **macOS:** ```bash # Install Docker Desktop via Homebrew brew install --cask docker # Or download from: https://www.docker.com/products/docker-desktop/ # Start Docker Desktop from Applications # Verify installation docker --version docker compose version ``` #### **Ubuntu/Debian:** ```bash # Update system sudo apt-get update # Install Docker using convenience script curl -fsSL https://get.docker.com -o get-docker.sh sudo sh get-docker.sh # Add user to docker group sudo usermod -aG docker $USER newgrp docker # Install Docker Compose V2 sudo apt-get install docker-compose-plugin # Verify installation docker --version docker compose version ``` #### **Windows:** 1. Download Docker Desktop from https://www.docker.com/products/docker-desktop/ 2. Run installer and enable WSL 2 integration 3. Restart computer and start Docker Desktop 4. Verify in PowerShell: `docker --version` ### 3.2 Clone and Setup RAG System ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Verify Ollama is running curl http://localhost:11434/api/tags # Start Docker containers ./start-docker.sh # Wait for containers to start (2-3 minutes) sleep 120 # Verify deployment ./start-docker.sh status ``` ### 3.3 Test Docker Deployment ```bash # Test all endpoints curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" # Access the application open http://localhost:3000 ``` --- ## 4.
💻 Direct Development Setup ### 4.1 Install Development Dependencies #### **Python Setup:** ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Create virtual environment (recommended) python -m venv venv # Activate virtual environment source venv/bin/activate # macOS/Linux # venv\Scripts\activate # Windows # Install Python dependencies pip install -r requirements.txt # Verify Python setup python -c "import torch; print('✅ PyTorch OK')" python -c "import transformers; print('✅ Transformers OK')" python -c "import lancedb; print('✅ LanceDB OK')" ``` #### **Node.js Setup:** ```bash # Install Node.js dependencies npm install # Verify Node.js setup node --version # Should be 16+ npm --version npm list --depth=0 ``` ### 4.2 Start Direct Development ```bash # Ensure Ollama is running curl http://localhost:11434/api/tags # Start all components with one command python run_system.py # Or start components manually in separate terminals: # Terminal 1: python -m rag_system.api_server # Terminal 2: cd backend && python server.py # Terminal 3: npm run dev ``` ### 4.3 Test Direct Development ```bash # Check system health python system_health_check.py # Test endpoints curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" # Access the application open http://localhost:3000 ``` --- ## 5.
Detailed Installation Steps ### 5.1 Repository Setup ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Check repository structure ls -la # Create required directories mkdir -p lancedb index_store shared_uploads logs backend touch backend/chat_data.db # Set permissions chmod -R 755 lancedb index_store shared_uploads chmod 664 backend/chat_data.db ``` ### 5.2 Configuration #### **Environment Variables** For Docker (automatic via `docker.env`): ```bash OLLAMA_HOST=http://host.docker.internal:11434 NODE_ENV=production RAG_API_URL=http://rag-api:8001 NEXT_PUBLIC_API_URL=http://localhost:8000 ``` For Direct Development (set automatically by `run_system.py`): ```bash OLLAMA_HOST=http://localhost:11434 RAG_API_URL=http://localhost:8001 NEXT_PUBLIC_API_URL=http://localhost:8000 ``` #### **Model Configuration** The system defaults to these models: - **Embedding**: `Qwen/Qwen3-Embedding-0.6B` (1024 dimensions) - **Generation**: `qwen3:0.6b` for fast responses, `qwen3:8b` for quality - **Reranking**: Built-in cross-encoder ### 5.3 Database Initialization ```bash # Initialize SQLite database python -c " from backend.database import ChatDatabase db = ChatDatabase() db.init_database() print('✅ Database initialized') " # Verify database sqlite3 backend/chat_data.db ".tables" ``` --- ## 6.
Verification & Testing ### 6.1 System Health Checks #### **Comprehensive Health Check:** ```bash # For Docker deployment ./start-docker.sh status docker compose ps # For Direct development python system_health_check.py # Universal health check curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" ``` #### **RAG System Test:** ```bash # Test RAG system initialization python -c " from rag_system.main import get_agent agent = get_agent('default') print('✅ RAG System initialized successfully') " # Test embedding generation python -c " from rag_system.main import get_agent agent = get_agent('default') embedder = agent.retrieval_pipeline._get_text_embedder() test_emb = embedder.create_embeddings(['Hello world']) print(f'✅ Embedding generated: {test_emb.shape}') " ``` ### 6.2 Functional Testing #### **Document Upload Test:** 1. Access http://localhost:3000 2. Click "Create New Index" 3. Upload a PDF document 4. Configure settings and build index 5. Test chat functionality #### **API Testing:** ```bash # Test session creation curl -X POST http://localhost:8000/sessions \ -H "Content-Type: application/json" \ -d '{"title": "Test Session"}' # Test models endpoint curl http://localhost:8001/models # Test health endpoints curl http://localhost:8000/health curl http://localhost:8001/health ``` --- ## 7. 
Troubleshooting Installation ### 7.1 Common Issues #### **Ollama Issues:** ```bash # Ollama not responding curl http://localhost:11434/api/tags # If fails, restart Ollama pkill ollama ollama serve # Reinstall models if needed ollama pull qwen3:0.6b ollama pull qwen3:8b ``` #### **Docker Issues:** ```bash # Docker daemon not running docker version # Restart Docker Desktop (macOS/Windows) # Or restart docker service (Linux) sudo systemctl restart docker # Clear Docker cache if build fails docker system prune -f ``` #### **Python Issues:** ```bash # Check Python version python --version # Should be 3.8+ # Check virtual environment which python pip list | grep torch # Reinstall dependencies pip install -r requirements.txt --force-reinstall ``` #### **Node.js Issues:** ```bash # Check Node version node --version # Should be 16+ # Clear and reinstall rm -rf node_modules package-lock.json npm install ``` ### 7.2 Performance Issues #### **Memory Problems:** ```bash # Check system memory free -h # Linux vm_stat # macOS # For Docker: Increase memory allocation # Docker Desktop → Settings → Resources → Memory → 8GB+ # Use smaller models ollama pull qwen3:0.6b # Instead of qwen3:8b ``` #### **Slow Performance:** - Use SSD storage for databases (`lancedb/`, `shared_uploads/`) - Increase CPU cores if possible - Close unnecessary applications - Use smaller batch sizes in configuration --- ## 8. 
Post-Installation Setup ### 8.1 Model Optimization ```bash # Install additional models (optional) ollama pull nomic-embed-text # Alternative embedding model ollama pull llama3.1:8b # Alternative generation model # Test model switching curl -X POST http://localhost:8001/chat \ -H "Content-Type: application/json" \ -d '{"query": "Hello", "model": "qwen3:8b"}' ``` ### 8.2 Security Configuration ```bash # Set proper file permissions chmod 600 backend/chat_data.db # Restrict database access chmod 700 lancedb/ # Restrict vector DB access # Configure firewall (production) sudo ufw allow 3000/tcp # Frontend sudo ufw deny 8000/tcp # Backend (internal only) sudo ufw deny 8001/tcp # RAG API (internal only) ``` ### 8.3 Backup Setup ```bash # Create backup script cat > backup_system.sh << 'EOF' #!/bin/bash BACKUP_DIR="backups/$(date +%Y%m%d_%H%M%S)" mkdir -p "$BACKUP_DIR" # Backup databases and indexes cp -r backend/chat_data.db "$BACKUP_DIR/" cp -r lancedb "$BACKUP_DIR/" cp -r index_store "$BACKUP_DIR/" cp -r shared_uploads "$BACKUP_DIR/" echo "Backup completed: $BACKUP_DIR" EOF chmod +x backup_system.sh ``` --- ## 9. Success Criteria ### 9.1 Installation Complete When: - ✅ All health checks pass without errors - ✅ Frontend loads at http://localhost:3000 - ✅ All models are installed and responding - ✅ You can create document indexes - ✅ You can chat with uploaded documents - ✅ No error messages in logs/terminal ### 9.2 Performance Benchmarks **Acceptable Performance:** - System startup: < 5 minutes - Index creation: < 2 minutes per 100MB document - Query response: < 30 seconds - Memory usage: < 8GB total **Optimal Performance:** - System startup: < 2 minutes - Index creation: < 1 minute per 100MB document - Query response: < 10 seconds - Memory usage: < 4GB total --- ## 10. Next Steps ### 10.1 Getting Started 1. **Upload Documents**: Create your first index with PDF documents 2. **Explore Features**: Try different query types and models 3. 
**Customize**: Adjust model settings and chunk sizes 4. **Scale**: Add more documents and create multiple indexes ### 10.2 Additional Resources - **Quick Start**: See `Documentation/quick_start.md` - **Docker Usage**: See `Documentation/docker_usage.md` - **System Architecture**: See `Documentation/architecture_overview.md` - **API Reference**: See `Documentation/api_reference.md` --- **Congratulations! 🎉** Your RAG system is now ready to use. Visit http://localhost:3000 to start chatting with your documents. ================================================ FILE: Documentation/prompt_inventory.md ================================================ # 📜 Prompt Inventory (Ground-Truth) _All generation / verification prompts currently hard-coded in the codebase._ _Last updated: 2025-07-06_ > Edit process: if you change a prompt in code, please **update this file** or, once we migrate to the central registry, delete the entry here. --- ## 1. Indexing / Context Enrichment | ID | File & Lines | Variable / Builder | Purpose | |----|--------------|--------------------|---------| | `overview_builder.default` | `rag_system/indexing/overview_builder.py` `12-21` | `DEFAULT_PROMPT` | Generate 1-paragraph document overview for search-time routing. | `contextualizer.system` | `rag_system/indexing/contextualizer.py` `11` | `SYSTEM_PROMPT` | System instruction: explain summarisation role. | `contextualizer.local_context` | same file `13-15` | `LOCAL_CONTEXT_PROMPT_TEMPLATE` | Human message – wraps neighbouring chunks. | `contextualizer.chunk` | same file `17-19` | `CHUNK_PROMPT_TEMPLATE` | Human message – shows the target chunk. | `graph_extractor.entities` | `rag_system/indexing/graph_extractor.py` `20-31` | `entity_prompt` | Ask LLM to list entities. | `graph_extractor.relationships` | same file `53-64` | `relationship_prompt` | Ask LLM to list relationships. ## 2. 
Retrieval / Query Transformation | ID | File & Lines | Purpose | |----|--------------|---------| | `query_transformer.expand` | `rag_system/retrieval/query_transformer.py` `10-26` | Produce query rewrites (keywords, boolean). | | `hyde.hypothetical_doc` | same `115-122` | HyDE hypothetical document generator. | | `graph_query.translate` | same `124-140` | Translate user question to JSON KG query. | ## 3. Pipeline Answer Synthesis | ID | File & Lines | Purpose | |----|--------------|---------| | `retrieval_pipeline.synth_final` | `rag_system/pipelines/retrieval_pipeline.py` `217-256` | Turn verified facts into answer (with directives 1-6). | ## 4. Agent – Classical Loop | ID | File & Lines | Purpose | |----|--------------|---------| | `agent.loop.initial_thought` | `rag_system/agent/loop.py` `157-180` | First LLM call to think about query. | | `agent.loop.verify_path` | same `190-205` | Secondary thought loop. | | `agent.loop.compose_sub` | same `506-542` | Compose answer from sub-answers. | | `agent.loop.router` | same `648-660` | Decide which subsystem handles query. | ## 5. Verifier | ID | File & Lines | Purpose | |----|--------------|---------| | `verifier.fact_check` | `rag_system/agent/verifier.py` `18-58` | Strict JSON-format grounding verifier. | ## 6. Backend Router (Fast path) | ID | File & Lines | Purpose | |----|--------------|---------| | `backend.router` | `backend/server.py` `435-448` | Decide "RAG vs direct LLM" before heavy processing. | ## 7. Miscellaneous | ID | File & Lines | Purpose | |----|--------------|---------| | `vision.placeholder` | `rag_system/utils/ollama_client.py` `169` | Dummy prompt for VLM colour check. | --- ### Missing / To-Do 1. Verify whether **ReActAgent.PROMPT_TEMPLATE** captures every placeholder – some earlier lines may need explicit ID when we move to central registry. 2. Search TS/JS code once the backend prompts are ported (currently none). 
--- **Next step:** create `rag_system/prompts/registry.yaml` and start moving each prompt above into a key–value entry with identical IDs. Update callers gradually using the helper proposed earlier. ================================================ FILE: Documentation/quick_start.md ================================================ # ⚡ Quick Start Guide - RAG System _Get up and running in 5 minutes!_ --- ## 🚀 Choose Your Deployment Method ### Option 1: Docker Deployment (Production Ready) 🐳 Best for: Production deployments, isolated environments, easy scaling ### Option 2: Direct Development (Developer Friendly) 💻 Best for: Development, customization, debugging, faster iteration --- ## 🐳 Docker Deployment ### Prerequisites - Docker Desktop installed and running - 8GB+ RAM available - Internet connection ### Step 1: Clone and Setup ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Ensure Docker is running docker version ``` ### Step 2: Install Ollama Locally **Even with Docker, Ollama runs locally for better performance:** ```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama (in one terminal) ollama serve # Install models (in another terminal) ollama pull qwen3:0.6b ollama pull qwen3:8b ``` ### Step 3: Start Docker Containers ```bash # Start all containers ./start-docker.sh # Or manually: docker compose --env-file docker.env up --build -d ``` ### Step 4: Verify Deployment ```bash # Check container status docker compose ps # Test endpoints curl http://localhost:3000 # Frontend curl http://localhost:8000/health # Backend curl http://localhost:8001/models # RAG API ``` ### Step 5: Access Application Open your browser to: **http://localhost:3000** --- ## 💻 Direct Development ### Prerequisites - Python 3.8+ - Node.js 16+ and npm - 8GB+ RAM available ### Step 1: Clone and Install Dependencies ```bash # Clone repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Install Python dependencies pip install -r requirements.txt # Install Node.js
dependencies npm install ``` ### Step 2: Install and Configure Ollama ```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Start Ollama (in one terminal) ollama serve # Install models (in another terminal) ollama pull qwen3:0.6b ollama pull qwen3:8b ``` ### Step 3: Start the System ```bash # Start all components with one command python run_system.py ``` **Or start components manually in separate terminals:** ```bash # Terminal 1: RAG API python -m rag_system.api_server # Terminal 2: Backend cd backend && python server.py # Terminal 3: Frontend npm run dev ``` ### Step 4: Verify Installation ```bash # Check system health python system_health_check.py # Test endpoints curl http://localhost:3000 # Frontend curl http://localhost:8000/health # Backend curl http://localhost:8001/models # RAG API ``` ### Step 5: Access Application Open your browser to: **http://localhost:3000** --- ## 🎯 First Use Guide ### 1. Create a Chat Session - Click "New Chat" in the interface - Give your session a descriptive name ### 2. Upload Documents - Click "Create New Index" button - Upload PDF files from your computer - Configure processing options: - **Chunk Size**: 512 (recommended) - **Embedding Model**: Qwen/Qwen3-Embedding-0.6B - **Enable Enrichment**: Yes - Click "Build Index" and wait for processing ### 3. Start Chatting - Select your built index - Ask questions about your documents: - "What is this document about?" - "Summarize the key points" - "What are the main findings?" 
- "Compare the arguments in section 3 and 5" --- ## 🔧 Management Commands ### Docker Commands ```bash # Container management ./start-docker.sh # Start all containers ./start-docker.sh stop # Stop all containers ./start-docker.sh logs # View logs ./start-docker.sh status # Check status # Manual Docker Compose docker compose ps # Check status docker compose logs -f # Follow logs docker compose down # Stop containers docker compose up --build -d # Rebuild and start ``` ### Direct Development Commands ```bash # System management python run_system.py # Start all services python system_health_check.py # Check system health # Individual components python -m rag_system.api_server # RAG API only cd backend && python server.py # Backend only npm run dev # Frontend only # Stop: Press Ctrl+C in terminal running services ``` --- ## 🆘 Quick Troubleshooting ### Docker Issues **Containers not starting?** ```bash # Check Docker daemon docker version # Restart Docker Desktop and try again ./start-docker.sh ``` **Port conflicts?** ```bash # Check what's using ports lsof -i :3000 -i :8000 -i :8001 # Stop conflicting processes ./start-docker.sh stop ``` ### Direct Development Issues **Import errors?** ```bash # Check Python installation python --version # Should be 3.8+ # Reinstall dependencies pip install -r requirements.txt --force-reinstall ``` **Node.js errors?** ```bash # Check Node version node --version # Should be 16+ # Reinstall dependencies rm -rf node_modules package-lock.json npm install ``` ### Common Issues **Ollama not responding?** ```bash # Check if Ollama is running curl http://localhost:11434/api/tags # Restart Ollama pkill ollama ollama serve ``` **Out of memory?** ```bash # Check memory usage docker stats # For Docker htop # For direct development # Recommended: 16GB+ RAM for optimal performance ``` --- ## 📊 System Verification Run this comprehensive check: ```bash # Check all endpoints curl -f http://localhost:3000 && echo "✅ Frontend OK" curl -f 
http://localhost:8000/health && echo "✅ Backend OK" curl -f http://localhost:8001/models && echo "✅ RAG API OK" curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" # For Docker: Check containers docker compose ps ``` --- ## 🎉 Success! If you see: - ✅ All services responding - ✅ Frontend accessible at http://localhost:3000 - ✅ No error messages You're ready to start using LocalGPT! ### What's Next? 1. **📚 Upload Documents**: Add your PDF files to create indexes 2. **💬 Start Chatting**: Ask questions about your documents 3. **🔧 Customize**: Explore different models and settings 4. **📖 Learn More**: Check the full documentation below ### 📁 Key Files ``` rag-system/ ├── 🐳 start-docker.sh # Docker deployment script ├── 🏃 run_system.py # Direct development launcher ├── 🩺 system_health_check.py # System verification ├── 📋 requirements.txt # Python dependencies ├── 📦 package.json # Node.js dependencies ├── 📁 Documentation/ # Complete documentation └── 📁 rag_system/ # Core system code ``` ### 📖 Additional Resources - **🏗️ Architecture**: See `Documentation/architecture_overview.md` - **🔧 Configuration**: See `Documentation/system_overview.md` - **🚀 Deployment**: See `Documentation/deployment_guide.md` - **🐛 Troubleshooting**: See `DOCKER_TROUBLESHOOTING.md` --- **Happy RAG-ing! 
🚀** --- ## 🛠️ Indexing Scripts The repository includes several convenient scripts for document indexing: ### Simple Index Creation Script For quick document indexing without the UI: ```bash # Basic usage ./simple_create_index.sh "Index Name" "document.pdf" # Multiple documents ./simple_create_index.sh "Research Papers" "paper1.pdf" "paper2.pdf" "notes.txt" # Using wildcards ./simple_create_index.sh "Invoice Collection" ./invoices/*.pdf ``` **Supported file types**: PDF, TXT, DOCX, MD ### Batch Indexing Script For processing large document collections: ```bash # Using the Python batch indexing script python demo_batch_indexing.py # Or using the direct indexing script python create_index_script.py ``` These scripts automatically: - ✅ Check prerequisites (Ollama running, Python dependencies) - ✅ Validate document formats - ✅ Create database entries - ✅ Process documents with the RAG pipeline - ✅ Generate searchable indexes --- ================================================ FILE: Documentation/retrieval_pipeline.md ================================================ # 📥 Retrieval Pipeline _Maps to `rag_system/pipelines/retrieval_pipeline.py` and helpers in `retrieval/`, `rerankers/`._ ## Role Given a **user query** and one or more indexed tables, retrieve the most relevant text chunks and synthesise an answer. ## Sub-components | Stage | Module | Key Classes / Fns | Notes | |-------|--------|-------------------|-------| | Query Pre-processing | `retrieval/query_transformer.py` | `QueryTransformer`, `HyDEGenerator`, `GraphQueryTranslator` | Expands, rewrites, or translates the raw query. | | Retrieval | `retrieval/retrievers.py` | `BM25Retriever`, `DenseRetriever`, `HybridRetriever` | Abstract over LanceDB vector + FTS search. | | Reranking | `rerankers/reranker.py` | `ColBERTSmall`, fallback `bge-reranker` | Optionally improves result ordering. | | Synthesis | `pipelines/retrieval_pipeline.py` | `_synthesize_final_answer()` | Calls LLM with evidence snippets. 
| ## End-to-End Flow ```mermaid flowchart LR Q["User Query"] --> XT["Query Transformer"] XT -->|variants| RETRIEVE subgraph Retrieval RET_BM25[BM25] --> MERGE RET_DENSE[Dense Vector] --> MERGE style RET_BM25 fill:#444,stroke:#ccc,color:#fff style RET_DENSE fill:#444,stroke:#ccc,color:#fff end MERGE --> RERANK RERANK --> K[["Top-K Chunks"]] K --> SYNTH["Answer Synthesiser\n(LLM)"] SYNTH --> A["Answer + Sources"] ``` ### Narrative 1. **Query Transformer** may expand the query (keyword list, HyDE doc, KG translation) depending on `searchType`. 2. **Retrievers** execute BM25 and/or dense similarity against LanceDB. Combination controlled by `retrievalMode` and `denseWeight`. 3. **Reranker** (if `aiRerank=true` or hybrid search) scores snippets; top `rerankerTopK` chosen. 4. **Synthesiser** streams an LLM completion using the prompt described in `prompt_inventory.md` (`retrieval_pipeline.synth_final`). ## Configuration Flags (passed from UI → backend) | Flag | Default | Effect | |------|---------|--------| | `searchType` | `fts` | UI label (FTS / Dense / Hybrid). | | `retrievalK` | 10 | Initial candidate count per retriever. | | `contextWindowSize` | 5 | How many adjacent chunks to merge (late-chunk). | | `rerankerTopK` | 20 | How many docs to pass into AI reranker. | | `denseWeight` | 0.5 | When `hybrid`, linear mix weight. | | `aiRerank` | bool | Toggle reranker. | | `verify` | bool | If true, pass answer to **Verifier** component. | ## Interfaces * Reads from **LanceDB** tables whose names begin with `text_pages_`. * Calls **Ollama** generation model specified in `PIPELINE_CONFIGS`. * Exposes `RetrievalPipeline.answer_stream()` iterator consumed by SSE API. ## Extension Points * Plug new retriever by inheriting `BaseRetriever` and registering in `retrievers.py`. * Swap reranker model via `EXTERNAL_MODELS['reranker_model']`. * Custom answer prompt can be overridden by passing `prompt_override` to `_synthesize_final_answer()` (not yet surfaced in UI).
## Detailed Implementation Analysis ### Core Architecture Pattern The `RetrievalPipeline` uses **lazy initialization** for all components to avoid heavy memory usage during startup. Each component (embedder, retrievers, rerankers) is only loaded when first accessed via private `_get_*()` methods. ```python def _get_text_embedder(self): if self.text_embedder is None: self.text_embedder = select_embedder( self.config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B"), self.ollama_config.get("host") ) return self.text_embedder ``` ### Thread Safety Implementation **Critical Issue**: ColBERT reranker and model loading are not thread-safe. The system uses multiple locks: ```python # Global locks to prevent race conditions _rerank_lock: Lock = Lock() # Protects .rank() calls _ai_reranker_init_lock: Lock = Lock() # Prevents concurrent model loading _sentence_pruner_lock: Lock = Lock() # Serializes Provence model init ``` When multiple queries run in parallel, only one thread can initialize heavy models or perform reranking operations. ### Retrieval Strategy Deep-Dive #### 1. Multi-Vector Dense Retrieval (`_get_dense_retriever()`) ```python self.dense_retriever = MultiVectorRetriever( db_manager, # LanceDB connection text_embedder, # Qwen3-Embedding embedder vision_model=None, # Optional multimodal fusion_config={} # Score combination rules ) ``` **Process**: 1. Query → embedding vector (1024D for Qwen3-Embedding-0.6B) 2. LanceDB ANN search using IVF-PQ index 3. Cosine similarity scoring 4. Returns top-K with metadata #### 2. BM25 Full-Text Search (`_get_bm25_retriever()`) ```python # Uses SQLite FTS5 under the hood SELECT chunk_id, text, bm25(fts_table) as score FROM fts_table WHERE fts_table MATCH ? ORDER BY bm25(fts_table) LIMIT ? ``` **Token Processing**: - Stemming via Porter algorithm - Stop-word removal - N-gram tokenization (configurable) #### 3. 
Hybrid Score Fusion When both retrievers are enabled: ```python final_score = (1 - dense_weight) * bm25_score + dense_weight * dense_score ``` Default `dense_weight = 0.7` favors semantic over lexical matching (updated from 0.5). ### Late-Chunk Merging Algorithm **Problem**: Small chunks lose context; large chunks dilute relevance. **Solution**: Retrieve small chunks, then expand with neighbors. ```python def _get_surrounding_chunks_lancedb(self, chunk, window_size): start_index = max(0, chunk_index - window_size) end_index = chunk_index + window_size sql_filter = f"document_id = '{document_id}' AND chunk_index >= {start_index} AND chunk_index <= {end_index}" results = tbl.search().where(sql_filter).to_list() # Sort by chunk_index to maintain document order return sorted(results, key=lambda x: x.get("chunk_index", 0)) ``` **Benefits**: - Maintains granular search precision - Provides richer context for answer generation - Configurable window size (default: 5 chunks = ~2500 tokens) ### AI Reranker Implementation #### ColBERT Strategy (via rerankers-lib) ```python from rerankers import Reranker self.ai_reranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type="colbert") # Usage scores = reranker.rank(query, [doc.text for doc in candidates]) ``` **ColBERT Architecture**: - **Query encoding**: Each token → 128D vector - **Document encoding**: Each token → 128D vector - **Interaction**: MaxSim between all query-doc token pairs - **Advantage**: Fine-grained token-level matching #### Fallback: BGE Cross-Encoder ```python # When ColBERT fails/unavailable from sentence_transformers import CrossEncoder model = CrossEncoder('BAAI/bge-reranker-base') scores = model.predict([(query, doc.text) for doc in candidates]) ``` ### Answer Synthesis Pipeline #### Prompt Engineering Pattern ```python def _synthesize_final_answer(self, query: str, facts: str, *, event_callback=None): prompt = f""" You are an AI assistant specialised in answering questions from retrieved 
context. Context you receive • VERIFIED FACTS – text snippets retrieved from the user's documents. • ORIGINAL QUESTION – the user's actual query. Instructions 1. Evaluate each snippet for relevance to the ORIGINAL QUESTION 2. Synthesise an answer **using only information from relevant snippets** 3. If snippets contradict, mention the contradiction explicitly 4. If insufficient information: "I could not find that information in the provided documents." 5. Provide thorough, well-structured answer with relevant numbers/names 6. Do **not** introduce external knowledge ––––– Retrieved Snippets ––––– {facts} –––––––––––––––––––––––––––––– ORIGINAL QUESTION: "{query}" """ response = self.llm_client.complete_stream( prompt=prompt, model=self.ollama_config["generation_model"] # qwen3:8b ) for chunk in response: if event_callback: event_callback({"type": "answer_chunk", "content": chunk}) yield chunk ``` **Advanced Features**: - **Source Attribution**: Automatic citation generation - **Confidence Scoring**: Based on retrieval scores and snippet relevance - **Answer Verification**: Optional grounding check via Verifier component ### Query Processing and Transformation #### Query Decomposition ```python class QueryDecomposer: def decompose_query(self, query: str) -> List[str]: """Break complex queries into simpler sub-queries.""" decomposition_prompt = f""" Break down this complex question into 2-4 simpler sub-questions that would help answer the original question. Original question: {query} Sub-questions: 1. 2. 3. 4. 
""" response = self.llm_client.complete( prompt=decomposition_prompt, model=self.enrichment_model # qwen3:0.6b for speed ) # Parse response into list of sub-queries return self._parse_subqueries(response) ``` #### HyDE (Hypothetical Document Embeddings) ```python class HyDEGenerator: def generate_hypothetical_doc(self, query: str) -> str: """Generate hypothetical document that would answer the query.""" hyde_prompt = f""" Generate a hypothetical document passage that would perfectly answer this question: Question: {query} Hypothetical passage: """ response = self.llm_client.complete( prompt=hyde_prompt, model=self.enrichment_model ) return response.strip() ``` ### Caching and Performance Optimization #### Semantic Query Caching ```python class RetrievalPipeline: def __init__(self, config, ollama_client, ollama_config): # TTL cache for embeddings and results self.query_cache = TTLCache(maxsize=100, ttl=300) # 5 min TTL self.embedding_cache = LRUCache(maxsize=500) self.semantic_threshold = 0.98 # Similarity threshold for cache hits def get_cached_result(self, query: str, session_id: str = None) -> Optional[Dict]: """Check for semantically similar cached queries.""" query_embedding = self._get_text_embedder().create_embeddings([query])[0] for cached_query, cached_data in self.query_cache.items(): cached_embedding = cached_data["embedding"] similarity = cosine_similarity([query_embedding], [cached_embedding])[0][0] if similarity > self.semantic_threshold: # Check session scope if configured if self.cache_scope == "session" and cached_data.get("session_id") != session_id: continue print(f"🎯 Cache hit: {similarity:.3f} similarity") return cached_data["result"] return None ``` #### Batch Processing Optimizations ```python def process_query_batch(self, queries: List[str]) -> List[Dict]: """Process multiple queries efficiently.""" # Batch embed all queries query_embeddings = self._get_text_embedder().create_embeddings(queries) # Batch search results = [] for i, query in 
enumerate(queries): embedding = query_embeddings[i] # Search with pre-computed embedding dense_results = self._search_dense_with_embedding(embedding) bm25_results = self._search_bm25(query) # Combine and rerank combined = self._combine_results(dense_results, bm25_results) reranked = self._rerank_batch([query], [combined])[0] results.append(reranked) return results ``` ### Advanced Search Features #### Conversational Context Integration ```python def answer_with_history(self, query: str, conversation_history: List[Dict], **kwargs): """Answer query with conversation context.""" # Build conversational context context_prompt = self._build_conversation_context(conversation_history) # Expand query with context expanded_query = f"{context_prompt}\n\nCurrent question: {query}" # Process with expanded context return self.answer_stream(expanded_query, **kwargs) def _build_conversation_context(self, history: List[Dict]) -> str: """Build context from conversation history.""" context_parts = [] for turn in history[-3:]: # Last 3 turns for context if turn.get("role") == "user": context_parts.append(f"Previous question: {turn['content']}") elif turn.get("role") == "assistant": # Extract key points from previous answers context_parts.append(f"Previous context: {turn['content'][:200]}...") return "\n".join(context_parts) ``` #### Multi-Index Search ```python def search_multiple_indexes(self, query: str, index_ids: List[str], **kwargs): """Search across multiple document indexes.""" all_results = [] for index_id in index_ids: table_name = f"text_pages_{index_id}" try: # Search individual index index_results = self._search_single_index(query, table_name, **kwargs) # Add index metadata for result in index_results: result["source_index"] = index_id all_results.extend(index_results) except Exception as e: print(f"⚠️ Error searching index {index_id}: {e}") continue # Global reranking across all indexes if len(all_results) > kwargs.get("retrieval_k", 20): all_results = 
self._rerank_global(query, all_results, **kwargs) return all_results ``` ### Error Handling and Resilience #### Graceful Degradation ```python def answer_stream(self, query: str, **kwargs): """Main answer method with comprehensive error handling.""" try: # Try full pipeline return self._answer_stream_full_pipeline(query, **kwargs) except Exception as e: print(f"⚠️ Full pipeline failed: {e}") try: # Fallback: Dense-only search kwargs["search_type"] = "dense" kwargs["ai_rerank"] = False return self._answer_stream_fallback(query, **kwargs) except Exception as e2: print(f"⚠️ Fallback failed: {e2}") # Last resort: Direct LLM answer return self._direct_llm_answer(query) def _direct_llm_answer(self, query: str): """Direct LLM answer as last resort.""" prompt = f""" The document retrieval system is temporarily unavailable. Please provide a helpful response acknowledging this limitation. User question: {query} Response: """ response = self.llm_client.complete_stream( prompt=prompt, model=self.ollama_config["generation_model"] ) yield "⚠️ Document search unavailable. 
Providing general response:\n\n" for chunk in response: yield chunk ``` #### Recovery Mechanisms ```python def recover_from_embedding_failure(self, query: str, **kwargs): """Recover when embedding model fails.""" print("🔄 Attempting embedding model recovery...") # Try to reinitialize embedder try: self.text_embedder = None # Clear failed instance embedder = self._get_text_embedder() # Reinitialize # Test with simple query test_embedding = embedder.create_embeddings(["test"]) if test_embedding is not None: print("✅ Embedding model recovered") return True except Exception as e: print(f"❌ Recovery failed: {e}") # Fallback to BM25-only search kwargs["search_type"] = "bm25" kwargs["ai_rerank"] = False print("🔄 Falling back to keyword search only") return False ``` ### Performance Monitoring and Metrics #### Query Performance Tracking ```python class PerformanceTracker: def __init__(self): self.metrics = { "query_count": 0, "avg_response_time": 0, "cache_hit_rate": 0, "error_rate": 0, "embedding_time": 0, "retrieval_time": 0, "reranking_time": 0, "synthesis_time": 0 } @contextmanager def track_query(self, query: str): """Context manager for tracking query performance.""" start_time = time.time() try: yield # Success metrics duration = time.time() - start_time self.metrics["query_count"] += 1 self.metrics["avg_response_time"] = ( (self.metrics["avg_response_time"] * (self.metrics["query_count"] - 1) + duration) / self.metrics["query_count"] ) except Exception as e: # Error metrics self.metrics["error_rate"] = ( self.metrics["error_rate"] * self.metrics["query_count"] + 1 ) / (self.metrics["query_count"] + 1) raise e finally: self.metrics["query_count"] += 1 ``` #### Resource Usage Monitoring ```python def monitor_memory_usage(self): """Monitor memory usage of pipeline components.""" import psutil import gc process = psutil.Process() memory_info = process.memory_info() print(f"Memory Usage: {memory_info.rss / 1024 / 1024:.1f} MB") # Component-specific monitoring if 
hasattr(self, 'text_embedder') and self.text_embedder: print(f"Embedder loaded: {type(self.text_embedder).__name__}") if hasattr(self, 'ai_reranker') and self.ai_reranker: print(f"Reranker loaded: {type(self.ai_reranker).__name__}") # Suggest cleanup if memory usage is high if memory_info.rss > 8 * 1024 * 1024 * 1024: # 8GB print("⚠️ High memory usage detected - consider cleanup") gc.collect() ``` --- ## Configuration Reference ### Default Pipeline Configuration ```python RETRIEVAL_CONFIG = { "retriever": "multivector", "search_type": "hybrid", "retrieval_k": 20, "reranker_top_k": 10, "dense_weight": 0.7, "late_chunking": { "enabled": True, "window_size": 5 }, "ai_rerank": True, "verify_answers": False, "cache_enabled": True, "cache_ttl": 300, "semantic_cache_threshold": 0.98 } ``` ### Model Configuration ```python MODEL_CONFIG = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", "generation_model": "qwen3:8b", "enrichment_model": "qwen3:0.6b", "reranker_model": "answerdotai/answerai-colbert-small-v1", "fallback_reranker": "BAAI/bge-reranker-base" } ``` ### Performance Tuning ```python PERFORMANCE_CONFIG = { "batch_sizes": { "embedding": 32, "reranking": 16, "synthesis": 1 }, "timeouts": { "embedding": 30, "retrieval": 60, "reranking": 30, "synthesis": 120 }, "memory_limits": { "max_cache_size": 1000, "max_results_per_query": 100, "chunk_size_limit": 2048 } } ``` ## Extension Examples ### Custom Retriever Implementation ```python class CustomRetriever(BaseRetriever): def search(self, query: str, k: int = 10) -> List[Dict]: """Implement custom search logic.""" # Your custom retrieval implementation pass def get_embeddings(self, texts: List[str]) -> np.ndarray: """Generate embeddings for custom retrieval.""" # Your custom embedding logic pass ``` ### Custom Reranker Implementation ```python class CustomReranker(BaseReranker): def rank(self, query: str, documents: List[Dict]) -> List[Dict]: """Implement custom reranking logic.""" # Your custom reranking implementation 
pass ``` ### Custom Query Transformer ```python class CustomQueryTransformer: def transform(self, query: str, context: Dict = None) -> str: """Transform query based on context.""" # Your custom query transformation logic pass ``` ================================================ FILE: Documentation/system_overview.md ================================================ # 🏗️ RAG System - Complete System Overview _Last updated: 2025-01-09_ This document provides a comprehensive overview of the Advanced Retrieval-Augmented Generation (RAG) System, covering its architecture, components, data flow, and operational characteristics. --- ## 1. System Architecture ### 1.1 High-Level Architecture The RAG system implements a sophisticated 4-tier microservices architecture: ```mermaid graph TB subgraph "Client Layer" Browser[👤 User Browser] UI[Next.js Frontend<br/>React/TypeScript] Browser --> UI end subgraph "API Gateway Layer" Backend[Backend Server<br/>Python HTTP Server<br/>Port 8000] UI -->|REST API| Backend end subgraph "Processing Layer" RAG[RAG API Server<br/>Document Processing<br/>Port 8001] Backend -->|Internal API| RAG end subgraph "LLM Service Layer" Ollama[Ollama Server<br/>LLM Inference<br/>Port 11434] RAG -->|Model Calls| Ollama end subgraph "Storage Layer" SQLite[(SQLite Database<br/>Sessions & Metadata)] LanceDB[(LanceDB<br/>Vector Embeddings)] FileSystem[File System<br/>Documents & Indexes] Backend --> SQLite RAG --> LanceDB RAG --> FileSystem end ``` ### 1.2 Component Breakdown | Component | Technology | Port | Purpose | |-----------|------------|------|---------| | **Frontend** | Next.js 15, React 19, TypeScript | 3000 | User interface, chat interactions | | **Backend** | Python 3.11, HTTP Server | 8000 | API gateway, session management, routing | | **RAG API** | Python 3.11, Advanced NLP | 8001 | Document processing, retrieval, generation | | **Ollama** | Go-based LLM server | 11434 | Local LLM inference (embedding, generation) | | **SQLite** | Embedded database | - | Sessions, messages, index metadata | | **LanceDB** | Vector database | - | Document embeddings, similarity search | --- ## 2. Core Functionality ### 2.1 Intelligent Dual-Layer Routing The system's key innovation is its **dual-layer routing architecture** that optimizes both speed and intelligence: #### **Layer 1: Speed Optimization Routing** - **Location**: `backend/server.py` - **Purpose**: Route simple queries to Direct LLM (~1.3s) vs complex queries to RAG Pipeline (~20s) - **Decision Logic**: Pattern matching, keyword detection, query complexity analysis ```python # Example routing decisions "Hello!" → Direct LLM (greeting pattern) "What does the document say about pricing?" → RAG Pipeline (document keyword) "What's 2+2?" → Direct LLM (simple + short) "Summarize the key findings from the report" → RAG Pipeline (complex + indicators) ``` #### **Layer 2: Intelligence Optimization Routing** - **Location**: `rag_system/agent/loop.py` - **Purpose**: Within RAG pipeline, route to optimal processing method - **Methods**: - `direct_answer`: General knowledge queries - `rag_query`: Document-specific queries requiring retrieval - `graph_query`: Entity relationship queries (future feature) ### 2.2 Document Processing Pipeline #### **Indexing Process** 1. **Document Upload**: PDF files uploaded via web interface 2. 
**Text Extraction**: Docling library extracts text with layout preservation 3. **Chunking**: Intelligent chunking with configurable strategies (DocLing, Late Chunking, Standard) 4. **Embedding**: Text converted to vector embeddings using Qwen models 5. **Storage**: Vectors stored in LanceDB with metadata in SQLite #### **Retrieval Process** 1. **Query Processing**: User query analyzed and contextualized 2. **Embedding**: Query converted to vector embedding 3. **Search**: Hybrid search combining vector similarity and BM25 keyword matching 4. **Reranking**: AI-powered reranking for relevance optimization 5. **Synthesis**: LLM generates final answer using retrieved context ### 2.3 Advanced Features #### **Query Decomposition** - Complex queries automatically broken into sub-queries - Parallel processing of sub-queries for efficiency - Intelligent composition of final answers #### **Contextual Enrichment** - Conversation history integration - Context-aware query expansion - Session-based memory management #### **Verification System** - Answer verification against source documents - Confidence scoring and grounding checks - Source attribution and citation --- ## 3. Data Architecture ### 3.1 Storage Systems #### **SQLite Database** (`backend/chat_data.db`) ```sql -- Core tables sessions -- Chat sessions with metadata messages -- Individual messages and responses indexes -- Document index metadata session_indexes -- Links sessions to their indexes ``` #### **LanceDB Vector Store** (`./lancedb/`) ``` tables/ ├── text_pages_[uuid] -- Document text embeddings ├── image_pages_[uuid] -- Image embeddings (future) └── metadata_[uuid] -- Document metadata ``` #### **File System** (`./index_store/`) ``` index_store/ ├── overviews/ -- Document summaries for routing ├── bm25/ -- BM25 keyword indexes └── graph/ -- Knowledge graph data ``` ### 3.2 Data Flow 1. **Document Upload** → File System (`shared_uploads/`) 2. **Processing** → Embeddings stored in LanceDB 3. 
**Metadata** → Index info stored in SQLite 4. **Query** → Search LanceDB + SQLite coordination 5. **Response** → Message history stored in SQLite --- ## 4. Model Architecture ### 4.1 Configurable Model Pipeline The system supports multiple embedding and generation models with automatic switching: #### **Current Model Configuration** ```python EXTERNAL_MODELS = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", # 1024D "reranker_model": "answerdotai/answerai-colbert-small-v1", # ColBERT reranker "vision_model": "Qwen/Qwen-VL-Chat", # Vision model for multimodal "fallback_reranker": "BAAI/bge-reranker-base", # Backup reranker } OLLAMA_CONFIG = { "generation_model": "qwen3:8b", # High-quality generation "enrichment_model": "qwen3:0.6b", # Fast enrichment/routing "host": "http://localhost:11434" } ``` #### **Model Switching** - **Per-Session**: Each chat session can use different embedding models - **Automatic**: System automatically switches models based on index metadata - **Dynamic**: Models loaded just-in-time to optimize memory usage ### 4.2 Supported Models #### **Embedding Models** - `Qwen/Qwen3-Embedding-0.6B` (1024D) - Default, fast and high-quality #### **Generation Models** (via Ollama) - `qwen3:8b` - Primary generation model (high quality) - `qwen3:0.6b` - Fast enrichment and routing model #### **Reranking Models** - `answerdotai/answerai-colbert-small-v1` - Primary ColBERT reranker - `BAAI/bge-reranker-base` - Fallback cross-encoder reranker #### **Vision Models** (Multimodal) - `Qwen/Qwen-VL-Chat` - Vision-language model for image processing --- ## 5. 
Pipeline Configurations ### 5.1 Default Production Pipeline ```python PIPELINE_CONFIGS = { "default": { "description": "Production-ready pipeline with hybrid search, AI reranking, and verification", "storage": { "lancedb_uri": "./lancedb", "text_table_name": "text_pages_v3", "bm25_path": "./index_store/bm25", "graph_path": "./index_store/graph/knowledge_graph.gml" }, "retrieval": { "retriever": "multivector", "search_type": "hybrid", "late_chunking": { "enabled": True, "table_suffix": "_lc_v3" }, "dense": { "enabled": True, "weight": 0.7 }, "bm25": { "enabled": True, "index_name": "rag_bm25_index" } }, "embedding_model_name": "Qwen/Qwen3-Embedding-0.6B", "reranker": { "enabled": True, "model_name": "answerdotai/answerai-colbert-small-v1", "top_k": 20 } } } ``` ### 5.2 Processing Options #### **Chunking Strategies** - **Standard**: Fixed-size chunks with overlap - **DocLing**: Structure-aware chunking using DocLing library - **Late Chunking**: Small chunks expanded at query time #### **Enrichment Options** - **Contextual Enrichment**: AI-generated chunk summaries - **Overview Building**: Document-level summaries for routing - **Graph Extraction**: Entity and relationship extraction --- ## 6. 
Performance Characteristics ### 6.1 Response Times | Operation | Time Range | Notes | |-----------|------------|-------| | Simple Chat | 1-3 seconds | Direct LLM, no retrieval | | Document Query | 5-15 seconds | Includes retrieval and reranking | | Complex Analysis | 15-30 seconds | Multi-step reasoning | | Document Indexing | 2-5 min/100MB | Depends on enrichment settings | ### 6.2 Memory Usage | Component | Memory Usage | Notes | |-----------|--------------|-------| | Embedding Model | 1-2GB | Qwen3-Embedding-0.6B | | Generation Model | 8-16GB | qwen3:8b | | Reranker Model | 500MB-1GB | ColBERT reranker | | Database Cache | 500MB-2GB | LanceDB and SQLite | ### 6.3 Scalability - **Concurrent Users**: 5-10 users with 16GB RAM - **Document Capacity**: 10,000+ documents per index - **Query Throughput**: 10-20 queries/minute per instance - **Storage**: Approximately 1MB per 100 pages indexed --- ## 7. Security & Privacy ### 7.1 Data Privacy - **Local Processing**: All AI models run locally via Ollama - **No External Calls**: No data sent to external APIs - **Document Isolation**: Documents stored locally with session-based access - **User Isolation**: Each session maintains separate context --- ## 8. 
Configuration & Customization ### 8.1 Model Configuration Models can be configured in `rag_system/main.py`: ```python # Embedding model configuration EXTERNAL_MODELS = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", # Your preferred model "reranker_model": "answerdotai/answerai-colbert-small-v1", } # Generation model configuration OLLAMA_CONFIG = { "generation_model": "qwen3:8b", # Your LLM model "enrichment_model": "qwen3:0.6b", # Your fast model } ``` ### 8.2 Pipeline Configuration Processing behavior configured in `PIPELINE_CONFIGS`: ```python PIPELINE_CONFIGS = { "retrieval": { "search_type": "hybrid", "dense": {"weight": 0.7}, "bm25": {"enabled": True} }, "chunking": { "chunk_size": 512, "chunk_overlap": 64, "enable_latechunk": True, "enable_docling": True } } ``` ### 8.3 UI Configuration Frontend behavior configured in environment variables: ```bash NEXT_PUBLIC_API_URL=http://localhost:8000 NEXT_PUBLIC_ENABLE_STREAMING=true NEXT_PUBLIC_MAX_FILE_SIZE=50MB ``` --- ## 9. Monitoring & Observability ### 9.1 Logging System - **Structured Logging**: JSON-formatted logs with timestamps - **Log Levels**: DEBUG, INFO, WARNING, ERROR - **Log Rotation**: Automatic log file rotation - **Component Isolation**: Separate logs per service ### 9.2 Health Monitoring - **Health Endpoints**: `/health` on all services - **Service Dependencies**: Cascading health checks - **Performance Metrics**: Response times, error rates - **Resource Monitoring**: Memory, CPU, disk usage ### 9.3 Debugging Features - **Debug Mode**: Detailed operation tracing - **Query Inspection**: Step-by-step query processing - **Model Switching Logs**: Embedding model change tracking - **Error Reporting**: Comprehensive error context --- ## ⚙️ Configuration Modes The system supports multiple configuration modes optimized for different use cases: ### **Default Mode** (`"default"`) - **Description**: Production-ready pipeline with full features - **Search**: Hybrid (dense + BM25) with 0.7 dense weight - 
**Reranking**: AI-powered ColBERT reranker - **Query Processing**: Query decomposition enabled - **Verification**: Grounding verification enabled - **Performance**: ~3-8 seconds per query - **Memory**: ~10-16GB (with models loaded) ### **Fast Mode** (`"fast"`) - **Description**: Speed-optimized pipeline with minimal overhead - **Search**: Vector-only (no BM25, no late chunking) - **Reranking**: Disabled - **Query Processing**: Single-pass, no decomposition - **Verification**: Disabled - **Performance**: ~1-3 seconds per query - **Memory**: ~8-12GB (with models loaded) ### **BM25 Mode** (`"bm25"`) - **Description**: Traditional keyword-based search - **Search**: BM25 only - **Use Case**: Exact keyword matching, legacy compatibility ### **Graph RAG Mode** (`"graph_rag"`) - **Description**: Knowledge graph integration (currently disabled) - **Status**: Available for future implementation - **Use Case**: Relationship-aware retrieval --- ## 10. Development & Extension ### 10.1 Architecture Principles - **Modular Design**: Clear separation of concerns - **Configuration-Driven**: Behavior controlled via config files - **Lazy Loading**: Components loaded on-demand - **Thread Safety**: Proper synchronization for concurrent access ### 10.2 Extension Points - **Custom Retrievers**: Implement `BaseRetriever` interface - **Custom Chunkers**: Extend chunking strategies - **Custom Models**: Add new embedding or generation models - **Custom Pipelines**: Create specialized processing workflows ### 10.3 Testing Strategy - **Unit Tests**: Individual component testing - **Integration Tests**: End-to-end workflow testing - **Performance Tests**: Load and stress testing - **Health Checks**: Automated system validation --- > **Note**: This overview reflects the current implementation as of 2025-01-09. For the latest changes, check the git history and individual component documentation. 
================================================ FILE: Documentation/triage_system.md ================================================ # 🔀 Triage / Routing System _Maps to `rag_system/agent/loop.Agent._should_use_rag`, `_route_using_overviews`, and the fast-path router in `backend/server.py`._ ## Purpose Determine, for every incoming query, whether it should be answered by: 1. **Direct LLM Generation** (no retrieval) — faster, cheaper. 2. **Retrieval-Augmented Generation (RAG)** — when the answer likely requires document context. ## Decision Signals | Signal | Source | Notes | |--------|--------|-------| | Keyword/regex check | `backend/server.py` (fast path) | Hard-coded quick wins (`what time`, `define`, etc.). | | Index presence | SQLite (session → indexes) | If no indexes linked, direct LLM. | | Overview routing | `_route_using_overviews()` | Uses document overviews and enrichment model to predict relevance. | | LLM router prompt | `agent/loop.py` lines 648-665 | Final arbitrator (Ollama call, JSON output). | ## High-level Flow ```mermaid flowchart TD Q["Incoming Query"] --> S1{Session\nHas Indexes?} S1 -- no --> LLM["Direct LLM Generation"] S1 -- yes --> S2{Fast Regex\nHeuristics} S2 -- match--> LLM S2 -- no --> S3{Overview\nRelevance > τ?} S3 -- low --> LLM S3 -- high --> S4[LLM Router\n(prompt @648)] S4 -- "route: RAG" --> RAG["Retrieval Pipeline"] S4 -- "route: DIRECT" --> LLM ``` ## Detailed Sequence (Code-level) 1. **backend/server.py** * `handle_session_chat()` builds `router_prompt` (line ~435) and makes a **first pass** decision before calling the heavy agent code. 2. **agent.loop._should_use_rag()** * Re-evaluates using richer features (e.g., token count, query type). 3. **Overviews Phase** (`_route_using_overviews()`) * Loads JSONL overviews file per index. * Calls enrichment model (`qwen3:0.6b`) with prompt: _"Does this overview mention … ? "_ → returns yes/no. 4. 
**LLM Router** (prompt lines 648-665) * JSON-only response `{ "route": "RAG" | "DIRECT" }`. ## Interfaces & Dependencies | Component | Calls / Data | |-----------|--------------| | SQLite `chat_sessions` | Reads `indexes` column to know linked index IDs. | | LanceDB Overviews | Reads `index_store/overviews/<index_id>.jsonl`. | | `OllamaClient` | Generates LLM router decision. | ## Config Flags * `PIPELINE_CONFIGS.triage.enabled` – global toggle. * Env var `TRIAGE_OVERVIEW_THRESHOLD` – min similarity score to prefer RAG (default 0.35). ## Failure / Fallback Modes 1. If overview file missing → skip to LLM router. 2. If LLM router errors → default to RAG (safer) but log warning. --- _Keep this document updated whenever routing heuristics, thresholds, or prompt wording change._ ================================================ FILE: Documentation/verifier.md ================================================ # ✅ Answer Verifier _File: `rag_system/agent/verifier.py`_ ## Objective Assess whether an answer produced by RAG is **grounded** in the retrieved context snippets. ## Prompt (see `prompt_inventory.md` `verifier.fact_check`) Strict JSON schema: ```jsonc { "verdict": "SUPPORTED" | "NOT_SUPPORTED" | "NEEDS_CLARIFICATION", "is_grounded": true | false, "reasoning": "< ≤30 words >", "confidence_score": 0-100 } ``` ## Sequence Diagram ```mermaid sequenceDiagram participant RP as Retrieval Pipeline participant V as Verifier participant LLM as Ollama RP->>V: query, context, answer V->>LLM: verification prompt LLM-->>V: JSON verdict V-->>RP: VerificationResult ``` ## Usage Sites | Caller | Code | When | |--------|------|------| | `RetrievalPipeline.answer_stream()` | `pipelines/retrieval_pipeline.py` | If `verify=true` flag from frontend. | | `Agent.loop.run()` | fallback path | Experimental for composed answers. | ## Config | Flag | Default | Meaning | |------|---------|---------| | `verify` | false | Frontend toggle; if true verifier runs. 
| | `generation_model` | `qwen3:8b` | Same model as answer generation. | ## Failure Modes * If LLM returns invalid JSON → parse exception handled, result = NOT_SUPPORTED. * If verification call times out → pipeline logs but still returns answer (unverified). --- _Keep updated when schema or usage flags change._ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2025 PromptEngineer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # LocalGPT - Private Document Intelligence Platform

PromtEngineer%2FlocalGPT | Trendshift

[![GitHub Stars](https://img.shields.io/github/stars/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/network/members) [![GitHub Issues](https://img.shields.io/github/issues/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/issues) [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/pulls) [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg?style=flat-square)](https://www.python.org/downloads/) [![License](https://img.shields.io/badge/license-MIT-green.svg?style=flat-square)](LICENSE) [![Docker](https://img.shields.io/badge/docker-supported-blue.svg?style=flat-square)](https://www.docker.com/)

Follow on X Join our Discord

## 🚀 What is LocalGPT? LocalGPT is a **fully private, on-premise Document Intelligence platform**. Ask questions, summarise, and uncover insights from your files with state-of-the-art AI—no data ever leaves your machine. More than a traditional RAG (Retrieval-Augmented Generation) tool, LocalGPT features a **hybrid search engine** that blends semantic similarity, keyword matching, and [Late Chunking](https://jina.ai/news/late-chunking-in-long-context-embedding-models/) for long-context precision. A **smart router** automatically selects between RAG and direct LLM answering for every query, while **contextual enrichment** and sentence-level [Context Pruning](https://huggingface.co/naver/provence-reranker-debertav3-v1) surface only the most relevant content. An independent **verification** pass adds an extra layer of accuracy. The architecture is **modular and lightweight**—enable only the components you need. With a pure-Python core and minimal dependencies, LocalGPT is simple to deploy, run, and maintain on any infrastructure. The system has minimal dependencies on frameworks and libraries, making it easy to deploy and maintain. The RAG system is pure Python and does not require any additional dependencies. ## ▶️ Video Watch this [video](https://youtu.be/JTbtGH3secI) to get started with LocalGPT. | Home | Create Index | Chat | |------|--------------|------| | ![](Documentation/images/Home.png) | ![](Documentation/images/Index%20Creation.png) | ![](Documentation/images/Retrieval%20Process.png) | ## ✨ Features - **Utmost Privacy**: Your data remains on your computer, ensuring 100% security. - **Versatile Model Support**: Seamlessly integrate a variety of open-source models via Ollama. - **Diverse Embeddings**: Choose from a range of open-source embeddings. - **Reuse Your LLM**: Once downloaded, reuse your LLM without the need for repeated downloads. - **Chat History**: Remembers your previous conversations (in a session). 
- **API**: LocalGPT has an API that you can use for building RAG Applications. - **GPU, CPU, HPU & MPS Support**: Supports multiple platforms out of the box, Chat with your data using `CUDA`, `CPU`, `HPU (Intel® Gaudi®)` or `MPS` and more! ### 📖 Document Processing - **Multi-format Support**: PDF, DOCX, TXT, Markdown, and more (Currently only PDF is supported) - **Contextual Enrichment**: Enhanced document understanding with AI-generated context, inspired by [Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval) - **Batch Processing**: Handle multiple documents simultaneously ### 🤖 AI-Powered Chat - **Natural Language Queries**: Ask questions in plain English - **Source Attribution**: Every answer includes document references - **Smart Routing**: Automatically chooses between RAG and direct LLM responses - **Query Decomposition**: Breaks complex queries into sub-questions for better answers - **Semantic Caching**: TTL-based caching with similarity matching for faster responses - **Session-Aware History**: Maintains conversation context across interactions - **Answer Verification**: Independent verification pass for accuracy - **Multiple AI Models**: Ollama for inference, HuggingFace for embeddings and reranking ### 🛠️ Developer-Friendly - **RESTful APIs**: Complete API access for integration - **Real-time Progress**: Live updates during document processing - **Flexible Configuration**: Customize models, chunk sizes, and search parameters - **Extensible Architecture**: Plugin system for custom components ### 🎨 Modern Interface - **Intuitive Web UI**: Clean, responsive design - **Session Management**: Organize conversations by topic - **Index Management**: Easy document collection management - **Real-time Chat**: Streaming responses for immediate feedback --- ## 🚀 Quick Start Note: The installation is currently only tested on macOS. 
### Prerequisites - Python 3.8 or higher (tested with Python 3.11.5) - Node.js 16+ and npm (tested with Node.js 23.10.0, npm 10.9.2) - Docker (optional, for containerized deployment) - 8GB+ RAM (16GB+ recommended) - Ollama (required for both deployment approaches) ### ***NOTE*** Before this branch is moved to the main branch, please clone this branch for installation: ```bash git clone -b localgpt-v2 https://github.com/PromtEngineer/localGPT.git cd localGPT ``` ### Option 1: Docker Deployment ```bash # Clone the repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Install Ollama locally (required even for Docker) curl -fsSL https://ollama.ai/install.sh | sh ollama pull qwen3:0.6b ollama pull qwen3:8b # Start Ollama ollama serve # Start with Docker (in a new terminal) ./start-docker.sh # Access the application open http://localhost:3000 ``` **Docker Management Commands:** ```bash # Check container status docker compose ps # View logs docker compose logs -f # Stop containers ./start-docker.sh stop ``` ### Option 2: Direct Development (Recommended for Development) ```bash # Clone the repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Install Python dependencies pip install -r requirements.txt # Key dependencies installed: # - torch==2.4.1, transformers==4.51.0 (AI models) # - lancedb (vector database) # - rank_bm25, fuzzywuzzy (search algorithms) # - sentence_transformers, rerankers (embedding/reranking) # - docling (document processing) # - colpali-engine (multimodal processing - support coming soon) # Install Node.js dependencies npm install # Install and start Ollama curl -fsSL https://ollama.ai/install.sh | sh ollama pull qwen3:0.6b ollama pull qwen3:8b ollama serve # Start the system (in a new terminal) python run_system.py # Access the application open http://localhost:3000 ``` **System Management:** ```bash # Check system health (comprehensive diagnostics) python system_health_check.py # Check service 
status and health python run_system.py --health # Start in production mode python run_system.py --mode prod # Skip frontend (backend + RAG API only) python run_system.py --no-frontend # View aggregated logs python run_system.py --logs-only # Stop all services python run_system.py --stop # Or press Ctrl+C in the terminal running python run_system.py ``` **Service Architecture:** The `run_system.py` launcher manages four key services: - **Ollama Server** (port 11434): AI model serving - **RAG API Server** (port 8001): Document processing and retrieval - **Backend Server** (port 8000): Session management and API endpoints - **Frontend Server** (port 3000): React/Next.js web interface ### Option 3: Manual Component Startup ```bash # Terminal 1: Start Ollama ollama serve # Terminal 2: Start RAG API python -m rag_system.api_server # Terminal 3: Start Backend cd backend && python server.py # Terminal 4: Start Frontend npm run dev # Access at http://localhost:3000 ``` --- ### Detailed Installation #### 1. Install System Dependencies **Ubuntu/Debian:** ```bash sudo apt update sudo apt install python3.8 python3-pip nodejs npm docker.io docker-compose ``` **macOS:** ```bash brew install python@3.8 node npm docker docker-compose ``` **Windows:** ```bash # Install Python 3.8+, Node.js, and Docker Desktop # Then use PowerShell or WSL2 ``` #### 2. Install AI Models **Install Ollama (Recommended):** ```bash # Install Ollama curl -fsSL https://ollama.ai/install.sh | sh # Pull recommended models ollama pull qwen3:0.6b # Fast generation model ollama pull qwen3:8b # High-quality generation model ``` #### 3. 
Configure Environment ```bash # Copy environment template cp .env.example .env # Edit configuration nano .env ``` **Key Configuration Options:** ```env # AI Models (referenced in rag_system/main.py) OLLAMA_HOST=http://localhost:11434 # Database Paths (used by backend and RAG system) DATABASE_PATH=./backend/chat_data.db VECTOR_DB_PATH=./lancedb # Server Settings (used by run_system.py) BACKEND_PORT=8000 FRONTEND_PORT=3000 RAG_API_PORT=8001 # Optional: Override default models GENERATION_MODEL=qwen3:8b ENRICHMENT_MODEL=qwen3:0.6b EMBEDDING_MODEL=Qwen/Qwen3-Embedding-0.6B RERANKER_MODEL=answerdotai/answerai-colbert-small-v1 ``` #### 4. Initialize the System ```bash # Run system health check python system_health_check.py # Initialize databases python -c "from backend.database import ChatDatabase; ChatDatabase().init_database()" # Test installation python -c "from rag_system.main import get_agent; print('✅ Installation successful!')" # Validate complete setup python run_system.py --health ``` --- ## 🎯 Getting Started ### 1. Create Your First Index An **index** is a collection of processed documents that you can chat with. #### Using the Web Interface: 1. Open http://localhost:3000 2. Click "Create New Index" 3. Upload your documents (PDF, DOCX, TXT) 4. Configure processing options 5. Click "Build Index" #### Using Scripts: ```bash # Simple script approach ./simple_create_index.sh "My Documents" "path/to/document.pdf" # Interactive script python create_index_script.py ``` #### Using API: ```bash # Create index curl -X POST http://localhost:8000/indexes \ -H "Content-Type: application/json" \ -d '{"name": "My Index", "description": "My documents"}' # Upload documents curl -X POST http://localhost:8000/indexes/INDEX_ID/upload \ -F "files=@document.pdf" # Build index curl -X POST http://localhost:8000/indexes/INDEX_ID/build ``` ### 2. Start Chatting Once your index is built: 1. **Create a Chat Session**: Click "New Chat" or use an existing session 2. 
**Select Your Index**: Choose which document collection to query 3. **Ask Questions**: Type natural language questions about your documents 4. **Get Answers**: Receive AI-generated responses with source citations ### 3. Advanced Features #### Custom Model Configuration ```bash # Use different models for different tasks curl -X POST http://localhost:8000/sessions \ -H "Content-Type: application/json" \ -d '{ "title": "High Quality Session", "model": "qwen3:8b", "embedding_model": "Qwen/Qwen3-Embedding-4B" }' ``` #### Batch Document Processing ```bash # Process multiple documents at once python demo_batch_indexing.py --config batch_indexing_config.json ``` #### API Integration ```python import requests # Chat with your documents via API response = requests.post('http://localhost:8000/chat', json={ 'query': 'What are the key findings in the research papers?', 'session_id': 'your-session-id', 'search_type': 'hybrid', 'retrieval_k': 20 }) print(response.json()['response']) ``` --- ## 🔧 Configuration ### Model Configuration LocalGPT supports multiple AI model providers with centralized configuration: #### Ollama Models (Local Inference) ```python OLLAMA_CONFIG = { "host": "http://localhost:11434", "generation_model": "qwen3:8b", # Main text generation "enrichment_model": "qwen3:0.6b" # Lightweight routing/enrichment } ``` #### External Models (HuggingFace Direct) ```python EXTERNAL_MODELS = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", # 1024 dimensions "reranker_model": "answerdotai/answerai-colbert-small-v1", # ColBERT reranker "fallback_reranker": "BAAI/bge-reranker-base" # Backup reranker } ``` ### Pipeline Configuration LocalGPT offers two main pipeline configurations: #### Default Pipeline (Production-Ready) ```python "default": { "description": "Production-ready pipeline with hybrid search, AI reranking, and verification", "storage": { "lancedb_uri": "./lancedb", "text_table_name": "text_pages_v3", "bm25_path": "./index_store/bm25" }, "retrieval": { 
"retriever": "multivector", "search_type": "hybrid", "late_chunking": {"enabled": True}, "dense": {"enabled": True, "weight": 0.7}, "bm25": {"enabled": True} }, "reranker": { "enabled": True, "type": "ai", "strategy": "rerankers-lib", "model_name": "answerdotai/answerai-colbert-small-v1", "top_k": 10 }, "query_decomposition": {"enabled": True, "max_sub_queries": 3}, "verification": {"enabled": True}, "retrieval_k": 20, "contextual_enricher": {"enabled": True, "window_size": 1} } ``` #### Fast Pipeline (Speed-Optimized) ```python "fast": { "description": "Speed-optimized pipeline with minimal overhead", "retrieval": { "search_type": "vector_only", "late_chunking": {"enabled": False} }, "reranker": {"enabled": False}, "query_decomposition": {"enabled": False}, "verification": {"enabled": False}, "retrieval_k": 10, "contextual_enricher": {"enabled": False} } ``` ### Search Configuration ```python SEARCH_CONFIG = { 'hybrid': { 'dense_weight': 0.7, 'sparse_weight': 0.3, 'retrieval_k': 20, 'reranker_top_k': 10 } } ``` --- ## 🛠️ Troubleshooting ### Common Issues #### Installation Problems ```bash # Check Python version python --version # Should be 3.8+ # Check dependencies pip list | grep -E "(torch|transformers|lancedb)" # Reinstall dependencies pip install -r requirements.txt --force-reinstall ``` #### Model Loading Issues ```bash # Check Ollama status ollama list curl http://localhost:11434/api/tags # Pull missing models ollama pull qwen3:0.6b ``` #### Database Issues ```bash # Check database connectivity python -c "from backend.database import ChatDatabase; db = ChatDatabase(); print('✅ Database OK')" # Reset database (WARNING: This deletes all data) rm backend/chat_data.db python -c "from backend.database import ChatDatabase; ChatDatabase().init_database()" ``` #### Performance Issues ```bash # Check system resources python system_health_check.py # Monitor memory usage htop # or Task Manager on Windows # Optimize for low-memory systems export 
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 ``` ### Getting Help 1. **Check Logs**: The system creates structured logs in the `logs/` directory: - `logs/system.log`: Main system events and errors - `logs/ollama.log`: Ollama server logs - `logs/rag-api.log`: RAG API processing logs - `logs/backend.log`: Backend server logs - `logs/frontend.log`: Frontend build and runtime logs 2. **System Health**: Run comprehensive diagnostics: ```bash python system_health_check.py # Full system diagnostics python run_system.py --health # Service status check ``` 3. **Health Endpoints**: Check individual service health: - Backend: `http://localhost:8000/health` - RAG API: `http://localhost:8001/health` - Ollama: `http://localhost:11434/api/tags` 4. **Documentation**: Check the [Technical Documentation](TECHNICAL_DOCS.md) 5. **GitHub Issues**: Report bugs and request features 6. **Community**: Join our Discord/Slack community --- ## 🔗 API Reference ### Core Endpoints #### Chat API ```http # Session-based chat (recommended) POST /sessions/{session_id}/chat Content-Type: application/json { "query": "What are the main topics discussed?", "search_type": "hybrid", "retrieval_k": 20, "ai_rerank": true, "context_window_size": 5 } # Legacy chat endpoint POST /chat Content-Type: application/json { "query": "What are the main topics discussed?", "session_id": "uuid", "search_type": "hybrid", "retrieval_k": 20 } ``` #### Index Management ```http # Create index POST /indexes Content-Type: application/json { "name": "My Index", "description": "Description", "config": "default" } # Get all indexes GET /indexes # Get specific index GET /indexes/{id} # Upload documents to index POST /indexes/{id}/upload Content-Type: multipart/form-data files: [file1.pdf, file2.pdf, ...] 
# Build index (process uploaded documents) POST /indexes/{id}/build Content-Type: application/json { "config_mode": "default", "enable_enrich": true, "chunk_size": 512 } # Delete index DELETE /indexes/{id} ``` #### Session Management ```http # Create session POST /sessions Content-Type: application/json { "title": "My Session", "model": "qwen3:0.6b" } # Get all sessions GET /sessions # Get specific session GET /sessions/{session_id} # Get session documents GET /sessions/{session_id}/documents # Get session indexes GET /sessions/{session_id}/indexes # Link index to session POST /sessions/{session_id}/indexes/{index_id} # Delete session DELETE /sessions/{session_id} # Rename session POST /sessions/{session_id}/rename Content-Type: application/json { "new_title": "Updated Session Name" } ``` ### Advanced Features #### Query Decomposition The system can break complex queries into sub-questions for better answers: ```http POST /sessions/{session_id}/chat Content-Type: application/json { "query": "Compare the methodologies and analyze their effectiveness", "query_decompose": true, "compose_sub_answers": true } ``` #### Answer Verification Independent verification pass for accuracy using a separate verification model: ```http POST /sessions/{session_id}/chat Content-Type: application/json { "query": "What are the key findings?", "verify": true } ``` #### Contextual Enrichment Document context enrichment during indexing for better understanding: ```bash # Enable during index building POST /indexes/{id}/build { "enable_enrich": true, "window_size": 2 } ``` #### Late Chunking Better context preservation by chunking after embedding: ```bash # Configure in pipeline "late_chunking": {"enabled": true} ``` #### Streaming Chat ```http POST /chat/stream Content-Type: application/json { "query": "Explain the methodology", "session_id": "uuid", "stream": true } ``` #### Batch Processing ```bash # Using the batch indexing script python demo_batch_indexing.py --config 
batch_indexing_config.json # Example batch configuration (batch_indexing_config.json): { "index_name": "Sample Batch Index", "index_description": "Example batch index configuration", "documents": [ "./rag_system/documents/invoice_1039.pdf", "./rag_system/documents/invoice_1041.pdf" ], "processing": { "chunk_size": 512, "chunk_overlap": 64, "enable_enrich": true, "enable_latechunk": true, "enable_docling": true, "embedding_model": "Qwen/Qwen3-Embedding-0.6B", "generation_model": "qwen3:0.6b", "retrieval_mode": "hybrid", "window_size": 2 } } ``` ```http # API endpoint for batch processing POST /batch/index Content-Type: application/json { "file_paths": ["doc1.pdf", "doc2.pdf"], "config": { "chunk_size": 512, "enable_enrich": true, "enable_latechunk": true, "enable_docling": true } } ``` For complete API documentation, see [API_REFERENCE.md](API_REFERENCE.md). --- ## 🏗️ Architecture LocalGPT is built with a modular, scalable architecture: ```mermaid graph TB UI[Web Interface] --> API[Backend API] API --> Agent[RAG Agent] Agent --> Retrieval[Retrieval Pipeline] Agent --> Generation[Generation Pipeline] Retrieval --> Vector[Vector Search] Retrieval --> BM25[BM25 Search] Retrieval --> Rerank[Reranking] Vector --> LanceDB[(LanceDB)] BM25 --> BM25DB[(BM25 Index)] Generation --> Ollama[Ollama Models] Generation --> HF[Hugging Face Models] API --> SQLite[(SQLite DB)] ``` Overview of the Retrieval Agent ```mermaid graph TD classDef llmcall fill:#e6f3ff,stroke:#007bff; classDef pipeline fill:#e6ffe6,stroke:#28a745; classDef cache fill:#fff3e0,stroke:#fd7e14; classDef logic fill:#f8f9fa,stroke:#6c757d; classDef thread stroke-dasharray: 5 5; A(Start: Agent.run) --> B_asyncio.run(_run_async); B --> C{_run_async}; C --> C1[Get Chat History]; C1 --> T1[Build Triage Prompt
Query + Doc Overviews ]; T1 --> T2["(asyncio.to_thread)
LLM Triage: RAG or LLM_DIRECT?"]; class T2 llmcall,thread; T2 --> T3{Decision?}; T3 -- RAG --> RAG_Path; T3 -- LLM_DIRECT --> LLM_Path; subgraph RAG Path RAG_Path --> R1[Format Query + History]; R1 --> R2["(asyncio.to_thread)
Generate Query Embedding"]; class R2 pipeline,thread; R2 --> R3{{Check Semantic Cache}}; class R3 cache; R3 -- Hit --> R_Cache_Hit(Return Cached Result); R_Cache_Hit --> R_Hist_Update; R3 -- Miss --> R4{Decomposition
Enabled?}; R4 -- Yes --> R5["(asyncio.to_thread)
Decompose Raw Query"]; class R5 llmcall,thread; R5 --> R6{{Run Sub-Queries
Parallel RAG Pipeline}}; class R6 pipeline,thread; R6 --> R7[Collect Results & Docs]; R7 --> R8["(asyncio.to_thread)
Compose Final Answer"]; class R8 llmcall,thread; R8 --> V1(RAG Answer); R4 -- No --> R9["(asyncio.to_thread)
Run Single Query
(RAG Pipeline)"]; class R9 pipeline,thread; R9 --> V1; V1 --> V2{{Verification
await verify_async}}; class V2 llmcall; V2 --> V3(Final RAG Result); V3 --> R_Cache_Store{{Store in Semantic Cache}}; class R_Cache_Store cache; R_Cache_Store --> FinalResult; end subgraph Direct LLM Path LLM_Path --> L1[Format Query + History]; L1 --> L2["(asyncio.to_thread)
Generate Direct LLM Answer
(No RAG)"]; class L2 llmcall,thread; L2 --> FinalResult(Final Direct Result); end FinalResult --> R_Hist_Update(Update Chat History); R_Hist_Update --> ZZZ(End: Return Result); ``` --- ## 🤝 Contributing We welcome contributions from developers of all skill levels! LocalGPT is an open-source project that benefits from community involvement. ### 🚀 Quick Start for Contributors ```bash # Fork and clone the repository git clone https://github.com/PromtEngineer/localGPT.git cd localGPT # Set up development environment pip install -r requirements.txt npm install # Install Ollama and models curl -fsSL https://ollama.ai/install.sh | sh ollama pull qwen3:0.6b ollama pull qwen3:8b # Verify setup python system_health_check.py python run_system.py --mode dev ``` ### 📋 How to Contribute 1. **🐛 Report Bugs**: Use our [bug report template](.github/ISSUE_TEMPLATE/bug_report.md) 2. **💡 Request Features**: Use our [feature request template](.github/ISSUE_TEMPLATE/feature_request.md) 3. **🔧 Submit Code**: Follow our [development workflow](CONTRIBUTING.md#development-workflow) 4. **📚 Improve Docs**: Help make our documentation better ### 📖 Detailed Guidelines For comprehensive contributing guidelines, including: - Development setup and workflow - Coding standards and best practices - Testing requirements - Documentation standards - Release process **👉 See our [CONTRIBUTING.md](CONTRIBUTING.md) guide** --- ## 📄 License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. For models, please check their respective licenses. --- ## 📞 Support - **Documentation**: [Technical Docs](TECHNICAL_DOCS.md) - **Issues**: [GitHub Issues](https://github.com/PromtEngineer/localGPT/issues) - **Discussions**: [GitHub Discussions](https://github.com/PromtEngineer/localGPT/discussions) - **Business Deployment and Customization**: [Contact Us](https://tally.so/r/wv6R2d) ---
## Star History [![Star History Chart](https://api.star-history.com/svg?repos=PromtEngineer/localGPT&type=Date)](https://star-history.com/#PromtEngineer/localGPT&Date) ================================================ FILE: WATSONX_README.md ================================================ # Watson X Integration with Granite Models This branch adds support for IBM Watson X AI with Granite models as an alternative to Ollama for running LocalGPT. ## Overview LocalGPT now supports two LLM backends: 1. **Ollama** (default): Run models locally using Ollama 2. **Watson X**: Use IBM's Granite models hosted on Watson X AI ## What Changed - Added `WatsonXClient` class in `rag_system/utils/watsonx_client.py` that provides an Ollama-compatible interface for Watson X - Updated `factory.py` and `main.py` to support backend switching via environment variable - Added `ibm-watsonx-ai` SDK dependency to `requirements.txt` - Configuration now supports both backends through environment variables ## Prerequisites To use Watson X with Granite models, you need: 1. IBM Cloud account with Watson X access 2. Watson X API key 3. Watson X project ID ### Getting Your Credentials 1. Go to [IBM Cloud](https://cloud.ibm.com/) 2. Navigate to Watson X AI service 3. Create or select a project 4. Get your API key from IBM Cloud IAM 5. 
Copy your project ID from the Watson X project settings ## Configuration ### Environment Variables Create a `.env` file or set these environment variables: ```bash # Choose LLM backend (default: ollama) LLM_BACKEND=watsonx # Watson X Configuration WATSONX_API_KEY=your_api_key_here WATSONX_PROJECT_ID=your_project_id_here WATSONX_URL=https://us-south.ml.cloud.ibm.com # Model Configuration WATSONX_GENERATION_MODEL=ibm/granite-13b-chat-v2 WATSONX_ENRICHMENT_MODEL=ibm/granite-8b-japanese ``` ### Available Granite Models Watson X offers several Granite models: - `ibm/granite-13b-chat-v2` - General purpose chat model - `ibm/granite-13b-instruct-v2` - Instruction-following model - `ibm/granite-20b-multilingual` - Multilingual support - `ibm/granite-8b-japanese` - Lightweight Japanese model - `ibm/granite-3b-code-instruct` - Code generation model For a full list of available models, visit the [Watson X documentation](https://www.ibm.com/docs/en/watsonx/saas?topic=solutions-supported-foundation-models). ## Installation 1. 
Install the Watson X SDK: ```bash pip install "ibm-watsonx-ai>=1.3.39" ``` Or install all dependencies: ```bash pip install -r rag_system/requirements.txt ``` ## Usage ### Running with Watson X Once configured, simply set the environment variable and run as normal: ```bash export LLM_BACKEND=watsonx python -m rag_system.main api ``` Or in Python: ```python import os os.environ['LLM_BACKEND'] = 'watsonx' from rag_system.factory import get_agent # Get agent with Watson X backend agent = get_agent(mode="default") # Use as normal result = agent.run("What is artificial intelligence?") print(result) ``` ### Switching Between Backends You can easily switch between Ollama and Watson X: ```bash # Use Ollama (local) export LLM_BACKEND=ollama python -m rag_system.main api # Use Watson X (cloud) export LLM_BACKEND=watsonx python -m rag_system.main api ``` ## Features The Watson X client supports all the key features used by LocalGPT: - ✅ Text generation / completion - ✅ Async generation - ✅ Streaming responses - ✅ Embeddings (if using Watson X embedding models) - ✅ Custom generation parameters (temperature, max_tokens, top_p, top_k) - ⚠️ Image/multimodal support (limited, depends on model availability) ## API Compatibility The `WatsonXClient` provides the same interface as `OllamaClient`: ```python from rag_system.utils.watsonx_client import WatsonXClient client = WatsonXClient( api_key="your_api_key", project_id="your_project_id" ) # Generate completion response = client.generate_completion( model="ibm/granite-13b-chat-v2", prompt="Explain quantum computing" ) print(response['response']) # Stream completion for chunk in client.stream_completion( model="ibm/granite-13b-chat-v2", prompt="Write a story about AI" ): print(chunk, end='', flush=True) ``` ## Limitations 1. **Embedding Models**: Watson X uses different embedding models than Ollama. Make sure to configure embedding models appropriately in `main.py` if needed. 2. 
**Multimodal Support**: Image support varies by model availability in Watson X. Not all Granite models support multimodal inputs. 3. **Streaming**: Streaming support depends on the Watson X SDK version and may fall back to returning the full response at once. 4. **Rate Limits**: Watson X has API rate limits that may differ from local Ollama usage. Monitor your usage accordingly. ## Troubleshooting ### Authentication Errors If you see authentication errors: - Verify your API key is correct - Check that your project ID matches an existing Watson X project - Ensure your IBM Cloud account has Watson X access ### Model Not Found If you get model not found errors: - Verify the model ID is correct (e.g., `ibm/granite-13b-chat-v2`) - Check that the model is available in your Watson X instance - Some models may require additional permissions ### Connection Errors If you experience connection issues: - Check your internet connection - Verify the Watson X URL is correct for your region - Check IBM Cloud status page for service outages ## Cost Considerations Unlike local Ollama, Watson X is a cloud service with usage-based pricing: - Token-based pricing for generation - Consider your query volume - Monitor usage through IBM Cloud dashboard ## Reverting to Ollama To switch back to local Ollama: ```bash unset LLM_BACKEND # or set LLM_BACKEND=ollama python -m rag_system.main api ``` ## Support For Watson X specific issues: - [IBM Watson X Documentation](https://www.ibm.com/docs/en/watsonx/saas) - [Watson X Developer Hub](https://www.ibm.com/watsonx/developer/) - [IBM Cloud Support](https://cloud.ibm.com/docs/get-support) For LocalGPT issues: - [LocalGPT GitHub Issues](https://github.com/PromtEngineer/localGPT/issues) ## Contributing If you find issues with the Watson X integration or want to add features: 1. Create an issue describing the problem/feature 2. Submit a pull request with your changes 3. 
Ensure all tests pass ## License This integration follows the same license as LocalGPT (MIT License). ================================================ FILE: backend/README.md ================================================ # localGPT Backend Simple Python backend that connects your frontend to Ollama for local LLM chat. ## Prerequisites 1. **Install Ollama** (if not already installed): ```bash # Visit https://ollama.ai or run: curl -fsSL https://ollama.ai/install.sh | sh ``` 2. **Start Ollama**: ```bash ollama serve ``` 3. **Pull a model** (optional, server will suggest if needed): ```bash ollama pull llama3.2 ``` ## Setup 1. **Install Python dependencies**: ```bash pip install -r requirements.txt ``` 2. **Test Ollama connection**: ```bash python ollama_client.py ``` 3. **Start the backend server**: ```bash python server.py ``` Server will run on `http://localhost:8000` ## API Endpoints ### Health Check ```bash GET /health ``` Returns server status and available models. ### Chat ```bash POST /chat Content-Type: application/json { "message": "Hello!", "model": "llama3.2:latest", "conversation_history": [] } ``` Returns: ```json { "response": "Hello! 
def init_database(self):
    """Create the SQLite schema (tables and indexes) if it does not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly on the
    same database file is safe.

    The parent directory of ``self.db_path`` is created first when it is
    missing; otherwise ``sqlite3.connect`` fails with "unable to open database
    file" (for example when the relative default path is resolved from a
    working directory that has no ``backend/`` folder).

    Raises:
        sqlite3.Error: if the schema cannot be created.
        OSError: if the parent directory cannot be created.
    """
    import os  # local import: this module does not import os at file level

    parent_dir = os.path.dirname(self.db_path)
    if parent_dir:  # empty for bare filenames and for ":memory:"
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(self.db_path)
    try:
        # NOTE: this pragma only affects THIS connection; other methods open
        # their own connections and do not inherit it.
        conn.execute("PRAGMA foreign_keys = ON")

        # Sessions table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sessions (
                id TEXT PRIMARY KEY,
                title TEXT NOT NULL,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                model_used TEXT NOT NULL,
                message_count INTEGER DEFAULT 0
            )
        ''')

        # Messages table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS messages (
                id TEXT PRIMARY KEY,
                session_id TEXT NOT NULL,
                content TEXT NOT NULL,
                sender TEXT NOT NULL CHECK (sender IN ('user', 'assistant')),
                timestamp TEXT NOT NULL,
                metadata TEXT DEFAULT '{}',
                FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
            )
        ''')

        # Indexes for the most common lookups / orderings
        conn.execute('CREATE INDEX IF NOT EXISTS idx_messages_session_id ON messages(session_id)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_messages_timestamp ON messages(timestamp)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_updated_at ON sessions(updated_at)')

        # Documents attached directly to a session
        conn.execute('''
            CREATE TABLE IF NOT EXISTS session_documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT NOT NULL,
                file_path TEXT NOT NULL,
                indexed INTEGER DEFAULT 0,
                FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
            )
        ''')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_session_documents_session_id ON session_documents(session_id)')

        # Index persistence tables
        conn.execute('''
            CREATE TABLE IF NOT EXISTS indexes (
                id TEXT PRIMARY KEY,
                name TEXT UNIQUE,
                description TEXT,
                created_at TEXT,
                updated_at TEXT,
                vector_table_name TEXT,
                metadata TEXT
            )
        ''')
        conn.execute('''
            CREATE TABLE IF NOT EXISTS index_documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                index_id TEXT,
                original_filename TEXT,
                stored_path TEXT,
                FOREIGN KEY(index_id) REFERENCES indexes(id)
            )
        ''')
        conn.execute('''
            CREATE TABLE IF NOT EXISTS session_indexes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT,
                index_id TEXT,
                linked_at TEXT,
                FOREIGN KEY(session_id) REFERENCES sessions(id),
                FOREIGN KEY(index_id) REFERENCES indexes(id)
            )
        ''')

        conn.commit()
    finally:
        # Always release the file handle, even when a statement fails.
        conn.close()

    print("✅ Database initialized successfully")
def add_message(self, session_id: str, content: str, sender: str, metadata: Dict = None) -> str:
    """Store one message and bump the owning session's counters.

    Args:
        session_id: id of the session the message belongs to.
        content: message text.
        sender: ``'user'`` or ``'assistant'`` (enforced by a CHECK constraint
            in the ``messages`` table schema).
        metadata: optional dict, stored as JSON; ``None`` is stored as ``{}``.

    Returns:
        The generated message id (a UUID4 string).

    Raises:
        sqlite3.Error: if the insert or the update fails. Nothing is committed
            in that case, so the message row and the session counter cannot
            get out of step.
    """
    message_id = str(uuid.uuid4())
    now = datetime.now().isoformat()
    metadata_json = json.dumps(metadata or {})

    conn = sqlite3.connect(self.db_path)
    try:
        # Add the message
        conn.execute('''
            INSERT INTO messages (id, session_id, content, sender, timestamp, metadata)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (message_id, session_id, content, sender, now, metadata_json))

        # Update session timestamp and message count
        conn.execute('''
            UPDATE sessions
            SET updated_at = ?, message_count = message_count + 1
            WHERE id = ?
        ''', (now, session_id))

        conn.commit()
    finally:
        # Closing without a commit discards the pending transaction and,
        # unlike the previous version, never leaks the connection on error.
        conn.close()

    return message_id
def delete_session(self, session_id: str) -> bool:
    """Delete a session together with its messages, documents and index links.

    SQLite only enforces ``ON DELETE CASCADE`` on connections that have
    executed ``PRAGMA foreign_keys = ON``; this method opens a fresh
    connection, so relying on the cascade would leave the session's messages
    behind. The child rows are therefore removed explicitly, which is the
    same approach ``delete_index`` takes for its own child tables.

    Args:
        session_id: id of the session to remove.

    Returns:
        True if a session row was deleted, False if no such session existed.

    Raises:
        sqlite3.Error: if a statement fails; nothing is committed in that case.
    """
    conn = sqlite3.connect(self.db_path)
    try:
        # Remove child rows first so nothing is orphaned.
        conn.execute('DELETE FROM messages WHERE session_id = ?', (session_id,))
        conn.execute('DELETE FROM session_documents WHERE session_id = ?', (session_id,))
        conn.execute('DELETE FROM session_indexes WHERE session_id = ?', (session_id,))

        cursor = conn.execute('DELETE FROM sessions WHERE id = ?', (session_id,))
        deleted = cursor.rowcount > 0
        conn.commit()
    finally:
        conn.close()

    if deleted:
        print(f"🗑️ Deleted session: {session_id[:8]}...")

    return deleted
def get_documents_for_session(self, session_id: str) -> List[str]:
    """Return the stored file paths of every document attached to a session.

    Args:
        session_id: id of the session whose documents are wanted.

    Returns:
        A list of ``file_path`` values from ``session_documents`` (empty when
        the session has no documents).
    """
    connection = sqlite3.connect(self.db_path)
    matching_rows = connection.execute(
        "SELECT file_path FROM session_documents WHERE session_id = ?",
        (session_id,)
    ).fetchall()
    connection.close()

    file_paths = []
    for (stored_path,) in matching_rows:  # each row is a 1-tuple
        file_paths.append(stored_path)
    return file_paths
def list_indexes(self) -> list[dict]:
    """Return every index row as a dict, with parsed metadata and its documents.

    Each returned dict holds all columns of the ``indexes`` row, except that
    ``metadata`` is decoded from JSON (NULL/empty becomes ``{}``), plus a
    ``documents`` list of ``{'filename', 'stored_path'}`` dicts read from
    ``index_documents``.
    """
    connection = sqlite3.connect(self.db_path)
    connection.row_factory = sqlite3.Row

    collected = []
    for index_row in connection.execute('SELECT * FROM indexes').fetchall():
        entry = dict(index_row)
        entry['metadata'] = json.loads(entry['metadata'] or '{}')

        # One follow-up query per index to attach its documents.
        document_rows = connection.execute(
            'SELECT original_filename, stored_path FROM index_documents WHERE index_id=?',
            (entry['id'],)
        ).fetchall()
        entry['documents'] = [
            {'filename': document_row[0], 'stored_path': document_row[1]}
            for document_row in document_rows
        ]
        collected.append(entry)

    connection.close()
    return collected
def delete_index(self, index_id: str) -> bool:
    """Remove an index row plus its document and session-link rows.

    After a successful delete, a best-effort attempt is made to drop the
    index's LanceDB table as well; any failure there (including the RAG
    modules not being importable) is only printed, never raised.

    Returns:
        True if an ``indexes`` row was deleted, False otherwise.
    """
    connection = sqlite3.connect(self.db_path)
    try:
        # Remember the vector table name before the row disappears.
        lookup = connection.execute(
            'SELECT vector_table_name FROM indexes WHERE id = ?', (index_id,)
        ).fetchone()
        vector_table_name = lookup[0] if lookup else None

        # Child rows go first, then the index row itself.
        connection.execute('DELETE FROM index_documents WHERE index_id = ?', (index_id,))
        connection.execute('DELETE FROM session_indexes WHERE index_id = ?', (index_id,))
        removed_rows = connection.execute('DELETE FROM indexes WHERE id = ?', (index_id,)).rowcount
        deleted = removed_rows > 0
        connection.commit()
    finally:
        connection.close()

    if not deleted:
        return deleted

    print(f"🗑️ Deleted index {index_id[:8]}... and related records")

    if not vector_table_name:
        return deleted

    # Optional LanceDB cleanup.
    try:
        from rag_system.indexing.embedders import LanceDBManager
        import os
        lancedb_dir = os.getenv('LANCEDB_PATH') or './rag_system/index_store/lancedb'
        store = LanceDBManager(lancedb_dir).db
        if hasattr(store, 'table_names') and vector_table_name in store.table_names():
            store.drop_table(vector_table_name)
            print(f"🚮 Dropped LanceDB table '{vector_table_name}'")
    except Exception as drop_error:
        print(f"⚠️ Could not drop LanceDB table '{vector_table_name}': {drop_error}")

    return deleted
def inspect_and_populate_index_metadata(self, index_id: str) -> dict:
    """
    Inspect the LanceDB table behind an index and back-fill its metadata.

    Intended for older indexes whose ``metadata`` column is empty. The
    outcome is one of:

    * ``{}`` - index unknown, no vector table name recorded, or inspection
      failed (failures are printed, never raised);
    * the already stored metadata, untouched, when it is non-empty;
    * inferred metadata (``metadata_source == 'lancedb_inspection'``), which
      is also persisted through ``update_index_metadata``;
    * fallback metadata (``metadata_source == 'fallback_inspection'``) when
      the RAG modules cannot be imported; also persisted.

    Args:
        index_id: id of the index to inspect.

    Returns:
        The metadata dict described above.
    """
    # Import ONLY timedelta here. Importing ``datetime`` inside this function
    # (as the previous version did further down) makes ``datetime`` a local
    # name for the whole function, so every earlier ``datetime.now()`` raised
    # UnboundLocalError, which the broad handlers below turned into ``{}``.
    from datetime import timedelta

    try:
        # Get index info
        index_info = self.get_index(index_id)
        if not index_info:
            return {}

        # Nothing to do when metadata is already populated
        if index_info.get('metadata') and len(index_info['metadata']) > 0:
            return index_info['metadata']

        vector_table_name = index_info.get('vector_table_name')
        if not vector_table_name:
            return {}

        try:
            try:
                from rag_system.indexing.embedders import LanceDBManager
                import os

                # Use the same path as the system
                db_path = os.getenv('LANCEDB_PATH') or './rag_system/index_store/lancedb'
                ldb = LanceDBManager(db_path)

                # Missing table: the index was never (fully) built
                if not hasattr(ldb.db, 'table_names') or vector_table_name not in ldb.db.table_names():
                    inferred_metadata = {
                        'status': 'incomplete',
                        'issue': 'Vector table not found - index may not have been built properly',
                        'vector_table_expected': vector_table_name,
                        'available_tables': list(ldb.db.table_names()) if hasattr(ldb.db, 'table_names') else [],
                        'metadata_inferred_at': datetime.now().isoformat(),
                        'metadata_source': 'lancedb_inspection'
                    }
                    self.update_index_metadata(index_id, inferred_metadata)
                    print(f"⚠️ Index {index_id[:8]}... appears incomplete - vector table missing")
                    return inferred_metadata

                table = ldb.db.open_table(vector_table_name)

                # Materialise the table once and reuse it for both the row
                # count and the sample row (it used to be loaded twice).
                try:
                    table_df = table.to_pandas()
                except Exception as e:
                    print(f"⚠️ Could not read data from table {vector_table_name}: {e}")
                    return {}

                total_chunks = len(table_df)
                if total_chunks == 0:
                    inferred_metadata = {
                        'status': 'empty',
                        'issue': 'Vector table exists but contains no data',
                        'metadata_inferred_at': datetime.now().isoformat(),
                        'metadata_source': 'lancedb_inspection'
                    }
                    self.update_index_metadata(index_id, inferred_metadata)
                    return inferred_metadata

                # Only the first row is needed for inspection
                sample_df = table_df.head(1)

                inferred_metadata = {
                    'status': 'functional',
                    'total_chunks': total_chunks,
                }

                # Vector dimensions. Accept any sized value, not just ``list``:
                # NOTE(review): pandas presumably hands vectors back as numpy
                # arrays, which the old ``isinstance(..., list)`` check skipped.
                if 'vector' in sample_df.columns:
                    vector_data = sample_df['vector'].iloc[0]
                    if hasattr(vector_data, '__len__'):
                        dimensions = len(vector_data)
                        inferred_metadata['vector_dimensions'] = dimensions

                        # Best-guess embedding model from the dimensionality
                        dim_to_model = {
                            384: 'BAAI/bge-small-en-v1.5 (or similar)',
                            512: 'sentence-transformers/all-MiniLM-L6-v2 (or similar)',
                            768: 'BAAI/bge-base-en-v1.5 (or similar)',
                            1024: 'Qwen/Qwen3-Embedding-0.6B (or similar)',
                            1536: 'text-embedding-ada-002 (or similar)'
                        }
                        if dimensions in dim_to_model:
                            inferred_metadata['embedding_model_inferred'] = dim_to_model[dimensions]

                # Clues from the per-chunk metadata JSON of the sample record
                if 'metadata' in sample_df.columns:
                    try:
                        sample_metadata = json.loads(sample_df['metadata'].iloc[0])
                        if 'document_id' in sample_metadata:
                            inferred_metadata['has_document_structure'] = True
                        if 'chunk_index' in sample_metadata:
                            inferred_metadata['has_chunk_indexing'] = True
                        if 'original_text' in sample_metadata:
                            inferred_metadata['has_contextual_enrichment'] = True
                            inferred_metadata['retrieval_mode_inferred'] = 'hybrid (contextual enrichment detected)'
                    except (json.JSONDecodeError, KeyError, TypeError):
                        # Unparseable chunk metadata is simply not informative
                        pass

                    # Chunk size estimate; does not depend on the JSON above
                    # parsing, so it no longer lives inside that try block.
                    if 'text' in sample_df.columns:
                        sample_text = sample_df['text'].iloc[0]
                        if isinstance(sample_text, str) and len(sample_text) > 0:
                            text_length = len(sample_text)
                            inferred_metadata['sample_chunk_length'] = text_length
                            estimated_tokens = text_length // 4  # rough estimate: 4 chars per token
                            if estimated_tokens < 300:
                                inferred_metadata['chunk_size_inferred'] = '256 tokens (estimated)'
                            elif estimated_tokens < 600:
                                inferred_metadata['chunk_size_inferred'] = '512 tokens (estimated)'
                            else:
                                inferred_metadata['chunk_size_inferred'] = '1024+ tokens (estimated)'

                # FTS index presence decides the final retrieval-mode guess
                try:
                    indices = table.list_indices()
                    fts_exists = any('fts' in idx.name.lower() for idx in indices)
                    if fts_exists:
                        inferred_metadata['has_fts_index'] = True
                        inferred_metadata['retrieval_mode_inferred'] = 'hybrid (FTS + vector)'
                    else:
                        inferred_metadata['retrieval_mode_inferred'] = 'vector-only'
                except Exception:
                    # Deliberately best-effort: not every table object is
                    # guaranteed to support list_indices().
                    pass

                # Add inspection timestamp
                inferred_metadata['metadata_inferred_at'] = datetime.now().isoformat()
                inferred_metadata['metadata_source'] = 'lancedb_inspection'

                # Persist what was learned
                if inferred_metadata:
                    self.update_index_metadata(index_id, inferred_metadata)
                    print(f"🔍 Inferred metadata for index {index_id[:8]}...: {len(inferred_metadata)} fields")

                return inferred_metadata

            except ImportError as import_error:
                # RAG system modules not available - provide basic fallback metadata
                print(f"⚠️ RAG system modules not available for inspection: {import_error}")

                # Treat indexes created within the last 30 days as "recent"
                created_at = index_info.get('created_at', '')
                is_recent = False
                if created_at:
                    try:
                        created_date = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                        # Compare in the timestamp's own timezone (naive stays naive)
                        is_recent = created_date > datetime.now(created_date.tzinfo) - timedelta(days=30)
                    except Exception:
                        # Unparseable timestamp: fall back to "not recent"
                        pass

                if is_recent:
                    status = 'functional'
                    issue = 'Detailed configuration inspection requires RAG system modules, but index appears functional'
                else:
                    status = 'legacy'
                    issue = 'This index was created before metadata tracking was implemented. Configuration details are not available.'

                fallback_metadata = {
                    'status': status,
                    'issue': issue,
                    'metadata_inferred_at': datetime.now().isoformat(),
                    'metadata_source': 'fallback_inspection',
                    'documents_count': len(index_info.get('documents', [])),
                    'created_at': index_info.get('created_at', 'unknown'),
                    'inspection_limitation': 'Backend server cannot access full RAG system modules for detailed inspection'
                }

                # Record the expected table name for later troubleshooting
                if vector_table_name:
                    fallback_metadata['vector_table_name'] = vector_table_name
                    fallback_metadata['note'] = 'Vector table exists but detailed inspection requires RAG system modules'

                self.update_index_metadata(index_id, fallback_metadata)
                status_msg = "recent but limited inspection" if is_recent else "legacy"
                print(f"📝 Added fallback metadata for {status_msg} index {index_id[:8]}...")
                return fallback_metadata

        except Exception as e:
            print(f"⚠️ Could not inspect LanceDB table for index {index_id[:8]}...: {e}")
            return {}

    except Exception as e:
        print(f"⚠️ Failed to inspect index metadata for {index_id[:8]}...: {e}")
        return {}
class OllamaClient:
    """Small HTTP client for an Ollama server's ``/api`` endpoints.

    The base URL comes from the constructor argument or, when omitted, from
    the ``OLLAMA_HOST`` environment variable (default
    ``http://localhost:11434``).
    """

    def __init__(self, base_url: Optional[str] = None):
        if base_url is None:
            base_url = os.getenv("OLLAMA_HOST", "http://localhost:11434")
        self.base_url = base_url
        self.api_url = f"{base_url}/api"

    def is_ollama_running(self) -> bool:
        """Return True when ``GET /api/tags`` answers with HTTP 200 within 5s."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def list_models(self) -> List[str]:
        """Return the names of the models the server reports, or ``[]`` on any failure."""
        try:
            # A timeout keeps callers from hanging forever on an unresponsive
            # server (the request previously had none).
            response = requests.get(f"{self.api_url}/tags", timeout=10)
            if response.status_code == 200:
                models = response.json().get("models", [])
                return [model["name"] for model in models]
            return []
        except requests.exceptions.RequestException as e:
            print(f"Error fetching models: {e}")
            return []

    def pull_model(self, model_name: str) -> bool:
        """Ask the server to pull ``model_name`` and print streamed status lines.

        Returns:
            True when the server answered 200 (returning early once a
            ``"success"`` status is seen), False on a non-200 answer or a
            connection error.
        """
        try:
            response = requests.post(
                f"{self.api_url}/pull",
                json={"name": model_name},
                stream=True
            )

            if response.status_code == 200:
                print(f"Pulling model {model_name}...")
                for line in response.iter_lines():
                    if line:
                        data = json.loads(line)
                        if "status" in data:
                            print(f"Status: {data['status']}")
                        if data.get("status") == "success":
                            return True
                return True
            return False
        except requests.exceptions.RequestException as e:
            print(f"Error pulling model: {e}")
            return False

    @staticmethod
    def _build_chat_payload(model: str, messages: List[Dict], stream: bool, enable_thinking: bool) -> Dict:
        """Build the JSON body shared by ``chat`` and ``chat_stream``."""
        payload = {
            "model": model,
            "messages": messages,
            "stream": stream,
        }

        # Multiple approaches to disable thinking tokens
        if not enable_thinking:
            payload.update({
                "think": False,  # Native Ollama parameter
                "options": {
                    "think": False,
                    "thinking": False,
                    "temperature": 0.7,
                    "top_p": 0.9
                }
            })
        else:
            payload["think"] = True
        return payload

    @staticmethod
    def _filter_thinking_chunk(chunk: str, inside_think: bool):
        """Remove ``<think>``/``<thinking>`` sections from one streamed chunk.

        Args:
            chunk: text of the current chunk.
            inside_think: True when an earlier chunk opened a thinking section
                that has not been closed yet.

        Returns:
            ``(visible_text, inside_think)`` - the text outside thinking
            sections and the updated state for the next chunk.

        Limitation: a tag split across two chunks is not recognised.
        """
        import re
        open_tag = re.compile(r'<think(?:ing)?>', re.IGNORECASE)
        close_tag = re.compile(r'</think(?:ing)?>', re.IGNORECASE)

        visible = []
        pos = 0
        while pos < len(chunk):
            if inside_think:
                closing = close_tag.search(chunk, pos)
                if closing is None:
                    break  # rest of the chunk is still thinking text
                pos = closing.end()
                inside_think = False
            else:
                opening = open_tag.search(chunk, pos)
                if opening is None:
                    visible.append(chunk[pos:])
                    break
                visible.append(chunk[pos:opening.start()])
                pos = opening.end()
                inside_think = True
        return ''.join(visible), inside_think

    def chat(self, message: str, model: str = "llama3.2", conversation_history: List[Dict] = None, enable_thinking: bool = True) -> str:
        """Send one chat message and return the full reply text.

        Args:
            message: the user message to append to the conversation.
            model: Ollama model name.
            conversation_history: earlier ``{"role", "content"}`` messages;
                not mutated.
            enable_thinking: when False, thinking is disabled in the request
                and any ``<think>...</think>`` / ``<thinking>...</thinking>``
                section left in the reply is stripped.

        Returns:
            The reply text, or a string starting with ``"Error: "`` /
            ``"Connection error: "`` when the request fails.
        """
        if conversation_history is None:
            conversation_history = []

        # Add user message to conversation
        messages = conversation_history + [{"role": "user", "content": message}]

        try:
            payload = self._build_chat_payload(model, messages, False, enable_thinking)

            response = requests.post(
                f"{self.api_url}/chat",
                json=payload,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                response_text = result["message"]["content"]

                # Additional cleanup: remove any thinking sections that slipped
                # through. The patterns previously had no tag text at all and
                # therefore removed nothing.
                if not enable_thinking:
                    import re
                    response_text = re.sub(
                        r'<(think|thinking)>.*?</\1>', '', response_text,
                        flags=re.DOTALL | re.IGNORECASE
                    )
                    response_text = response_text.strip()

                return response_text
            else:
                return f"Error: {response.status_code} - {response.text}"

        except requests.exceptions.RequestException as e:
            return f"Connection error: {e}"

    def chat_stream(self, message: str, model: str = "llama3.2", conversation_history: List[Dict] = None, enable_thinking: bool = True):
        """Stream a chat reply, yielding content chunks as they arrive.

        With ``enable_thinking=False`` text inside ``<think>``/``<thinking>``
        sections is dropped (previously the filter matched the empty string
        and dropped every chunk). Lines that are not valid JSON are skipped.
        Failures are yielded as a single ``"Error: ..."`` /
        ``"Connection error: ..."`` string.
        """
        if conversation_history is None:
            conversation_history = []

        messages = conversation_history + [{"role": "user", "content": message}]

        try:
            payload = self._build_chat_payload(model, messages, True, enable_thinking)

            response = requests.post(
                f"{self.api_url}/chat",
                json=payload,
                stream=True,
                timeout=60
            )

            if response.status_code == 200:
                inside_think = False  # carries thinking state across chunks
                for line in response.iter_lines():
                    if line:
                        try:
                            data = json.loads(line)
                            if "message" in data and "content" in data["message"]:
                                content = data["message"]["content"]

                                # Filter out thinking tokens in streaming mode
                                if not enable_thinking:
                                    content, inside_think = self._filter_thinking_chunk(content, inside_think)
                                    if not content:
                                        continue

                                yield content
                        except json.JSONDecodeError:
                            continue
            else:
                yield f"Error: {response.status_code} - {response.text}"

        except requests.exceptions.RequestException as e:
            yield f"Connection error: {e}"
def do_OPTIONS(self):
    """Answer a CORS preflight request: 200 plus the allow-* headers."""
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS'),
        ('Access-Control-Allow-Headers', 'Content-Type'),
    )
    self.send_response(200)
    for header_name, header_value in cors_headers:
        self.send_header(header_name, header_value)
    self.end_headers()
def do_POST(self):
    """Route a POST request to its handler based on the URL path.

    Routes are tried in this order (query strings are ignored):

    * ``/chat``, ``/sessions``, ``/indexes``
    * ``/indexes/<id>/upload`` and ``/indexes/<id>/build``
    * ``/sessions/<session_id>/indexes/<index_id>``
    * ``/sessions/<id>/messages``, ``/upload``, ``/index``, ``/rename``

    Anything else gets an empty 404. A path that looks like a session/index
    link but lacks either id (for example ``/sessions/indexes/x``) is also
    answered with 404; it used to raise ``IndexError`` inside the handler.
    """
    path = urlparse(self.path).path

    if path == '/chat':
        self.handle_chat()
    elif path == '/sessions':
        self.handle_create_session()
    elif path == '/indexes':
        self.handle_create_index()
    elif path.startswith('/indexes/') and path.endswith('/upload'):
        self.handle_index_file_upload(path.split('/')[-2])
    elif path.startswith('/indexes/') and path.endswith('/build'):
        self.handle_build_index(path.split('/')[-2])
    elif path.startswith('/sessions/') and '/indexes/' in path:
        # Expected shape: ['', 'sessions', <session_id>, 'indexes', <index_id>, ...]
        parts = path.split('/')
        well_formed = (
            len(parts) >= 5
            and parts[3] == 'indexes'
            and bool(parts[2])
            and bool(parts[4])
        )
        if well_formed:
            self.handle_link_index_to_session(parts[2], parts[4])
        else:
            self.send_response(404)
            self.end_headers()
    elif path.startswith('/sessions/') and path.endswith('/messages'):
        self.handle_session_chat(path.split('/')[-2])
    elif path.startswith('/sessions/') and path.endswith('/upload'):
        self.handle_file_upload(path.split('/')[-2])
    elif path.startswith('/sessions/') and path.endswith('/index'):
        self.handle_index_documents(path.split('/')[-2])
    elif path.startswith('/sessions/') and path.endswith('/rename'):
        self.handle_rename_session(path.split('/')[-2])
    else:
        self.send_response(404)
        self.end_headers()
def handle_get_session(self, session_id: str):
    """Respond with one session and its messages.

    Sends ``{"session", "messages"}`` on success, a 404 error body when the
    session does not exist, and a 500 error body when anything raises.
    """
    try:
        record = db.get_session(session_id)
        if record:
            history = db.get_messages(session_id)
            self.send_json_response({
                "session": record,
                "messages": history
            })
        else:
            self.send_json_response({
                "error": "Session not found"
            }, status_code=404)
    except Exception as failure:
        self.send_json_response({
            "error": f"Failed to get session: {str(failure)}"
        }, status_code=500)
def handle_create_session(self):
    """Create a new chat session from an optional JSON request body.

    The body may supply ``title`` (default ``'New Chat'``) and ``model``
    (default ``'llama3.2:latest'``). Because both fields have defaults, a
    request without a body (missing or zero ``Content-Length``) creates a
    session with the defaults instead of failing.

    Responses:
        201 with ``{"session": ..., "session_id": ...}`` on success.
        400 when the body is not valid JSON or is not a JSON object.
        500 for any other failure (for example a database error).
    """
    try:
        # ``headers.get`` yields None when the header is absent; treat that
        # the same as an explicit zero-length body.
        content_length = int(self.headers.get('Content-Length') or 0)
        raw_body = self.rfile.read(content_length) if content_length > 0 else b''

        data = json.loads(raw_body.decode('utf-8')) if raw_body.strip() else {}
        if not isinstance(data, dict):
            # Valid JSON such as a list or a number has no title/model fields.
            self.send_json_response({
                "error": "JSON body must be an object"
            }, status_code=400)
            return

        title = data.get('title', 'New Chat')
        model = data.get('model', 'llama3.2:latest')

        session_id = db.create_session(title, model)
        session = db.get_session(session_id)

        self.send_json_response({
            "session": session,
            "session_id": session_id
        }, status_code=201)

    except json.JSONDecodeError:
        self.send_json_response({
            "error": "Invalid JSON"
        }, status_code=400)
    except Exception as e:
        self.send_json_response({
            "error": f"Failed to create session: {str(e)}"
        }, status_code=500)
""" try: session = db.get_session(session_id) if not session: self.send_json_response({"error": "Session not found"}, status_code=404) return content_length = int(self.headers['Content-Length']) post_data = self.rfile.read(content_length) data = json.loads(post_data.decode('utf-8')) message = data.get('message', '') if not message: self.send_json_response({"error": "Message is required"}, status_code=400) return if session['message_count'] == 0: title = generate_session_title(message) db.update_session_title(session_id, title) # Add user message to database first user_message_id = db.add_message(session_id, message, "user") # 🎯 SMART ROUTING: Decide between direct LLM vs RAG idx_ids = db.get_indexes_for_session(session_id) force_rag = bool(data.get("force_rag", False)) use_rag = True if force_rag else self._should_use_rag(message, idx_ids) if use_rag: # 🔍 --- Use RAG Pipeline for Document-Related Queries --- print(f"🔍 Using RAG pipeline for document query: '{message[:50]}...'") response_text, source_docs = self._handle_rag_query(session_id, message, data, idx_ids) else: # ⚡ --- Use Direct LLM for General Queries (FAST) --- print(f"⚡ Using direct LLM for general query: '{message[:50]}...'") response_text, source_docs = self._handle_direct_llm_query(session_id, message, session) # Add AI response to database ai_message_id = db.add_message(session_id, response_text, "assistant") updated_session = db.get_session(session_id) # Send response with proper error handling self.send_json_response({ "response": response_text, "session": updated_session, "source_documents": source_docs, "used_rag": use_rag }) except BrokenPipeError: # Client disconnected - this is normal for long queries, just log it print(f"⚠️ Client disconnected during RAG processing for query: '{message[:30]}...'") except json.JSONDecodeError: self.send_json_response({ "error": "Invalid JSON" }, status_code=400) except Exception as e: print(f"❌ Server error in session chat: {str(e)}") try: 
self.send_json_response({ "error": f"Server error: {str(e)}" }, status_code=500) except BrokenPipeError: print(f"⚠️ Client disconnected during error response") def _should_use_rag(self, message: str, idx_ids: List[str]) -> bool: """ 🧠 ENHANCED: Determine if a query should use RAG pipeline using document overviews. Args: message: The user's query idx_ids: List of index IDs associated with the session Returns: bool: True if should use RAG, False for direct LLM """ # No indexes = definitely no RAG needed if not idx_ids: return False # Load document overviews for intelligent routing try: doc_overviews = self._load_document_overviews(idx_ids) if doc_overviews: return self._route_using_overviews(message, doc_overviews) except Exception as e: print(f"⚠️ Overview-based routing failed, falling back to simple routing: {e}") # Fallback to simple pattern matching if overviews unavailable return self._simple_pattern_routing(message, idx_ids) def _load_document_overviews(self, idx_ids: List[str]) -> List[str]: """Load and aggregate overviews for the given index IDs. Strategy: 1. Attempt to load each index's dedicated overview file. 2. Aggregate all overviews found across available files (deduplicated). 3. If none of the index files exist, fall back to the legacy global overview file. 
""" import os, json aggregated: list[str] = [] # 1️⃣ Collect overviews from per-index files for idx in idx_ids: candidate_paths = [ f"../index_store/overviews/{idx}.jsonl", f"index_store/overviews/{idx}.jsonl", f"./index_store/overviews/{idx}.jsonl", ] for p in candidate_paths: if os.path.exists(p): print(f"📖 Loading overviews from: {p}") try: with open(p, "r", encoding="utf-8") as f: for line in f: if not line.strip(): continue try: record = json.loads(line) overview = record.get("overview", "").strip() if overview: aggregated.append(overview) except json.JSONDecodeError: continue # skip malformed lines break # Stop after the first existing path for this idx except Exception as e: print(f"⚠️ Error reading {p}: {e}") break # Don't keep trying other paths for this idx if read failed # 2️⃣ Fall back to legacy global file if no per-index overviews found if not aggregated: legacy_paths = [ "../index_store/overviews/overviews.jsonl", "index_store/overviews/overviews.jsonl", "./index_store/overviews/overviews.jsonl", ] for p in legacy_paths: if os.path.exists(p): print(f"⚠️ Falling back to legacy overviews file: {p}") try: with open(p, "r", encoding="utf-8") as f: for line in f: if not line.strip(): continue try: record = json.loads(line) overview = record.get("overview", "").strip() if overview: aggregated.append(overview) except json.JSONDecodeError: continue except Exception as e: print(f"⚠️ Error reading legacy overviews file {p}: {e}") break # Limit for performance if aggregated: print(f"✅ Loaded {len(aggregated)} document overviews from {len(idx_ids)} index(es)") else: print(f"⚠️ No overviews found for indices {idx_ids}") return aggregated[:40] def _route_using_overviews(self, query: str, overviews: List[str]) -> bool: """ 🎯 Use document overviews and LLM to make intelligent routing decisions. Returns True if RAG should be used, False for direct LLM. 
""" if not overviews: return False # Format overviews for the routing prompt overviews_block = "\n".join(f"[{i+1}] {ov}" for i, ov in enumerate(overviews)) router_prompt = f"""You are an AI router deciding whether a user question should be answered via: • "USE_RAG" – search the user's private documents (described below) • "DIRECT_LLM" – reply from general knowledge (greetings, public facts, unrelated topics) CRITICAL PRINCIPLE: When documents exist in the KB, strongly prefer USE_RAG unless the query is purely conversational or completely unrelated to any possible document content. RULES: 1. If ANY overview clearly relates to the question (entities, numbers, addresses, dates, amounts, companies, technical terms) → USE_RAG 2. For document operations (summarize, analyze, explain, extract, find) → USE_RAG 3. For greetings only ("Hi", "Hello", "Thanks") → DIRECT_LLM 4. For pure math/world knowledge clearly unrelated to documents → DIRECT_LLM 5. When in doubt → USE_RAG DOCUMENT OVERVIEWS: {overviews_block} DECISION EXAMPLES: • "What invoice amounts are mentioned?" → USE_RAG (document-specific) • "Who is PromptX AI LLC?" → USE_RAG (entity in documents) • "What is the DeepSeek model?" → USE_RAG (mentioned in documents) • "Summarize the research paper" → USE_RAG (document operation) • "What is 2+2?" 
→ DIRECT_LLM (pure math) • "Hi there" → DIRECT_LLM (greeting only) USER QUERY: "{query}" Respond with exactly one word: USE_RAG or DIRECT_LLM""" try: # Use Ollama to make the routing decision response = self.ollama_client.chat( message=router_prompt, model="qwen3:0.6b", # Fast model for routing enable_thinking=False # Fast routing ) # The response is directly the text, not a dict decision = response.strip().upper() # Parse decision if "USE_RAG" in decision: print(f"🎯 Overview-based routing: USE_RAG for query: '{query[:50]}...'") return True elif "DIRECT_LLM" in decision: print(f"⚡ Overview-based routing: DIRECT_LLM for query: '{query[:50]}...'") return False else: print(f"⚠️ Unclear routing decision '{decision}', defaulting to RAG") return True # Default to RAG when uncertain except Exception as e: print(f"❌ LLM routing failed: {e}, falling back to pattern matching") return self._simple_pattern_routing(query, []) def _simple_pattern_routing(self, message: str, idx_ids: List[str]) -> bool: """ 📝 FALLBACK: Simple pattern-based routing (original logic). 
""" message_lower = message.lower() # Always use Direct LLM for greetings and casual conversation greeting_patterns = [ 'hello', 'hi', 'hey', 'greetings', 'good morning', 'good afternoon', 'good evening', 'how are you', 'how do you do', 'nice to meet', 'pleasure to meet', 'thanks', 'thank you', 'bye', 'goodbye', 'see you', 'talk to you later', 'test', 'testing', 'check', 'ping', 'just saying', 'nevermind', 'ok', 'okay', 'alright', 'got it', 'understood', 'i see' ] # Check for greeting patterns for pattern in greeting_patterns: if pattern in message_lower: return False # Use Direct LLM for greetings # Keywords that strongly suggest document-related queries rag_indicators = [ 'document', 'doc', 'file', 'pdf', 'text', 'content', 'page', 'according to', 'based on', 'mentioned', 'states', 'says', 'what does', 'summarize', 'summary', 'analyze', 'analysis', 'quote', 'citation', 'reference', 'source', 'evidence', 'explain from', 'extract', 'find in', 'search for' ] # Check for strong RAG indicators for indicator in rag_indicators: if indicator in message_lower: return True # Question words + substantial length might benefit from RAG question_words = ['what', 'how', 'when', 'where', 'why', 'who', 'which'] starts_with_question = any(message_lower.startswith(word) for word in question_words) if starts_with_question and len(message) > 40: return True # Very short messages - use direct LLM if len(message.strip()) < 20: return False # Default to Direct LLM unless there's clear indication of document query return False def _handle_direct_llm_query(self, session_id: str, message: str, session: dict): """ Handle query using direct Ollama client with thinking disabled for speed. 
Returns: tuple: (response_text, empty_source_docs) """ try: # Get conversation history for context conversation_history = db.get_conversation_history(session_id) # Use the session's model or default model = session.get('model', 'qwen3:8b') # Default to fast model # Direct Ollama call with thinking disabled for speed response_text = self.ollama_client.chat( message=message, model=model, conversation_history=conversation_history, enable_thinking=False # ⚡ DISABLE THINKING FOR SPEED ) return response_text, [] # No source docs for direct LLM except Exception as e: print(f"❌ Direct LLM error: {e}") return f"Error processing query: {str(e)}", [] def _handle_rag_query(self, session_id: str, message: str, data: dict, idx_ids: List[str]): """ Handle query using the full RAG pipeline (delegates to the advanced RAG API running on port 8001). Returns: tuple[str, List[dict]]: (response_text, source_documents) """ # Defaults response_text = "" source_docs: List[dict] = [] # Build payload for RAG API rag_api_url = "http://localhost:8001/chat" table_name = f"text_pages_{idx_ids[-1]}" if idx_ids else None payload: Dict[str, Any] = { "query": message, "session_id": session_id, } if table_name: payload["table_name"] = table_name # Copy optional parameters from the incoming request optional_params: Dict[str, tuple[type, str]] = { "compose_sub_answers": (bool, "compose_sub_answers"), "query_decompose": (bool, "query_decompose"), "ai_rerank": (bool, "ai_rerank"), "context_expand": (bool, "context_expand"), "verify": (bool, "verify"), "retrieval_k": (int, "retrieval_k"), "context_window_size": (int, "context_window_size"), "reranker_top_k": (int, "reranker_top_k"), "search_type": (str, "search_type"), "dense_weight": (float, "dense_weight"), "provence_prune": (bool, "provence_prune"), "provence_threshold": (float, "provence_threshold"), } for key, (caster, payload_key) in optional_params.items(): val = data.get(key) if val is not None: try: payload[payload_key] = caster(val) # type: 
ignore[arg-type] except Exception: payload[payload_key] = val try: rag_response = requests.post(rag_api_url, json=payload) if rag_response.status_code == 200: rag_data = rag_response.json() response_text = rag_data.get("answer", "No answer found.") source_docs = rag_data.get("source_documents", []) else: response_text = f"Error from RAG API ({rag_response.status_code}): {rag_response.text}" print(f"❌ RAG API error: {response_text}") except requests.exceptions.ConnectionError: response_text = "Could not connect to the RAG API server. Please ensure it is running." print("❌ Connection to RAG API failed (port 8001).") except Exception as e: response_text = f"Error processing RAG query: {str(e)}" print(f"❌ RAG processing error: {e}") # Strip any / tags that might slip through response_text = re.sub(r'<(think|thinking)>.*?', '', response_text, flags=re.DOTALL | re.IGNORECASE).strip() return response_text, source_docs def handle_delete_session(self, session_id: str): """Delete a session and its messages""" try: deleted = db.delete_session(session_id) if deleted: self.send_json_response({'deleted': deleted}) else: self.send_json_response({'error': 'Session not found'}, status_code=404) except Exception as e: self.send_json_response({'error': str(e)}, status_code=500) def handle_file_upload(self, session_id: str): """Handle file uploads, save them, and associate with the session.""" form = cgi.FieldStorage( fp=self.rfile, headers=self.headers, environ={'REQUEST_METHOD': 'POST', 'CONTENT_TYPE': self.headers['Content-Type']} ) uploaded_files = [] if 'files' in form: files = form['files'] if not isinstance(files, list): files = [files] upload_dir = "shared_uploads" os.makedirs(upload_dir, exist_ok=True) for file_item in files: if file_item.filename: # Create a unique filename to avoid overwrites unique_filename = f"{uuid.uuid4()}_{file_item.filename}" file_path = os.path.join(upload_dir, unique_filename) with open(file_path, 'wb') as f: f.write(file_item.file.read()) # Store 
def handle_index_documents(self, session_id: str):
    """Send every document attached to a session to the RAG API for indexing.

    Responds 200 with a notice when the session has no documents, relays the
    RAG API's JSON on success, and responds 500 when the RAG API reports a
    failure or an exception occurs. A metadata update failure after a
    successful indexing run is only logged.
    """
    print(f"🔥 Received request to index documents for session {session_id[:8]}...")
    try:
        file_paths = db.get_documents_for_session(session_id)
        if not file_paths:
            self.send_json_response({"message": "No documents to index for this session."}, status_code=200)
            return

        print(f"Found {len(file_paths)} documents to index. Sending to RAG API...")
        request_body = {"file_paths": file_paths, "session_id": session_id}
        rag_response = requests.post("http://localhost:8001/index", json=request_body)

        if rag_response.status_code != 200:
            error_info = rag_response.text
            print(f"❌ RAG API indexing failed ({rag_response.status_code}): {error_info}")
            self.send_json_response({"error": f"Indexing failed: {error_info}"}, status_code=500)
            return

        print("✅ RAG API successfully indexed documents.")
        # The session_id doubles as the index_id for session-level indexes.
        session_index_meta = {
            "session_linked": True,
            "retrieval_mode": "hybrid",
        }
        try:
            db.update_index_metadata(session_id, session_index_meta)
        except Exception as meta_error:
            print(f"⚠️ Failed to update index metadata for session index: {meta_error}")
        self.send_json_response(rag_response.json())
    except Exception as e:
        failure = str(e)
        print(f"❌ Exception during indexing: {failure}")
        self.send_json_response({"error": f"An unexpected error occurred: {failure}"}, status_code=500)
def handle_get_models(self):
    """List available models from Ollama and HuggingFace, grouped by capability.

    Ollama models are only queried when Ollama is running. A model counts as
    an embedding model when its name contains one of a few marker substrings
    (a deliberately naive check); all others count as generation models. The
    supported HuggingFace embedding models are always included. Both lists
    are sorted before being sent.
    """
    try:
        embedding_markers = ('embed', 'bge', 'embedding', 'text')
        generation_models = []
        embedding_models = []

        if self.ollama_client.is_ollama_running():
            for model_name in self.ollama_client.list_models():
                looks_like_embedder = any(marker in model_name for marker in embedding_markers)
                bucket = embedding_models if looks_like_embedder else generation_models
                bucket.append(model_name)

        # Supported HuggingFace embedding models
        embedding_models += [
            "Qwen/Qwen3-Embedding-0.6B",
            "Qwen/Qwen3-Embedding-4B",
            "Qwen/Qwen3-Embedding-8B",
        ]

        self.send_json_response({
            "generation_models": sorted(generation_models),
            "embedding_models": sorted(embedding_models),
        })
    except Exception as exc:
        reason = str(exc)
        self.send_json_response({
            "error": f"Could not list models: {reason}"
        }, status_code=500)


def handle_get_indexes(self):
    """Respond with every index known to the database and the index count."""
    try:
        all_indexes = db.list_indexes()
        self.send_json_response({'indexes': all_indexes, 'total': len(all_indexes)})
    except Exception as exc:
        self.send_json_response({'error': str(exc)}, status_code=500)
def handle_create_index(self):
    """Create a new index record from a JSON request body.

    Expected body fields: ``name`` (required), ``description`` (optional)
    and ``metadata`` (optional object). When the RAG system configuration is
    available, a default metadata set is used as the base and any
    caller-provided metadata overrides it key by key.

    Responses:
        201 with ``{"index_id": ...}`` on success.
        400 when the body is missing, is not valid JSON, is not a JSON
            object, or has no ``name``.
        500 for any other failure.
    """
    try:
        content_length = int(self.headers.get('Content-Length') or 0)
        post_data = self.rfile.read(content_length) if content_length > 0 else b''
        try:
            data = json.loads(post_data.decode('utf-8'))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Client error, reported the same way the other handlers do.
            self.send_json_response({'error': 'Invalid JSON'}, status_code=400)
            return
        if not isinstance(data, dict):
            self.send_json_response({'error': 'Invalid JSON'}, status_code=400)
            return

        name = data.get('name')
        description = data.get('description')
        # An explicit ``"metadata": null`` is treated like an absent field.
        metadata = data.get('metadata') or {}

        if not name:
            self.send_json_response({'error': 'Name required'}, status_code=400)
            return

        # Add complete metadata from RAG system configuration if available
        if RAG_SYSTEM_AVAILABLE and PIPELINE_CONFIGS.get('default'):
            complete_metadata = {
                'status': 'created',
                'metadata_source': 'rag_system_config',
                'created_at': datetime.now().isoformat(),
                'chunk_size': 512,  # From default config
                'chunk_overlap': 64,  # From default config
                'retrieval_mode': 'hybrid',  # From default config
                'window_size': 5,  # From default config
                'embedding_model': 'Qwen/Qwen3-Embedding-0.6B',  # From default config
                'enrich_model': 'qwen3:0.6b',  # From default config
                'overview_model': 'qwen3:0.6b',  # From default config
                'enable_enrich': True,  # From default config
                'latechunk': True,  # From default config
                'docling_chunk': True,  # From default config
                'note': 'Default configuration from RAG system'
            }
            # Caller-provided metadata wins over the defaults.
            complete_metadata.update(metadata)
            metadata = complete_metadata

        idx_id = db.create_index(name, description, metadata)
        self.send_json_response({'index_id': idx_id}, status_code=201)
    except Exception as e:
        self.send_json_response({'error': str(e)}, status_code=500)
def handle_build_index(self, index_id: str):
    """Build an index by sending its documents to the RAG API at localhost:8001.

    The optional JSON request body may override the build options using the
    camelCase keys ``latechunk``, ``doclingChunk``, ``chunkSize``,
    ``chunkOverlap``, ``retrievalMode``, ``windowSize``, ``enableEnrich``,
    ``embeddingModel``, ``enrichModel``, ``batchSizeEmbed``,
    ``batchSizeEnrich`` and ``overviewModel``. If the body cannot be parsed
    or a value cannot be converted, ALL options fall back to their defaults
    and a warning is logged, so a partially applied configuration is never
    sent.

    Responses:
        200 with the RAG API response plus the applied settings on success,
            or with a notice when the RAG API reports the table already
            exists.
        404 when the index is unknown; 400 when it has no documents.
        500 when the RAG API fails otherwise or an exception occurs.
    """
    try:
        index = db.get_index(index_id)
        if not index:
            self.send_json_response({'error': 'Index not found'}, status_code=404)
            return
        file_paths = [d['stored_path'] for d in index.get('documents', [])]
        if not file_paths:
            self.send_json_response({'error': 'No documents to index'}, status_code=400)
            return

        # Defaults used when the request carries no (valid) options.
        options = {
            'latechunk': False,
            'docling_chunk': False,
            'chunk_size': 512,
            'chunk_overlap': 64,
            'retrieval_mode': 'hybrid',
            'window_size': 2,
            'enable_enrich': True,
            'embedding_model': None,
            'enrich_model': None,
            'batch_size_embed': 50,
            'batch_size_enrich': 25,
            'overview_model': None,
        }

        if 'Content-Length' in self.headers and int(self.headers['Content-Length']) > 0:
            try:
                length = int(self.headers['Content-Length'])
                opts = json.loads(self.rfile.read(length).decode('utf-8'))
                parsed = {
                    'latechunk': bool(opts.get('latechunk', False)),
                    'docling_chunk': bool(opts.get('doclingChunk', False)),
                    'chunk_size': int(opts.get('chunkSize', 512)),
                    'chunk_overlap': int(opts.get('chunkOverlap', 64)),
                    'retrieval_mode': str(opts.get('retrievalMode', 'hybrid')),
                    'window_size': int(opts.get('windowSize', 2)),
                    'enable_enrich': bool(opts.get('enableEnrich', True)),
                    'embedding_model': opts.get('embeddingModel'),
                    'enrich_model': opts.get('enrichModel'),
                    'batch_size_embed': int(opts.get('batchSizeEmbed', 50)),
                    'batch_size_enrich': int(opts.get('batchSizeEnrich', 25)),
                    'overview_model': opts.get('overviewModel'),
                }
            except Exception as e:
                # Keep defaults on parse error, but say so instead of hiding it.
                print(f"⚠️ Ignoring invalid build options, using defaults: {e}")
            else:
                options = parsed

        # Delegate to advanced RAG API same as session indexing
        rag_api_url = "http://localhost:8001/index"
        payload = {
            "file_paths": file_paths,
            "session_id": index_id,  # reuse index_id for progress tracking
            # Use the index's dedicated LanceDB table so retrieval matches
            "table_name": index.get("vector_table_name"),
            "chunk_size": options['chunk_size'],
            "chunk_overlap": options['chunk_overlap'],
            "retrieval_mode": options['retrieval_mode'],
            "window_size": options['window_size'],
            "enable_enrich": options['enable_enrich'],
            "batch_size_embed": options['batch_size_embed'],
            "batch_size_enrich": options['batch_size_enrich'],
        }
        # Optional switches and model names are only sent when set.
        if options['latechunk']:
            payload["enable_latechunk"] = True
        if options['docling_chunk']:
            payload["enable_docling_chunk"] = True
        if options['embedding_model']:
            payload["embedding_model"] = options['embedding_model']
        if options['enrich_model']:
            payload["enrich_model"] = options['enrich_model']
        if options['overview_model']:
            payload["overview_model_name"] = options['overview_model']

        rag_resp = requests.post(rag_api_url, json=payload)
        if rag_resp.status_code == 200:
            meta_updates = {
                "chunk_size": options['chunk_size'],
                "chunk_overlap": options['chunk_overlap'],
                "retrieval_mode": options['retrieval_mode'],
                "window_size": options['window_size'],
                "enable_enrich": options['enable_enrich'],
                "latechunk": options['latechunk'],
                "docling_chunk": options['docling_chunk'],
            }
            for model_key in ('embedding_model', 'enrich_model', 'overview_model'):
                if options[model_key]:
                    meta_updates[model_key] = options[model_key]
            try:
                db.update_index_metadata(index_id, meta_updates)
            except Exception as e:
                print(f"⚠️ Failed to update index metadata: {e}")

            self.send_json_response({
                "response": rag_resp.json(),
                **meta_updates
            })
        else:
            # Gracefully handle scenario where table already exists (idempotent build)
            try:
                err_json = rag_resp.json()
            except Exception:
                err_json = {}
            err_text = err_json.get('error') if isinstance(err_json, dict) else rag_resp.text
            if err_text and 'already exists' in err_text:
                # Treat as non-fatal; return message indicating index previously built
                self.send_json_response({
                    "message": "Index already built – skipping rebuild.",
                    "note": err_text
                })
            else:
                self.send_json_response({"error": f"RAG indexing failed: {rag_resp.text}"}, status_code=500)
    except Exception as e:
        self.send_json_response({'error': str(e)}, status_code=500)


def handle_link_index_to_session(self, session_id: str, index_id: str):
    """Associate an existing index with a session."""
    try:
        db.link_index_to_session(session_id, index_id)
        self.send_json_response({'message': 'Index linked to session'})
    except Exception as e:
        self.send_json_response({'error': str(e)}, status_code=500)


def handle_get_session_indexes(self, session_id: str):
    """Respond with the indexes linked to a session.

    Indexes that can no longer be loaded are left out. For an index whose
    metadata is empty, the database is asked to infer and store metadata,
    and the index is reloaded when that succeeds.
    """
    try:
        indexes = []
        for idx_id in db.get_indexes_for_session(session_id):
            idx = db.get_index(idx_id)
            if not idx:
                continue
            # Try to populate metadata for older indexes that have empty metadata
            if not idx.get('metadata'):
                print(f"🔍 Attempting to infer metadata for index {idx_id[:8]}...")
                if db.inspect_and_populate_index_metadata(idx_id):
                    # Refresh the index data with the new metadata
                    idx = db.get_index(idx_id)
            indexes.append(idx)
        self.send_json_response({'indexes': indexes, 'total': len(indexes)})
    except Exception as e:
        self.send_json_response({'error': str(e)}, status_code=500)


def handle_delete_index(self, index_id: str):
    """Remove an index, its documents, links, and the underlying LanceDB table."""
    try:
        deleted = db.delete_index(index_id)
        if deleted:
            self.send_json_response({'message': 'Index deleted successfully', 'index_id': index_id})
        else:
            self.send_json_response({'error': 'Index not found'}, status_code=404)
    except Exception as e:
        self.send_json_response({'error': str(e)}, status_code=500)
def send_json_response(self, data, status_code: int = 200):
    """Send a JSON (UTF-8) response with CORS headers.

    The payload is serialized BEFORE any header is written. If ``data``
    cannot be serialized, the client receives a 500 JSON error instead of
    the requested status line followed by an empty body. A
    ``Content-Length`` header is included so the client knows where the
    body ends.

    Safe against client disconnects: send failures are logged, not raised.

    Args:
        data: JSON-serializable payload.
        status_code: HTTP status to send (default 200).
    """
    try:
        response_bytes = json.dumps(data, indent=2).encode('utf-8')
    except (TypeError, ValueError) as e:
        print(f"❌ Error serializing response: {e}")
        status_code = 500
        response_bytes = json.dumps(
            {"error": "Failed to serialize response"}, indent=2
        ).encode('utf-8')

    try:
        self.send_response(status_code)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Content-Length', str(len(response_bytes)))
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', 'Content-Type, Authorization')
        self.send_header('Access-Control-Allow-Credentials', 'true')
        self.end_headers()
        self.wfile.write(response_bytes)
    except BrokenPipeError:
        # Client disconnected before we could finish sending
        print("⚠️ Client disconnected during response – ignoring.")
    except Exception as e:
        print(f"❌ Error sending response: {e}")


def log_message(self, format, *args):
    """Print a request log line prefixed with the handler's timestamp."""
    print(f"[{self.date_time_string()}] {format % args}")
class SimplePDFProcessor:
    """Extract text from uploaded PDFs and store it per session in SQLite.

    Every database operation opens its own connection and always closes it,
    including when the operation raises.
    """

    def __init__(self, db_path: str = "chat_data.db"):
        """Initialize simple PDF processor with SQLite storage"""
        self.db_path = db_path
        self.init_database()
        print("✅ Simple PDF processor initialized")

    def init_database(self):
        """Create the ``pdf_documents`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS pdf_documents (
                    id TEXT PRIMARY KEY,
                    session_id TEXT NOT NULL,
                    filename TEXT NOT NULL,
                    content TEXT NOT NULL,
                    created_at TEXT NOT NULL
                )
            ''')
            conn.commit()
        finally:
            conn.close()

    def extract_text_from_pdf(self, pdf_bytes: bytes) -> str:
        """Extract text from PDF bytes.

        Pages are processed independently: a page that fails or yields no
        text is skipped. Returns the stripped concatenation of all page
        texts, or an empty string when the PDF cannot be read at all.
        """
        try:
            print(f"📄 Starting PDF text extraction ({len(pdf_bytes)} bytes)")
            pdf_file = BytesIO(pdf_bytes)
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            print(f"📖 PDF has {len(pdf_reader.pages)} pages")

            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                print(f"📄 Processing page {page_num + 1}")
                try:
                    # Treat a missing extraction result as an empty page.
                    page_text = page.extract_text() or ""
                    if page_text.strip():
                        text += f"\n--- Page {page_num + 1} ---\n"
                        text += page_text + "\n"
                        print(f"✅ Page {page_num + 1}: extracted {len(page_text)} characters")
                except Exception as page_error:
                    print(f"❌ Error on page {page_num + 1}: {str(page_error)}")
                    continue

            print(f"📄 Total extracted text: {len(text)} characters")
            return text.strip()
        except Exception as e:
            print(f"❌ Error extracting text from PDF: {str(e)}")
            print(f"❌ Error type: {type(e).__name__}")
            return ""

    def process_pdf(self, pdf_bytes: bytes, filename: str, session_id: str) -> Dict[str, Any]:
        """Extract a PDF's text and store it for a session.

        Returns:
            On success ``{"success": True, "filename", "file_id",
            "text_length"}``; on failure ``{"success": False, "error",
            "filename"}``.
        """
        print(f"📄 Processing PDF: {filename}")

        text = self.extract_text_from_pdf(pdf_bytes)
        if not text:
            return {
                "success": False,
                "error": "Could not extract text from PDF",
                "filename": filename
            }
        print(f"📝 Extracted {len(text)} characters from {filename}")

        document_id = str(uuid.uuid4())
        now = datetime.now().isoformat()
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            conn.execute('''
                INSERT INTO pdf_documents (id, session_id, filename, content, created_at)
                VALUES (?, ?, ?, ?, ?)
            ''', (document_id, session_id, filename, text, now))
            conn.commit()
            print(f"💾 Stored document {filename} in database")
            return {
                "success": True,
                "filename": filename,
                "file_id": document_id,
                "text_length": len(text)
            }
        except Exception as e:
            print(f"❌ Error storing in database: {str(e)}")
            return {
                "success": False,
                "error": f"Database storage failed: {str(e)}",
                "filename": filename
            }
        finally:
            if conn is not None:
                conn.close()

    def get_session_documents(self, session_id: str) -> List[Dict[str, Any]]:
        """Return id, filename and created_at of a session's documents, newest first.

        Returns an empty list when the query fails.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            conn.row_factory = sqlite3.Row
            cursor = conn.execute('''
                SELECT id, filename, created_at
                FROM pdf_documents
                WHERE session_id = ?
                ORDER BY created_at DESC
            ''', (session_id,))
            return [dict(row) for row in cursor.fetchall()]
        except Exception as e:
            print(f"❌ Error getting session documents: {str(e)}")
            return []
        finally:
            if conn is not None:
                conn.close()

    def get_document_content(self, session_id: str) -> str:
        """Return all of a session's document text, oldest first (for LLM context).

        Each document is preceded by a ``=== Document: <filename> ===``
        header. Returns an empty string when the session has no documents
        or the query fails.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.execute('''
                SELECT filename, content
                FROM pdf_documents
                WHERE session_id = ?
                ORDER BY created_at ASC
            ''', (session_id,))
            rows = cursor.fetchall()
        except Exception as e:
            print(f"❌ Error getting document content: {str(e)}")
            return ""
        finally:
            if conn is not None:
                conn.close()

        combined_content = "".join(
            f"\n\n=== Document: {filename} ===\n\n{content}"
            for filename, content in rows
        )
        return combined_content.strip()

    def delete_session_documents(self, session_id: str) -> bool:
        """Delete all documents for a session.

        Returns True when at least one row was deleted, False when nothing
        matched or the deletion failed.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.execute('''
                DELETE FROM pdf_documents
                WHERE session_id = ?
            ''', (session_id,))
            deleted_count = cursor.rowcount
            conn.commit()
            if deleted_count > 0:
                print(f"🗑️ Deleted {deleted_count} documents for session {session_id[:8]}...")
            return deleted_count > 0
        except Exception as e:
            print(f"❌ Error deleting session documents: {str(e)}")
            return False
        finally:
            if conn is not None:
                conn.close()


# Global instance
simple_pdf_processor = None


def initialize_simple_pdf_processor():
    """Initialize the global PDF processor; leave it as None on failure."""
    global simple_pdf_processor
    try:
        simple_pdf_processor = SimplePDFProcessor()
        print("✅ Global PDF processor initialized")
    except Exception as e:
        print(f"❌ Failed to initialize PDF processor: {str(e)}")
        simple_pdf_processor = None


def get_simple_pdf_processor():
    """Return the global PDF processor, initializing it on first use."""
    global simple_pdf_processor
    if simple_pdf_processor is None:
        initialize_simple_pdf_processor()
    return simple_pdf_processor
try: response = requests.post( "http://localhost:8000/chat", headers={"Content-Type": "application/json"}, json=test_message, timeout=30 ) if response.status_code == 200: data = response.json() print(f"✅ Chat test passed") print(f" Model: {data['model']}") print(f" Response: {data['response']}") print(f" Message count: {data['message_count']}") return True else: print(f"❌ Chat test failed: {response.status_code}") print(f" Response: {response.text}") return False except requests.exceptions.RequestException as e: print(f"❌ Chat test failed: {e}") return False def test_conversation_history(): """Test conversation with history""" print("\n🗨️ Testing conversation history...") # First message conversation = [] message1 = { "message": "My name is Alice. Remember this.", "model": "llama3.2:latest", "conversation_history": conversation } try: response1 = requests.post( "http://localhost:8000/chat", headers={"Content-Type": "application/json"}, json=message1, timeout=30 ) if response1.status_code == 200: data1 = response1.json() # Add to conversation history conversation.append({"role": "user", "content": "My name is Alice. 
Remember this."}) conversation.append({"role": "assistant", "content": data1["response"]}) # Second message asking about the name message2 = { "message": "What is my name?", "model": "llama3.2:latest", "conversation_history": conversation } response2 = requests.post( "http://localhost:8000/chat", headers={"Content-Type": "application/json"}, json=message2, timeout=30 ) if response2.status_code == 200: data2 = response2.json() print(f"✅ Conversation history test passed") print(f" First response: {data1['response']}") print(f" Second response: {data2['response']}") # Check if the AI remembered the name if "alice" in data2['response'].lower(): print(f"✅ AI correctly remembered the name!") else: print(f"⚠️ AI might not have remembered the name") return True else: print(f"❌ Second message failed: {response2.status_code}") return False else: print(f"❌ First message failed: {response1.status_code}") return False except requests.exceptions.RequestException as e: print(f"❌ Conversation test failed: {e}") return False def main(): print("🧪 Testing localGPT Backend") print("=" * 40) # Test health endpoint health_ok = test_health_endpoint() if not health_ok: print("\n❌ Backend server is not running or not healthy") print(" Make sure to run: python server.py") return # Test basic chat chat_ok = test_chat_endpoint() if not chat_ok: print("\n❌ Chat functionality is not working") return # Test conversation history conversation_ok = test_conversation_history() print("\n" + "=" * 40) if health_ok and chat_ok and conversation_ok: print("🎉 All tests passed! Backend is ready for frontend integration.") else: print("⚠️ Some tests failed. 
def test_ollama_connectivity():
    """Report whether an Ollama server is reachable through ``OllamaClient``.

    Prints the ``OLLAMA_HOST`` setting and the client's base URL, then asks
    the client whether Ollama is running. Returns ``True`` only when it is
    (after listing the models); any exception, including a failed import of
    ``ollama_client``, is reported and yields ``False``.
    """
    print("🧪 Testing Ollama Connectivity")
    print("=" * 40)

    host_setting = os.environ.get('OLLAMA_HOST', 'Not set')
    print(f"OLLAMA_HOST environment variable: {host_setting}")

    try:
        # Imported lazily so an import failure is reported like any other error.
        from ollama_client import OllamaClient

        ollama = OllamaClient()
        print(f"OllamaClient base_url: {ollama.base_url}")

        running = ollama.is_ollama_running()
        print(f"Ollama running: {running}")

        if not running:
            print("❌ Ollama connectivity test failed!")
            return False

        available = ollama.list_models()
        print(f"Available models: {available}")
        print("✅ Ollama connectivity test passed!")
        return True

    except Exception as e:
        print(f"❌ Error testing Ollama connectivity: {e}")
        return False
class IndexCreator:
    """Interactive index creation utility.

    Guides the user through naming an index, selecting documents and choosing
    processing options, records the index through ``ChatDatabase`` and runs
    the documents through ``IndexingPipeline``.
    """

    # Extensions collected when a whole directory is added.
    SUPPORTED_EXTENSIONS = ('.pdf', '.txt', '.docx', '.md', '.html', '.htm')

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the index creator with optional custom configuration.

        Args:
            config_path: Optional path to a JSON pipeline configuration; the
                "default" entry of ``PIPELINE_CONFIGS`` is used otherwise.
        """
        self.db = ChatDatabase()
        self.config = self._load_config(config_path)

        # Initialize Ollama client
        self.ollama_client = OllamaClient()
        self.ollama_config = {
            "generation_model": "qwen3:0.6b",
            "embedding_model": "qwen3:0.6b"
        }

        # Initialize indexing pipeline
        self.pipeline = IndexingPipeline(
            self.config,
            self.ollama_client,
            self.ollama_config
        )

    def _load_config(self, config_path: Optional[str] = None) -> dict:
        """Load configuration from file or use default.

        Falls back to ``PIPELINE_CONFIGS["default"]`` (or ``{}``) when no path
        is given, the file does not exist, or it cannot be parsed. Each
        fallback caused by a bad path is announced instead of being silent.
        """
        if config_path:
            if os.path.exists(config_path):
                try:
                    with open(config_path, 'r', encoding='utf-8') as f:
                        return json.load(f)
                except Exception as e:
                    print(f"⚠️ Error loading config from {config_path}: {e}")
                    print("Using default configuration...")
            else:
                print(f"⚠️ Config file not found: {config_path}")
                print("Using default configuration...")

        return PIPELINE_CONFIGS.get("default", {})

    def get_user_input(self, prompt: str, default: str = "") -> str:
        """Get user input with optional default value.

        When ``default`` is non-empty it is shown in brackets and returned if
        the user just presses Enter.
        """
        if default:
            user_input = input(f"{prompt} [{default}]: ").strip()
            return user_input if user_input else default
        return input(f"{prompt}: ").strip()

    def _prompt_int(self, prompt: str, default: str) -> int:
        """Ask for an integer, re-prompting until the answer parses."""
        while True:
            raw = self.get_user_input(prompt, default)
            try:
                return int(raw)
            except ValueError:
                print(f"❌ '{raw}' is not a whole number. Please try again.")

    def _scan_directory(self, dir_path: str) -> List[Path]:
        """Return each supported file under ``dir_path`` exactly once.

        ``rglob`` already covers the directory itself as well as its
        subdirectories, so a single recursive pass per extension is enough;
        the dict keeps first-seen order while removing repeats.
        """
        unique = {}
        for ext in self.SUPPORTED_EXTENSIONS:
            for match in Path(dir_path).rglob(f"*{ext}"):
                if match.is_file():
                    unique.setdefault(match, None)
        return list(unique)

    def select_documents(self) -> List[str]:
        """Interactive document selection.

        Returns:
            Absolute paths of the selected documents, without duplicates. The
            loop only ends once at least one document has been selected.
        """
        print("\n📁 Document Selection")
        print("=" * 50)

        documents = []

        while True:
            print("\nOptions:")
            print("1. Add a single document")
            print("2. Add all documents from a directory")
            print("3. Finish and proceed with selected documents")
            print("4. Show selected documents")

            choice = self.get_user_input("Select an option (1-4)", "1")

            if choice == "1":
                doc_path = self.get_user_input("Enter document path")
                # A directory is not a document; require a regular file.
                if os.path.isfile(doc_path):
                    absolute = os.path.abspath(doc_path)
                    if absolute not in documents:
                        documents.append(absolute)
                    print(f"✅ Added: {doc_path}")
                else:
                    print(f"❌ File not found: {doc_path}")

            elif choice == "2":
                dir_path = self.get_user_input("Enter directory path")
                if os.path.isdir(dir_path):
                    found_docs = self._scan_directory(dir_path)

                    if found_docs:
                        print(f"Found {len(found_docs)} documents:")
                        for doc in found_docs:
                            print(f" - {doc}")

                        if self.get_user_input("Add all these documents? (y/n)", "y").lower() == 'y':
                            new_docs = []
                            for doc in found_docs:
                                absolute = str(doc.absolute())
                                # Skip files already selected earlier.
                                if absolute not in documents and absolute not in new_docs:
                                    new_docs.append(absolute)
                            documents.extend(new_docs)
                            print(f"✅ Added {len(new_docs)} documents")
                    else:
                        print("❌ No supported documents found in directory")
                else:
                    print(f"❌ Directory not found: {dir_path}")

            elif choice == "3":
                if documents:
                    break
                else:
                    print("❌ No documents selected. Please add at least one document.")

            elif choice == "4":
                if documents:
                    print(f"\n📄 Selected documents ({len(documents)}):")
                    for i, doc in enumerate(documents, 1):
                        print(f" {i}. {doc}")
                else:
                    print("No documents selected yet.")

            else:
                print("Invalid choice. Please select 1-4.")

        return documents

    def configure_processing(self) -> dict:
        """Interactive processing configuration.

        Returns:
            A dict with the chunking sizes, the three feature toggles, the two
            model names, and the fixed ``retrieval_mode`` / ``window_size``.
        """
        print("\n⚙️ Processing Configuration")
        print("=" * 50)

        print("Configure how documents will be processed:")

        # Basic settings (re-prompt instead of crashing on non-numeric input)
        chunk_size = self._prompt_int("Chunk size", "512")
        chunk_overlap = self._prompt_int("Chunk overlap", "64")

        # Advanced settings
        print("\nAdvanced options:")
        enable_enrich = self.get_user_input("Enable contextual enrichment? (y/n)", "y").lower() == 'y'
        enable_latechunk = self.get_user_input("Enable late chunking? (y/n)", "y").lower() == 'y'
        enable_docling = self.get_user_input("Enable Docling chunking? (y/n)", "y").lower() == 'y'

        # Model selection
        print("\nModel Configuration:")
        embedding_model = self.get_user_input("Embedding model", "Qwen/Qwen3-Embedding-0.6B")
        generation_model = self.get_user_input("Generation model", "qwen3:0.6b")

        return {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "enable_enrich": enable_enrich,
            "enable_latechunk": enable_latechunk,
            "enable_docling": enable_docling,
            "embedding_model": embedding_model,
            "generation_model": generation_model,
            "retrieval_mode": "hybrid",
            "window_size": 2
        }

    def create_index_interactive(self) -> None:
        """Run the interactive index creation process.

        Collects the index details, asks for confirmation, then records the
        index and its documents and runs the pipeline. Errors during creation
        are printed with a traceback rather than raised.
        """
        print("🚀 LocalGPT Index Creation Tool")
        print("=" * 50)

        # Get index details
        index_name = self.get_user_input("Enter index name")
        index_description = self.get_user_input("Enter index description (optional)")

        # Select documents
        documents = self.select_documents()

        # Configure processing
        processing_config = self.configure_processing()

        # Confirm creation
        print("\n📋 Index Summary")
        print("=" * 50)
        print(f"Name: {index_name}")
        print(f"Description: {index_description or 'None'}")
        print(f"Documents: {len(documents)}")
        print(f"Chunk size: {processing_config['chunk_size']}")
        print(f"Enrichment: {'Enabled' if processing_config['enable_enrich'] else 'Disabled'}")
        print(f"Embedding model: {processing_config['embedding_model']}")

        if self.get_user_input("\nProceed with index creation? (y/n)", "y").lower() != 'y':
            print("❌ Index creation cancelled.")
            return

        # Create the index
        try:
            print("\n🔥 Creating index...")

            # Create index record in database
            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=processing_config
            )

            # Add documents to index
            for doc_path in documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)

            # Process documents through pipeline
            print("📚 Processing documents...")
            self.pipeline.process_documents(documents)

            print(f"\n✅ Index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")
            print(f"Processed {len(documents)} documents")

            # Test the index
            if self.get_user_input("\nTest the index with a sample query? (y/n)", "y").lower() == 'y':
                self.test_index(index_id)

        except Exception as e:
            print(f"❌ Error creating index: {e}")
            import traceback
            traceback.print_exc()

    def test_index(self, index_id: str) -> None:
        """Test the created index with a sample query.

        Runs the query through the "default" agent against the table
        ``text_pages_<index_id>`` and prints the response; errors are printed,
        not raised.
        """
        try:
            print("\n🧪 Testing Index")
            print("=" * 50)

            # Get agent for testing
            agent = get_agent("default")

            # Test query
            test_query = self.get_user_input("Enter a test query", "What is this document about?")

            print(f"\nProcessing query: {test_query}")
            response = agent.run(test_query, table_name=f"text_pages_{index_id}")

            print(f"\n🤖 Response:")
            print(response)

        except Exception as e:
            print(f"❌ Error testing index: {e}")

    def batch_create_from_config(self, config_file: str) -> None:
        """Create index from batch configuration file.

        The JSON file may provide ``index_name``, ``index_description``,
        ``documents`` and ``processing``. Missing documents are reported and
        skipped; nothing is created if none remain. Errors are printed with a
        traceback rather than raised.
        """
        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                batch_config = json.load(f)

            index_name = batch_config.get("index_name", "Batch Index")
            index_description = batch_config.get("index_description", "")
            documents = batch_config.get("documents", [])
            processing_config = batch_config.get("processing", {})

            if not documents:
                print("❌ No documents specified in batch configuration")
                return

            # Validate documents exist
            valid_documents = []
            for doc_path in documents:
                if os.path.exists(doc_path):
                    valid_documents.append(doc_path)
                else:
                    print(f"⚠️ Document not found: {doc_path}")

            if not valid_documents:
                print("❌ No valid documents found")
                return

            print(f"🚀 Creating batch index: {index_name}")
            print(f"📄 Processing {len(valid_documents)} documents...")

            # Create index
            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=processing_config
            )

            # Add documents
            for doc_path in valid_documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)

            # Process documents
            self.pipeline.process_documents(valid_documents)

            print(f"✅ Batch index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")

        except Exception as e:
            print(f"❌ Error creating batch index: {e}")
            import traceback
            traceback.print_exc()
class BatchIndexingDemo:
    """Demonstration of batch indexing capabilities.

    Reads a JSON batch configuration, merges its ``pipeline_settings`` over
    the default pipeline configuration, and creates one index per entry of
    the configuration's ``indexes`` list.
    """

    def __init__(self, config_path: str):
        """Initialize the batch indexing demo.

        Args:
            config_path: Path to the JSON batch configuration. The process
                exits with status 1 if it is missing or not valid JSON.
        """
        self.config_path = config_path
        self.config = self._load_config()
        self.db = ChatDatabase()

        # Initialize Ollama client
        self.ollama_client = OllamaClient()

        # Initialize pipeline with merged configuration
        self.pipeline_config = self._merge_configurations()
        self.pipeline = IndexingPipeline(
            self.pipeline_config,
            self.ollama_client,
            self.config.get("ollama_config", {
                "generation_model": "qwen3:0.6b",
                "embedding_model": "qwen3:0.6b"
            })
        )

    def _load_config(self) -> Dict[str, Any]:
        """Load batch indexing configuration from file.

        Exits the process with status 1 when the file is missing or contains
        invalid JSON.
        """
        try:
            with open(self.config_path, 'r') as f:
                config = json.load(f)
            print(f"✅ Loaded configuration from {self.config_path}")
            return config
        except FileNotFoundError:
            print(f"❌ Configuration file not found: {self.config_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in configuration file: {e}")
            sys.exit(1)

    def _merge_configurations(self) -> Dict[str, Any]:
        """Merge batch config with default pipeline config.

        Values from the batch file's ``pipeline_settings`` win; nested dicts
        are merged key by key instead of being replaced wholesale.
        """
        # Start with default pipeline configuration
        merged_config = PIPELINE_CONFIGS.get("default", {}).copy()

        # Override with batch-specific settings
        batch_settings = self.config.get("pipeline_settings", {})

        # Deep merge for nested dictionaries
        def deep_merge(base: dict, override: dict) -> dict:
            result = base.copy()
            for key, value in override.items():
                if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                    result[key] = deep_merge(result[key], value)
                else:
                    result[key] = value
            return result

        return deep_merge(merged_config, batch_settings)

    def validate_documents(self, documents: List[str]) -> List[str]:
        """Validate and filter document paths.

        Relative paths are made absolute. A path is kept only if it exists
        and has one of the supported extensions (case-insensitive).

        Returns:
            The absolute paths that passed both checks, in input order.
        """
        valid_documents = []

        print(f"📋 Validating {len(documents)} documents...")

        for doc_path in documents:
            # Handle relative paths
            if not os.path.isabs(doc_path):
                doc_path = os.path.abspath(doc_path)

            if os.path.exists(doc_path):
                # Check file extension
                ext = Path(doc_path).suffix.lower()
                if ext in ['.pdf', '.txt', '.docx', '.md', '.html', '.htm']:
                    valid_documents.append(doc_path)
                    print(f" ✅ {doc_path}")
                else:
                    print(f" ⚠️ Unsupported file type: {doc_path}")
            else:
                print(f" ❌ File not found: {doc_path}")

        print(f"📊 {len(valid_documents)} valid documents found")
        return valid_documents

    def create_indexes(self) -> List[str]:
        """Create multiple indexes based on configuration.

        Returns:
            The ids of the indexes that were created; entries that failed or
            were skipped are left out.
        """
        indexes = self.config.get("indexes", [])
        created_indexes = []

        for index_config in indexes:
            index_id = self.create_single_index(index_config)
            if index_id:
                created_indexes.append(index_id)

        return created_indexes

    def create_single_index(self, index_config: Dict[str, Any]) -> Optional[str]:
        """Create a single index from configuration.

        Args:
            index_config: One entry of the batch file's ``indexes`` list,
                read for ``name``, ``description``, ``documents`` and
                ``processing_options``.

        Returns:
            The new index id, or ``None`` when the entry has no usable
            documents or any step fails (the error is printed with a
            traceback).
        """
        # Bound before the try block so the error handler can always name the
        # index, even when reading the entry itself is what failed.
        index_name = "Unnamed Index"

        try:
            # Extract index metadata
            index_name = index_config.get("name", "Unnamed Index")
            index_description = index_config.get("description", "")
            documents = index_config.get("documents", [])

            if not documents:
                print(f"⚠️ No documents specified for index '{index_name}', skipping...")
                return None

            # Validate documents
            valid_documents = self.validate_documents(documents)
            if not valid_documents:
                print(f"❌ No valid documents found for index '{index_name}'")
                return None

            print(f"\n🚀 Creating index: {index_name}")
            print(f"📄 Processing {len(valid_documents)} documents")

            # Create index record in database
            index_metadata = {
                "created_by": "demo_batch_indexing.py",
                "created_at": datetime.now().isoformat(),
                "document_count": len(valid_documents),
                "config_used": index_config.get("processing_options", {})
            }

            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=index_metadata
            )

            # Add documents to index
            for doc_path in valid_documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)

            # Process documents through pipeline
            start_time = time.time()
            self.pipeline.process_documents(valid_documents)
            processing_time = time.time() - start_time

            print(f"✅ Index '{index_name}' created successfully!")
            print(f" Index ID: {index_id}")
            print(f" Processing time: {processing_time:.2f} seconds")
            print(f" Documents processed: {len(valid_documents)}")

            return index_id

        except Exception as e:
            print(f"❌ Error creating index '{index_name}': {e}")
            import traceback
            traceback.print_exc()
            return None

    def demonstrate_features(self):
        """Demonstrate various indexing features.

        Prints the configuration file path, the number of indexes to create,
        and any ``pipeline_settings`` / ``ollama_config`` entries present.
        """
        print("\n🎯 Batch Indexing Demo Features:")
        print("=" * 50)

        # Show configuration
        print(f"📋 Configuration file: {self.config_path}")
        print(f"📊 Number of indexes to create: {len(self.config.get('indexes', []))}")

        # Show pipeline settings
        pipeline_settings = self.config.get("pipeline_settings", {})
        if pipeline_settings:
            print("\n⚙️ Pipeline Settings:")
            for key, value in pipeline_settings.items():
                print(f" {key}: {value}")

        # Show model configuration
        ollama_config = self.config.get("ollama_config", {})
        if ollama_config:
            print("\n🤖 Model Configuration:")
            for key, value in ollama_config.items():
                print(f" {key}: {value}")

    def run_demo(self):
        """Run the complete batch indexing demo.

        Shows the configured features, creates every configured index, and
        prints a timing summary followed by the created indexes.
        """
        print("🚀 LocalGPT Batch Indexing Demo")
        print("=" * 50)

        # Show demo features
        self.demonstrate_features()

        # Create indexes
        print(f"\n📚 Starting batch indexing process...")
        start_time = time.time()

        created_indexes = self.create_indexes()

        total_time = time.time() - start_time

        # Summary
        print(f"\n📊 Batch Indexing Summary")
        print("=" * 50)
        print(f"✅ Successfully created {len(created_indexes)} indexes")
        print(f"⏱️ Total processing time: {total_time:.2f} seconds")

        if created_indexes:
            print(f"\n📋 Created Indexes:")
            for i, index_id in enumerate(created_indexes, 1):
                index_info = self.db.get_index(index_id)
                if index_info:
                    print(f" {i}. {index_info['name']} ({index_id[:8]}...)")
                    print(f" Documents: {len(index_info.get('documents', []))}")

        print(f"\n🎉 Demo completed successfully!")
        print(f"💡 You can now use these indexes in the LocalGPT interface.")
def main():
    """Command-line entry point of the batch indexing demo.

    ``--create-sample-config`` writes a sample configuration and returns.
    Otherwise the file named by ``--config`` must exist (exit status 1 if it
    does not) and the demo is run with it; Ctrl-C and any other error are
    reported on stdout instead of propagating.
    """
    cli = argparse.ArgumentParser(
        description="LocalGPT Batch Indexing Demo",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python demo_batch_indexing.py --config batch_indexing_config.json
  python demo_batch_indexing.py --create-sample-config

This demo showcases the advanced batch indexing capabilities of LocalGPT,
including multi-index creation, advanced configuration options, and
comprehensive processing pipelines.
        """
    )
    cli.add_argument(
        "--config",
        type=str,
        default="batch_indexing_config.json",
        help="Path to batch indexing configuration file"
    )
    cli.add_argument(
        "--create-sample-config",
        action="store_true",
        help="Create a sample configuration file"
    )
    options = cli.parse_args()

    # Generating the sample file is a stand-alone action.
    if options.create_sample_config:
        create_sample_config()
        return

    config_path = options.config
    if not os.path.exists(config_path):
        print(f"❌ Configuration file not found: {config_path}")
        print(f"💡 Create a sample config with: python {sys.argv[0]} --create-sample-config")
        sys.exit(1)

    try:
        BatchIndexingDemo(config_path).run_demo()
    except KeyboardInterrupt:
        print("\n\n❌ Demo cancelled by user.")
    except Exception as err:
        print(f"❌ Demo failed: {err}")
        import traceback
        traceback.print_exc()
dockerfile: Dockerfile.backend container_name: rag-backend ports: - "8000:8000" environment: - NODE_ENV=production - RAG_API_URL=http://rag-api:8001 volumes: - ./backend/chat_data.db:/app/backend/chat_data.db - ./shared_uploads:/app/shared_uploads depends_on: rag-api: condition: service_healthy healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network # Frontend Next.js application frontend: build: context: . dockerfile: Dockerfile.frontend container_name: rag-frontend ports: - "3000:3000" environment: - NODE_ENV=production - NEXT_PUBLIC_API_URL=http://localhost:8000 depends_on: backend: condition: service_healthy healthcheck: test: ["CMD", "curl", "-f", "http://localhost:3000"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network networks: rag-network: driver: bridge ================================================ FILE: docker-compose.yml ================================================ services: # Ollama service for LLM inference (optional - can use host Ollama instead) ollama: image: ollama/ollama:latest container_name: rag-ollama ports: - "11434:11434" volumes: - ollama_data:/root/.ollama environment: - OLLAMA_HOST=0.0.0.0 healthcheck: test: ["CMD", "curl", "-f", "http://localhost:11434/api/tags"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network profiles: - with-ollama # Optional service - enable with --profile with-ollama # RAG API server rag-api: build: context: . 
dockerfile: Dockerfile.rag-api container_name: rag-api ports: - "8001:8001" environment: # Use host Ollama by default, or containerized Ollama if enabled - OLLAMA_HOST=${OLLAMA_HOST:-http://host.docker.internal:11434} - NODE_ENV=production volumes: - ./lancedb:/app/lancedb - ./index_store:/app/index_store - ./shared_uploads:/app/shared_uploads healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8001/models"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network # Backend API server backend: build: context: . dockerfile: Dockerfile.backend container_name: rag-backend ports: - "8000:8000" environment: - NODE_ENV=production - RAG_API_URL=http://rag-api:8001 - OLLAMA_HOST=${OLLAMA_HOST:-http://172.18.0.1:11434} volumes: - ./backend:/app/backend - ./shared_uploads:/app/shared_uploads depends_on: rag-api: condition: service_healthy healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network # Frontend Next.js application frontend: build: context: . 
dockerfile: Dockerfile.frontend container_name: rag-frontend ports: - "3000:3000" environment: - NODE_ENV=production - NEXT_PUBLIC_API_URL=http://localhost:8000 depends_on: backend: condition: service_healthy healthcheck: test: ["CMD", "curl", "-f", "http://localhost:3000"] interval: 30s timeout: 10s retries: 3 restart: unless-stopped networks: - rag-network volumes: ollama_data: driver: local networks: rag-network: driver: bridge ================================================ FILE: docker.env ================================================ # Docker environment configuration # Set this to use local Ollama instance running on host # Note: Using Docker gateway IP instead of host.docker.internal for Linux compatibility OLLAMA_HOST=http://172.18.0.1:11434 # Alternative: Use containerized Ollama (uncomment and run with --profile with-ollama) # OLLAMA_HOST=http://ollama:11434 # Other configuration NODE_ENV=production NEXT_PUBLIC_API_URL=http://localhost:8000 RAG_API_URL=http://rag-api:8001 ================================================ FILE: env.example.watsonx ================================================ # ==================================================================== # LocalGPT Watson X Configuration Example # ==================================================================== # This file shows how to configure LocalGPT to use IBM Watson X AI # with Granite models instead of local Ollama. # # Copy this file to .env and fill in your credentials: # cp env.example.watsonx .env # ==================================================================== # LLM Backend Selection # Options: "ollama" (default) or "watsonx" LLM_BACKEND=watsonx # ==================================================================== # Watson X Credentials # ==================================================================== # Get these from your IBM Cloud Watson X project: # 1. Go to https://cloud.ibm.com/ # 2. Navigate to Watson X AI service # 3. Create or select a project # 4.
// ES modules do not define __dirname, so derive this file's directory from
// its own module URL.
const configDir = dirname(fileURLToPath(import.meta.url));

// FlatCompat lets the legacy "extends"-style Next.js presets be used from a
// flat config; shareable configs are resolved relative to `baseDirectory`.
const compat = new FlatCompat({ baseDirectory: configDir });

// Flat config is an array: spread the converted presets into it.
export default [...compat.extends("next/core-web-vitals", "next/typescript")];
// Warning: with this set, production builds complete successfully even if
// the project has ESLint errors.
const eslint: NextConfig["eslint"] = { ignoreDuringBuilds: true };

// Warning: with this set, production builds complete successfully even if
// the project has type errors.
const typescript: NextConfig["typescript"] = { ignoreBuildErrors: true };

/** Next.js configuration: both build-time checks above are disabled. */
const nextConfig: NextConfig = { eslint, typescript };

export default nextConfig;
it. ## System Overview This RAG system is a sophisticated, multimodal question-answering system designed to work with a variety of documents. It can understand and process both the text and the visual layout of documents, and it uses a knowledge graph to understand the relationships between the entities in the documents. The system is built around an agentic workflow that allows it to: * **Decompose complex questions** into smaller, more manageable sub-questions. * **Triage queries** to determine if they can be answered directly or if they require retrieval from the knowledge base. * **Verify answers** against the retrieved context to ensure they are accurate and supported by the documents. ## Architecture The system is composed of two main pipelines: an indexing pipeline and a retrieval pipeline. ### Indexing Pipeline The indexing pipeline is responsible for processing the documents and building the knowledge base. It performs the following steps: 1. **Text Extraction**: The pipeline uses `PyMuPDF` to extract the text from each page of the PDF documents, preserving the original layout. 2. **Text Embedding**: The extracted text is then passed to a text embedding model (`Qwen/Qwen3-Embedding-0.6B`) to create numerical vector representations of the text. 3. **Knowledge Graph Creation**: The text is also passed to a `GraphExtractor` that uses a large language model (`qwen2.5vl:7b`) to extract entities and their relationships. This information is then used to build a knowledge graph, which is stored as a `.gml` file. 4. **Indexing**: The text embeddings and the knowledge graph are then stored in a LanceDB database. ### Retrieval Pipeline The retrieval pipeline is responsible for answering user queries. It uses an agentic workflow that includes the following steps: 1. **Triage**: The agent first triages the user's query to determine if it can be answered directly or if it requires retrieval from the knowledge base. 2. 
**Query Decomposition**: If the query is complex, the agent uses a `QueryDecomposer` to break it down into smaller, more manageable sub-questions. 3. **Retrieval**: The agent then uses a `MultiVectorRetriever` and a `GraphRetriever` to retrieve relevant information from the knowledge base. 4. **Verification**: The retrieved context is then passed to a `Verifier` that uses an LLM to check if the context is sufficient to answer the query. 5. **Synthesis**: Finally, the agent uses an LLM to synthesize a final answer from the verified context. ## API Endpoints The system is driven from the command line through the following subcommands of `rag_system/main.py`: * `index`: This command runs the indexing pipeline to process the documents and build the knowledge base. * `chat`: This command runs the retrieval pipeline to answer a user's query. * `show_graph`: This command displays the knowledge graph in a human-readable format and also provides a visual representation of the graph. ### Usage To run the system, use the following commands: ```bash # Activate the virtual environment source rag_system/rag_venv/bin/activate # Index the documents python rag_system/main.py index # Ask a question python rag_system/main.py chat "Your question here" # Show the knowledge graph python rag_system/main.py show_graph ``` ================================================ FILE: rag_system/README.md ================================================ # Multimodal RAG System This document provides a detailed overview of the multimodal Retrieval-Augmented Generation (RAG) system implemented in this directory. The system is designed to process and understand information from PDF documents, combining both textual and visual data to answer complex queries. ## 1. Overview This RAG system is a sophisticated pipeline that leverages state-of-the-art open-source models to provide accurate, context-aware answers from a document corpus. Unlike traditional RAG systems that only process text, this implementation is fully multimodal. 
It extracts and indexes both text and images from PDFs, allowing a Vision Language Model (VLM) to reason over both modalities when generating a final answer. The core capabilities include: - **Multimodal Indexing**: Extracts text and images from PDFs and creates separate vector embeddings for each. - **Hybrid Retrieval**: Combines dense vector search (for semantic similarity) with traditional keyword-based search (BM25) for robust retrieval. - **Advanced Reranking**: Utilizes a powerful reranker model to improve the relevance of retrieved documents before they are passed to the generator. - **VLM-Powered Synthesis**: Employs a Vision Language Model to synthesize the final answer, allowing it to analyze both the text and the images from the retrieved document chunks. ## 2. Architecture The system is composed of several key Python modules that work together to form the RAG pipeline. ### Key Modules: - `main.py`: The main entry point for the application. It contains the configuration for all models and pipelines and orchestrates the indexing and retrieval processes. - `rag_system/pipelines/`: Contains the high-level orchestration for indexing and retrieval. - `indexing_pipeline.py`: Manages the process of converting raw PDFs into indexed, searchable data. - `retrieval_pipeline.py`: Handles the end-to-end process of taking a user query, retrieving relevant information, and generating a final answer. - `rag_system/indexing/`: Contains all modules related to data processing and indexing. - `multimodal.py`: Responsible for extracting text and images from PDFs and generating embeddings using the configured vision model (`colqwen2-v1.0`). - `representations.py`: Defines the text embedding model (`Qwen2-7B-instruct`) and other data representation generators. - `embedders.py`: Manages the connection to the **LanceDB** vector database and handles the indexing of vector embeddings. - `rag_system/retrieval/`: Contains modules for retrieving and ranking documents. 
- `retrievers.py`: Implements the logic for searching the vector database to find relevant text and image chunks. - `reranker.py` (located in `rag_system/rerankers/`): Contains the `QwenReranker` class, which re-ranks the retrieved documents for improved relevance. - `rag_system/agent/`: Contains the `Agent` loop that interacts with the user and the RAG pipelines. - `rag_system/utils/`: Contains utility clients, such as the `OllamaClient` for interacting with the Ollama server. ### Data Flow: 1. **Indexing**: - The `MultimodalProcessor` reads a PDF and splits it into pages. - For each page, it extracts the raw text and a full-page image. - The `QwenEmbedder` generates a vector embedding for the text. - The `LocalVisionModel` (using `colqwen2-v1.0`) generates a vector embedding for the image. - The `VectorIndexer` stores these embeddings in separate tables within a **LanceDB** database. 2. **Retrieval**: - A user submits a query to the `Agent`. - The `RetrievalPipeline`'s `MultiVectorRetriever` searches both the text and image tables in LanceDB for relevant chunks. - The retrieved documents are passed to the `QwenReranker`, which re-orders them based on relevance to the query. - The top-ranked documents (containing both text and image references) are passed to the Vision Language Model (`qwen-vl`). - The VLM analyzes the text and images to extract key facts. - A final text generation model (`llama3`) synthesizes these facts into a coherent, human-readable answer. ## 3. Models This system relies on a suite of powerful, open-source models. | Component | Model | Framework | Purpose | | --------------------- | ----------------------------------- | -------------- | ------------------------------------------- | | **Image Embedding** | `vidore/colqwen2-v1.0` | `colpali` | Generates vector embeddings from images. | | **Text Embedding** | `Qwen/Qwen2-7B-instruct` | `transformers` | Generates vector embeddings from text. 
| | **Reranker** | `Qwen/Qwen-reranker` | `transformers` | Re-ranks retrieved documents for relevance. | | **Vision Language Model** | `qwen2.5vl:7b` | `Ollama` | Extracts facts from text and images. | | **Text Generation** | `llama3` | `Ollama` | Synthesizes the final answer. | ## 4. Configuration All system configurations are centralized in `main.py`. - **`OLLAMA_CONFIG`**: Defines the models that will be run via the Ollama server. This includes the final text generation model and the Vision Language Model. - **`PIPELINE_CONFIGS`**: Contains the configurations for both the `indexing` and `retrieval` pipelines. Here you can specify: - The paths for the LanceDB database and source documents. - The names of the tables to be used for text and image embeddings. - The Hugging Face model names for the text embedder, vision model, and reranker. - Parameters for the reranker and retrieval process (e.g., `top_k`, `retrieval_k`). To change a model, simply update the corresponding model name in this configuration file. ## 5. Usage To run the system, you first need to ensure the required models are available. ### Prerequisites: 1. **Install Dependencies**: ```bash pip install -r requirements.txt ``` 2. **Download Ollama Models**: ```bash ollama pull llama3 ollama pull qwen2.5vl:7b ``` 3. **Hugging Face Models**: The `transformers` and `colpali` libraries will automatically download the required models the first time they are used. Ensure you have a stable internet connection. ### Running the System: 1. **Execute the Main Script**: ```bash python rag_system/main.py ``` 2. **Indexing**: The script will first run the indexing pipeline, processing any documents in the `rag_system/documents` directory and storing their embeddings in LanceDB. 3. **Querying**: Once indexing is complete, the RAG agent will be ready. You can ask questions about the documents you have indexed. ``` > What was the revenue growth in Q3? ``` 4. **Exit**: To stop the agent, type `quit`. 
================================================ FILE: rag_system/__init__.py ================================================ import logging import os # --------------------------------------------------------- # Global logging setup for the entire `rag_system` package. # --------------------------------------------------------- # You can control verbosity with an env variable, e.g.: # export RAG_LOG_LEVEL=DEBUG (or INFO / WARNING / ERROR) # If not set, we default to INFO to avoid excessive noise. # --------------------------------------------------------- _level_str = os.getenv("RAG_LOG_LEVEL", "INFO").upper() _level = getattr(logging, _level_str, logging.INFO) # Only configure root logger if it hasn't been configured yet if not logging.getLogger().handlers: logging.basicConfig( level=_level, format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s", ) else: logging.getLogger().setLevel(_level) logging.getLogger(__name__).debug( "Initialized rag_system logging (level=%s)", _level_str ) # --------------------------------------------------------- # Authenticate to Hugging Face Hub if a token is provided # --------------------------------------------------------- from typing import Optional def _hf_auto_login() -> None: """Attempt to authenticate with Hugging Face Hub using an env token. We support both the new canonical env var name (HF_TOKEN) and the two historical variants to avoid breaking user setups. The login call is idempotent: if a cached token already exists, the hub library will simply reuse it, so it is safe to run on every import. 
""" import os token: Optional[str] = ( os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN") ) if not token: logging.getLogger(__name__).debug("No Hugging Face token found in env; proceeding anonymously.") return try: from huggingface_hub import login as hf_login hf_login(token=token, add_to_git_credential=False) # type: ignore logging.getLogger(__name__).info("Authenticated to Hugging Face Hub via env token.") except Exception as exc: # pragma: no cover – best-effort login logging.getLogger(__name__).warning( "Failed to login to Hugging Face Hub automatically: %s", exc ) # Run on module import _hf_auto_login() ================================================ FILE: rag_system/agent/__init__.py ================================================ ================================================ FILE: rag_system/agent/loop.py ================================================ from typing import Dict, Any, Optional import json import time, asyncio, os import numpy as np import concurrent.futures from cachetools import TTLCache, LRUCache from rag_system.utils.ollama_client import OllamaClient from rag_system.pipelines.retrieval_pipeline import RetrievalPipeline from rag_system.agent.verifier import Verifier from rag_system.retrieval.query_transformer import QueryDecomposer, GraphQueryTranslator from rag_system.retrieval.retrievers import GraphRetriever class Agent: """ The main agent, now fully wired to use a live Ollama client. 
""" def __init__(self, pipeline_configs: Dict[str, Dict], llm_client: OllamaClient, ollama_config: Dict[str, str]): self.pipeline_configs = pipeline_configs self.llm_client = llm_client self.ollama_config = ollama_config gen_model = self.ollama_config["generation_model"] # Initialize the single, persistent retrieval pipeline for this agent self.retrieval_pipeline = RetrievalPipeline(pipeline_configs, self.llm_client, self.ollama_config) self.verifier = Verifier(llm_client, gen_model) self.query_decomposer = QueryDecomposer(llm_client, gen_model) # 🚀 OPTIMIZED: TTL cache now stores embeddings for semantic matching self._cache_max_size = 100 # fallback size limit for manual eviction helper self._query_cache: TTLCache = TTLCache(maxsize=self._cache_max_size, ttl=300) self.semantic_cache_threshold = self.pipeline_configs.get("semantic_cache_threshold", 0.98) # If set to "session", semantic-cache hits will be restricted to the same chat session. # Otherwise (default "global") answers can be reused across sessions. 
self.cache_scope = self.pipeline_configs.get("cache_scope", "global") # 'global' or 'session' # 🚀 NEW: In-memory store for conversational history per session self.chat_histories: LRUCache = LRUCache(maxsize=100) # Stores history for 100 recent sessions graph_config = self.pipeline_configs.get("graph_strategy", {}) if graph_config.get("enabled"): self.graph_query_translator = GraphQueryTranslator(llm_client, gen_model) self.graph_retriever = GraphRetriever(graph_config["graph_path"]) print("Agent initialized with live GraphRAG capabilities.") else: print("Agent initialized (GraphRAG disabled).") # ---- Load document overviews for fast routing ---- self._global_overview_path = os.path.join("index_store", "overviews", "overviews.jsonl") self.doc_overviews: list[str] = [] self._current_overview_session: str | None = None # cache key to avoid rereading on every query self._load_overviews(self._global_overview_path) def _load_overviews(self, path: str): """Helper to load overviews from a .jsonl file into self.doc_overviews.""" import json, os self.doc_overviews.clear() if not os.path.exists(path): return try: with open(path, encoding="utf-8") as fh: for line in fh: try: rec = json.loads(line) if isinstance(rec, dict) and rec.get("overview"): self.doc_overviews.append(rec["overview"].strip()) except Exception: continue print(f"📖 Loaded {len(self.doc_overviews)} overviews from {path}") except Exception as e: print(f"⚠️ Failed to load document overviews from {path}: {e}") def load_overviews_for_indexes(self, idx_ids: list[str]): """Aggregate overviews for the given indexes or fall back to global file.""" import os, json aggregated: list[str] = [] for idx in idx_ids: path = os.path.join("index_store", "overviews", f"{idx}.jsonl") if os.path.exists(path): try: with open(path, encoding="utf-8") as fh: for line in fh: if not line.strip(): continue try: rec = json.loads(line) ov = rec.get("overview", "").strip() if ov: aggregated.append(ov) except json.JSONDecodeError: continue 
except Exception as e: print(f"⚠️ Error reading {path}: {e}") if aggregated: self.doc_overviews = aggregated self._current_overview_session = "|".join(idx_ids) # cache composite key so no overwrite print(f"📖 Loaded {len(aggregated)} overviews for indexes {[i[:8] for i in idx_ids]}") else: print(f"⚠️ No per-index overviews found for {idx_ids}. Using global overview file.") self._load_overviews(self._global_overview_path) self._current_overview_session = "GLOBAL" def _cosine_similarity(self, v1: np.ndarray, v2: np.ndarray) -> float: """Computes cosine similarity between two vectors.""" if not isinstance(v1, np.ndarray): v1 = np.array(v1) if not isinstance(v2, np.ndarray): v2 = np.array(v2) if v1.shape != v2.shape: raise ValueError("Vectors must have the same shape for cosine similarity.") if np.all(v1 == 0) or np.all(v2 == 0): return 0.0 dot_product = np.dot(v1, v2) norm_v1 = np.linalg.norm(v1) norm_v2 = np.linalg.norm(v2) # Avoid division by zero if norm_v1 == 0 or norm_v2 == 0: return 0.0 return dot_product / (norm_v1 * norm_v2) def _find_in_semantic_cache(self, query_embedding: np.ndarray, session_id: Optional[str] = None) -> Optional[Dict[str, Any]]: """Finds a semantically similar query in the cache.""" if not self._query_cache or query_embedding is None: return None for key, cached_item in self._query_cache.items(): cached_embedding = cached_item.get('embedding') if cached_embedding is None: continue # Respect cache scoping: if scope is session-level, skip results from other sessions if self.cache_scope == "session" and session_id is not None: if cached_item.get("session_id") != session_id: continue try: similarity = self._cosine_similarity(query_embedding, cached_embedding) if similarity >= self.semantic_cache_threshold: print(f"🚀 Semantic cache hit! 
Similarity: {similarity:.3f} with cached query '{key}'") return cached_item.get('result') except ValueError: # In case of shape mismatch, just skip continue return None def _format_query_with_history(self, query: str, history: list) -> str: """Formats the user query with conversation history for context.""" if not history: return query formatted_history = "\n".join([f"User: {turn['query']}\nAssistant: {turn['answer']}" for turn in history]) prompt = f""" Given the following conversation history, answer the user's latest query. The history provides context for resolving pronouns or follow-up questions. --- Conversation History --- {formatted_history} --- Latest User Query: "{query}" """ return prompt # ---------------- Asynchronous triage using Ollama ---------------- async def _triage_query_async(self, query: str, history: list) -> str: print(f"🔍 ROUTING DEBUG: Starting triage for query: '{query[:100]}...'") # 1️⃣ Fast routing using precomputed overviews (if available) print(f"📖 ROUTING DEBUG: Attempting overview-based routing...") routed = self._route_via_overviews(query) if routed: print(f"✅ ROUTING DEBUG: Overview routing decided: '{routed}'") return routed else: print(f"❌ ROUTING DEBUG: Overview routing returned None, falling back to LLM triage") if history: # If there's history, the query is likely a follow-up, so we default to RAG. # A more advanced implementation could use an LLM to see if the new query # changes the topic entirely. print(f"📜 ROUTING DEBUG: History exists, defaulting to 'rag_query'") return "rag_query" print(f"🤖 ROUTING DEBUG: No history, using LLM fallback triage...") prompt = f""" You are a query routing expert. Analyze the user's question and decide which backend should handle it. Choose **exactly one** category: 1. "rag_query" – Questions about the user's uploaded documents or specific document content that should be searched. Examples: "What is the invoice amount?", "Summarize the research paper", "What companies are mentioned?" 2. 
"direct_answer" – General knowledge questions, greetings, or queries unrelated to uploaded documents. Examples: "Who are the CEOs of Tesla and Amazon?", "What is the capital of France?", "Hello", "Explain quantum physics" 3. "graph_query" – Specific factual relations for knowledge-graph lookup (currently limited use) IMPORTANT: For general world knowledge about well-known companies, people, or facts NOT related to uploaded documents, choose "direct_answer". User query: "{query}" Respond with JSON: {{"category": ""}} """ resp = self.llm_client.generate_completion( model=self.ollama_config["generation_model"], prompt=prompt, format="json" ) try: data = json.loads(resp.get("response", "{}")) decision = data.get("category", "rag_query") print(f"🤖 ROUTING DEBUG: LLM fallback triage decided: '{decision}'") return decision except json.JSONDecodeError: print(f"❌ ROUTING DEBUG: LLM fallback triage JSON parsing failed, defaulting to 'rag_query'") return "rag_query" def _run_graph_query(self, query: str, history: list) -> Dict[str, Any]: contextual_query = self._format_query_with_history(query, history) structured_query = self.graph_query_translator.translate(contextual_query) if not structured_query.get("start_node"): return self.retrieval_pipeline.run(contextual_query, window_size_override=0) results = self.graph_retriever.retrieve(structured_query) if not results: return self.retrieval_pipeline.run(contextual_query, window_size_override=0) answer = ", ".join([res['details']['node_id'] for res in results]) return {"answer": f"From the knowledge graph: {answer}", "source_documents": results} def _get_cache_key(self, query: str, query_type: str) -> str: """Generate a cache key for the query""" # Simple cache key based on query and type return f"{query_type}:{query.strip().lower()}" def _cache_result(self, cache_key: str, result: Dict[str, Any], session_id: Optional[str] = None): """Cache a result with size limit""" if len(self._query_cache) >= self._cache_max_size: # Remove 
oldest entry (simple FIFO eviction) oldest_key = next(iter(self._query_cache)) del self._query_cache[oldest_key] self._query_cache[cache_key] = { 'result': result, 'timestamp': time.time(), 'session_id': session_id } # ---------------- Public sync API (kept for backwards compatibility) -------------- def run(self, query: str, table_name: str = None, session_id: str = None, compose_sub_answers: Optional[bool] = None, query_decompose: Optional[bool] = None, ai_rerank: Optional[bool] = None, context_expand: Optional[bool] = None, verify: Optional[bool] = None, retrieval_k: Optional[int] = None, context_window_size: Optional[int] = None, reranker_top_k: Optional[int] = None, search_type: Optional[str] = None, dense_weight: Optional[float] = None, max_retries: int = 1, event_callback: Optional[callable] = None) -> Dict[str, Any]: """Synchronous helper. If *event_callback* is supplied, important milestones will be forwarded to that callable as event_callback(phase:str, payload:Any) """ return asyncio.run(self._run_async(query, table_name, session_id, compose_sub_answers, query_decompose, ai_rerank, context_expand, verify, retrieval_k, context_window_size, reranker_top_k, search_type, dense_weight, max_retries, event_callback)) # ---------------- Main async implementation -------------------------------------- async def _run_async(self, query: str, table_name: str = None, session_id: str = None, compose_sub_answers: Optional[bool] = None, query_decompose: Optional[bool] = None, ai_rerank: Optional[bool] = None, context_expand: Optional[bool] = None, verify: Optional[bool] = None, retrieval_k: Optional[int] = None, context_window_size: Optional[int] = None, reranker_top_k: Optional[int] = None, search_type: Optional[str] = None, dense_weight: Optional[float] = None, max_retries: int = 1, event_callback: Optional[callable] = None) -> Dict[str, Any]: start_time = time.time() # Emit analyze event at the start if event_callback: event_callback("analyze", {"query": query}) # 🚀 
NEW: Get conversation history history = self.chat_histories.get(session_id, []) if session_id else [] # 🔄 Refresh overviews for this session if available # if session_id and session_id != getattr(self, "_current_overview_session", None): # candidate_path = os.path.join("index_store", "overviews", f"{session_id}.jsonl") # if os.path.exists(candidate_path): # self._load_overviews(candidate_path) # self._current_overview_session = session_id # else: # # Fall back to global overviews if per-session file not found # if self._current_overview_session != "GLOBAL": # self._load_overviews(self._global_overview_path) # self._current_overview_session = "GLOBAL" query_type = await self._triage_query_async(query, history) print(f"🎯 ROUTING DEBUG: Final triage decision: '{query_type}'") print(f"Agent Triage Decision: '{query_type}'") # Create a contextual query that includes history for most operations contextual_query = self._format_query_with_history(query, history) raw_query = query.strip() # --- Apply runtime AI reranker override (must happen before any retrieval calls) --- if ai_rerank is not None: rr_cfg = self.retrieval_pipeline.config.setdefault("reranker", {}) rr_cfg["enabled"] = bool(ai_rerank) if ai_rerank: # Ensure the pipeline knows to use the external ColBERT reranker rr_cfg.setdefault("type", "ai") rr_cfg.setdefault("strategy", "rerankers-lib") rr_cfg.setdefault( "model_name", # Falls back to ColBERT-small if the caller did not supply one self.ollama_config.get("rerank_model", "answerai-colbert-small-v1"), ) # --- Apply runtime retrieval configuration overrides --- if retrieval_k is not None: self.retrieval_pipeline.config["retrieval_k"] = retrieval_k print(f"🔍 Retrieval K set to: {retrieval_k}") if context_window_size is not None: self.retrieval_pipeline.config["context_window_size"] = context_window_size print(f"🔍 Context window size set to: {context_window_size}") if reranker_top_k is not None: rr_cfg = self.retrieval_pipeline.config.setdefault("reranker", {}) 
rr_cfg["top_k"] = reranker_top_k print(f"🔍 Reranker top K set to: {reranker_top_k}") if search_type is not None: retrieval_cfg = self.retrieval_pipeline.config.setdefault("retrieval", {}) retrieval_cfg["search_type"] = search_type print(f"🔍 Search type set to: {search_type}") if dense_weight is not None: dense_cfg = self.retrieval_pipeline.config.setdefault("retrieval", {}).setdefault("dense", {}) dense_cfg["weight"] = dense_weight print(f"🔍 Dense search weight set to: {dense_weight}") query_embedding = None # 🚀 OPTIMIZED: Semantic Cache Check if query_type != "direct_answer": text_embedder = self.retrieval_pipeline._get_text_embedder() if text_embedder: # The embedder expects a list, so we wrap the *raw* query only. query_embedding_list = text_embedder.create_embeddings([raw_query]) if isinstance(query_embedding_list, np.ndarray): query_embedding = query_embedding_list[0] else: # Some embedders return a list – convert if necessary query_embedding = np.array(query_embedding_list[0]) cached_result = self._find_in_semantic_cache(query_embedding, session_id) if cached_result: # Update history even on cache hit if session_id: history.append({"query": query, "answer": cached_result.get('answer', 'Cached answer not found.')}) self.chat_histories[session_id] = history return cached_result if query_type == "direct_answer": print(f"✅ ROUTING DEBUG: Executing DIRECT_ANSWER path") if event_callback: event_callback("direct_answer", {}) prompt = ( "You are a helpful assistant. Read the conversation history below. " "If the answer to the user's latest question is already present in the history, quote it concisely. " "Otherwise answer from your general world knowledge. 
Provide a short, factual reply (1‒2 sentences).\n\n" f"Conversation + Latest Question:\n{contextual_query}\n\nAssistant:" ) async def _run_stream(): answer_parts: list[str] = [] def _blocking_stream(): for tok in self.llm_client.stream_completion( model=self.ollama_config["generation_model"], prompt=prompt ): answer_parts.append(tok) if event_callback: event_callback("token", {"text": tok}) # Run the blocking generator in a thread so the event loop stays responsive await asyncio.to_thread(_blocking_stream) return "".join(answer_parts) final_answer = await _run_stream() result = {"answer": final_answer, "source_documents": []} elif query_type == "graph_query" and hasattr(self, 'graph_retriever'): print(f"✅ ROUTING DEBUG: Executing GRAPH_QUERY path") result = self._run_graph_query(query, history) # --- RAG Query Processing with Optional Query Decomposition --- else: # Default to rag_query print(f"✅ ROUTING DEBUG: Executing RAG_QUERY path (query_type='{query_type}')") query_decomp_config = self.pipeline_configs.get("query_decomposition", {}) decomp_enabled = query_decomp_config.get("enabled", False) if query_decompose is not None: decomp_enabled = query_decompose if decomp_enabled: print(f"\n--- Query Decomposition Enabled ---") # Use the raw user query (without conversation history) for decomposition to avoid leakage of prior context # Pass the last 5 conversation turns for context resolution within the decomposer recent_history = history[-5:] if history else [] sub_queries = self.query_decomposer.decompose(raw_query, recent_history) if event_callback: event_callback("decomposition", {"sub_queries": sub_queries}) print(f"Original query: '{query}' (Contextual: '{contextual_query}')") print(f"Decomposed into {len(sub_queries)} sub-queries: {sub_queries}") # Emit retrieval_started event before any retrievals if event_callback: event_callback("retrieval_started", {"count": len(sub_queries)}) # If decomposition produced only a single sub-query, skip the # 
parallel/composition machinery for efficiency. if len(sub_queries) == 1: print("--- Only one sub-query after decomposition; using direct retrieval path ---") result = self.retrieval_pipeline.run( sub_queries[0], table_name, 0 if context_expand is False else None, event_callback=event_callback ) if event_callback: event_callback("single_query_result", result) # Emit retrieval_done and rerank_done for single sub-query if event_callback: event_callback("retrieval_done", {"count": 1}) event_callback("rerank_started", {"count": 1}) event_callback("rerank_done", {"count": 1}) else: compose_from_sub_answers = query_decomp_config.get("compose_from_sub_answers", True) if compose_sub_answers is not None: compose_from_sub_answers = compose_sub_answers print(f"\n--- Processing {len(sub_queries)} sub-queries in parallel ---") start_time_inner = time.time() # Shared containers sub_answers = [] # For two-stage composition all_source_docs = [] # For single-stage aggregation citations_seen = set() # Emit rerank_started event before parallel retrievals (since each sub-query will rerank) if event_callback: event_callback("rerank_started", {"count": len(sub_queries)}) # Emit token chunks as soon as we receive them. The UI # keeps answers separated by `index`, so interleaving is # harmless and gives continuous feedback. 
def make_cb(idx: int): def _cb(ev_type: str, payload): if event_callback is None: return if ev_type == "token": event_callback("sub_query_token", {"index": idx, "text": payload.get("text", ""), "question": sub_queries[idx]}) else: event_callback(ev_type, payload) return _cb with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sub_queries))) as executor: future_to_query = { executor.submit( self.retrieval_pipeline.run, sub_query, table_name, 0 if context_expand is False else None, make_cb(i), ): (i, sub_query) for i, sub_query in enumerate(sub_queries) } for future in concurrent.futures.as_completed(future_to_query): i, sub_query = future_to_query[future] try: sub_result = future.result() print(f"✅ Sub-Query {i+1} completed: '{sub_query}'") if event_callback: event_callback("sub_query_result", { "index": i, "query": sub_query, "answer": sub_result.get("answer", ""), "source_documents": sub_result.get("source_documents", []), }) if compose_from_sub_answers: sub_answers.append({ "question": sub_query, "answer": sub_result.get("answer", "") }) # Keep up to 5 citations per sub-query for traceability for doc in sub_result.get("source_documents", [])[:5]: if doc['chunk_id'] not in citations_seen: all_source_docs.append(doc) citations_seen.add(doc['chunk_id']) else: # Aggregate unique docs (single-stage path) for doc in sub_result.get('source_documents', []): if doc['chunk_id'] not in citations_seen: all_source_docs.append(doc) citations_seen.add(doc['chunk_id']) except Exception as e: print(f"❌ Sub-Query {i+1} failed: '{sub_query}' - {e}") parallel_time = time.time() - start_time_inner print(f"🚀 Parallel processing completed in {parallel_time:.2f}s") # Emit retrieval_done and rerank_done after all sub-queries are processed if event_callback: event_callback("retrieval_done", {"count": len(sub_queries)}) event_callback("rerank_done", {"count": len(sub_queries)}) if compose_from_sub_answers: print("\n--- Composing final answer from sub-answers ---") 
compose_prompt = f""" You are an expert answer composer for a Retrieval-Augmented Generation (RAG) system. Context: • The ORIGINAL QUESTION from the user is shown below. • That question was automatically decomposed into simpler SUB-QUESTIONS. • Each sub-question has already been answered by an earlier step and the resulting Question→Answer pairs are provided to you in JSON. Your task: 1. Read every sub-answer carefully. 2. Write a single, final answer to the ORIGINAL QUESTION **using only the information contained in the sub-answers**. Do NOT invent facts that are not present. 3. If the original question includes a comparison (e.g., "Which, A or B, …") clearly state the outcome (e.g., "A > B"). Quote concrete numbers when available. 4. If any aspect of the original question cannot be answered with the given sub-answers, explicitly say so (e.g., "The provided context does not mention …"). 5. Keep the answer concise (≤ 5 sentences) and use a factual, third-person tone. Input ------ ORIGINAL QUESTION: "{contextual_query}" SUB-ANSWERS (JSON): {json.dumps(sub_answers, indent=2)} ------ FINAL ANSWER: """ # --- Stream composition answer token-by-token --- answer_parts: list[str] = [] for tok in self.llm_client.stream_completion( model=self.ollama_config["generation_model"], prompt=compose_prompt, ): answer_parts.append(tok) if event_callback: event_callback("token", {"text": tok}) final_answer = "".join(answer_parts) or "Unable to generate an answer." 
result = { "answer": final_answer, "source_documents": all_source_docs } if event_callback: event_callback("final_answer", result) else: print(f"\n--- Aggregated {len(all_source_docs)} unique documents from all sub-queries ---") if all_source_docs: aggregated_context = "\n\n".join([doc['text'] for doc in all_source_docs]) final_answer = self.retrieval_pipeline._synthesize_final_answer(contextual_query, aggregated_context) result = { "answer": final_answer, "source_documents": all_source_docs } if event_callback: event_callback("final_answer", result) else: result = { "answer": "I could not find relevant information to answer your question.", "source_documents": [] } if event_callback: event_callback("final_answer", result) else: # Standard retrieval (single-query) retrieved_docs = (self.retrieval_pipeline.retriever.retrieve( text_query=contextual_query, table_name=table_name or self.retrieval_pipeline.storage_config["text_table_name"], k=self.retrieval_pipeline.config.get("retrieval_k", 10), ) if hasattr(self.retrieval_pipeline, "retriever") and self.retrieval_pipeline.retriever else []) print("\n=== DEBUG: Original retrieval order ===") for i, d in enumerate(retrieved_docs[:10]): snippet = (d.get('text','') or '')[:200].replace('\n',' ') print(f"Orig[{i}] id={d.get('chunk_id')} dist={d.get('_distance','') or d.get('score','')} {snippet}") result = self.retrieval_pipeline.run(contextual_query, table_name, 0 if context_expand is False else None, event_callback=event_callback) # After run, result['source_documents'] is reranked list reranked_docs = result.get('source_documents', []) print("\n=== DEBUG: Reranked docs order ===") for i, d in enumerate(reranked_docs[:10]): snippet = (d.get('text','') or '')[:200].replace('\n',' ') print(f"ReRank[{i}] id={d.get('chunk_id')} score={d.get('rerank_score','')} {snippet}") # Verification step (simplified for now) - Skip in fast mode verification_enabled = self.pipeline_configs.get("verification", {}).get("enabled", True) if 
verify is not None: verification_enabled = verify if verification_enabled and result.get("source_documents"): context_str = "\n".join([doc['text'] for doc in result['source_documents']]) verification = await self.verifier.verify_async(contextual_query, context_str, result['answer']) score = verification.confidence_score # Only include confidence details if we received a non-zero score (0 usually means JSON parse failure) if score > 0: result['answer'] += f" [Confidence: {score}%]" # Add warning only when the verifier explicitly reported low confidence / not grounded if (not verification.is_grounded) or score < 50: result['answer'] += f" [Warning: Low confidence. Groundedness: {verification.is_grounded}]" else: # Skip appending any verifier note – 0 likely indicates a parser error print("⚠️ Verifier returned 0 confidence – likely JSON parse error; omitting tags.") else: print("🚀 Skipping verification for speed or lack of sources") # 🚀 NEW: Update history if session_id: history.append({"query": query, "answer": result['answer']}) self.chat_histories[session_id] = history # 🚀 OPTIMIZED: Cache the result for future queries if query_type != "direct_answer" and query_embedding is not None: cache_key = raw_query # Key is for logging/debugging self._query_cache[cache_key] = { "embedding": query_embedding, "result": result, "session_id": session_id, } total_time = time.time() - start_time print(f"🚀 Total query processing time: {total_time:.2f}s") return result # ------------------------------------------------------------------ def _route_via_overviews(self, query: str) -> str | None: """Use document overviews and a small model to decide routing. 
class VerificationResult:
    """Outcome of one groundedness check performed by ``Verifier``."""

    def __init__(self, is_grounded: bool, reasoning: str, verdict: str, confidence_score: int):
        self.is_grounded = is_grounded
        self.reasoning = reasoning
        self.verdict = verdict
        self.confidence_score = confidence_score


class Verifier:
    """
    Verifies if a generated answer is grounded in the provided context using Ollama.
    """

    def __init__(self, llm_client: "OllamaClient", llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model
        print(f"Initialized Verifier with Ollama model '{self.llm_model}'.")

    # Synchronous verify() method removed – async version is used everywhere.

    @staticmethod
    def _coerce_score(raw) -> int:
        """Turn whatever the model put in ``confidence_score`` into an int in 0..100.

        Models frequently return the number as a string, a float or ``null``;
        anything that cannot be read as a number becomes 0, which callers treat
        as "no usable score".
        """
        try:
            score = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return 0
        return max(0, min(100, score))

    @staticmethod
    def _coerce_bool(raw) -> bool:
        """Interpret ``is_grounded`` strictly so that the string "false" is not truthy."""
        if isinstance(raw, bool):
            return raw
        if isinstance(raw, str):
            return raw.strip().lower() in ("true", "yes", "1")
        return bool(raw)

    # --- Async wrapper ------------------------------------------------
    async def verify_async(self, query: str, context: str, answer: str) -> VerificationResult:
        """Ask the LLM whether ``answer`` is supported by ``context``.

        Args:
            query: The user question the answer responds to.
            context: Retrieved text; only the first 4000 characters are sent.
            answer: The generated answer to check.

        Returns:
            A ``VerificationResult``. When the model reply is missing, is not
            valid JSON, or is not a JSON object, the result is
            ``(False, "Failed async parse", "NOT_SUPPORTED", 0)``.
            ``confidence_score`` is always an ``int`` in 0..100 and
            ``is_grounded`` is always a ``bool``.
        """
        # NOTE(review): query, context and answer are concatenated without any
        # labels or delimiters in this prompt – confirm the model can tell them apart.
        prompt = f"""
You are an automated fact-checker. Determine whether the ANSWER is fully supported by the CONTEXT and output a single line of JSON.

# EXAMPLES

What color is the sky?
During the day, the sky appears blue due to Rayleigh scattering.
The sky is blue during the day.
{{"verdict": "SUPPORTED", "is_grounded": true, "reasoning": "The context explicitly supports that the sky is blue during the day.", "confidence_score": 100}}

Where are apples and oranges grown?
Apples are grown in orchards.
Apples are grown in orchards and oranges are grown in groves.
{{"verdict": "NOT_SUPPORTED", "is_grounded": false, "reasoning": "The context mentions orchards, but not oranges or groves.", "confidence_score": 80}}

How long is the process?
The first step takes 3 days. The second step takes 5 days.
The process takes 3 days.
{{"verdict": "NEEDS_CLARIFICATION", "is_grounded": false, "reasoning": "The answer omits the 5 days required for the second step.", "confidence_score": 70}}

# TASK

"{query}"
"""
        prompt += context[:4000]  # Clamp to avoid huge prompts
        prompt += "\n"
        prompt += answer
        prompt += "\n"

        resp = await self.llm_client.generate_completion_async(self.llm_model, prompt, format="json")
        try:
            data = json.loads(resp.get("response", "{}"))
        except (json.JSONDecodeError, AttributeError, TypeError):
            # Invalid JSON, a missing/None reply, or a non-string "response" field.
            return VerificationResult(False, "Failed async parse", "NOT_SUPPORTED", 0)

        if not isinstance(data, dict):
            # Valid JSON but not an object (e.g. a list or bare number).
            return VerificationResult(False, "Failed async parse", "NOT_SUPPORTED", 0)

        return VerificationResult(
            is_grounded=self._coerce_bool(data.get("is_grounded", False)),
            reasoning=data.get("reasoning", "async parse error"),
            verdict=data.get("verdict", "NOT_SUPPORTED"),
            confidence_score=self._coerce_score(data.get('confidence_score', 0)),
        )
print("🧠 Initializing RAG Agent with MAXIMUM ACCURACY... (This may take a moment)") if RAG_AGENT is None: print("❌ Critical error: RAG Agent could not be initialized. Exiting.") exit(1) print("✅ RAG Agent initialized successfully with MAXIMUM ACCURACY.") # --- # Add helper near top after db & agent init # -------------- Helper ---------------- def _apply_index_embedding_model(idx_ids): """Ensure retrieval pipeline uses the embedding model stored with the first index.""" debug_info = f"🔧 _apply_index_embedding_model called with idx_ids: {idx_ids}\n" if not idx_ids: debug_info += "⚠️ No index IDs provided\n" with open("logs/embedding_debug.log", "a") as f: f.write(debug_info) return try: idx = db.get_index(idx_ids[0]) debug_info += f"🔧 Retrieved index: {idx.get('id')} with metadata: {idx.get('metadata', {})}\n" model = (idx.get("metadata") or {}).get("embedding_model") debug_info += f"🔧 Embedding model from metadata: {model}\n" if model: rp = RAG_AGENT.retrieval_pipeline current_model = rp.config.get("embedding_model_name") debug_info += f"🔧 Current embedding model: {current_model}\n" rp.update_embedding_model(model) debug_info += f"🔧 Updated embedding model to: {model}\n" else: debug_info += "⚠️ No embedding model found in metadata\n" except Exception as e: debug_info += f"⚠️ Could not apply index embedding model: {e}\n" # Write debug info to file with open("logs/embedding_debug.log", "a") as f: f.write(debug_info) def _get_table_name_for_session(session_id): """Get the correct vector table name for a session by looking up its linked indexes.""" logger = logging.getLogger(__name__) if not session_id: logger.info("❌ No session_id provided") return None try: # Get indexes linked to this session idx_ids = db.get_indexes_for_session(session_id) logger.info(f"🔍 Session {session_id[:8]}... 
def do_OPTIONS(self):
    """Answer a CORS preflight request so the browser frontend may call this API."""
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Methods', 'POST, OPTIONS'),
        ('Access-Control-Allow-Headers', 'Content-Type'),
    )
    self.send_response(200)
    for header_name, header_value in cors_headers:
        self.send_header(header_name, header_value)
    self.end_headers()
def handle_chat(self):
    """Handles a chat query by calling the agentic RAG pipeline.

    Reads a JSON object from the request body. ``query`` is required; the other
    keys are optional per-request overrides (retrieval sizes, search type,
    Provence pruning, generation model, ``force_rag``, ``table_name``).

    Flow:
      1. Parse and validate the body (400 on a missing/invalid/non-object body
         or an empty query).
      2. If a ``session_id`` is given, best-effort: title the session on its
         first message and store the user message.
      3. Resolve the vector table, then either run the retrieval pipeline
         directly (``force_rag``) or the full agent.
      4. Send the result as JSON, then best-effort store the assistant answer.

    Overrides are written into the module-level ``RAG_AGENT`` configuration, so
    they remain in effect for later requests until overwritten.

    Any unexpected exception is reported as a 500 JSON error.
    """
    try:
        # A missing Content-Length is treated as an empty body, which then
        # fails JSON parsing and is reported as 400 rather than 500.
        content_length = int(self.headers.get('Content-Length') or 0)
        post_data = self.rfile.read(content_length)
        data = json.loads(post_data.decode('utf-8'))
        if not isinstance(data, dict):
            self.send_json_response({"error": "Request body must be a JSON object"}, status_code=400)
            return

        query = data.get('query')
        session_id = data.get('session_id')
        compose_flag = data.get('compose_sub_answers')
        decomp_flag = data.get('query_decompose')
        ai_rerank_flag = data.get('ai_rerank')
        ctx_expand_flag = data.get('context_expand')
        verify_flag = data.get('verify')

        # ✨ NEW RETRIEVAL PARAMETERS
        retrieval_k = data.get('retrieval_k', 20)
        context_window_size = data.get('context_window_size', 1)
        reranker_top_k = data.get('reranker_top_k', 10)
        search_type = data.get('search_type', 'hybrid')
        dense_weight = data.get('dense_weight', 0.7)

        # 🚩 NEW: Force RAG override from frontend
        force_rag = bool(data.get('force_rag', False))

        # 🌿 Provence sentence pruning
        provence_prune = data.get('provence_prune')
        provence_threshold = data.get('provence_threshold')

        # User-selected generation model
        requested_model = data.get('model')
        if isinstance(requested_model, str) and requested_model:
            RAG_AGENT.ollama_config['generation_model'] = requested_model

        if not query:
            self.send_json_response({"error": "Query is required"}, status_code=400)
            return

        # 🔄 UPDATE SESSION TITLE: If this is the first message in the session, update the title
        if session_id:
            try:
                # Check if this is the first message by calling the backend server.
                # The timeout keeps an unresponsive backend from blocking this
                # handler forever; a timeout is handled like any other failure below.
                backend_url = f"http://localhost:8000/sessions/{session_id}"
                session_resp = requests.get(backend_url, timeout=5)
                # NOTE(review): the user message is only stored when the backend
                # answers 200, while the assistant answer is stored regardless –
                # confirm this asymmetry is intended.
                if session_resp.status_code == 200:
                    session = session_resp.json().get('session', {})
                    # If message_count is 0, this is the first message
                    if session.get('message_count', 0) == 0:
                        # Direct database call: a temporary solution until the
                        # backend exposes a proper title-update endpoint.
                        title = generate_session_title(query)
                        db.update_session_title(session_id, title)
                        print(f"📝 Updated session title to: {title}")
                    # 💾 STORE USER MESSAGE: first message or not, persist it
                    user_message_id = db.add_message(session_id, query, "user")
                    print(f"💾 Stored user message: {user_message_id}")
            except Exception as e:
                # Continue with the request even if title update fails
                print(f"⚠️ Failed to update session title or store user message: {e}")

        # Allow explicit table_name override
        table_name = data.get('table_name')
        if not table_name and session_id:
            table_name = _get_table_name_for_session(session_id)

        rp_cfg = RAG_AGENT.retrieval_pipeline.config

        # Decide execution path
        print(f"🔧 Force RAG flag: {force_rag}")
        if force_rag:
            # --- Apply runtime overrides manually because we skip Agent.run()
            if retrieval_k is not None:
                rp_cfg["retrieval_k"] = retrieval_k
            if reranker_top_k is not None:
                rp_cfg.setdefault("reranker", {})["top_k"] = reranker_top_k
            if search_type is not None:
                rp_cfg.setdefault("retrieval", {})["search_type"] = search_type
            if dense_weight is not None:
                rp_cfg.setdefault("retrieval", {}).setdefault("dense", {})["weight"] = dense_weight

            # Provence overrides
            if provence_prune is not None:
                rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune)
            if provence_threshold is not None:
                rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold)

            # 🔄 Apply embedding model for this session (same as in agent path)
            if session_id:
                idx_ids = db.get_indexes_for_session(session_id)
                _apply_index_embedding_model(idx_ids)

            # Directly invoke retrieval pipeline to bypass triage
            result = RAG_AGENT.retrieval_pipeline.run(
                query,
                table_name=table_name,
                window_size_override=context_window_size,
            )
        else:
            # Use full agent with smart routing
            # Apply Provence overrides even in agent path
            if provence_prune is not None:
                rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune)
            if provence_threshold is not None:
                rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold)

            if session_id:
                # 🔄 Refresh document overviews for this session
                idx_ids = db.get_indexes_for_session(session_id)
                _apply_index_embedding_model(idx_ids)
                RAG_AGENT.load_overviews_for_indexes(idx_ids)
                # 🔧 Set index-specific overview path
                rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl"

            # 🔧 Configure late chunking
            rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True

            result = RAG_AGENT.run(
                query,
                table_name=table_name,
                session_id=session_id,
                compose_sub_answers=compose_flag,
                query_decompose=decomp_flag,
                ai_rerank=ai_rerank_flag,
                context_expand=ctx_expand_flag,
                verify=verify_flag,
                retrieval_k=retrieval_k,
                context_window_size=context_window_size,
                reranker_top_k=reranker_top_k,
                search_type=search_type,
                dense_weight=dense_weight,
            )

        # The result is a dict, so we need to dump it to a JSON string
        self.send_json_response(result)

        # 💾 STORE AI RESPONSE: Add the AI response to the database
        if session_id and result and result.get("answer"):
            try:
                ai_message_id = db.add_message(session_id, result["answer"], "assistant")
                print(f"💾 Stored AI response: {ai_message_id}")
            except Exception as e:
                # Continue even if storage fails
                print(f"⚠️ Failed to store AI response: {e}")

    except json.JSONDecodeError:
        self.send_json_response({"error": "Invalid JSON"}, status_code=400)
    except Exception as e:
        self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500)
generate_session_title(query) # Update the session title via backend API # We'll need to add this endpoint to the backend, for now let's make a direct database call # This is a temporary solution until we add a proper API endpoint db.update_session_title(session_id, title) print(f"📝 Updated session title to: {title}") # 💾 STORE USER MESSAGE: Add the user message to the database user_message_id = db.add_message(session_id, query, "user") print(f"💾 Stored user message: {user_message_id}") else: # Not the first message, but still store the user message user_message_id = db.add_message(session_id, query, "user") print(f"💾 Stored user message: {user_message_id}") except Exception as e: print(f"⚠️ Failed to update session title or store user message: {e}") # Continue with the request even if title update fails # Allow explicit table_name override table_name = data.get('table_name') if not table_name and session_id: table_name = _get_table_name_for_session(session_id) # Prepare response headers for SSE self.send_response(200) self.send_header('Content-Type', 'text/event-stream') self.send_header('Cache-Control', 'no-cache') # Keep connection alive for SSE; no manual chunked encoding (Python http.server # does not add chunk sizes automatically, so declaring it breaks clients). 
self.send_header('Connection', 'keep-alive') self.send_header('Access-Control-Allow-Origin', '*') self.end_headers() def emit(event_type: str, payload): """Send a single SSE event.""" try: data_str = json.dumps({"type": event_type, "data": payload}) self.wfile.write(f"data: {data_str}\n\n".encode('utf-8')) self.wfile.flush() except BrokenPipeError: # Client disconnected raise # Run the agent synchronously, emitting checkpoints try: if force_rag: # Apply overrides same as above since we bypass Agent.run rp_cfg = RAG_AGENT.retrieval_pipeline.config if retrieval_k is not None: rp_cfg["retrieval_k"] = retrieval_k if reranker_top_k is not None: rp_cfg.setdefault("reranker", {})["top_k"] = reranker_top_k if search_type is not None: rp_cfg.setdefault("retrieval", {})["search_type"] = search_type if dense_weight is not None: rp_cfg.setdefault("retrieval", {}).setdefault("dense", {})["weight"] = dense_weight # Provence overrides if provence_prune is not None: rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune) if provence_threshold is not None: rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold) # 🔄 Apply embedding model for this session (same as in agent path) if session_id: idx_ids = db.get_indexes_for_session(session_id) _apply_index_embedding_model(idx_ids) # 🔧 Set index-specific overview path so each index writes separate file if session_id: rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl" # 🔧 Configure late chunking rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True # Straight retrieval pipeline with streaming events final_result = RAG_AGENT.retrieval_pipeline.run( query, table_name=table_name, window_size_override=context_window_size, event_callback=emit, ) else: # Provence overrides rp_cfg = RAG_AGENT.retrieval_pipeline.config if provence_prune is not None: rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune) if provence_threshold is not None: 
rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold) # 🔄 Refresh overviews for this session if session_id: idx_ids = db.get_indexes_for_session(session_id) _apply_index_embedding_model(idx_ids) RAG_AGENT.load_overviews_for_indexes(idx_ids) # 🔧 Set index-specific overview path if session_id: rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl" # 🔧 Configure late chunking rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True final_result = RAG_AGENT.run( query, table_name=table_name, session_id=session_id, compose_sub_answers=compose_flag, query_decompose=decomp_flag, ai_rerank=ai_rerank_flag, context_expand=ctx_expand_flag, verify=verify_flag, # ✨ NEW RETRIEVAL PARAMETERS retrieval_k=retrieval_k, context_window_size=context_window_size, reranker_top_k=reranker_top_k, search_type=search_type, dense_weight=dense_weight, event_callback=emit, ) # Ensure the final answer is sent (in case callback missed it) emit("complete", final_result) # 💾 STORE AI RESPONSE: Add the AI response to the database if session_id and final_result and final_result.get("answer"): try: ai_message_id = db.add_message(session_id, final_result["answer"], "assistant") print(f"💾 Stored AI response: {ai_message_id}") except Exception as e: print(f"⚠️ Failed to store AI response: {e}") # Continue even if storage fails except BrokenPipeError: print("🔌 Client disconnected from SSE stream.") except Exception as e: # Send error event then close error_payload = {"error": str(e)} try: emit("error", error_payload) finally: print(f"❌ Stream error: {e}") except json.JSONDecodeError: self.send_json_response({"error": "Invalid JSON"}, status_code=400) except Exception as e: self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500) def handle_index(self): """Triggers the document indexing pipeline for specific files.""" try: content_length = int(self.headers['Content-Length']) post_data = self.rfile.read(content_length) data = 
json.loads(post_data.decode('utf-8')) file_paths = data.get('file_paths') session_id = data.get('session_id') compose_flag = data.get('compose_sub_answers') decomp_flag = data.get('query_decompose') ai_rerank_flag = data.get('ai_rerank') ctx_expand_flag = data.get('context_expand') enable_latechunk = bool(data.get("enable_latechunk", False)) enable_docling_chunk = bool(data.get("enable_docling_chunk", False)) # 🆕 NEW CONFIGURATION OPTIONS: chunk_size = int(data.get("chunk_size", 512)) chunk_overlap = int(data.get("chunk_overlap", 64)) retrieval_mode = data.get("retrieval_mode", "hybrid") window_size = int(data.get("window_size", 2)) enable_enrich = bool(data.get("enable_enrich", True)) embedding_model = data.get('embeddingModel') enrich_model = data.get('enrichModel') overview_model = data.get('overviewModel') or data.get('overview_model_name') batch_size_embed = int(data.get("batch_size_embed", 50)) batch_size_enrich = int(data.get("batch_size_enrich", 25)) if not file_paths or not isinstance(file_paths, list): self.send_json_response({ "error": "A 'file_paths' list is required." }, status_code=400) return # Allow explicit table_name override table_name = data.get('table_name') if not table_name and session_id: table_name = _get_table_name_for_session(session_id) # The INDEXING_PIPELINE is already initialized. We just need to use it. # If a session-specific table is needed, we can override the config for this run. 
if table_name: import copy config_override = copy.deepcopy(INDEXING_PIPELINE.config) config_override["storage"]["text_table_name"] = table_name config_override.setdefault("retrievers", {}).setdefault("dense", {})["lancedb_table_name"] = table_name # 🔧 Configure late chunking if enable_latechunk: config_override["retrievers"].setdefault("latechunk", {})["enabled"] = True else: # ensure disabled if not requested config_override["retrievers"].setdefault("latechunk", {})["enabled"] = False # 🔧 Configure docling chunking if enable_docling_chunk: config_override["chunker_mode"] = "docling" # 🔧 Configure contextual enrichment (THIS WAS MISSING!) config_override.setdefault("contextual_enricher", {}) config_override["contextual_enricher"]["enabled"] = enable_enrich config_override["contextual_enricher"]["window_size"] = window_size # 🔧 Configure indexing batch sizes config_override.setdefault("indexing", {}) config_override["indexing"]["embedding_batch_size"] = batch_size_embed config_override["indexing"]["enrichment_batch_size"] = batch_size_enrich # 🔧 Configure chunking parameters config_override.setdefault("chunking", {}) config_override["chunking"]["chunk_size"] = chunk_size config_override["chunking"]["chunk_overlap"] = chunk_overlap # 🔧 Configure embedding model if specified if embedding_model: config_override["embedding_model_name"] = embedding_model # 🔧 Configure enrichment model if specified if enrich_model: config_override["enrich_model"] = enrich_model # 🔧 Overview model (can differ from enrichment) if overview_model: config_override["overview_model_name"] = overview_model print(f"🔧 INDEXING CONFIG: Contextual Enrichment: {enable_enrich}, Window Size: {window_size}") print(f"🔧 CHUNKING CONFIG: Size: {chunk_size}, Overlap: {chunk_overlap}") print(f"🔧 MODEL CONFIG: Embedding: {embedding_model or 'default'}, Enrichment: {enrich_model or 'default'}") # 🔧 Set index-specific overview path so each index writes separate file if session_id: 
config_override["overview_path"] = f"index_store/overviews/{session_id}.jsonl" # 🔧 Configure late chunking config_override.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True # Create a temporary pipeline instance with the overridden config temp_pipeline = INDEXING_PIPELINE.__class__( config_override, INDEXING_PIPELINE.llm_client, INDEXING_PIPELINE.ollama_config ) temp_pipeline.run(file_paths) else: # Use the default pipeline with overrides import copy config_override = copy.deepcopy(INDEXING_PIPELINE.config) # 🔧 Configure late chunking if enable_latechunk: config_override.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True # 🔧 Configure docling chunking if enable_docling_chunk: config_override["chunker_mode"] = "docling" # 🔧 Configure contextual enrichment (THIS WAS MISSING!) config_override.setdefault("contextual_enricher", {}) config_override["contextual_enricher"]["enabled"] = enable_enrich config_override["contextual_enricher"]["window_size"] = window_size # 🔧 Configure indexing batch sizes config_override.setdefault("indexing", {}) config_override["indexing"]["embedding_batch_size"] = batch_size_embed config_override["indexing"]["enrichment_batch_size"] = batch_size_enrich # 🔧 Configure chunking parameters config_override.setdefault("chunking", {}) config_override["chunking"]["chunk_size"] = chunk_size config_override["chunking"]["chunk_overlap"] = chunk_overlap # 🔧 Configure embedding model if specified if embedding_model: config_override["embedding_model_name"] = embedding_model # 🔧 Configure enrichment model if specified if enrich_model: config_override["enrich_model"] = enrich_model # 🔧 Overview model (can differ from enrichment) if overview_model: config_override["overview_model_name"] = overview_model print(f"🔧 INDEXING CONFIG: Contextual Enrichment: {enable_enrich}, Window Size: {window_size}") print(f"🔧 CHUNKING CONFIG: Size: {chunk_size}, Overlap: {chunk_overlap}") print(f"🔧 MODEL CONFIG: Embedding: 
def handle_models(self):
    """Respond with the generation and embedding models this server can offer.

    The Ollama instance configured on ``RAG_AGENT`` is asked for its installed
    tags; each model name is put in the embedding bucket when it contains one
    of a few marker substrings, otherwise in the generation bucket. A fixed
    list of HuggingFace embedding models is always appended. Any failure while
    talking to Ollama is printed and ignored, so the HuggingFace entries are
    still returned. Any other failure produces a 500 JSON error response.
    """
    # Substrings that mark an Ollama model as an embedding model (very naive).
    embedding_markers = ['embed', 'bge', 'embedding', 'text']
    huggingface_embedding_models = [
        "Qwen/Qwen3-Embedding-0.6B",
        "Qwen/Qwen3-Embedding-4B",
        "Qwen/Qwen3-Embedding-8B"
    ]

    try:
        generation_models = []
        embedding_models = []

        # Ollama is optional: on any error we fall through with empty buckets.
        try:
            tags_url = f"{RAG_AGENT.ollama_config['host']}/api/tags"
            resp = requests.get(tags_url, timeout=5)
            resp.raise_for_status()
            payload = resp.json()
            installed = [entry.get('name') for entry in payload.get('models', [])]

            # Classify into local buckets first so that a failure part-way
            # through leaves the response lists untouched.
            ollama_embedding_models = []
            ollama_generation_models = []
            for model_name in installed:
                if any(marker in model_name for marker in embedding_markers):
                    ollama_embedding_models.append(model_name)
                else:
                    ollama_generation_models.append(model_name)

            generation_models.extend(ollama_generation_models)
            embedding_models.extend(ollama_embedding_models)
        except Exception as e:
            print(f"⚠️ Could not get Ollama models: {e}")

        embedding_models.extend(huggingface_embedding_models)

        # Sorted so the client sees a stable ordering between calls.
        generation_models.sort()
        embedding_models.sort()

        self.send_json_response({
            "generation_models": generation_models,
            "embedding_models": embedding_models
        })
    except Exception as e:
        self.send_json_response({"error": f"Could not list models: {e}"}, status_code=500)
class RealtimeProgressTracker(ProgressTracker):
    """ProgressTracker that mirrors its state into ``ACTIVE_PROGRESS_SESSIONS``
    and pushes it to the session's Server-Sent Events connection.

    The snapshot stored under ``session_id`` is what the ``/progress`` endpoint
    returns; the same snapshot is sent as a ``progress`` SSE event, at most
    once per ``update_interval`` seconds (plus once on creation and once on
    ``finish``).
    """

    def __init__(self, total_items: int, operation_name: str, session_id: str):
        """Create the tracker, register the session snapshot and emit it once.

        Args:
            total_items: Number of items the operation is expected to process.
            operation_name: Human-readable label stored in the snapshot.
            session_id: Key used for the snapshot and for the SSE connection.
        """
        super().__init__(total_items, operation_name)
        self.session_id = session_id
        self.last_update = 0  # 0 guarantees the first update() call is sent
        self.update_interval = 1  # Minimum seconds between SSE progress events

        # Initialize session progress
        ACTIVE_PROGRESS_SESSIONS[session_id] = {
            "operation_name": operation_name,
            "total_items": total_items,
            "processed_items": 0,
            "errors_encountered": 0,
            "start_time": self.start_time,
            "status": "running",
            "current_step": "",
            "eta_seconds": 0,
            "throughput": 0,
            "progress_percentage": 0
        }

        # Send initial progress update
        self._send_progress_update()

    def update(self, items_processed: int, errors: int = 0, current_step: str = ""):
        """Record progress, refresh the session snapshot and maybe notify.

        Args:
            items_processed: Forwarded to ``ProgressTracker.update``.
            errors: Forwarded to ``ProgressTracker.update``.
            current_step: Free-form description stored in the snapshot.
        """
        super().update(items_processed, errors)
        now = time.time()

        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id)
        if session_data:
            # A tracker created for zero items must not crash on its first
            # update; report 0% until finish() marks it complete.
            if self.total_items:
                percentage = (self.processed_items / self.total_items) * 100
            else:
                percentage = 0.0

            session_data.update({
                "processed_items": self.processed_items,
                "errors_encountered": self.errors_encountered,
                "current_step": current_step,
                "progress_percentage": percentage,
            })

            # Calculate throughput and ETA
            elapsed = now - self.start_time
            if elapsed > 0:
                throughput = self.processed_items / elapsed
                remaining = self.total_items - self.processed_items
                session_data["throughput"] = throughput
                session_data["eta_seconds"] = remaining / throughput if throughput > 0 else 0

        # Throttle SSE traffic: only send when the interval has elapsed.
        if now - self.last_update >= self.update_interval:
            self._send_progress_update()
            self.last_update = now

    def finish(self):
        """Mark the session as completed and send a final progress event."""
        super().finish()

        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id)
        if session_data:
            session_data.update({
                "status": "completed",
                "progress_percentage": 100,
                "eta_seconds": 0
            })

        # The final event bypasses the throttle in update().
        self._send_progress_update(final=True)

    def _send_progress_update(self, final: bool = False):
        """Send a copy of the current snapshot as a ``progress`` SSE event.

        Args:
            final: Set on the event payload so clients can tell the last update.
        """
        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id, {})
        event_data = {
            "session_id": self.session_id,
            # Copy so later snapshot mutations cannot alter this payload.
            "progress": session_data.copy(),
            "final": final,
            "timestamp": time.time()
        }
        ServerSentEventsHandler.send_event(self.session_id, "progress", event_data)
class EnhancedRagApiHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler exposing chat, indexing and indexing-progress endpoints.

    Routes:
        POST /chat      - run a query through the shared ``RAG_AGENT``.
        POST /index     - start indexing in a background thread.
        GET  /progress  - current progress snapshot for a session.
        GET  /stream    - Server-Sent Events stream of progress for a session.
    """

    def do_OPTIONS(self):
        """Handle CORS preflight requests for frontend integration."""
        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'POST, GET, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', 'Content-Type')
        self.end_headers()

    def do_GET(self):
        """Dispatch GET requests for progress status and SSE streams."""
        route = urlparse(self.path).path
        if route == '/progress':
            self.handle_progress_status()
        elif route == '/stream':
            self.handle_progress_stream()
        else:
            self.send_json_response({"error": "Not Found"}, status_code=404)

    def do_POST(self):
        """Dispatch POST requests for chat and indexing."""
        route = urlparse(self.path).path
        if route == '/chat':
            self.handle_chat()
        elif route == '/index':
            self.handle_index_with_progress()
        else:
            self.send_json_response({"error": "Not Found"}, status_code=404)

    def _read_json_body(self):
        """Read and decode the request body as JSON.

        A missing or empty ``Content-Length`` is treated as an empty body, so
        the caller reports it as invalid JSON (400) instead of a server error.

        Raises:
            json.JSONDecodeError: If the body is empty or not valid JSON.
        """
        content_length = int(self.headers.get('Content-Length') or 0)
        raw_body = self.rfile.read(content_length) if content_length > 0 else b''
        return json.loads(raw_body.decode('utf-8'))

    def _query_param(self, name):
        """Return the first value of query parameter ``name``, or None."""
        query = parse_qs(urlparse(self.path).query)
        return query.get(name, [None])[0]

    def _progress_stream_url(self, session_id):
        """Build the SSE stream URL the client should connect to.

        The host and port the client used (``Host`` header) are preferred, so
        the URL stays correct whatever port the server was started on; the
        fallback is localhost plus the port this server is bound to.
        """
        host = self.headers.get('Host')
        if not host:
            host = f"localhost:{self.server.server_address[1]}"
        return f"http://{host}/stream?session_id={session_id}"

    def handle_chat(self):
        """Handles a chat query by calling the agentic RAG pipeline."""
        try:
            data = self._read_json_body()

            query = data.get('query')
            if not query:
                self.send_json_response({"error": "Query is required"}, status_code=400)
                return

            # Use the single, persistent agent instance to run the query
            result = RAG_AGENT.run(query)
            self.send_json_response(result)
        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500)

    def handle_index_with_progress(self):
        """Start the indexing pipeline in the background and reply immediately.

        Expects a JSON body with a non-empty ``file_paths`` list and a
        ``session_id``. Progress is reported through the module-level progress
        store and the SSE stream whose URL is returned in the response.
        """
        try:
            data = self._read_json_body()

            file_paths = data.get('file_paths')
            session_id = data.get('session_id')

            if not file_paths or not isinstance(file_paths, list):
                self.send_json_response({
                    "error": "A 'file_paths' list is required."
                }, status_code=400)
                return

            if not session_id:
                self.send_json_response({
                    "error": "A 'session_id' is required for progress tracking."
                }, status_code=400)
                return

            def run_indexing_thread():
                # Failures are already reported to the client over SSE by
                # run_indexing_with_progress; here they are only logged.
                try:
                    run_indexing_with_progress(file_paths, session_id)
                except Exception as e:
                    logger.error(f"Indexing thread failed: {e}")

            # Daemon thread: indexing must not keep the process alive on exit.
            worker = threading.Thread(target=run_indexing_thread, daemon=True)
            worker.start()

            self.send_json_response({
                "message": f"Indexing started for {len(file_paths)} file(s)",
                "session_id": session_id,
                "status": "started",
                "progress_stream_url": self._progress_stream_url(session_id)
            })
        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Failed to start indexing: {str(e)}"}, status_code=500)

    def handle_progress_status(self):
        """Return the current progress snapshot for ``?session_id=...``."""
        session_id = self._query_param('session_id')
        if not session_id:
            self.send_json_response({"error": "session_id is required"}, status_code=400)
            return

        progress_data = ACTIVE_PROGRESS_SESSIONS.get(session_id)
        if not progress_data:
            self.send_json_response({"error": "No active progress for this session"}, status_code=404)
            return

        self.send_json_response({
            "session_id": session_id,
            "progress": progress_data
        })

    def handle_progress_stream(self):
        """Serve a Server-Sent Events stream for ``?session_id=...``.

        The handler registers itself as the session's SSE connection, sends a
        ``connected`` event and then a ``heartbeat`` event every second until
        the connection is unregistered, replaced by a newer connection for the
        same session, or a write fails. This method blocks for the lifetime of
        the stream, so the server must handle requests concurrently.
        """
        session_id = self._query_param('session_id')
        if not session_id:
            self.send_response(400)
            self.end_headers()
            return

        # Set up SSE headers
        self.send_response(200)
        self.send_header('Content-Type', 'text/event-stream')
        self.send_header('Cache-Control', 'no-cache')
        self.send_header('Connection', 'keep-alive')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()

        ServerSentEventsHandler.add_connection(session_id, self)
        connections = ServerSentEventsHandler.active_connections

        try:
            # Inside the try so a failed first write still unregisters us.
            initial_message = json.dumps({
                "session_id": session_id,
                "message": "Progress stream connected",
                "timestamp": time.time()
            })
            self.wfile.write(f"event: connected\ndata: {initial_message}\n\n".encode('utf-8'))
            self.wfile.flush()

            # Keep the connection alive only while we are still the registered
            # connection for this session.
            while connections.get(session_id) is self:
                time.sleep(1)
                heartbeat = json.dumps({"type": "heartbeat", "timestamp": time.time()})
                self.wfile.write(f"event: heartbeat\ndata: {heartbeat}\n\n".encode('utf-8'))
                self.wfile.flush()
        except Exception as e:
            logger.info(f"SSE connection closed for session {session_id}: {e}")
        finally:
            # Do not tear down a newer connection that replaced this one
            # (for example after a client reconnect).
            if connections.get(session_id) is self:
                ServerSentEventsHandler.remove_connection(session_id)

    def send_json_response(self, data, status_code=200):
        """Utility to send a JSON response with CORS headers."""
        self.send_response(status_code)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        response = json.dumps(data, indent=2)
        self.wfile.write(response.encode('utf-8'))


def start_enhanced_server(port=8000):
    """Start the enhanced API server and serve until interrupted.

    Requests are handled on one thread each: the ``/stream`` handler blocks for
    as long as a client stays connected, which would otherwise stop the server
    from accepting ``/index``, ``/progress`` or ``/chat`` requests.

    Args:
        port: TCP port to listen on (all interfaces).
    """
    # NOTE(review): /chat requests can now run concurrently against the shared
    # RAG_AGENT - confirm the agent tolerates that, or serialize handle_chat.
    class ReusableTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
        allow_reuse_address = True  # avoid "address in use" on quick restarts
        daemon_threads = True  # open SSE streams must not block shutdown

    with ReusableTCPServer(("", port), EnhancedRagApiHandler) as httpd:
        print(f"🚀 Starting Enhanced RAG API server on port {port}")
        print(f"💬 Chat endpoint: http://localhost:{port}/chat")
        print(f"✨ Indexing endpoint: http://localhost:{port}/index")
        print(f"📊 Progress endpoint: http://localhost:{port}/progress")
        print(f"🌊 Progress stream: http://localhost:{port}/stream")
        print(f"📈 Real-time progress tracking enabled via Server-Sent Events!")
        httpd.serve_forever()
def get_agent(mode: str = "default"):
    """
    Build the RAG ``Agent`` for the requested pipeline ``mode``.

    The LLM client is chosen from ``LLM_BACKEND``: "watsonx" (case-insensitive)
    selects ``WatsonXClient`` and requires an API key and project id in
    ``WATSONX_CONFIG``; anything else selects ``OllamaClient``. An unknown
    ``mode`` falls back to the 'default' pipeline configuration, and a
    configuration without a 'storage' section gets a default one added in place.

    All project imports are local to the function to avoid circular imports.

    Raises:
        ValueError: If the Watson X backend is selected but not fully configured.
    """
    from rag_system.agent.loop import Agent
    from rag_system.utils.ollama_client import OllamaClient
    from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG

    load_dotenv()

    use_watsonx = LLM_BACKEND.lower() == "watsonx"
    if use_watsonx:
        from rag_system.utils.watsonx_client import WatsonXClient

        credentials_present = WATSONX_CONFIG["api_key"] and WATSONX_CONFIG["project_id"]
        if not credentials_present:
            raise ValueError(
                "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID "
                "environment variables."
            )

        llm_config = WATSONX_CONFIG
        llm_client = WatsonXClient(
            api_key=llm_config["api_key"],
            project_id=llm_config["project_id"],
            url=llm_config["url"]
        )
    else:
        llm_config = OLLAMA_CONFIG
        llm_client = OllamaClient(host=llm_config["host"])

    pipeline_config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default'])

    # Written into the shared config object itself, as before, so the default
    # storage section persists for later lookups of the same mode.
    pipeline_config.setdefault('storage', {
        'db_path': 'lancedb',
        'text_table_name': 'text_pages_default',
        'image_table_name': 'image_pages'
    })

    return Agent(
        pipeline_configs=pipeline_config,
        llm_client=llm_client,
        ollama_config=llm_config
    )
""" from rag_system.pipelines.indexing_pipeline import IndexingPipeline from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG from rag_system.utils.ollama_client import OllamaClient load_dotenv() # Initialize the appropriate LLM client based on backend configuration if LLM_BACKEND.lower() == "watsonx": from rag_system.utils.watsonx_client import WatsonXClient if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]: raise ValueError( "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID " "environment variables." ) llm_client = WatsonXClient( api_key=WATSONX_CONFIG["api_key"], project_id=WATSONX_CONFIG["project_id"], url=WATSONX_CONFIG["url"] ) llm_config = WATSONX_CONFIG else: llm_client = OllamaClient(host=OLLAMA_CONFIG["host"]) llm_config = OLLAMA_CONFIG config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default']) return IndexingPipeline(config, llm_client, llm_config) ================================================ FILE: rag_system/indexing/__init__.py ================================================ ================================================ FILE: rag_system/indexing/contextualizer.py ================================================ from typing import List, Dict, Any from rag_system.utils.ollama_client import OllamaClient from rag_system.ingestion.chunking import create_contextual_window import logging import re # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Define the structured prompt templates, adapted from the example SYSTEM_PROMPT = "You are an expert at summarizing and providing context for document sections based on their local surroundings." 
class ContextualEnricher:
    """
    Enriches chunks with a prepended summary of their surrounding context using
    an LLM client, while preserving the original text in the chunk metadata.
    """

    # Reasoning blocks some models (e.g. Qwen) emit before the actual answer.
    _THINK_BLOCK_RE = re.compile(r'<think[^>]*>.*?</think>', re.IGNORECASE | re.DOTALL)
    # Stray assistant role tags that may wrap the answer.
    _ROLE_TAG_RE = re.compile(r'<assistant[^>]*>|</assistant>', re.IGNORECASE)

    def __init__(self, llm_client: "OllamaClient", llm_model: str, batch_size: int = 10):
        """
        Args:
            llm_client: Client exposing ``generate_completion(model, prompt, ...)``.
            llm_model: Model name passed to the client for every summary.
            batch_size: Number of chunks handed to each batch in ``enrich_chunks``.
        """
        self.llm_client = llm_client
        self.llm_model = llm_model
        self.batch_size = batch_size
        logger.info(f"Initialized ContextualEnricher with Ollama model '{self.llm_model}' (batch_size={batch_size}).")

    @classmethod
    def _sanitize_summary(cls, summary_raw: str) -> str:
        """Strip chain-of-thought markup from a raw model response.

        Removes ``<think>...</think>`` blocks and ``<assistant>`` tags, keeps
        only the text after an explicit "Answer:" delimiter when present, and
        returns the first non-empty line. If nothing is left after cleaning,
        the raw response is returned unchanged.
        """
        cleaned = cls._THINK_BLOCK_RE.sub('', summary_raw)
        cleaned = cls._ROLE_TAG_RE.sub('', cleaned)

        # If the model used an explicit "Answer:" delimiter keep only the part after it
        if 'Answer:' in cleaned:
            cleaned = cleaned.split('Answer:', 1)[1]

        # Take the first non-empty line to avoid leftover blank lines
        summary = next((ln.strip() for ln in cleaned.splitlines() if ln.strip()), '')

        # Fallback to raw if cleaning removed everything
        return summary if summary else summary_raw

    def _generate_summary(self, local_context_text: str, chunk_text: str) -> str:
        """Ask the LLM for a short summary situating ``chunk_text`` in its context.

        Returns:
            The sanitized summary, or "" when the model call fails or the
            summary is shorter than 5 characters.
        """
        human_prompt_content = (
            f"{LOCAL_CONTEXT_PROMPT_TEMPLATE.format(local_context_text=local_context_text)}\n\n"
            f"{CHUNK_PROMPT_TEMPLATE.format(chunk_content=chunk_text)}"
        )

        try:
            # The client takes a single prompt, so the system prompt is simply
            # placed in front of the user content.
            full_prompt = f"{SYSTEM_PROMPT}\n\n{human_prompt_content}"
            response = self.llm_client.generate_completion(self.llm_model, full_prompt, enable_thinking=False)
            summary_raw = response.get('response', '').strip()

            summary = self._sanitize_summary(summary_raw)

            if not summary or len(summary) < 5:
                logger.warning("Generated context summary is too short or empty. Skipping enrichment for this chunk.")
                return ""

            return summary
        except Exception as e:
            logger.error(f"LLM invocation failed during contextualization: {e}", exc_info=True)
            return ""  # Gracefully fail by returning no summary

    def _enrich_one(self, chunks: List[Dict[str, Any]], index: int, window_size: int) -> Dict[str, Any]:
        """Return an enriched copy of ``chunks[index]``.

        The copy's metadata records ``original_text`` and ``contextual_summary``
        ("N/A" when no summary was produced); when a summary exists it is
        prepended to the copy's ``text``.
        """
        chunk = chunks[index]
        local_context_text = create_contextual_window(chunks, chunk_index=index, window_size=window_size)

        # The summary is generated based on the original, unmodified text
        original_text = chunk['text']
        summary = self._generate_summary(local_context_text, original_text)

        # NOTE(review): chunk.copy() is shallow, so an existing metadata dict is
        # shared with (and updated on) the input chunk as well.
        new_chunk = chunk.copy()

        # Ensure metadata is a dictionary
        if 'metadata' not in new_chunk or not isinstance(new_chunk['metadata'], dict):
            new_chunk['metadata'] = {}

        # Store original text and summary in metadata
        new_chunk['metadata']['original_text'] = original_text
        new_chunk['metadata']['contextual_summary'] = "N/A"

        # Prepend the context summary ONLY if it was successfully generated
        if summary:
            new_chunk['text'] = f"Context: {summary}\n\n---\n\n{original_text}"
            new_chunk['metadata']['contextual_summary'] = summary

        return new_chunk

    def enrich_chunks(self, chunks: List[Dict[str, Any]], window_size: int = 1) -> List[Dict[str, Any]]:
        """Enrich all chunks in batches of ``self.batch_size``.

        A chunk whose enrichment raises is logged and passed through unchanged.

        Returns:
            The result of ``BatchProcessor.process_in_batches`` over all chunk
            indices; an empty list when ``chunks`` is empty.
        """
        if not chunks:
            return []

        logger.info(f"Enriching {len(chunks)} chunks with contextual summaries (window_size={window_size}) using Ollama...")

        # Imported here, as in the rest of this module's batch helpers.
        from rag_system.utils.batch_processor import BatchProcessor, estimate_memory_usage

        memory_mb = estimate_memory_usage(chunks)
        logger.info(f"Estimated memory usage for contextual enrichment: {memory_mb:.1f}MB")

        batch_processor = BatchProcessor(batch_size=self.batch_size)

        def process_chunk_batch(chunk_indices):
            """Process a batch of chunk indices for contextual enrichment"""
            batch_results = []
            for i in chunk_indices:
                try:
                    batch_results.append(self._enrich_one(chunks, i, window_size))
                except Exception as e:
                    logger.error(f"Error enriching chunk {i}: {e}")
                    # Return original chunk if enrichment fails
                    batch_results.append(chunks[i])
            return batch_results

        # Batches carry indices rather than chunks because each chunk's context
        # window needs access to its neighbours in the full list.
        chunk_indices = list(range(len(chunks)))

        enriched_chunks = batch_processor.process_in_batches(
            chunk_indices,
            process_chunk_batch,
            "Contextual Enrichment"
        )

        return enriched_chunks

    def enrich_chunks_sequential(self, chunks: List[Dict[str, Any]], window_size: int = 1) -> List[Dict[str, Any]]:
        """Sequential enrichment method (legacy) - kept for comparison.

        Unlike ``enrich_chunks``, an exception while enriching a chunk is not
        caught here and propagates to the caller.
        """
        if not chunks:
            return []

        logger.info(f"Enriching {len(chunks)} chunks sequentially (window_size={window_size})...")
        enriched_chunks = []

        for i in range(len(chunks)):
            enriched_chunks.append(self._enrich_one(chunks, i, window_size))

            if (i + 1) % 10 == 0 or i == len(chunks) - 1:
                logger.info(f" ...processed {i+1}/{len(chunks)} chunks.")

        return enriched_chunks
schema = pa.schema([ pa.field("vector", pa.list_(pa.float32(), vector_dim)), pa.field("text", pa.string(), nullable=False), pa.field("chunk_id", pa.string()), pa.field("document_id", pa.string()), pa.field("chunk_index", pa.int32()), pa.field("metadata", pa.string()) ]) data = [] skipped_count = 0 for chunk, vector in zip(chunks, embeddings): # Check for NaN values in the vector if np.isnan(vector).any(): print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to NaN values in embedding") skipped_count += 1 continue # Check for infinite values in the vector if np.isinf(vector).any(): print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to infinite values in embedding") skipped_count += 1 continue # Ensure original_text is in metadata if not already present if 'original_text' not in chunk['metadata']: chunk['metadata']['original_text'] = chunk['text'] # Extract document_id and chunk_index for top-level storage doc_id = chunk.get("metadata", {}).get("document_id", "unknown") chunk_idx = chunk.get("metadata", {}).get("chunk_index", -1) # Defensive check for text content to ensure it's a non-empty string text_content = chunk.get('text', '') if not text_content or not isinstance(text_content, str): text_content = "" data.append({ "vector": vector.tolist(), "text": text_content, "chunk_id": chunk['chunk_id'], "document_id": doc_id, "chunk_index": chunk_idx, "metadata": json.dumps(chunk) }) if skipped_count > 0: print(f"⚠️ Skipped {skipped_count} chunks due to invalid embeddings (NaN or infinite values)") if not data: print("❌ No valid embeddings to index after filtering out NaN/infinite values") return # Incremental indexing: append to existing table if present, otherwise create it db = self.db_manager.db # underlying LanceDB connection if hasattr(db, "table_names") and table_name in db.table_names(): tbl = self.db_manager.get_table(table_name) print(f"Appending {len(data)} vectors to existing table '{table_name}'.") else: print(f"Creating table 
'{table_name}' (new) and adding {len(data)} vectors...") tbl = self.db_manager.create_table(table_name, schema=schema, mode="create") # Add data with NaN handling configuration try: tbl.add(data, on_bad_vectors='drop') print(f"✅ Indexed {len(data)} vectors into table '{table_name}'.") except Exception as e: print(f"❌ Failed to add data to table: {e}") # Fallback: try with fill strategy try: print("🔄 Retrying with NaN fill strategy...") tbl.add(data, on_bad_vectors='fill', fill_value=0.0) print(f"✅ Indexed {len(data)} vectors into table '{table_name}' (with NaN fill).") except Exception as e2: print(f"❌ Failed to add data even with NaN fill: {e2}") raise # BM25Indexer is no longer needed as we are moving to LanceDB's native FTS. # class BM25Indexer: # ... if __name__ == '__main__': print("embedders.py updated for contextual enrichment.") # This chunk has been "enriched". The 'text' field contains the context. enriched_chunk = { 'chunk_id': 'doc1_0', 'text': 'Context: Discusses animals.\n\n---\n\nOriginal: The cat sat on the mat.', 'metadata': { 'original_text': 'The cat sat on the mat.', 'contextual_summary': 'Discusses animals.', 'document_id': 'doc1', 'title': 'Pet Stories' } } sample_embeddings = np.random.rand(1, 128).astype('float32') DB_PATH = "./rag_system/index_store/lancedb" db_manager = LanceDBManager(db_path=DB_PATH) vector_indexer = VectorIndexer(db_manager=db_manager) vector_indexer.index( table_name="enriched_text_embeddings", chunks=[enriched_chunk], embeddings=sample_embeddings ) try: tbl = db_manager.get_table("enriched_text_embeddings") df = tbl.limit(1).to_pandas() df['metadata'] = df['metadata'].apply(json.loads) print("\n--- Verification ---") print("Embedded Text:", df['text'].iloc[0]) print("Original Text from Metadata:", df['metadata'].iloc[0]['original_text']) except Exception as e: print(f"Could not verify LanceDB table. 
class GraphExtractor:
    """
    Extracts entities and relationships from text chunks using a live Ollama model.

    For every chunk two LLM calls are made (both requesting JSON output):
    one to list entities, and - if any usable entity came back - one to list
    the relationships between them. LLM output is treated as untrusted:
    malformed items are skipped instead of aborting the whole extraction.
    """

    # Entities at or above this length are assumed to be run-on text, not names.
    _MAX_ENTITY_LEN = 50
    # Bracket characters usually mean the model leaked structure into a name.
    _FORBIDDEN_CHARS = "[]{}()"

    def __init__(self, llm_client: "OllamaClient", llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model
        print(f"Initialized GraphExtractor with Ollama model '{self.llm_model}'.")

    def _ask_json(self, prompt: str) -> Dict[str, Any]:
        """Send `prompt` to the LLM and return the decoded JSON object.

        Raises:
            json.JSONDecodeError: if the response text is not valid JSON.
        Returns an empty dict when the model answers with valid JSON that is
        not an object (e.g. a bare list), so callers can always use `.get`.
        """
        response = self.llm_client.generate_completion(
            self.llm_model, prompt, format="json"
        )
        payload = json.loads(response.get('response') or '{}')
        return payload if isinstance(payload, dict) else {}

    @classmethod
    def _clean_entities(cls, raw: Any) -> List[str]:
        """Keep only short, non-empty string entities without bracket characters."""
        if not isinstance(raw, list):
            return []
        return [
            entity for entity in raw
            if isinstance(entity, str)
            and entity.strip()
            and len(entity) < cls._MAX_ENTITY_LEN
            and not any(c in entity for c in cls._FORBIDDEN_CHARS)
        ]

    def extract(self, chunks: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
        """Build a deduplicated entity/relationship graph from `chunks`.

        Args:
            chunks: dicts that each carry the chunk content under 'text'.

        Returns:
            {"entities": [{"id", "type"}, ...],
             "relationships": [{"source", "target", "label"}, ...]}
            Both lists are deduplicated and keep first-seen order.
        """
        all_entities: Dict[str, Dict[str, str]] = {}
        # A dict used as an insertion-ordered set, so output order is deterministic.
        all_relationships: Dict[tuple, None] = {}

        print(f"Extracting graph from {len(chunks)} chunks with Ollama...")
        for i, chunk in enumerate(chunks):
            # Step 1: Extract Entities
            entity_prompt = f"""
            From the following text, extract key entities (people, companies, locations).
            Return the answer as a JSON object with a single key 'entities', which is a list of strings.
            Each entity should be a short, specific name, not a long string of text.

            Text:
            "{chunk['text']}"
            """
            try:
                entity_data = self._ask_json(entity_prompt)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode JSON from LLM for chunk {i+1}.")
                continue

            cleaned_entities = self._clean_entities(entity_data.get('entities'))
            if not cleaned_entities:
                continue

            # Record entities before the second call so they survive a failed
            # relationship extraction for this chunk.
            for entity_name in cleaned_entities:
                all_entities[entity_name] = {"id": entity_name, "type": "Unknown"}  # Placeholder type

            # Step 2: Extract Relationships
            relationship_prompt = f"""
            Given the following entities: {cleaned_entities}
            And the following text: "{chunk['text']}"

            Extract the relationships between the entities.
            Return the answer as a JSON object with a single key 'relationships', which is a list of objects, each with 'source', 'target', and 'label'.
            """
            try:
                relationship_data = self._ask_json(relationship_prompt)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode JSON from LLM for chunk {i+1}.")
                continue

            relationships = relationship_data.get("relationships")
            if not isinstance(relationships, list):
                continue
            for rel in relationships:
                if not isinstance(rel, dict):
                    continue
                if 'source' in rel and 'target' in rel and 'label' in rel:
                    triple = (rel['source'], rel['target'], rel['label'])
                    try:
                        all_relationships.setdefault(triple, None)
                    except TypeError:
                        # Unhashable field (list/dict) - not a usable edge.
                        continue

        return {
            "entities": list(all_entities.values()),
            "relationships": [
                {"source": s, "target": t, "label": l} for s, t, l in all_relationships
            ],
        }
class LateChunkEncoder:
    """Generate late-chunked embeddings given character-offset spans.

    The whole document is encoded once; each chunk vector is the mean of the
    per-token hidden states whose character offsets fall inside the chunk span.
    """

    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", *, max_tokens: int = 8192) -> None:
        self.model_name = model_name
        self.max_len = max_tokens
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Back-compat: allow short alias without repo namespace
        repo_id = model_name
        if "/" not in model_name:
            # map common alias to official repo
            alias_map = {
                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
            }
            repo_id = alias_map.get(model_name.lower(), model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.model.to(self.device)
        self.model.eval()

    @torch.inference_mode()
    def encode(self, text: str, chunk_spans: List[Tuple[int, int]]) -> List[np.ndarray]:
        """Return one vector *per* span.

        Args:
            text: Full document text.
            chunk_spans: List of (char_start, char_end) offsets for each chunk.

        Returns:
            List of numpy float32 arrays - one per chunk, in the order of
            `chunk_spans`. Empty list when `chunk_spans` is empty.
        """
        if not chunk_spans:
            return []

        # Tokenise and obtain per-token hidden states
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_len,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        # The model does not accept offset_mapping, so it is removed before the forward pass.
        offsets = inputs.pop("offset_mapping").squeeze(0).cpu().tolist()  # (seq_len, 2)

        out = self.model(**inputs)
        # Upcast so half-precision hidden states can be converted to numpy.
        last_hidden = out.last_hidden_state.squeeze(0).float().cpu()  # (seq_len, dim)

        vectors: List[np.ndarray] = []
        for start_char, end_char in chunk_spans:
            # Tokens fully inside the span. Zero-width offsets (s == e) belong
            # to special tokens such as BOS/EOS; without the `e > s` guard they
            # would be pooled into every span that starts at character 0.
            token_indices = [
                i for i, (s, e) in enumerate(offsets)
                if e > s and s >= start_char and e <= end_char
            ]
            if not token_indices:
                # Span boundaries fall inside tokens: use any token that overlaps it.
                token_indices = [
                    i for i, (s, e) in enumerate(offsets)
                    if e > s and s < end_char and e > start_char
                ]
            if not token_indices:
                # The span is not represented at all (e.g. it was truncated
                # away): fall back to the first token so a vector is still returned.
                token_indices = [0]

            chunk_vec = last_hidden[token_indices].mean(dim=0).numpy().astype("float32")
            # Check for NaN or infinite values
            if np.isnan(chunk_vec).any() or np.isinf(chunk_vec).any():
                print(f"⚠️ Warning: Invalid values detected in late chunk embedding for span ({start_char}, {end_char})")
                # Replace invalid values with zeros
                chunk_vec = np.nan_to_num(chunk_vec, nan=0.0, posinf=0.0, neginf=0.0)
                print(f"🔄 Replaced invalid values with zeros")
            vectors.append(chunk_vec)
        return vectors
class MultimodalProcessor:
    """
    Processes PDFs into separate text and image embeddings using local models.

    Each page yields one text chunk and one rendered page image; both are
    indexed through `VectorIndexer`, into separate tables.
    """
    def __init__(self, vision_model: LocalVisionModel, text_embedder: QwenEmbedder, db_manager: LanceDBManager):
        self.vision_model = vision_model
        self.text_embedder = text_embedder
        self.text_vector_indexer = VectorIndexer(db_manager)
        self.image_vector_indexer = VectorIndexer(db_manager)

    def _embed_images(self, images: List[Image.Image]):
        """Return one embedding per page image.

        Uses the vision model's batch method `create_image_embeddings` when it
        exists. Otherwise falls back to calling `embed_image` per page and
        mean-pooling its output into a single vector.
        """
        batch_embed = getattr(self.vision_model, "create_image_embeddings", None)
        if callable(batch_embed):
            return batch_embed(images)

        # NOTE(review): assumes embed_image returns a tensor whose last axis is
        # the embedding dimension (multi-vector output). Mean pooling collapses
        # it to one vector per page - confirm this is acceptable for retrieval.
        pooled = []
        for image in images:
            embeds = self.vision_model.embed_image(image)
            pooled.append(embeds.float().reshape(-1, embeds.shape[-1]).mean(dim=0))
        return torch.stack(pooled).cpu().numpy()

    def process_and_index(
        self,
        pdf_path: str,
        text_table_name: str,
        image_table_name: str
    ):
        """Extract per-page text and images from `pdf_path` and index both.

        Args:
            pdf_path: path of the PDF to process; its basename is the document id.
            text_table_name: table receiving the page-text embeddings.
            image_table_name: table receiving the page-image embeddings.
        """
        print(f"\n--- Processing PDF for multimodal indexing: {os.path.basename(pdf_path)} ---")
        document_id = os.path.basename(pdf_path)

        all_pages_text_chunks = []
        all_pages_images = []

        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # 1. Extract Text
                text = page.get_text("text")
                if not text.strip():
                    text = f"Page {page_num + 1} contains no extractable text."

                all_pages_text_chunks.append({
                    "chunk_id": f"{document_id}_page_{page_num+1}",
                    "text": text,
                    "metadata": {"document_id": document_id, "page_number": page_num + 1}
                })

                # 2. Extract Image
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                all_pages_images.append(img)
        finally:
            # Release the file handle even if a page fails to render.
            doc.close()

        # --- Batch Indexing ---
        # Index all text chunks
        if all_pages_text_chunks:
            text_embeddings = self.text_embedder.create_embeddings([c['text'] for c in all_pages_text_chunks])
            self.text_vector_indexer.index(text_table_name, all_pages_text_chunks, text_embeddings)
            print(f"Indexed {len(all_pages_text_chunks)} text pages into '{text_table_name}'.")

        # Index all images
        if all_pages_images:
            image_embeddings = self._embed_images(all_pages_images)
            # We use the text chunks as placeholders for metadata
            self.image_vector_indexer.index(image_table_name, all_pages_text_chunks, image_embeddings)
            print(f"Indexed {len(all_pages_images)} image pages into '{image_table_name}'.")
class OverviewBuilder:
    """Generates and stores a one-paragraph overview for each document.

    The overview is derived from the first *n* chunks of the document and is
    appended as one JSON line ({"doc_id", "overview"}) to `out_path`.
    """

    DEFAULT_PROMPT = (
        "You will receive the beginning of a document. "
        "In no more than 120 tokens, describe what the document is about, "
        "state its type (e.g. invoice, slide deck, policy, research paper, receipt) "
        "and mention 3-5 important entities, numbers or dates it contains.\n\n"
        "DOCUMENT_START:\n{text}\n\nOVERVIEW:"
    )

    # Matches a complete <think ...>...</think> reasoning block, across lines.
    _THINK_BLOCK_RE = re.compile(r'<think[^>]*>.*?</think>', flags=re.IGNORECASE | re.DOTALL)

    def __init__(self, llm_client, model: str = "qwen3:0.6b", first_n_chunks: int = 5,
                 out_path: str | None = None):
        if out_path is None:
            out_path = "index_store/overviews/overviews.jsonl"
        self.llm_client = llm_client
        self.model = model
        self.first_n = first_n_chunks
        self.out_path = out_path
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def build_and_store(self, doc_id: str, chunks: List[Dict[str, Any]]):
        """Summarise the head of a document and append the result to `out_path`.

        Args:
            doc_id: identifier written alongside the overview.
            chunks: chunk dicts; the 'text' of the first `first_n` non-empty
                ones is joined with newlines and capped at 5000 characters.

        Nothing is written when `chunks` is empty. LLM failures are recorded
        as the overview text instead of being raised.
        """
        if not chunks:
            return
        head_text = "\n".join(c["text"] for c in chunks[: self.first_n] if c.get("text"))
        prompt = self.DEFAULT_PROMPT.format(text=head_text[:5000])  # safety cap

        try:
            resp = self.llm_client.generate_completion(model=self.model, prompt=prompt, enable_thinking=False)
            summary_raw = resp.get("response", "")
            # Remove any lingering <think>...</think> blocks just in case
            summary = self._THINK_BLOCK_RE.sub('', summary_raw).strip()
        except Exception as e:
            summary = f"Failed to generate overview: {e}"

        record = {"doc_id": doc_id, "overview": summary.strip()}
        with open(self.out_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

        # Same logger object as the module-level `logger` (looked up by module name).
        logging.getLogger(__name__).info(f"📄 Overview generated for {doc_id} (stored in {self.out_path})")
def create_embeddings(self, texts: List[str]) -> np.ndarray:
    """Embed `texts` with last-token pooling and return one row per text.

    The embedding of a sequence is the hidden state of its last *real*
    (non-padding) token. The tokenizer is created with left padding, so the
    last real token normally sits at the final position; right padding is
    still supported by locating the last attended position per row.

    Args:
        texts: strings to embed.

    Returns:
        2-D numpy array with one row per input text. NaN/inf entries are
        replaced with zeros (a warning is printed).
    """
    print(f"Generating {len(texts)} embeddings with {self.model_name} model...")
    inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
    with torch.no_grad():
        outputs = self.model(**inputs)
        last_hidden = outputs.last_hidden_state  # [B, seq, dim]

    attention_mask = inputs["attention_mask"]
    if bool(attention_mask[:, -1].all()):
        # Every row ends on a real token: left padding (or no padding at all).
        # Indexing with `mask.sum() - 1` here would land on a padding position
        # or an early token for every sequence shorter than the longest one.
        embeddings = last_hidden[:, -1]
    else:
        # Right padding: the last real token sits at index (number of real tokens - 1).
        last_token_idx = attention_mask.sum(dim=1) - 1
        batch_indices = torch.arange(last_hidden.size(0), device=self.device)
        embeddings = last_hidden[batch_indices, last_token_idx]

    # Convert to numpy and validate
    embeddings_np = embeddings.cpu().numpy()

    # Check for NaN or infinite values
    if np.isnan(embeddings_np).any():
        print(f"⚠️ Warning: NaN values detected in embeddings from {self.model_name}")
        # Replace NaN values with zeros
        embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
        print(f"🔄 Replaced NaN values with zeros")

    if np.isinf(embeddings_np).any():
        print(f"⚠️ Warning: Infinite values detected in embeddings from {self.model_name}")
        # Replace infinite values with zeros
        embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
        print(f"🔄 Replaced infinite values with zeros")

    return embeddings_np
def _embed_single(self, text: str):
    """Request the embedding of one text from Ollama's /api/embeddings endpoint.

    Args:
        text: the string to embed; sent as the request's "prompt".

    Returns:
        A float32 numpy array built from the response's "embedding" field,
        or from its "data" field when "embedding" is missing or empty.

    Raises:
        requests.HTTPError: when the server answers with an error status.
        ValueError: when neither field carries a usable value.
    """
    import requests, numpy as np, json

    endpoint = f"{self.host}/api/embeddings"
    response = requests.post(
        endpoint,
        json={"model": self.model_name, "prompt": text},
        timeout=self.timeout,
    )
    response.raise_for_status()
    body = response.json()

    # Ollama may return {"embedding": [...]} or {"data": [...]} depending on version
    embedding = body.get("embedding")
    if not embedding:
        embedding = body.get("data")
    if embedding is None:
        raise ValueError("Unexpected Ollama embeddings response format")
    return np.array(embedding, dtype="float32")
class MarkdownRecursiveChunker:
    """
    A recursive chunker that splits Markdown text based on its semantic structure
    and embeds document-level metadata into each chunk.

    Oversized text is split on progressively finer separators (headings, code
    fences, blank lines) and finally on words; the resulting pieces are then
    merged back up towards `max_chunk_size` tokens.
    """

    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.split_priority = ["\n## ", "\n### ", "\n#### ", "```", "\n\n"]

        repo_id = tokenizer_model
        if "/" not in tokenizer_model:
            # Map short aliases to the official repository id.
            repo_id = {
                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
            }.get(tokenizer_model.lower(), tokenizer_model)

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        except Exception as e:
            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
            self.tokenizer = None

    def _token_len(self, text: str) -> int:
        """Get token count for text using the tokenizer (or ~4 chars per token without one)."""
        if self.tokenizer is not None:
            return len(self.tokenizer.tokenize(text))
        else:
            return max(1, len(text) // 4)

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split `text` into pieces of at most `max_chunk_size` tokens where possible.

        Each separator is tried in order on the pieces that are still too
        large. A separator stays attached to the text that follows it, and the
        text before the first separator is kept as its own piece. Pieces that
        remain too large after all separators are split on whitespace.
        """
        pending = [text]

        for sep in separators:
            # Capture group keeps the separator in the result; escape it so it
            # is always matched literally.
            pattern = f'({re.escape(sep)})'
            next_round: List[str] = []
            for piece in pending:
                if self._token_len(piece) <= self.max_chunk_size:
                    next_round.append(piece)
                    continue
                # parts == [head, sep, body, sep, body, ...]
                parts = re.split(pattern, piece)
                if parts[0]:
                    next_round.append(parts[0])
                for j in range(1, len(parts) - 1, 2):
                    next_round.append(parts[j] + parts[j + 1])
            pending = next_round

        final_chunks: List[str] = []
        for piece in pending:
            if self._token_len(piece) <= self.max_chunk_size:
                final_chunks.append(piece)
                continue
            # Last resort: greedy word packing. A single word longer than the
            # limit is emitted as-is.
            current_chunk = ""
            for word in piece.split():
                test_chunk = current_chunk + " " + word if current_chunk else word
                if self._token_len(test_chunk) <= self.max_chunk_size:
                    current_chunk = test_chunk
                else:
                    if current_chunk:
                        final_chunks.append(current_chunk)
                    current_chunk = word
            if current_chunk:
                final_chunks.append(current_chunk)

        return final_chunks

    def chunk(self, text: str, document_id: str, document_metadata: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Chunks the Markdown text and injects metadata.

        Args:
            text: The Markdown text to chunk.
            document_id: The identifier for the source document.
            document_metadata: A dictionary of metadata for the source document.
                It is copied, never mutated.

        Returns:
            A list of dictionaries, where each dictionary is a chunk with
            'chunk_id', 'text' and 'metadata'. The metadata carries the chunk
            position as both 'chunk_number' and 'chunk_index' (the key the
            indexer and DoclingChunker use). Whitespace-only chunks are dropped.
        """
        if not text:
            return []

        raw_chunks = self._split_text(text, self.split_priority)

        merged_chunks_text = []
        current_chunk = ""
        for chunk_text in raw_chunks:
            test_chunk = current_chunk + chunk_text if current_chunk else chunk_text
            if not current_chunk or self._token_len(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            elif self._token_len(current_chunk) < self.min_chunk_size:
                # Too small to stand alone: merge even though it exceeds the limit.
                current_chunk = test_chunk
            else:
                merged_chunks_text.append(current_chunk)
                current_chunk = chunk_text
        if current_chunk:
            merged_chunks_text.append(current_chunk)

        final_chunks = []
        for chunk_text in merged_chunks_text:
            stripped = chunk_text.strip()
            if not stripped:
                continue
            i = len(final_chunks)
            # Combine document-level metadata with chunk-specific metadata
            combined_metadata = (document_metadata or {}).copy()
            combined_metadata.update({
                "document_id": document_id,
                "chunk_number": i,
                "chunk_index": i,
            })

            final_chunks.append({
                "chunk_id": f"{document_id}_{i}",  # Create a more unique ID
                "text": stripped,
                "metadata": combined_metadata
            })

        return final_chunks
paragraph." doc_meta = {"title": "My Awesome Document", "author": "Jane Doe", "year": 2024} chunker = MarkdownRecursiveChunker() chunks = chunker.chunk( text=sample_markdown, document_id="doc456", document_metadata=doc_meta ) print(f"\n--- Created {len(chunks)} chunk(s) ---") for chunk in chunks: print(f"Chunk ID: {chunk['chunk_id']}") print(f"Text: '{chunk['text']}'") print(f"Metadata: {chunk['metadata']}") print("-" * 20) ================================================ FILE: rag_system/ingestion/docling_chunker.py ================================================ from __future__ import annotations """Docling-aware chunker (simplified). For now we proxy the old MarkdownRecursiveChunker but add: • sentence-aware packing to max_tokens with overlap • breadcrumb metadata stubs so downstream code already handles them In a follow-up we can replace the internals with true Docling element-tree walking once the PDFConverter returns structured nodes. """ from typing import List, Dict, Any, Tuple import math import re from itertools import islice from rag_system.ingestion.chunking import MarkdownRecursiveChunker from transformers import AutoTokenizer class DoclingChunker: def __init__(self, *, max_tokens: int = 512, overlap: int = 1, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"): self.max_tokens = max_tokens self.overlap = overlap # sentences of overlap repo_id = tokenizer_model if "/" not in tokenizer_model and not tokenizer_model.startswith("Qwen/"): repo_id = { "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B", }.get(tokenizer_model.lower(), tokenizer_model) try: self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True) except Exception as e: print(f"Warning: Failed to load tokenizer {repo_id}: {e}") print("Falling back to character-based approximation (4 chars ≈ 1 token)") self.tokenizer = None # Fallback simple sentence splitter (period, question, exclamation, newline) self._sent_re = re.compile(r"(?<=[\.\!\?])\s+|\n+") self.legacy = 
MarkdownRecursiveChunker(max_chunk_size=10_000, min_chunk_size=100) # ------------------------------------------------------------------ def _token_len(self, text: str) -> int: if self.tokenizer is not None: return len(self.tokenizer.tokenize(text)) else: # Fallback: approximate 4 characters per token return max(1, len(text) // 4) def split_markdown(self, markdown: str, *, document_id: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]: """Split one Markdown doc into chunks with max_tokens limit.""" base_chunks = self.legacy.chunk(markdown, document_id, metadata) new_chunks: List[Dict[str, Any]] = [] global_idx = 0 for ch in base_chunks: sentences = [s.strip() for s in self._sent_re.split(ch["text"]) if s.strip()] if not sentences: continue window: List[str] = [] while sentences: # Add until over limit while sentences and self._token_len(" ".join(window + [sentences[0]])) <= self.max_tokens: window.append(sentences.pop(0)) if not window: # single sentence > limit → hard cut window.append(sentences.pop(0)) chunk_text = " ".join(window) new_chunk = { "chunk_id": f"{document_id}_{global_idx}", "text": chunk_text, "metadata": { **metadata, "document_id": document_id, "chunk_index": global_idx, "heading_path": metadata.get("heading_path", []), "heading_level": len(metadata.get("heading_path", [])), "block_type": metadata.get("block_type", "paragraph"), }, } new_chunks.append(new_chunk) global_idx += 1 # Overlap: prepend last `overlap` sentences of the current window to the remaining queue if self.overlap and sentences: back = window[-self.overlap:] if self.overlap <= len(window) else window[:] sentences = back + sentences window = [] return new_chunks # ------------------------------------------------------------------ # Element-tree based chunking (true Docling path) # ------------------------------------------------------------------ def chunk_document(self, doc, *, document_id: str, metadata: Dict[str, Any] | None = None) -> List[Dict[str, Any]]: """Walk a 
DoclingDocument and emit chunks. Tables / Code / Figures are emitted as atomic chunks. Paragraph-like nodes are sentence-packed to <= max_tokens. """ metadata = metadata or {} def _token_len(txt: str) -> int: if self.tokenizer is not None: return len(self.tokenizer.tokenize(txt)) else: # Fallback: approximate 4 characters per token return max(1, len(txt) // 4) chunks: List[Dict[str, Any]] = [] global_idx = 0 # Helper to create a chunk and append to list def _add_chunk(text: str, block_type: str, heading_path: List[str], page_no: int | None = None): nonlocal global_idx if not text.strip(): return chunk_meta = { **metadata, "document_id": document_id, "chunk_index": global_idx, "heading_path": heading_path, "heading_level": len(heading_path), "block_type": block_type, } if page_no is not None: chunk_meta["page"] = page_no chunks.append({ "chunk_id": f"{document_id}_{global_idx}", "text": text, "metadata": chunk_meta, }) global_idx += 1 # The Docling API exposes .body which is a tree of nodes; we fall back to .texts/.tables lists if available try: # We walk doc.texts (reading order). 
We'll buffer consecutive paragraph items current_heading_path: List[str] = [] buffer: List[str] = [] buffer_tokens = 0 buffer_page = None def flush_buffer(): nonlocal buffer, buffer_tokens, buffer_page if buffer: _add_chunk(" ".join(buffer), "paragraph", heading_path=current_heading_path[:], page_no=buffer_page) buffer, buffer_tokens, buffer_page = [], 0, None # Create quick lookup for table items by id to preserve later insertion order if needed tables_by_anchor = { getattr(t, "anchor_text_id", None): t for t in getattr(doc, "tables", []) if getattr(t, "anchor_text_id", None) is not None } for txt_item in getattr(doc, "texts", []): # If this text item is a placeholder for a table anchor, emit table first anchor_id = getattr(txt_item, "id", None) if anchor_id in tables_by_anchor: flush_buffer() tbl = tables_by_anchor[anchor_id] try: tbl_md = tbl.export_to_markdown(doc) # pass doc for deprecation compliance except Exception: tbl_md = tbl.export_to_markdown() if hasattr(tbl, "export_to_markdown") else str(tbl) _add_chunk(tbl_md, "table", heading_path=current_heading_path[:], page_no=getattr(tbl, "page_no", None)) role = getattr(txt_item, "role", None) if role == "heading": flush_buffer() level = getattr(txt_item, "level", 1) current_heading_path = current_heading_path[: max(0, level - 1)] current_heading_path.append(txt_item.text.strip()) continue # skip heading as content text_piece = txt_item.text if hasattr(txt_item, "text") else str(txt_item) piece_tokens = _token_len(text_piece) if piece_tokens > self.max_tokens: # very long paragraph flush_buffer() _add_chunk(text_piece, "paragraph", heading_path=current_heading_path[:], page_no=getattr(txt_item, "page_no", None)) continue if buffer_tokens + piece_tokens > self.max_tokens: flush_buffer() buffer.append(text_piece) buffer_tokens += piece_tokens if buffer_page is None: buffer_page = getattr(txt_item, "page_no", None) flush_buffer() # Emit any remaining tables that were not anchored for tbl in getattr(doc, 
"tables", []): if tbl in tables_by_anchor.values(): continue # already emitted try: tbl_md = tbl.export_to_markdown(doc) except Exception: tbl_md = tbl.export_to_markdown() if hasattr(tbl, "export_to_markdown") else str(tbl) _add_chunk(tbl_md, "table", heading_path=current_heading_path[:], page_no=getattr(tbl, "page_no", None)) except Exception as e: print(f"⚠️ Docling tree walk failed: {e}. Falling back to markdown splitter.") return self.split_markdown(doc.export_to_markdown(), document_id=document_id, metadata=metadata) # -------------------------------------------------------------- # Second-pass consolidation: merge small consecutive paragraph # chunks that share heading & page into up-to-max_tokens blobs. # -------------------------------------------------------------- consolidated: List[Dict[str, Any]] = [] buf_txt: List[str] = [] buf_meta: Dict[str, Any] | None = None def flush_paragraph_buffer(): nonlocal buf_txt, buf_meta if not buf_txt: return merged_text = " ".join(buf_txt) # Re-use meta from first piece but update chunk_id later new_chunk = { "chunk_id": buf_meta["chunk_id"], "text": merged_text, "metadata": buf_meta["metadata"], } consolidated.append(new_chunk) buf_txt = [] buf_meta = None for ch in chunks: if ch["metadata"].get("block_type") != "paragraph": flush_paragraph_buffer() consolidated.append(ch) continue if not buf_txt: buf_txt.append(ch["text"]) buf_meta = ch continue same_page = ch["metadata"].get("page") == buf_meta["metadata"].get("page") same_heading = ch["metadata"].get("heading_path") == buf_meta["metadata"].get("heading_path") prospective_len = self._token_len(" ".join(buf_txt + [ch["text"]])) if same_page and same_heading and prospective_len <= self.max_tokens: buf_txt.append(ch["text"]) else: flush_paragraph_buffer() buf_txt.append(ch["text"]) buf_meta = ch flush_paragraph_buffer() return consolidated # Public API expected by IndexingPipeline -------------------------------- def chunk(self, text: str, document_id: str, 
class DocumentConverter:
    """
    Converts documents to Markdown using the docling library.

    Three docling converters are created at construction time:

    * ``converter_no_ocr`` - PDF converter with OCR switched off (fast path),
    * ``converter_ocr``    - PDF converter with forced full-page OCR via
      ``OcrMacOptions`` (used when a PDF has no text layer),
    * ``converter_general`` - docling converter with default options, used for
      every non-PDF format handled by docling.

    Plain-text files are read directly and never go through docling.
    """

    # Mapping of file extensions to InputFormat
    SUPPORTED_FORMATS = {
        '.pdf': InputFormat.PDF,
        '.docx': InputFormat.DOCX,
        '.html': InputFormat.HTML,
        '.htm': InputFormat.HTML,
        '.md': InputFormat.MD,
        '.txt': 'TXT',  # Special handling for plain text files
    }

    def __init__(self):
        """Initializes the docling document converter with forced OCR enabled for macOS.

        If any converter fails to initialise, the error is printed and all three
        converter attributes are set to ``None``.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            ocr_options = OcrMacOptions(force_full_page_ocr=True)
            pipeline_ocr.ocr_options = ocr_options
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DoclingConverter(format_options=format_ocr)

            self.converter_general = DoclingConverter()

            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None
            self.converter_general = None

    def convert_to_markdown(self, file_path: str) -> List[Tuple[Any, ...]]:
        """
        Converts a document to Markdown, dispatching on the file extension.

        Args:
            file_path: Path of the document to convert.

        Returns:
            A list with one tuple per converted document. Plain-text files yield
            ``(markdown, metadata)``; docling conversions yield
            ``(markdown, metadata, docling_document)``. An empty list is returned
            for unsupported extensions, when the docling converters are not
            available, or when the conversion fails.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        input_format = self.SUPPORTED_FORMATS.get(file_ext)
        if input_format is None:
            print(f"Unsupported file format: {file_ext}")
            return []

        # Plain text is read directly, so it must not depend on docling being usable.
        if input_format == 'TXT':
            return self._convert_txt_to_markdown(file_path)

        if not (self.converter_no_ocr and self.converter_ocr and self.converter_general):
            print("docling converters not available. Skipping conversion.")
            return []

        if input_format == InputFormat.PDF:
            return self._convert_pdf_to_markdown(file_path)
        return self._convert_general_to_markdown(file_path, input_format)

    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[Any, ...]]:
        """Convert a PDF, using the OCR converter only when no text layer is found."""

        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
        def _pdf_has_text(path: str) -> bool:
            try:
                # The context manager closes the PyMuPDF document even on early exit.
                with fitz.open(path) as doc:
                    return any(page.get_text("text").strip() for page in doc)
            except Exception as e:
                # Best effort: an unreadable PDF is treated as "no text layer" (OCR path).
                print(f"Could not inspect text layer of {path}: {e}")
                return False

        use_ocr = not _pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"

        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        return self._perform_conversion(pdf_path, converter, ocr_msg)

    def _convert_txt_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
        """Convert plain text files to markdown by reading content directly.

        The file is read as UTF-8 and wrapped in a fenced code block. Returns a
        one-element list of ``(markdown, metadata)`` or an empty list on error.
        """
        print(f"Converting {file_path} (TXT) to Markdown...")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            markdown_content = f"```\n{content}\n```"
            metadata = {"source": file_path}

            print(f"Successfully converted {file_path} (TXT) to Markdown.")
            return [(markdown_content, metadata)]
        except Exception as e:
            print(f"Error processing TXT file {file_path}: {e}")
            return []

    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[Any, ...]]:
        """Convert non-PDF formats using general converter."""
        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")

    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[Any, ...]]:
        """Perform the actual conversion using the specified converter.

        Returns a one-element list holding ``(markdown, metadata, document)`` where
        ``document`` is ``result.document`` from docling, or an empty list when the
        conversion raises.
        """
        try:
            result = converter.convert(file_path)
            markdown_content = result.document.export_to_markdown()
            metadata = {"source": file_path}

            # Return the *DoclingDocument* object as third tuple element so downstream
            # chunkers that understand the element tree can use it. Legacy callers that
            # expect only (markdown, metadata) can simply ignore the extra value.
            pages_data = [(markdown_content, metadata, result.document)]
            print(f"Successfully converted {file_path} with docling {format_msg}.")
            return pages_data
        except Exception as e:
            print(f"Error processing {file_path} with docling: {e}")
            return []
# This script should be run as a module from the project root, e.g.: # python -m rag_system.main api from rag_system.agent.loop import Agent from rag_system.utils.ollama_client import OllamaClient # Configuration is now defined in this file - no import needed # Advanced RAG System Configuration # ================================== # This file contains the MASTER configuration for all models used in the RAG system. # All components should reference these configurations to ensure consistency. # ============================================================================ # 🎯 MASTER MODEL CONFIGURATION # ============================================================================ # All model configurations are centralized here to prevent conflicts # LLM Backend Configuration LLM_BACKEND = os.getenv("LLM_BACKEND", "ollama") # Ollama Models Configuration (for inference via Ollama) OLLAMA_CONFIG = { "host": os.getenv("OLLAMA_HOST", "http://localhost:11434"), "generation_model": "qwen3:8b", # Main text generation model "enrichment_model": "qwen3:0.6b", # Lightweight model for routing/enrichment } WATSONX_CONFIG = { "api_key": os.getenv("WATSONX_API_KEY", ""), "project_id": os.getenv("WATSONX_PROJECT_ID", ""), "url": os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com"), "generation_model": os.getenv("WATSONX_GENERATION_MODEL", "ibm/granite-13b-chat-v2"), "enrichment_model": os.getenv("WATSONX_ENRICHMENT_MODEL", "ibm/granite-8b-japanese"), # Lightweight model } # External Model Configuration (HuggingFace models used directly) EXTERNAL_MODELS = { "embedding_model": "Qwen/Qwen3-Embedding-0.6B", # HuggingFace embedding model (1024 dims - fresh start) "reranker_model": "answerdotai/answerai-colbert-small-v1", # ColBERT reranker "vision_model": "Qwen/Qwen-VL-Chat", # Vision model for multimodal "fallback_reranker": "BAAI/bge-reranker-base", # Backup reranker } # ============================================================================ # 🔧 PIPELINE CONFIGURATIONS # 
============================================================================ PIPELINE_CONFIGS = { "default": { "description": "Production-ready pipeline with hybrid search, AI reranking, and verification", "storage": { "lancedb_uri": "./lancedb", "text_table_name": "text_pages_v3", "image_table_name": "image_pages_v3", "bm25_path": "./index_store/bm25", "graph_path": "./index_store/graph/knowledge_graph.gml" }, "retrieval": { "retriever": "multivector", "search_type": "hybrid", "late_chunking": { "enabled": True, "table_suffix": "_lc_v3" }, "dense": { "enabled": True, "weight": 0.7 }, "bm25": { "enabled": True, "index_name": "rag_bm25_index" }, "graph": { "enabled": False, "graph_path": "./index_store/graph/knowledge_graph.gml" } }, # 🎯 EMBEDDING MODEL: Uses HuggingFace Qwen model directly "embedding_model_name": EXTERNAL_MODELS["embedding_model"], # 🎯 VISION MODEL: For multimodal capabilities "vision_model_name": EXTERNAL_MODELS["vision_model"], # 🎯 RERANKER: AI-powered reranking with ColBERT "reranker": { "enabled": True, "type": "ai", "strategy": "rerankers-lib", "model_name": EXTERNAL_MODELS["reranker_model"], "top_k": 10 }, "query_decomposition": { "enabled": True, "max_sub_queries": 3, "compose_from_sub_answers": True }, "verification": {"enabled": True}, "retrieval_k": 20, "context_window_size": 0, "semantic_cache_threshold": 0.98, "cache_scope": "global", # 🔧 Contextual enrichment configuration "contextual_enricher": { "enabled": True, "window_size": 1 }, # 🔧 Indexing configuration "indexing": { "embedding_batch_size": 50, "enrichment_batch_size": 10, "enable_progress_tracking": True } }, "fast": { "description": "Speed-optimized pipeline with minimal overhead", "storage": { "lancedb_uri": "./lancedb", "text_table_name": "text_pages_v3", "image_table_name": "image_pages_v3", "bm25_path": "./index_store/bm25" }, "retrieval": { "retriever": "multivector", "search_type": "vector_only", "late_chunking": {"enabled": False}, "dense": {"enabled": True} }, 
"embedding_model_name": EXTERNAL_MODELS["embedding_model"], "reranker": {"enabled": False}, "query_decomposition": {"enabled": False}, "verification": {"enabled": False}, "retrieval_k": 10, "context_window_size": 0, # 🔧 Contextual enrichment (disabled for speed) "contextual_enricher": { "enabled": False, "window_size": 1 }, # 🔧 Indexing configuration "indexing": { "embedding_batch_size": 100, "enrichment_batch_size": 50, "enable_progress_tracking": False } }, "bm25": { "enabled": True, "index_name": "rag_bm25_index" }, "graph_rag": { "enabled": False, # Keep disabled for now unless specified } } # ============================================================================ # 🏭 FACTORY FUNCTIONS # ============================================================================ def get_agent(mode: str = "default") -> Agent: """ Factory function to get an instance of the RAG agent based on the specified mode. Args: mode: Configuration mode ("default", "fast") Returns: Configured Agent instance """ load_dotenv() # Initialize the appropriate LLM client based on backend configuration if LLM_BACKEND.lower() == "watsonx": from rag_system.utils.watsonx_client import WatsonXClient if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]: raise ValueError( "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID " "environment variables." 
def run_indexing(docs_path: str, config_mode: str = "default"):
    """Runs the indexing pipeline for the PDF files found in a directory.

    Args:
        docs_path: Directory that is scanned (non-recursively) for ``.pdf`` files.
        config_mode: Key into ``PIPELINE_CONFIGS`` selecting the pipeline profile.

    Raises:
        KeyError: If ``config_mode`` is not a key of ``PIPELINE_CONFIGS``.
        ValueError: Propagated from ``validate_model_config``.
    """
    print(f"📚 Starting indexing for documents in: {docs_path}")
    validate_model_config()

    # Local import to avoid circular dependencies
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline

    # Find all PDF files in the directory (extension match is case-insensitive;
    # sorted so the processing order does not depend on os.listdir ordering).
    pdf_files = sorted(
        os.path.join(docs_path, f)
        for f in os.listdir(docs_path)
        if f.lower().endswith(".pdf")
    )
    if not pdf_files:
        print("No PDF files found to index.")
        return

    # IndexingPipeline takes the pipeline profile plus an LLM client and its
    # model configuration; build them the same way run_chat/get_agent do.
    llm_client = OllamaClient(host=OLLAMA_CONFIG["host"])
    indexing_pipeline = IndexingPipeline(PIPELINE_CONFIGS[config_mode], llm_client, OLLAMA_CONFIG)

    # Process all documents through the pipeline
    indexing_pipeline.run(pdf_files)
    print("✅ Indexing complete.")
def main():
    """Dispatches the sub-command given in ``sys.argv[1]``.

    Supported commands:
        index <docs_dir> [<docs_dir> ...]  - index the PDFs of each directory
        chat <query>                       - run one query and print the JSON result
        show_graph                         - print/plot the knowledge graph
        api                                - start the API server
    """
    if len(sys.argv) < 2:
        print("Usage: python main.py [index|chat|show_graph|api] [query]")
        return

    command = sys.argv[1]
    if command == "index":
        # run_indexing expects a single directory path, so each argument is
        # indexed on its own rather than handing over the raw argv list.
        docs_dirs = sys.argv[2:]
        if not docs_dirs:
            print("Usage: python main.py index <docs_dir> [<docs_dir> ...]")
            return
        for docs_dir in docs_dirs:
            run_indexing(docs_dir)
    elif command == "chat":
        if len(sys.argv) < 3:
            print("Usage: python main.py chat <query>")
            return
        query = " ".join(sys.argv[2:])
        # 🆕 Print the result for command-line usage
        print(run_chat(query))
    elif command == "show_graph":
        show_graph()
    elif command == "api":
        run_api_server()
    else:
        print(f"Unknown command: {command}")
Use --index to process documents.") # Example of how to get an agent instance # agent = get_agent(args.config) # print(f"Agent loaded with '{args.config}' config.") ================================================ FILE: rag_system/pipelines/__init__.py ================================================ ================================================ FILE: rag_system/pipelines/indexing_pipeline.py ================================================ from typing import List, Dict, Any import os import networkx as nx from rag_system.ingestion.document_converter import DocumentConverter from rag_system.ingestion.chunking import MarkdownRecursiveChunker from rag_system.indexing.representations import EmbeddingGenerator, select_embedder from rag_system.indexing.embedders import LanceDBManager, VectorIndexer from rag_system.indexing.graph_extractor import GraphExtractor from rag_system.utils.ollama_client import OllamaClient from rag_system.indexing.contextualizer import ContextualEnricher from rag_system.indexing.overview_builder import OverviewBuilder class IndexingPipeline: def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_config: Dict[str, str]): self.config = config self.llm_client = ollama_client self.ollama_config = ollama_config self.document_converter = DocumentConverter() # Chunker selection: docling (token-based) or legacy (character-based) chunker_mode = config.get("chunker_mode", "docling") # 🔧 Get chunking configuration from frontend parameters chunking_config = config.get("chunking", {}) chunk_size = chunking_config.get("chunk_size", config.get("chunk_size", 1500)) chunk_overlap = chunking_config.get("chunk_overlap", config.get("chunk_overlap", 200)) print(f"🔧 CHUNKING CONFIG: Size: {chunk_size}, Overlap: {chunk_overlap}, Mode: {chunker_mode}") if chunker_mode == "docling": try: from rag_system.ingestion.docling_chunker import DoclingChunker self.chunker = DoclingChunker( max_tokens=config.get("max_tokens", chunk_size), 
overlap=config.get("overlap_sentences", 1), tokenizer_model=config.get("embedding_model_name", "qwen3-embedding-0.6b"), ) print("🪄 Using DoclingChunker for high-recall sentence packing.") except Exception as e: print(f"⚠️ Failed to initialise DoclingChunker: {e}. Falling back to legacy chunker.") self.chunker = MarkdownRecursiveChunker( max_chunk_size=chunk_size, min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B") ) else: self.chunker = MarkdownRecursiveChunker( max_chunk_size=chunk_size, min_chunk_size=min(chunk_overlap, chunk_size // 4), # Sensible minimum tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B") ) retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {}) storage_config = self.config["storage"] # Get batch processing configuration indexing_config = self.config.get("indexing", {}) self.embedding_batch_size = indexing_config.get("embedding_batch_size", 50) self.enrichment_batch_size = indexing_config.get("enrichment_batch_size", 10) self.enable_progress_tracking = indexing_config.get("enable_progress_tracking", True) # Treat dense retrieval as enabled by default unless explicitly disabled dense_cfg = retriever_configs.setdefault("dense", {}) dense_cfg.setdefault("enabled", True) if dense_cfg.get("enabled"): # Accept modern keys: db_path or lancedb_path; fall back to legacy lancedb_uri db_path = ( storage_config.get("db_path") or storage_config.get("lancedb_path") or storage_config.get("lancedb_uri") ) if not db_path: raise KeyError( "Storage config must include 'db_path', 'lancedb_path', or 'lancedb_uri' for LanceDB." 
) self.lancedb_manager = LanceDBManager(db_path=db_path) self.vector_indexer = VectorIndexer(self.lancedb_manager) embedding_model = select_embedder( self.config.get("embedding_model_name", "BAAI/bge-small-en-v1.5"), self.ollama_config.get("host") if isinstance(self.ollama_config, dict) else None, ) self.embedding_generator = EmbeddingGenerator( embedding_model=embedding_model, batch_size=self.embedding_batch_size ) if retriever_configs.get("graph", {}).get("enabled"): self.graph_extractor = GraphExtractor( llm_client=self.llm_client, llm_model=self.ollama_config["generation_model"] ) if self.config.get("contextual_enricher", {}).get("enabled"): # 🔧 Use frontend enrich_model parameter if provided enrichment_model = ( self.config.get("enrich_model") or # Frontend parameter self.config.get("enrichment_model_name") or # Alternative config key self.ollama_config.get("enrichment_model") or # Default from ollama config self.ollama_config["generation_model"] # Final fallback ) print(f"🔧 ENRICHMENT MODEL: Using '{enrichment_model}' for contextual enrichment") self.contextual_enricher = ContextualEnricher( llm_client=self.llm_client, llm_model=enrichment_model, batch_size=self.enrichment_batch_size ) # Overview builder always enabled for triage routing ov_path = self.config.get("overview_path") self.overview_builder = OverviewBuilder( llm_client=self.llm_client, model=self.config.get("overview_model_name", self.ollama_config.get("enrichment_model", "qwen3:0.6b")), first_n_chunks=self.config.get("overview_first_n_chunks", 5), out_path=ov_path if ov_path else None, ) # ------------------------------------------------------------------ # Late-Chunk encoder initialisation (optional) # ------------------------------------------------------------------ self.latechunk_enabled = retriever_configs.get("latechunk", {}).get("enabled", False) if self.latechunk_enabled: try: from rag_system.indexing.latechunk import LateChunkEncoder self.latechunk_cfg = retriever_configs["latechunk"] 
self.latechunk_encoder = LateChunkEncoder(model_name=self.config.get("embedding_model_name", "qwen3-embedding-0.6b")) except Exception as e: print(f"⚠️ Failed to initialise LateChunkEncoder: {e}. Disabling latechunk retrieval.") self.latechunk_enabled = False def run(self, file_paths: List[str] | None = None, *, documents: List[str] | None = None): """ Processes and indexes documents based on the pipeline's configuration. Accepts legacy keyword *documents* as an alias for *file_paths* so that older callers (backend/index builder) keep working. """ # Back-compat shim --------------------------------------------------- if file_paths is None and documents is not None: file_paths = documents if file_paths is None: raise TypeError("IndexingPipeline.run() expects 'file_paths' (or alias 'documents') argument") print(f"--- Starting indexing process for {len(file_paths)} files. ---") # Import progress tracking utilities from rag_system.utils.batch_processor import timer, ProgressTracker, estimate_memory_usage with timer("Complete Indexing Pipeline"): # Step 1: Document Processing and Chunking all_chunks = [] doc_chunks_map = {} with timer("Document Processing & Chunking"): file_tracker = ProgressTracker(len(file_paths), "Document Processing") for file_path in file_paths: try: document_id = os.path.basename(file_path) print(f"Processing: {document_id}") pages_data = self.document_converter.convert_to_markdown(file_path) file_chunks = [] for tpl in pages_data: if len(tpl) == 3: markdown_text, metadata, doc_obj = tpl if hasattr(self.chunker, "chunk_document"): chunks = self.chunker.chunk_document(doc_obj, document_id=document_id, metadata=metadata) else: chunks = self.chunker.chunk(markdown_text, document_id, metadata) else: markdown_text, metadata = tpl chunks = self.chunker.chunk(markdown_text, document_id, metadata) file_chunks.extend(chunks) # Add a sequential chunk_index to each chunk within the document for i, chunk in enumerate(file_chunks): if 'metadata' not in chunk: 
chunk['metadata'] = {} chunk['metadata']['chunk_index'] = i # Build and persist document overview (non-blocking errors) try: self.overview_builder.build_and_store(document_id, file_chunks) except Exception as e: print(f" ⚠️ Failed to create overview for {document_id}: {e}") all_chunks.extend(file_chunks) doc_chunks_map[document_id] = file_chunks # save for late-chunk step print(f" Generated {len(file_chunks)} chunks from {document_id}") file_tracker.update(1) except Exception as e: print(f" ❌ Error processing {file_path}: {e}") file_tracker.update(1, errors=1) continue file_tracker.finish() if not all_chunks: print("No text chunks were generated. Skipping indexing.") return print(f"\n✅ Generated {len(all_chunks)} text chunks total.") memory_mb = estimate_memory_usage(all_chunks) print(f"📊 Estimated memory usage: {memory_mb:.1f}MB") retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {}) # Step 3: Optional Contextual Enrichment (before indexing for consistency) enricher_config = self.config.get("contextual_enricher", {}) enricher_enabled = enricher_config.get("enabled", False) print(f"\n🔍 CONTEXTUAL ENRICHMENT DEBUG:") print(f" Config present: {bool(enricher_config)}") print(f" Enabled: {enricher_enabled}") print(f" Has enricher object: {hasattr(self, 'contextual_enricher')}") if hasattr(self, 'contextual_enricher') and enricher_enabled: with timer("Contextual Enrichment"): window_size = enricher_config.get("window_size", 1) print(f"\n🚀 CONTEXTUAL ENRICHMENT ACTIVE!") print(f" Window size: {window_size}") print(f" Model: {self.contextual_enricher.llm_model}") print(f" Batch size: {self.contextual_enricher.batch_size}") print(f" Processing {len(all_chunks)} chunks...") # Show before/after example if all_chunks: print(f" Example BEFORE: '{all_chunks[0]['text'][:100]}...'") # This modifies the 'text' field in each chunk dictionary all_chunks = self.contextual_enricher.enrich_chunks(all_chunks, window_size=window_size) if all_chunks: 
print(f" Example AFTER: '{all_chunks[0]['text'][:100]}...'") print(f"✅ Enriched {len(all_chunks)} chunks with context for indexing.") else: print(f"⚠️ CONTEXTUAL ENRICHMENT SKIPPED:") if not hasattr(self, 'contextual_enricher'): print(f" Reason: No enricher object (config enabled={enricher_enabled})") elif not enricher_enabled: print(f" Reason: Disabled in config") print(f" Chunks will be indexed without contextual enrichment.") # Step 4: Create BM25 Index from enriched chunks (for consistency with vector index) if hasattr(self, 'vector_indexer') and hasattr(self, 'embedding_generator'): with timer("Vector Embedding & Indexing"): table_name = self.config["storage"].get("text_table_name") or retriever_configs.get("dense", {}).get("lancedb_table_name", "default_text_table") print(f"\n--- Generating embeddings with {self.config.get('embedding_model_name')} ---") embeddings = self.embedding_generator.generate(all_chunks) print(f"\n--- Indexing {len(embeddings)} vectors into LanceDB table: {table_name} ---") self.vector_indexer.index(table_name, all_chunks, embeddings) print("✅ Vector embeddings indexed successfully") # Create FTS index on the 'text' field after adding data print(f"\n--- Ensuring Full-Text Search (FTS) index on table '{table_name}' ---") try: tbl = self.lancedb_manager.get_table(table_name) # LanceDB's default index name is "text_idx" while older # revisions of this pipeline used our own name "fts_text". # Guard against both so we don't attempt to create a # duplicate index and trigger a LanceError. 
existing_indices = [idx.name for idx in tbl.list_indices()] if not any(name in existing_indices for name in ("text_idx", "fts_text")): # Use LanceDB default index naming ("text_idx") tbl.create_fts_index( "text", use_tantivy=False, replace=False, ) print("✅ FTS index created successfully (using Lance native FTS).") else: print("ℹ️ FTS index already exists – skipped creation.") except Exception as e: print(f"❌ Failed to create/verify FTS index: {e}") # --------------------------------------------------- # Late-Chunk Embedding + Indexing (optional) # --------------------------------------------------- if self.latechunk_enabled: with timer("Late-Chunk Embedding & Indexing"): lc_table_name = self.latechunk_cfg.get("lancedb_table_name", f"{table_name}_lc") print(f"\n--- Generating late-chunk embeddings (table={lc_table_name}) ---") total_lc_vecs = 0 for doc_id, doc_chunks in doc_chunks_map.items(): # Build full text and span list full_text_parts = [] spans = [] current_pos = 0 for ch in doc_chunks: ch_text = ch["text"] full_text_parts.append(ch_text) start = current_pos end = start + len(ch_text) spans.append((start, end)) current_pos = end + 1 # +1 for newline to join later full_doc = "\n".join(full_text_parts) try: lc_vecs = self.latechunk_encoder.encode(full_doc, spans) except Exception as e: print(f"⚠️ LateChunk encode failed for {doc_id}: {e}") continue if len(doc_chunks) == 0 or len(lc_vecs) == 0: # Nothing to index for this document continue if len(lc_vecs) != len(doc_chunks): print(f"⚠️ Mismatch LC vecs ({len(lc_vecs)}) vs chunks ({len(doc_chunks)}) for {doc_id}. 
Skipping.") continue self.vector_indexer.index(lc_table_name, doc_chunks, lc_vecs) total_lc_vecs += len(lc_vecs) print(f"✅ Late-chunk vectors indexed: {total_lc_vecs}") # Step 6: Knowledge Graph Extraction (Optional) if hasattr(self, 'graph_extractor'): with timer("Knowledge Graph Extraction"): graph_path = retriever_configs.get("graph", {}).get("graph_path", "./index_store/graph/default_graph.gml") print(f"\n--- Building and saving knowledge graph to: {graph_path} ---") graph_data = self.graph_extractor.extract(all_chunks) G = nx.DiGraph() for entity in graph_data['entities']: G.add_node(entity['id'], type=entity.get('type', 'Unknown'), properties=entity.get('properties', {})) for rel in graph_data['relationships']: G.add_edge(rel['source'], rel['target'], label=rel['label']) os.makedirs(os.path.dirname(graph_path), exist_ok=True) nx.write_gml(G, graph_path) print(f"✅ Knowledge graph saved successfully.") print("\n--- ✅ Indexing Complete ---") self._print_final_statistics(len(file_paths), len(all_chunks)) def _print_final_statistics(self, num_files: int, num_chunks: int): """Print final indexing statistics""" print(f"\n📈 Final Statistics:") print(f" Files processed: {num_files}") print(f" Chunks generated: {num_chunks}") print(f" Average chunks per file: {num_chunks/num_files:.1f}") # Component status components = [] if hasattr(self, 'contextual_enricher'): components.append("✅ Contextual Enrichment") if hasattr(self, 'vector_indexer'): components.append("✅ Vector & FTS Index") if hasattr(self, 'graph_extractor'): components.append("✅ Knowledge Graph") print(f" Components: {', '.join(components)}") print(f" Batch sizes: Embeddings={self.embedding_batch_size}, Enrichment={self.enrichment_batch_size}") ================================================ FILE: rag_system/pipelines/retrieval_pipeline.py ================================================ import pymupdf from typing import List, Dict, Any, Tuple, Optional from PIL import Image import concurrent.futures import 
time import json import lancedb import logging import math import numpy as np from threading import Lock from rag_system.utils.ollama_client import OllamaClient from rag_system.retrieval.retrievers import MultiVectorRetriever, GraphRetriever from rag_system.indexing.multimodal import LocalVisionModel from rag_system.indexing.representations import select_embedder from rag_system.indexing.embedders import LanceDBManager from rag_system.rerankers.reranker import QwenReranker from rag_system.rerankers.sentence_pruner import SentencePruner # from rag_system.indexing.chunk_store import ChunkStore import os from PIL import Image # --------------------------------------------------------------------------- # Thread-safety helpers # --------------------------------------------------------------------------- # 1. ColBERT (via `rerankers` lib) is not thread-safe. We protect the actual # `.rank()` call with `_rerank_lock`. _rerank_lock: Lock = Lock() # 2. Loading a large cross-encoder or ColBERT model can easily take >1 GB of # RAM. When multiple sub-queries are processed in parallel they may try to # instantiate the reranker simultaneously, which results in PyTorch meta # tensor errors. We therefore guard the *initialisation* with its own # lock so only one thread carries out the heavy `from_pretrained()` call. _ai_reranker_init_lock: Lock = Lock() # Lock to serialise first-time Provence model load _sentence_pruner_lock: Lock = Lock() class RetrievalPipeline: """ Orchestrates the state-of-the-art multimodal RAG pipeline. 
""" def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_config: Dict[str, Any]): self.config = config self.ollama_config = ollama_config self.ollama_client = ollama_client # Support both legacy "retrievers" key and newer "retrieval" key self.retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {}) self.storage_config = self.config["storage"] # Defer initialization to just-in-time methods self.db_manager = None self.text_embedder = None self.dense_retriever = None self.bm25_retriever = None # Use a private attribute to avoid clashing with the public property self._graph_retriever = None self.reranker = None self.ai_reranker = None def _get_db_manager(self): if self.db_manager is None: # Accept either "db_path" (preferred) or legacy "lancedb_uri" db_path = self.storage_config.get("db_path") or self.storage_config.get("lancedb_uri") if not db_path: raise ValueError("Storage config must contain 'db_path' or 'lancedb_uri'.") self.db_manager = LanceDBManager(db_path=db_path) return self.db_manager def _get_text_embedder(self): if self.text_embedder is None: from rag_system.indexing.representations import select_embedder self.text_embedder = select_embedder( self.config.get("embedding_model_name", "BAAI/bge-small-en-v1.5"), self.ollama_config.get("host") if isinstance(self.ollama_config, dict) else None, ) return self.text_embedder def _get_dense_retriever(self): """Ensure a dense MultiVectorRetriever is always available unless explicitly disabled.""" if self.dense_retriever is None: # If the config explicitly sets dense.enabled to False, respect it if self.retriever_configs.get("dense", {}).get("enabled", True) is False: return None try: db_manager = self._get_db_manager() text_embedder = self._get_text_embedder() fusion_cfg = self.config.get("fusion", {}) self.dense_retriever = MultiVectorRetriever( db_manager, text_embedder, vision_model=None, fusion_config=fusion_cfg, ) except Exception as e: print(f"❌ Failed to 
initialise dense retriever: {e}") self.dense_retriever = None return self.dense_retriever def _get_bm25_retriever(self): if self.bm25_retriever is None and self.retriever_configs.get("bm25", {}).get("enabled"): try: print(f"🔧 Lazily initializing BM25 retriever...") self.bm25_retriever = BM25Retriever( index_path=self.storage_config["bm25_path"], index_name=self.retriever_configs["bm25"]["index_name"] ) print("✅ BM25 retriever initialized successfully") except Exception as e: print(f"❌ Failed to initialize BM25 retriever on demand: {e}") # Keep it None so we don't try again return self.bm25_retriever def _get_graph_retriever(self): if self._graph_retriever is None and self.retriever_configs.get("graph", {}).get("enabled"): self._graph_retriever = GraphRetriever(graph_path=self.storage_config["graph_path"]) return self._graph_retriever def _get_reranker(self): """Initializes the reranker for hybrid search score fusion.""" reranker_config = self.config.get("reranker", {}) # This is for the LanceDB internal reranker, not the AI one. if self.reranker is None and reranker_config.get("type") == "linear_combination": rerank_weight = reranker_config.get("weight", 0.5) self.reranker = lancedb.rerankers.LinearCombinationReranker(weight=rerank_weight) print(f"✅ Initialized LinearCombinationReranker with weight {rerank_weight}") return self.reranker def _get_ai_reranker(self): """Initializes a dedicated AI-based reranker.""" reranker_config = self.config.get("reranker", {}) if self.ai_reranker is None and reranker_config.get("enabled"): # Serialise first-time initialisation so only one thread attempts # to load the (very large) model. Other threads will wait and use # the instance once ready, preventing the meta-tensor crash. 
with _ai_reranker_init_lock: # Another thread may have completed init while we waited if self.ai_reranker is None: try: model_name = reranker_config.get("model_name") strategy = reranker_config.get("strategy", "qwen") if strategy == "rerankers-lib": print(f"🔧 Initialising Answer.AI ColBERT reranker ({model_name}) via rerankers lib…") from rerankers import Reranker self.ai_reranker = Reranker(model_name, model_type="colbert") else: print(f"🔧 Lazily initializing Qwen reranker ({model_name})…") self.ai_reranker = QwenReranker(model_name=model_name) print("✅ AI reranker initialized successfully.") except Exception as e: # Leave as None so the pipeline can proceed without reranking print(f"❌ Failed to initialize AI reranker: {e}") return self.ai_reranker def _get_sentence_pruner(self): if getattr(self, "_sentence_pruner", None) is None: with _sentence_pruner_lock: if getattr(self, "_sentence_pruner", None) is None: self._sentence_pruner = SentencePruner() return self._sentence_pruner def _get_surrounding_chunks_lancedb(self, chunk: Dict[str, Any], window_size: int) -> List[Dict[str, Any]]: """ Retrieves a window of chunks around a central chunk using LanceDB. 
""" db_manager = self._get_db_manager() if not db_manager: return [chunk] # Extract identifiers needed for the query document_id = chunk.get("document_id") chunk_index = chunk.get("chunk_index") # If essential identifiers are missing, return the chunk itself if document_id is None or chunk_index is None or chunk_index == -1: return [chunk] table_name = self.config["storage"]["text_table_name"] try: tbl = db_manager.get_table(table_name) except Exception: # If the table can't be opened, we can't get surrounding chunks return [chunk] # Define the window for the search start_index = max(0, chunk_index - window_size) end_index = chunk_index + window_size # Construct the SQL filter for an efficient metadata-based search sql_filter = f"document_id = '{document_id}' AND chunk_index >= {start_index} AND chunk_index <= {end_index}" try: # Execute a filter-only search, which is very fast on indexed metadata results = tbl.search().where(sql_filter).to_list() # The results must be sorted by chunk_index to maintain logical order results.sort(key=lambda c: c['chunk_index']) # The 'metadata' field is a JSON string and needs to be parsed for res in results: if isinstance(res.get('metadata'), str): try: res['metadata'] = json.loads(res['metadata']) except json.JSONDecodeError: res['metadata'] = {} # Handle corrupted metadata gracefully return results except Exception: # If the query fails for any reason, fall back to the single chunk return [chunk] def _synthesize_final_answer(self, query: str, facts: str, *, event_callback=None) -> str: """Uses a text LLM to synthesize a final answer from extracted facts.""" prompt = f""" You are an AI assistant specialised in answering questions from retrieved context. Context you receive • VERIFIED FACTS – text snippets retrieved from the user's documents. Some may be irrelevant noise. • ORIGINAL QUESTION – the user's actual query. Instructions 1. 
def run(self, query: str, table_name: str = None, window_size_override: Optional[int] = None, event_callback=None) -> Dict[str, Any]:
    """Retrieve, rerank, expand, prune and synthesise an answer for ``query``.

    Stages, in order:
      1. Retrieval through the dense retriever (plus the optional
         late-chunk table), using the LanceDB reranker for score fusion.
      2. Optional late-chunk merging of each hit with its +/-1 neighbours.
      3. Optional AI reranking (``config["reranker"]``).
      4. Optional context-window expansion around each kept chunk.
      5. Optional Provence sentence pruning (``config["provence"]``).
      6. Answer synthesis via ``self._synthesize_final_answer``.

    Args:
        query: The user question.
        table_name: Optional text table to search. When given it is also
            written to ``self.storage_config["text_table_name"]`` so helper
            methods use the same table.
        window_size_override: Overrides ``config["context_window_size"]``
            when not ``None``.
        event_callback: Optional ``callable(event_name, payload_dict)``
            invoked at the start/end of each stage.

    Returns:
        ``{"answer": str, "source_documents": list[dict]}``. The documents
        have ``vector`` and ``_distance`` removed and NaN/Inf scores
        replaced by ``None`` so the result is JSON-serialisable.
    """
    start_time = time.time()
    retrieval_k = self.config.get("retrieval_k", 10)

    logger = logging.getLogger(__name__)
    logger.debug("--- Running Hybrid Search for query '%s' (table=%s) ---", query, table_name or self.storage_config.get("text_table_name"))

    # If a custom table_name is provided, propagate it to storage config so helper methods use it
    if table_name:
        self.storage_config["text_table_name"] = table_name

    if event_callback:
        event_callback("retrieval_started", {})

    # Unified retrieval using the refactored MultiVectorRetriever
    dense_retriever = self._get_dense_retriever()
    # Get the LanceDB reranker for initial score fusion
    lancedb_reranker = self._get_reranker()

    retrieved_docs = []
    if dense_retriever:
        retrieved_docs = dense_retriever.retrieve(
            text_query=query,
            table_name=table_name or self.storage_config["text_table_name"],
            k=retrieval_k,
            reranker=lancedb_reranker,  # Pass the reranker to enable hybrid search
        )

    # ---------------------------------------------------------------
    # Late-Chunk retrieval (optional)
    # ---------------------------------------------------------------
    latechunk_cfg = self.retriever_configs.get("latechunk", {})
    latechunk_enabled = latechunk_cfg.get("enabled")
    if latechunk_enabled:
        lc_table = self.retriever_configs["latechunk"].get("lancedb_table_name")
        # Skip when the dense retriever is disabled or failed to initialise;
        # calling .retrieve on None would only produce a misleading warning.
        if lc_table and dense_retriever:
            try:
                lc_docs = dense_retriever.retrieve(
                    text_query=query,
                    table_name=lc_table,
                    k=retrieval_k,
                    reranker=lancedb_reranker,
                )
                retrieved_docs.extend(lc_docs)
            except Exception as e:
                print(f"⚠️ Late-chunk retrieval failed: {e}")

    if event_callback:
        event_callback("retrieval_done", {"count": len(retrieved_docs)})

    retrieval_time = time.time() - start_time
    logger.debug("Retrieved %s chunks in %.2fs", len(retrieved_docs), retrieval_time)

    # -----------------------------------------------------------
    # LATE-CHUNK MERGING (merge ±1 sub-vector into central hit)
    # -----------------------------------------------------------
    if latechunk_enabled and retrieved_docs:
        merged_count = 0
        for doc in retrieved_docs:
            try:
                meta = doc.get("metadata", {})
                if meta.get("latechunk_merged"):
                    continue  # already processed
                doc_id = doc.get("document_id")
                cidx = doc.get("chunk_index")
                if doc_id is None or cidx is None or cidx == -1:
                    continue

                # Fetch neighbouring late-chunks inside same document (±1)
                siblings = self._get_surrounding_chunks_lancedb(doc, window_size=1)
                # Keep only same document_id and ordered by chunk_index
                siblings = [s for s in siblings if s.get("document_id") == doc_id]
                siblings.sort(key=lambda s: s.get("chunk_index", 0))
                merged_text = " \n".join(s.get("text", "") for s in siblings)
                if merged_text:
                    doc["text"] = merged_text
                    meta["latechunk_merged"] = True
                    merged_count += 1
            except Exception as e:
                print(f"⚠️ Late-chunk merge failed for chunk {doc.get('chunk_id')}: {e}")
        if merged_count:
            print(f"🪄 Late-chunk merging applied to {merged_count} retrieved chunks.")

    # --- AI Reranking Step ---
    ai_reranker = self._get_ai_reranker()
    if ai_reranker and retrieved_docs:
        if event_callback:
            event_callback("rerank_started", {"count": len(retrieved_docs)})
        print(f"\n--- Reranking top {len(retrieved_docs)} docs with AI model... ---")
        start_rerank_time = time.time()

        rerank_cfg = self.config.get("reranker", {})
        top_k_cfg = rerank_cfg.get("top_k")
        top_percent = rerank_cfg.get("top_percent")  # value in range 0–1

        if top_percent is not None:
            try:
                pct = float(top_percent)
                # Explicit check rather than `assert`, which is removed when
                # Python runs with -O and would let bad values through.
                if not 0 < pct <= 1:
                    raise ValueError(f"top_percent out of range: {pct}")
                top_k = max(1, int(len(retrieved_docs) * pct))
            except Exception:
                print("⚠️ Invalid top_percent value; falling back to top_k")
                top_k = top_k_cfg or len(retrieved_docs)
        else:
            top_k = top_k_cfg or len(retrieved_docs)

        strategy = rerank_cfg.get("strategy", "qwen")

        if strategy == "rerankers-lib":
            texts = [d['text'] for d in retrieved_docs]
            # ColBERT's Rust backend isn't Sync; serialise calls.
            with _rerank_lock:
                ranked = ai_reranker.rank(query=query, docs=texts)
            # ranked is RankedResults; convert to list of (score, idx)
            try:
                pairs = [(r.score, r.document.doc_id) for r in ranked.results]
                if any(p[1] is None for p in pairs):
                    # NOTE(review): this uses the position within the ranked
                    # results as the document index - confirm that is the
                    # intended mapping when doc_id is missing.
                    pairs = [(r.score, i) for i, r in enumerate(ranked.results)]
            except Exception:
                pairs = ranked
            # Keep only top_k results if requested
            if top_k is not None and len(pairs) > top_k:
                pairs = pairs[:top_k]
            reranked_docs = [retrieved_docs[idx] | {"rerank_score": score} for score, idx in pairs]
        else:
            try:
                reranked_docs = ai_reranker.rerank(query, retrieved_docs, top_k=top_k)
            except TypeError:
                texts = [d['text'] for d in retrieved_docs]
                pairs = ai_reranker.rank(query, texts, top_k=top_k)
                reranked_docs = [retrieved_docs[idx] | {"rerank_score": score} for score, idx in pairs]

        rerank_time = time.time() - start_rerank_time
        print(f"✅ Reranking completed in {rerank_time:.2f}s. Refined to {len(reranked_docs)} docs.")
        if event_callback:
            event_callback("rerank_done", {"count": len(reranked_docs)})
    else:
        # If no AI reranker, proceed with the initially retrieved docs
        reranked_docs = retrieved_docs

    window_size = self.config.get("context_window_size", 1)
    if window_size_override is not None:
        window_size = window_size_override

    if window_size > 0 and reranked_docs:
        if event_callback:
            event_callback("context_expand_started", {"count": len(reranked_docs)})
        print(f"\n--- Expanding context for {len(reranked_docs)} top documents (window size: {window_size})... ---")
        expanded_chunks = {}
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_chunk = {
                executor.submit(self._get_surrounding_chunks_lancedb, chunk, window_size): chunk
                for chunk in reranked_docs
            }
            for future in concurrent.futures.as_completed(future_to_chunk):
                try:
                    seed_chunk = future_to_chunk[future]
                    surrounding_chunks = future.result()
                    for surrounding_chunk in surrounding_chunks:
                        cid = surrounding_chunk['chunk_id']
                        if cid not in expanded_chunks:
                            expanded_chunks[cid] = surrounding_chunk
                        # Carry the seed's rerank score over even when the
                        # chunk was first stored as another seed's neighbour;
                        # otherwise the outcome depends on completion order.
                        if cid == seed_chunk.get('chunk_id') and 'rerank_score' in seed_chunk:
                            expanded_chunks[cid].setdefault('rerank_score', seed_chunk['rerank_score'])
                except Exception as e:
                    print(f"Error expanding context for a chunk: {e}")

        final_docs = list(expanded_chunks.values())

        # Sort by reranker score if present, otherwise by raw score/distance
        if any('rerank_score' in d for d in final_docs):
            final_docs.sort(key=lambda c: c.get('rerank_score', -1), reverse=True)
        elif any('_distance' in d for d in final_docs):
            # For vector search smaller distance is better
            final_docs.sort(key=lambda c: c.get('_distance', 1e9))
        elif any('score' in d for d in final_docs):
            final_docs.sort(key=lambda c: c.get('score', 0), reverse=True)
        else:
            # Fallback to document order
            final_docs.sort(key=lambda c: (c.get('document_id', ''), c.get('chunk_index', 0)))

        print(f"Expanded to {len(final_docs)} unique chunks for synthesis.")
        if event_callback:
            event_callback("context_expand_done", {"count": len(final_docs)})
    else:
        final_docs = reranked_docs

    # Optionally hide non-reranked chunks: if any chunk carries a
    # `rerank_score`, we assume the caller wants to focus on those.
    if any('rerank_score' in d for d in final_docs):
        final_docs = [d for d in final_docs if 'rerank_score' in d]

    # ------------------------------------------------------------------
    # Sentence-level pruning (Provence)
    # ------------------------------------------------------------------
    prov_cfg = self.config.get("provence", {})
    if prov_cfg.get("enabled"):
        if event_callback:
            event_callback("prune_started", {"count": len(final_docs)})
        thresh = float(prov_cfg.get("threshold", 0.1))
        print(f"\n--- Provence pruning enabled (threshold={thresh}) ---")
        pruner = self._get_sentence_pruner()
        final_docs = pruner.prune_documents(query, final_docs, threshold=thresh)
        # Remove any chunks that were fully pruned (empty text)
        final_docs = [d for d in final_docs if d.get('text', '').strip()]
        if event_callback:
            event_callback("prune_done", {"count": len(final_docs)})

    print("\n--- Final Documents for Synthesis ---")
    if not final_docs:
        print("No documents to synthesize.")
    else:
        for i, doc in enumerate(final_docs):
            print(f" [{i+1}] Chunk ID: {doc.get('chunk_id')}")
            print(f" Score: {doc.get('score', 'N/A')}")
            if 'rerank_score' in doc:
                print(f" Rerank Score: {doc.get('rerank_score'):.4f}")
            print(f" Text: \"{doc.get('text', '').strip()}\"")
    print("------------------------------------")

    if not final_docs:
        return {"answer": "I could not find an answer in the documents.", "source_documents": []}

    # --- Sanitize docs for JSON serialization (no NaN/Inf types) ---
    def _clean_val(v):
        if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
            return None
        if isinstance(v, (np.floating,)):
            try:
                f = float(v)
                if math.isnan(f) or math.isinf(f):
                    return None
                return f
            except Exception:
                return None
        return v

    for doc in final_docs:
        # Remove heavy or internal-only fields before serialising
        doc.pop("vector", None)
        doc.pop("_distance", None)
        # Clean numeric fields (`_distance` was just removed, so it is not listed)
        for key in ('score', 'rerank_score'):
            if key in doc:
                doc[key] = _clean_val(doc[key])

    context = "\n\n".join([doc['text'] for doc in final_docs])

    # 👀 DEBUG: Show the exact context passed to the LLM after pruning
    print("\n=== Context passed to LLM (post-pruning) ===")
    if len(context) > 2000:
        print(context[:2000] + "…\n[truncated] (total {} chars)".format(len(context)))
    else:
        print(context)
    print("=== End of context ===\n")

    final_answer = self._synthesize_final_answer(query, context, event_callback=event_callback)

    return {"answer": final_answer, "source_documents": final_docs}
def update_embedding_model(self, model_name: str):
    """Change the configured embedding model and drop dependent caches.

    Does nothing when ``model_name`` already matches
    ``config["embedding_model_name"]``. Otherwise the new name is stored and
    ``text_embedder`` / ``dense_retriever`` are reset to ``None`` so they are
    rebuilt on next use.
    """
    previous = self.config.get("embedding_model_name")
    if previous != model_name:
        print(f"🔧 RetrievalPipeline switching embedding model to '{model_name}' (was '{previous}')")
        self.config["embedding_model_name"] = model_name
        # Invalidate everything that was built from the old model.
        self.text_embedder = None
        self.dense_retriever = None
def rerank(self, query: str, documents: List[Dict[str, Any]], top_k: int = 5, *, early_exit: bool = True, margin: float = 0.4, min_scored: int = 8, batch_size: int = 8) -> List[Dict[str, Any]]:
    """Score ``documents`` against ``query`` and return the best ``top_k``.

    Documents are first ordered by their upstream ``score`` (missing or
    ``None`` counts as 0.0) so the strongest candidates are scored first,
    then scored in mini-batches of ``batch_size``.

    If *early_exit* is True, scoring stops once the best score so far
    exceeds the worst score so far by at least *margin*. This only happens
    after at least ``max(min_scored, top_k)`` documents have been scored, so
    the shortcut never returns fewer than ``top_k`` results while unscored
    candidates remain. With ``top_k=None`` every document is scored.

    Args:
        query: The search query.
        documents: Dicts with at least a ``"text"`` key.
        top_k: Maximum number of documents to return.
        early_exit, margin, min_scored: Early-exit controls (see above).
        batch_size: Number of (query, text) pairs per forward pass.

    Returns:
        Shallow copies of the selected documents, each with an added
        ``"rerank_score"``, ordered by descending score. The input dicts
        are not modified.
    """
    if not documents:
        return []

    def _upstream_score(doc: Dict[str, Any]) -> float:
        # A stored score of None must not break the sort.
        value = doc.get('score', 0.0)
        return 0.0 if value is None else value

    # Sort by the upstream (hybrid) score so that the strongest candidates are evaluated first.
    docs_sorted = sorted(documents, key=_upstream_score, reverse=True)

    # Minimum number of scored documents before the early exit may fire.
    if top_k is None:
        required_before_exit = len(docs_sorted)
    else:
        required_before_exit = max(min_scored, top_k)

    scored_pairs: List[tuple] = []
    with torch.no_grad():
        for start in range(0, len(docs_sorted), batch_size):
            batch_docs = docs_sorted[start : start + batch_size]
            batch_pairs = [[query, d['text']] for d in batch_docs]
            inputs = self.tokenizer(
                batch_pairs,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            ).to(self.device)
            logits = self.model(**inputs).logits.view(-1)
            batch_scores = logits.float().cpu().tolist()
            scored_pairs.extend(zip(batch_scores, batch_docs))

            # --- Early-exit check ---
            if early_exit and len(scored_pairs) >= required_before_exit:
                # Current best and worst among *already* scored docs
                best_score = max(pair[0] for pair in scored_pairs)
                worst_score = min(pair[0] for pair in scored_pairs)
                if best_score - worst_score >= margin:
                    break

    # Sort final set and attach scores
    sorted_by_score = sorted(scored_pairs, key=lambda x: x[0], reverse=True)
    reranked_docs: List[Dict[str, Any]] = []
    for score, doc in sorted_by_score[:top_k]:
        doc_with_score = doc.copy()
        doc_with_score['rerank_score'] = score
        reranked_docs.append(doc_with_score)
    return reranked_docs
documents = [ {'text': "Paris is the capital of France.", 'metadata': {'doc_id': 'a'}}, {'text': "The Eiffel Tower is in Paris.", 'metadata': {'doc_id': 'b'}}, {'text': "France is a country in Europe.", 'metadata': {'doc_id': 'c'}}, ] reranked_documents = reranker.rerank(query, documents) print("\n--- Verification ---") print(f"Query: {query}") print("Reranked documents:") for doc in reranked_documents: print(f" - Score: {doc['rerank_score']:.4f}, Text: {doc['text']}") except Exception as e: print(f"\nAn error occurred during the QwenReranker test: {e}") print("Please ensure you have an internet connection for model downloads.") ================================================ FILE: rag_system/rerankers/sentence_pruner.py ================================================ from __future__ import annotations """Sentence-level context pruning using the Provence model (ICLR 2025). This lightweight helper wraps the HuggingFace model hosted at `naver/provence-reranker-debertav3-v1` and exposes a thread-safe `prune_documents()` method that converts a list of RAG chunks into their pruned variants. The module fails gracefully – if the model weights cannot be downloaded (or the `transformers` / `nltk` deps are missing) we simply return the original documents unchanged so the upstream pipeline continues unaffected. 
class SentencePruner:
    """Lightweight singleton wrapper around the Provence model.

    The loaded model is stored on the class (``_model``) and shared by all
    instances. If loading fails, ``_model`` stays ``None`` and
    ``prune_documents`` returns its input unchanged.
    """

    _model = None  # shared across all instances
    _init_lock: Lock = Lock()  # serialises the first-time model load

    def __init__(self, model_name: str = "naver/provence-reranker-debertav3-v1") -> None:
        """Remember ``model_name`` and make sure the shared model is loaded."""
        self.model_name = model_name
        self._ensure_model()

    # ---------------------------------------------------------------------
    # Internal helpers
    # ---------------------------------------------------------------------
    def _ensure_model(self) -> None:
        """Load the Provence model into the class-level slot if it is empty.

        The check is repeated after acquiring ``_init_lock`` so that a thread
        which waited on the lock does not load the model a second time.
        Load failures are reported on stdout and leave ``_model`` as ``None``.
        """
        if SentencePruner._model is not None:
            return

        with SentencePruner._init_lock:
            if SentencePruner._model is not None:
                return  # another thread beat us
            try:
                from transformers import AutoModel  # local import to keep base deps light

                print("🔧 Loading Provence sentence-pruning model …")
                SentencePruner._model = AutoModel.from_pretrained(
                    self.model_name,
                    trust_remote_code=True,
                )
                print("✅ Provence model loaded successfully.")
            except Exception as e:
                # Any failure leaves the singleton as None so callers can skip pruning.
                print(f"❌ Failed to load Provence model: {e}. Context pruning will be skipped.")
                SentencePruner._model = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def prune_documents(
        self,
        question: str,
        docs: List[Dict[str, Any]],
        *,
        threshold: float = 0.1,
    ) -> List[Dict[str, Any]]:
        """Return *docs* with their `text` field pruned sentence-wise.

        Args:
            question: The query the documents are pruned against.
            docs: Chunks with a ``"text"`` key (missing text counts as "").
            threshold: Passed through to the model's ``process`` call.

        Returns:
            New dicts carrying the pruned ``"text"``; the input dicts are not
            modified. When the model is unavailable the input list is
            returned as-is, and an empty input yields ``[]`` without calling
            the model. If the batched call fails or returns an unexpected
            number of results, each document is processed individually and
            documents that still fail are passed through unchanged.
        """
        if SentencePruner._model is None:
            return docs  # model unavailable – no-op

        if not docs:
            # Nothing to prune; avoid a pointless batched call on [[]].
            return []

        # Batch texts for efficiency when >1 doc
        texts = [d.get("text", "") for d in docs]

        try:
            if len(texts) == 1:
                # returns dict
                outputs = [SentencePruner._model.process(question, texts[0], threshold=threshold)]
            else:
                # Batch call expects list[list[str]] with same outer length as questions list (1)
                batched_out = SentencePruner._model.process(question, [texts], threshold=threshold)
                # HF returns List[Dict] per question
                outputs = batched_out[0] if isinstance(batched_out, list) else batched_out
                if isinstance(outputs, dict):
                    outputs = [outputs]
                if len(outputs) != len(texts):
                    print("⚠️ Provence batch size mismatch; falling back to per-doc loop")
                    raise ValueError

            pruned: List[Dict[str, Any]] = []
            for doc, out in zip(docs, outputs):
                raw = out.get("pruned_context", doc.get("text", "")) if isinstance(out, dict) else doc.get("text", "")
                new_text = raw if isinstance(raw, str) else " ".join(raw)  # HF model may return a list of sentences
                pruned.append({**doc, "text": new_text})
        except Exception as e:
            print(f"⚠️ Provence batch pruning failed ({e}); falling back to individual calls")
            pruned = []
            for doc in docs:
                text = doc.get("text", "")
                if not text:
                    pruned.append(doc)
                    continue
                try:
                    res = SentencePruner._model.process(question, text, threshold=threshold)
                    raw = res.get("pruned_context", text) if isinstance(res, dict) else text
                    new_text = raw if isinstance(raw, str) else " ".join(raw)
                    pruned.append({**doc, "text": new_text})
                except Exception as err:
                    print(f"⚠️ Provence pruning failed for chunk {doc.get('chunk_id')}: {err}")
                    pruned.append(doc)

        return pruned
rag_system.utils.ollama_client import OllamaClient class QueryDecomposer: def __init__(self, llm_client: OllamaClient, llm_model: str): self.llm_client = llm_client self.llm_model = llm_model def decompose(self, query: str, chat_history: List[Dict[str, Any]] | None = None) -> List[str]: """Decompose *query* into standalone sub-queries. Parameters ---------- query : str The latest user message. chat_history : list[dict] | None Recent conversation turns (each item should contain at least the original user query under the key ``"query"``). Only the **last 5** turns are included to keep the prompt short. """ # ---- Limit history to last 5 user turns and extract the queries ---- history_snippets: List[str] = [] if chat_history: # Keep only the last 5 turns recent_turns = chat_history[-5:] # Extract user queries (fallback: full dict as string if key missing) for turn in recent_turns: history_snippets.append(str(turn.get("query", turn))) # Serialize chat_history for the prompt (single string) chat_history_text = " | ".join(history_snippets) # ---- Build the new SYSTEM prompt with added legacy examples ---- system_prompt = """ You are an expert at query decomposition for a Retrieval-Augmented Generation (RAG) system. Return one RFC-8259-compliant JSON object and nothing else. Schema: { “requires_decomposition”: , “reasoning”: , // ≤ 50 words “resolved_query”: , // query after context resolution “sub_queries”: // 1–10 standalone items } Think step-by-step internally, but reveal only the concise reasoning. ⸻ Context Resolution (perform FIRST) You will receive: • query – the current user message • chat_history – the most recent user turns (may be empty) If query contains pronouns, ellipsis, or shorthand that can be unambiguously linked to something in chat_history, rewrite it to a fully self-contained question and place the result in resolved_query. Otherwise, copy query into resolved_query unchanged. ⸻ When is decomposition REQUIRED? 
• MULTI-PART questions joined by “and”, “or”, “also”, list commas, etc. • COMPARATIVE / SUPERLATIVE questions (two or more entities, e.g. “bigger, better, fastest”). • TEMPORAL / SEQUENTIAL questions (changes over time, event timelines). • ENUMERATIONS (pros, cons, impacts). • ENTITY-SET COMPARISONS (A, B, C revenue…). When is decomposition NOT REQUIRED? • A single, factual information need. • Ambiguous queries needing clarification rather than splitting. ⸻ Output rules 1. Use resolved_query—not the raw query—to decide on decomposition. 2. If requires_decomposition is false, sub_queries must contain exactly resolved_query. 3. Otherwise, produce 2–10 self-contained questions; avoid pronouns and shared context. ⸻ """ # ---- Append NEW examples provided by the user ---- new_examples = """ Normalise pronouns and references: turn “this paper” into the explicit title if it can be inferred, otherwise leave as-is. chat_history: “What is the email address of the computer vision consultants?” query: “What is their revenue?” { "requires_decomposition": false, "reasoning": "Pronoun resolved; single information need.", "resolved_query": "What is the revenue of the computer vision consultants?", "sub_queries": [ "What is the revenue of the computer vision consultants?" ] } Context resolution (single info need) chat_history: “What is the email address of the computer vision consultants?” query: “What is the address?” { "requires_decomposition": false, "reasoning": "Pronoun resolved; single information need.", "resolved_query": "What is the physical address of the computer vision consultants?", "sub_queries": [ "What is the physical address of the computer vision consultants?" 
] } Context resolution (single info need) chat_history: “ComputeX has a revenue of 100M?” query: “Who is the CEO?” { "requires_decomposition": false, "reasoning": "entities normalization.", "resolved_query": "who is the CEO of ComputeX", "sub_queries": [ "who is the CEO of ComputeX" ] } No unique antecedent → leave unresolved chat_history: “Tell me about the paper.” query: “What is the address?” { "requires_decomposition": false, "reasoning": "Ambiguous reference; cannot resolve safely.", "resolved_query": "What is the address?", "sub_queries": ["What is the address?"] } Temporal + Comparative chat_history: "" query: “How did Nvidia’s 2024 revenue compare with 2023?” { "requires_decomposition": true, "reasoning": "Needs revenue for two separate years before comparison.", "resolved_query": "How did Nvidia’s 2024 revenue compare with 2023?", "sub_queries": [ "What was Nvidia’s revenue in 2024?", "What was Nvidia’s revenue in 2023?" ] } Enumeration (pros / cons / cost) chat_history: "" query: “List the pros, cons, and estimated implementation cost of adopting a vector database.” { "requires_decomposition": true, "reasoning": "Three distinct information needs: pros, cons, cost.", "resolved_query": "List the pros, cons, and estimated implementation cost of adopting a vector database.", "sub_queries": [ "What are the pros of adopting a vector database?", "What are the cons of adopting a vector database?", "What is the estimated implementation cost of adopting a vector database?" ] } Entity-set comparison (multiple companies) chat_history: "" query: “How did Nvidia, AMD, and Intel perform in Q2 2025 in terms of revenue?” { "requires_decomposition": true, "reasoning": "Need revenue for each of three entities before comparison.", "resolved_query": "How did Nvidia, AMD, and Intel perform in Q2 2025 in terms of revenue?", "sub_queries": [ "What was Nvidia's revenue in Q2 2025?", "What was AMD's revenue in Q2 2025?", "What was Intel's revenue in Q2 2025?" 
] } Multi-part question (limitations + mitigations) chat_history: "" query: “What are the limitations of GPT-4o and what are the recommended mitigations?” { "requires_decomposition": true, "reasoning": "Two distinct pieces of information: limitations and mitigations.", "resolved_query": "What are the limitations of GPT-4o and what are the recommended mitigations?", "sub_queries": [ "What are the known limitations of GPT-4o?", "What are the recommended mitigations for the limitations of GPT-4o?" ] } """ # ---- Append legacy examples that already existed in the old prompt ---- legacy_examples_header = """ ⸻ Additional legacy examples """ legacy_examples_body = """ **Example 1: Multi-Part Query** Query: "What were the main findings of the aiconfig report and how do they compare to the results from the RAG paper?" JSON Output: { "reasoning": "The query asks for two distinct pieces of information: the findings from one report and a comparison to another. This requires two separate retrieval steps.", "sub_queries": [ "What were the main findings of the aiconfig report?", "How do the findings of the aiconfig report compare to the results from the RAG paper?" ] } **Example 2: Simple Query** Query: "Summarize the contributions of the DeepSeek-V3 paper." JSON Output: { "reasoning": "This is a direct request for a summary of a single document and does not contain multiple parts.", "sub_queries": [ "Summarize the contributions of the DeepSeek-V3 paper." ] } **Example 3: Comparative Query** Query: "Did Microsoft or Google make more money last year?" JSON Output: { "reasoning": "This is a comparative query that requires fetching the profit for each company before a comparison can be made.", "sub_queries": [ "How much profit did Microsoft make last year?", "How much profit did Google make last year?" ] } **Example 4: Comparative Query with different phrasing** Query: "Who has more siblings, Jamie or Sansa?" 
JSON Output: { "reasoning": "This comparative query needs the sibling count for both individuals to be answered.", "sub_queries": [ "How many siblings does Jamie have?", "How many siblings does Sansa have?" ] } """ full_prompt = ( system_prompt + new_examples # + legacy_examples_header # + legacy_examples_body + """ ⸻ Now process Input payload: """ + json.dumps({"query": query, "chat_history": chat_history_text}, indent=2) + """ """ ) # ---- Call the LLM ---- response = self.llm_client.generate_completion(self.llm_model, full_prompt, format="json") response_text = response.get('response', '{}') try: # Handle potential markdown code blocks in the response if response_text.strip().startswith("```json"): response_text = response_text.strip()[7:-3].strip() data = json.loads(response_text) sub_queries = data.get('sub_queries') or [query] reasoning = data.get('reasoning', 'No reasoning provided.') print(f"Query Decomposition Reasoning: {reasoning}") # Fallback: ensure at least the resolved_query if sub_queries empty if not sub_queries: sub_queries = [data.get('resolved_query', query)] # Deduplicate while preserving order sub_queries = list(dict.fromkeys(sub_queries)) # Enforce 10 sub-query limit per new requirements return sub_queries[:10] except json.JSONDecodeError: print(f"Failed to decode JSON from query decomposer: {response_text}") return [query] class HyDEGenerator: def __init__(self, llm_client: OllamaClient, llm_model: str): self.llm_client = llm_client self.llm_model = llm_model def generate(self, query: str) -> str: prompt = f"Generate a short, hypothetical document that answers the following question. 
class GraphQueryTranslator:
    """Translates a natural-language question into a structured graph query via an LLM."""

    def __init__(self, llm_client: "OllamaClient", llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model

    def _generate_translation_prompt(self, query: str) -> str:
        """Build the prompt asking the model for a ``start_node`` / ``edge_label`` JSON object."""
        return f"""
You are an expert query planner. Convert the user's question into a structured JSON query for a knowledge graph.
The JSON should contain a 'start_node' (the known entity in the query) and an 'edge_label' (the relationship being asked about).
The graph has nodes (entities) and directed edges (relationships). For example, (Tim Cook) -[IS_CEO_OF]-> (Apple).

Return ONLY the JSON object.

User Question: "{query}"

JSON Output:
"""

    def translate(self, query: str) -> Dict[str, Any]:
        """Return the model's graph query as a dict.

        Returns an empty dict when the model output is missing, is not valid
        JSON, or is valid JSON of another type (list, string, number), so
        callers can always treat the result as a mapping.
        """
        prompt = self._generate_translation_prompt(query)
        response = self.llm_client.generate_completion(self.llm_model, prompt, format="json")
        # "response" may be absent (client error → {}) or explicitly None.
        raw = (response or {}).get('response') or '{}'
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {}
        return parsed if isinstance(parsed, dict) else {}
from fuzzywuzzy import process class GraphRetriever: def __init__(self, graph_path: str): self.graph = nx.read_gml(graph_path) def retrieve(self, query: str, k: int = 5, score_cutoff: int = 80) -> List[Dict[str, Any]]: print(f"\n--- Performing Graph Retrieval for query: '{query}' ---") query_parts = query.split() entities = [] for part in query_parts: match = process.extractOne(part, self.graph.nodes(), score_cutoff=score_cutoff) if match and isinstance(match[0], str): entities.append(match[0]) retrieved_docs = [] for entity in set(entities): for neighbor in self.graph.neighbors(entity): retrieved_docs.append({ 'chunk_id': f"graph_{entity}_{neighbor}", 'text': f"Entity: {entity}, Neighbor: {neighbor}", 'score': 1.0, 'metadata': {'source': 'graph'} }) print(f"Retrieved {len(retrieved_docs)} documents from the graph.") return retrieved_docs[:k] # region === MultiVectorRetriever === class MultiVectorRetriever: """ Performs hybrid (vector + FTS) or vector-only retrieval. """ def __init__(self, db_manager: LanceDBManager, text_embedder: QwenEmbedder, vision_model: LocalVisionModel = None, *, fusion_config: Dict[str, Any] | None = None): self.db_manager = db_manager self.text_embedder = text_embedder self.vision_model = vision_model self.fusion_config = fusion_config or {"method": "linear", "bm25_weight": 0.5, "vec_weight": 0.5} # Lightweight in-memory LRU cache for single-query embeddings (256 entries) @lru_cache(maxsize=256) def _embed_single(q: str): return self.text_embedder.create_embeddings([q])[0] self._embed_single = _embed_single def retrieve(self, text_query: str, table_name: str, k: int, reranker=None) -> List[Dict[str, Any]]: """ Performs a search on a single LanceDB table. If a reranker is provided, it performs a hybrid search. Otherwise, it performs a standard vector search. 
""" print(f"\n--- Performing Retrieval for query: '{text_query}' on table '{table_name}' ---") try: if table_name is None: table_name = "default_text_table" tbl = self.db_manager.get_table(table_name) # Create / fetch cached text embedding for the query text_query_embedding = self._embed_single(text_query) logger = logging.getLogger(__name__) # Always perform hybrid lexical + vector search logger.debug( "Running hybrid search on table '%s' (k=%s, have_reranker=%s)", table_name, k, bool(reranker), ) if reranker: logger.debug("Hybrid + reranker path not yet implemented with manual fusion; proceeding without extra reranker.") # Manual two-leg hybrid: take half from each modality fts_k = k // 2 vec_k = k - fts_k # Run FTS and vector search in parallel to cut latency def _run_fts(): # Very short queries often underperform → add fuzzy wildcard fts_query = text_query if len(text_query.split()) == 1: fts_query = f"{text_query}* OR {text_query}~" return ( tbl.search(query=fts_query, query_type="fts") .limit(fts_k) .to_df() ) def _run_vec(): if vec_k == 0: return None return ( tbl.search(text_query_embedding) .limit(vec_k * 2) # fetch extra to allow for dedup .to_df() ) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: fts_future = executor.submit(_run_fts) vec_future = executor.submit(_run_vec) fts_df = fts_future.result() vec_df = vec_future.result() if vec_df is not None: combined = pd.concat([fts_df, vec_df]) else: combined = fts_df # Remove duplicates preserving first occurrence, then trim to k dedup_subset = ["_rowid"] if "_rowid" in combined.columns else (["chunk_id"] if "chunk_id" in combined.columns else None) if dedup_subset: combined = combined.drop_duplicates(subset=dedup_subset, keep="first") combined = combined.head(k) results_df = combined logger.debug( "Hybrid (fts=%s, vec=%s) → %s unique chunks", len(fts_df), 0 if vec_df is None else len(vec_df), len(results_df), ) retrieved_docs = [] for _, row in results_df.iterrows(): metadata = 
json.loads(row.get('metadata', '{}')) # Add top-level fields back into metadata for consistency if they don't exist metadata.setdefault('document_id', row.get('document_id')) metadata.setdefault('chunk_index', row.get('chunk_index')) # Determine score (vector distance or FTS). Replace NaN with 0.0 raw_score = row.get('_distance') if '_distance' in row else row.get('score') try: if raw_score is None or (isinstance(raw_score, float) and math.isnan(raw_score)): raw_score = 0.0 except Exception: raw_score = 0.0 combined_score = raw_score # Optional linear-weight fusion if both FTS & vector scores exist if '_distance' in row and 'score' in row: try: bm25 = row.get('score', 0.0) vec_sim = 1.0 / (1.0 + row.get('_distance', 1.0)) # convert distance to similarity w_bm25 = float(self.fusion_config.get('bm25_weight', 0.5)) w_vec = float(self.fusion_config.get('vec_weight', 0.5)) combined_score = w_bm25 * bm25 + w_vec * vec_sim except Exception: pass retrieved_docs.append({ 'chunk_id': row.get('chunk_id'), 'text': metadata.get('original_text', row.get('text')), 'score': combined_score, 'bm25': row.get('score'), '_distance': row.get('_distance'), 'document_id': row.get('document_id'), 'chunk_index': row.get('chunk_index'), 'metadata': metadata }) logger.debug("Hybrid search returned %s results", len(retrieved_docs)) log_retrieval_results(retrieved_docs, k) print(f"Retrieved {len(retrieved_docs)} documents.") return retrieved_docs except Exception as e: print(f"Could not search table '{table_name}': {e}") return [] # endregion if __name__ == '__main__': print("retrievers.py updated for LanceDB FTS Hybrid Search.") ================================================ FILE: rag_system/utils/batch_processor.py ================================================ import time import logging from typing import List, Dict, Any, Callable, Optional, Iterator from contextlib import contextmanager import gc # Set up logging logging.basicConfig(level=logging.INFO) logger = 
class ProgressTracker:
    """Tracks progress and performance metrics for batch operations.

    Counts processed items and errors, and logs a throughput / ETA line at most
    once every ``report_interval`` seconds.
    """

    def __init__(self, total_items: int, operation_name: str = "Processing"):
        self.total_items = total_items
        self.operation_name = operation_name
        self.processed_items = 0
        self.errors_encountered = 0
        self.start_time = time.time()
        self.last_report_time = time.time()
        self.report_interval = 10  # Report every 10 seconds

    def update(self, items_processed: int, errors: int = 0):
        """Update progress with number of items processed (and errors seen)."""
        self.processed_items += items_processed
        self.errors_encountered += errors

        current_time = time.time()
        if current_time - self.last_report_time >= self.report_interval:
            self._report_progress()
            self.last_report_time = current_time

    def _report_progress(self):
        """Log current progress, throughput and estimated time remaining."""
        elapsed = time.time() - self.start_time
        if elapsed <= 0:
            return

        rate = self.processed_items / elapsed
        # Clamp so an over-reporting caller cannot produce a negative ETA.
        remaining = max(self.total_items - self.processed_items, 0)
        eta = remaining / rate if rate > 0 else 0
        # A tracker created for zero items has nothing left to do; report it as
        # complete instead of dividing by zero.
        if self.total_items > 0:
            progress_pct = (self.processed_items / self.total_items) * 100
        else:
            progress_pct = 100.0

        logger.info(
            f"{self.operation_name}: {self.processed_items}/{self.total_items} "
            f"({progress_pct:.1f}%) - {rate:.2f} items/sec - "
            f"ETA: {eta/60:.1f}min - Errors: {self.errors_encountered}"
        )

    def finish(self):
        """Report final statistics"""
        elapsed = time.time() - self.start_time
        rate = self.processed_items / elapsed if elapsed > 0 else 0
        logger.info(
            f"{self.operation_name} completed: {self.processed_items}/{self.total_items} items "
            f"in {elapsed:.2f}s ({rate:.2f} items/sec) - {self.errors_encountered} errors"
        )
class StreamingProcessor:
    """Runs a callable over items one by one, keeping memory pressure low."""

    def __init__(self, enable_gc_interval: int = 100):
        # Number of items between forced garbage collections; a falsy value disables them.
        self.enable_gc_interval = enable_gc_interval

    def process_streaming(
        self,
        items: List[Any],
        process_func: Callable,
        operation_name: str = "Streaming Processing",
        **kwargs
    ) -> List[Any]:
        """
        Apply ``process_func`` to every item individually and collect the results.

        Args:
            items: List of items to process
            process_func: Callable invoked as ``process_func(item, **kwargs)``
            operation_name: Label used in progress and timing log lines
            **kwargs: Extra keyword arguments forwarded to ``process_func``

        Returns:
            Results of the successful calls, in input order. Items whose call
            raised are logged, counted as errors and left out of the result.
        """
        if not items:
            logger.info(f"{operation_name}: No items to process")
            return []

        progress = ProgressTracker(len(items), operation_name)
        collected = []

        logger.info(f"Starting {operation_name} for {len(items)} items (streaming)")

        with timer(f"{operation_name} (streaming)"):
            for position, element in enumerate(items, start=1):
                try:
                    collected.append(process_func(element, **kwargs))
                    progress.update(1)
                except Exception as exc:
                    # Log with the zero-based index and move on to the next item.
                    logger.error(f"Error processing item {position - 1}: {exc}")
                    progress.update(1, errors=1)
                    continue

                # Periodic garbage collection (only after successful items)
                if self.enable_gc_interval and position % self.enable_gc_interval == 0:
                    gc.collect()

        progress.finish()
        return collected
processor.process_in_batches( test_items, dummy_process_func, "Test Processing" ) print(f"Processed {len(results)} items") ================================================ FILE: rag_system/utils/logging_utils.py ================================================ import logging from typing import List, Dict from textwrap import shorten logger = logging.getLogger("rag-system") # Global log format – only set if user has not configured logging if not logger.handlers: logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s", ) def log_query(query: str, sub_queries: List[str] | None = None) -> None: """Emit a nicely-formatted block describing the incoming query and any decomposition.""" border = "=" * 60 logger.info("\n%s\nUSER QUERY: %s", border, query) if sub_queries: for i, q in enumerate(sub_queries, 1): logger.info(" sub-%d → %s", i, q) logger.info("%s", border) def log_retrieval_results(results: List[Dict], k: int) -> None: """Show chunk_id, truncated text and score for the first *k* rows.""" if not results: logger.info("Retrieval returned 0 documents.") return logger.info("Top %d results:", min(k, len(results))) header = f"{'chunk_id':<14} {'score':<7} preview" logger.info(header) logger.info("-" * len(header)) for row in results[:k]: preview = shorten(row.get("text", ""), width=60, placeholder="…") logger.info("%s %-7.3f %s", str(row.get("chunk_id"))[:12], row.get("score", 0.0), preview) ================================================ FILE: rag_system/utils/ollama_client.py ================================================ import requests import json from typing import List, Dict, Any import base64 from io import BytesIO from PIL import Image import httpx, asyncio class OllamaClient: """ An enhanced client for Ollama that now handles image data for VLM models. 
""" def __init__(self, host: str = "http://localhost:11434"): self.host = host self.api_url = f"{host}/api" # (Connection check remains the same) def _image_to_base64(self, image: Image.Image) -> str: """Converts a Pillow Image to a base64 string.""" buffered = BytesIO() image.save(buffered, format="PNG") return base64.b64encode(buffered.getvalue()).decode('utf-8') def generate_embedding(self, model: str, text: str) -> List[float]: try: response = requests.post( f"{self.api_url}/embeddings", json={"model": model, "prompt": text} ) response.raise_for_status() return response.json().get("embedding", []) except requests.exceptions.RequestException as e: print(f"Error generating embedding: {e}") return [] def generate_completion( self, model: str, prompt: str, *, format: str = "", images: List[Image.Image] | None = None, enable_thinking: bool | None = None, ) -> Dict[str, Any]: """ Generates a completion, now with optional support for images. Args: model: The name of the generation model (e.g., 'llava', 'qwen-vl'). prompt: The text prompt for the model. format: The format for the response, e.g., "json". images: A list of Pillow Image objects to send to the VLM. enable_thinking: Optional flag to disable chain-of-thought for Qwen models. 
""" try: payload = { "model": model, "prompt": prompt, "stream": False } if format: payload["format"] = format if images: payload["images"] = [self._image_to_base64(img) for img in images] # Optional: disable thinking mode for Qwen3 / DeepSeek models if enable_thinking is not None: payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking} response = requests.post( f"{self.api_url}/generate", json=payload ) response.raise_for_status() response_lines = response.text.strip().split('\n') final_response = json.loads(response_lines[-1]) return final_response except requests.exceptions.RequestException as e: print(f"Error generating completion: {e}") return {} # ------------------------------------------------------------- # Async variant – uses httpx so the caller can await multiple # LLM calls concurrently (triage, verification, etc.). # ------------------------------------------------------------- async def generate_completion_async( self, model: str, prompt: str, *, format: str = "", images: List[Image.Image] | None = None, enable_thinking: bool | None = None, timeout: int = 60, ) -> Dict[str, Any]: """Asynchronous version of generate_completion using httpx.""" payload = {"model": model, "prompt": prompt, "stream": False} if format: payload["format"] = format if images: payload["images"] = [self._image_to_base64(img) for img in images] if enable_thinking is not None: payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking} try: async with httpx.AsyncClient(timeout=timeout) as client: resp = await client.post(f"{self.api_url}/generate", json=payload) resp.raise_for_status() return json.loads(resp.text.strip().split("\n")[-1]) except (httpx.HTTPError, asyncio.CancelledError) as e: print(f"Async Ollama completion error: {e}") return {} # ------------------------------------------------------------- # Streaming variant – yields token chunks in real time # ------------------------------------------------------------- def stream_completion( self, 
model: str, prompt: str, *, images: List[Image.Image] | None = None, enable_thinking: bool | None = None, ): """Generator that yields partial *response* strings as they arrive. Example: for tok in client.stream_completion("qwen2", "Hello"): print(tok, end="", flush=True) """ payload: Dict[str, Any] = {"model": model, "prompt": prompt, "stream": True} if images: payload["images"] = [self._image_to_base64(img) for img in images] if enable_thinking is not None: payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking} with requests.post(f"{self.api_url}/generate", json=payload, stream=True) as resp: resp.raise_for_status() for raw_line in resp.iter_lines(): if not raw_line: # Keep-alive newline continue try: data = json.loads(raw_line.decode()) except json.JSONDecodeError: continue # The Ollama streaming API sends objects like {"response":"Hi","done":false} chunk = data.get("response", "") if chunk: yield chunk if data.get("done"): break if __name__ == '__main__': # This test now requires a VLM model like 'llava' or 'qwen-vl' to be pulled. print("Ollama client updated for multimodal (VLM) support.") try: client = OllamaClient() # Create a dummy black image for testing dummy_image = Image.new('RGB', (100, 100), 'black') # Test VLM completion vlm_response = client.generate_completion( model="llava", # Make sure you have run 'ollama pull llava' prompt="What color is this image?", images=[dummy_image] ) if vlm_response and 'response' in vlm_response: print("\n--- VLM Test Response ---") print(vlm_response['response']) else: print("\nFailed to get VLM response. 
def validate_configuration_consistency():
    """Validate that all configurations are consistent.

    Compares the embedding, reranker and vision model names declared in
    ``PIPELINE_CONFIGS`` with those in ``EXTERNAL_MODELS``.

    Returns:
        A list of human-readable error strings; empty when everything agrees.
        Every mismatch is reported (not just the first one per section), and a
        missing setting is reported as an error instead of raising ``KeyError``.
    """
    print_header("CONFIGURATION CONSISTENCY VALIDATION")

    errors = []
    default_cfg = PIPELINE_CONFIGS.get("default") or {}
    fast_cfg = PIPELINE_CONFIGS.get("fast") or {}

    def _report(section_errors, ok_message):
        # Collect the section's findings, or confirm the section is clean.
        if section_errors:
            errors.extend(section_errors)
        else:
            print(ok_message)

    # 1. Check embedding model consistency
    print_section("Embedding Model Consistency")
    default_embedding = default_cfg.get("embedding_model_name")
    external_embedding = EXTERNAL_MODELS.get("embedding_model")
    fast_embedding = fast_cfg.get("embedding_model_name")

    print(f"Default Config: {default_embedding}")
    print(f"External Models: {external_embedding}")
    print(f"Fast Config: {fast_embedding}")

    embedding_errors = []
    if default_embedding is None:
        embedding_errors.append("❌ Embedding model is not configured in the default config")
    # Independent checks: one mismatch must not hide the other.
    if default_embedding != external_embedding:
        embedding_errors.append(f"❌ Embedding model mismatch: default={default_embedding}, external={external_embedding}")
    if default_embedding != fast_embedding:
        embedding_errors.append(f"❌ Embedding model mismatch: default={default_embedding}, fast={fast_embedding}")
    _report(embedding_errors, "✅ Embedding models are consistent")

    # 2. Check reranker model consistency
    print_section("Reranker Model Consistency")
    default_reranker = (default_cfg.get("reranker") or {}).get("model_name")
    external_reranker = EXTERNAL_MODELS.get("reranker_model")

    print(f"Default Config: {default_reranker}")
    print(f"External Models: {external_reranker}")

    reranker_errors = []
    if default_reranker is None:
        reranker_errors.append("❌ Reranker model is not configured in the default config")
    if default_reranker != external_reranker:
        reranker_errors.append(f"❌ Reranker model mismatch: default={default_reranker}, external={external_reranker}")
    _report(reranker_errors, "✅ Reranker models are consistent")

    # 3. Check vision model consistency
    print_section("Vision Model Consistency")
    default_vision = default_cfg.get("vision_model_name")
    external_vision = EXTERNAL_MODELS.get("vision_model")

    print(f"Default Config: {default_vision}")
    print(f"External Models: {external_vision}")

    vision_errors = []
    if default_vision is None:
        vision_errors.append("❌ Vision model is not configured in the default config")
    if default_vision != external_vision:
        vision_errors.append(f"❌ Vision model mismatch: default={default_vision}, external={external_vision}")
    _report(vision_errors, "✅ Vision models are consistent")

    return errors
print(f"\n{model_name}") print(f" Model: {details['Model']}") print(f" Component: {details['Component']}") print(f" Used In: {', '.join(details['Used In'])}") def test_validation_function(): """Test the built-in validation function.""" print_header("VALIDATION FUNCTION TEST") try: result = validate_model_config() if result: print("✅ validate_model_config() passed successfully!") else: print("❌ validate_model_config() returned False") except Exception as e: print(f"❌ validate_model_config() failed with error: {e}") return False return True def check_pipeline_configurations(): """Check all pipeline configurations for completeness.""" print_header("PIPELINE CONFIGURATION COMPLETENESS") required_keys = { "default": ["storage", "retrieval", "embedding_model_name", "reranker"], "fast": ["storage", "retrieval", "embedding_model_name"] } errors = [] for config_name, required in required_keys.items(): print_section(f"{config_name.title()} Configuration") config = PIPELINE_CONFIGS.get(config_name, {}) for key in required: if key in config: print(f" ✅ {key}: {type(config[key]).__name__}") else: error_msg = f"❌ Missing required key '{key}' in {config_name} config" errors.append(error_msg) print(f" {error_msg}") return errors def main(): """Run all validation checks.""" print("🚀 Starting Model Configuration Validation") print(f"Python Path: {sys.path[0]}") all_errors = [] # Run all validation checks all_errors.extend(validate_configuration_consistency()) all_errors.extend(check_pipeline_configurations()) # Print model usage map print_model_usage_map() # Test validation function validation_passed = test_validation_function() # Final summary print_header("VALIDATION SUMMARY") if all_errors: print("❌ VALIDATION FAILED - Issues Found:") for error in all_errors: print(f" {error}") return 1 elif not validation_passed: print("❌ VALIDATION FAILED - validate_model_config() function failed") return 1 else: print("✅ ALL VALIDATIONS PASSED!") print("\n🎉 Your model configuration is 
class WatsonXClient:
    """
    A client for IBM Watson X AI that provides similar interface to OllamaClient
    for seamless integration with the RAG system.

    Completion results are returned as dictionaries with ``response``,
    ``model`` and ``done`` keys; failures are reported as
    ``{'response': '', 'error': <message>}`` instead of raising.
    """

    # (caller-facing option name, TextGenParameters field name)
    _GEN_PARAM_NAMES = (
        ('max_tokens', 'max_new_tokens'),
        ('temperature', 'temperature'),
        ('top_p', 'top_p'),
        ('top_k', 'top_k'),
    )

    def __init__(
        self,
        api_key: str,
        project_id: str,
        url: str = "https://us-south.ml.cloud.ibm.com",
    ):
        """
        Initialize the Watson X client.

        Args:
            api_key: IBM Cloud API key for authentication
            project_id: Watson X project ID
            url: Watson X service URL (default: us-south region)

        Raises:
            ImportError: If the ``ibm-watsonx-ai`` package is not installed.
        """
        self.api_key = api_key
        self.project_id = project_id
        self.url = url

        # Imported lazily so the module can be imported without the SDK.
        try:
            from ibm_watsonx_ai import APIClient
            from ibm_watsonx_ai import Credentials
            from ibm_watsonx_ai.foundation_models import ModelInference
            from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
        except ImportError as exc:
            raise ImportError(
                "ibm-watsonx-ai package is required. "
                "Install it with: pip install ibm-watsonx-ai"
            ) from exc

        self._APIClient = APIClient
        self._Credentials = Credentials
        self._ModelInference = ModelInference
        self._TextGenParameters = TextGenParameters

        self.credentials = self._Credentials(
            api_key=self.api_key,
            url=self.url
        )

        self.client = self._APIClient(self.credentials)
        self.client.set.default_project(self.project_id)

    def _build_parameters(self, options: Dict[str, Any]):
        """Translate caller options into a ``TextGenParameters`` object.

        Options that are absent or ``None`` are skipped. Zero is a legitimate
        value (e.g. ``temperature=0``) and is forwarded. Returns ``None`` when
        no supported option was supplied.
        """
        gen_params = {
            target: options[source]
            for source, target in self._GEN_PARAM_NAMES
            if options.get(source) is not None
        }
        return self._TextGenParameters(**gen_params) if gen_params else None

    def _new_inference(self, model: str, options: Dict[str, Any]):
        """Create a ``ModelInference`` for ``model`` configured from ``options``."""
        return self._ModelInference(
            model_id=model,
            credentials=self.credentials,
            project_id=self.project_id,
            params=self._build_parameters(options)
        )

    @staticmethod
    def _extract_generated_text(result: Any) -> str:
        """Pull the generated text out of a ``generate()`` result.

        A dict result is read as ``result['results'][0]['generated_text']``;
        a missing or empty ``results`` list yields ``''``. Any non-dict
        result is converted with ``str()``.
        """
        if not isinstance(result, dict):
            return str(result)
        results = result.get('results') or [{}]
        return results[0].get('generated_text', '')

    # The annotation is quoted so it is not evaluated when the class is defined.
    def _image_to_base64(self, image: "Image.Image") -> str:
        """Converts a Pillow Image to a base64 string."""
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def generate_embedding(self, model: str, text: str) -> List[float]:
        """
        Generate embeddings using Watson X embedding models.

        Returns the embedding list, or ``[]`` when the SDK returns something
        that is not a list or when any error occurs (the error is printed).
        """
        try:
            from ibm_watsonx_ai.foundation_models import Embeddings

            embedding_model = Embeddings(
                model_id=model,
                credentials=self.credentials,
                project_id=self.project_id
            )

            result = embedding_model.embed_query(text)
            return result if isinstance(result, list) else []

        except Exception as e:
            print(f"Error generating embedding: {e}")
            return []

    def generate_completion(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: Optional[List["Image.Image"]] = None,
        enable_thinking: Optional[bool] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Generates a completion using Watson X foundation models.

        Args:
            model: The name/ID of the Watson X model (e.g., 'ibm/granite-13b-chat-v2')
            prompt: The text prompt for the model
            format: The format for the response (e.g., "json"); accepted for
                interface compatibility and not forwarded to Watson X
            images: List of Pillow Image objects; only a warning is printed,
                the images themselves are not sent
            enable_thinking: Optional flag (not used in Watson X, kept for compatibility)
            **kwargs: Additional parameters for text generation. ``max_tokens``,
                ``temperature``, ``top_p`` and ``top_k`` are forwarded when
                they are not ``None``.

        Returns:
            Dictionary with response in Ollama-compatible format. On error the
            dictionary is ``{'response': '', 'error': <message>}``.
        """
        try:
            model_inference = self._new_inference(model, kwargs)

            if images:
                print("Warning: Image support in Watson X may vary by model")

            result = model_inference.generate(prompt=prompt)

            return {
                'response': self._extract_generated_text(result),
                'model': model,
                'done': True
            }

        except Exception as e:
            print(f"Error generating completion: {e}")
            return {'response': '', 'error': str(e)}

    async def generate_completion_async(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: Optional[List["Image.Image"]] = None,
        enable_thinking: Optional[bool] = None,
        timeout: int = 60,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Asynchronous version of generate_completion.

        Runs the synchronous ``generate_completion`` in the event loop's
        default executor. ``timeout`` is accepted for interface compatibility
        and is not enforced.
        """
        import asyncio

        # Inside a coroutine the running loop is the one to schedule on.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            lambda: self.generate_completion(
                model, prompt, format=format, images=images,
                enable_thinking=enable_thinking, **kwargs
            )
        )

    def stream_completion(
        self,
        model: str,
        prompt: str,
        *,
        images: Optional[List["Image.Image"]] = None,
        enable_thinking: Optional[bool] = None,
        **kwargs
    ):
        """
        Generator that yields partial response strings as they arrive.

        Accepts the same generation options as ``generate_completion``. When
        the inference object has no ``generate_text_stream`` method, the full
        text from ``generate()`` is yielded as a single chunk. On error the
        message is printed and an empty string is yielded.
        """
        try:
            model_inference = self._new_inference(model, kwargs)

            # Decide on the fallback up front: catching AttributeError around
            # the loop could re-generate the whole text after chunks had
            # already been yielded.
            stream = getattr(model_inference, 'generate_text_stream', None)
            if stream is None:
                result = model_inference.generate(prompt=prompt)
                yield self._extract_generated_text(result)
                return

            for chunk in stream(prompt=prompt):
                if chunk:
                    yield chunk

        except Exception as e:
            print(f"Error in stream_completion: {e}")
            yield ""
) print(response['response']) """) ================================================ FILE: requirements-docker.txt ================================================ requests python-dotenv PyPDF2 colpali-engine PyMuPDF Pillow transformers==4.51.0 torch==2.4.1 torchvision==0.19.1 lancedb rank_bm25 fuzzywuzzy python-Levenshtein torchaudio sentencepiece accelerate docling cachetools numpy networkx matplotlib psutil httpx scikit-learn pandas sentence_transformers rerankers nltk # Standard library modules (no need to install) # asyncio, logging, json, os, sys, typing, threading, itertools, math, re # ocrmac - removed for Docker compatibility (macOS-specific) ================================================ FILE: requirements.txt ================================================ requests python-dotenv PyPDF2 colpali-engine requests python-dotenv PyPDF2 colpali-engine PyMuPDF Pillow transformers==4.51.0 torch==2.4.1 torchvision==0.19.1 lancedb rank_bm25 fuzzywuzzy python-Levenshtein torchaudio sentencepiece accelerate docling cachetools numpy networkx matplotlib psutil httpx scikit-learn pandas sentence_transformers rerankers nltk ================================================ FILE: run_system.py ================================================ #!/usr/bin/env python3 """ RAG System Unified Launcher =========================== A comprehensive launcher that starts all RAG system components: - Ollama server - RAG API server (port 8001) - Backend server (port 8000) - Frontend server (port 3000) Features: - Single command startup - Real-time log aggregation - Process health monitoring - Graceful shutdown - Production-ready deployment support Usage: python run_system.py [--mode dev|prod] [--logs-only] [--no-frontend] """ import subprocess import threading import time import signal import sys import os import argparse import json import requests from pathlib import Path from datetime import datetime from typing import Dict, List, Optional, TextIO import logging from dataclasses 
@dataclass
class ServiceConfig:
    """Launch settings for a single managed service."""
    name: str
    command: List[str]  # argv passed to subprocess.Popen
    port: int  # port checked before launch and shown in the status summary
    cwd: Optional[str] = None
    env: Optional[Dict[str, str]] = None  # merged over os.environ at launch
    health_check_path: str = "/health"
    startup_delay: int = 2  # seconds to wait before checking the process is alive
    required: bool = True  # a failed required service fails the whole startup


class ColoredFormatter(logging.Formatter):
    """Custom formatter with colors for different log levels and services."""

    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
    }

    SERVICE_COLORS = {
        'ollama': '\033[94m',    # Blue
        'rag-api': '\033[95m',   # Magenta
        'backend': '\033[96m',   # Cyan
        'frontend': '\033[93m',  # Yellow
        'system': '\033[92m',    # Green
    }

    RESET = '\033[0m'

    def format(self, record):
        """Render ``HH:MM:SS [SERVICE] LEVEL: message`` with ANSI colors.

        The service name comes from the record's ``service`` attribute and
        defaults to ``'system'``; unknown services use the level color.
        """
        # Add service-specific coloring
        service_name = getattr(record, 'service', 'system')
        service_color = self.SERVICE_COLORS.get(service_name, self.COLORS.get(record.levelname, ''))

        # Format timestamp
        timestamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S')

        # Create colored log line
        colored_service = f"{service_color}[{service_name.upper()}]{self.RESET}"
        colored_level = f"{self.COLORS.get(record.levelname, '')}{record.levelname}{self.RESET}"

        return f"{timestamp} {colored_service} {colored_level}: {record.getMessage()}"


class ServiceManager:
    """Manages multiple system services with logging and health monitoring."""

    def __init__(self, mode: str = "dev", logs_dir: str = "logs"):
        """Create the log directory, configure logging and install signal handlers.

        Args:
            mode: ``"dev"`` or ``"prod"``; selects the frontend command and
                the production environment variables.
            logs_dir: Directory that receives ``system.log`` and one log file
                per service.
        """
        self.mode = mode
        self.logs_dir = Path(logs_dir)
        self.logs_dir.mkdir(parents=True, exist_ok=True)

        self.processes: Dict[str, subprocess.Popen] = {}
        self.log_threads: Dict[str, threading.Thread] = {}
        self.running = False
        # Services that check_prerequisites() found impossible to launch.
        self._disabled_services = set()

        # Setup logging
        self.setup_logging()

        # Service configurations
        self.services = self._get_service_configs()

        # Register signal handlers for graceful shutdown
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def setup_logging(self):
        """Setup centralized logging with colors."""
        # Create main logger
        self.logger = logging.getLogger('system')
        self.logger.setLevel(logging.INFO)

        # Console handler with colors
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(ColoredFormatter())
        self.logger.addHandler(console_handler)

        # File handler for system logs
        file_handler = logging.FileHandler(self.logs_dir / 'system.log')
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s [%(levelname)s] %(message)s'
        ))
        self.logger.addHandler(file_handler)

    def _get_service_configs(self) -> Dict[str, ServiceConfig]:
        """Define service configurations based on mode."""
        base_configs = {
            'ollama': ServiceConfig(
                name='ollama',
                command=['ollama', 'serve'],
                port=11434,
                startup_delay=5,
                required=True
            ),
            'rag-api': ServiceConfig(
                name='rag-api',
                command=[sys.executable, '-m', 'rag_system.api_server'],
                port=8001,
                startup_delay=3,
                required=True
            ),
            'backend': ServiceConfig(
                name='backend',
                command=[sys.executable, 'backend/server.py'],
                port=8000,
                startup_delay=2,
                required=True
            ),
            'frontend': ServiceConfig(
                name='frontend',
                # dev server in dev mode, production build otherwise
                command=['npm', 'run', 'dev' if self.mode == 'dev' else 'start'],
                port=3000,
                startup_delay=5,
                required=False  # Optional in case Node.js not available
            )
        }

        # Production mode adjustments
        if self.mode == 'prod':
            # Add production environment variables
            base_configs['rag-api'].env = {'NODE_ENV': 'production'}
            base_configs['backend'].env = {'NODE_ENV': 'production'}

        return base_configs

    def _signal_handler(self, signum, frame):
        """Handle shutdown signals gracefully."""
        self.logger.info(f"Received signal {signum}, shutting down...")
        self.shutdown()
        sys.exit(0)

    def is_port_in_use(self, port: int) -> bool:
        """Check if a port is already in use.

        Uses ``psutil.net_connections()`` to look for a listening socket and
        falls back to a TCP connect attempt on localhost when that raises
        ``psutil.AccessDenied`` or ``AttributeError``.
        """
        try:
            for conn in psutil.net_connections():
                if conn.laddr.port == port and conn.status == 'LISTEN':
                    return True
            return False
        except (psutil.AccessDenied, AttributeError):
            # Fallback method
            import socket
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                return s.connect_ex(('localhost', port)) == 0

    def check_prerequisites(self) -> bool:
        """Check if all required tools are available.

        Returns ``False`` when ``ollama`` or a Python interpreter is missing.
        A missing ``npm`` only disables the frontend.
        """
        self.logger.info("🔍 Checking prerequisites...")

        missing_tools = []

        # Check Ollama
        if not self._command_exists('ollama'):
            missing_tools.append('ollama (https://ollama.ai)')

        # Check Python
        if not self._command_exists('python') and not self._command_exists('python3'):
            missing_tools.append('python')

        # Check Node.js (optional)
        if not self._command_exists('npm'):
            self.logger.warning("⚠️ npm not found - frontend will be disabled")
            self.services['frontend'].required = False
            # Actually disable it so start_all() does not try to run npm.
            self._disabled_services.add('frontend')

        if missing_tools:
            self.logger.error(f"❌ Missing required tools: {', '.join(missing_tools)}")
            return False

        self.logger.info("✅ All prerequisites satisfied")
        return True

    def _command_exists(self, command: str) -> bool:
        """Check if a command exists in PATH by running ``<command> --version``."""
        try:
            subprocess.run([command, '--version'],
                           capture_output=True, check=True, timeout=5)
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
            # OSError covers FileNotFoundError and non-executable files.
            return False

    def ensure_models(self):
        """Ensure required Ollama models are available.

        Pulls each model that does not appear in the output of ``ollama list``.
        Failures are logged as warnings and never raised.
        """
        self.logger.info("📥 Checking required models...")

        required_models = ['qwen3:8b', 'qwen3:0.6b']

        try:
            # Get list of installed models
            result = subprocess.run(['ollama', 'list'],
                                    capture_output=True, text=True, timeout=10)
            installed_models = result.stdout

            for model in required_models:
                if model not in installed_models:
                    self.logger.info(f"📥 Pulling {model}...")
                    subprocess.run(['ollama', 'pull', model],
                                   check=True, timeout=300)  # 5 min timeout
                    self.logger.info(f"✅ {model} ready")
                else:
                    self.logger.info(f"✅ {model} already available")

        except subprocess.TimeoutExpired:
            self.logger.warning("⚠️ Model check timed out - continuing anyway")
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError: the ollama binary is missing or cannot be executed.
            self.logger.warning(f"⚠️ Could not check/pull models: {e}")

    def start_service(self, service_name: str, config: ServiceConfig) -> bool:
        """Start a single service.

        Returns ``True`` when the service is already tracked or is still alive
        after ``config.startup_delay`` seconds. When the port is already in
        use the service is skipped and the result is ``not config.required``.
        """
        if service_name in self.processes:
            self.logger.warning(f"⚠️ {service_name} already running")
            return True

        # Check if port is in use
        if self.is_port_in_use(config.port):
            self.logger.warning(f"⚠️ Port {config.port} already in use, skipping {service_name}")
            return not config.required

        self.logger.info(f"🔄 Starting {service_name} on port {config.port}...")

        try:
            # Setup environment
            env = os.environ.copy()
            if config.env:
                env.update(config.env)

            # Start process; stderr is merged into stdout so a single reader
            # thread sees all output. bufsize=1 selects line buffering.
            process = subprocess.Popen(
                config.command,
                cwd=config.cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1
            )

            self.processes[service_name] = process

            # Start log monitoring thread
            log_thread = threading.Thread(
                target=self._monitor_service_logs,
                args=(service_name, process),
                daemon=True
            )
            log_thread.start()
            self.log_threads[service_name] = log_thread

            # Wait for startup
            time.sleep(config.startup_delay)

            # Check if process is still running
            if process.poll() is None:
                self.logger.info(f"✅ {service_name} started successfully (PID: {process.pid})")
                return True
            else:
                self.logger.error(f"❌ {service_name} failed to start")
                return False

        except Exception as e:
            self.logger.error(f"❌ Failed to start {service_name}: {e}")
            return False

    def _monitor_service_logs(self, service_name: str, process: subprocess.Popen):
        """Monitor service logs and forward to main logger.

        Every non-blank output line is written to ``<logs_dir>/<service>.log``
        and to the main logger, tagged with the service name.
        """
        service_logger = logging.getLogger(service_name)
        service_logger.setLevel(logging.INFO)

        # Loggers are process-wide singletons: attach the file handler only
        # once, otherwise every restart of the service adds another handler
        # and each line is written once per restart.
        if not service_logger.handlers:
            file_handler = logging.FileHandler(self.logs_dir / f'{service_name}.log')
            file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
            service_logger.addHandler(file_handler)

        try:
            for line in iter(process.stdout.readline, ''):
                if line.strip():
                    # Create log record with service context
                    record = logging.LogRecord(
                        name=service_name,
                        level=logging.INFO,
                        pathname='',
                        lineno=0,
                        msg=line.strip(),
                        args=(),
                        exc_info=None
                    )
                    record.service = service_name

                    # Log to both service file and main console
                    service_logger.handle(record)
                    self.logger.handle(record)

        except Exception as e:
            self.logger.error(f"Error monitoring {service_name} logs: {e}")

    def health_check(self, service_name: str, config: ServiceConfig) -> bool:
        """Perform health check on a service.

        Returns ``True`` only when a GET on the service's health path answers
        with HTTP 200 within 5 seconds.
        """
        try:
            url = f"http://localhost:{config.port}{config.health_check_path}"
            response = requests.get(url, timeout=5)
            return response.status_code == 200
        except Exception:
            # Not a bare except: SystemExit/KeyboardInterrupt raised during
            # shutdown must not be swallowed.
            return False

    def start_all(self, skip_frontend: bool = False) -> bool:
        """Start all services in order.

        Returns ``False`` when prerequisites are missing or any required
        service fails to start. Optional services that fail are skipped
        with a warning.
        """
        self.logger.info("🚀 Starting RAG System Components...")

        if not self.check_prerequisites():
            return False

        self.running = True
        failed_services = []

        # Start services in dependency order
        service_order = ['ollama', 'rag-api', 'backend']
        if not skip_frontend and 'frontend' in self.services:
            service_order.append('frontend')

        for service_name in service_order:
            if service_name not in self.services or service_name in self._disabled_services:
                continue

            config = self.services[service_name]

            # Special handling for Ollama
            if service_name == 'ollama':
                started = self._start_ollama()
            else:
                started = self.start_service(service_name, config)

            if started:
                continue
            if config.required:
                failed_services.append(service_name)
            else:
                self.logger.warning(f"⚠️ Skipping optional service: {service_name}")

        if failed_services:
            self.logger.error(f"❌ Failed to start required services: {', '.join(failed_services)}")
            return False

        # Print status summary
        self._print_status_summary()
        return True

    def _start_ollama(self) -> bool:
        """Special handling for Ollama startup.

        Reuses an Ollama already listening on port 11434, otherwise starts
        one; in both cases the required models are ensured afterwards.
        """
        # Check if Ollama is already running
        if self.is_port_in_use(11434):
            self.logger.info("✅ Ollama already running")
            self.ensure_models()
            return True

        # Start Ollama
        if self.start_service('ollama', self.services['ollama']):
            self.ensure_models()
            return True

        return False

    def _print_status_summary(self):
        """Print system status summary."""
        self.logger.info("")
        self.logger.info("🎉 RAG System Started!")
        self.logger.info("📊 Services Status:")

        for service_name, config in self.services.items():
            # A service counts as running when this manager launched it or
            # something is listening on its port.
            if service_name in self.processes or self.is_port_in_use(config.port):
                status = "✅ Running"
                url = f"http://localhost:{config.port}"
                self.logger.info(f" • {service_name.capitalize():<10}: {status:<10} {url}")
            else:
                self.logger.info(f" • {service_name.capitalize():<10}: ❌ Stopped")

        self.logger.info("")
        self.logger.info("🌐 Access your RAG system at: http://localhost:3000")
        self.logger.info("")
        self.logger.info("📋 Useful commands:")
        self.logger.info(" • Stop system: Ctrl+C")
        self.logger.info(" • Check logs: tail -f logs/*.log")
        self.logger.info(" • Health check: python run_system.py --health")

    def shutdown(self):
        """Gracefully shutdown all services.

        Does nothing unless ``running`` is set, so calling it twice is safe.
        """
        if not self.running:
            return

        self.logger.info("🛑 Shutting down RAG system...")
        self.running = False

        # Stop services in reverse order
        for service_name in reversed(list(self.processes.keys())):
            self._stop_service(service_name)

        self.logger.info("✅ All services stopped")

    def _stop_service(self, service_name: str):
        """Stop a single service: terminate, wait up to 10 seconds, then kill."""
        if service_name not in self.processes:
            return

        process = self.processes[service_name]
        self.logger.info(f"🔄 Stopping {service_name}...")

        try:
            # Try graceful shutdown first
            process.terminate()

            # Wait up to 10 seconds for graceful shutdown
            try:
                process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # Force kill if graceful shutdown fails
                process.kill()
                process.wait()

            self.logger.info(f"✅ {service_name} stopped")

        except Exception as e:
            self.logger.error(f"❌ Error stopping {service_name}: {e}")
        finally:
            del self.processes[service_name]

    def monitor(self):
        """Monitor running services and restart if needed.

        Polls every 30 seconds while ``running`` is set. A required service
        that has exited is restarted; an optional one is reported once and
        dropped from tracking.
        """
        self.logger.info("👁️ Monitoring services... (Press Ctrl+C to stop)")

        try:
            while self.running:
                time.sleep(30)  # Check every 30 seconds

                for service_name, process in list(self.processes.items()):
                    if process.poll() is not None:
                        self.logger.warning(f"⚠️ {service_name} has stopped unexpectedly")

                        # The dead handle is removed in both cases: required
                        # services get a fresh one from start_service, and
                        # optional ones would otherwise be reported on every
                        # polling cycle.
                        config = self.services[service_name]
                        del self.processes[service_name]
                        if config.required:
                            self.logger.info(f"🔄 Restarting {service_name}...")
                            self.start_service(service_name, config)

        except KeyboardInterrupt:
            self.logger.info("Monitoring stopped by user")
(Press Ctrl+C to stop)") manager.monitor() return # Normal startup mode if manager.start_all(skip_frontend=args.no_frontend): manager.monitor() else: manager.logger.error("❌ System startup failed") sys.exit(1) except KeyboardInterrupt: manager.logger.info("Received interrupt signal") finally: manager.shutdown() if __name__ == "__main__": main() ================================================ FILE: setup_rag_system.sh ================================================ #!/bin/bash # setup_rag_system.sh - Complete RAG System Setup Script # This script handles Docker installation, system setup, and initial configuration set -e # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # No Color # Logging function log() { echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" } warn() { echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" } error() { echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" } info() { echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" } # Check if running as root if [[ $EUID -eq 0 ]]; then error "This script should not be run as root (except for package installation steps)" exit 1 fi echo "================================================================" echo "🚀 RAG System Complete Setup Script" echo "================================================================" echo "" # Step 1: System Requirements Check log "Step 1: Checking system requirements..." # Check OS if [[ "$OSTYPE" == "darwin"* ]]; then OS="macos" info "Detected macOS" elif [[ -f /etc/os-release ]]; then . /etc/os-release OS=$ID info "Detected Linux: $OS" else error "Unsupported operating system" exit 1 fi # Check available memory MEMORY_GB=$(free -g 2>/dev/null | grep '^Mem:' | awk '{print $2}' || sysctl -n hw.memsize 2>/dev/null | awk '{print int($1/1024/1024/1024)}' || echo "unknown") if [[ "$MEMORY_GB" != "unknown" && "$MEMORY_GB" -lt 8 ]]; then warn "System has ${MEMORY_GB}GB RAM. 
Recommended: 16GB+ for optimal performance" else info "Memory check passed: ${MEMORY_GB}GB RAM" fi # Check available disk space DISK_GB=$(df -BG . | tail -1 | awk '{print $4}' | sed 's/G//' || echo "unknown") if [[ "$DISK_GB" != "unknown" && "$DISK_GB" -lt 50 ]]; then warn "Available disk space: ${DISK_GB}GB. Recommended: 50GB+ free space" else info "Disk space check passed: ${DISK_GB}GB available" fi # Step 2: Install Dependencies log "Step 2: Installing system dependencies..." # Install Git if not present if ! command -v git &> /dev/null; then info "Installing Git..." case $OS in "macos") if command -v brew &> /dev/null; then brew install git else error "Git not found. Please install Git first or install Homebrew" exit 1 fi ;; "ubuntu"|"debian") sudo apt-get update sudo apt-get install -y git ;; "centos"|"rhel"|"fedora") if command -v dnf &> /dev/null; then sudo dnf install -y git else sudo yum install -y git fi ;; esac else info "Git is already installed: $(git --version)" fi # Install curl if not present if ! command -v curl &> /dev/null; then info "Installing curl..." case $OS in "macos") # curl is usually pre-installed on macOS ;; "ubuntu"|"debian") sudo apt-get install -y curl ;; "centos"|"rhel"|"fedora") if command -v dnf &> /dev/null; then sudo dnf install -y curl else sudo yum install -y curl fi ;; esac else info "curl is already installed" fi # Step 3: Install Docker log "Step 3: Installing Docker..." if command -v docker &> /dev/null; then info "Docker is already installed: $(docker --version)" else info "Docker not found. Installing Docker..." case $OS in "macos") # Check if Homebrew is installed if ! command -v brew &> /dev/null; then info "Installing Homebrew..." /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" fi # Install Docker Desktop info "Installing Docker Desktop..." brew install --cask docker warn "Docker Desktop installed. Please:" warn "1. Start Docker Desktop from Applications" warn "2. 
Wait for Docker to start completely" warn "3. Run this script again" exit 0 ;; "ubuntu"|"debian") # Update package index sudo apt-get update # Install dependencies sudo apt-get install -y \ ca-certificates \ curl \ gnupg \ lsb-release # Add Docker's official GPG key sudo mkdir -p /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/$OS/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg # Set up repository echo \ "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/$OS \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null # Install Docker Engine sudo apt-get update sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin # Add user to docker group sudo usermod -aG docker $USER # Start Docker service sudo systemctl enable docker sudo systemctl start docker info "Docker installed successfully!" warn "Please log out and log back in for group changes to take effect, then run this script again" warn "Or run: newgrp docker && $0" exit 0 ;; "centos"|"rhel"|"fedora") # Install required packages if command -v dnf &> /dev/null; then sudo dnf install -y yum-utils sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo sudo dnf install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin else sudo yum install -y yum-utils sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo sudo yum install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin fi # Add user to docker group sudo usermod -aG docker $USER # Start Docker service sudo systemctl enable docker sudo systemctl start docker info "Docker installed successfully!" warn "Please log out and log back in for group changes to take effect, then run this script again" exit 0 ;; esac fi # Verify Docker is working if ! docker --version &> /dev/null; then error "Docker is not working properly. 
Please check Docker installation" exit 1 fi if ! docker compose version &> /dev/null; then error "Docker Compose is not working properly. Please check Docker Compose installation" exit 1 fi info "Docker verification passed: $(docker --version)" info "Docker Compose verification passed: $(docker compose version)" # Test Docker daemon if ! docker ps &> /dev/null; then error "Cannot connect to Docker daemon. Please ensure Docker is running" exit 1 fi # Step 4: Setup RAG System log "Step 4: Setting up RAG System..." # Create project directory structure info "Creating directory structure..." mkdir -p {lancedb,shared_uploads,logs,ollama_data} mkdir -p index_store/{overviews,bm25,graph} mkdir -p backups # Set proper permissions chmod 755 {lancedb,shared_uploads,logs,ollama_data} chmod 755 index_store/{overviews,bm25,graph} chmod 755 backups # Create environment file if [[ ! -f ".env" ]]; then info "Creating environment configuration..." cat > .env << 'EOF' # System Configuration NODE_ENV=production LOG_LEVEL=info DEBUG=false # Service URLs FRONTEND_URL=http://localhost:3000 BACKEND_URL=http://localhost:8000 RAG_API_URL=http://localhost:8001 OLLAMA_URL=http://localhost:11434 # Database Configuration DATABASE_PATH=./backend/chat_data.db LANCEDB_PATH=./lancedb UPLOADS_PATH=./shared_uploads INDEX_STORE_PATH=./index_store # Model Configuration DEFAULT_EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 # Default model names - updated to current versions DEFAULT_GENERATION_MODEL=qwen3:8b DEFAULT_RERANKER_MODEL=answerdotai/answerai-colbert-small-v1 DEFAULT_ENRICHMENT_MODEL=qwen3:0.6b # Performance Configuration MAX_CONCURRENT_REQUESTS=5 REQUEST_TIMEOUT=300 EMBEDDING_BATCH_SIZE=32 MAX_CONTEXT_LENGTH=4096 # Security Configuration CORS_ORIGINS=http://localhost:3000 API_KEY_REQUIRED=false RATE_LIMIT_REQUESTS=100 RATE_LIMIT_WINDOW=60 # Storage Configuration MAX_FILE_SIZE=50MB MAX_UPLOAD_FILES=10 CLEANUP_INTERVAL=3600 BACKUP_RETENTION_DAYS=30 EOF info "Environment file created: 
# Step 5: Build and Start Services
log "Step 5: Building and starting services..."

info "Building Docker containers (this may take 10-15 minutes)..."
docker compose build --no-cache

info "Starting services..."
docker compose up -d

# Wait for services to start
info "Waiting for services to initialize..."
sleep 30

# Check service status
info "Checking service status..."
docker compose ps

# Step 6: Install AI Models
log "Step 6: Installing AI models..."

# Wait for Ollama to be ready: poll `ollama list` inside the container every
# 10 seconds and give up after max_attempts polls.
info "Waiting for Ollama to be ready..."
max_attempts=30
attempt=0
while ! docker compose exec ollama ollama list &> /dev/null; do
    if [ "$attempt" -ge "$max_attempts" ]; then
        error "Ollama failed to start after $max_attempts attempts"
        exit 1
    fi
    info "Waiting for Ollama... (attempt $((attempt+1))/$max_attempts)"
    sleep 10
    # Do not use ((attempt++)) here: it evaluates to the old value, so on the
    # first pass (attempt=0) it returns exit status 1 and `set -e` would abort
    # the whole script. A plain assignment always succeeds.
    attempt=$((attempt + 1))
done

# Download Ollama models
info "Downloading required Ollama models..."
docker compose exec ollama ollama pull qwen3:8b
docker compose exec ollama ollama pull qwen3:0.6b

info "Verifying model installation..."
docker compose exec ollama ollama list

# Step 7: System Verification
log "Step 7: Verifying system installation..."

# Check service health: a service counts as healthy when any one of the
# probed paths (/, /health, /api/tags, /models) answers successfully.
info "Checking service health..."
services=("frontend:3000" "backend:8000" "rag-api:8001" "ollama:11434")
for service in "${services[@]}"; do
    name="${service%:*}"
    port="${service#*:}"

    if curl -s -f "http://localhost:$port" &> /dev/null || curl -s -f "http://localhost:$port/health" &> /dev/null || curl -s -f "http://localhost:$port/api/tags" &> /dev/null || curl -s -f "http://localhost:$port/models" &> /dev/null; then
        info "✅ $name service is healthy"
    else
        warn "⚠️ $name service may not be ready yet"
    fi
done
Access at: http://localhost:3000" EOF chmod +x start_rag_system.sh # Create stop script cat > stop_rag_system.sh << 'EOF' #!/bin/bash # Stop RAG System echo "Stopping RAG System..." docker compose down echo "RAG System stopped." EOF chmod +x stop_rag_system.sh # Create status script cat > status_rag_system.sh << 'EOF' #!/bin/bash # Check RAG System Status echo "=== RAG System Status ===" docker compose ps echo "" echo "=== Service Health ===" curl -s -f http://localhost:3000 && echo "✅ Frontend: OK" || echo "❌ Frontend: FAIL" curl -s -f http://localhost:8000/health && echo "✅ Backend: OK" || echo "❌ Backend: FAIL" curl -s -f http://localhost:8001/models && echo "✅ RAG API: OK" || echo "❌ RAG API: FAIL" curl -s -f http://localhost:11434/api/tags && echo "✅ Ollama: OK" || echo "❌ Ollama: FAIL" EOF chmod +x status_rag_system.sh # Create backup script cat > backup_rag_system.sh << 'EOF' #!/bin/bash # Backup RAG System Data BACKUP_DIR="./backups/$(date +%Y%m%d_%H%M%S)" mkdir -p "$BACKUP_DIR" echo "Creating backup in $BACKUP_DIR..." # Stop services docker compose down # Backup data cp -r ./backend/chat_data.db "$BACKUP_DIR/" 2>/dev/null || true cp -r ./lancedb "$BACKUP_DIR/" 2>/dev/null || true cp -r ./shared_uploads "$BACKUP_DIR/" 2>/dev/null || true cp -r ./index_store "$BACKUP_DIR/" 2>/dev/null || true # Backup configuration cp .env "$BACKUP_DIR/" cp docker-compose.yml "$BACKUP_DIR/" # Restart services docker compose up -d echo "Backup completed: $BACKUP_DIR" EOF chmod +x backup_rag_system.sh # Create update script cat > update_rag_system.sh << 'EOF' #!/bin/bash # Update RAG System echo "Updating RAG System..." # Backup first ./backup_rag_system.sh # Pull latest changes git pull origin main # Rebuild containers docker compose build --no-cache # Restart services docker compose up -d echo "Update completed!" 
EOF chmod +x update_rag_system.sh info "Helper scripts created:" info " - start_rag_system.sh: Start the system" info " - stop_rag_system.sh: Stop the system" info " - status_rag_system.sh: Check system status" info " - backup_rag_system.sh: Backup system data" info " - update_rag_system.sh: Update the system" # Step 9: Final Setup log "Step 9: Final setup and verification..." # Create initial database if it doesn't exist if [[ ! -f "./backend/chat_data.db" ]]; then info "Creating initial database..." docker compose exec backend python -c " import sqlite3 conn = sqlite3.connect('/app/backend/chat_data.db') conn.execute('CREATE TABLE IF NOT EXISTS sessions (id TEXT PRIMARY KEY, title TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)') conn.execute('CREATE TABLE IF NOT EXISTS messages (id INTEGER PRIMARY KEY, session_id TEXT, content TEXT, role TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)') conn.execute('CREATE TABLE IF NOT EXISTS indexes (id TEXT PRIMARY KEY, name TEXT, metadata TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)') conn.execute('CREATE TABLE IF NOT EXISTS session_indexes (session_id TEXT, index_id TEXT, PRIMARY KEY (session_id, index_id))') conn.commit() conn.close() print('Database initialized') " 2>/dev/null || warn "Database initialization may have failed" fi # Final health check info "Performing final health check..." sleep 10 ./status_rag_system.sh echo "" echo "================================================================" echo "🎉 RAG System Setup Complete!" 
echo "================================================================" echo "" echo "✅ System Status:" echo " - Frontend: http://localhost:3000" echo " - Backend API: http://localhost:8000" echo " - RAG API: http://localhost:8001" echo " - Ollama: http://localhost:11434" echo "" echo "📚 Documentation:" echo " - System Overview: Documentation/system_overview.md" echo " - Deployment Guide: Documentation/deployment_guide.md" echo " - Docker Usage: Documentation/docker_usage.md" echo " - Installation Guide: Documentation/installation_guide.md" echo "" echo "🔧 Helper Scripts:" echo " - Start system: ./start_rag_system.sh" echo " - Stop system: ./stop_rag_system.sh" echo " - Check status: ./status_rag_system.sh" echo " - Backup data: ./backup_rag_system.sh" echo " - Update system: ./update_rag_system.sh" echo "" echo "🚀 Next Steps:" echo " 1. Open http://localhost:3000 in your browser" echo " 2. Create a new chat session" echo " 3. Upload some PDF documents" echo " 4. Start asking questions about your documents!" echo "" echo "📋 System Information:" echo " - OS: $OS" echo " - Memory: ${MEMORY_GB}GB" echo " - Disk Space: ${DISK_GB}GB available" echo " - Docker: $(docker --version)" echo " - Docker Compose: $(docker compose version)" echo "" echo "For support and troubleshooting, check the documentation in the" echo "Documentation/ folder or run ./status_rag_system.sh to check system health." echo "" ================================================ FILE: simple_create_index.sh ================================================ #!/bin/bash # Simple Index Creation Script for LocalGPT RAG System # Usage: ./simple_create_index.sh "Index Name" "path/to/document.pdf" [additional_files...] 
set -e # Exit on any error # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # No Color # Function to print colored output print_status() { echo -e "${BLUE}[INFO]${NC} $1" } print_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" } print_warning() { echo -e "${YELLOW}[WARNING]${NC} $1" } print_error() { echo -e "${RED}[ERROR]${NC} $1" } # Function to check if a command exists command_exists() { command -v "$1" >/dev/null 2>&1 } # Function to check prerequisites check_prerequisites() { print_status "Checking prerequisites..." # Check Python if ! command_exists python3; then print_error "Python 3 is required but not installed." exit 1 fi # Check if we're in the right directory if [ ! -f "run_system.py" ] || [ ! -d "rag_system" ]; then print_error "This script must be run from the LocalGPT project root directory." exit 1 fi # Check if Ollama is running if ! curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then print_error "Ollama is not running. Please start Ollama first:" echo " ollama serve" exit 1 fi print_success "Prerequisites check passed" } # Function to validate documents validate_documents() { local documents=("$@") local valid_docs=() print_status "Validating documents..." for doc in "${documents[@]}"; do if [ -f "$doc" ]; then # Check file extension case "${doc##*.}" in pdf|txt|docx|md|html|htm) valid_docs+=("$doc") print_status "✓ Valid document: $doc" ;; *) print_warning "Unsupported file type: $doc (skipping)" ;; esac else print_warning "File not found: $doc (skipping)" fi done if [ ${#valid_docs[@]} -eq 0 ]; then print_error "No valid documents found." 
exit 1 fi echo "${valid_docs[@]}" } # Function to create index using Python create_index() { local index_name="$1" shift local documents=("$@") print_status "Creating index: $index_name" print_status "Documents: ${documents[*]}" # Create a temporary Python script to create the index cat > /tmp/create_index_temp.py << EOF #!/usr/bin/env python3 import sys import os import json sys.path.insert(0, os.getcwd()) from rag_system.main import PIPELINE_CONFIGS from rag_system.pipelines.indexing_pipeline import IndexingPipeline from rag_system.utils.ollama_client import OllamaClient from backend.database import ChatDatabase import uuid def create_index_simple(): try: # Initialize database db = ChatDatabase() # Create index record index_id = db.create_index( name="$index_name", description="Created with simple_create_index.sh", metadata={ "chunk_size": 512, "chunk_overlap": 64, "enable_enrich": True, "enable_latechunk": True, "retrieval_mode": "hybrid", "created_by": "simple_create_index.sh" } ) # Add documents to index documents = [${documents[@]/#/\"} ${documents[@]/%/\"}] for doc_path in documents: if doc_path.strip(): # Skip empty strings filename = os.path.basename(doc_path.strip()) db.add_document_to_index(index_id, filename, os.path.abspath(doc_path.strip())) # Initialize pipeline config = PIPELINE_CONFIGS.get("default", {}) ollama_client = OllamaClient() ollama_config = { "generation_model": "qwen3:0.6b", "embedding_model": "qwen3:0.6b" } pipeline = IndexingPipeline(config, ollama_client, ollama_config) # Process documents valid_docs = [doc.strip() for doc in documents if doc.strip() and os.path.exists(doc.strip())] if valid_docs: pipeline.process_documents(valid_docs) print(f"✅ Index '{index_name}' created successfully!") print(f"Index ID: {index_id}") print(f"Processed {len(valid_docs)} documents") return index_id except Exception as e: print(f"❌ Error creating index: {e}") import traceback traceback.print_exc() return None if __name__ == "__main__": 
create_index_simple() EOF # Run the Python script python3 /tmp/create_index_temp.py # Clean up rm -f /tmp/create_index_temp.py } # Function to show usage show_usage() { echo "Usage: $0 \"Index Name\" \"path/to/document.pdf\" [additional_files...]" echo "" echo "Examples:" echo " $0 \"My Documents\" \"document.pdf\"" echo " $0 \"Research Papers\" \"paper1.pdf\" \"paper2.pdf\" \"notes.txt\"" echo " $0 \"Invoice Collection\" ./invoices/*.pdf" echo "" echo "Supported file types: PDF, TXT, DOCX, MD, HTML" } # Main script main() { # Check arguments if [ $# -lt 2 ]; then print_error "Insufficient arguments provided." show_usage exit 1 fi local index_name="$1" shift local documents=("$@") # Check prerequisites check_prerequisites # Validate documents local valid_documents valid_documents=($(validate_documents "${documents[@]}")) if [ ${#valid_documents[@]} -eq 0 ]; then print_error "No valid documents to process." exit 1 fi # Create the index print_status "Starting index creation process..." create_index "$index_name" "${valid_documents[@]}" print_success "Index creation completed!" print_status "You can now use the index in the LocalGPT interface." 
} # Run main function with all arguments main "$@" ================================================ FILE: src/app/globals.css ================================================ @import "tailwindcss"; @import "tw-animate-css"; @custom-variant dark (&:is(.dark *)); @theme inline { --color-background: var(--background); --color-foreground: var(--foreground); --font-sans: var(--font-geist-sans); --font-mono: var(--font-geist-mono); --color-sidebar-ring: var(--sidebar-ring); --color-sidebar-border: var(--sidebar-border); --color-sidebar-accent-foreground: var(--sidebar-accent-foreground); --color-sidebar-accent: var(--sidebar-accent); --color-sidebar-primary-foreground: var(--sidebar-primary-foreground); --color-sidebar-primary: var(--sidebar-primary); --color-sidebar-foreground: var(--sidebar-foreground); --color-sidebar: var(--sidebar); --color-chart-5: var(--chart-5); --color-chart-4: var(--chart-4); --color-chart-3: var(--chart-3); --color-chart-2: var(--chart-2); --color-chart-1: var(--chart-1); --color-ring: var(--ring); --color-input: var(--input); --color-border: var(--border); --color-destructive: var(--destructive); --color-accent-foreground: var(--accent-foreground); --color-accent: var(--accent); --color-muted-foreground: var(--muted-foreground); --color-muted: var(--muted); --color-secondary-foreground: var(--secondary-foreground); --color-secondary: var(--secondary); --color-primary-foreground: var(--primary-foreground); --color-primary: var(--primary); --color-popover-foreground: var(--popover-foreground); --color-popover: var(--popover); --color-card-foreground: var(--card-foreground); --color-card: var(--card); --radius-sm: calc(var(--radius) - 4px); --radius-md: calc(var(--radius) - 2px); --radius-lg: var(--radius); --radius-xl: calc(var(--radius) + 4px); } :root { --radius: 0.625rem; --background: oklch(1 0 0); --foreground: oklch(0.145 0 0); --card: oklch(1 0 0); --card-foreground: oklch(0.145 0 0); --popover: oklch(1 0 0); --popover-foreground: 
oklch(0.145 0 0); --primary: oklch(0.205 0 0); --primary-foreground: oklch(0.985 0 0); --secondary: oklch(0.97 0 0); --secondary-foreground: oklch(0.205 0 0); --muted: oklch(0.97 0 0); --muted-foreground: oklch(0.556 0 0); --accent: oklch(0.97 0 0); --accent-foreground: oklch(0.205 0 0); --destructive: oklch(0.577 0.245 27.325); --border: oklch(0.922 0 0); --input: oklch(0.922 0 0); --ring: oklch(0.708 0 0); --chart-1: oklch(0.646 0.222 41.116); --chart-2: oklch(0.6 0.118 184.704); --chart-3: oklch(0.398 0.07 227.392); --chart-4: oklch(0.828 0.189 84.429); --chart-5: oklch(0.769 0.188 70.08); --sidebar: oklch(0.985 0 0); --sidebar-foreground: oklch(0.145 0 0); --sidebar-primary: oklch(0.205 0 0); --sidebar-primary-foreground: oklch(0.985 0 0); --sidebar-accent: oklch(0.97 0 0); --sidebar-accent-foreground: oklch(0.205 0 0); --sidebar-border: oklch(0.922 0 0); --sidebar-ring: oklch(0.708 0 0); } .dark { --background: oklch(0.145 0 0); --foreground: oklch(0.985 0 0); --card: oklch(0.205 0 0); --card-foreground: oklch(0.985 0 0); --popover: oklch(0.205 0 0); --popover-foreground: oklch(0.985 0 0); --primary: oklch(0.922 0 0); --primary-foreground: oklch(0.205 0 0); --secondary: oklch(0.269 0 0); --secondary-foreground: oklch(0.985 0 0); --muted: oklch(0.269 0 0); --muted-foreground: oklch(0.708 0 0); --accent: oklch(0.269 0 0); --accent-foreground: oklch(0.985 0 0); --destructive: oklch(0.704 0.191 22.216); --border: oklch(1 0 0 / 10%); --input: oklch(1 0 0 / 15%); --ring: oklch(0.556 0 0); --chart-1: oklch(0.488 0.243 264.376); --chart-2: oklch(0.696 0.17 162.48); --chart-3: oklch(0.769 0.188 70.08); --chart-4: oklch(0.627 0.265 303.9); --chart-5: oklch(0.645 0.246 16.439); --sidebar: oklch(0.205 0 0); --sidebar-foreground: oklch(0.985 0 0); --sidebar-primary: oklch(0.488 0.243 264.376); --sidebar-primary-foreground: oklch(0.985 0 0); --sidebar-accent: oklch(0.269 0 0); --sidebar-accent-foreground: oklch(0.985 0 0); --sidebar-border: oklch(1 0 0 / 10%); 
--sidebar-ring: oklch(0.556 0 0); } @layer base { * { @apply border-border outline-ring/50; } html { @apply bg-black overflow-x-hidden overflow-y-hidden; font-size: 17px; } body { @apply bg-black text-white overflow-x-hidden; } } /* Style for tokens */ .thinking-block summary::-webkit-details-marker { display: none; } .thinking-block summary::after { content: "▸"; display: inline-block; margin-left: 4px; transform-origin: center; transition: transform 0.15s ease-out; } .thinking-block[open] summary::after { transform: rotate(90deg); } .thinking-block summary { outline: none; } .thinking-block div { color: #9ca3af; font-style: italic; } ================================================ FILE: src/app/layout.tsx ================================================ import type { Metadata } from "next"; import { Geist, Geist_Mono } from "next/font/google"; import "./globals.css"; const geistSans = Geist({ variable: "--font-geist-sans", subsets: ["latin"], }); const geistMono = Geist_Mono({ variable: "--font-geist-mono", subsets: ["latin"], }); export const metadata: Metadata = { title: "Create Next App", description: "Generated by create next app", }; export default function RootLayout({ children, }: Readonly<{ children: React.ReactNode; }>) { return ( {children} ); } ================================================ FILE: src/app/page.tsx ================================================ import { Demo } from "@/components/demo"; export default function Home() { return (
); } ================================================ FILE: src/components/IndexForm.tsx ================================================ "use client"; import { useState } from 'react'; import { GlassInput } from '@/components/ui/GlassInput'; import { GlassToggle } from '@/components/ui/GlassToggle'; import { AccordionGroup } from '@/components/ui/AccordionGroup'; import { ModelSelect } from '@/components/ModelSelect'; import { chatAPI, ChatSession } from '@/lib/api'; import { InfoTooltip } from '@/components/ui/InfoTooltip'; interface Props { onClose: () => void; onIndexed?: (session: ChatSession) => void; } export function IndexForm({ onClose, onIndexed }: Props) { const [files, setFiles] = useState(null); const [indexName, setIndexName] = useState(''); const [chunkSize, setChunkSize] = useState(512); const [chunkOverlap, setChunkOverlap] = useState(64); const [windowSize, setWindowSize] = useState(5); const [enableEnrich, setEnableEnrich] = useState(true); const [retrievalMode, setRetrievalMode] = useState<'hybrid' | 'vector' | 'fts'>('hybrid'); const [embeddingModel, setEmbeddingModel] = useState(); const DEFAULT_LLM = 'qwen3:0.6b'; const [enrichModel, setEnrichModel] = useState(DEFAULT_LLM); const [overviewModel, setOverviewModel] = useState(DEFAULT_LLM); const [batchSizeEmbed, setBatchSizeEmbed] = useState(64); const [batchSizeEnrich, setBatchSizeEnrich] = useState(64); const [loading, setLoading] = useState(false); const [enableLateChunk, setEnableLateChunk] = useState(false); const [enableDoclingChunk, setEnableDoclingChunk] = useState(true); const handleSubmit = async () => { if (!files) return; setLoading(true); try { // 1. create index record const { index_id } = await chatAPI.createIndex(indexName); // 2. upload files to index await chatAPI.uploadFilesToIndex(index_id, Array.from(files)); // 3. 
build index (run pipeline) with ALL OPTIONS await chatAPI.buildIndex(index_id, { latechunk: enableLateChunk, doclingChunk: enableDoclingChunk, chunkSize: chunkSize, chunkOverlap: chunkOverlap, retrievalMode: retrievalMode==='fts' ? 'bm25' : retrievalMode, windowSize: windowSize, enableEnrich: enableEnrich, embeddingModel: embeddingModel, enrichModel: enrichModel, overviewModel: overviewModel, batchSizeEmbed: batchSizeEmbed, batchSizeEnrich: batchSizeEnrich }); // 4. create chat session and link index const session = await chatAPI.createSession(indexName); await chatAPI.linkIndexToSession(session.id, index_id); // 5. callback if (onIndexed) onIndexed(session); } catch (e) { console.error('Indexing failed', e); setLoading(false); alert('Indexing failed. See console for details.'); } }; return (
{/* Loading overlay */} {loading && (

Indexing… this may take a moment

)}

Create new index

{/* Index name */}
setIndexName(e.target.value)} />
{/* Upload & defaults */}
{files &&

{files.length} file(s) selected

}
{/* Retrieval mode & Late-chunk toggle */}
{(['hybrid','vector','fts'] as const).map((m)=>( ))}
Late-chunk vectors
High-recall chunking
setChunkSize(parseInt(e.target.value))} />
setChunkOverlap(parseInt(e.target.value))} />
{/* Embedding & Overview models */}
{/* Contextual retrieval section */} Contextual Retrieval }>
Enable
setWindowSize(parseInt(e.target.value))} />
{/* Advanced */} Batch Size }>
setBatchSizeEmbed(parseInt(e.target.value))} />
setBatchSizeEnrich(parseInt(e.target.value))} />
); } ================================================ FILE: src/components/IndexPicker.tsx ================================================ import { useEffect, useState } from 'react'; import { chatAPI } from '@/lib/api'; interface Props { onSelect: (indexId: string) => void; onClose: () => void; } export default function IndexPicker({ onSelect, onClose }: Props) { const [indexes, setIndexes] = useState([]); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); const [search, setSearch] = useState(''); const [menuOpenId, setMenuOpenId] = useState(null); useEffect(() => { (async () => { try { const data = await chatAPI.listIndexes(); setIndexes(data.indexes); } catch (e: any) { setError(e.message || 'Failed to load indexes'); } finally { setLoading(false); } })(); }, []); const filtered = indexes.filter(i => i.name.toLowerCase().includes(search.toLowerCase())); async function handleDelete(idxId: string, name: string) { if (!confirm(`Delete index "${name}"? This cannot be undone.`)) return; try { await chatAPI.deleteIndex(idxId); setIndexes(prev => prev.filter(i => i.id!==idxId)); setMenuOpenId(null); } catch (e:any){ alert(e.message || 'Failed to delete index'); } } useEffect(() => { function handleOutside(e: MouseEvent) { if ((e.target as Element).closest('.index-row-menu') === null) { setMenuOpenId(null); } } if (menuOpenId) { document.addEventListener('click', handleOutside); } return () => document.removeEventListener('click', handleOutside); }, [menuOpenId]); return (

Select an index

setSearch(e.target.value)} placeholder="Search…" className="w-full px-3 py-2 rounded bg-black/30 border border-white/20 focus:outline-none" /> {loading &&

Loading…

} {error &&

{error}

} {!loading && !error && (
    {filtered.map(idx => (
  • {menuOpenId===idx.id && (
    )}
  • ))} {filtered.length===0 &&

    No indexes found.

    }
)}
); } ================================================ FILE: src/components/IndexWizard.tsx ================================================ "use client"; import { useState } from 'react'; import { ModelSelect } from '@/components/ModelSelect'; interface Props { onClose: () => void; } export function IndexWizard({ onClose }: Props) { const [files, setFiles] = useState(null); const [chunkSize, setChunkSize] = useState(512); const [chunkOverlap, setChunkOverlap] = useState(64); const [embeddingModel, setEmbeddingModel] = useState(); // TODO: more params const handleFile = (e: React.ChangeEvent) => { setFiles(e.target.files); }; return (

Create new index

setChunkSize(parseInt(e.target.value))} className="w-full bg-gray-800 rounded px-2 py-1" />
setChunkOverlap(parseInt(e.target.value))} className="w-full bg-gray-800 rounded px-2 py-1" />
); } ================================================ FILE: src/components/LandingMenu.tsx ================================================ "use client"; import React from 'react'; interface Props { onSelect: (mode: 'INDEX' | 'CHAT_EXISTING' | 'QUICK_CHAT') => void; } export function LandingMenu({ onSelect }: Props) { const Tile = ({ label, mode, icon }: { label: string; mode: Props["onSelect"] extends (m: infer U)=>void ? U: never; icon: React.ReactNode;}) => ( ); const FileIcon = ( ); const DbIcon = ( ); const ChatIcon = ( ); return (
); } ================================================ FILE: src/components/Markdown.tsx ================================================ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-nocheck 'use client' import dynamic from 'next/dynamic' import React, { useMemo } from 'react' import remarkGfm from 'remark-gfm' // Dynamically import react-markdown to avoid SSR issues const ReactMarkdown: any = dynamic(() => import('react-markdown') as any, { ssr: false }) interface MarkdownProps { text: string className?: string } export default function Markdown({ text, className = '' }: MarkdownProps) { const plugins = useMemo(() => [remarkGfm], []) return (
{/* @ts-ignore – react-markdown type doesn't recognise remarkPlugins array */} ( ), }} > {text}
) } ================================================ FILE: src/components/ModelSelect.tsx ================================================ import { useEffect, useState } from 'react'; import { chatAPI, ModelsResponse } from '@/lib/api'; interface Props { value: string | undefined; onChange: (v: string) => void; type: 'generation' | 'embedding'; className?: string; placeholder?: string; } export function ModelSelect({ value, onChange, type, className, placeholder }: Props) { const [models, setModels] = useState([]); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); useEffect(() => { let mounted = true; chatAPI .getModels() .then((res: ModelsResponse) => { if (!mounted) return; const list = type === 'generation' ? res.generation_models : res.embedding_models; setModels(list); // Auto-select default qwen3:0.6b if available and not chosen yet if(!value && list.includes('qwen3:0.6b')){ onChange('qwen3:0.6b'); } setLoading(false); }) .catch((e) => { if (!mounted) return; setError(String(e)); setLoading(false); }); return () => { mounted = false; }; }, [type]); if (loading) { return ( ); } if (error || models.length === 0) { return ( ); } return ( ); } ================================================ FILE: src/components/SessionIndexInfo.tsx ================================================ import { useEffect, useState } from 'react'; import { chatAPI, ChatSession } from '@/lib/api'; interface Props { sessionId: string; onClose: () => void; } export default function SessionIndexInfo({ sessionId, onClose }: Props) { const [files, setFiles] = useState([]); const [indexMeta, setIndexMeta] = useState(null); const [session, setSession] = useState(null); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); useEffect(() => { (async () => { try { const data = await chatAPI.getSessionIndexes(sessionId); const first = data.indexes[0]; if(first){ setSession(first.session??{...first, title:first.name, 
model_used:first.model_used||''}); setFiles(first.documents?.map((d:any)=>d.filename) || []); setIndexMeta(first.metadata || {}); } else { setError('No indexes linked to this chat'); } } catch (e:any){ setError(e.message||'Failed to load'); } finally{ setLoading(false);} })(); }, [sessionId]); const hasMetadata = indexMeta && Object.keys(indexMeta).length > 0; const isInferredMetadata = indexMeta?.metadata_source === 'lancedb_inspection'; const indexStatus = indexMeta?.status; const getStatusMessage = () => { if (!hasMetadata) { return { type: 'warning', title: '⚠️ No Configuration Data', message: 'This index was created before metadata tracking was implemented. Configuration details are not available.' }; } if (indexStatus === 'incomplete') { return { type: 'error', title: '❌ Index Incomplete', message: indexMeta.issue || 'The index appears to be incomplete or was never properly built.' }; } if (indexStatus === 'empty') { return { type: 'error', title: '❌ Index Empty', message: 'The vector table exists but contains no data. The index may need to be rebuilt.' }; } if (indexStatus === 'legacy') { return { type: 'warning', title: '⚠️ Legacy Index', message: indexMeta.issue || 'This index was created before metadata tracking was implemented. Configuration details are not available.' }; } if (isInferredMetadata) { return { type: 'info', title: '🔍 Metadata Inferred', message: 'This metadata was inferred from the vector database structure. Some configuration details may be incomplete.' 
}; } if (indexStatus === 'functional') { // Check if we have complete configuration metadata const hasCompleteConfig = indexMeta.chunk_size && indexMeta.chunk_overlap !== undefined && indexMeta.retrieval_mode && indexMeta.embedding_model; // Only show limited message if we truly have limited data if (indexMeta.inspection_limitation && !hasCompleteConfig) { return { type: 'info', title: '🔍 Limited Configuration Data', message: 'This index is functional but detailed configuration inspection requires direct RAG system access. Basic information is shown below.' }; } // Don't show any status message for functional indexes with complete metadata return null; } return null; }; const statusMessage = getStatusMessage(); return (

Index details

{loading &&

Loading…

} {error &&

{error}

} {(!loading && !error) && ( <>
Name

{session?.title}

{statusMessage && (

{statusMessage.title}

{statusMessage.message}

)} {hasMetadata && (indexStatus === 'functional' || indexStatus === 'created' || !indexStatus) && ( <> {/* Basic Information */}
{(indexMeta.embedding_model || indexMeta.embedding_model_inferred) && (
Embedding model

{indexMeta.embedding_model || indexMeta.embedding_model_inferred} {indexMeta.embedding_model_inferred && (inferred)}

)} {(indexMeta.retrieval_mode || indexMeta.retrieval_mode_inferred) && (
Retrieval mode

{indexMeta.retrieval_mode || indexMeta.retrieval_mode_inferred} {indexMeta.retrieval_mode_inferred && (inferred)}

)} {indexMeta.vector_dimensions && (
Vector dimensions

{indexMeta.vector_dimensions}

)} {indexMeta.total_chunks && (
Total chunks

{indexMeta.total_chunks.toLocaleString()}

)}
{/* Chunk Configuration */}
{(typeof indexMeta.chunk_size==='number' || indexMeta.chunk_size_inferred) && (
Chunk size

{typeof indexMeta.chunk_size==='number' ? `${indexMeta.chunk_size} tokens` : indexMeta.chunk_size_inferred} {indexMeta.chunk_size_inferred && (estimated)}

)} {typeof indexMeta.chunk_overlap==='number' && (
Chunk overlap

{indexMeta.chunk_overlap} tokens

)}
{/* Context and Features */}
{typeof indexMeta.window_size==='number' && (
Context window

{indexMeta.window_size}

)} {typeof indexMeta.enable_enrich==='boolean' && (
Contextual enrichment

{indexMeta.enable_enrich ? '✅ Enabled' : '❌ Disabled'}

)} {indexMeta.has_contextual_enrichment && (
Contextual enrichment

🔍 Detected

)}
{/* Advanced features */}
{typeof indexMeta.latechunk==='boolean' && (
Late-chunk vectors

{indexMeta.latechunk ? '✅ Enabled' : '❌ Disabled'}

)} {typeof indexMeta.docling_chunk==='boolean' && (
High-recall chunking

{indexMeta.docling_chunk ? '✅ Enabled' : '❌ Disabled'}

)} {indexMeta.has_fts_index && (
Full-text search

🔍 Available

)} {indexMeta.has_document_structure && (
Document structure

🔍 Organized

)}
{/* LLM Models section */} {(indexMeta.enrich_model || indexMeta.overview_model) && ( <>

LLM Models

{indexMeta.enrich_model && (
Enrichment LLM

{indexMeta.enrich_model}

)} {indexMeta.overview_model && (
Overview LLM

{indexMeta.overview_model}

)}
)} {/* Batch sizes section */} {(typeof indexMeta.batch_size_embed==='number' || typeof indexMeta.batch_size_enrich==='number') && ( <>

Batch Configuration

{typeof indexMeta.batch_size_embed==='number' && (
Embedding batch size

{indexMeta.batch_size_embed}

)} {typeof indexMeta.batch_size_enrich==='number' && (
Enrichment batch size

{indexMeta.batch_size_enrich}

)}
)} {/* Metadata info */} {isInferredMetadata && indexMeta.metadata_inferred_at && (

Metadata Information

Inferred at: {new Date(indexMeta.metadata_inferred_at).toLocaleString()}

Source: LanceDB table inspection

{indexMeta.sample_chunk_length && (

Sample chunk length: {indexMeta.sample_chunk_length} characters

)}
)} )} {/* Legacy index information */} {hasMetadata && indexStatus === 'legacy' && ( <>
{typeof indexMeta.documents_count === 'number' && (
Documents

{indexMeta.documents_count}

)} {indexMeta.created_at && (
Created

{new Date(indexMeta.created_at).toLocaleDateString()}

)} {indexMeta.vector_table_name && (
Vector table

{indexMeta.vector_table_name}

)}
{indexMeta.note && (

Technical Note

{indexMeta.note}

)} )} {/* Debug info for incomplete indexes */} {indexStatus === 'incomplete' && indexMeta.available_tables && (

Debug Information

Expected table: {indexMeta.vector_table_expected}

Available tables: {indexMeta.available_tables.join(', ') || 'None'}

)}
Files ({files.length})
    {files.map((f) => (
  • {f}
  • ))}
)}
); } ================================================ FILE: src/components/demo.tsx ================================================ "use client"; import { useState, useEffect } from "react" import { LocalGPTChat } from "@/components/ui/localgpt-chat" import { SessionSidebar } from "@/components/ui/session-sidebar" import { SessionChat } from '@/components/ui/session-chat' import { chatAPI, ChatSession } from "@/lib/api" import { LandingMenu } from "@/components/LandingMenu"; import { IndexForm } from "@/components/IndexForm"; import SessionIndexInfo from "@/components/SessionIndexInfo"; import IndexPicker from "@/components/IndexPicker"; import { QuickChat } from '@/components/ui/quick-chat' export function Demo() { const [currentSessionId, setCurrentSessionId] = useState() const [currentSession, setCurrentSession] = useState(null) const [showConversation, setShowConversation] = useState(false) const [backendStatus, setBackendStatus] = useState<'checking' | 'connected' | 'error'>('checking') const [sidebarRef, setSidebarRef] = useState<{ refreshSessions: () => Promise } | null>(null) const [homeMode, setHomeMode] = useState<'HOME' | 'INDEX' | 'CHAT_EXISTING' | 'QUICK_CHAT'>('HOME') const [showIndexInfo, setShowIndexInfo] = useState(false) const [showIndexPicker, setShowIndexPicker] = useState(false) const [sidebarOpen, setSidebarOpen] = useState(true) console.log('Demo component rendering...') useEffect(() => { console.log('Demo component mounted') checkBackendHealth() }, []) const checkBackendHealth = async () => { try { const health = await chatAPI.checkHealth() setBackendStatus('connected') console.log('Backend connected:', health) } catch (error) { console.error('Backend health check failed:', error) setBackendStatus('error') } } const handleSessionSelect = (sessionId: string) => { setCurrentSessionId(sessionId) setShowConversation(true) setHomeMode('CHAT_EXISTING') // Ensure we're in the right mode to show SessionChat } const handleNewSession = () => { // 
// Back to the landing page: drop the active session so the user can pick a chat type again.
    setCurrentSessionId(undefined)
    setCurrentSession(null)
    // Hide conversation view & sidebar
    setShowConversation(false)
    // Show landing selector (Create index / Chat with index / LLM Chat)
    setHomeMode('HOME')
  }

  /**
   * Called with the session object whenever the active chat reports a change.
   * Stores it, adopts its id when it differs from the one currently tracked,
   * and refreshes the sidebar list when a sidebar handle is available.
   */
  const handleSessionChange = async (session: ChatSession) => {
    setCurrentSession(session)

    // A different id means the chat switched to (or just created) another session.
    const idChanged = session.id !== currentSessionId
    if (idChanged) {
      setCurrentSessionId(session.id)
    }

    // Without a sidebar handle there is nothing to refresh.
    if (!sidebarRef) {
      return
    }
    // Re-fetch so updated titles / message counts are displayed.
    await sidebarRef.refreshSessions()
  }

  /**
   * Called with the id of a session that was deleted.
   * Only reacts when the deleted session is the one currently selected.
   */
  const handleSessionDelete = (deletedSessionId: string) => {
    if (currentSessionId !== deletedSessionId) {
      return
    }
    // The open session is gone: clear the selection, leave the view mode untouched.
    setCurrentSessionId(undefined)
    setCurrentSession(null)
  }

  /**
   * Entry action for starting a conversation; behaviour depends on backend status.
   */
  const handleStartConversation = () => {
    if (backendStatus !== 'connected') {
      // Backend not confirmed as connected: just switch to the conversation view.
      setShowConversation(true)
      return
    }
    // Backend is up: reset to the empty state; no session is created here.
    handleNewSession()
  }

  return (
{/* Top App Bar */}
{homeMode !== 'HOME' && (

localGPT

)}
{/* Main content row */}
{/* Session Sidebar */} {sidebarOpen && showConversation && (homeMode === 'CHAT_EXISTING' || homeMode === 'QUICK_CHAT') && ( )}
{homeMode === 'HOME' ? (

LocalGPT

What can I help you find today?

{ if(m==='CHAT_EXISTING'){ setShowIndexPicker(true); return; } if(m==='QUICK_CHAT'){ setHomeMode('QUICK_CHAT'); setShowConversation(true); return; } setHomeMode('INDEX'); }} />
{backendStatus === 'checking' && (
Connecting to backend...
)} {backendStatus === 'connected' && (
Backend connected • Session-based chat ready
)} {backendStatus === 'error' && (
Backend offline • Start backend server to enable chat
)}
) : homeMode==='CHAT_EXISTING' ? ( ) : homeMode==='QUICK_CHAT' ? ( ) : null}
{homeMode==='INDEX' && (
setHomeMode('HOME')} onIndexed={(s)=>{setHomeMode('CHAT_EXISTING'); handleSessionSelect(s.id);}} />
)} {showIndexInfo && currentSessionId && ( setShowIndexInfo(false)} /> )} {showIndexPicker && ( setShowIndexPicker(false)} onSelect={async (idxId)=>{ // create session and link index then open chat const session = await chatAPI.createSession() await chatAPI.linkIndexToSession(session.id, idxId) setShowIndexPicker(false) setHomeMode('CHAT_EXISTING') handleSessionSelect(session.id) }} /> )}
); } ================================================ FILE: src/components/ui/AccordionGroup.tsx ================================================ "use client"; import React from 'react'; interface Props { title: React.ReactNode; children: React.ReactNode; defaultOpen?: boolean; } export function AccordionGroup({ title, children, defaultOpen }: Props) { return (
{title}
{children}
); } ================================================ FILE: src/components/ui/GlassInput.tsx ================================================ "use client"; import React, { InputHTMLAttributes } from 'react'; export function GlassInput(props: InputHTMLAttributes) { return ( ); } ================================================ FILE: src/components/ui/GlassSelect.tsx ================================================ "use client"; import React, { SelectHTMLAttributes } from 'react'; export function GlassSelect(props: SelectHTMLAttributes) { return ( ); } ================================================ FILE: src/components/ui/GlassToggle.tsx ================================================ "use client"; import React from 'react'; interface Props { checked: boolean; onChange: (v: boolean) => void; } export function GlassToggle({ checked, onChange }: Props) { return ( ); } ================================================ FILE: src/components/ui/InfoTooltip.tsx ================================================ import { useState } from "react"; import { Info } from "lucide-react"; interface Props { text: string; className?: string; size?: number; } // A lightweight hover / focus tooltip used next to form labels. // It shows a small Info icon; on hover (or focus) a dark glassy popover appears. export function InfoTooltip({ text, className = "", size = 14 }: Props) { const [open, setOpen] = useState(false); return ( setOpen(true)} onMouseLeave={() => setOpen(false)} onFocus={() => setOpen(true)} onBlur={() => setOpen(false)} tabIndex={0} > {open && (
{text}
)}
); } ================================================ FILE: src/components/ui/avatar.tsx ================================================ "use client" import * as React from "react" import * as AvatarPrimitive from "@radix-ui/react-avatar" import { cn } from "@/lib/utils" function Avatar({ className, ...props }: React.ComponentProps) { return ( ) } function AvatarImage({ className, ...props }: React.ComponentProps) { return ( ) } function AvatarFallback({ className, ...props }: React.ComponentProps) { return ( ) } export { Avatar, AvatarImage, AvatarFallback } ================================================ FILE: src/components/ui/badge.tsx ================================================ import * as React from "react" import { Slot } from "@radix-ui/react-slot" import { cva, type VariantProps } from "class-variance-authority" import { cn } from "@/lib/utils" const badgeVariants = cva( "inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden", { variants: { variant: { default: "border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90", secondary: "border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90", destructive: "border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60", outline: "text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground", }, }, defaultVariants: { variant: "default", }, } ) function Badge({ className, variant, asChild = false, ...props }: React.ComponentProps<"span"> & VariantProps & { asChild?: boolean }) { const Comp = asChild ? 
Slot : "span" return ( ) } export { Badge, badgeVariants } ================================================ FILE: src/components/ui/button.tsx ================================================ import * as React from "react" import { Slot } from "@radix-ui/react-slot" import { cva, type VariantProps } from "class-variance-authority" import { cn } from "@/lib/utils" const buttonVariants = cva( "inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium transition-all disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg:not([class*='size-'])]:size-4 shrink-0 [&_svg]:shrink-0 outline-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive", { variants: { variant: { default: "bg-primary text-primary-foreground shadow-xs hover:bg-primary/90", destructive: "bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60", outline: "border bg-background shadow-xs hover:bg-accent hover:text-accent-foreground dark:bg-input/30 dark:border-input dark:hover:bg-input/50", secondary: "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80", ghost: "hover:bg-accent hover:text-accent-foreground dark:hover:bg-accent/50", link: "text-primary underline-offset-4 hover:underline", }, size: { default: "h-9 px-4 py-2 has-[>svg]:px-3", sm: "h-8 rounded-md gap-1.5 px-3 has-[>svg]:px-2.5", lg: "h-10 rounded-md px-6 has-[>svg]:px-4", icon: "size-9", }, }, defaultVariants: { variant: "default", size: "default", }, } ) function Button({ className, variant, size, asChild = false, ...props }: React.ComponentProps<"button"> & VariantProps & { asChild?: boolean }) { const Comp = asChild ? 
Slot : "button" return ( ) } export { Button, buttonVariants } ================================================ FILE: src/components/ui/chat-bubble-demo.tsx ================================================ "use client" import { ChatBubble, ChatBubbleAvatar, ChatBubbleMessage } from "@/components/ui/chat-bubble" import { Copy, RefreshCcw } from "lucide-react" const messages = [ { id: 1, message: "Help me with my essay.", sender: "user", }, { id: 2, message: "I can help you with that. What do you need help with?", sender: "bot", }, ] const actionIcons = [ { icon: Copy, type: "Copy" }, { icon: RefreshCcw, type: "Regenerate" }, ] export function ChatBubbleVariants() { return (
I have a question about the library. Sure, I'd be happy to help!
) } export function ChatBubbleAiLayout() { return (
{messages.map((message, index) => { const variant = message.sender === "user" ? "sent" : "received" return (
{message.message} {message.sender === "bot" && (
{actionIcons.map(({ icon: Icon, type }) => ( ))}
)}
) })}
) } export function ChatBubbleStates() { return (
Error processing request
) } ================================================ FILE: src/components/ui/chat-bubble.tsx ================================================ "use client" import * as React from "react" import { cn } from "@/lib/utils" import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar" import { Button } from "@/components/ui/button" import { MessageLoading } from "@/components/ui/message-loading"; interface ChatBubbleProps { variant?: "sent" | "received" layout?: "default" | "ai" className?: string children: React.ReactNode } export function ChatBubble({ variant = "received", layout = "default", // eslint-disable-line @typescript-eslint/no-unused-vars className, children, }: ChatBubbleProps) { return (
{children}
) } interface ChatBubbleMessageProps { variant?: "sent" | "received" isLoading?: boolean className?: string children?: React.ReactNode } export function ChatBubbleMessage({ variant = "received", isLoading, className, children, }: ChatBubbleMessageProps) { return (
{isLoading ? (
) : ( children )}
) } interface ChatBubbleAvatarProps { src?: string fallback?: string className?: string } export function ChatBubbleAvatar({ src, fallback = "AI", className, }: ChatBubbleAvatarProps) { return ( {src && } {fallback} ) } interface ChatBubbleActionProps { icon?: React.ReactNode onClick?: () => void className?: string } export function ChatBubbleAction({ icon, onClick, className, }: ChatBubbleActionProps) { return ( ) } export function ChatBubbleActionWrapper({ className, children, }: { className?: string children: React.ReactNode }) { return (
{children}
) } ================================================ FILE: src/components/ui/chat-input.tsx ================================================ "use client" import * as React from "react" import { useState, useRef } from "react" import { ArrowUp, Settings as SettingsIcon, Plus, X, FileText } from "lucide-react" import { Button } from "@/components/ui/button" import { AttachedFile } from "@/lib/types" interface ChatInputProps { onSendMessage: (message: string, attachedFiles?: AttachedFile[]) => Promise disabled?: boolean placeholder?: string className?: string onOpenSettings?: () => void onAddIndex?: () => void leftExtras?: React.ReactNode } export function ChatInput({ onSendMessage, disabled = false, placeholder = "Message localGPT...", className = "", onOpenSettings, onAddIndex, leftExtras }: ChatInputProps) { const [message, setMessage] = useState("") const [attachedFiles, setAttachedFiles] = useState([]) const [isLoading, setIsLoading] = useState(false) const textareaRef = useRef(null) const fileInputRef = useRef(null) const handleSubmit = async (e: React.FormEvent) => { e.preventDefault() if ((!message.trim() && attachedFiles.length === 0) || disabled || isLoading) return const messageToSend = message.trim() const filesToSend = [...attachedFiles] setMessage("") setAttachedFiles([]) setIsLoading(true) try { await onSendMessage(messageToSend, filesToSend) } catch (error) { console.error("Failed to send message:", error) // Restore message and files on error setMessage(messageToSend) setAttachedFiles(filesToSend) } finally { setIsLoading(false) } } const handleKeyDown = (e: React.KeyboardEvent) => { if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault() handleSubmit(e as unknown as React.FormEvent) } } const handleInput = (e: React.ChangeEvent) => { setMessage(e.target.value) // Auto-resize textarea const textarea = textareaRef.current if (textarea) { textarea.style.height = 'auto' textarea.style.height = Math.min(textarea.scrollHeight, 120) + 'px' } } const 
handleFileAttach = () => {
    // Programmatically click the element behind fileInputRef (no-op while the ref is unset).
    fileInputRef.current?.click()
  }

  /**
   * Change handler for the file input.
   * Logs every selected file, keeps those whose MIME type or file-name
   * extension is on the accepted lists, appends them to `attachedFiles`,
   * and finally clears the input's value.
   */
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const files = e.target.files
    if (!files) return

    // Accepted formats: PDF, Word (.docx/.doc), HTML, Markdown and plain text.
    const acceptedMimeTypes = new Set([
      'application/pdf',
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
      'application/msword',
      'text/html',
      'text/markdown',
      'text/plain',
    ])
    // The extension is checked as well as the MIME type; either match is enough.
    const acceptedExtensions = ['.pdf', '.docx', '.doc', '.html', '.htm', '.md', '.txt']

    const newFiles: AttachedFile[] = []
    for (const file of Array.from(files)) {
      console.log('🔧 Frontend: File selected:', {
        name: file.name,
        size: file.size,
        type: file.type,
        lastModified: file.lastModified
      });

      // Lower-case once so the extension match is case-insensitive.
      const lowerName = file.name.toLowerCase()
      const isSupported =
        acceptedMimeTypes.has(file.type) ||
        acceptedExtensions.some(ext => lowerName.endsWith(ext))

      if (isSupported) {
        newFiles.push({
          id: crypto.randomUUID(),
          name: file.name,
          size: file.size,
          type: file.type,
          file: file,
        })
      } else {
        console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
      }
    }

    setAttachedFiles(prev => [...prev, ...newFiles])

    // Reset the input
    if (fileInputRef.current) {
      fileInputRef.current.value = ''
    }
  }

  /** Removes the attached file whose `id` equals `fileId`. */
  const removeFile = (fileId: string) => {
    setAttachedFiles(prev => prev.filter(f => f.id !== fileId))
  }

  /**
   * Formats a byte count as a short human-readable string using 1024-based
   * units, e.g. 1536 -> "1.5 KB".
   *
   * Non-finite, zero or negative input yields "0 Bytes". The unit index is
   * clamped to the table, so very large values are expressed in the largest
   * unit instead of producing "... undefined".
   */
  const formatFileSize = (bytes: number) => {
    if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes'
    const k = 1024
    const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB']
    const rawIndex = Math.floor(Math.log(bytes) / Math.log(k))
    // Clamp: values below 1 stay in Bytes, values past the table stay in the last unit.
    const i = Math.min(sizes.length - 1, Math.max(0, rawIndex))
    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]
  }

  return (
{/* Attached Files Display */} {attachedFiles.length > 0 && (
Attached Files:
{attachedFiles.map((file) => (
{file.name}
{formatFileSize(file.size)}
))}
)}
{/* Hidden file input (kept for future use) */} {/* Textarea */}