Repository: PromtEngineer/localGPT
Branch: main
Commit: 4d41c7d1713b
Files: 134
Total size: 878.8 KB

Directory structure:
gitextract_pt0n86zf/

├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── pull_request_template.md
├── .gitignore
├── CONTRIBUTING.md
├── DOCKER_README.md
├── DOCKER_TROUBLESHOOTING.md
├── Dockerfile.backend
├── Dockerfile.frontend
├── Dockerfile.rag-api
├── Documentation/
│   ├── api_reference.md
│   ├── architecture_overview.md
│   ├── deployment_guide.md
│   ├── docker_usage.md
│   ├── improvement_plan.md
│   ├── indexing_pipeline.md
│   ├── installation_guide.md
│   ├── prompt_inventory.md
│   ├── quick_start.md
│   ├── retrieval_pipeline.md
│   ├── system_overview.md
│   ├── triage_system.md
│   └── verifier.md
├── LICENSE
├── README.md
├── WATSONX_README.md
├── backend/
│   ├── README.md
│   ├── database.py
│   ├── ollama_client.py
│   ├── requirements.txt
│   ├── server.py
│   ├── simple_pdf_processor.py
│   ├── test_backend.py
│   └── test_ollama_connectivity.py
├── batch_indexing_config.json
├── create_index_script.py
├── demo_batch_indexing.py
├── docker-compose.local-ollama.yml
├── docker-compose.yml
├── docker.env
├── env.example.watsonx
├── eslint.config.mjs
├── next.config.ts
├── package.json
├── postcss.config.mjs
├── rag_system/
│   ├── DOCUMENTATION.md
│   ├── README.md
│   ├── __init__.py
│   ├── agent/
│   │   ├── __init__.py
│   │   ├── loop.py
│   │   └── verifier.py
│   ├── api_server.py
│   ├── api_server_with_progress.py
│   ├── factory.py
│   ├── indexing/
│   │   ├── __init__.py
│   │   ├── contextualizer.py
│   │   ├── embedders.py
│   │   ├── graph_extractor.py
│   │   ├── latechunk.py
│   │   ├── multimodal.py
│   │   ├── overview_builder.py
│   │   └── representations.py
│   ├── ingestion/
│   │   ├── __init__.py
│   │   ├── chunking.py
│   │   ├── docling_chunker.py
│   │   └── document_converter.py
│   ├── main.py
│   ├── pipelines/
│   │   ├── __init__.py
│   │   ├── indexing_pipeline.py
│   │   └── retrieval_pipeline.py
│   ├── requirements.txt
│   ├── rerankers/
│   │   ├── __init__.py
│   │   ├── reranker.py
│   │   └── sentence_pruner.py
│   ├── retrieval/
│   │   ├── __init__.py
│   │   ├── query_transformer.py
│   │   └── retrievers.py
│   └── utils/
│       ├── batch_processor.py
│       ├── logging_utils.py
│       ├── ollama_client.py
│       ├── validate_model_config.py
│       └── watsonx_client.py
├── requirements-docker.txt
├── requirements.txt
├── run_system.py
├── setup_rag_system.sh
├── simple_create_index.sh
├── src/
│   ├── app/
│   │   ├── globals.css
│   │   ├── layout.tsx
│   │   └── page.tsx
│   ├── components/
│   │   ├── IndexForm.tsx
│   │   ├── IndexPicker.tsx
│   │   ├── IndexWizard.tsx
│   │   ├── LandingMenu.tsx
│   │   ├── Markdown.tsx
│   │   ├── ModelSelect.tsx
│   │   ├── SessionIndexInfo.tsx
│   │   ├── demo.tsx
│   │   └── ui/
│   │       ├── AccordionGroup.tsx
│   │       ├── GlassInput.tsx
│   │       ├── GlassSelect.tsx
│   │       ├── GlassToggle.tsx
│   │       ├── InfoTooltip.tsx
│   │       ├── avatar.tsx
│   │       ├── badge.tsx
│   │       ├── button.tsx
│   │       ├── chat-bubble-demo.tsx
│   │       ├── chat-bubble.tsx
│   │       ├── chat-input.tsx
│   │       ├── chat-settings-modal.tsx
│   │       ├── conversation-page.tsx
│   │       ├── dropdown-menu.tsx
│   │       ├── empty-chat-state.tsx
│   │       ├── localgpt-chat.tsx
│   │       ├── message-loading.tsx
│   │       ├── quick-chat.tsx
│   │       ├── scroll-area.tsx
│   │       ├── separator.tsx
│   │       ├── session-chat.tsx
│   │       ├── session-sidebar.tsx
│   │       ├── sidebar.tsx
│   │       ├── skeleton.tsx
│   │       └── textarea.tsx
│   ├── lib/
│   │   ├── api.ts
│   │   ├── types.ts
│   │   └── utils.ts
│   ├── test-upload.html
│   └── utils/
│       └── textNormalization.ts
├── start-docker.sh
├── system_health_check.py
├── tailwind.config.js
├── test_docker_build.sh
├── test_markdown_streaming.js
└── tsconfig.json

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve LocalGPT
title: '[BUG] '
labels: 'bug'
assignees: ''

---

## 🐛 Bug Description
A clear and concise description of what the bug is.

## 🔄 Steps to Reproduce
1. Go to '...'
2. Click on '...'
3. Scroll down to '...'
4. See error

## ✅ Expected Behavior
A clear and concise description of what you expected to happen.

## ❌ Actual Behavior
A clear and concise description of what actually happened.

## 📸 Screenshots
If applicable, add screenshots to help explain your problem.

## 🖥️ Environment Information
**Desktop/Server:**
- OS: [e.g. macOS 13.4, Ubuntu 20.04, Windows 11]
- Python Version: [e.g. 3.11.5]
- Node.js Version: [e.g. 23.10.0]
- Ollama Version: [e.g. 0.9.5]
- Docker Version: [e.g. 24.0.6] (if using Docker)

**Browser (if web interface issue):**
- Browser: [e.g. Chrome, Safari, Firefox]
- Version: [e.g. 118.0.0.0]

## 📋 System Health Check
Please run `python system_health_check.py` and paste the output:

```
[Paste system health check output here]
```

## 📝 Error Logs
Please include relevant error messages or logs:

```
[Paste error logs here]
```

## 🔧 Configuration
- Deployment method: [Docker / Direct Python]
- Models used: [e.g. qwen3:0.6b, qwen3:8b]
- Document types: [e.g. PDF, DOCX, TXT]

## 📎 Additional Context
Add any other context about the problem here.

## 🤔 Possible Solution
If you have ideas for fixing the issue, please share them here. 

================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for LocalGPT
title: '[FEATURE] '
labels: 'enhancement'
assignees: ''

---

## 🚀 Feature Request

### 📝 Is your feature request related to a problem? Please describe.
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

### 💡 Describe the solution you'd like
A clear and concise description of what you want to happen.

### 🔄 Describe alternatives you've considered
A clear and concise description of any alternative solutions or features you've considered.

### 🎯 Use Case
Describe the specific use case or scenario where this feature would be valuable:
- Who would use this feature?
- When would they use it?
- How would it improve their workflow?

### 📋 Acceptance Criteria
What would need to be implemented for this feature to be considered complete?
- [ ] Criterion 1
- [ ] Criterion 2
- [ ] Criterion 3

### 🏗️ Implementation Ideas
If you have ideas about how this could be implemented, please share:
- Which components would be affected?
- Any technical considerations?
- Potential challenges?

### 📊 Priority
How important is this feature to you?
- [ ] Critical - Blocking my use case
- [ ] High - Would significantly improve my workflow
- [ ] Medium - Nice to have
- [ ] Low - Minor improvement

### 📎 Additional Context
Add any other context, screenshots, mockups, or examples about the feature request here.

### 🔗 Related Issues
Link any related issues or discussions: 

================================================
FILE: .github/pull_request_template.md
================================================
## 📝 Description

Brief description of what this PR does.

Fixes #(issue number) <!-- If applicable -->

## 🎯 Type of Change

- [ ] 🐛 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (non-breaking change which adds functionality)
- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] 📚 Documentation update
- [ ] 🧪 Test improvements
- [ ] 🔧 Code refactoring
- [ ] 🎨 UI/UX improvements

## 🧪 Testing

### Test Environment
- [ ] Tested with Docker deployment
- [ ] Tested with direct Python deployment
- [ ] Tested on macOS
- [ ] Tested on Linux
- [ ] Tested on Windows

### Test Cases
- [ ] All existing tests pass
- [ ] New tests added for new functionality
- [ ] Manual testing completed
- [ ] System health check passes

```bash
# Commands used for testing
python system_health_check.py
python run_system.py --health
# Add any specific test commands here
```

## 📋 Checklist

### Code Quality
- [ ] Code follows the project's coding standards
- [ ] Self-review of the code completed
- [ ] Code is properly commented
- [ ] Type hints added (Python)
- [ ] No console.log statements left in production code

### Documentation
- [ ] Documentation updated (if applicable)
- [ ] API documentation updated (if applicable)
- [ ] README updated (if applicable)
- [ ] CONTRIBUTING.md guidelines followed

### Dependencies
- [ ] No new dependencies added, or new dependencies are justified
- [ ] requirements.txt updated (if applicable)
- [ ] package.json updated (if applicable)

## 🖥️ Screenshots (if applicable)

Add screenshots to help reviewers understand the changes.

## 📊 Performance Impact

Describe any performance implications:
- [ ] No performance impact
- [ ] Performance improved
- [ ] Performance may be affected (explain below)

## 🔄 Migration Notes

If this is a breaking change, describe what users need to do:
- [ ] No migration needed
- [ ] Migration steps documented below

## 📎 Additional Notes

Any additional information that reviewers should know. 

================================================
FILE: .gitignore
================================================
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions

# testing
/coverage

# next.js
/.next/
/out/

# production
/build

# misc
.DS_Store
*.pem

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*

# env files (can opt-in for committing if needed)
.env*

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts

# Python
__pycache__/
*.pyc

# Local Data
/index_store
/shared_uploads
chat_history.db
*.pkl

# Backend generated files
backend/shared_uploads/

# Vector DB artefacts
lancedb/
index_store/overviews/

# Logs and runtime output
logs/
*.log

# SQLite or other database files
*.db
#backend/*.db
# backend/chat_history.db
backend/chroma_db/
backend/chroma_db/**

# Document and user-uploaded files (PDFs, images, etc.)
rag_system/documents/
*.pdf

# Ensure docker.env and backend/chat_data.db remain tracked
!docker.env
!backend/chat_data.db


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to LocalGPT

Thank you for your interest in contributing to LocalGPT! This guide will help you get started with contributing to our private document intelligence platform.

## 🚀 Quick Start for Contributors

### Prerequisites
- Python 3.8+ (we test with 3.11.5)
- Node.js 16+ (we test with 23.10.0)
- Git
- Ollama (for local AI models)

### Development Setup

1. **Fork and Clone**
   ```bash
   # Fork the repository on GitHub, then clone your fork
   git clone https://github.com/YOUR_USERNAME/localGPT.git
   cd localGPT
   
   # Add upstream remote
   git remote add upstream https://github.com/PromtEngineer/localGPT.git
   ```

2. **Set Up Development Environment**
   ```bash
   # Install Python dependencies
   pip install -r requirements.txt
   
   # Install Node.js dependencies
   npm install
   
   # Install Ollama and models
   curl -fsSL https://ollama.ai/install.sh | sh
   ollama pull qwen3:0.6b
   ollama pull qwen3:8b
   ```

3. **Verify Setup**
   ```bash
   # Run health check
   python system_health_check.py
   
   # Start development system
   python run_system.py --mode dev
   ```

## 📋 Development Workflow

### Branch Strategy

We use a feature branch workflow:

- `main` - Production-ready code
- `docker` - Docker deployment features and documentation
- `feature/*` - New features
- `fix/*` - Bug fixes
- `docs/*` - Documentation updates

### Making Changes

1. **Create a Feature Branch**
   ```bash
   # Update your main branch
   git checkout main
   git pull upstream main
   
   # Create feature branch
   git checkout -b feature/your-feature-name
   ```

2. **Make Your Changes**
   - Follow our [coding standards](#coding-standards)
   - Write tests for new functionality
   - Update documentation as needed

3. **Test Your Changes**
   ```bash
   # Run health checks
   python system_health_check.py
   
   # Test specific components
   python -m pytest tests/ -v
   
   # Test system integration
   python run_system.py --health
   ```

4. **Commit Your Changes**
   ```bash
   git add .
   git commit -m "feat: add new feature description"
   ```

5. **Push and Create PR**
   ```bash
   git push origin feature/your-feature-name
   # Create pull request on GitHub
   ```

## 🎯 Types of Contributions

### 🐛 Bug Fixes
- Check existing issues first
- Include reproduction steps
- Add tests to prevent regression

### ✨ New Features
- Discuss in issues before implementing
- Follow existing architecture patterns
- Include comprehensive tests
- Update documentation

### 📚 Documentation
- Fix typos and improve clarity
- Add examples and use cases
- Update API documentation
- Improve setup guides

### 🧪 Testing
- Add unit tests
- Improve integration tests
- Add performance benchmarks
- Test edge cases

## 📝 Coding Standards

### Python Code Style

We follow PEP 8 with some modifications:

```python
# Use type hints
def process_document(file_path: str, config: Dict[str, Any]) -> ProcessingResult:
    """Process a document with the given configuration.
    
    Args:
        file_path: Path to the document file
        config: Processing configuration dictionary
        
    Returns:
        ProcessingResult object with metadata and chunks
    """
    pass

# Use descriptive variable names
embedding_model_name = "Qwen/Qwen3-Embedding-0.6B"
retrieval_results = retriever.search(query, top_k=20)

# Use dataclasses for structured data
@dataclass
class IndexingConfig:
    embedding_batch_size: int = 50
    enable_late_chunking: bool = True
    chunk_size: int = 512
```

### TypeScript/React Code Style

```typescript
// Use TypeScript interfaces
interface ChatMessage {
  id: string;
  content: string;
  role: 'user' | 'assistant';
  timestamp: Date;
  sources?: DocumentSource[];
}

// Use functional components with hooks
const ChatInterface: React.FC<ChatProps> = ({ sessionId }) => {
  const [messages, setMessages] = useState<ChatMessage[]>([]);
  
  const handleSendMessage = useCallback(async (content: string) => {
    // Implementation
  }, [sessionId]);
  
  return (
    <div className="chat-interface">
      {/* Component JSX */}
    </div>
  );
};
```

### File Organization

```
rag_system/
├── agent/           # ReAct agent implementation
├── indexing/        # Document processing and indexing
├── retrieval/       # Search and retrieval components
├── pipelines/       # End-to-end processing pipelines
├── rerankers/       # Result reranking implementations
└── utils/           # Shared utilities

src/
├── components/      # React components
├── lib/            # Utility functions and API clients
└── app/            # Next.js app router pages
```

## 🧪 Testing Guidelines

### Unit Tests
```python
# Test file: tests/test_embeddings.py
import numpy as np
import pytest
from rag_system.indexing.embedders import HuggingFaceEmbedder

def test_embedding_generation():
    embedder = HuggingFaceEmbedder("sentence-transformers/all-MiniLM-L6-v2")
    embeddings = embedder.create_embeddings(["test text"])
    
    assert embeddings.shape[0] == 1
    assert embeddings.shape[1] == 384  # Model dimension
    assert embeddings.dtype == np.float32
```

### Integration Tests
```python
# Test file: tests/test_integration.py
def test_end_to_end_indexing():
    """Test complete document indexing pipeline."""
    agent = get_agent("test")
    result = agent.index_documents(["test_document.pdf"])
    
    assert result.success
    assert len(result.indexed_chunks) > 0
```

### Frontend Tests
```typescript
// Test file: src/components/__tests__/ChatInterface.test.tsx
import { render, screen, fireEvent } from '@testing-library/react';
import { ChatInterface } from '../ChatInterface';

test('sends message when form is submitted', async () => {
  render(<ChatInterface sessionId="test-session" />);
  
  const input = screen.getByPlaceholderText('Type your message...');
  const button = screen.getByRole('button', { name: /send/i });
  
  fireEvent.change(input, { target: { value: 'test message' } });
  fireEvent.click(button);
  
  expect(screen.getByText('test message')).toBeInTheDocument();
});
```

## 📖 Documentation Standards

### Code Documentation
```python
def create_index(
    documents: List[str],
    config: IndexingConfig,
    progress_callback: Optional[Callable[[float], None]] = None
) -> IndexingResult:
    """Create a searchable index from documents.
    
    This function processes documents through the complete indexing pipeline:
    1. Text extraction and chunking
    2. Embedding generation
    3. Vector database storage
    4. BM25 index creation
    
    Args:
        documents: List of document file paths to index
        config: Indexing configuration with model settings and parameters
        progress_callback: Optional callback function for progress updates
        
    Returns:
        IndexingResult containing success status, metrics, and any errors
        
    Raises:
        IndexingError: If document processing fails
        ModelLoadError: If embedding model cannot be loaded
        
    Example:
        >>> config = IndexingConfig(embedding_batch_size=32)
        >>> result = create_index(["doc1.pdf", "doc2.pdf"], config)
        >>> print(f"Indexed {result.chunk_count} chunks")
    """
```

### API Documentation
```python
# Use OpenAPI/FastAPI documentation
@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest) -> ChatResponse:
    """Chat with indexed documents.
    
    Send a natural language query and receive an AI-generated response
    based on the indexed document collection.
    
    - **query**: The user's question or prompt
    - **session_id**: Chat session identifier
    - **search_type**: Type of search (vector, hybrid, bm25)
    - **retrieval_k**: Number of documents to retrieve
    
    Returns a response with the AI-generated answer and source documents.
    """
```

## 🔧 Development Tools

### Recommended VS Code Extensions
```json
{
  "recommendations": [
    "ms-python.python",
    "ms-python.pylint",
    "ms-python.black-formatter",
    "bradlc.vscode-tailwindcss",
    "esbenp.prettier-vscode",
    "ms-vscode.vscode-typescript-next"
  ]
}
```

### Pre-commit Hooks
```bash
# Install pre-commit
pip install pre-commit

# Set up hooks
pre-commit install

# Run manually
pre-commit run --all-files
```

### Development Scripts
```bash
# Lint Python code
python -m pylint rag_system/

# Format Python code
python -m black rag_system/

# Type check
python -m mypy rag_system/

# Lint TypeScript
npm run lint

# Format TypeScript
npm run format
```

## 🐛 Issue Reporting

### Bug Reports
When reporting bugs, please include:

1. **Environment Information**
   ```
   - OS: macOS 13.4
   - Python: 3.11.5
   - Node.js: 23.10.0
   - Ollama: 0.9.5
   ```

2. **Steps to Reproduce**
   ```
   1. Start system with `python run_system.py`
   2. Upload document via web interface
   3. Ask question "What is this document about?"
   4. Error occurs during response generation
   ```

3. **Expected vs Actual Behavior**
4. **Error Messages and Logs**
5. **Screenshots (if applicable)**

### Feature Requests
Include:
- **Use Case**: Why is this feature needed?
- **Proposed Solution**: How should it work?
- **Alternatives**: What other approaches were considered?
- **Additional Context**: Any relevant examples or references

## 📦 Release Process

### Version Numbering
We use semantic versioning (semver):
- `MAJOR.MINOR.PATCH`
- Major: Breaking changes
- Minor: New features (backward compatible)
- Patch: Bug fixes

### Release Checklist
- [ ] All tests pass
- [ ] Documentation updated
- [ ] Version bumped in relevant files
- [ ] Changelog updated
- [ ] Docker images built and tested
- [ ] Release notes prepared

## 🤝 Community Guidelines

### Code of Conduct
- Be respectful and inclusive
- Focus on constructive feedback
- Help others learn and grow
- Maintain professional communication

### Getting Help
- **GitHub Issues**: For bugs and feature requests
- **GitHub Discussions**: For questions and general discussion
- **Documentation**: Check existing docs first
- **Code Review**: Provide thoughtful, actionable feedback

## 🎯 Project Priorities

### Current Focus Areas
1. **Performance Optimization**: Improving indexing and retrieval speed
2. **Model Support**: Adding more embedding and generation models
3. **User Experience**: Enhancing the web interface
4. **Documentation**: Improving setup and usage guides
5. **Testing**: Expanding test coverage

### Architecture Goals
- **Modularity**: Components should be loosely coupled
- **Extensibility**: Easy to add new models and features
- **Performance**: Optimize for speed and memory usage
- **Reliability**: Robust error handling and recovery
- **Privacy**: Keep user data secure and local

## 📚 Additional Resources

### Learning Resources
- [RAG System Architecture Overview](Documentation/architecture_overview.md)
- [API Reference](Documentation/api_reference.md)
- [Deployment Guide](Documentation/deployment_guide.md)
- [Troubleshooting Guide](DOCKER_TROUBLESHOOTING.md)

### External References
- [LangChain Documentation](https://python.langchain.com/)
- [Ollama Documentation](https://ollama.ai/docs)
- [Next.js Documentation](https://nextjs.org/docs)
- [FastAPI Documentation](https://fastapi.tiangolo.com/)

---

## 🙏 Thank You!

Thank you for contributing to LocalGPT! Your contributions help make private document intelligence accessible to everyone.

For questions about contributing, please:
1. Check existing documentation
2. Search existing issues
3. Create a new issue with the `question` label
4. Join our community discussions

Happy coding! 🚀 

================================================
FILE: DOCKER_README.md
================================================
# 🐳 LocalGPT Docker Deployment Guide

This guide covers running LocalGPT using Docker containers with local Ollama for optimal performance.

## 🚀 Quick Start

### Complete Setup (5 Minutes)
```bash
# 1. Install Ollama locally
curl -fsSL https://ollama.ai/install.sh | sh

# 2. Start Ollama server
ollama serve

# 3. Install required models (in another terminal)
ollama pull qwen3:0.6b
ollama pull qwen3:8b

# 4. Clone and start LocalGPT
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT
./start-docker.sh

# 5. Access the application
open http://localhost:3000
```

## 📋 Prerequisites

- **Docker Desktop** installed and running
- **Ollama** installed locally (required for best performance)
- **8GB+ RAM** (16GB recommended for larger models)
- **10GB+ free disk space**

## 🏗️ Architecture

### Current Setup (Local Ollama + Docker Containers)
```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│   Frontend      │────│    Backend      │────│    RAG API      │
│  (Container)    │    │  (Container)    │    │  (Container)    │
│   Port: 3000    │    │   Port: 8000    │    │   Port: 8001    │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                                                        │
                                                        │ API calls
                                                        ▼
                                               ┌─────────────────┐
                                               │     Ollama      │
                                               │ (Local/Host)    │
                                               │   Port: 11434   │
                                               └─────────────────┘
```

**Why Local Ollama?**
- ✅ Better performance (direct GPU access)
- ✅ Simpler setup (one less container)
- ✅ Easier model management
- ✅ More reliable connection

## 🛠️ Container Details

### Frontend Container (rag-frontend)
- **Image**: Custom Node.js 18 build
- **Port**: 3000
- **Purpose**: Next.js web interface
- **Health Check**: HTTP GET to /
- **Memory**: ~500MB

### Backend Container (rag-backend) 
- **Image**: Custom Python 3.11 build
- **Port**: 8000
- **Purpose**: Session management, chat history, API gateway
- **Health Check**: HTTP GET to /health
- **Memory**: ~300MB

### RAG API Container (rag-api)
- **Image**: Custom Python 3.11 build
- **Port**: 8001
- **Purpose**: Document indexing, retrieval, AI processing
- **Health Check**: HTTP GET to /models
- **Memory**: ~2GB (varies with model usage)

## 📂 Volume Mounts & Data

### Persistent Data
- `./lancedb/` → Vector database storage
- `./index_store/` → Document indexes and metadata
- `./shared_uploads/` → Uploaded document files
- `./backend/chat_data.db` → SQLite chat history database

### Shared Between Containers
All containers share access to document storage and databases through bind mounts.

## 🔧 Configuration

### Environment Variables (docker.env)
```bash
# Ollama Configuration
OLLAMA_HOST=http://host.docker.internal:11434

# Service Configuration  
NODE_ENV=production
RAG_API_URL=http://rag-api:8001
NEXT_PUBLIC_API_URL=http://localhost:8000

# Database Paths (inside containers)
DATABASE_PATH=/app/backend/chat_data.db
LANCEDB_PATH=/app/lancedb
UPLOADS_PATH=/app/shared_uploads
```

### Model Configuration
The system uses these models by default:
- **Embedding**: `Qwen/Qwen3-Embedding-0.6B` (1024 dimensions)
- **Generation**: `qwen3:0.6b` (fast) or `qwen3:8b` (high quality)
- **Reranking**: Built-in cross-encoder

## 🎯 Management Commands

### Start/Stop Services
```bash
# Start all services
./start-docker.sh

# Stop all services
./start-docker.sh stop

# Restart services
./start-docker.sh stop && ./start-docker.sh
```

### Monitor Services
```bash
# Check container status
./start-docker.sh status
docker compose ps

# View live logs
./start-docker.sh logs
docker compose logs -f

# View specific service logs
docker compose logs -f rag-api
docker compose logs -f backend
docker compose logs -f frontend
```

### Manual Docker Compose
```bash
# Start manually
docker compose --env-file docker.env up --build -d

# Stop manually
docker compose down

# Rebuild specific service
docker compose build --no-cache rag-api
docker compose up -d rag-api
```

### Health Checks
```bash
# Test all endpoints
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"
```

## 🐞 Debugging

### Access Container Shells
```bash
# RAG API container (most debugging happens here)
docker compose exec rag-api bash

# Backend container
docker compose exec backend bash

# Frontend container
docker compose exec frontend sh
```

### Common Debug Commands
```bash
# Test RAG system initialization
docker compose exec rag-api python -c "
from rag_system.main import get_agent
agent = get_agent('default')
print('✅ RAG System OK')
"

# Test Ollama connection from container
docker compose exec rag-api curl http://host.docker.internal:11434/api/tags

# Check environment variables
docker compose exec rag-api env | grep OLLAMA

# View Python packages
docker compose exec rag-api pip list | grep -E "(torch|transformers|lancedb)"
```

### Resource Monitoring
```bash
# Monitor container resources
docker stats

# Check disk usage
docker system df
df -h ./lancedb ./shared_uploads

# Check memory usage by service
docker stats --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.MemPerc}}"
```

## 🚨 Troubleshooting

### Common Issues

#### Container Won't Start
```bash
# Check logs for specific error
docker compose logs [service-name]

# Rebuild from scratch
./start-docker.sh stop
docker system prune -f
./start-docker.sh

# Check for port conflicts
lsof -i :3000 -i :8000 -i :8001
```

#### Can't Connect to Ollama
```bash
# Verify Ollama is running
curl http://localhost:11434/api/tags

# Restart Ollama
pkill ollama
ollama serve

# Test from container
docker compose exec rag-api curl http://host.docker.internal:11434/api/tags
```

#### Memory Issues
```bash
# Check memory usage
docker stats --no-stream
free -h  # On a Linux host (on macOS use `vm_stat` or Activity Monitor)

# Increase Docker memory limit
# Docker Desktop → Settings → Resources → Memory → 8GB+

# Use smaller models
ollama pull qwen3:0.6b  # Instead of qwen3:8b
```

#### Frontend Build Errors
```bash
# Clean build
docker compose build --no-cache frontend
docker compose up -d frontend

# Check frontend logs
docker compose logs frontend
```

#### Database/Storage Issues
```bash
# Check file permissions
ls -la backend/chat_data.db
ls -la lancedb/

# Reset permissions
chmod 664 backend/chat_data.db
chmod -R 755 lancedb/ shared_uploads/

# Test database access
docker compose exec backend sqlite3 /app/backend/chat_data.db ".tables"
```

### Performance Issues

#### Slow Response Times
- Use faster models: `qwen3:0.6b` instead of `qwen3:8b`
- Increase Docker memory allocation
- Ensure SSD storage for databases
- Monitor with `docker stats`

#### High Memory Usage
- Reduce batch sizes in configuration
- Use smaller embedding models
- Clear unused Docker resources: `docker system prune`

### Complete Reset
```bash
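# Nuclear option - reset everything
# WARNING: `docker system prune -a --volumes` removes every unused Docker resource
# on this machine (images, stopped containers, networks, volumes), not only
# LocalGPT's, and the `rm -rf` below permanently deletes your vector store,
# uploaded documents, and chat history.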
./start-docker.sh stop
docker system prune -a --volumes
rm -rf lancedb/* shared_uploads/* backend/chat_data.db
./start-docker.sh
```

## 🏆 Success Criteria

Your Docker deployment is successful when:

- ✅ `./start-docker.sh status` shows all containers healthy
- ✅ All health checks pass (see commands above)  
- ✅ You can access http://localhost:3000
- ✅ You can upload documents and create indexes
- ✅ You can chat with your documents
- ✅ No errors in container logs

### Performance Benchmarks

**Good Performance:**
- Container startup: < 2 minutes
- Index creation: < 2 min per 100MB document
- Query response: < 30 seconds
- Memory usage: < 4GB total containers

**Optimal Performance:**
- Container startup: < 1 minute
- Index creation: < 1 min per 100MB document  
- Query response: < 10 seconds
- Memory usage: < 2GB total containers

## 📚 Additional Resources

- **Detailed Troubleshooting**: See `DOCKER_TROUBLESHOOTING.md`
- **Complete Documentation**: See `Documentation/docker_usage.md`
- **System Architecture**: See `Documentation/architecture_overview.md`
- **Direct Development**: See main `README.md` for non-Docker setup

---

**Happy Dockerizing! 🐳** Need help? Check the troubleshooting guide or open an issue. 

================================================
FILE: DOCKER_TROUBLESHOOTING.md
================================================
# 🐳 Docker Troubleshooting Guide - LocalGPT

_Last updated: 2025-01-07_

This guide helps diagnose and fix Docker-related issues with LocalGPT's containerized deployment.

---

## 🏁 Quick Health Check

### System Status Check
```bash
# Check Docker daemon
docker version

# Check Ollama status  
curl http://localhost:11434/api/tags

# Check containers
./start-docker.sh status

# Test all endpoints
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"
```

### Expected Success Output
```
✅ Frontend OK
✅ Backend OK
✅ RAG API OK
✅ Ollama OK
```

---

## 🚨 Common Issues & Solutions

### 1. Docker Daemon Issues

#### Problem: "Cannot connect to Docker daemon"
```
Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
```

#### Solution A: Restart Docker Desktop (macOS/Windows)
```bash
# Quit Docker Desktop completely
# macOS: Click Docker icon → "Quit Docker Desktop"
# Windows: Right-click Docker icon → "Quit Docker Desktop"

# Wait for it to fully shut down
sleep 10

# Start Docker Desktop
open -a Docker  # macOS
# Windows: Click Docker Desktop from Start menu

# Wait for Docker to be ready (2-3 minutes)
docker version
```

#### Solution B: Linux Docker Service
```bash
# Check Docker service status
sudo systemctl status docker

# Restart Docker service
sudo systemctl restart docker

# Enable auto-start
sudo systemctl enable docker

# Test connection
docker version
```

#### Solution C: Hard Reset
```bash
# Kill all Docker processes
sudo pkill -f docker

# Remove socket files
sudo rm -f /var/run/docker.sock
sudo rm -f "$HOME/.docker/run/docker.sock"  # macOS

# Restart Docker Desktop
open -a Docker  # macOS
```

### 2. Ollama Connection Issues

#### Problem: RAG API can't connect to Ollama
```
ConnectionError: Failed to connect to Ollama at http://host.docker.internal:11434
```

#### Solution A: Verify Ollama is Running
```bash
# Check if Ollama is running
curl http://localhost:11434/api/tags

# If not running, start it
ollama serve

# Install required models
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

#### Solution B: Test from Container
```bash
# Test Ollama connection from RAG API container
docker compose exec rag-api curl http://host.docker.internal:11434/api/tags

# If this fails, check Docker network settings
docker network ls
# Inspect the Compose network listed above (usually named <project-directory>_default)
docker network inspect <project-directory>_default
```

#### Solution C: Alternative Ollama Host
```bash
# Append an override to docker.env that points at the default Docker bridge gateway (Linux)
echo "OLLAMA_HOST=http://172.17.0.1:11434" >> docker.env

# Or point at the host machine's LAN IP address
echo "OLLAMA_HOST=http://$(ipconfig getifaddr en0):11434" >> docker.env  # macOS
```

### 3. Container Build Failures

#### Problem: Frontend build fails
```
ERROR: Failed to build frontend container
```

#### Solution: Clean Build
```bash
# Stop containers
./start-docker.sh stop

# Clean Docker cache
docker system prune -f
docker builder prune -f

# Rebuild frontend only
docker compose build --no-cache frontend
docker compose up -d frontend

# Check logs
docker compose logs frontend
```

#### Problem: Python package installation fails
```
ERROR: Could not install packages due to an EnvironmentError
```

#### Solution: Update Dependencies
```bash
# Check requirements file exists
ls -la requirements-docker.txt

# Test package installation locally
pip install -r requirements-docker.txt --dry-run

# Rebuild with updated base image
docker compose build --no-cache --pull rag-api
```

### 4. Port Conflicts

#### Problem: "Port already in use"
```
Error starting userland proxy: listen tcp4 0.0.0.0:3000: bind: address already in use
```

#### Solution: Find and Kill Conflicting Processes
```bash
# Check what's using the ports
lsof -i :3000 -i :8000 -i :8001

# Kill specific processes
pkill -f "npm run dev"      # Frontend
pkill -f "server.py"        # Backend
pkill -f "api_server"       # RAG API

# Or kill by port
sudo kill -9 $(lsof -t -i:3000)
sudo kill -9 $(lsof -t -i:8000)
sudo kill -9 $(lsof -t -i:8001)

# Restart containers
./start-docker.sh
```

### 5. Memory Issues

#### Problem: Containers crash due to OOM (Out of Memory)
```
Container killed due to memory limit
```

#### Solution: Increase Docker Memory
```bash
# Check current memory usage
docker stats --no-stream

# Increase Docker Desktop memory allocation
# Docker Desktop → Settings → Resources → Memory → 8GB+

# Monitor memory usage
docker stats

# Use smaller models if needed
ollama pull qwen3:0.6b  # Instead of qwen3:8b
```

#### Problem: System running slow
```bash
# Check host memory
free -h  # Linux
vm_stat  # macOS

# Clean up Docker resources
docker system prune -f
docker volume prune -f
```

### 6. Volume Mount Issues

#### Problem: Permission denied accessing files
```
Permission denied: /app/lancedb
```

#### Solution: Fix Permissions
```bash
# Create directories if they don't exist
mkdir -p lancedb index_store shared_uploads backend

# Fix permissions
chmod -R 755 lancedb index_store shared_uploads
chmod 664 backend/chat_data.db

# Check ownership
ls -la lancedb/ shared_uploads/ backend/

# Reset permissions if needed
sudo chown -R $USER:$USER lancedb shared_uploads backend
```

#### Problem: Database file not found
```
No such file or directory: '/app/backend/chat_data.db'
```

#### Solution: Initialize Database
```bash
# Create empty database file
touch backend/chat_data.db

# Or initialize with schema
python -c "
from backend.database import ChatDatabase
db = ChatDatabase()
db.init_database()
print('Database initialized')
"

# Restart containers
./start-docker.sh stop
./start-docker.sh
```

---

## 🔍 Advanced Debugging

### Container-Level Debugging

#### Access Container Shells
```bash
# RAG API container (most issues happen here)
docker compose exec rag-api bash

# Check environment variables
docker compose exec rag-api env | grep -E "(OLLAMA|RAG|NODE)"

# Test Python imports
docker compose exec rag-api python -c "
import sys
print('Python version:', sys.version)
from rag_system.main import get_agent
print('✅ RAG system imports work')
"

# Backend container
docker compose exec backend bash
python -c "
from backend.database import ChatDatabase
print('✅ Database imports work')
"

# Frontend container  
docker compose exec frontend sh
npm --version
node --version
```

#### Check Container Resources
```bash
# Monitor real-time resource usage
docker stats

# Check individual container health
docker compose ps
docker inspect rag-api --format='{{.State.Health.Status}}'

# View container configurations
docker compose config
```

#### Network Debugging
```bash
# Check network connectivity
docker compose exec rag-api ping backend
docker compose exec backend ping rag-api
docker compose exec rag-api ping host.docker.internal

# Check DNS resolution
docker compose exec rag-api nslookup host.docker.internal

# Test HTTP connections
docker compose exec rag-api curl -v http://backend:8000/health
docker compose exec rag-api curl -v http://host.docker.internal:11434/api/tags
```

### Log Analysis

#### Container Logs
```bash
# View all logs
./start-docker.sh logs

# Follow specific service logs
docker compose logs -f rag-api
docker compose logs -f backend
docker compose logs -f frontend

# Search for errors
docker compose logs rag-api 2>&1 | grep -i error
docker compose logs backend 2>&1 | grep -i "traceback\|error"

# Save logs to file
docker compose logs > docker-debug.log 2>&1
```

#### System Logs
```bash
# Docker daemon logs (Linux)
journalctl -u docker.service -f

# macOS: Check Console app for Docker logs
# Windows: Check Event Viewer
```

---

## 🧪 Testing & Validation

### Manual Container Testing

#### Test Individual Containers
```bash
# Test RAG API alone
docker build -f Dockerfile.rag-api -t test-rag-api .
docker run --rm -p 8001:8001 -e OLLAMA_HOST=http://host.docker.internal:11434 test-rag-api &
sleep 30
curl http://localhost:8001/models
pkill -f test-rag-api

# Test Backend alone
docker build -f Dockerfile.backend -t test-backend .
docker run --rm -p 8000:8000 test-backend &
sleep 30
curl http://localhost:8000/health
pkill -f test-backend
```

#### Integration Testing
```bash
# Full system test
./start-docker.sh

# Wait for all services to be ready
sleep 60

# Test complete workflow
curl -X POST http://localhost:8000/sessions \
  -H "Content-Type: application/json" \
  -d '{"title": "Test Session"}'

# Test document upload (if you have a test PDF)
# curl -X POST http://localhost:8000/upload -F "file=@test.pdf"

# Clean up
./start-docker.sh stop
```

### Automated Testing Script

Create `test-docker-health.sh`:
```bash
#!/bin/bash
set -e

echo "🐳 Docker Health Test Starting..."

# Start containers
./start-docker.sh

# Wait for services
echo "⏳ Waiting for services to start..."
sleep 60

# Test endpoints
echo "🔍 Testing endpoints..."
curl -f http://localhost:3000 && echo "✅ Frontend OK" || echo "❌ Frontend FAIL"
curl -f http://localhost:8000/health && echo "✅ Backend OK" || echo "❌ Backend FAIL"  
curl -f http://localhost:8001/models && echo "✅ RAG API OK" || echo "❌ RAG API FAIL"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK" || echo "❌ Ollama FAIL"

# Test container health
echo "🔍 Checking container health..."
docker compose ps

echo "🎉 Health test complete!"
```

---

## 🔄 Recovery Procedures

### Complete System Reset

#### Soft Reset
```bash
# Stop containers
./start-docker.sh stop

# Clean up Docker resources
docker system prune -f

# Restart containers
./start-docker.sh
```

#### Hard Reset (⚠️ Deletes all data)
```bash
# Stop everything
./start-docker.sh stop

# Remove ALL unused containers, images, and volumes
# (⚠️ system-wide: this affects every Docker project on this machine, not just LocalGPT)
docker system prune -a --volumes

# Remove local data (CAUTION: This deletes all your documents and chat history)
rm -rf lancedb/* shared_uploads/* backend/chat_data.db

# Rebuild from scratch
./start-docker.sh
```

#### Selective Reset

Reset only specific components:
```bash
# Reset just the database
./start-docker.sh stop
rm backend/chat_data.db
./start-docker.sh

# Reset just vector storage
./start-docker.sh stop
rm -rf lancedb/*
./start-docker.sh

# Reset just uploaded documents
rm -rf shared_uploads/*
```

---

## 📊 Performance Optimization

### Resource Monitoring
```bash
# Monitor containers continuously
watch -n 5 'docker stats --no-stream'

# Check disk usage
docker system df
du -sh lancedb shared_uploads backend

# Monitor host resources
htop  # Linux
top   # macOS/Windows
```

### Performance Tuning
```bash
# Use smaller models for better performance
ollama pull qwen3:0.6b  # Instead of qwen3:8b

# Reduce Docker memory if needed
# Docker Desktop → Settings → Resources → Memory

# Clean up regularly
docker system prune -f
docker volume prune -f
```

---

## 🆘 When All Else Fails

### Alternative Deployment Options

#### 1. Direct Development (No Docker)
```bash
# Stop Docker containers
./start-docker.sh stop

# Use direct development instead
python run_system.py
```

#### 2. Minimal Docker (RAG API only)
```bash
# Run only RAG API in Docker
docker build -f Dockerfile.rag-api -t rag-api .
docker run -p 8001:8001 rag-api

# Run other components directly
cd backend && python server.py &
npm run dev
```

#### 3. Hybrid Approach
```bash
# Run some services in Docker, others directly
docker compose up -d rag-api
cd backend && python server.py &
npm run dev
```

### Getting Help

#### Diagnostic Information to Collect
```bash
# System information
docker version
docker compose version
uname -a

# Container information
docker compose ps
docker compose config

# Resource information
docker stats --no-stream
docker system df

# Error logs
docker compose logs > docker-errors.log 2>&1
```

#### Support Channels
1. **Check GitHub Issues**: Search existing issues for similar problems
2. **Documentation**: Review the complete documentation in `Documentation/`
3. **Create Issue**: Include diagnostic information above

---

## ✅ Success Checklist

Your Docker deployment is working correctly when:

- ✅ `docker version` shows Docker is running
- ✅ `curl http://localhost:11434/api/tags` shows Ollama is accessible
- ✅ `./start-docker.sh status` shows all containers healthy
- ✅ All health check URLs return 200 OK
- ✅ You can access the frontend at http://localhost:3000
- ✅ You can create document indexes successfully
- ✅ You can chat with your documents
- ✅ No error messages in container logs

**If all boxes are checked, your Docker deployment is successful! 🎉**

---

**Still having issues?** Check the main `DOCKER_README.md` or create an issue with your diagnostic information. 

================================================
FILE: Dockerfile.backend
================================================
# Backend image: runs backend/server.py and exposes port 8000.
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below);
# the apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies (using Docker-specific requirements).
# This happens before the source is copied so the dependency layer can be cached
# when only application code changes.
COPY requirements-docker.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy backend code and the rag_system package it is shipped with
COPY backend/ ./backend/
COPY rag_system/ ./rag_system/

# Create runtime directories under /app.
# NOTE(review): nothing here initializes a database at build time; presumably
# server.py creates it on startup. Confirm before relying on it.
RUN mkdir -p shared_uploads logs backend

# Expose port
EXPOSE 8000

# Health check: probe the /health endpoint; mark unhealthy after 3 failed attempts
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the backend server from inside the backend directory
WORKDIR /app/backend
CMD ["python", "server.py"]  

================================================
FILE: Dockerfile.frontend
================================================
# Frontend image: builds the Next.js app and serves it with `npm start` on port 3000.
FROM node:18-alpine

# Set working directory
WORKDIR /app

# Install curl for the HEALTHCHECK below. The Alpine-based Node image does not
# ship curl, so without this the probe fails with "curl: not found" and the
# container is always reported as unhealthy.
RUN apk add --no-cache curl

# Install dependencies (including dev dependencies for build).
# Only the manifests are copied first so this layer is cached until they change.
COPY package.json package-lock.json ./
RUN npm ci

# Copy source code and configuration files
COPY src/ ./src/
COPY public/ ./public/
COPY next.config.ts ./
COPY tsconfig.json ./
COPY tailwind.config.js ./
COPY postcss.config.mjs ./
COPY eslint.config.mjs ./

# Build the application (skip linting for Docker)
# NOTE(review): confirm that the build actually honours NEXT_LINT; Next.js
# normally disables build-time linting via `eslint.ignoreDuringBuilds` in next.config.
ENV NEXT_LINT=false
RUN npm run build

# Expose port
EXPOSE 3000

# Health check: the longer start period gives the server time to boot before probing
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:3000 || exit 1

# Start the application
CMD ["npm", "start"]

================================================
FILE: Dockerfile.rag-api
================================================
# RAG API image: runs the rag_system.api_server module and exposes port 8001.
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies: curl is required by the HEALTHCHECK below.
# NOTE(review): build-essential is presumably needed to compile Python packages
# with native extensions during pip install; confirm it is still required.
RUN apt-get update && apt-get install -y \
    curl \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies (using Docker-specific requirements).
# This happens before the source is copied so the dependency layer can be cached
# when only application code changes.
COPY requirements-docker.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy RAG system code and backend dependencies
COPY rag_system/ ./rag_system/
COPY backend/ ./backend/

# Create necessary directories under /app
RUN mkdir -p lancedb index_store shared_uploads logs

# Expose port
EXPOSE 8001

# Health check: probe the /models endpoint; mark unhealthy after 3 failed attempts
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8001/models || exit 1

# Run the RAG API server as a module from /app
CMD ["python", "-m", "rag_system.api_server"] 

================================================
FILE: Documentation/api_reference.md
================================================
# 📚 API Reference (Backend & RAG API)

_Last updated: 2025-01-07_

---

## Backend HTTP API (Python `backend/server.py`)
**Base URL**: `http://localhost:8000`

| Endpoint | Method | Description | Request Body | Success Response |
|----------|--------|-------------|--------------|------------------|
| `/health` | GET | Health probe incl. Ollama status & DB stats | – | 200 JSON `{ status, ollama_running, available_models, database_stats }` |
| `/chat` | POST | Stateless chat (no session) | `{ message:str, model?:str, conversation_history?:[{role,content}]}` | 200 `{ response:str, model:str, message_count:int }` |
| `/sessions` | GET | List all sessions | – | `{ sessions:ChatSession[], total:int }` |
| `/sessions` | POST | Create session | `{ title?:str, model?:str }` | 201 `{ session:ChatSession, session_id }` |
| `/sessions/<id>` | GET | Get session + msgs | – | `{ session, messages }` |
| `/sessions/<id>` | DELETE | Delete session | – | `{ message, deleted_session_id }` |
| `/sessions/<id>/rename` | POST | Rename session | `{ title:str }` | `{ message, session }` |
| `/sessions/<id>/messages` | POST | Session chat (builds history) | See ChatRequest + retrieval opts ▼ | `{ response, session, user_message_id, ai_message_id }` |
| `/sessions/<id>/documents` | GET | List uploaded docs | – | `{ files:string[], file_count:int, session }` |
| `/sessions/<id>/upload` | POST multipart | Upload docs to session | field `files[]` | `{ message, uploaded_files, processing_results?, session_documents?, total_session_documents? }` |
| `/sessions/<id>/index` | POST | Trigger RAG indexing for session | `{ latechunk?, doclingChunk?, chunkSize?, ... }` | `{ message }` |
| `/sessions/<id>/indexes` | GET | List indexes linked to session | – | `{ indexes, total }` |
| `/sessions/<sid>/indexes/<idxid>` | POST | Link index to session | – | `{ message }` |
| `/sessions/cleanup` | GET | Remove empty sessions | – | `{ message, cleanup_count }` |
| `/models` | GET | List generation / embedding models | – | `{ generation_models:str[], embedding_models:str[] }` |
| `/indexes` | GET | List all indexes | – | `{ indexes, total }` |
| `/indexes` | POST | Create index | `{ name:str, description?:str, metadata?:dict }` | `{ index_id }` |
| `/indexes/<id>` | GET | Get single index | – | `{ index }` |
| `/indexes/<id>` | DELETE | Delete index | – | `{ message, index_id }` |
| `/indexes/<id>/upload` | POST multipart | Upload docs to index | field `files[]` | `{ message, uploaded_files }` |
| `/indexes/<id>/build` | POST | Build / rebuild index (RAG) | `{ latechunk?, doclingChunk?, ...}` | 200 `{ response?, message?}` (idempotent) |

---

## RAG API (Python `rag_system/api_server.py`)
**Base URL**: `http://localhost:8001`

| Endpoint | Method | Description | Request Body | Success Response |
|----------|--------|-------------|--------------|------------------|
| `/chat` | POST | Run RAG query with full pipeline | See RAG ChatRequest ▼ | `{ answer:str, source_documents:[], reasoning?:str, confidence?:float }` |
| `/chat/stream` | POST | Run RAG query with SSE streaming | Same as /chat | Server-Sent Events stream |
| `/index` | POST | Index documents with full configuration | See Index Request ▼ | `{ message:str, indexed_files:[], table_name:str }` |
| `/models` | GET | List available models | – | `{ generation_models:str[], embedding_models:str[] }` |

### RAG ChatRequest (Advanced Options)
```jsonc
{
  "query": "string",                    // Required – user question
  "session_id": "string",               // Optional – for session context
  "table_name": "string",               // Optional – specific index table
  "compose_sub_answers": true,          // Optional – compose sub-answers 
  "query_decompose": true,              // Optional – decompose complex queries
  "ai_rerank": false,                   // Optional – AI-powered reranking
  "context_expand": false,              // Optional – context expansion
  "verify": true,                       // Optional – answer verification
  "retrieval_k": 20,                    // Optional – number of chunks to retrieve
  "context_window_size": 1,             // Optional – context window size
  "reranker_top_k": 10,                 // Optional – top-k after reranking
  "search_type": "hybrid",              // Optional – "hybrid|dense|fts"
  "dense_weight": 0.7,                  // Optional – dense search weight (0-1)
  "force_rag": false,                   // Optional – bypass triage, force RAG
  "provence_prune": false,              // Optional – sentence-level pruning
  "provence_threshold": 0.8,            // Optional – pruning threshold
  "model": "qwen3:8b"                   // Optional – generation model override
}
```

### Index Request (Document Indexing)
```jsonc
{
  "file_paths": ["path1.pdf", "path2.pdf"],  // Required – files to index
  "session_id": "string",                     // Required – session identifier
  "chunk_size": 512,                          // Optional – chunk size (default: 512)
  "chunk_overlap": 64,                        // Optional – chunk overlap (default: 64)
  "enable_latechunk": true,                   // Optional – enable late chunking
  "enable_docling_chunk": false,              // Optional – enable DocLing chunking
  "retrieval_mode": "hybrid",                 // Optional – "hybrid|dense|fts"
  "window_size": 2,                           // Optional – context window
  "enable_enrich": true,                      // Optional – enable enrichment
  "embedding_model": "Qwen/Qwen3-Embedding-0.6B",  // Optional – embedding model
  "enrich_model": "qwen3:0.6b",               // Optional – enrichment model
  "overview_model_name": "qwen3:0.6b",        // Optional – overview model
  "batch_size_embed": 50,                     // Optional – embedding batch size
  "batch_size_enrich": 25                     // Optional – enrichment batch size
}
```

> **Note on CORS** – All endpoints include the `Access-Control-Allow-Origin: *` header.

---

## Frontend Wrapper (`src/lib/api.ts`)
The React/Next.js frontend calls the backend via a typed wrapper. Important methods & payloads:

| Method | Backend Endpoint | Payload Shape |
|--------|------------------|---------------|
| `checkHealth()` | `/health` | – |
| `sendMessage({ message, model?, conversation_history? })` | `/chat` | ChatRequest |
| `getSessions()` | `/sessions` | – |
| `createSession(title?, model?)` | `/sessions` | – |
| `getSession(sessionId)` | `/sessions/<id>` | – |
| `sendSessionMessage(sessionId, message, opts)` | `/sessions/<id>/messages` | `ChatRequest + retrieval opts` |
| `uploadFiles(sessionId, files[])` | `/sessions/<id>/upload` | multipart |
| `indexDocuments(sessionId)` | `/sessions/<id>/index` | opts similar to buildIndex |
| `buildIndex(indexId, opts)` | `/indexes/<id>/build` | Index build options |
| `linkIndexToSession` | `/sessions/<sid>/indexes/<idx>` | – |

---

## Payload Definitions (Canonical)

### ChatRequest (frontend ⇄ backend)
```jsonc
{
  "message": "string",              // Required – raw user text
  "model": "string",                // Optional – generation model id
  "conversation_history": [         // Optional – prior turn list
    { "role": "user|assistant", "content": "string" }
  ]
}
```

### Session Chat Extended Options
```jsonc
{
  "composeSubAnswers": true,
  "decompose": true,
  "aiRerank": false,
  "contextExpand": false,
  "verify": true,
  "retrievalK": 10,
  "contextWindowSize": 5,
  "rerankerTopK": 20,
  "searchType": "fts|hybrid|dense",
  "denseWeight": 0.75,
  "force_rag": false
}
```

### Index Build Options
```jsonc
{
  "latechunk": true,
  "doclingChunk": false,
  "chunkSize": 512,
  "chunkOverlap": 64,
  "retrievalMode": "hybrid|dense|fts",
  "windowSize": 2,
  "enableEnrich": true,
  "embeddingModel": "Qwen/Qwen3-Embedding-0.6B",
  "enrichModel": "qwen3:0.6b",
  "overviewModel": "qwen3:0.6b",
  "batchSizeEmbed": 64,
  "batchSizeEnrich": 32
}
```

---

_This reference is derived from static code analysis of `backend/server.py`, `rag_system/api_server.py`, and `src/lib/api.ts`. Keep it in sync with route or type changes._ 

================================================
FILE: Documentation/architecture_overview.md
================================================
# 🏗️ System Architecture Overview

_Last updated: 2025-07-06_

This document explains how data and control flow through the Advanced **RAG System** — from a user's browser all the way to model inference and back.  It is intended as the **ground-truth reference** for engineers and integrators.

---

## 1. Bird's-Eye Diagram

```mermaid
flowchart LR
    subgraph Client
        U["👤  User (Browser)"]
        FE["Next.js Front-end\nReact Components"]
        U --> FE
    end

    subgraph Network
        FE -->|HTTP/JSON| BE["Python HTTP Server\nbackend/server.py"]
    end

    subgraph Core["rag_system core package"]
        BE --> LOOP["Agent Loop\n(rag_system/agent/loop.py)"]
        BE --> IDX["Indexing Pipeline\n(pipelines/indexing_pipeline.py)"]

        LOOP --> RP["Retrieval Pipeline\n(pipelines/retrieval_pipeline.py)"]
        LOOP --> VER["Verifier (Grounding Check)"]
        RP --> RET["Retrievers\nBM25 | Dense | Hybrid"]
        RP --> RER["AI Reranker"]
        RP --> SYNT["Answer Synthesiser"]
    end

    subgraph Storage
        LDB[("LanceDB Vector Tables")]
        SQL[("SQLite – chat & metadata")]
    end

    subgraph Models
        OLLAMA["Ollama Server\n(qwen3, etc.)"]
        HF["HuggingFace Hosted\nEmbedding/Reranker Models"]
    end

    %% data edges
    IDX -->|chunks & embeddings| LDB
    RET -->|vector search| LDB
    LOOP -->|LLM calls| OLLAMA
    RP -->|LLM calls| OLLAMA
    VER -->|LLM calls| OLLAMA
    RP -->|rerank| HF

    BE -->|CRUD| SQL
```

---

### Data-flow Narrative
1. **User** interacts with the Next.js UI; messages are posted via `src/lib/api.ts`.
2. **backend/server.py** receives JSON over HTTP, applies CORS, and proxies the request into `rag_system`.
3. **Agent Loop** decides (via _Triage_) whether to perform Retrieval-Augmented Generation (RAG) or direct LLM answering.
4. If RAG is chosen:
   1. **Retrieval Pipeline** fetches candidates from **LanceDB** using BM25 + dense vectors.
   2. **AI Reranker** (HF model) sorts snippets.
   3. **Answer Synthesiser** calls **Ollama** to write the final answer.
5. Answers can be **Verified** for grounding (optional flag).
6. Index-building is an offline path triggered from the UI — PDF/📄 files are chunked, embedded and stored in LanceDB.

---

## 2. Component Documents
The table below links to deep-dives for each major component.

| **Component** | **Documentation** |
|---------------|-------------------|
| Agent Loop | [`system_overview.md`](system_overview.md) |
| Indexing Pipeline | [`indexing_pipeline.md`](indexing_pipeline.md) |
| Retrieval Pipeline | [`retrieval_pipeline.md`](retrieval_pipeline.md) |
| Verifier | [`verifier.md`](verifier.md) |
| Triage System | [`triage_system.md`](triage_system.md) |

---

> **Change-management**: whenever architecture changes (new micro-service, different DB, etc.) update this overview diagram first, then individual component docs. 

================================================
FILE: Documentation/deployment_guide.md
================================================
# 🚀 RAG System Deployment Guide

_Last updated: 2025-01-07_

This guide provides comprehensive instructions for deploying the RAG system using both Docker and direct development approaches.

---

## 🎯 Deployment Options

### Option 1: Docker Deployment (Production) 🐳
- **Best for**: Production environments, containerized deployments, scaling
- **Pros**: Isolated, reproducible, easy to manage
- **Cons**: Slightly more complex setup, resource overhead

### Option 2: Direct Development (Development) 💻
- **Best for**: Development, debugging, customization
- **Pros**: Direct access to code, faster iteration, easier debugging
- **Cons**: More dependencies to manage

---

## 1. Prerequisites

### 1.1 System Requirements

#### **Minimum Requirements**
- **CPU**: 4 cores, 2.5GHz+
- **RAM**: 8GB (16GB recommended)
- **Storage**: 50GB free space
- **OS**: Linux, macOS, or Windows with WSL2

#### **Recommended Requirements**
- **CPU**: 8+ cores, 3.0GHz+
- **RAM**: 32GB+ (for large models)
- **Storage**: 200GB+ SSD
- **GPU**: NVIDIA GPU with 8GB+ VRAM (optional, for acceleration)

### 1.2 Common Dependencies

**Both deployment methods require:**
```bash
# Ollama (required for both approaches)
curl -fsSL https://ollama.ai/install.sh | sh

# Git 2.30+ (for cloning the repository)
```

### 1.3 Docker-Specific Dependencies

**For Docker deployment:**
```text
# Docker & Docker Compose
Docker Engine 24.0+
Docker Compose 2.20+
```

### 1.4 Direct Development Dependencies

**For direct development:**
```text
# Python & Node.js
Python 3.8+
Node.js 16+
npm 8+
```

---

## 2. 🐳 Docker Deployment

### 2.1 Installation

#### **Step 1: Install Docker**

**Ubuntu/Debian:**
```bash
# Install Docker
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh
sudo usermod -aG docker $USER
newgrp docker

# Install Docker Compose V2
sudo apt-get update
sudo apt-get install docker-compose-plugin
```

**macOS:**
```bash
# Install Docker Desktop
brew install --cask docker
# Or download from: https://www.docker.com/products/docker-desktop
```

**Windows:**
```bash
# Install Docker Desktop with WSL2 backend
# Download from: https://www.docker.com/products/docker-desktop
```

#### **Step 2: Clone Repository**
```bash
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT
```

#### **Step 3: Install Ollama**
```bash
# Install Ollama (runs locally even with Docker)
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama
ollama serve

# In another terminal, install models
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

#### **Step 4: Launch Docker System**
```bash
# Start all containers using the convenience script
./start-docker.sh

# Or manually:
docker compose --env-file docker.env up --build -d
```

#### **Step 5: Verify Deployment**
```bash
# Check container status
docker compose ps

# Test all endpoints
curl http://localhost:3000      # Frontend
curl http://localhost:8000/health  # Backend
curl http://localhost:8001/models  # RAG API
curl http://localhost:11434/api/tags  # Ollama
```

### 2.2 Docker Management

#### **Container Operations**
```bash
# Start system
./start-docker.sh

# Stop system
./start-docker.sh stop

# View logs
./start-docker.sh logs

# Check status
./start-docker.sh status

# Manual Docker Compose commands
docker compose ps                    # Check status
docker compose logs -f              # Follow logs
docker compose down                 # Stop all containers
docker compose up --build -d        # Rebuild and restart
```

#### **Individual Container Management**
```bash
# Restart specific service
docker compose restart rag-api

# View specific service logs
docker compose logs -f backend

# Execute commands in container
docker compose exec rag-api python -c "print('Hello')"
```

---

## 3. 💻 Direct Development

### 3.1 Installation

#### **Step 1: Install Dependencies**

**Python Dependencies:**
```bash
# Clone repository
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT

# Create virtual environment (recommended)
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install Python packages
pip install -r requirements.txt
```

**Node.js Dependencies:**
```bash
# Install Node.js dependencies
npm install
```

#### **Step 2: Install and Configure Ollama**
```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama
ollama serve

# In another terminal, install models
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

#### **Step 3: Launch System**

**Option A: Integrated Launcher (Recommended)**
```bash
# Start all components with one command
python run_system.py
```

**Option B: Manual Component Startup**
```bash
# Terminal 1: RAG API
python -m rag_system.api_server

# Terminal 2: Backend
cd backend && python server.py

# Terminal 3: Frontend
npm run dev

# Access at http://localhost:3000
```

#### **Step 4: Verify Installation**
```bash
# Check system health
python system_health_check.py

# Test endpoints
curl http://localhost:3000      # Frontend
curl http://localhost:8000/health  # Backend
curl http://localhost:8001/models  # RAG API
```

### 3.2 Direct Development Management

#### **System Operations**
```bash
# Start system
python run_system.py

# Check system health
python system_health_check.py

# Stop system
# Press Ctrl+C in terminal running run_system.py
```

#### **Individual Component Management**
```bash
# Start components individually
python -m rag_system.api_server    # RAG API on port 8001
cd backend && python server.py     # Backend on port 8000
npm run dev                         # Frontend on port 3000

# Development tools
npm run build                       # Build frontend for production
pip install -r requirements.txt --upgrade  # Update Python packages
```

---

## 4. Architecture Comparison

### 4.1 Docker Architecture

```mermaid
graph TB
    subgraph "Docker Containers"
        Frontend[Frontend Container<br/>Next.js<br/>Port 3000]
        Backend[Backend Container<br/>Python API<br/>Port 8000]
        RAG[RAG API Container<br/>Document Processing<br/>Port 8001]
    end
    
    subgraph "Local System"
        Ollama[Ollama Server<br/>Port 11434]
    end
    
    Frontend --> Backend
    Backend --> RAG
    RAG --> Ollama
```

### 4.2 Direct Development Architecture

```mermaid
graph TB
    subgraph "Local Processes"
        Frontend[Next.js Dev Server<br/>Port 3000]
        Backend[Python Backend<br/>Port 8000]
        RAG[RAG API<br/>Port 8001]
        Ollama[Ollama Server<br/>Port 11434]
    end
    
    Frontend --> Backend
    Backend --> RAG
    RAG --> Ollama
```

---

## 5. Configuration

### 5.1 Environment Variables

#### **Docker Configuration (`docker.env`)**
```bash
# Ollama Configuration
OLLAMA_HOST=http://host.docker.internal:11434

# Service Configuration
NODE_ENV=production
RAG_API_URL=http://rag-api:8001
NEXT_PUBLIC_API_URL=http://localhost:8000
```

#### **Direct Development Configuration**
```bash
# Environment variables are set automatically by run_system.py
# Override in environment if needed:
export OLLAMA_HOST=http://localhost:11434
export RAG_API_URL=http://localhost:8001
```

### 5.2 Model Configuration

#### **Default Models**
```python
# Embedding Models
EMBEDDING_MODELS = [
    "Qwen/Qwen3-Embedding-0.6B",  # Fast, 1024 dimensions
    "Qwen/Qwen3-Embedding-4B",    # High quality, 2560 dimensions
]

# Generation Models  
GENERATION_MODELS = [
    "qwen3:0.6b",  # Fast responses
    "qwen3:8b",    # High quality
]
```

### 5.3 Performance Tuning

#### **Memory Settings**
```bash
# For Docker: Increase memory allocation
# Docker Desktop → Settings → Resources → Memory → 16GB+

# For Direct Development: Monitor with
htop  # or top on macOS
```

#### **Model Settings**
```python
# Batch sizes (adjust based on available RAM)
EMBEDDING_BATCH_SIZE = 50   # Reduce if OOM
ENRICHMENT_BATCH_SIZE = 25  # Reduce if OOM

# Chunk settings
CHUNK_SIZE = 512           # Text chunk size
CHUNK_OVERLAP = 64         # Overlap between chunks
```

---

## 6. Operational Procedures

### 6.1 System Monitoring

#### **Health Checks**
```bash
# Comprehensive system check
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"
```

#### **Performance Monitoring**
```bash
# Docker monitoring
docker stats

# Direct development monitoring
htop           # Overall system
nvidia-smi     # GPU usage (if available)
```

### 6.2 Log Management

#### **Docker Logs**
```bash
# All services
docker compose logs -f

# Specific service
docker compose logs -f rag-api

# Save logs to file
docker compose logs > system.log 2>&1
```

#### **Direct Development Logs**
```bash
# Logs are printed to terminal
# Redirect to file if needed:
python run_system.py > system.log 2>&1
```

### 6.3 Backup and Restore

#### **Data Backup**
```bash
# Create backup directory
mkdir -p backups/$(date +%Y%m%d)

# Backup databases and indexes
cp -r backend/chat_data.db backups/$(date +%Y%m%d)/
cp -r lancedb backups/$(date +%Y%m%d)/
cp -r index_store backups/$(date +%Y%m%d)/

# For Docker: also backup volumes
docker compose down
docker run --rm -v rag_system_old_ollama_data:/data -v $(pwd)/backups:/backup alpine tar czf /backup/ollama_models_$(date +%Y%m%d).tar.gz -C /data .
```

#### **Data Restore**
```bash
# Stop system
./start-docker.sh stop  # Docker
# Or Ctrl+C for direct development

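# Restore files (mirror the layout used by the backup commands above)
cp backups/YYYYMMDD/chat_data.db backend/chat_data.db
cp -r backups/YYYYMMDD/lancedb ./
cp -r backups/YYYYMMDD/index_store ./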

# Restart system
./start-docker.sh  # Docker
python run_system.py  # Direct development
```

---

## 7. Troubleshooting

### 7.1 Common Issues

#### **Port Conflicts**
```bash
# Check what's using ports
lsof -i :3000 -i :8000 -i :8001 -i :11434

# For Docker: Stop conflicting containers
./start-docker.sh stop

# For Direct: Kill processes
pkill -f "npm run dev"
pkill -f "server.py"
pkill -f "api_server"
```

#### **Docker Issues**
```bash
# Docker daemon not running
docker version  # Check if daemon responds

# Restart Docker Desktop (macOS/Windows)
# Or restart docker service (Linux)
sudo systemctl restart docker

# Clear Docker cache
docker system prune -f
```

#### **Ollama Issues**
```bash
# Check Ollama status
curl http://localhost:11434/api/tags

# Restart Ollama
pkill ollama
ollama serve

# Reinstall models
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

### 7.2 Performance Issues

#### **Memory Problems**
```bash
# Check memory usage
free -h           # Linux
vm_stat           # macOS
docker stats      # Docker containers

# Solutions:
# 1. Increase system RAM
# 2. Reduce batch sizes in configuration
# 3. Use smaller models (qwen3:0.6b instead of qwen3:8b)
```

#### **Slow Response Times**
```bash
# Check model loading
curl http://localhost:11434/api/tags

# Monitor component response times
time curl http://localhost:8001/models

# Solutions:
# 1. Use SSD storage
# 2. Increase CPU cores
# 3. Use GPU acceleration (if available)
```

---

## 8. Production Considerations

### 8.1 Security

#### **Network Security**
```bash
# Use reverse proxy (nginx/traefik) for production
# Enable HTTPS/TLS
# Restrict port access with firewall
```

#### **Data Security**
```bash
# Enable authentication in production
# Encrypt sensitive data
# Regular security updates
```

### 8.2 Scaling

#### **Horizontal Scaling**
```bash
# Use Docker Swarm or Kubernetes
# Load balance frontend and backend
# Scale RAG API instances based on load
```

#### **Resource Optimization**
```bash
# Use dedicated GPU nodes for AI workloads
# Implement model caching
# Optimize batch processing
```

---

## 9. Success Criteria

### 9.1 Deployment Verification

Your deployment is successful when:

- ✅ All health checks pass
- ✅ Frontend loads at http://localhost:3000
- ✅ You can create document indexes
- ✅ You can chat with uploaded documents
- ✅ No error messages in logs

### 9.2 Performance Benchmarks

**Acceptable Performance:**
- Index creation: < 2 minutes per 100MB document
- Query response: < 30 seconds for complex questions
- Memory usage: < 16GB total system memory

**Optimal Performance:**
- Index creation: < 1 minute per 100MB document
- Query response: < 10 seconds for complex questions
- Memory usage: < 8GB total system memory

---

**Happy Deploying! 🚀** 

================================================
FILE: Documentation/docker_usage.md
================================================
# 🐳 Docker Usage Guide - RAG System

_Last updated: 2025-01-07_

This guide provides practical Docker commands and procedures for running the RAG system in containerized environments with local Ollama.

---

## 📋 Prerequisites

### Required Setup
- Docker Desktop installed and running
- Ollama installed locally (even for Docker deployment)
- 8GB+ RAM available

### Architecture Overview
```
┌─────────────────────────────────────┐
│           Docker Containers        │
├─────────────────────────────────────┤
│ Frontend (Port 3000)               │
│ Backend (Port 8000)                │
│ RAG API (Port 8001)                │
└─────────────────────────────────────┘
            │
            ▼
┌─────────────────────────────────────┐
│         Local System               │
├─────────────────────────────────────┤
│ Ollama Server (Port 11434)         │
└─────────────────────────────────────┘
```

---

## 1. Quick Start Commands

### Step 1: Clone and Setup

```bash
# Clone repository
git clone <your-repository-url>
cd rag_system_old

# Verify Docker is running
docker version
```

### Step 2: Install and Configure Ollama (Required)

**⚠️ Important**: Even with Docker, Ollama must be installed locally for optimal performance.

```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama (in one terminal)
ollama serve

# Install required models (in another terminal)
ollama pull qwen3:0.6b      # Fast model (650MB)
ollama pull qwen3:8b        # High-quality model (4.7GB)

# Verify models are installed
ollama list

# Test Ollama connection
curl http://localhost:11434/api/tags
```

### Step 3: Start Docker Containers

```bash
# Start all containers
./start-docker.sh

# Stop all containers
./start-docker.sh stop

# View logs
./start-docker.sh logs

# Check status
./start-docker.sh status

# Restart containers
./start-docker.sh stop
./start-docker.sh
```

### 1.2 Service Access

Once running, access the system at:
- **Frontend**: http://localhost:3000
- **Backend API**: http://localhost:8000  
- **RAG API**: http://localhost:8001
- **Ollama**: http://localhost:11434

---

## 2. Container Management

### 2.1 Using the Convenience Script

```bash
# Start all containers
./start-docker.sh

# Stop all containers
./start-docker.sh stop

# View logs
./start-docker.sh logs

# Check status
./start-docker.sh status

# Restart containers
./start-docker.sh stop
./start-docker.sh
```

### 2.2 Manual Docker Compose Commands

```bash
# Start all services
docker compose --env-file docker.env up --build -d

# Check status
docker compose ps

# View logs
docker compose logs -f

# Stop all services
docker compose down

# Force rebuild
docker compose build --no-cache
docker compose up --build -d
```

### 2.3 Individual Service Management

```bash
# Start specific service
docker compose up -d frontend
docker compose up -d backend
docker compose up -d rag-api

# Restart specific service
docker compose restart rag-api

# Stop specific service
docker compose stop backend

# View specific service logs
docker compose logs -f rag-api
```

---

## 3. Development Workflow

### 3.1 Code Changes

```bash
# After frontend changes
docker compose restart frontend

# After backend changes  
docker compose restart backend

# After RAG system changes
docker compose restart rag-api

# Rebuild after dependency changes
docker compose build --no-cache rag-api
docker compose up -d rag-api
```

### 3.2 Debugging Containers

```bash
# Access container shell
docker compose exec frontend sh
docker compose exec backend bash
docker compose exec rag-api bash

# Run commands in container
docker compose exec rag-api python -c "from rag_system.main import get_agent; print('✅ RAG System OK')"
docker compose exec backend curl http://localhost:8000/health

# Check environment variables
docker compose exec rag-api env | grep OLLAMA
```

### 3.3 Development vs Production

```bash
# Development mode (if docker-compose.dev.yml exists)
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d

# Production mode (default)
docker compose --env-file docker.env up -d
```

---

## 4. Logging & Monitoring

### 4.1 Log Management

```bash
# View all logs
docker compose logs

# View specific service logs
docker compose logs frontend
docker compose logs backend
docker compose logs rag-api

# Follow logs in real-time
docker compose logs -f

# View last N lines
docker compose logs --tail=100

# View logs with timestamps
docker compose logs -t

# Save logs to file
docker compose logs > system.log 2>&1

# View logs since specific time
docker compose logs --since=2h
docker compose logs --since=2025-01-01T00:00:00
```

### 4.2 System Monitoring

```bash
# Monitor resource usage
docker stats

# Monitor specific containers
docker stats rag-frontend rag-backend rag-api

# Check container health
docker compose ps

# System information
docker system info
docker system df
```

---

## 5. Ollama Integration

### 5.1 Ollama Setup

```bash
# Install Ollama (one-time setup)
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama server
ollama serve

# Check Ollama status
curl http://localhost:11434/api/tags

# Install models
ollama pull qwen3:0.6b      # Fast model
ollama pull qwen3:8b        # High-quality model

# List installed models
ollama list
```

### 5.2 Ollama Management

```bash
# Check model status from container
docker compose exec rag-api curl http://host.docker.internal:11434/api/tags

# Test Ollama connection
curl -X POST http://localhost:11434/api/generate \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3:0.6b", "prompt": "Hello", "stream": false}'

# Monitor Ollama logs (if running with logs)
# Ollama logs appear in the terminal where you ran 'ollama serve'
```

### 5.3 Model Management

```bash
# Update models
ollama pull qwen3:0.6b
ollama pull qwen3:8b

# Remove unused models
ollama rm old-model-name

# Check model information
ollama show qwen3:0.6b
```

---

## 6. Data Management

### 6.1 Volume Management

```bash
# List volumes
docker volume ls

# View volume usage
docker system df -v

# Backup volumes
docker run --rm -v rag_system_old_lancedb:/data -v $(pwd)/backup:/backup alpine tar czf /backup/lancedb_backup.tar.gz -C /data .

# Clean unused volumes
docker volume prune
```

### 6.2 Database Management

```bash
# Access SQLite database
docker compose exec backend sqlite3 /app/backend/chat_data.db

# Backup database
cp backend/chat_data.db backup/chat_data_$(date +%Y%m%d).db

# Check LanceDB tables from container
docker compose exec rag-api python -c "
import lancedb
db = lancedb.connect('/app/lancedb')
print('Tables:', db.table_names())
"
```

### 6.3 File Management

```bash
# Access shared files
docker compose exec rag-api ls -la /app/shared_uploads

# Copy files to/from containers
docker cp local_file.pdf rag-api:/app/shared_uploads/
docker cp rag-api:/app/shared_uploads/file.pdf ./local_file.pdf

# Check disk usage
docker compose exec rag-api df -h
```

---

## 7. Troubleshooting

### 7.1 Common Issues

#### Container Won't Start
```bash
# Check Docker daemon
docker version

# Check for port conflicts
lsof -i :3000 -i :8000 -i :8001

# Check container logs
docker compose logs [service-name]

# Restart Docker Desktop
# macOS/Windows: Restart Docker Desktop
# Linux: sudo systemctl restart docker
```

#### Ollama Connection Issues
```bash
# Check Ollama is running
curl http://localhost:11434/api/tags

# Restart Ollama
pkill ollama
ollama serve

# Check from container
docker compose exec rag-api curl http://host.docker.internal:11434/api/tags
```

#### Performance Issues
```bash
# Check resource usage
docker stats

# Increase Docker memory (Docker Desktop Settings)
# Recommended: 8GB+ for Docker

# Check container health
docker compose ps
```

### 7.2 Reset and Clean

```bash
# Stop everything
./start-docker.sh stop

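# Clean containers and images (⚠️ removes ALL stopped containers and unused images on this host, not only this project's)
docker system prune -a

# Clean volumes (⚠️ deletes data in every unused volume on this host)
docker volume prune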

# Complete reset (⚠️ deletes everything)
docker compose down -v
docker system prune -a --volumes
```

### 7.3 Health Checks

```bash
# Comprehensive health check
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"

# Check all container status
docker compose ps

# Test model loading
docker compose exec rag-api python -c "
from rag_system.main import get_agent
agent = get_agent('default')
print('✅ RAG System initialized successfully')
"
```

---

## 8. Advanced Usage

### 8.1 Production Deployment

```bash
# Use production environment
export NODE_ENV=production

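# Start with the production env file
# (this command sets no resource limits itself; define them in docker-compose.yml if needed)
docker compose --env-file docker.env up -d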

# Enable automatic restarts (this project's containers only)
docker update --restart unless-stopped $(docker compose ps -q)
```

### 8.2 Scaling

```bash
# Scale specific services
docker compose up -d --scale backend=2 --scale rag-api=2

# Use Docker Swarm for clustering
docker swarm init
docker stack deploy -c docker-compose.yml rag-system
```

### 8.3 Security

```bash
# Scan images for vulnerabilities
docker scout cves rag-frontend
docker scout cves rag-backend
docker scout cves rag-api

# Update base images
docker compose build --no-cache --pull
```

---

## 9. Configuration

### 9.1 Environment Variables

The system uses `docker.env` for configuration:

```bash
# Ollama configuration
OLLAMA_HOST=http://host.docker.internal:11434

# Service configuration
NODE_ENV=production
RAG_API_URL=http://rag-api:8001
NEXT_PUBLIC_API_URL=http://localhost:8000
```

### 9.2 Custom Configuration

```bash
# Create custom environment file
cp docker.env docker.custom.env

# Edit custom configuration
nano docker.custom.env

# Use custom configuration
docker compose --env-file docker.custom.env up -d
```

---

## 10. Success Checklist

Your Docker deployment is successful when:

- ✅ All containers are running: `docker compose ps`
- ✅ Ollama is accessible: `curl http://localhost:11434/api/tags`
- ✅ Frontend loads: `curl http://localhost:3000`
- ✅ Backend responds: `curl http://localhost:8000/health`
- ✅ RAG API works: `curl http://localhost:8001/models`
- ✅ You can create indexes and chat with documents

### Performance Expectations

**Acceptable Performance:**
- Container startup: < 2 minutes
- Memory usage: < 4GB Docker containers + Ollama
- Response time: < 30 seconds for complex queries

**Optimal Performance:**
- Container startup: < 1 minute  
- Memory usage: < 2GB Docker containers + Ollama
- Response time: < 10 seconds for complex queries

---

**Happy Containerizing! 🐳** 

================================================
FILE: Documentation/improvement_plan.md
================================================
# RAG System – Improvement Road-map

_Revision: 2025-07-05_

This document captures high-impact enhancements identified during the July 2025 code-review.  Items are grouped by theme and include a short rationale plus suggested implementation notes.  **No code has been changed – this file is planning only.**

---

## 1. Retrieval Accuracy & Speed

| ID | Item | Rationale | Notes |
|----|------|-----------|-------|
| 1.1 | Late-chunk result merging | Returned snippets can be single late-chunks → fragmented. | After retrieval, gather sibling chunks (±1) and concatenate before reranking / display. |
| 1.2 | Tiered retrieval (ANN pre-filter) | Large indexes → LanceDB full scan can be slow. | Use in-memory FAISS/HNSW to narrow to top-N, then exact LanceDB search. |
| 1.3 | Dynamic fusion weights | Different corpora favour dense vs BM25 differently. | Learn weight on small validation set; store in index `metadata`. |
| 1.4 | Query expansion via KG | Use extracted entities to enrich queries. | Requires Graph-RAG path clean-up first. |

## 2. Routing / Triage

| ID | Item | Rationale |
|----|------|-----------|
| 2.1 | Embed + cache document overviews | LLM router costs tokens; cosine-similarity pre-check is cheaper. |
| 2.2 | Session-level routing memo | Avoid repeated LLM triage for follow-up queries. |
| 2.3 | Remove legacy pattern rules | Simplifies maintenance once overview & ML routing mature. |

## 3. Indexing Pipeline

| ID | Item | Rationale |
|----|------|-----------|
| 3.1 | Parallel document conversion | PDF→MD + chunking is serial today; speed gains possible. |
| 3.2 | Incremental indexing | Re-embedding whole corpus wastes time. |
| 3.3 | Auto GPU dtype selection | Use FP16 on CUDA / MPS for memory and speed. |
| 3.4 | Post-build health check | Catch broken indexes (dim mismatch etc.) early. |

## 4. Embedding Model Management

* **Registry file** mapping tag → dims/source/license.  UI & backend validate against it.
* **Embedder pool** caches loaded HF/Ollama weights per model to save RAM.

## 5. Database & Storage

* LanceDB table GC for orphaned tables.
* Scheduled SQLite `VACUUM` when fragmentation > X %.

## 6. Observability & Ops

* JSON structured logging.
* `/metrics` endpoint for Prometheus.
* Deep health-probe (`/health/deep`) exercising end-to-end query.

## 7. Front-end UX

* SSE-driven progress bar for indexing.
* Matched-term highlighting in retrieved snippets.
* Preset buttons (Fast / Balanced / High-Recall) for retrieval settings.

## 8. Testing & CI

* Replace deleted BM25 tests with LanceDB hybrid tests.
* Integration test: build → query → assert ≥1 doc.
* GitHub Action that spins up Ollama, pulls small embedding model, runs smoke test.

## 9. Codebase Hygiene

* Graph-RAG integration (currently disabled, can be implemented if needed).
* Consolidate duplicate config keys (`embedding_model_name`, etc.).
* Run `mypy --strict`, pylint, and black in CI.

---

### 🧹 System Cleanup (Priority: **HIGH**)
Reduce complexity and improve maintainability.

* **✅ COMPLETED**: Remove experimental DSPy integration and unused modules (35+ files removed)  
* **✅ COMPLETED**: Clean up duplicate or obsolete documentation files
* **✅ COMPLETED**: Remove unused import statements and dependencies  
* **✅ COMPLETED**: Consolidate similar configuration files
* **✅ COMPLETED**: Remove broken or non-functional ReAct agent implementation

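### Priority Matrix (suggested order)

_IDs for sections 4–9 are not written out above; they refer to the position of the bullet within its section (e.g. 5.1 is the first bullet under "Database & Storage", 9.2 the second under "Codebase Hygiene")._
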
1.  **Critical reliability**: 3.4, 5.1, 9.2
2.  **User-visible wins**: 1.1, 7.1, 7.2
3.  **Performance**: 1.2, 3.1, 3.3
4.  **Long-term maintainability**: 2.3, 9.1, 9.3

Feel free to rearrange based on team objectives and resource availability. 

================================================
FILE: Documentation/indexing_pipeline.md
================================================
# 🗂️ Indexing Pipeline

_Implementation entry-point: `rag_system/pipelines/indexing_pipeline.py` + helpers in `indexing/` & `ingestion/`._

## Overview
Transforms raw documents (PDF, TXT, etc.) into search-ready **chunks** with embeddings, storing them in LanceDB and generating auxiliary assets (overviews, context summaries).

## High-Level Diagram
```mermaid
flowchart TD
    A["Uploaded Files"] --> B{Converter}
    B -->|PDF→text| C["Plain Text"]
    C --> D{Chunker}
    D -->|docling| D1[DocLing Chunking]
    D -->|latechunk| D2[Late Chunking]
    D -->|standard| D3[Fixed-size]
    D1 & D2 & D3 --> E["Contextual Enricher"]
    E -->|local ctx summary| F["Embedding Generator"]
    F -->|vectors| G[(LanceDB Table)]
    E --> H["Overview Builder"]
    H -->|JSONL| OVR[[`index_store/overviews/<idx>.jsonl`]]
```

## Steps in Detail
| Step | Module | Key Classes | Notes |
|------|--------|------------|-------|
| Conversion | `ingestion/document_converter.py` | `PDFConverter` | Uses `Docling` library to extract text with structure preservation. |
| Chunking | `ingestion/chunking.py`, `indexing/latechunk.py`, `ingestion/docling_chunker.py` | `MarkdownRecursiveChunker`, `DoclingChunker` | Controlled by flags `latechunk`, `doclingChunk`, `chunkSize`, `chunkOverlap`. |
| Contextual Enrichment | `indexing/contextualizer.py` | `ContextualEnricher` | Generates per-chunk summaries (LLM call). |
| Embedding | `indexing/embedders.py`, `indexing/representations.py` | `QwenEmbedder`, `EmbeddingGenerator` | Batch size tunable (`batchSizeEmbed`). Uses Qwen3-Embedding models. |
| LanceDB Ingest | `index_store/lancedb/…` | – | Each index has a dedicated table `text_pages_<index_id>`. |
| Overview | `indexing/overview_builder.py` | `OverviewBuilder` | First-N chunks summarised for triage routing. |

### Control Flow (Code)
1. **backend/server.py → handle_build_index()** collects files + opts and POSTs to `/index` endpoint on advanced RAG API (local process).
2. **indexing_pipeline.IndexingPipeline.run()** orchestrates conversion → chunking → enrichment → embedding → storage.
3. Metadata (chunk_size, models, etc.) stored in SQLite `indexes` table.

## Configuration Flags
| Flag | Description | Default |
|------|-------------|---------|
| `latechunk` | Merge k adjacent sibling chunks at query time | false |
| `doclingChunk` | Use DocLing structural chunking | false |
| `chunkSize` / `chunkOverlap` | Standard fixed slicing | 512 / 64 |
| `enableEnrich` | Run contextual summaries | true |
| `embeddingModel` | Override embedder | `Qwen/Qwen3-Embedding-0.6B` |
| `overviewModel` | Model used in `OverviewBuilder` | `qwen3:0.6b` |
| `batchSizeEmbed / Enrich` | Batch sizes | 50 / 25 |

## Error Handling
* Duplicate LanceDB table ➟ now idempotent (commit `af99b38`).
* Failed PDF parse ➟ chunker skips file, logs warning.

## Extension Ideas
* Add OCR layer before PDF conversion.
* Store embeddings in a remote LanceDB instance (update the URL in config).

## Detailed Implementation Analysis

### Pipeline Architecture Pattern
The `IndexingPipeline` uses a **sequential processing pattern** with parallel batch operations. Each stage processes all documents before moving to the next stage, enabling efficient memory usage and progress tracking.

```python
def run(self, file_paths: List[str]):
    with timer("Complete Indexing Pipeline"):
        # Stage 1: Document Processing & Chunking
        # (the per-file conversion and chunking that fill these containers is omitted from this excerpt)
        all_chunks = []
        doc_chunks_map = {}
        
        # Stage 2: Contextual Enrichment (optional)
        if self.contextual_enricher:
            all_chunks = self.contextual_enricher.enrich_batch(all_chunks)
        
        # Stage 3: Dense Indexing (embedding + storage)
        if self.vector_indexer:
            self.vector_indexer.index_chunks(all_chunks, table_name)
        
        # Stage 4: Graph Extraction (optional)
        if self.graph_extractor:
            self.graph_extractor.extract_and_store(all_chunks)
```

### Document Processing Deep-Dive

#### PDF Conversion Strategy
```python
# PDFConverter uses Docling for robust text extraction with structure
def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict, Any]]:
    # Quick heuristic: if PDF has text layer, skip OCR for speed
    use_ocr = not self._pdf_has_text(file_path)
    converter = self.converter_ocr if use_ocr else self.converter_no_ocr
    
    result = converter.convert(file_path)
    markdown_content = result.document.export_to_markdown()
    
    metadata = {"source": file_path}
    # Return DoclingDocument object for advanced chunkers
    return [(markdown_content, metadata, result.document)]
```

**Benefits**:
- Preserves document structure (headings, lists, tables)
- Automatic OCR fallback for image-based PDFs
- Maintains page-level metadata for source attribution
- Structured output supports advanced chunking strategies

#### Chunking Strategy Selection
```python
# Dynamic chunker selection based on config
chunker_mode = config.get("chunker_mode", "legacy")

if chunker_mode == "docling":
    self.chunker = DoclingChunker(
        max_tokens=chunk_size,
        overlap=overlap_sentences,
        tokenizer_model="Qwen/Qwen3-Embedding-0.6B"
    )
else:
    self.chunker = MarkdownRecursiveChunker(
        max_chunk_size=chunk_size,
        min_chunk_size=min(chunk_overlap, chunk_size // 4)
    )
```

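#### Recursive Markdown Chunking Algorithm

_Simplified sketch of the splitting strategy (separator priority and overlap), not runnable code; see `MarkdownRecursiveChunker` in `ingestion/chunking.py` for the actual implementation._

```python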
def chunk(self, text: str, document_id: str, metadata: Dict) -> List[Dict]:
    # Priority hierarchy for splitting
    separators = [
        "\n\n# ",      # H1 headers (highest priority)
        "\n\n## ",     # H2 headers
        "\n\n### ",    # H3 headers
        "\n\n",        # Paragraph breaks
        "\n",          # Line breaks
        ". ",          # Sentence boundaries
        " "            # Word boundaries (last resort)
    ]
    
    chunks = []
    current_chunk = ""
    
    for separator in separators:
        if len(current_chunk) <= self.max_chunk_size:
            continue
            
        # Split on current separator
        parts = current_chunk.split(separator)
        
        # Reassemble with overlap
        for i, part in enumerate(parts):
            if len(part) > self.max_chunk_size:
                # Recursively split large parts
                continue
            
            # Add overlap from previous chunk
            if i > 0 and len(chunks) > 0:
                overlap_text = chunks[-1]["text"][-self.chunk_overlap:]
                part = overlap_text + separator + part
            
            chunks.append({
                "text": part,
                "document_id": document_id,
                "metadata": {**metadata, "chunk_index": len(chunks)}
            })
```

### DocLing Chunking Implementation

#### Token-Aware Sentence Packing
```python
class DoclingChunker:
    def __init__(self, max_tokens: int = 512, overlap: int = 1, 
                 tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
        self.max_tokens = max_tokens
        self.overlap = overlap  # sentences of overlap
    
    def split_markdown(self, markdown: str, document_id: str, metadata: Dict):
        sentences = self._sentence_split(markdown)
        chunks = []
        window = []
        
        while sentences:
            # Add sentences until token limit
            while (sentences and 
                   self._token_len(" ".join(window + [sentences[0]])) <= self.max_tokens):
                window.append(sentences.pop(0))
            
            if not window:  # Single sentence > limit
                window.append(sentences.pop(0))
            
            # Create chunk
            chunk_text = " ".join(window)
            chunks.append({
                "chunk_id": f"{document_id}_{len(chunks)}",
                "text": chunk_text,
                "metadata": {
                    **metadata,
                    "chunk_index": len(chunks),
                    "heading_path": metadata.get("heading_path", []),
                    "block_type": metadata.get("block_type", "paragraph")
                }
            })
            
            # Add overlap for next chunk
            if self.overlap and sentences:
                overlap_sentences = window[-self.overlap:]
                sentences = overlap_sentences + sentences
            window = []
        
        return chunks
```

#### Document Structure Preservation
```python
def chunk_document(self, doc, document_id: str, metadata: Dict):
    """Walk DoclingDocument tree and emit structured chunks."""
    chunks = []
    current_heading_path = []
    buffer = []
    
    # Process document elements in reading order
    for txt_item in doc.texts:
        role = getattr(txt_item, "role", None)
        
        if role == "heading":
            self._flush_buffer(buffer, chunks, current_heading_path)
            level = getattr(txt_item, "level", 1)
            # Update heading hierarchy
            current_heading_path = current_heading_path[:level-1]
            current_heading_path.append(txt_item.text.strip())
            continue
        
        # Accumulate text in token-aware buffer
        text_piece = txt_item.text
        if self._buffer_would_exceed_limit(buffer, text_piece):
            self._flush_buffer(buffer, chunks, current_heading_path)
        
        buffer.append(text_piece)
    
    self._flush_buffer(buffer, chunks, current_heading_path)
    return chunks
```

### Contextual Enrichment Implementation

#### Batch Processing Pattern
```python
class ContextualEnricher:
    def enrich_batch(self, chunks: List[Dict]) -> List[Dict]:
        enriched_chunks = []
        
        # Process in batches to manage memory
        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i:i + self.batch_size]
            
            # Parallel enrichment within batch
            with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
                futures = [
                    executor.submit(self._enrich_single_chunk, chunk)
                    for chunk in batch
                ]
                
                for future in concurrent.futures.as_completed(futures):
                    enriched_chunks.append(future.result())
        
        return enriched_chunks
```

#### Contextual Prompt Engineering
```python
def _generate_context_summary(self, chunk_text: str, surrounding_context: str) -> str:
    prompt = f"""
    Analyze this text chunk and provide a concise summary that captures:
    1. Main topics and key information
    2. Context within the broader document
    3. Relevance for search and retrieval
    
    Document Context:
    {surrounding_context}
    
    Chunk to Analyze:
    {chunk_text}
    
    Summary (max 2 sentences):
    """
    
    response = self.llm_client.complete(
        prompt=prompt,
        model=self.ollama_config["enrichment_model"]  # qwen3:0.6b
    )
    
    return response.strip()
```

### Embedding Generation Pipeline

#### Model Selection Strategy
```python
def select_embedder(model_name: str, ollama_host: str = None):
    """Select appropriate embedder based on model name."""
    if "Qwen3-Embedding" in model_name:
        return QwenEmbedder(model_name=model_name)
    elif "bge-" in model_name:
        return BGEEmbedder(model_name=model_name)
    elif ollama_host and model_name in ["nomic-embed-text"]:
        return OllamaEmbedder(model_name=model_name, host=ollama_host)
    else:
        # Default to Qwen embedder
        return QwenEmbedder(model_name="Qwen/Qwen3-Embedding-0.6B")
```

#### Batch Embedding Generation
```python
class QwenEmbedder:
    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings in batches for efficiency.

        Each batch is tokenized with padding, run through the model without
        gradient tracking, and mean-pooled over the *real* tokens only: the
        attention mask is used so that padding positions do not dilute the
        embedding of shorter texts in a batch.

        Args:
            texts: Input strings to embed.

        Returns:
            A 2-D array with one row per input text. For an empty ``texts``
            list an empty array of shape ``(0, 0)`` is returned instead of
            raising from ``np.vstack``.
        """
        if not texts:
            return np.empty((0, 0), dtype=np.float32)

        embeddings = []

        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]

            # Tokenize and encode
            inputs = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )

            with torch.no_grad():
                outputs = self.model(**inputs)
                hidden = outputs.last_hidden_state

                # Masked mean pooling: zero out padding positions, then divide
                # by the number of real tokens in each sequence.
                mask = inputs["attention_mask"].unsqueeze(-1).to(
                    device=hidden.device, dtype=hidden.dtype
                )
                token_sum = (hidden * mask).sum(dim=1)
                token_count = mask.sum(dim=1).clamp(min=1)  # avoid division by zero
                batch_embeddings = token_sum / token_count
                embeddings.append(batch_embeddings.cpu().numpy())

        return np.vstack(embeddings)
```

### LanceDB Storage Implementation

#### Table Management Strategy
```python
class LanceDBManager:
    """Helpers for opening/creating LanceDB tables and writing chunk records."""

    def create_table_if_not_exists(self, table_name: str, schema: Schema):
        """Return the named table, creating it with ``schema`` when it cannot be opened."""
        try:
            existing = self.db.open_table(table_name)
        except FileNotFoundError:
            # Opening failed because the table is missing: create it now.
            created = self.db.create_table(table_name, schema=schema, mode="create")
            print(f"Created new table: {table_name}")
            return created
        else:
            print(f"Table {table_name} already exists")
            return existing

    def index_chunks(self, chunks: List[Dict], table_name: str):
        """Write chunks (text, vector, metadata) to ``table_name`` and build a vector index."""
        target = self.get_table(table_name)

        # One flat record per chunk; metadata is stored as a JSON string while
        # document_id / chunk_index are also lifted out as their own fields.
        rows = [
            {
                "chunk_id": item["chunk_id"],
                "text": item["text"],
                "vector": item["embedding"].tolist(),
                "metadata": json.dumps(item["metadata"]),
                "document_id": item["metadata"]["document_id"],
                "chunk_index": item["metadata"]["chunk_index"],
            }
            for item in chunks
        ]

        # Single batched insert, followed by index creation on the vector column.
        target.add(rows)
        target.create_index("vector", config=IvfPq(num_partitions=256))
```

### Overview Building for Query Routing

#### Document Summarization Strategy
```python
class OverviewBuilder:
    """Builds short per-document overviews and persists them for query routing."""

    def build_overview(self, chunks: List[Dict], document_id: str) -> Dict:
        """Generate document overview for query routing.

        Args:
            chunks: Chunk dicts; each must have a "text" key.
            document_id: Identifier stored alongside the overview.

        Returns:
            A dict with "document_id", "overview", "chunk_count", "keywords"
            and an ISO-formatted "created_at" timestamp.
        """
        # Take first N chunks for overview (usually most important)
        sample_chunks = chunks[:self.max_chunks_for_overview]
        combined_text = "\n\n".join([c["text"] for c in sample_chunks])
        
        overview_prompt = f"""
        Analyze this document and create a brief overview that includes:
        1. Main topic and purpose
        2. Key themes and concepts
        3. Document type and domain
        4. Relevant search keywords
        
        Document text:
        {combined_text}
        
        Overview (max 3 sentences):
        """
        
        overview = self.llm_client.complete(
            prompt=overview_prompt,
            model=self.overview_model  # qwen3:0.6b for speed
        )
        
        return {
            "document_id": document_id,
            "overview": overview.strip(),
            "chunk_count": len(chunks),  # counts all chunks, not just the sampled ones
            "keywords": self._extract_keywords(combined_text),
            "created_at": datetime.now().isoformat()
        }
    
    def save_overview(self, overview: Dict):
        """Save overview to JSONL file for query routing.

        The record is written as one newline-terminated JSON line to
        ``./index_store/overviews/<document_id>.jsonl``, replacing any previous
        file for that document. The target directory is created on demand so a
        fresh checkout does not fail with ``FileNotFoundError``.
        """
        import os  # local import keeps this snippet self-contained

        overview_dir = "./index_store/overviews"
        os.makedirs(overview_dir, exist_ok=True)
        overview_path = f"{overview_dir}/{overview['document_id']}.jsonl"
        
        # Explicit UTF-8 so the output does not depend on the platform default.
        with open(overview_path, 'w', encoding='utf-8') as f:
            json.dump(overview, f)
            f.write("\n")  # JSONL: every record ends with a newline
```

### Performance Optimizations

#### Memory Management
```python
class IndexingPipeline:
    """Indexing pipeline sketch showing lazy component loading and batched processing."""

    def __init__(self, config: Dict, ollama_client: OllamaClient, ollama_config: Dict):
        # Keep the constructor arguments: _get_embedder() reads self.config and
        # process_document_batch() reads self.batch_size.
        self.config = config
        self.ollama_client = ollama_client
        self.ollama_config = ollama_config
        # NOTE(review): the "batch_size" key and its default of 1 are assumptions —
        # align with the real pipeline configuration.
        self.batch_size = config.get("batch_size", 1)

        # Lazy initialization to save memory
        self._pdf_converter = None
        self._chunker = None
        self._embedder = None
        
    def _get_embedder(self):
        """Lazy load embedder to avoid memory overhead."""
        if self._embedder is None:
            model_name = self.config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
            self._embedder = select_embedder(model_name)
        return self._embedder
    
    def process_document_batch(self, file_paths: List[str]):
        """Process documents in batches to manage memory."""
        for batch_start in range(0, len(file_paths), self.batch_size):
            batch = file_paths[batch_start:batch_start + self.batch_size]
            
            # Process batch
            self._process_batch(batch)
            
            # Cleanup to free memory (only if an embedder was actually loaded)
            if self._embedder:
                self._embedder.cleanup()
```

#### Parallel Processing
```python
def run_parallel_processing(self, file_paths: List[str]):
    """Process multiple documents in parallel.

    Each file is handed to ``self._process_single_file`` in a pool of up to
    four worker processes. Files whose processing raises are logged (with the
    file path) and skipped; the remaining results are returned in completion
    order, not input order.

    Note: no per-file timeout is applied. ``as_completed`` only yields futures
    that have already finished, so a ``timeout`` passed to ``future.result()``
    at that point can never trigger.

    Args:
        file_paths: Paths of the documents to process.

    Returns:
        List of results from the successfully processed files.
    """
    results = []

    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # Remember which path each future belongs to so failures are attributable.
        path_by_future = {
            executor.submit(self._process_single_file, file_path): file_path
            for file_path in file_paths
        }

        # Collect results
        for future in concurrent.futures.as_completed(path_by_future):
            file_path = path_by_future[future]
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

    return results
```

### Error Handling and Recovery

#### Graceful Degradation
```python
def run(self, file_paths: List[str], table_name: str):
    """Index each file independently and report which ones succeeded or failed.

    A file counts as failed when processing raises or yields no chunks; in
    both cases the loop moves on to the next file.

    Returns:
        Dict with counts ("processed", "failed") plus the list of processed
        paths and a list of ``(path, reason)`` tuples for failures.
    """
    succeeded = []
    failures = []

    for path in file_paths:
        try:
            chunks = self._process_single_file(path)

            if not chunks:
                # Nothing to store: record it as a failure and move on.
                print(f"⚠️ No chunks generated from {path}")
                failures.append((path, "No chunks generated"))
                continue

            self._store_chunks(chunks, table_name)
            succeeded.append(path)

        except Exception as err:
            # One bad file must not abort the rest of the run.
            print(f"❌ Error processing {path}: {err}")
            failures.append((path, str(err)))

    return dict(
        processed=len(succeeded),
        failed=len(failures),
        processed_files=succeeded,
        failed_files=failures,
    )
```

#### Recovery Mechanisms
```python
def recover_from_partial_failure(self, table_name: str, document_id: str):
    """Recover from partial indexing failures.

    Looks up chunks already stored for ``document_id``. If any exist the
    document is treated as recovered; otherwise leftover partial data is
    cleaned up.

    Args:
        table_name: Table to inspect.
        document_id: Document whose chunks are looked up. Single quotes are
            escaped before the value is placed in the filter expression.

    Returns:
        True if chunks for the document were found, False if none were found
        or if the lookup/cleanup raised.
    """
    try:
        # Check what was already processed
        table = self.db_manager.get_table(table_name)

        # Escape embedded single quotes (SQL style: ' -> '') so an id such as
        # "o'brien" cannot break out of the string literal in the filter.
        safe_document_id = document_id.replace("'", "''")
        existing_chunks = table.search().where(f"document_id = '{safe_document_id}'").to_list()
        
        if existing_chunks:
            print(f"Found {len(existing_chunks)} existing chunks for {document_id}")
            return True
            
        # Cleanup partial data
        self._cleanup_partial_data(table_name, document_id)
        return False
        
    except Exception as e:
        print(f"Recovery failed: {e}")
        return False
```

### Configuration and Customization

#### Pipeline Configuration Options
```python
# Default pipeline configuration, grouped into one sub-dict per concern.
DEFAULT_CONFIG = {
    # Chunking settings
    "chunking": {
        "strategy": "docling",  # "docling", "recursive", "fixed"
        "max_tokens": 512,
        "overlap": 64,
        "min_chunk_size": 100
    },
    # Embedding settings
    "embedding": {
        "model_name": "Qwen/Qwen3-Embedding-0.6B",
        "batch_size": 32,
        "max_length": 512
    },
    # Enrichment settings
    "enrichment": {
        "enabled": True,
        "model": "qwen3:0.6b",
        "batch_size": 16
    },
    # Overview settings
    "overview": {
        "enabled": True,
        "max_chunks": 5,
        "model": "qwen3:0.6b"
    },
    # Storage / vector index settings
    "storage": {
        "create_index": True,
        "index_type": "IvfPq",
        "num_partitions": 256
    }
}
```

#### Custom Processing Hooks
```python
class IndexingPipeline:
    """Indexing pipeline sketch showing how user-supplied hooks wrap chunk processing."""

    def __init__(self, config: Dict, hooks: Dict = None):
        """Store the configuration and the optional ``{hook_name: callable}`` mapping."""
        self.config = config
        self.hooks = hooks or {}
    
    def _run_hook(self, hook_name: str, *args, **kwargs):
        """Execute custom processing hooks.

        Returns the hook's return value, or ``None`` when no hook is
        registered under ``hook_name``.
        """
        if hook_name in self.hooks:
            return self.hooks[hook_name](*args, **kwargs)
        return None
    
    def process_chunk(self, chunk: Dict) -> Dict:
        """Process single chunk with custom hooks.

        Order: "pre_chunk_process" hook, contextual enrichment (if an enricher
        is attached), then "post_chunk_process" hook. A hook that returns
        ``None`` leaves the chunk unchanged; any other return value, including
        an empty dict, replaces it.
        """
        # Pre-processing hook
        pre_result = self._run_hook("pre_chunk_process", chunk)
        if pre_result is not None:
            chunk = pre_result
        
        # Standard processing. The enricher is optional and may never have been
        # attached to this instance, so look it up defensively.
        enricher = getattr(self, "contextual_enricher", None)
        if enricher:
            chunk = enricher.enrich_chunk(chunk)
        
        # Post-processing hook
        post_result = self._run_hook("post_chunk_process", chunk)
        if post_result is not None:
            chunk = post_result
        
        return chunk
```

---

## Current Implementation Status

### Completed Features ✅
- DocLing-based PDF processing with OCR fallback
- Multiple chunking strategies (DocLing, Recursive, Fixed-size)
- Qwen3-Embedding-0.6B integration
- Contextual enrichment with qwen3:0.6b
- LanceDB storage with vector indexing
- Overview generation for query routing
- Batch processing and parallel execution
- Comprehensive error handling

### In Development 🚧
- Graph extraction and knowledge graph building
- Multimodal processing for images and tables
- Advanced late-chunking optimization
- Distributed processing support

### Planned Features 📋
- Custom model fine-tuning pipeline
- Real-time incremental indexing
- Cross-document relationship extraction
- Advanced metadata enrichment

---

## Performance Benchmarks

| Document Type | Processing Speed | Memory Usage | Storage Efficiency |
|---------------|------------------|--------------|-------------------|
| Text PDFs | 2-5 pages/sec | 2-4GB | 1MB/100 pages |
| Image PDFs | 0.5-1 page/sec | 4-8GB | 2MB/100 pages |
| Technical Docs | 1-3 pages/sec | 3-6GB | 1.5MB/100 pages |
| Research Papers | 2-4 pages/sec | 2-4GB | 1.2MB/100 pages |

## Extension Points

### Custom Chunkers
```python
class CustomChunker(BaseChunker):
    """Template for a user-defined chunker; fill in ``chunk`` with your own logic."""

    def chunk(self, text: str, document_id: str, metadata: Dict) -> List[Dict]:
        """Placeholder: as written this does nothing and returns ``None``."""
        # Implement custom chunking logic
        pass
```

### Custom Embedders
```python
class CustomEmbedder(BaseEmbedder):
    """Template for a user-defined embedder; fill in ``create_embeddings``."""

    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Placeholder: as written this does nothing and returns ``None``."""
        # Implement custom embedding generation
        pass
```

### Custom Enrichers
```python
class CustomEnricher(BaseEnricher):
    """Template for a user-defined enricher; fill in ``enrich_chunk``."""

    def enrich_chunk(self, chunk: Dict) -> Dict:
        """Placeholder: as written this does nothing and returns ``None``."""
        # Implement custom enrichment logic
        pass
``` 

================================================
FILE: Documentation/installation_guide.md
================================================
# 📦 RAG System Installation Guide

_Last updated: 2025-01-07_

This guide provides step-by-step instructions for installing and setting up the RAG system using either Docker or direct development approaches.

---

## 🎯 Installation Options

### Option 1: Docker Deployment (Production Ready) 🐳
- **Best for**: Production environments, isolated setups, easy management
- **Requirements**: Docker Desktop + Local Ollama
- **Setup time**: ~10 minutes

### Option 2: Direct Development (Developer Friendly) 💻
- **Best for**: Development, customization, debugging
- **Requirements**: Python + Node.js + Ollama
- **Setup time**: ~15 minutes

---

## 1. Prerequisites

### 1.1 System Requirements

#### **Minimum Requirements**
- **CPU**: 4 cores, 2.5GHz+
- **RAM**: 8GB (16GB recommended)
- **Storage**: 50GB free space
- **OS**: macOS 10.15+, Ubuntu 20.04+, Windows 10+

#### **Recommended Requirements**
- **CPU**: 8+ cores, 3.0GHz+
- **RAM**: 32GB+ (for large models)
- **Storage**: 200GB+ SSD
- **GPU**: NVIDIA GPU with 8GB+ VRAM (optional)

### 1.2 Common Dependencies

**Required for both approaches:**
- **Ollama**: AI model runtime (always required)
- **Git**: 2.30+ for cloning repository

**Docker-specific:**
- **Docker Desktop**: 24.0+ with Docker Compose

**Direct Development-specific:**
- **Python**: 3.8+ 
- **Node.js**: 16+ with npm

---

## 2. Ollama Installation (Required for Both)

### 2.1 Install Ollama

#### **macOS/Linux:**
```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Verify installation
ollama --version
```

#### **Windows:**
```bash
# Download from: https://ollama.ai/download
# Run the installer and follow setup wizard
```

### 2.2 Configure Ollama

```bash
# Start Ollama server
ollama serve

# In another terminal, install required models
ollama pull qwen3:0.6b      # Fast model (650MB)
ollama pull qwen3:8b        # High-quality model (4.7GB)

# Verify models are installed
ollama list

# Test Ollama
ollama run qwen3:0.6b "Hello, how are you?"
```

**⚠️ Important**: Keep Ollama running (`ollama serve`) for the entire setup process.

---

## 3. 🐳 Docker Installation & Setup

### 3.1 Install Docker

#### **macOS:**
```bash
# Install Docker Desktop via Homebrew
brew install --cask docker

# Or download from: https://www.docker.com/products/docker-desktop/
# Start Docker Desktop from Applications

# Verify installation
docker --version
docker compose version
```

#### **Ubuntu/Debian:**
```bash
# Update system
sudo apt-get update

# Install Docker using convenience script
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh

# Add user to docker group
sudo usermod -aG docker $USER
newgrp docker

# Install Docker Compose V2
sudo apt-get install docker-compose-plugin

# Verify installation
docker --version
docker compose version
```

#### **Windows:**
1. Download Docker Desktop from https://www.docker.com/products/docker-desktop/
2. Run installer and enable WSL 2 integration
3. Restart computer and start Docker Desktop
4. Verify in PowerShell: `docker --version`

### 3.2 Clone and Setup RAG System

```bash
# Clone repository
git clone <your-repository-url>
cd rag_system_old

# Verify Ollama is running
curl http://localhost:11434/api/tags

# Start Docker containers
./start-docker.sh

# Wait for containers to start (2-3 minutes)
sleep 120

# Verify deployment
./start-docker.sh status
```

### 3.3 Test Docker Deployment

```bash
# Test all endpoints
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"

# Access the application
open http://localhost:3000
```

---

## 4. 💻 Direct Development Setup

### 4.1 Install Development Dependencies

#### **Python Setup:**
```bash
# Clone repository
git clone https://github.com/your-org/rag-system.git
cd rag-system

# Create virtual environment (recommended)
python -m venv venv

# Activate virtual environment
source venv/bin/activate  # macOS/Linux
# venv\Scripts\activate   # Windows

# Install Python dependencies
pip install -r requirements.txt

# Verify Python setup
python -c "import torch; print('✅ PyTorch OK')"
python -c "import transformers; print('✅ Transformers OK')"
python -c "import lancedb; print('✅ LanceDB OK')"
```

#### **Node.js Setup:**
```bash
# Install Node.js dependencies
npm install

# Verify Node.js setup
node --version  # Should be 16+
npm --version
npm list --depth=0
```

### 4.2 Start Direct Development

```bash
# Ensure Ollama is running
curl http://localhost:11434/api/tags

# Start all components with one command
python run_system.py

# Or start components manually in separate terminals:
# Terminal 1: python -m rag_system.api_server
# Terminal 2: cd backend && python server.py  
# Terminal 3: npm run dev
```

### 4.3 Test Direct Development

```bash
# Check system health
python system_health_check.py

# Test endpoints
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"

# Access the application
open http://localhost:3000
```

---

## 5. Detailed Installation Steps

### 5.1 Repository Setup

```bash
# Clone repository
git clone https://github.com/your-org/rag-system.git
cd rag-system

# Check repository structure
ls -la

# Create required directories
mkdir -p lancedb index_store shared_uploads logs backend
touch backend/chat_data.db

# Set permissions
chmod -R 755 lancedb index_store shared_uploads
chmod 664 backend/chat_data.db
```

### 5.2 Configuration

#### **Environment Variables**
For Docker (automatic via `docker.env`):
```bash
OLLAMA_HOST=http://host.docker.internal:11434
NODE_ENV=production
RAG_API_URL=http://rag-api:8001
NEXT_PUBLIC_API_URL=http://localhost:8000
```

For Direct Development (set automatically by `run_system.py`):
```bash
OLLAMA_HOST=http://localhost:11434
RAG_API_URL=http://localhost:8001
NEXT_PUBLIC_API_URL=http://localhost:8000
```

#### **Model Configuration**
The system defaults to these models:
- **Embedding**: `Qwen/Qwen3-Embedding-0.6B` (1024 dimensions)
- **Generation**: `qwen3:0.6b` for fast responses, `qwen3:8b` for quality
- **Reranking**: Built-in cross-encoder

### 5.3 Database Initialization

```bash
# Initialize SQLite database
python -c "
from backend.database import ChatDatabase
db = ChatDatabase()
db.init_database()
print('✅ Database initialized')
"

# Verify database
sqlite3 backend/chat_data.db ".tables"
```

---

## 6. Verification & Testing

### 6.1 System Health Checks

#### **Comprehensive Health Check:**
```bash
# For Docker deployment
./start-docker.sh status
docker compose ps

# For Direct development
python system_health_check.py

# Universal health check
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"
```

#### **RAG System Test:**
```bash
# Test RAG system initialization
python -c "
from rag_system.main import get_agent
agent = get_agent('default')
print('✅ RAG System initialized successfully')
"

# Test embedding generation
python -c "
from rag_system.main import get_agent
agent = get_agent('default')
embedder = agent.retrieval_pipeline._get_text_embedder()
test_emb = embedder.create_embeddings(['Hello world'])
print(f'✅ Embedding generated: {test_emb.shape}')
"
```

### 6.2 Functional Testing

#### **Document Upload Test:**
1. Access http://localhost:3000
2. Click "Create New Index"
3. Upload a PDF document
4. Configure settings and build index
5. Test chat functionality

#### **API Testing:**
```bash
# Test session creation
curl -X POST http://localhost:8000/sessions \
  -H "Content-Type: application/json" \
  -d '{"title": "Test Session"}'

# Test models endpoint
curl http://localhost:8001/models

# Test health endpoints
curl http://localhost:8000/health
curl http://localhost:8001/health
```

---

## 7. Troubleshooting Installation

### 7.1 Common Issues

#### **Ollama Issues:**
```bash
# Ollama not responding
curl http://localhost:11434/api/tags

# If fails, restart Ollama
pkill ollama
ollama serve

# Reinstall models if needed
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

#### **Docker Issues:**
```bash
# Docker daemon not running
docker version

# Restart Docker Desktop (macOS/Windows)
# Or restart docker service (Linux)
sudo systemctl restart docker

# Clear Docker cache if build fails
docker system prune -f
```

#### **Python Issues:**
```bash
# Check Python version
python --version  # Should be 3.8+

# Check virtual environment
which python
pip list | grep torch

# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

#### **Node.js Issues:**
```bash
# Check Node version
node --version  # Should be 16+

# Clear and reinstall
rm -rf node_modules package-lock.json
npm install
```

### 7.2 Performance Issues

#### **Memory Problems:**
```bash
# Check system memory
free -h  # Linux
vm_stat  # macOS

# For Docker: Increase memory allocation
# Docker Desktop → Settings → Resources → Memory → 8GB+

# Use smaller models
ollama pull qwen3:0.6b  # Instead of qwen3:8b
```

#### **Slow Performance:**
- Use SSD storage for databases (`lancedb/`, `shared_uploads/`)
- Increase CPU cores if possible
- Close unnecessary applications
- Use smaller batch sizes in configuration

---

## 8. Post-Installation Setup

### 8.1 Model Optimization

```bash
# Install additional models (optional)
ollama pull nomic-embed-text        # Alternative embedding model
ollama pull llama3.1:8b            # Alternative generation model

# Test model switching
curl -X POST http://localhost:8001/chat \
  -H "Content-Type: application/json" \
  -d '{"query": "Hello", "model": "qwen3:8b"}'
```

### 8.2 Security Configuration

```bash
# Set proper file permissions
chmod 600 backend/chat_data.db    # Restrict database access
chmod 700 lancedb/                # Restrict vector DB access

# Configure firewall (production)
sudo ufw allow 3000/tcp           # Frontend
sudo ufw deny 8000/tcp            # Backend (internal only)
sudo ufw deny 8001/tcp            # RAG API (internal only)
```

### 8.3 Backup Setup

```bash
# Create backup script
cat > backup_system.sh << 'EOF'
#!/bin/bash
BACKUP_DIR="backups/$(date +%Y%m%d_%H%M%S)"
mkdir -p "$BACKUP_DIR"

# Backup databases and indexes
cp -r backend/chat_data.db "$BACKUP_DIR/"
cp -r lancedb "$BACKUP_DIR/"
cp -r index_store "$BACKUP_DIR/"
cp -r shared_uploads "$BACKUP_DIR/"

echo "Backup completed: $BACKUP_DIR"
EOF

chmod +x backup_system.sh
```

---

## 9. Success Criteria

### 9.1 Installation Complete When:

- ✅ All health checks pass without errors
- ✅ Frontend loads at http://localhost:3000
- ✅ All models are installed and responding
- ✅ You can create document indexes
- ✅ You can chat with uploaded documents
- ✅ No error messages in logs/terminal

### 9.2 Performance Benchmarks

**Acceptable Performance:**
- System startup: < 5 minutes
- Index creation: < 2 minutes per 100MB document
- Query response: < 30 seconds
- Memory usage: < 8GB total

**Optimal Performance:**
- System startup: < 2 minutes
- Index creation: < 1 minute per 100MB document
- Query response: < 10 seconds
- Memory usage: < 4GB total

---

## 10. Next Steps

### 10.1 Getting Started

1. **Upload Documents**: Create your first index with PDF documents
2. **Explore Features**: Try different query types and models
3. **Customize**: Adjust model settings and chunk sizes
4. **Scale**: Add more documents and create multiple indexes

### 10.2 Additional Resources

- **Quick Start**: See `Documentation/quick_start.md`
- **Docker Usage**: See `Documentation/docker_usage.md`
- **System Architecture**: See `Documentation/architecture_overview.md`
- **API Reference**: See `Documentation/api_reference.md`

---

**Congratulations! 🎉** Your RAG system is now ready to use. Visit http://localhost:3000 to start chatting with your documents. 

================================================
FILE: Documentation/prompt_inventory.md
================================================
# 📜 Prompt Inventory (Ground-Truth)

_All generation / verification prompts currently hard-coded in the codebase._  
_Last updated: 2025-07-06_

> Edit process: if you change a prompt in code, please **update this file** or, once we migrate to the central registry, delete the entry here.

---

## 1. Indexing / Context Enrichment

| ID | File & Lines | Variable / Builder | Purpose |
|----|--------------|--------------------|---------|
| `overview_builder.default` | `rag_system/indexing/overview_builder.py` `12-21` | `DEFAULT_PROMPT` | Generate 1-paragraph document overview for search-time routing. |
| `contextualizer.system` | `rag_system/indexing/contextualizer.py` `11` | `SYSTEM_PROMPT` | System instruction: explain summarisation role. |
| `contextualizer.local_context` | same file `13-15` | `LOCAL_CONTEXT_PROMPT_TEMPLATE` | Human message – wraps neighbouring chunks. |
| `contextualizer.chunk` | same file `17-19` | `CHUNK_PROMPT_TEMPLATE` | Human message – shows the target chunk. |
| `graph_extractor.entities` | `rag_system/indexing/graph_extractor.py` `20-31` | `entity_prompt` | Ask LLM to list entities. |
| `graph_extractor.relationships` | same file `53-64` | `relationship_prompt` | Ask LLM to list relationships. |

## 2. Retrieval / Query Transformation

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `query_transformer.expand` | `rag_system/retrieval/query_transformer.py` `10-26` | Produce query rewrites (keywords, boolean). |
| `hyde.hypothetical_doc` | same `115-122` | HyDE hypothetical document generator. |
| `graph_query.translate` | same `124-140` | Translate user question to JSON KG query. |

## 3. Pipeline Answer Synthesis

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `retrieval_pipeline.synth_final` | `rag_system/pipelines/retrieval_pipeline.py` `217-256` | Turn verified facts into answer (with directives 1-6). |

## 4. Agent – Classical Loop

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `agent.loop.initial_thought` | `rag_system/agent/loop.py` `157-180` | First LLM call to think about query. |
| `agent.loop.verify_path` | same `190-205` | Secondary thought loop. |
| `agent.loop.compose_sub` | same `506-542` | Compose answer from sub-answers. |
| `agent.loop.router` | same `648-660` | Decide which subsystem handles query. |

## 5. Verifier

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `verifier.fact_check` | `rag_system/agent/verifier.py` `18-58` | Strict JSON-format grounding verifier. |

## 6. Backend Router (Fast path)

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `backend.router` | `backend/server.py` `435-448` | Decide "RAG vs direct LLM" before heavy processing. |

## 7. Miscellaneous

| ID | File & Lines | Purpose |
|----|--------------|---------|
| `vision.placeholder` | `rag_system/utils/ollama_client.py` `169` | Dummy prompt for VLM colour check. |

---

### Missing / To-Do
1. Verify whether **ReActAgent.PROMPT_TEMPLATE** captures every placeholder – some of its earlier lines may need an explicit ID when we move to the central registry.
2. Search TS/JS code once the backend prompts are ported (currently none).

---

**Next step:** create `rag_system/prompts/registry.yaml` and start moving each prompt above into a key–value entry with identical IDs. Update callers gradually using the helper proposed earlier. 

================================================
FILE: Documentation/quick_start.md
================================================
# ⚡ Quick Start Guide - RAG System

_Get up and running in 5 minutes!_

---

## 🚀 Choose Your Deployment Method

### Option 1: Docker Deployment (Production Ready) 🐳

Best for: Production deployments, isolated environments, easy scaling

### Option 2: Direct Development (Developer Friendly) 💻  

Best for: Development, customization, debugging, faster iteration

---

## 🐳 Docker Deployment

### Prerequisites
- Docker Desktop installed and running
- 8GB+ RAM available
- Internet connection

### Step 1: Clone and Setup

```bash
# Clone repository
git clone <your-repository-url>
cd rag_system_old

# Ensure Docker is running
docker version
```

### Step 2: Install Ollama Locally

**Even with Docker, Ollama runs locally for better performance:**

```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama (in one terminal)
ollama serve

# Install models (in another terminal)
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

### Step 3: Start Docker Containers

```bash
# Start all containers
./start-docker.sh

# Or manually:
docker compose --env-file docker.env up --build -d
```

### Step 4: Verify Deployment

```bash
# Check container status
docker compose ps

# Test endpoints
curl http://localhost:3000      # Frontend
curl http://localhost:8000/health  # Backend  
curl http://localhost:8001/models  # RAG API
```

### Step 5: Access Application

Open your browser to: **http://localhost:3000**

---

## 💻 Direct Development

### Prerequisites
- Python 3.8+
- Node.js 16+ and npm
- 8GB+ RAM available

### Step 1: Clone and Install Dependencies

```bash
# Clone repository
git clone <your-repository-url>
cd rag_system_old

# Install Python dependencies
pip install -r requirements.txt

# Install Node.js dependencies  
npm install
```

### Step 2: Install and Configure Ollama

```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Start Ollama (in one terminal)
ollama serve

# Install models (in another terminal)
ollama pull qwen3:0.6b
ollama pull qwen3:8b
```

### Step 3: Start the System

```bash
# Start all components with one command
python run_system.py
```

**Or start components manually in separate terminals:**

```bash
# Terminal 1: RAG API
python -m rag_system.api_server

# Terminal 2: Backend
cd backend && python server.py

# Terminal 3: Frontend
npm run dev
```

### Step 4: Verify Installation

```bash
# Check system health
python system_health_check.py

# Test endpoints
curl http://localhost:3000      # Frontend
curl http://localhost:8000/health  # Backend
curl http://localhost:8001/models  # RAG API
```

### Step 5: Access Application

Open your browser to: **http://localhost:3000**

---

## 🎯 First Use Guide

### 1. Create a Chat Session
- Click "New Chat" in the interface
- Give your session a descriptive name

### 2. Upload Documents
- Click "Create New Index" button
- Upload PDF files from your computer
- Configure processing options:
  - **Chunk Size**: 512 (recommended)
  - **Embedding Model**: Qwen/Qwen3-Embedding-0.6B
  - **Enable Enrichment**: Yes
- Click "Build Index" and wait for processing

### 3. Start Chatting
- Select your built index
- Ask questions about your documents:
  - "What is this document about?"
  - "Summarize the key points"
  - "What are the main findings?"
  - "Compare the arguments in section 3 and 5"

---

## 🔧 Management Commands

### Docker Commands

```bash
# Container management
./start-docker.sh                    # Start all containers
./start-docker.sh stop              # Stop all containers
./start-docker.sh logs              # View logs
./start-docker.sh status            # Check status

# Manual Docker Compose
docker compose ps                    # Check status
docker compose logs -f              # Follow logs
docker compose down                 # Stop containers
docker compose up --build -d        # Rebuild and start
```

### Direct Development Commands

```bash
# System management
python run_system.py               # Start all services
python system_health_check.py      # Check system health

# Individual components
python -m rag_system.api_server    # RAG API only
cd backend && python server.py     # Backend only
npm run dev                         # Frontend only

# Stop: Press Ctrl+C in terminal running services
```

---

## 🆘 Quick Troubleshooting

### Docker Issues

**Containers not starting?**
```bash
# Check Docker daemon
docker version

# Restart Docker Desktop and try again
./start-docker.sh
```

**Port conflicts?**
```bash
# Check what's using ports
lsof -i :3000 -i :8000 -i :8001

# Stop conflicting processes
./start-docker.sh stop
```

### Direct Development Issues

**Import errors?**
```bash
# Check Python installation
python --version  # Should be 3.8+

# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

**Node.js errors?**
```bash
# Check Node version
node --version    # Should be 16+

# Reinstall dependencies
rm -rf node_modules package-lock.json
npm install
```

### Common Issues

**Ollama not responding?**
```bash
# Check if Ollama is running
curl http://localhost:11434/api/tags

# Restart Ollama
pkill ollama
ollama serve
```

**Out of memory?**
```bash
# Check memory usage
docker stats  # For Docker
htop          # For direct development

# Recommended: 16GB+ RAM for optimal performance
```

---

## 📊 System Verification

Run this comprehensive check:

```bash
# Check all endpoints
curl -f http://localhost:3000 && echo "✅ Frontend OK"
curl -f http://localhost:8000/health && echo "✅ Backend OK"  
curl -f http://localhost:8001/models && echo "✅ RAG API OK"
curl -f http://localhost:11434/api/tags && echo "✅ Ollama OK"

# For Docker: Check containers
docker compose ps
```

---

## 🎉 Success!

If you see:
- ✅ All services responding
- ✅ Frontend accessible at http://localhost:3000  
- ✅ No error messages

You're ready to start using LocalGPT!

### What's Next?

1. **📚 Upload Documents**: Add your PDF files to create indexes
2. **💬 Start Chatting**: Ask questions about your documents
3. **🔧 Customize**: Explore different models and settings
4. **📖 Learn More**: Check the full documentation below

### 📁 Key Files

```
rag-system/
├── 🐳 start-docker.sh           # Docker deployment script
├── 🏃 run_system.py             # Direct development launcher
├── 🩺 system_health_check.py    # System verification
├── 📋 requirements.txt          # Python dependencies
├── 📦 package.json              # Node.js dependencies
├── 📁 Documentation/            # Complete documentation
└── 📁 rag_system/              # Core system code
```

### 📖 Additional Resources

- **🏗️ Architecture**: See `Documentation/architecture_overview.md`
- **🔧 Configuration**: See `Documentation/system_overview.md`  
- **🚀 Deployment**: See `Documentation/deployment_guide.md`
- **🐛 Troubleshooting**: See `DOCKER_TROUBLESHOOTING.md`

---

**Happy RAG-ing! 🚀** 

---

## 🛠️ Indexing Scripts

The repository includes several convenient scripts for document indexing:

### Simple Index Creation Script

For quick document indexing without the UI:

```bash
# Basic usage
./simple_create_index.sh "Index Name" "document.pdf"

# Multiple documents
./simple_create_index.sh "Research Papers" "paper1.pdf" "paper2.pdf" "notes.txt"

# Using wildcards
./simple_create_index.sh "Invoice Collection" ./invoices/*.pdf
```

**Supported file types**: PDF, TXT, DOCX, MD

### Batch Indexing Script

For processing large document collections:

```bash
# Using the Python batch indexing script
python demo_batch_indexing.py

# Or using the direct indexing script
python create_index_script.py
```

These scripts automatically:
- ✅ Check prerequisites (Ollama running, Python dependencies)
- ✅ Validate document formats
- ✅ Create database entries
- ✅ Process documents with the RAG pipeline
- ✅ Generate searchable indexes

--- 

================================================
FILE: Documentation/retrieval_pipeline.md
================================================
# 📥 Retrieval Pipeline

_Maps to `rag_system/pipelines/retrieval_pipeline.py` and helpers in `retrieval/`, `rerankers/`._

## Role
Given a **user query** and one or more indexed tables, retrieve the most relevant text chunks and synthesise an answer.

## Sub-components
| Stage | Module | Key Classes / Fns | Notes |
|-------|--------|-------------------|-------|
| Query Pre-processing | `retrieval/query_transformer.py` | `QueryTransformer`, `HyDEGenerator`, `GraphQueryTranslator` | Expands, rewrites, or translates the raw query. |
| Retrieval | `retrieval/retrievers.py` | `BM25Retriever`, `DenseRetriever`, `HybridRetriever` | Abstract over LanceDB vector + FTS search. |
| Reranking | `rerankers/reranker.py` | `ColBERTSmall`, fallback `bge-reranker` | Optionally improves result ordering. |
| Synthesis | `pipelines/retrieval_pipeline.py` | `_synthesize_final_answer()` | Calls LLM with evidence snippets. |

## End-to-End Flow

```mermaid
flowchart LR
    Q["User Query"] --> XT["Query Transformer"]
    XT -->|variants| RETRIEVE
    subgraph Retrieval
        RET_BM25[BM25] --> MERGE
        RET_DENSE[Dense Vector] --> MERGE
        style RET_BM25 fill:#444,stroke:#ccc,color:#fff
        style RET_DENSE fill:#444,stroke:#ccc,color:#fff
    end
    MERGE --> RERANK
    RERANK --> K[["Top-K Chunks"]]
    K --> SYNTH["Answer Synthesiser\n(LLM)"]
    SYNTH --> A["Answer + Sources"]
```

### Narrative
1. **Query Transformer** may expand the query (keyword list, HyDE doc, KG translation) depending on `searchType`.
2. **Retrievers** execute BM25 and/or dense similarity against LanceDB.  Combination controlled by `retrievalMode` and `denseWeight`.
3. **Reranker** (if `aiRerank=true` or hybrid search) scores snippets; top `rerankerTopK` chosen.
4. **Synthesiser** streams an LLM completion using the prompt described in `prompt_inventory.md` (`retrieval_pipeline.synth_final`).

## Configuration Flags (passed from UI → backend)
| Flag | Default | Effect |
|------|---------|--------|
| `searchType` | `fts` | UI label (FTS / Dense / Hybrid). |
| `retrievalK` | 10 | Initial candidate count per retriever. |
| `contextWindowSize` | 5 | How many adjacent chunks to merge (late-chunk). |
| `rerankerTopK` | 20 | How many docs to pass into AI reranker. |
| `denseWeight` | 0.7 | When `hybrid`, linear mix weight. |
| `aiRerank` | bool | Toggle reranker. |
| `verify` | bool | If true, pass answer to **Verifier** component. |

## Interfaces
* Reads from **LanceDB** tables `text_pages_<index>`.
* Calls **Ollama** generation model specified in `PIPELINE_CONFIGS`.
* Exposes `RetrievalPipeline.answer_stream()` iterator consumed by SSE API.

## Extension Points
* Plug new retriever by inheriting `BaseRetriever` and registering in `retrievers.py`.
* Swap reranker model via `EXTERNAL_MODELS['reranker_model']`.
* Custom answer prompt can be overridden by passing `prompt_override` to `_synthesize_final_answer()` (not yet surfaced in UI).

## Detailed Implementation Analysis

### Core Architecture Pattern
The `RetrievalPipeline` uses **lazy initialization** for all components to avoid heavy memory usage during startup. Each component (embedder, retrievers, rerankers) is only loaded when first accessed via private `_get_*()` methods.

```python
def _get_text_embedder(self):
    if self.text_embedder is None:
        self.text_embedder = select_embedder(
            self.config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B"),
            self.ollama_config.get("host")
        )
    return self.text_embedder
```

### Thread Safety Implementation
**Critical Issue**: ColBERT reranker and model loading are not thread-safe. The system uses multiple locks:

```python
# Global locks to prevent race conditions
_rerank_lock: Lock = Lock()           # Protects .rank() calls
_ai_reranker_init_lock: Lock = Lock() # Prevents concurrent model loading
_sentence_pruner_lock: Lock = Lock()  # Serializes Provence model init
```

When multiple queries run in parallel, only one thread can initialize heavy models or perform reranking operations.

### Retrieval Strategy Deep-Dive

#### 1. Multi-Vector Dense Retrieval (`_get_dense_retriever()`)
```python
self.dense_retriever = MultiVectorRetriever(
    db_manager,           # LanceDB connection
    text_embedder,        # Qwen3-Embedding embedder
    vision_model=None,    # Optional multimodal
    fusion_config={}      # Score combination rules
)
```

**Process**:
1. Query → embedding vector (1024D for Qwen3-Embedding-0.6B)
2. LanceDB ANN search using IVF-PQ index
3. Cosine similarity scoring
4. Returns top-K with metadata

#### 2. BM25 Full-Text Search (`_get_bm25_retriever()`)
```sql
-- Uses SQLite FTS5 under the hood
SELECT chunk_id, text, bm25(fts_table) as score 
FROM fts_table 
WHERE fts_table MATCH ? 
ORDER BY bm25(fts_table) 
LIMIT ?
```

**Token Processing**:
- Stemming via Porter algorithm
- Stop-word removal
- N-gram tokenization (configurable)

#### 3. Hybrid Score Fusion
When both retrievers are enabled:
```python
final_score = (1 - dense_weight) * bm25_score + dense_weight * dense_score
```
Default `dense_weight = 0.7` favors semantic over lexical matching (updated from 0.5).

### Late-Chunk Merging Algorithm

**Problem**: Small chunks lose context; large chunks dilute relevance.  
**Solution**: Retrieve small chunks, then expand with neighbors.

```python
def _get_surrounding_chunks_lancedb(self, chunk, window_size):
    """Fetch the chunks that lie within ``window_size`` positions of a hit.

    Builds a filter on ``document_id`` and an inclusive ``chunk_index`` range
    (clamped at 0 on the low side), runs it against the table, and returns the
    matching rows ordered by ``chunk_index``.

    NOTE(review): this is an abridged excerpt - ``chunk_index``,
    ``document_id`` and ``tbl`` are not defined in the snippet; presumably they
    are derived from ``chunk`` and the LanceDB table in the full implementation.
    """
    # Inclusive window: up to `window_size` chunks on each side of the hit.
    start_index = max(0, chunk_index - window_size)
    end_index = chunk_index + window_size
    
    # NOTE(review): document_id is interpolated directly into the filter string;
    # this assumes it is a trusted, quote-free identifier - confirm.
    sql_filter = f"document_id = '{document_id}' AND chunk_index >= {start_index} AND chunk_index <= {end_index}"
    results = tbl.search().where(sql_filter).to_list()
    
    # Sort by chunk_index to maintain document order
    # (rows without a chunk_index sort as position 0)
    return sorted(results, key=lambda x: x.get("chunk_index", 0))
```

**Benefits**:
- Maintains granular search precision
- Provides richer context for answer generation
- Configurable window size (default: 5 chunks = ~2500 tokens)

### AI Reranker Implementation

#### ColBERT Strategy (via rerankers-lib)
```python
from rerankers import Reranker
self.ai_reranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type="colbert")

# Usage
scores = reranker.rank(query, [doc.text for doc in candidates])
```

**ColBERT Architecture**:
- **Query encoding**: Each token → 128D vector
- **Document encoding**: Each token → 128D vector  
- **Interaction**: MaxSim between all query-doc token pairs
- **Advantage**: Fine-grained token-level matching

#### Fallback: BGE Cross-Encoder
```python
# When ColBERT fails/unavailable
from sentence_transformers import CrossEncoder
model = CrossEncoder('BAAI/bge-reranker-base')
scores = model.predict([(query, doc.text) for doc in candidates])
```

### Answer Synthesis Pipeline

#### Prompt Engineering Pattern
```python
def _synthesize_final_answer(self, query: str, facts: str, *, event_callback=None):
    prompt = f"""
You are an AI assistant specialised in answering questions from retrieved context.

Context you receive
• VERIFIED FACTS – text snippets retrieved from the user's documents.
• ORIGINAL QUESTION – the user's actual query.

Instructions
1. Evaluate each snippet for relevance to the ORIGINAL QUESTION
2. Synthesise an answer **using only information from relevant snippets**
3. If snippets contradict, mention the contradiction explicitly
4. If insufficient information: "I could not find that information in the provided documents."
5. Provide thorough, well-structured answer with relevant numbers/names
6. Do **not** introduce external knowledge

–––––  Retrieved Snippets  –––––
{facts}
––––––––––––––––––––––––––––––

ORIGINAL QUESTION: "{query}"
"""

    response = self.llm_client.complete_stream(
        prompt=prompt,
        model=self.ollama_config["generation_model"]  # qwen3:8b
    )
    
    for chunk in response:
        if event_callback:
            event_callback({"type": "answer_chunk", "content": chunk})
        yield chunk
```

**Advanced Features**:
- **Source Attribution**: Automatic citation generation
- **Confidence Scoring**: Based on retrieval scores and snippet relevance
- **Answer Verification**: Optional grounding check via Verifier component

### Query Processing and Transformation

#### Query Decomposition
```python
class QueryDecomposer:
    """Splits a complex question into simpler sub-questions with an LLM call."""

    def decompose_query(self, query: str) -> List[str]:
        """Ask the enrichment model for 2-4 sub-questions and parse its reply.

        Args:
            query: The original question, inserted into the prompt.

        Returns:
            Whatever ``self._parse_subqueries`` produces from the raw completion.
        """
        prompt_text = f"""
        Break down this complex question into 2-4 simpler sub-questions that would help answer the original question.
        
        Original question: {query}
        
        Sub-questions:
        1.
        2.
        3.
        4.
        """

        # The small enrichment model (e.g. qwen3:0.6b) is used for speed.
        raw_completion = self.llm_client.complete(
            prompt=prompt_text,
            model=self.enrichment_model,
        )
        return self._parse_subqueries(raw_completion)
```

#### HyDE (Hypothetical Document Embeddings)
```python
class HyDEGenerator:
    """Produces a hypothetical answer passage for a query (HyDE)."""

    def generate_hypothetical_doc(self, query: str) -> str:
        """Return an LLM-written passage that would answer ``query``.

        The completion comes from ``self.llm_client.complete`` using
        ``self.enrichment_model`` and is returned with surrounding whitespace
        stripped.
        """
        prompt_text = f"""
        Generate a hypothetical document passage that would perfectly answer this question:
        
        Question: {query}
        
        Hypothetical passage:
        """

        completion = self.llm_client.complete(prompt=prompt_text, model=self.enrichment_model)
        return completion.strip()
```

### Caching and Performance Optimization

#### Semantic Query Caching
```python
class RetrievalPipeline:
    """Excerpt of the retrieval pipeline showing the semantic query cache."""

    def __init__(self, config, ollama_client, ollama_config):
        # TTL cache for embeddings and results
        self.query_cache = TTLCache(maxsize=100, ttl=300)  # 5 min TTL
        self.embedding_cache = LRUCache(maxsize=500)
        self.semantic_threshold = 0.98  # Similarity threshold for cache hits
        # get_cached_result() reads cache_scope on every hit, so it must always
        # exist. Any value other than "session" shares hits across sessions.
        self.cache_scope = (config or {}).get("cache_scope", "global")
    
    def get_cached_result(self, query: str, session_id: str = None) -> Optional[Dict]:
        """Return the cached result of a semantically similar query, if any.

        Args:
            query: Incoming query text; embedded with the pipeline's embedder.
            session_id: Session of the caller. Only consulted when
                ``self.cache_scope == "session"``, in which case entries stored
                for a different session are skipped.

        Returns:
            The ``"result"`` of the first cached entry whose embedding has a
            cosine similarity above ``self.semantic_threshold``, else ``None``.
        """
        query_embedding = self._get_text_embedder().create_embeddings([query])[0]
        
        for cached_query, cached_data in self.query_cache.items():
            cached_embedding = cached_data["embedding"]
            similarity = cosine_similarity([query_embedding], [cached_embedding])[0][0]
            
            if similarity > self.semantic_threshold:
                # Check session scope if configured
                if self.cache_scope == "session" and cached_data.get("session_id") != session_id:
                    continue
                
                print(f"🎯 Cache hit: {similarity:.3f} similarity")
                return cached_data["result"]
        
        return None
```

#### Batch Processing Optimizations
```python
def process_query_batch(self, queries: List[str]) -> List[Dict]:
    """Run retrieval for several queries, embedding them in a single batch.

    All queries are embedded with one ``create_embeddings`` call; each query is
    then searched (dense + BM25), merged and reranked on its own.

    Returns:
        One reranked result per query, in the same order as ``queries``.
    """
    # One embedding call for the whole batch.
    vectors = self._get_text_embedder().create_embeddings(queries)

    def _run_single(position, text):
        # Dense search reuses the pre-computed vector; BM25 uses the raw text.
        dense_hits = self._search_dense_with_embedding(vectors[position])
        keyword_hits = self._search_bm25(text)
        merged = self._combine_results(dense_hits, keyword_hits)
        return self._rerank_batch([text], [merged])[0]

    return [_run_single(position, text) for position, text in enumerate(queries)]
```

### Advanced Search Features

#### Conversational Context Integration
```python
def answer_with_history(self, query: str, conversation_history: List[Dict], **kwargs):
    """Answer ``query`` with recent conversation turns prepended as context.

    Args:
        query: The current user question.
        conversation_history: Prior turns, passed to
            ``_build_conversation_context``.
        **kwargs: Forwarded unchanged to ``answer_stream``.

    Returns:
        Whatever ``self.answer_stream`` returns for the (possibly expanded) query.
    """
    # Build conversational context
    context_prompt = self._build_conversation_context(conversation_history)

    # Without usable history there is nothing to prepend: send the query as-is
    # instead of wrapping it in an empty "Current question:" frame, which only
    # adds noise to retrieval.
    if not context_prompt:
        return self.answer_stream(query, **kwargs)

    # Expand query with context
    expanded_query = f"{context_prompt}\n\nCurrent question: {query}"

    # Process with expanded context
    return self.answer_stream(expanded_query, **kwargs)

def _build_conversation_context(self, history: List[Dict]) -> str:
    """Build context from conversation history."""
    context_parts = []
    
    for turn in history[-3:]:  # Last 3 turns for context
        if turn.get("role") == "user":
            context_parts.append(f"Previous question: {turn['content']}")
        elif turn.get("role") == "assistant":
            # Extract key points from previous answers
            context_parts.append(f"Previous context: {turn['content'][:200]}...")
    
    return "\n".join(context_parts)
```

#### Multi-Index Search
```python
def search_multiple_indexes(self, query: str, index_ids: List[str], **kwargs):
    """Search several indexes and merge their hits into one list.

    Each index is queried through ``_search_single_index`` on the table
    ``text_pages_<index_id>``; every hit is tagged with ``source_index``. An
    index that raises is reported with a warning and skipped. When more hits
    were collected than ``retrieval_k`` (default 20), the merged list is passed
    through ``_rerank_global``; otherwise it is returned as collected.
    """
    merged_hits = []

    for index_id in index_ids:
        table_name = f"text_pages_{index_id}"
        try:
            hits = self._search_single_index(query, table_name, **kwargs)
            # Tag each hit so callers can tell which index produced it.
            for hit in hits:
                hit["source_index"] = index_id
            merged_hits.extend(hits)
        except Exception as e:
            print(f"⚠️ Error searching index {index_id}: {e}")

    # Global reranking only pays off when there are more hits than requested.
    if len(merged_hits) <= kwargs.get("retrieval_k", 20):
        return merged_hits
    return self._rerank_global(query, merged_hits, **kwargs)
```

### Error Handling and Resilience

#### Graceful Degradation
```python
def answer_stream(self, query: str, **kwargs):
    """Stream an answer for ``query``, degrading gracefully when a stage fails.

    Stages are tried in order; the next one runs only if the previous one raised:

    1. ``_answer_stream_full_pipeline(query, **kwargs)``
    2. ``_answer_stream_fallback`` with ``search_type="dense"`` and
       ``ai_rerank=False``
    3. ``_direct_llm_answer(query)``

    Each stage is consumed with ``yield from`` inside its ``try`` block. A stage
    may be a lazy generator (as ``_direct_llm_answer`` is); simply returning it
    would let errors raised during iteration escape the ``except`` clauses, so
    the fallbacks would never run.

    Note: chunks already yielded by a stage that fails part-way are not
    retracted; the next stage's output follows them.

    Yields:
        Answer chunks from the first stage that completes.
    """
    try:
        # Try full pipeline
        yield from self._answer_stream_full_pipeline(query, **kwargs)
        return
    except Exception as e:
        print(f"⚠️ Full pipeline failed: {e}")

    try:
        # Fallback: Dense-only search
        kwargs["search_type"] = "dense"
        kwargs["ai_rerank"] = False
        yield from self._answer_stream_fallback(query, **kwargs)
        return
    except Exception as e2:
        print(f"⚠️ Fallback failed: {e2}")

    # Last resort: Direct LLM answer
    yield from self._direct_llm_answer(query)

def _direct_llm_answer(self, query: str):
    """Direct LLM answer as last resort."""
    prompt = f"""
    The document retrieval system is temporarily unavailable. 
    Please provide a helpful response acknowledging this limitation.
    
    User question: {query}
    
    Response:
    """
    
    response = self.llm_client.complete_stream(
        prompt=prompt,
        model=self.ollama_config["generation_model"]
    )
    
    yield "⚠️ Document search unavailable. Providing general response:\n\n"
    
    for chunk in response:
        yield chunk
```

#### Recovery Mechanisms
```python
def recover_from_embedding_failure(self, query: str, **kwargs):
    """Try to bring the embedding model back after a failure.

    Drops the current embedder, rebuilds it through ``_get_text_embedder`` and
    probes it with a one-item embedding request.

    Args:
        query: The query that triggered recovery (not used by the probe).
        **kwargs: Accepted for call-site compatibility. They are a private copy
            of the caller's options, so this method cannot change the caller's
            search settings through them.

    Returns:
        ``True`` if the rebuilt embedder answered the probe. ``False``
        otherwise, in which case the caller should retry with keyword-only
        search (``search_type="bm25"``, ``ai_rerank=False``).
    """
    print("🔄 Attempting embedding model recovery...")

    # Try to reinitialize embedder
    try:
        self.text_embedder = None  # Clear failed instance
        embedder = self._get_text_embedder()  # Reinitialize

        # Test with simple query
        test_embedding = embedder.create_embeddings(["test"])

        if test_embedding is not None:
            print("✅ Embedding model recovered")
            return True

    except Exception as e:
        print(f"❌ Recovery failed: {e}")

    # Writing search_type / ai_rerank into the local **kwargs would never reach
    # the caller, so the BM25 fallback is signalled through the return value.
    print("🔄 Falling back to keyword search only")
    return False
```

### Performance Monitoring and Metrics

#### Query Performance Tracking
```python
class PerformanceTracker:
    """Keeps running query metrics (count, mean latency, error rate)."""

    def __init__(self):
        self.metrics = {
            "query_count": 0,
            "avg_response_time": 0,
            "cache_hit_rate": 0,
            "error_rate": 0,
            "embedding_time": 0,
            "retrieval_time": 0,
            "reranking_time": 0,
            "synthesis_time": 0
        }

    @contextmanager
    def track_query(self, query: str):
        """Context manager for tracking query performance.

        Every tracked query - successful or failed - is counted exactly once.
        ``avg_response_time`` is the running mean duration over all tracked
        queries and ``error_rate`` is the fraction of them that raised.
        Exceptions from the wrapped block are re-raised unchanged.

        Args:
            query: The query being tracked (not recorded in the metrics).
        """
        start_time = time.time()
        failed = False

        try:
            yield
        except Exception:
            failed = True
            raise
        finally:
            # Single bookkeeping point, so query_count can never be bumped twice
            # and both running means share the same denominator.
            duration = time.time() - start_time
            previous_count = self.metrics["query_count"]
            total_count = previous_count + 1

            self.metrics["query_count"] = total_count
            self.metrics["avg_response_time"] = (
                self.metrics["avg_response_time"] * previous_count + duration
            ) / total_count
            self.metrics["error_rate"] = (
                self.metrics["error_rate"] * previous_count + (1 if failed else 0)
            ) / total_count
```

#### Resource Usage Monitoring
```python
def monitor_memory_usage(self):
    """Print the process's resident memory and which heavy components are loaded.

    Reports RSS in MB, names the embedder and reranker classes when those
    attributes are set and truthy, and runs ``gc.collect()`` after printing a
    warning if RSS exceeds 8 GB.
    """
    import psutil
    import gc

    rss_bytes = psutil.Process().memory_info().rss
    print(f"Memory Usage: {rss_bytes / 1024 / 1024:.1f} MB")

    # Component-specific monitoring
    embedder = getattr(self, 'text_embedder', None)
    if embedder:
        print(f"Embedder loaded: {type(embedder).__name__}")

    reranker = getattr(self, 'ai_reranker', None)
    if reranker:
        print(f"Reranker loaded: {type(reranker).__name__}")

    # Suggest cleanup if memory usage is high
    eight_gb = 8 * 1024 * 1024 * 1024
    if rss_bytes > eight_gb:
        print("⚠️ High memory usage detected - consider cleanup")
        gc.collect()
```

---

## Configuration Reference

### Default Pipeline Configuration
```python
RETRIEVAL_CONFIG = {
    "retriever": "multivector",
    "search_type": "hybrid",
    "retrieval_k": 20,
    "reranker_top_k": 10,
    "dense_weight": 0.7,
    "late_chunking": {
        "enabled": True,
        "window_size": 5
    },
    "ai_rerank": True,
    "verify_answers": False,
    "cache_enabled": True,
    "cache_ttl": 300,
    "semantic_cache_threshold": 0.98
}
```

### Model Configuration
```python
MODEL_CONFIG = {
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",
    "generation_model": "qwen3:8b",
    "enrichment_model": "qwen3:0.6b",
    "reranker_model": "answerdotai/answerai-colbert-small-v1",
    "fallback_reranker": "BAAI/bge-reranker-base"
}
```

### Performance Tuning
```python
PERFORMANCE_CONFIG = {
    "batch_sizes": {
        "embedding": 32,
        "reranking": 16,
        "synthesis": 1
    },
    "timeouts": {
        "embedding": 30,
        "retrieval": 60,
        "reranking": 30,
        "synthesis": 120
    },
    "memory_limits": {
        "max_cache_size": 1000,
        "max_results_per_query": 100,
        "chunk_size_limit": 2048
    }
}
```

## Extension Examples

### Custom Retriever Implementation
```python
class CustomRetriever(BaseRetriever):
    """Template for a project-specific retriever; fill in both methods."""

    def search(self, query: str, k: int = 10) -> List[Dict]:
        """Custom search hook.

        Placeholder: add your retrieval logic here. As written, the method has
        no body beyond this docstring and returns ``None``.
        """

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """Custom embedding hook.

        Placeholder: add your embedding logic here. As written, the method has
        no body beyond this docstring and returns ``None``.
        """
```

### Custom Reranker Implementation
```python
class CustomReranker(BaseReranker):
    """Template for a project-specific reranker."""

    def rank(self, query: str, documents: List[Dict]) -> List[Dict]:
        """Custom reranking hook.

        Placeholder: add your reranking logic here. As written, the method has
        no body beyond this docstring and returns ``None``.
        """
```

### Custom Query Transformer
```python
class CustomQueryTransformer:
    """Template for a project-specific query transformer."""

    def transform(self, query: str, context: Dict = None) -> str:
        """Custom query-transformation hook.

        Placeholder: add your transformation logic here. As written, the method
        has no body beyond this docstring and returns ``None``.
        """
``` 

================================================
FILE: Documentation/system_overview.md
================================================
# 🏗️ RAG System - Complete System Overview

_Last updated: 2025-01-09_

This document provides a comprehensive overview of the Advanced Retrieval-Augmented Generation (RAG) System, covering its architecture, components, data flow, and operational characteristics.

---

## 1. System Architecture

### 1.1 High-Level Architecture

The RAG system implements a four-tier service architecture (client, API gateway, processing, and LLM service) on top of a shared storage layer:

```mermaid
graph TB
    subgraph "Client Layer"
        Browser[👤 User Browser]
        UI[Next.js Frontend<br/>React/TypeScript]
        Browser --> UI
    end
    
    subgraph "API Gateway Layer"
        Backend[Backend Server<br/>Python HTTP Server<br/>Port 8000]
        UI -->|REST API| Backend
    end
    
    subgraph "Processing Layer"
        RAG[RAG API Server<br/>Document Processing<br/>Port 8001]
        Backend -->|Internal API| RAG
    end
    
    subgraph "LLM Service Layer"
        Ollama[Ollama Server<br/>LLM Inference<br/>Port 11434]
        RAG -->|Model Calls| Ollama
    end
    
    subgraph "Storage Layer"
        SQLite[(SQLite Database<br/>Sessions & Metadata)]
        LanceDB[(LanceDB<br/>Vector Embeddings)]
        FileSystem[File System<br/>Documents & Indexes]
        
        Backend --> SQLite
        RAG --> LanceDB
        RAG --> FileSystem
    end
```

### 1.2 Component Breakdown

| Component | Technology | Port | Purpose |
|-----------|------------|------|---------|
| **Frontend** | Next.js 15, React 19, TypeScript | 3000 | User interface, chat interactions |
| **Backend** | Python 3.11, HTTP Server | 8000 | API gateway, session management, routing |
| **RAG API** | Python 3.11, Advanced NLP | 8001 | Document processing, retrieval, generation |
| **Ollama** | Go-based LLM server | 11434 | Local LLM inference (embedding, generation) |
| **SQLite** | Embedded database | - | Sessions, messages, index metadata |
| **LanceDB** | Vector database | - | Document embeddings, similarity search |

---

## 2. Core Functionality

### 2.1 Intelligent Dual-Layer Routing

The system's key innovation is its **dual-layer routing architecture** that optimizes both speed and intelligence:

#### **Layer 1: Speed Optimization Routing**
- **Location**: `backend/server.py`
- **Purpose**: Route simple queries to Direct LLM (~1.3s) vs complex queries to RAG Pipeline (~20s)
- **Decision Logic**: Pattern matching, keyword detection, query complexity analysis

```text
# Example routing decisions
"Hello!" → Direct LLM (greeting pattern)
"What does the document say about pricing?" → RAG Pipeline (document keyword)
"What's 2+2?" → Direct LLM (simple + short)
"Summarize the key findings from the report" → RAG Pipeline (complex + indicators)
```

#### **Layer 2: Intelligence Optimization Routing**
- **Location**: `rag_system/agent/loop.py`
- **Purpose**: Within RAG pipeline, route to optimal processing method
- **Methods**: 
  - `direct_answer`: General knowledge queries
  - `rag_query`: Document-specific queries requiring retrieval
  - `graph_query`: Entity relationship queries (future feature)

### 2.2 Document Processing Pipeline

#### **Indexing Process**
1. **Document Upload**: PDF files uploaded via web interface
2. **Text Extraction**: Docling library extracts text with layout preservation
3. **Chunking**: Intelligent chunking with configurable strategies (DocLing, Late Chunking, Standard)
4. **Embedding**: Text converted to vector embeddings using Qwen models
5. **Storage**: Vectors stored in LanceDB with metadata in SQLite

#### **Retrieval Process**
1. **Query Processing**: User query analyzed and contextualized
2. **Embedding**: Query converted to vector embedding
3. **Search**: Hybrid search combining vector similarity and BM25 keyword matching
4. **Reranking**: AI-powered reranking for relevance optimization
5. **Synthesis**: LLM generates final answer using retrieved context

### 2.3 Advanced Features

#### **Query Decomposition**
- Complex queries automatically broken into sub-queries
- Parallel processing of sub-queries for efficiency
- Intelligent composition of final answers

#### **Contextual Enrichment**
- Conversation history integration
- Context-aware query expansion
- Session-based memory management

#### **Verification System**
- Answer verification against source documents
- Confidence scoring and grounding checks
- Source attribution and citation

---

## 3. Data Architecture

### 3.1 Storage Systems

#### **SQLite Database** (`backend/chat_data.db`)
```sql
-- Core tables
sessions          -- Chat sessions with metadata
messages          -- Individual messages and responses
indexes           -- Document index metadata
session_indexes   -- Links sessions to their indexes
```

#### **LanceDB Vector Store** (`./lancedb/`)
```
tables/
├── text_pages_[uuid]     -- Document text embeddings
├── image_pages_[uuid]    -- Image embeddings (future)
└── metadata_[uuid]       -- Document metadata
```

#### **File System** (`./index_store/`)
```
index_store/
├── overviews/           -- Document summaries for routing
├── bm25/               -- BM25 keyword indexes
└── graph/              -- Knowledge graph data
```

### 3.2 Data Flow

1. **Document Upload** → File System (`shared_uploads/`)
2. **Processing** → Embeddings stored in LanceDB
3. **Metadata** → Index info stored in SQLite
4. **Query** → Search LanceDB + SQLite coordination
5. **Response** → Message history stored in SQLite

---

## 4. Model Architecture

### 4.1 Configurable Model Pipeline

The system supports multiple embedding and generation models with automatic switching:

#### **Current Model Configuration**
```python
EXTERNAL_MODELS = {
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",  # 1024D
    "reranker_model": "answerdotai/answerai-colbert-small-v1",  # ColBERT reranker
    "vision_model": "Qwen/Qwen-VL-Chat",  # Vision model for multimodal
    "fallback_reranker": "BAAI/bge-reranker-base",  # Backup reranker
}

OLLAMA_CONFIG = {
    "generation_model": "qwen3:8b",  # High-quality generation
    "enrichment_model": "qwen3:0.6b",  # Fast enrichment/routing
    "host": "http://localhost:11434"
}
```

#### **Model Switching**
- **Per-Session**: Each chat session can use different embedding models
- **Automatic**: System automatically switches models based on index metadata
- **Dynamic**: Models loaded just-in-time to optimize memory usage

### 4.2 Supported Models

#### **Embedding Models**
- `Qwen/Qwen3-Embedding-0.6B` (1024D) - Default, fast and high-quality

#### **Generation Models** (via Ollama)
- `qwen3:8b` - Primary generation model (high quality)
- `qwen3:0.6b` - Fast enrichment and routing model

#### **Reranking Models**
- `answerdotai/answerai-colbert-small-v1` - Primary ColBERT reranker
- `BAAI/bge-reranker-base` - Fallback cross-encoder reranker

#### **Vision Models** (Multimodal)
- `Qwen/Qwen-VL-Chat` - Vision-language model for image processing

---

## 5. Pipeline Configurations

### 5.1 Default Production Pipeline

```python
PIPELINE_CONFIGS = {
    "default": {
        "description": "Production-ready pipeline with hybrid search, AI reranking, and verification",
        "storage": {
            "lancedb_uri": "./lancedb",
            "text_table_name": "text_pages_v3", 
            "bm25_path": "./index_store/bm25",
            "graph_path": "./index_store/graph/knowledge_graph.gml"
        },
        "retrieval": {
            "retriever": "multivector",
            "search_type": "hybrid",
            "late_chunking": {
                "enabled": True,
                "table_suffix": "_lc_v3"
            },
            "dense": { 
                "enabled": True,
                "weight": 0.7
            },
            "bm25": { 
                "enabled": True,
                "index_name": "rag_bm25_index"
            }
        },
        "embedding_model_name": "Qwen/Qwen3-Embedding-0.6B",
        "reranker": {
            "enabled": True,
            "model_name": "answerdotai/answerai-colbert-small-v1",
            "top_k": 20
        }
    }
}
```

### 5.2 Processing Options

#### **Chunking Strategies**
- **Standard**: Fixed-size chunks with overlap
- **DocLing**: Structure-aware chunking using DocLing library
- **Late Chunking**: Small chunks expanded at query time

#### **Enrichment Options**
- **Contextual Enrichment**: AI-generated chunk summaries
- **Overview Building**: Document-level summaries for routing
- **Graph Extraction**: Entity and relationship extraction

---

## 6. Performance Characteristics

### 6.1 Response Times

| Operation | Time Range | Notes |
|-----------|------------|-------|
| Simple Chat | 1-3 seconds | Direct LLM, no retrieval |
| Document Query | 5-15 seconds | Includes retrieval and reranking |
| Complex Analysis | 15-30 seconds | Multi-step reasoning |
| Document Indexing | 2-5 min/100MB | Depends on enrichment settings |

### 6.2 Memory Usage

| Component | Memory Usage | Notes |
|-----------|--------------|-------|
| Embedding Model | 1-2GB | Qwen3-Embedding-0.6B |
| Generation Model | 8-16GB | qwen3:8b |
| Reranker Model | 500MB-1GB | ColBERT reranker |
| Database Cache | 500MB-2GB | LanceDB and SQLite |

### 6.3 Scalability

- **Concurrent Users**: 5-10 users with 16GB RAM
- **Document Capacity**: 10,000+ documents per index
- **Query Throughput**: 10-20 queries/minute per instance
- **Storage**: Approximately 1MB per 100 pages indexed

---

## 7. Security & Privacy

### 7.1 Data Privacy

- **Local Processing**: All AI models run locally via Ollama
- **No External Calls**: No data sent to external APIs
- **Document Isolation**: Documents stored locally with session-based access
- **User Isolation**: Each session maintains separate context

---

## 8. Configuration & Customization

### 8.1 Model Configuration
Models can be configured in `rag_system/main.py`:

```python
# Embedding model configuration
EXTERNAL_MODELS = {
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",  # Your preferred model
    "reranker_model": "answerdotai/answerai-colbert-small-v1",
}

# Generation model configuration
OLLAMA_CONFIG = {
    "generation_model": "qwen3:8b",  # Your LLM model
    "enrichment_model": "qwen3:0.6b",  # Your fast model
}
```

### 8.2 Pipeline Configuration
Processing behavior is configured in `PIPELINE_CONFIGS`:

```python
PIPELINE_CONFIGS = {
    "retrieval": {
        "search_type": "hybrid",
        "dense": {"weight": 0.7},
        "bm25": {"enabled": True}
    },
    "chunking": {
        "chunk_size": 512,
        "chunk_overlap": 64,
        "enable_latechunk": True,
        "enable_docling": True
    }
}
```

### 8.3 UI Configuration
Frontend behavior is configured via environment variables:

```bash
NEXT_PUBLIC_API_URL=http://localhost:8000
NEXT_PUBLIC_ENABLE_STREAMING=true
NEXT_PUBLIC_MAX_FILE_SIZE=50MB
```

---

## 9. Monitoring & Observability

### 9.1 Logging System
- **Structured Logging**: JSON-formatted logs with timestamps
- **Log Levels**: DEBUG, INFO, WARNING, ERROR
- **Log Rotation**: Automatic log file rotation
- **Component Isolation**: Separate logs per service

### 9.2 Health Monitoring
- **Health Endpoints**: `/health` on all services
- **Service Dependencies**: Cascading health checks
- **Performance Metrics**: Response times, error rates
- **Resource Monitoring**: Memory, CPU, disk usage

### 9.3 Debugging Features
- **Debug Mode**: Detailed operation tracing
- **Query Inspection**: Step-by-step query processing
- **Model Switching Logs**: Embedding model change tracking
- **Error Reporting**: Comprehensive error context

---

## ⚙️ Configuration Modes

The system supports multiple configuration modes optimized for different use cases:

### **Default Mode** (`"default"`)
- **Description**: Production-ready pipeline with full features
- **Search**: Hybrid (dense + BM25) with 0.7 dense weight
- **Reranking**: AI-powered ColBERT reranker
- **Query Processing**: Query decomposition enabled
- **Verification**: Grounding verification enabled
- **Performance**: ~3-8 seconds per query
- **Memory**: ~10-16GB (with models loaded)

### **Fast Mode** (`"fast"`)  
- **Description**: Speed-optimized pipeline with minimal overhead
- **Search**: Vector-only (no BM25, no late chunking)
- **Reranking**: Disabled
- **Query Processing**: Single-pass, no decomposition
- **Verification**: Disabled
- **Performance**: ~1-3 seconds per query
- **Memory**: ~8-12GB (with models loaded)

### **BM25 Mode** (`"bm25"`)
- **Description**: Traditional keyword-based search
- **Search**: BM25 only
- **Use Case**: Exact keyword matching, legacy compatibility

### **Graph RAG Mode** (`"graph_rag"`)
- **Description**: Knowledge graph integration (currently disabled)
- **Status**: Available for future implementation
- **Use Case**: Relationship-aware retrieval

---

## 10. Development & Extension

### 10.1 Architecture Principles
- **Modular Design**: Clear separation of concerns
- **Configuration-Driven**: Behavior controlled via config files
- **Lazy Loading**: Components loaded on-demand
- **Thread Safety**: Proper synchronization for concurrent access

### 10.2 Extension Points
- **Custom Retrievers**: Implement `BaseRetriever` interface
- **Custom Chunkers**: Extend chunking strategies
- **Custom Models**: Add new embedding or generation models
- **Custom Pipelines**: Create specialized processing workflows

### 10.3 Testing Strategy
- **Unit Tests**: Individual component testing
- **Integration Tests**: End-to-end workflow testing
- **Performance Tests**: Load and stress testing
- **Health Checks**: Automated system validation

---

> **Note**: This overview reflects the current implementation as of 2025-01-09. For the latest changes, check the git history and individual component documentation. 

================================================
FILE: Documentation/triage_system.md
================================================
# 🔀 Triage / Routing System

_Maps to `rag_system/agent/loop.Agent._should_use_rag`, `_route_using_overviews`, and the fast-path router in `backend/server.py`._

## Purpose
Determine, for every incoming query, whether it should be answered by:
1. **Direct LLM Generation** (no retrieval) — faster, cheaper.
2. **Retrieval-Augmented Generation (RAG)** — when the answer likely requires document context.

## Decision Signals
| Signal | Source | Notes |
|--------|--------|-------|
| Keyword/regex check | `backend/server.py` (fast path) | Hard-coded quick wins (`what time`, `define`, etc.). |
| Index presence | SQLite (session → indexes) | If no indexes linked, direct LLM. |
| Overview routing | `_route_using_overviews()` | Uses document overviews and enrichment model to predict relevance. |
| LLM router prompt | `agent/loop.py` lines 648-665 | Final arbitrator (Ollama call, JSON output). |

## High-level Flow
```mermaid
flowchart TD
    Q["Incoming Query"] --> S1{Session\nHas Indexes?}
    S1 -- no --> LLM["Direct LLM Generation"]
    S1 -- yes --> S2{Fast Regex\nHeuristics}
    S2 -- match--> LLM
    S2 -- no --> S3{Overview\nRelevance > τ?}
    S3 -- low --> LLM
    S3 -- high --> S4["LLM Router\n(prompt @648)"]
    S4 -- "route: RAG" --> RAG["Retrieval Pipeline"]
    S4 -- "route: DIRECT" --> LLM
```

## Detailed Sequence (Code-level)
1. **backend/server.py**
   * `handle_session_chat()` builds `router_prompt` (line ~435) and makes a **first pass** decision before calling the heavy agent code.
2. **agent.loop._should_use_rag()**
   * Re-evaluates using richer features (e.g., token count, query type).
3. **Overviews Phase** (`_route_using_overviews()`)
   * Loads JSONL overviews file per index.
   * Calls enrichment model (`qwen3:0.6b`) with prompt: _"Does this overview mention … ? "_ → returns yes/no.
4. **LLM Router** (prompt lines 648-665)
   * JSON-only response `{ "route": "RAG" | "DIRECT" }`.

## Interfaces & Dependencies
| Component | Calls / Data |
|-----------|--------------|
| SQLite `chat_sessions` | Reads `indexes` column to know linked index IDs. |
| LanceDB Overviews | Reads `index_store/overviews/<idx>.jsonl`. |
| `OllamaClient` | Generates LLM router decision. |

## Config Flags
* `PIPELINE_CONFIGS.triage.enabled` – global toggle.
* Env var `TRIAGE_OVERVIEW_THRESHOLD` – min similarity score to prefer RAG (default 0.35).

## Failure / Fallback Modes
1. If overview file missing → skip to LLM router.
2. If LLM router errors → default to RAG (safer) but log warning.

---

_Keep this document updated whenever routing heuristics, thresholds, or prompt wording change._ 

================================================
FILE: Documentation/verifier.md
================================================
# ✅ Answer Verifier

_File: `rag_system/agent/verifier.py`_

## Objective
Assess whether an answer produced by RAG is **grounded** in the retrieved context snippets.

## Prompt (see `prompt_inventory.md` `verifier.fact_check`)
Strict JSON schema:
```jsonc
{
  "verdict": "SUPPORTED" | "NOT_SUPPORTED" | "NEEDS_CLARIFICATION",
  "is_grounded": true | false,
  "reasoning": "< ≤30 words >",
  "confidence_score": 0-100
}
```

## Sequence Diagram
```mermaid
sequenceDiagram
    participant RP as Retrieval Pipeline
    participant V as Verifier
    participant LLM as Ollama

    RP->>V: query, context, answer
    V->>LLM: verification prompt
    LLM-->>V: JSON verdict
    V-->>RP: VerificationResult
```

## Usage Sites
| Caller | Code | When |
|--------|------|------|
| `RetrievalPipeline.answer_stream()` | `pipelines/retrieval_pipeline.py` | If `verify=true` flag from frontend. |
| `Agent.loop.run()` | fallback path | Experimental for composed answers. |

## Config
| Flag | Default | Meaning |
|------|---------|---------|
| `verify` | false | Frontend toggle; if true verifier runs. |
| `generation_model` | `qwen3:8b` | Same model as answer generation. |

## Failure Modes
* If LLM returns invalid JSON → parse exception handled, result = NOT_SUPPORTED.
* If verification call times out → pipeline logs but still returns answer (unverified).

---
_Keep updated when schema or usage flags change._ 

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2025 PromptEngineer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# LocalGPT - Private Document Intelligence Platform

<div align="center">

<p align="center">
<a href="https://trendshift.io/repositories/2947" target="_blank"><img src="https://trendshift.io/api/badge/repositories/2947" alt="PromtEngineer%2FlocalGPT | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>

[![GitHub Stars](https://img.shields.io/github/stars/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/network/members)
[![GitHub Issues](https://img.shields.io/github/issues/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/PromtEngineer/localGPT?style=flat-square)](https://github.com/PromtEngineer/localGPT/pulls)
[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg?style=flat-square)](https://www.python.org/downloads/)
[![License](https://img.shields.io/badge/license-MIT-green.svg?style=flat-square)](LICENSE)
[![Docker](https://img.shields.io/badge/docker-supported-blue.svg?style=flat-square)](https://www.docker.com/)

<p align="center">
    <a href="https://x.com/engineerrprompt">
      <img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
    </a>
    <a href="https://discord.gg/tUDWAFGc">
      <img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
    </a>
  </p>
</div>

## 🚀 What is LocalGPT?

LocalGPT is a **fully private, on-premise Document Intelligence platform**. Ask questions, summarise, and uncover insights from your files with state-of-the-art AI—no data ever leaves your machine.

More than a traditional RAG (Retrieval-Augmented Generation) tool, LocalGPT features a **hybrid search engine** that blends semantic similarity, keyword matching, and [Late Chunking](https://jina.ai/news/late-chunking-in-long-context-embedding-models/) for long-context precision. A **smart router** automatically selects between RAG and direct LLM answering for every query, while **contextual enrichment** and sentence-level [Context Pruning](https://huggingface.co/naver/provence-reranker-debertav3-v1) surface only the most relevant content. An independent **verification** pass adds an extra layer of accuracy.

The architecture is **modular and lightweight**—enable only the components you need. With a pure-Python core and minimal dependencies, LocalGPT is simple to deploy, run, and maintain on any infrastructure. The system has minimal dependencies on frameworks and libraries, making it easy to deploy and maintain. The RAG system is pure Python and does not require any additional dependencies.

## ▶️ Video
Watch this [video](https://youtu.be/JTbtGH3secI) to get started with LocalGPT. 

| Home | Create Index | Chat |
|------|--------------|------|
| ![](Documentation/images/Home.png) | ![](Documentation/images/Index%20Creation.png) | ![](Documentation/images/Retrieval%20Process.png) |

## ✨ Features

- **Utmost Privacy**: Your data remains on your computer, ensuring 100% security.
- **Versatile Model Support**: Seamlessly integrate a variety of open-source models via Ollama.
- **Diverse Embeddings**: Choose from a range of open-source embeddings.
- **Reuse Your LLM**: Once downloaded, reuse your LLM without the need for repeated downloads.
- **Chat History**: Remembers your previous conversations (in a session).
- **API**: LocalGPT has an API that you can use for building RAG Applications.
- **GPU, CPU, HPU & MPS Support**: Supports multiple platforms out of the box. Chat with your data using `CUDA`, `CPU`, `HPU (Intel® Gaudi®)`, `MPS`, and more!

### 📖 Document Processing
- **Multi-format Support**: PDF, DOCX, TXT, Markdown, and more (Currently only PDF is supported)
- **Contextual Enrichment**: Enhanced document understanding with AI-generated context, inspired by [Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval)
- **Batch Processing**: Handle multiple documents simultaneously

### 🤖 AI-Powered Chat
- **Natural Language Queries**: Ask questions in plain English
- **Source Attribution**: Every answer includes document references
- **Smart Routing**: Automatically chooses between RAG and direct LLM responses
- **Query Decomposition**: Breaks complex queries into sub-questions for better answers
- **Semantic Caching**: TTL-based caching with similarity matching for faster responses
- **Session-Aware History**: Maintains conversation context across interactions
- **Answer Verification**: Independent verification pass for accuracy
- **Multiple AI Models**: Ollama for inference, HuggingFace for embeddings and reranking


### 🛠️ Developer-Friendly
- **RESTful APIs**: Complete API access for integration
- **Real-time Progress**: Live updates during document processing
- **Flexible Configuration**: Customize models, chunk sizes, and search parameters
- **Extensible Architecture**: Plugin system for custom components

### 🎨 Modern Interface
- **Intuitive Web UI**: Clean, responsive design
- **Session Management**: Organize conversations by topic
- **Index Management**: Easy document collection management
- **Real-time Chat**: Streaming responses for immediate feedback

---

## 🚀 Quick Start

Note: The installation is currently only tested on macOS. 

### Prerequisites
- Python 3.8 or higher (tested with Python 3.11.5)
- Node.js 16+ and npm (tested with Node.js 23.10.0, npm 10.9.2)
- Docker (optional, for containerized deployment)
- 8GB+ RAM (16GB+ recommended)
- Ollama (required for both deployment approaches)

### ***NOTE***
Until this branch is merged into the main branch, please clone this branch for installation:

```bash
git clone -b localgpt-v2 https://github.com/PromtEngineer/localGPT.git
cd localGPT
```

### Option 1: Docker Deployment 

```bash
# Clone the repository
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT

# Install Ollama locally (required even for Docker)
curl -fsSL https://ollama.ai/install.sh | sh
ollama pull qwen3:0.6b
ollama pull qwen3:8b

# Start Ollama
ollama serve

# Start with Docker (in a new terminal)
./start-docker.sh

# Access the application
open http://localhost:3000
```

**Docker Management Commands:**
```bash
# Check container status
docker compose ps

# View logs
docker compose logs -f

# Stop containers
./start-docker.sh stop
```

### Option 2: Direct Development (Recommended for Development)

```bash
# Clone the repository
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT

# Install Python dependencies
pip install -r requirements.txt

# Key dependencies installed:
# - torch==2.4.1, transformers==4.51.0 (AI models)
# - lancedb (vector database)
# - rank_bm25, fuzzywuzzy (search algorithms)
# - sentence_transformers, rerankers (embedding/reranking)
# - docling (document processing)
# - colpali-engine (multimodal processing - support coming soon)

# Install Node.js dependencies
npm install

# Install and start Ollama
curl -fsSL https://ollama.ai/install.sh | sh
ollama pull qwen3:0.6b
ollama pull qwen3:8b
ollama serve

# Start the system (in a new terminal)
python run_system.py

# Access the application
open http://localhost:3000
```

**System Management:**
```bash
# Check system health (comprehensive diagnostics)
python system_health_check.py

# Check service status and health
python run_system.py --health

# Start in production mode
python run_system.py --mode prod

# Skip frontend (backend + RAG API only)
python run_system.py --no-frontend

# View aggregated logs
python run_system.py --logs-only

# Stop all services
python run_system.py --stop
# Or press Ctrl+C in the terminal running python run_system.py
```

**Service Architecture:**
The `run_system.py` launcher manages four key services:
- **Ollama Server** (port 11434): AI model serving
- **RAG API Server** (port 8001): Document processing and retrieval
- **Backend Server** (port 8000): Session management and API endpoints
- **Frontend Server** (port 3000): React/Next.js web interface

### Option 3: Manual Component Startup

```bash
# Terminal 1: Start Ollama
ollama serve

# Terminal 2: Start RAG API
python -m rag_system.api_server

# Terminal 3: Start Backend
cd backend && python server.py

# Terminal 4: Start Frontend
npm run dev

# Access at http://localhost:3000
```

---

### Detailed Installation

#### 1. Install System Dependencies

**Ubuntu/Debian:**
```bash
sudo apt update
sudo apt install python3.8 python3-pip nodejs npm docker.io docker-compose
```

**macOS:**
```bash
brew install python@3.8 node npm docker docker-compose
```

**Windows:**
```bash
# Install Python 3.8+, Node.js, and Docker Desktop
# Then use PowerShell or WSL2
```

#### 2. Install AI Models

**Install Ollama (Recommended):**
```bash
# Install Ollama
curl -fsSL https://ollama.ai/install.sh | sh

# Pull recommended models
ollama pull qwen3:0.6b          # Fast generation model
ollama pull qwen3:8b            # High-quality generation model
```

#### 3. Configure Environment

```bash
# Copy environment template
cp .env.example .env

# Edit configuration
nano .env
```

**Key Configuration Options:**
```env
# AI Models (referenced in rag_system/main.py)
OLLAMA_HOST=http://localhost:11434

# Database Paths (used by backend and RAG system)
DATABASE_PATH=./backend/chat_data.db
VECTOR_DB_PATH=./lancedb

# Server Settings (used by run_system.py)
BACKEND_PORT=8000
FRONTEND_PORT=3000
RAG_API_PORT=8001

# Optional: Override default models
GENERATION_MODEL=qwen3:8b
ENRICHMENT_MODEL=qwen3:0.6b
EMBEDDING_MODEL=Qwen/Qwen3-Embedding-0.6B
RERANKER_MODEL=answerdotai/answerai-colbert-small-v1
```

#### 4. Initialize the System

```bash
# Run system health check
python system_health_check.py

# Initialize databases
python -c "from backend.database import ChatDatabase; ChatDatabase().init_database()"

# Test installation
python -c "from rag_system.main import get_agent; print('✅ Installation successful!')"

# Validate complete setup
python run_system.py --health
```

---

## 🎯 Getting Started

### 1. Create Your First Index

An **index** is a collection of processed documents that you can chat with.

#### Using the Web Interface:
1. Open http://localhost:3000
2. Click "Create New Index"
3. Upload your documents (PDF, DOCX, TXT)
4. Configure processing options
5. Click "Build Index"

#### Using Scripts:
```bash
# Simple script approach
./simple_create_index.sh "My Documents" "path/to/document.pdf"

# Interactive script
python create_index_script.py
```

#### Using API:
```bash
# Create index
curl -X POST http://localhost:8000/indexes \
  -H "Content-Type: application/json" \
  -d '{"name": "My Index", "description": "My documents"}'

# Upload documents
curl -X POST http://localhost:8000/indexes/INDEX_ID/upload \
  -F "files=@document.pdf"

# Build index
curl -X POST http://localhost:8000/indexes/INDEX_ID/build
```

### 2. Start Chatting

Once your index is built:

1. **Create a Chat Session**: Click "New Chat" or use an existing session
2. **Select Your Index**: Choose which document collection to query
3. **Ask Questions**: Type natural language questions about your documents
4. **Get Answers**: Receive AI-generated responses with source citations

### 3. Advanced Features

#### Custom Model Configuration
```bash
# Use different models for different tasks
curl -X POST http://localhost:8000/sessions \
  -H "Content-Type: application/json" \
  -d '{
    "title": "High Quality Session",
    "model": "qwen3:8b",
    "embedding_model": "Qwen/Qwen3-Embedding-4B"
  }'
```

#### Batch Document Processing
```bash
# Process multiple documents at once
python demo_batch_indexing.py --config batch_indexing_config.json
```

#### API Integration
```python
import requests

# Chat with your documents via API
response = requests.post('http://localhost:8000/chat', json={
    'query': 'What are the key findings in the research papers?',
    'session_id': 'your-session-id',
    'search_type': 'hybrid',
    'retrieval_k': 20
})

print(response.json()['response'])
```

---

## 🔧 Configuration

### Model Configuration

LocalGPT supports multiple AI model providers with centralized configuration:

#### Ollama Models (Local Inference)
```python
OLLAMA_CONFIG = {
    "host": "http://localhost:11434",
    "generation_model": "qwen3:8b",        # Main text generation
    "enrichment_model": "qwen3:0.6b"       # Lightweight routing/enrichment
}
```

#### External Models (HuggingFace Direct)
```python
EXTERNAL_MODELS = {
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",           # 1024 dimensions
    "reranker_model": "answerdotai/answerai-colbert-small-v1", # ColBERT reranker
    "fallback_reranker": "BAAI/bge-reranker-base"             # Backup reranker
}
```

### Pipeline Configuration

LocalGPT offers two main pipeline configurations:

#### Default Pipeline (Production-Ready)
```python
"default": {
    "description": "Production-ready pipeline with hybrid search, AI reranking, and verification",
    "storage": {
        "lancedb_uri": "./lancedb",
        "text_table_name": "text_pages_v3",
        "bm25_path": "./index_store/bm25"
    },
    "retrieval": {
        "retriever": "multivector",
        "search_type": "hybrid",
        "late_chunking": {"enabled": True},
        "dense": {"enabled": True, "weight": 0.7},
        "bm25": {"enabled": True}
    },
    "reranker": {
        "enabled": True,
        "type": "ai",
        "strategy": "rerankers-lib",
        "model_name": "answerdotai/answerai-colbert-small-v1",
        "top_k": 10
    },
    "query_decomposition": {"enabled": True, "max_sub_queries": 3},
    "verification": {"enabled": True},
    "retrieval_k": 20,
    "contextual_enricher": {"enabled": True, "window_size": 1}
}
```

#### Fast Pipeline (Speed-Optimized)
```python
"fast": {
    "description": "Speed-optimized pipeline with minimal overhead",
    "retrieval": {
        "search_type": "vector_only",
        "late_chunking": {"enabled": False}
    },
    "reranker": {"enabled": False},
    "query_decomposition": {"enabled": False},
    "verification": {"enabled": False},
    "retrieval_k": 10,
    "contextual_enricher": {"enabled": False}
}
```

### Search Configuration

```python
SEARCH_CONFIG = {
    'hybrid': {
        'dense_weight': 0.7,
        'sparse_weight': 0.3,
        'retrieval_k': 20,
        'reranker_top_k': 10
    }
}
```
---

## 🛠️ Troubleshooting

### Common Issues

#### Installation Problems
```bash
# Check Python version
python --version  # Should be 3.8+

# Check dependencies
pip list | grep -E "(torch|transformers|lancedb)"

# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

#### Model Loading Issues
```bash
# Check Ollama status
ollama list
curl http://localhost:11434/api/tags

# Pull missing models
ollama pull qwen3:0.6b
```

#### Database Issues
```bash
# Check database connectivity
python -c "from backend.database import ChatDatabase; db = ChatDatabase(); print('✅ Database OK')"

# Reset database (WARNING: This deletes all data)
rm backend/chat_data.db
python -c "from backend.database import ChatDatabase; ChatDatabase().init_database()"
```

#### Performance Issues
```bash
# Check system resources
python system_health_check.py

# Monitor memory usage
htop  # or Task Manager on Windows

# Optimize for low-memory systems
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
```

### Getting Help

1. **Check Logs**: The system creates structured logs in the `logs/` directory:
   - `logs/system.log`: Main system events and errors
   - `logs/ollama.log`: Ollama server logs
   - `logs/rag-api.log`: RAG API processing logs
   - `logs/backend.log`: Backend server logs
   - `logs/frontend.log`: Frontend build and runtime logs

2. **System Health**: Run comprehensive diagnostics:
   ```bash
   python system_health_check.py  # Full system diagnostics
   python run_system.py --health  # Service status check
   ```

3. **Health Endpoints**: Check individual service health:
   - Backend: `http://localhost:8000/health`
   - RAG API: `http://localhost:8001/health`
   - Ollama: `http://localhost:11434/api/tags`

4. **Documentation**: Check the [Technical Documentation](Documentation/) folder
5. **GitHub Issues**: Report bugs and request features
6. **Community**: Join our Discord/Slack community

---

## 🔗 API Reference

### Core Endpoints

#### Chat API
```http
# Session-based chat (recommended)
POST /sessions/{session_id}/chat
Content-Type: application/json

{
  "query": "What are the main topics discussed?",
  "search_type": "hybrid",
  "retrieval_k": 20,
  "ai_rerank": true,
  "context_window_size": 5
}

# Legacy chat endpoint
POST /chat
Content-Type: application/json

{
  "query": "What are the main topics discussed?",
  "session_id": "uuid",
  "search_type": "hybrid",
  "retrieval_k": 20
}
```

#### Index Management
```http
# Create index
POST /indexes
Content-Type: application/json
{
  "name": "My Index",
  "description": "Description",
  "config": "default"
}

# Get all indexes
GET /indexes

# Get specific index
GET /indexes/{id}

# Upload documents to index
POST /indexes/{id}/upload
Content-Type: multipart/form-data
files: [file1.pdf, file2.pdf, ...]

# Build index (process uploaded documents)
POST /indexes/{id}/build
Content-Type: application/json
{
  "config_mode": "default",
  "enable_enrich": true,
  "chunk_size": 512
}

# Delete index
DELETE /indexes/{id}
```

#### Session Management
```http
# Create session
POST /sessions
Content-Type: application/json
{
  "title": "My Session",
  "model": "qwen3:0.6b"
}

# Get all sessions
GET /sessions

# Get specific session
GET /sessions/{session_id}

# Get session documents
GET /sessions/{session_id}/documents

# Get session indexes
GET /sessions/{session_id}/indexes

# Link index to session
POST /sessions/{session_id}/indexes/{index_id}

# Delete session
DELETE /sessions/{session_id}

# Rename session
POST /sessions/{session_id}/rename
Content-Type: application/json
{
  "new_title": "Updated Session Name"
}
```

### Advanced Features

#### Query Decomposition
The system can break complex queries into sub-questions for better answers:
```http
POST /sessions/{session_id}/chat
Content-Type: application/json

{
  "query": "Compare the methodologies and analyze their effectiveness",
  "query_decompose": true,
  "compose_sub_answers": true
}
```

#### Answer Verification
Independent verification pass for accuracy using a separate verification model:
```http
POST /sessions/{session_id}/chat
Content-Type: application/json

{
  "query": "What are the key findings?",
  "verify": true
}
```

#### Contextual Enrichment
Document context enrichment during indexing for better understanding:
```bash
# Enable during index building
POST /indexes/{id}/build
{
  "enable_enrich": true,
  "window_size": 2
}
```

#### Late Chunking
Better context preservation by chunking after embedding:
```bash
# Configure in pipeline
"late_chunking": {"enabled": true}
```

#### Streaming Chat
```http
POST /chat/stream
Content-Type: application/json

{
  "query": "Explain the methodology",
  "session_id": "uuid",
  "stream": true
}
```

#### Batch Processing
```bash
# Using the batch indexing script
python demo_batch_indexing.py --config batch_indexing_config.json

# Example batch configuration (batch_indexing_config.json):
{
  "index_name": "Sample Batch Index",
  "index_description": "Example batch index configuration",
  "documents": [
    "./rag_system/documents/invoice_1039.pdf",
    "./rag_system/documents/invoice_1041.pdf"
  ],
  "processing": {
    "chunk_size": 512,
    "chunk_overlap": 64,
    "enable_enrich": true,
    "enable_latechunk": true,
    "enable_docling": true,
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",
    "generation_model": "qwen3:0.6b",
    "retrieval_mode": "hybrid",
    "window_size": 2
  }
}
```

```http
# API endpoint for batch processing
POST /batch/index
Content-Type: application/json

{
  "file_paths": ["doc1.pdf", "doc2.pdf"],
  "config": {
    "chunk_size": 512,
    "enable_enrich": true,
    "enable_latechunk": true,
    "enable_docling": true
  }
}
```

For complete API documentation, see [Documentation/api_reference.md](Documentation/api_reference.md).

---

## 🏗️ Architecture

LocalGPT is built with a modular, scalable architecture:

```mermaid
graph TB
    UI[Web Interface] --> API[Backend API]
    API --> Agent[RAG Agent]
    Agent --> Retrieval[Retrieval Pipeline]
    Agent --> Generation[Generation Pipeline]

    Retrieval --> Vector[Vector Search]
    Retrieval --> BM25[BM25 Search]
    Retrieval --> Rerank[Reranking]

    Vector --> LanceDB[(LanceDB)]
    BM25 --> BM25DB[(BM25 Index)]

    Generation --> Ollama[Ollama Models]
    Generation --> HF[Hugging Face Models]

    API --> SQLite[(SQLite DB)]
```

Overview of the Retrieval Agent

```mermaid
graph TD
    classDef llmcall fill:#e6f3ff,stroke:#007bff;
    classDef pipeline fill:#e6ffe6,stroke:#28a745;
    classDef cache fill:#fff3e0,stroke:#fd7e14;
    classDef logic fill:#f8f9fa,stroke:#6c757d;
    classDef thread stroke-dasharray: 5 5;

    A(Start: Agent.run) --> B["asyncio.run(_run_async)"];
    B --> C{_run_async};

    C --> C1[Get Chat History];
    C1 --> T1[Build Triage Prompt <br/> Query + Doc Overviews ];
    T1 --> T2["(asyncio.to_thread)<br/>LLM Triage: RAG or LLM_DIRECT?"]; class T2 llmcall,thread;
    T2 --> T3{Decision?};

    T3 -- RAG --> RAG_Path;
    T3 -- LLM_DIRECT --> LLM_Path;

    subgraph RAG Path
        RAG_Path --> R1[Format Query + History];
        R1 --> R2["(asyncio.to_thread)<br/>Generate Query Embedding"]; class R2 pipeline,thread;
        R2 --> R3{{Check Semantic Cache}}; class R3 cache;
        R3 -- Hit --> R_Cache_Hit(Return Cached Result);
        R_Cache_Hit --> R_Hist_Update;
        R3 -- Miss --> R4{Decomposition <br/> Enabled?};

        R4 -- Yes --> R5["(asyncio.to_thread)<br/>Decompose Raw Query"]; class R5 llmcall,thread;
        R5 --> R6{{Run Sub-Queries <br/> Parallel RAG Pipeline}}; class R6 pipeline,thread;
        R6 --> R7[Collect Results & Docs];
        R7 --> R8["(asyncio.to_thread)<br/>Compose Final Answer"]; class R8 llmcall,thread;
        R8 --> V1(RAG Answer);

        R4 -- No --> R9["(asyncio.to_thread)<br/>Run Single Query <br/>(RAG Pipeline)"]; class R9 pipeline,thread;
        R9 --> V1;

        V1 --> V2{{Verification <br/> await verify_async}}; class V2 llmcall;
        V2 --> V3(Final RAG Result);
        V3 --> R_Cache_Store{{Store in Semantic Cache}}; class R_Cache_Store cache;
        R_Cache_Store --> FinalResult;
    end

    subgraph Direct LLM Path
        LLM_Path --> L1[Format Query + History];
        L1 --> L2["(asyncio.to_thread)<br/>Generate Direct LLM Answer <br/> (No RAG)"]; class L2 llmcall,thread;
        L2 --> FinalResult(Final Direct Result);
    end

    FinalResult --> R_Hist_Update(Update Chat History);
    R_Hist_Update --> ZZZ(End: Return Result);
```

---

## 🤝 Contributing

We welcome contributions from developers of all skill levels! LocalGPT is an open-source project that benefits from community involvement.

### 🚀 Quick Start for Contributors

```bash
# Fork and clone the repository
git clone https://github.com/PromtEngineer/localGPT.git
cd localGPT

# Set up development environment
pip install -r requirements.txt
npm install

# Install Ollama and models
curl -fsSL https://ollama.ai/install.sh | sh
ollama pull qwen3:0.6b
ollama pull qwen3:8b

# Verify setup
python system_health_check.py
python run_system.py --mode dev
```

### 📋 How to Contribute

1. **🐛 Report Bugs**: Use our [bug report template](.github/ISSUE_TEMPLATE/bug_report.md)
2. **💡 Request Features**: Use our [feature request template](.github/ISSUE_TEMPLATE/feature_request.md)
3. **🔧 Submit Code**: Follow our [development workflow](CONTRIBUTING.md#development-workflow)
4. **📚 Improve Docs**: Help make our documentation better

### 📖 Detailed Guidelines

For comprehensive contributing guidelines, including:
- Development setup and workflow
- Coding standards and best practices
- Testing requirements
- Documentation standards
- Release process

**👉 See our [CONTRIBUTING.md](CONTRIBUTING.md) guide**

---

## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. For models, please check their respective licenses.

---

## 📞 Support

- **Documentation**: [Technical Docs](Documentation/)
- **Issues**: [GitHub Issues](https://github.com/PromtEngineer/localGPT/issues)
- **Discussions**: [GitHub Discussions](https://github.com/PromtEngineer/localGPT/discussions)
- **Business Deployment and Customization**: [Contact Us](https://tally.so/r/wv6R2d)
---

<div align="center">

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=PromtEngineer/localGPT&type=Date)](https://star-history.com/#PromtEngineer/localGPT&Date)

</div>


================================================
FILE: WATSONX_README.md
================================================
# Watson X Integration with Granite Models

This branch adds support for IBM Watson X AI with Granite models as an alternative to Ollama for running LocalGPT.

## Overview

LocalGPT now supports two LLM backends:
1. **Ollama** (default): Run models locally using Ollama
2. **Watson X**: Use IBM's Granite models hosted on Watson X AI

## What Changed

- Added `WatsonXClient` class in `rag_system/utils/watsonx_client.py` that provides an Ollama-compatible interface for Watson X
- Updated `factory.py` and `main.py` to support backend switching via environment variable
- Added `ibm-watsonx-ai` SDK dependency to `requirements.txt`
- Configuration now supports both backends through environment variables

## Prerequisites

To use Watson X with Granite models, you need:

1. IBM Cloud account with Watson X access
2. Watson X API key
3. Watson X project ID

### Getting Your Credentials

1. Go to [IBM Cloud](https://cloud.ibm.com/)
2. Navigate to Watson X AI service
3. Create or select a project
4. Get your API key from IBM Cloud IAM
5. Copy your project ID from the Watson X project settings

## Configuration

### Environment Variables

Create a `.env` file or set these environment variables:

```bash
# Choose LLM backend (default: ollama)
LLM_BACKEND=watsonx

# Watson X Configuration
WATSONX_API_KEY=your_api_key_here
WATSONX_PROJECT_ID=your_project_id_here
WATSONX_URL=https://us-south.ml.cloud.ibm.com

# Model Configuration
WATSONX_GENERATION_MODEL=ibm/granite-13b-chat-v2
WATSONX_ENRICHMENT_MODEL=ibm/granite-8b-japanese
```

### Available Granite Models

Watson X offers several Granite models:
- `ibm/granite-13b-chat-v2` - General purpose chat model
- `ibm/granite-13b-instruct-v2` - Instruction-following model
- `ibm/granite-20b-multilingual` - Multilingual support
- `ibm/granite-8b-japanese` - Lightweight Japanese model
- `ibm/granite-3b-code-instruct` - Code generation model

For a full list of available models, visit the [Watson X documentation](https://www.ibm.com/docs/en/watsonx/saas?topic=solutions-supported-foundation-models).

## Installation

1. Install the Watson X SDK:
```bash
pip install "ibm-watsonx-ai>=1.3.39"
```

Or install all dependencies:
```bash
pip install -r rag_system/requirements.txt
```

## Usage

### Running with Watson X

Once configured, simply set the environment variable and run as normal:

```bash
export LLM_BACKEND=watsonx
python -m rag_system.main api
```

Or in Python:

```python
import os
os.environ['LLM_BACKEND'] = 'watsonx'

from rag_system.factory import get_agent

# Get agent with Watson X backend
agent = get_agent(mode="default")

# Use as normal
result = agent.run("What is artificial intelligence?")
print(result)
```

### Switching Between Backends

You can easily switch between Ollama and Watson X:

```bash
# Use Ollama (local)
export LLM_BACKEND=ollama
python -m rag_system.main api

# Use Watson X (cloud)
export LLM_BACKEND=watsonx
python -m rag_system.main api
```

## Features

The Watson X client supports all the key features used by LocalGPT:

- ✅ Text generation / completion
- ✅ Async generation
- ✅ Streaming responses
- ✅ Embeddings (if using Watson X embedding models)
- ✅ Custom generation parameters (temperature, max_tokens, top_p, top_k)
- ⚠️ Image/multimodal support (limited, depends on model availability)

## API Compatibility

The `WatsonXClient` provides the same interface as `OllamaClient`:

```python
from rag_system.utils.watsonx_client import WatsonXClient

client = WatsonXClient(
    api_key="your_api_key",
    project_id="your_project_id"
)

# Generate completion
response = client.generate_completion(
    model="ibm/granite-13b-chat-v2",
    prompt="Explain quantum computing"
)

print(response['response'])

# Stream completion
for chunk in client.stream_completion(
    model="ibm/granite-13b-chat-v2",
    prompt="Write a story about AI"
):
    print(chunk, end='', flush=True)
```

## Limitations

1. **Embedding Models**: Watson X uses different embedding models than Ollama. Make sure to configure embedding models appropriately in `main.py` if needed.

2. **Multimodal Support**: Image support varies by model availability in Watson X. Not all Granite models support multimodal inputs.

3. **Streaming**: Streaming support depends on the Watson X SDK version and may fall back to returning the full response at once.

4. **Rate Limits**: Watson X has API rate limits that may differ from local Ollama usage. Monitor your usage accordingly.

## Troubleshooting

### Authentication Errors

If you see authentication errors:
- Verify your API key is correct
- Check that your project ID matches an existing Watson X project
- Ensure your IBM Cloud account has Watson X access

### Model Not Found

If you get model not found errors:
- Verify the model ID is correct (e.g., `ibm/granite-13b-chat-v2`)
- Check that the model is available in your Watson X instance
- Some models may require additional permissions

### Connection Errors

If you experience connection issues:
- Check your internet connection
- Verify the Watson X URL is correct for your region
- Check IBM Cloud status page for service outages

## Cost Considerations

Unlike local Ollama, Watson X is a cloud service with usage-based pricing:
- Token-based pricing for generation
- Consider your query volume
- Monitor usage through IBM Cloud dashboard

## Reverting to Ollama

To switch back to local Ollama:

```bash
unset LLM_BACKEND  # or set LLM_BACKEND=ollama
python -m rag_system.main api
```

## Support

For Watson X specific issues:
- [IBM Watson X Documentation](https://www.ibm.com/docs/en/watsonx/saas)
- [Watson X Developer Hub](https://www.ibm.com/watsonx/developer/)
- [IBM Cloud Support](https://cloud.ibm.com/docs/get-support)

For LocalGPT issues:
- [LocalGPT GitHub Issues](https://github.com/PromtEngineer/localGPT/issues)

## Contributing

If you find issues with the Watson X integration or want to add features:
1. Create an issue describing the problem/feature
2. Submit a pull request with your changes
3. Ensure all tests pass

## License

This integration follows the same license as LocalGPT (MIT License).


================================================
FILE: backend/README.md
================================================
# localGPT Backend

Simple Python backend that connects your frontend to Ollama for local LLM chat.

## Prerequisites

1. **Install Ollama** (if not already installed):
   ```bash
   # Visit https://ollama.ai or run:
   curl -fsSL https://ollama.ai/install.sh | sh
   ```

2. **Start Ollama**:
   ```bash
   ollama serve
   ```

3. **Pull a model** (optional, server will suggest if needed):
   ```bash
   ollama pull llama3.2
   ```

## Setup

1. **Install Python dependencies**:
   ```bash
   pip install -r requirements.txt
   ```

2. **Test Ollama connection**:
   ```bash
   python ollama_client.py
   ```

3. **Start the backend server**:
   ```bash
   python server.py
   ```

The server will run on `http://localhost:8000`.

## API Endpoints

### Health Check
```bash
GET /health
```
Returns server status and available models.

### Chat
```bash
POST /chat
Content-Type: application/json

{
  "message": "Hello!",
  "model": "llama3.2:latest",
  "conversation_history": []
}
```

Returns:
```json
{
  "response": "Hello! How can I help you?",
  "model": "llama3.2:latest",
  "message_count": 1
}
```

## Testing

Test the chat endpoint:
```bash
curl -X POST http://localhost:8000/chat \
  -H "Content-Type: application/json" \
  -d '{"message": "Hello!", "model": "llama3.2:latest"}'
```

## Frontend Integration

Your React frontend should connect to:
- **Backend**: `http://localhost:8000`
- **Chat endpoint**: `http://localhost:8000/chat`

## What's Next

This simple backend is ready for:
- ✅ **Real-time chat** with local LLMs
- 🔜 **Document upload** for RAG
- 🔜 **Vector database** integration
- 🔜 **Streaming responses**
- 🔜 **Chat history** persistence 

================================================
FILE: backend/database.py
================================================
import sqlite3
import uuid
import json
from datetime import datetime
from typing import List, Dict, Optional, Tuple

class ChatDatabase:
    def __init__(self, db_path: str = None):
        if db_path is None:
            # Auto-detect environment and set appropriate path
            import os
            if os.path.exists("/app"):  # Docker environment
                self.db_path = "/app/backend/chat_data.db"
            else:  # Local development environment
                self.db_path = "backend/chat_data.db"
        else:
            self.db_path = db_path
        self.init_database()
    
    def init_database(self):
        """Initialize the SQLite database with required tables"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # Enable foreign keys
        conn.execute("PRAGMA foreign_keys = ON")
        
        # Sessions table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS sessions (
                id TEXT PRIMARY KEY,
                title TEXT NOT NULL,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                model_used TEXT NOT NULL,
                message_count INTEGER DEFAULT 0
            )
        ''')
        
        # Messages table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS messages (
                id TEXT PRIMARY KEY,
                session_id TEXT NOT NULL,
                content TEXT NOT NULL,
                sender TEXT NOT NULL CHECK (sender IN ('user', 'assistant')),
                timestamp TEXT NOT NULL,
                metadata TEXT DEFAULT '{}',
                FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
            )
        ''')
        
        # Create indexes for better performance
        conn.execute('CREATE INDEX IF NOT EXISTS idx_messages_session_id ON messages(session_id)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_messages_timestamp ON messages(timestamp)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_sessions_updated_at ON sessions(updated_at)')
        
        # Documents table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS session_documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT NOT NULL,
                file_path TEXT NOT NULL,
                indexed INTEGER DEFAULT 0,
                FOREIGN KEY (session_id) REFERENCES sessions (id) ON DELETE CASCADE
            )
        ''')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_session_documents_session_id ON session_documents(session_id)')
        
        # --- NEW: Index persistence tables ---
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS indexes (
                id TEXT PRIMARY KEY,
                name TEXT UNIQUE,
                description TEXT,
                created_at TEXT,
                updated_at TEXT,
                vector_table_name TEXT,
                metadata TEXT
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS index_documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                index_id TEXT,
                original_filename TEXT,
                stored_path TEXT,
                FOREIGN KEY(index_id) REFERENCES indexes(id)
            )
        ''')

        cursor.execute('''
            CREATE TABLE IF NOT EXISTS session_indexes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                session_id TEXT,
                index_id TEXT,
                linked_at TEXT,
                FOREIGN KEY(session_id) REFERENCES sessions(id),
                FOREIGN KEY(index_id) REFERENCES indexes(id)
            )
        ''')
        
        conn.commit()
        conn.close()
        print("✅ Database initialized successfully")
    
    def create_session(self, title: str, model: str) -> str:
        """Create a new chat session"""
        session_id = str(uuid.uuid4())
        now = datetime.now().isoformat()
        
        conn = sqlite3.connect(self.db_path)
        conn.execute('''
            INSERT INTO sessions (id, title, created_at, updated_at, model_used)
            VALUES (?, ?, ?, ?, ?)
        ''', (session_id, title, now, now, model))
        conn.commit()
        conn.close()
        
        print(f"📝 Created new session: {session_id[:8]}... - {title}")
        return session_id
    
    def get_sessions(self, limit: int = 50) -> List[Dict]:
        """Get all chat sessions, ordered by most recent"""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        
        cursor = conn.execute('''
            SELECT id, title, created_at, updated_at, model_used, message_count
            FROM sessions
            ORDER BY updated_at DESC
            LIMIT ?
        ''', (limit,))
        
        sessions = [dict(row) for row in cursor.fetchall()]
        conn.close()
        
        return sessions
    
    def get_session(self, session_id: str) -> Optional[Dict]:
        """Get a specific session"""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        
        cursor = conn.execute('''
            SELECT id, title, created_at, updated_at, model_used, message_count
            FROM sessions
            WHERE id = ?
        ''', (session_id,))
        
        row = cursor.fetchone()
        conn.close()
        
        return dict(row) if row else None
    
    def add_message(self, session_id: str, content: str, sender: str, metadata: Dict = None) -> str:
        """Add a message to a session"""
        message_id = str(uuid.uuid4())
        now = datetime.now().isoformat()
        metadata_json = json.dumps(metadata or {})
        
        conn = sqlite3.connect(self.db_path)
        
        # Add the message
        conn.execute('''
            INSERT INTO messages (id, session_id, content, sender, timestamp, metadata)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (message_id, session_id, content, sender, now, metadata_json))
        
        # Update session timestamp and message count
        conn.execute('''
            UPDATE sessions 
            SET updated_at = ?, 
                message_count = message_count + 1
            WHERE id = ?
        ''', (now, session_id))
        
        conn.commit()
        conn.close()
        
        return message_id
    
    def get_messages(self, session_id: str, limit: int = 100) -> List[Dict]:
        """Get all messages for a session"""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        
        cursor = conn.execute('''
            SELECT id, content, sender, timestamp, metadata
            FROM messages
            WHERE session_id = ?
            ORDER BY timestamp ASC
            LIMIT ?
        ''', (session_id, limit))
        
        messages = []
        for row in cursor.fetchall():
            message = dict(row)
            message['metadata'] = json.loads(message['metadata'])
            messages.append(message)
        
        conn.close()
        return messages
    
    def get_conversation_history(self, session_id: str) -> List[Dict]:
        """Get conversation history in the format expected by Ollama"""
        messages = self.get_messages(session_id)
        
        history = []
        for msg in messages:
            history.append({
                "role": msg["sender"],
                "content": msg["content"]
            })
        
        return history
    
    def update_session_title(self, session_id: str, title: str):
        """Update session title"""
        conn = sqlite3.connect(self.db_path)
        conn.execute('''
            UPDATE sessions 
            SET title = ?, updated_at = ?
            WHERE id = ?
        ''', (title, datetime.now().isoformat(), session_id))
        conn.commit()
        conn.close()
    
    def delete_session(self, session_id: str) -> bool:
        """Delete a session and all its messages"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute('DELETE FROM sessions WHERE id = ?', (session_id,))
        deleted = cursor.rowcount > 0
        conn.commit()
        conn.close()
        
        if deleted:
            print(f"🗑️ Deleted session: {session_id[:8]}...")
        
        return deleted
    
    def cleanup_empty_sessions(self) -> int:
        """Remove sessions with no messages"""
        conn = sqlite3.connect(self.db_path)
        
        # Find sessions with no messages
        cursor = conn.execute('''
            SELECT s.id FROM sessions s
            LEFT JOIN messages m ON s.id = m.session_id
            WHERE m.id IS NULL
        ''')
        
        empty_sessions = [row[0] for row in cursor.fetchall()]
        
        # Delete empty sessions
        deleted_count = 0
        for session_id in empty_sessions:
            cursor = conn.execute('DELETE FROM sessions WHERE id = ?', (session_id,))
            if cursor.rowcount > 0:
                deleted_count += 1
                print(f"🗑️ Cleaned up empty session: {session_id[:8]}...")
        
        conn.commit()
        conn.close()
        
        if deleted_count > 0:
            print(f"✨ Cleaned up {deleted_count} empty sessions")
        
        return deleted_count
    
    def get_stats(self) -> Dict:
        """Get database statistics"""
        conn = sqlite3.connect(self.db_path)
        
        # Get session count
        cursor = conn.execute('SELECT COUNT(*) FROM sessions')
        session_count = cursor.fetchone()[0]
        
        # Get message count
        cursor = conn.execute('SELECT COUNT(*) FROM messages')
        message_count = cursor.fetchone()[0]
        
        # Get most used model
        cursor = conn.execute('''
            SELECT model_used, COUNT(*) as count
            FROM sessions
            GROUP BY model_used
            ORDER BY count DESC
            LIMIT 1
        ''')
        most_used_model = cursor.fetchone()
        
        conn.close()
        
        return {
            "total_sessions": session_count,
            "total_messages": message_count,
            "most_used_model": most_used_model[0] if most_used_model else None
        }

    def add_document_to_session(self, session_id: str, file_path: str) -> int:
        """Adds a document file path to a session."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute(
            "INSERT INTO session_documents (session_id, file_path) VALUES (?, ?)",
            (session_id, file_path)
        )
        doc_id = cursor.lastrowid
        conn.commit()
        conn.close()
        print(f"📄 Added document '{file_path}' to session {session_id[:8]}...")
        return doc_id

    def get_documents_for_session(self, session_id: str) -> List[str]:
        """Retrieves all document file paths for a given session."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute(
            "SELECT file_path FROM session_documents WHERE session_id = ?",
            (session_id,)
        )
        paths = [row[0] for row in cursor.fetchall()]
        conn.close()
        return paths

    # -------- Index helpers ---------

    def create_index(self, name: str, description: str|None = None, metadata: dict | None = None) -> str:
        idx_id = str(uuid.uuid4())
        created = datetime.now().isoformat()
        vector_table = f"text_pages_{idx_id}"
        conn = sqlite3.connect(self.db_path)
        conn.execute('''
            INSERT INTO indexes (id, name, description, created_at, updated_at, vector_table_name, metadata)
            VALUES (?,?,?,?,?,?,?)
        ''', (idx_id, name, description, created, created, vector_table, json.dumps(metadata or {})))
        conn.commit()
        conn.close()
        print(f"📂 Created new index '{name}' ({idx_id[:8]})")
        return idx_id

    def get_index(self, index_id: str) -> dict | None:
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        cur = conn.execute('SELECT * FROM indexes WHERE id=?', (index_id,))
        row = cur.fetchone()
        if not row:
            conn.close()
            return None
        idx = dict(row)
        idx['metadata'] = json.loads(idx['metadata'] or '{}')
        cur = conn.execute('SELECT original_filename, stored_path FROM index_documents WHERE index_id=?', (index_id,))
        docs = [{'filename': r[0], 'stored_path': r[1]} for r in cur.fetchall()]
        idx['documents'] = docs
        conn.close()
        return idx

    def list_indexes(self) -> list[dict]:
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        rows = conn.execute('SELECT * FROM indexes').fetchall()
        res = []
        for r in rows:
            item = dict(r)
            item['metadata'] = json.loads(item['metadata'] or '{}')
            # attach documents list for convenience
            docs_cur = conn.execute('SELECT original_filename, stored_path FROM index_documents WHERE index_id=?', (item['id'],))
            docs = [{'filename':d[0],'stored_path':d[1]} for d in docs_cur.fetchall()]
            item['documents'] = docs
            res.append(item)
        conn.close()
        return res

    def add_document_to_index(self, index_id: str, filename: str, stored_path: str):
        conn = sqlite3.connect(self.db_path)
        conn.execute('INSERT INTO index_documents (index_id, original_filename, stored_path) VALUES (?,?,?)', (index_id, filename, stored_path))
        conn.commit()
        conn.close()

    def link_index_to_session(self, session_id: str, index_id: str):
        conn = sqlite3.connect(self.db_path)
        conn.execute('INSERT INTO session_indexes (session_id, index_id, linked_at) VALUES (?,?,?)', (session_id, index_id, datetime.now().isoformat()))
        conn.commit()
        conn.close()

    def get_indexes_for_session(self, session_id: str) -> list[str]:
        conn = sqlite3.connect(self.db_path)
        cursor = conn.execute('SELECT index_id FROM session_indexes WHERE session_id=? ORDER BY linked_at', (session_id,))
        ids = [r[0] for r in cursor.fetchall()]
        conn.close()
        return ids

    def delete_index(self, index_id: str) -> bool:
        """Delete an index and its related records (documents, session links). Returns True if deleted."""
        conn = sqlite3.connect(self.db_path)
        try:
            # Get vector table name before deletion (optional, for LanceDB cleanup)
            cur = conn.execute('SELECT vector_table_name FROM indexes WHERE id = ?', (index_id,))
            row = cur.fetchone()
            vector_table_name = row[0] if row else None

            # Remove child rows first due to foreign‐key constraints
            conn.execute('DELETE FROM index_documents WHERE index_id = ?', (index_id,))
            conn.execute('DELETE FROM session_indexes WHERE index_id = ?', (index_id,))
            cursor = conn.execute('DELETE FROM indexes WHERE id = ?', (index_id,))
            deleted = cursor.rowcount > 0
            conn.commit()
        finally:
            conn.close()

        if deleted:
            print(f"🗑️ Deleted index {index_id[:8]}... and related records")
            # Optional: attempt to drop LanceDB table if available
            if vector_table_name:
                try:
                    from rag_system.indexing.embedders import LanceDBManager
                    import os
                    db_path = os.getenv('LANCEDB_PATH') or './rag_system/index_store/lancedb'
                    ldb = LanceDBManager(db_path)
                    db = ldb.db
                    if hasattr(db, 'table_names') and vector_table_name in db.table_names():
                        db.drop_table(vector_table_name)
                        print(f"🚮 Dropped LanceDB table '{vector_table_name}'")
                except Exception as e:
                    print(f"⚠️ Could not drop LanceDB table '{vector_table_name}': {e}")
        return deleted

    def update_index_metadata(self, index_id: str, updates: dict):
        """Merge new key/values into an index's metadata JSON column."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        cur = conn.execute('SELECT metadata FROM indexes WHERE id=?', (index_id,))
        row = cur.fetchone()
        if row is None:
            conn.close()
            raise ValueError("Index not found")
        existing = json.loads(row['metadata'] or '{}')
        existing.update(updates)
        conn.execute('UPDATE indexes SET metadata=?, updated_at=? WHERE id=?', (json.dumps(existing), datetime.now().isoformat(), index_id))
        conn.commit()
        conn.close()

    def inspect_and_populate_index_metadata(self, index_id: str) -> dict:
        """
        Inspect LanceDB table to extract metadata for older indexes.
        Returns the inferred metadata or empty dict if inspection fails.
        """
        try:
            # Get index info
            index_info = self.get_index(index_id)
            if not index_info:
                return {}
            
            # Check if metadata is already populated
            if index_info.get('metadata') and len(index_info['metadata']) > 0:
                return index_info['metadata']
            
            # Try to inspect the LanceDB table
            vector_table_name = index_info.get('vector_table_name')
            if not vector_table_name:
                return {}
            
            try:
                # Try to import the RAG system modules
                try:
                    from rag_system.indexing.embedders import LanceDBManager
                    import os
                    
                    # Use the same path as the system
                    db_path = os.getenv('LANCEDB_PATH') or './rag_system/index_store/lancedb'
                    ldb = LanceDBManager(db_path)
                    
                    # Check if table exists
                    if not hasattr(ldb.db, 'table_names') or vector_table_name not in ldb.db.table_names():
                        # Table doesn't exist - this means the index was never properly built
                        inferred_metadata = {
                            'status': 'incomplete',
                            'issue': 'Vector table not found - index may not have been built properly',
                            'vector_table_expected': vector_table_name,
                            'available_tables': list(ldb.db.table_names()) if hasattr(ldb.db, 'table_names') else [],
                            'metadata_inferred_at': datetime.now().isoformat(),
                            'metadata_source': 'lancedb_inspection'
                        }
                        self.update_index_metadata(index_id, inferred_metadata)
                        print(f"⚠️ Index {index_id[:8]}... appears incomplete - vector table missing")
                        return inferred_metadata
                    
                    # Get table and inspect schema/data
                    table = ldb.db.open_table(vector_table_name)
                    
                    # Get a sample record to inspect - use correct LanceDB API
                    try:
                        # Try to get sample data using proper LanceDB methods
                        sample_df = table.to_pandas()
                        if len(sample_df) == 0:
                            inferred_metadata = {
                                'status': 'empty',
                                'issue': 'Vector table exists but contains no data',
                                'metadata_inferred_at': datetime.now().isoformat(),
                                'metadata_source': 'lancedb_inspection'
                            }
                            self.update_index_metadata(index_id, inferred_metadata)
                            return inferred_metadata
                        
                        # Take only first row for inspection
                        sample_df = sample_df.head(1)
                    except Exception as e:
                        print(f"⚠️ Could not read data from table {vector_table_name}: {e}")
                        return {}
                    
                    # Infer metadata from table structure
                    inferred_metadata = {
                        'status': 'functional',
                        'total_chunks': len(table.to_pandas()),  # Get total count
                    }
                    
                    # Check vector dimensions
                    if 'vector' in sample_df.columns:
                        vector_data = sample_df['vector'].iloc[0]
                        if isinstance(vector_data, list):
                            inferred_metadata['vector_dimensions'] = len(vector_data)
                            
                            # Try to infer embedding model from vector dimensions
                            dim_to_model = {
                                384: 'BAAI/bge-small-en-v1.5 (or similar)',
                                512: 'sentence-transformers/all-MiniLM-L6-v2 (or similar)',
                                768: 'BAAI/bge-base-en-v1.5 (or similar)', 
                                1024: 'Qwen/Qwen3-Embedding-0.6B (or similar)',
                                1536: 'text-embedding-ada-002 (or similar)'
                            }
                            if len(vector_data) in dim_to_model:
                                inferred_metadata['embedding_model_inferred'] = dim_to_model[len(vector_data)]
                    
                    # Try to parse metadata from sample record
                    if 'metadata' in sample_df.columns:
                        try:
                            sample_metadata = json.loads(sample_df['metadata'].iloc[0])
                            # Look for common metadata fields that might give us clues
                            if 'document_id' in sample_metadata:
                                inferred_metadata['has_document_structure'] = True
                            if 'chunk_index' in sample_metadata:
                                inferred_metadata['has_chunk_indexing'] = True
                            if 'original_text' in sample_metadata:
                                inferred_metadata['has_contextual_enrichment'] = True
                                inferred_metadata['retrieval_mode_inferred'] = 'hybrid (contextual enrichment detected)'
                            
                            # Check for chunk size patterns
                            if 'text' in sample_df.columns:
                                text_length = len(sample_df['text'].iloc[0])
                                if text_length > 0:
                                    inferred_metadata['sample_chunk_length'] = text_length
                                    # Rough chunk size estimation
                                    estimated_tokens = text_length // 4  # rough estimate: 4 chars per token
                                    if estimated_tokens < 300:
                                        inferred_metadata['chunk_size_inferred'] = '256 tokens (estimated)'
                                    elif estimated_tokens < 600:
                                        inferred_metadata['chunk_size_inferred'] = '512 tokens (estimated)'
                                    else:
                                        inferred_metadata['chunk_size_inferred'] = '1024+ tokens (estimated)'
                                        
                        except (json.JSONDecodeError, KeyError):
                            pass
                    
                    # Check if FTS index exists
                    try:
                        indices = table.list_indices()
                        fts_exists = any('fts' in idx.name.lower() for idx in indices)
                        if fts_exists:
                            inferred_metadata['has_fts_index'] = True
                            inferred_metadata['retrieval_mode_inferred'] = 'hybrid (FTS + vector)'
                        else:
                            inferred_metadata['retrieval_mode_inferred'] = 'vector-only'
                    except:
                        pass
                    
                    # Add inspection timestamp
                    inferred_metadata['metadata_inferred_at'] = datetime.now().isoformat()
                    inferred_metadata['metadata_source'] = 'lancedb_inspection'
                    
                    # Update the database with inferred metadata
                    if inferred_metadata:
                        self.update_index_metadata(index_id, inferred_metadata)
                        print(f"🔍 Inferred metadata for index {index_id[:8]}...: {len(inferred_metadata)} fields")
                    
                    return inferred_metadata
                    
                except ImportError as import_error:
                    # RAG system modules not available - provide basic fallback metadata
                    print(f"⚠️ RAG system modules not available for inspection: {import_error}")
                    
                    # Check if this is actually a legacy index by looking at creation date
                    created_at = index_info.get('created_at', '')
                    is_recent = False
                    if created_at:
                        try:
                            from datetime import datetime, timedelta
                            created_date = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                            # Consider indexes created in the last 30 days as "recent"
                            is_recent = created_date > datetime.now().replace(tzinfo=created_date.tzinfo) - timedelta(days=30)
                        except:
                            pass
                    
                    # Provide basic fallback metadata with better status detection
                    if is_recent:
                        status = 'functional'
                        issue = 'Detailed configuration inspection requires RAG system modules, but index appears functional'
                    else:
                        status = 'legacy'
                        issue = 'This index was created before metadata tracking was implemented. Configuration details are not available.'
                    
                    fallback_metadata = {
                        'status': status,
                        'issue': issue,
                        'metadata_inferred_at': datetime.now().isoformat(),
                        'metadata_source': 'fallback_inspection',
                        'documents_count': len(index_info.get('documents', [])),
                        'created_at': index_info.get('created_at', 'unknown'),
                        'inspection_limitation': 'Backend server cannot access full RAG system modules for detailed inspection'
                    }
                    
                    # Try to infer some basic info from the vector table name
                    if vector_table_name:
                        fallback_metadata['vector_table_name'] = vector_table_name
                        fallback_metadata['note'] = 'Vector table exists but detailed inspection requires RAG system modules'
                    
                    self.update_index_metadata(index_id, fallback_metadata)
                    status_msg = "recent but limited inspection" if is_recent else "legacy"
                    print(f"📝 Added fallback metadata for {status_msg} index {index_id[:8]}...")
                    return fallback_metadata
                    
            except Exception as e:
                print(f"⚠️ Could not inspect LanceDB table for index {index_id[:8]}...: {e}")
                return {}
                
        except Exception as e:
            print(f"⚠️ Failed to inspect index metadata for {index_id[:8]}...: {e}")
            return {}

def generate_session_title(first_message: str, max_length: int = 50) -> str:
    """Generate a short session title from the first message of a chat.

    Steps: trim whitespace, drop one leading greeting/filler phrase,
    capitalize the first character, truncate to ``max_length`` characters
    (appending ``"..."``), and fall back to ``"New Chat"`` when fewer than
    three characters remain.

    Args:
        first_message: Raw text of the first user message.
        max_length: Maximum number of title characters kept before the
            ``"..."`` suffix is appended.

    Returns:
        The derived title, or ``"New Chat"`` if nothing usable is left.
    """
    # Clean up the message
    title = first_message.strip()

    # Remove a single leading filler phrase. The phrase must be a whole word:
    # "hi there" loses "hi", but "history" must not be turned into "story".
    prefixes = ["hey", "hi", "hello", "can you", "please", "i want", "i need"]
    title_lower = title.lower()
    for prefix in prefixes:
        if not title_lower.startswith(prefix):
            continue
        remainder = title[len(prefix):]
        if remainder and (remainder[0].isalnum() or remainder[0] == "_"):
            # The match is only the beginning of a longer word; keep looking.
            continue
        # Also drop the separator that followed the phrase ("Hello, ..." etc.)
        title = remainder.lstrip(" \t\r\n,.!?:;-")
        break

    # Capitalize first letter
    if title:
        title = title[0].upper() + title[1:]

    # Truncate if too long
    if len(title) > max_length:
        title = title[:max_length].strip() + "..."

    # Fallback
    if not title or len(title) < 3:
        title = "New Chat"

    return title

# Global database instance
db = ChatDatabase()


def _run_database_smoke_test(database):
    """Exercise the basic session/message operations and print the results.

    Creates one session, stores a user and an assistant message in it, then
    prints the message count, the session count and the database statistics.
    """
    print("🧪 Testing database...")

    # One throwaway session holding a two-message exchange
    chat_id = database.create_session("Test Chat", "llama3.2:latest")
    exchange = (
        ("Hello!", "user"),
        ("Hi there! How can I help you?", "assistant"),
    )
    for text, sender in exchange:
        database.add_message(chat_id, text, sender)

    # Read everything back
    stored_messages = database.get_messages(chat_id)
    print(f"📨 Messages: {len(stored_messages)}")

    known_sessions = database.get_sessions()
    print(f"📋 Sessions: {len(known_sessions)}")

    summary = database.get_stats()
    print(f"📊 Stats: {summary}")

    print("✅ Database test completed!")


if __name__ == "__main__":
    # Test the database
    _run_database_smoke_test(db)

================================================
FILE: backend/ollama_client.py
================================================
import requests
import json
import os
from typing import List, Dict, Optional

class OllamaClient:
    """Minimal HTTP client for an Ollama server's ``/api`` endpoints.

    Wraps the tags (model listing), pull and chat endpoints using
    ``requests``. Connection problems are reported through return values
    (``False``, ``[]`` or an error string) instead of being raised.
    """

    DEFAULT_BASE_URL = "http://localhost:11434"

    def __init__(self, base_url: Optional[str] = None):
        """Remember where the Ollama server lives.

        Args:
            base_url: Root URL of the server. When ``None`` the
                ``OLLAMA_HOST`` environment variable is used, falling back to
                ``http://localhost:11434``.
        """
        if base_url is None:
            base_url = os.getenv("OLLAMA_HOST", self.DEFAULT_BASE_URL)
        # Normalise the value so the joins below never produce "//api" and a
        # bare "host:port" setting still gives a URL that requests can open.
        base_url = base_url.strip().rstrip("/") or self.DEFAULT_BASE_URL
        if "://" not in base_url:
            base_url = f"http://{base_url}"
        self.base_url = base_url
        self.api_url = f"{base_url}/api"

    @staticmethod
    def _build_chat_payload(model: str, messages: List[Dict], stream: bool, enable_thinking: bool) -> Dict:
        """Build the JSON body shared by ``chat`` and ``chat_stream``."""
        payload = {
            "model": model,
            "messages": messages,
            "stream": stream,
        }
        # Multiple approaches to disable thinking tokens
        if not enable_thinking:
            payload.update({
                "think": False,  # Native Ollama parameter
                "options": {
                    "think": False,
                    "thinking": False,
                    "temperature": 0.7,
                    "top_p": 0.9
                }
            })
        else:
            payload["think"] = True
        return payload

    @staticmethod
    def _strip_thinking(content: str, in_thinking: bool) -> tuple:
        """Remove ``<think>``/``<thinking>`` sections from one streamed chunk.

        Args:
            content: Text of the current chunk.
            in_thinking: Whether an earlier chunk opened a thinking section
                that has not been closed yet.

        Returns:
            ``(visible_text, in_thinking)`` where ``visible_text`` is the part
            of ``content`` outside thinking sections and ``in_thinking`` is
            the state to pass in with the next chunk.
        """
        import re

        visible = []
        cursor = 0
        for tag in re.finditer(r'<(/?)think(?:ing)?>', content, flags=re.IGNORECASE):
            if not in_thinking:
                visible.append(content[cursor:tag.start()])
            # An opening tag (no slash) enters a thinking section, a closing tag leaves it.
            in_thinking = tag.group(1) == ""
            cursor = tag.end()
        if not in_thinking:
            visible.append(content[cursor:])
        return "".join(visible), in_thinking

    def is_ollama_running(self) -> bool:
        """Check if Ollama server is running (``GET /api/tags`` answers 200)."""
        try:
            response = requests.get(f"{self.base_url}/api/tags", timeout=5)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False

    def list_models(self) -> List[str]:
        """Get list of available model names; ``[]`` on any failure."""
        try:
            # Bounded wait: this is called from request handlers, so a hung
            # server must not block the caller forever.
            response = requests.get(f"{self.api_url}/tags", timeout=10)
            if response.status_code == 200:
                models = response.json().get("models", [])
                return [model["name"] for model in models if "name" in model]
            return []
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching models: {e}")
            return []

    def pull_model(self, model_name: str) -> bool:
        """Pull a model if not available.

        Streams the progress lines of ``POST /api/pull`` and prints each
        status.

        Returns:
            ``True`` when a ``"success"`` status is seen or the stream ends
            without an error line; ``False`` on a non-200 response, an
            ``"error"`` line, or a connection problem.
        """
        try:
            # Only the connection attempt is time-limited; downloads can be long.
            with requests.post(
                f"{self.api_url}/pull",
                json={"name": model_name},
                stream=True,
                timeout=(10, None)
            ) as response:
                if response.status_code != 200:
                    return False

                print(f"Pulling model {model_name}...")
                for line in response.iter_lines():
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # skip malformed progress lines
                    if not isinstance(data, dict):
                        continue
                    if "error" in data:
                        print(f"Error pulling model: {data['error']}")
                        return False
                    if "status" in data:
                        print(f"Status: {data['status']}")
                    if data.get("status") == "success":
                        return True
                return True
        except requests.exceptions.RequestException as e:
            print(f"Error pulling model: {e}")
            return False

    def chat(self, message: str, model: str = "llama3.2", conversation_history: List[Dict] = None, enable_thinking: bool = True) -> str:
        """Send a chat message to Ollama and return the full reply.

        Args:
            message: The new user message.
            model: Name of the model to query.
            conversation_history: Earlier ``{"role", "content"}`` messages;
                the list is not modified.
            enable_thinking: When ``False``, thinking is disabled in the
                request and ``<think>``/``<thinking>`` sections are removed
                from the reply.

        Returns:
            The reply text, or a string starting with ``"Error:"`` /
            ``"Connection error:"`` when the request fails.
        """
        if conversation_history is None:
            conversation_history = []

        # Add user message to conversation
        messages = conversation_history + [{"role": "user", "content": message}]
        payload = self._build_chat_payload(model, messages, False, enable_thinking)

        try:
            response = requests.post(
                f"{self.api_url}/chat",
                json=payload,
                timeout=60
            )

            if response.status_code != 200:
                return f"Error: {response.status_code} - {response.text}"

            result = response.json()
            response_text = result["message"]["content"]

            # Additional cleanup: remove any thinking tokens that might slip through
            if not enable_thinking:
                import re
                response_text = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL | re.IGNORECASE)
                response_text = re.sub(r'<thinking>.*?</thinking>', '', response_text, flags=re.DOTALL | re.IGNORECASE)
                response_text = response_text.strip()

            return response_text

        except requests.exceptions.RequestException as e:
            return f"Connection error: {e}"

    def chat_stream(self, message: str, model: str = "llama3.2", conversation_history: List[Dict] = None, enable_thinking: bool = True):
        """Stream chat response from Ollama.

        Yields the ``message.content`` text of each streamed line. When
        ``enable_thinking`` is ``False``, everything between an opening and a
        closing ``<think>``/``<thinking>`` tag is suppressed, including text
        that arrives in separate chunks between the two tags. A non-200
        response or a connection problem is reported by yielding a single
        ``"Error: ..."`` / ``"Connection error: ..."`` string.
        """
        if conversation_history is None:
            conversation_history = []

        messages = conversation_history + [{"role": "user", "content": message}]
        payload = self._build_chat_payload(model, messages, True, enable_thinking)

        try:
            # The context manager releases the connection even when the
            # consumer stops iterating early.
            with requests.post(
                f"{self.api_url}/chat",
                json=payload,
                stream=True,
                timeout=60
            ) as response:
                if response.status_code != 200:
                    yield f"Error: {response.status_code} - {response.text}"
                    return

                in_thinking = False
                for line in response.iter_lines():
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(data, dict):
                        continue
                    chunk = data.get("message")
                    if not isinstance(chunk, dict) or "content" not in chunk:
                        continue
                    content = chunk["content"]

                    # Filter out thinking tokens in streaming mode
                    if not enable_thinking:
                        visible, in_thinking = self._strip_thinking(content, in_thinking)
                        if content and not visible:
                            continue  # the whole chunk was thinking output
                        content = visible

                    yield content

        except requests.exceptions.RequestException as e:
            yield f"Connection error: {e}"

def main():
    """Smoke-test the Ollama client from the command line.

    Verifies the server is reachable, prints the available models, pulls
    ``llama3.2`` when no model with that base name is listed, and finally
    sends one chat message and prints the answer.
    """
    ollama = OllamaClient()

    # Nothing else can work without a reachable server
    if not ollama.is_ollama_running():
        print("❌ Ollama is not running. Please start Ollama first.")
        print("Install: https://ollama.ai")
        print("Run: ollama serve")
        return

    print("✅ Ollama is running!")

    available = ollama.list_models()
    print(f"Available models: {available}")

    # Compare on the base name only, i.e. the part before any ":tag"
    target_model = "llama3.2"
    installed_names = {entry.split(":")[0] for entry in available}
    if target_model not in installed_names:
        print(f"Model {target_model} not found. Pulling...")
        if not ollama.pull_model(target_model):
            print(f"❌ Failed to pull model {target_model}")
            return
        print(f"✅ Model {target_model} pulled successfully!")

    print("\n🤖 Testing chat...")
    reply = ollama.chat("Hello! Can you tell me a short joke?", target_model)
    print(f"AI: {reply}")


if __name__ == "__main__":
    main()

================================================
FILE: backend/requirements.txt
================================================
requests
python-dotenv
PyPDF2 

================================================
FILE: backend/server.py
================================================
import json
import http.server
import socketserver
import cgi
import os
import uuid
from urllib.parse import urlparse, parse_qs
import requests  # 🆕 Import requests for making HTTP calls
import sys
from datetime import datetime

# Add parent directory to path so we can import rag_system modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import RAG system modules for complete metadata
try:
    from rag_system.main import PIPELINE_CONFIGS
    RAG_SYSTEM_AVAILABLE = True
    print("✅ RAG system modules accessible from backend")
except ImportError as e:
    PIPELINE_CONFIGS = {}
    RAG_SYSTEM_AVAILABLE = False
    print(f"⚠️ RAG system modules not available: {e}")

from ollama_client import OllamaClient
from database import db, generate_session_title
import simple_pdf_processor as pdf_module
from simple_pdf_processor import initialize_simple_pdf_processor
from typing import List, Dict, Any
import re

# 🆕 Reusable TCPServer with address reuse enabled
class ReusableTCPServer(socketserver.TCPServer):
    """``TCPServer`` variant with address reuse switched on."""

    allow_reuse_address = True

class ChatHandler(http.server.BaseHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        """Create the handler and give it its own ``OllamaClient``.

        All positional and keyword arguments are forwarded unchanged to the
        base-class constructor.
        """
        # Must be assigned BEFORE super().__init__(): the base-class
        # constructor already processes the request, so the do_* methods run
        # inside that call and need self.ollama_client to exist.
        self.ollama_client = OllamaClient()
        super().__init__(*args, **kwargs)
    
    def do_OPTIONS(self):
        """Handle CORS preflight requests"""
        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', 'Content-Type')
        self.end_headers()
    
    def do_GET(self):
        """Handle GET requests"""
        parsed_path = urlparse(self.path)
        
        if parsed_path.path == '/health':
            self.send_json_response({
                "status": "ok",
                "ollama_running": self.ollama_client.is_ollama_running(),
                "available_models": self.ollama_client.list_models(),
                "database_stats": db.get_stats()
            })
        elif parsed_path.path == '/sessions':
            self.handle_get_sessions()
        elif parsed_path.path == '/sessions/cleanup':
            self.handle_cleanup_sessions()
        elif parsed_path.path == '/models':
            self.handle_get_models()
        elif parsed_path.path == '/indexes':
            self.handle_get_indexes()
        elif parsed_path.path.startswith('/indexes/') and parsed_path.path.count('/') == 2:
            index_id = parsed_path.path.split('/')[-1]
            self.handle_get_index(index_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/documents'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_get_session_documents(session_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/indexes'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_get_session_indexes(session_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.count('/') == 2:
            session_id = parsed_path.path.split('/')[-1]
            self.handle_get_session(session_id)
        else:
            self.send_response(404)
            self.end_headers()
    
    def do_POST(self):
        """Handle POST requests"""
        parsed_path = urlparse(self.path)
        
        if parsed_path.path == '/chat':
            self.handle_chat()
        elif parsed_path.path == '/sessions':
            self.handle_create_session()
        elif parsed_path.path == '/indexes':
            self.handle_create_index()
        elif parsed_path.path.startswith('/indexes/') and parsed_path.path.endswith('/upload'):
            index_id = parsed_path.path.split('/')[-2]
            self.handle_index_file_upload(index_id)
        elif parsed_path.path.startswith('/indexes/') and parsed_path.path.endswith('/build'):
            index_id = parsed_path.path.split('/')[-2]
            self.handle_build_index(index_id)
        elif parsed_path.path.startswith('/sessions/') and '/indexes/' in parsed_path.path:
            parts = parsed_path.path.split('/')
            session_id = parts[2]
            index_id = parts[4]
            self.handle_link_index_to_session(session_id, index_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/messages'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_session_chat(session_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/upload'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_file_upload(session_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/index'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_index_documents(session_id)
        elif parsed_path.path.startswith('/sessions/') and parsed_path.path.endswith('/rename'):
            session_id = parsed_path.path.split('/')[-2]
            self.handle_rename_session(session_id)
        else:
            self.send_response(404)
            self.end_headers()

    def do_DELETE(self):
        """Handle DELETE requests"""
        parsed_path = urlparse(self.path)
        
        if parsed_path.path.startswith('/sessions/') and parsed_path.path.count('/') == 2:
            session_id = parsed_path.path.split('/')[-1]
            self.handle_delete_session(session_id)
        elif parsed_path.path.startswith('/indexes/') and parsed_path.path.count('/') == 2:
            index_id = parsed_path.path.split('/')[-1]
            self.handle_delete_index(index_id)
        else:
            self.send_response(404)
            self.end_headers()
    
    def handle_chat(self):
        """Handle legacy chat requests (without sessions)"""
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))
            
            message = data.get('message', '')
            model = data.get('model', 'llama3.2:latest')
            conversation_history = data.get('conversation_history', [])
            
            if not message:
                self.send_json_response({
                    "error": "Message is required"
                }, status_code=400)
                return
            
            # Check if Ollama is running
            if not self.ollama_client.is_ollama_running():
                self.send_json_response({
                    "error": "Ollama is not running. Please start Ollama first."
                }, status_code=503)
                return
            
            # Get response from Ollama
            response = self.ollama_client.chat(message, model, conversation_history)
            
            self.send_json_response({
                "response": response,
                "model": model,
                "message_count": len(conversation_history) + 1
            })
            
        except json.JSONDecodeError:
            self.send_json_response({
                "error": "Invalid JSON"
            }, status_code=400)
        except Exception as e:
            self.send_json_response({
                "error": f"Server error: {str(e)}"
            }, status_code=500)
    
    def handle_get_sessions(self):
        """Get all chat sessions"""
        try:
            sessions = db.get_sessions()
            self.send_json_response({
                "sessions": sessions,
                "total": len(sessions)
            })
        except Exception as e:
            self.send_json_response({
                "error": f"Failed to get sessions: {str(e)}"
            }, status_code=500)
    
    def handle_cleanup_sessions(self):
        """Clean up empty sessions"""
        try:
            cleanup_count = db.cleanup_empty_sessions()
            self.send_json_response({
                "message": f"Cleaned up {cleanup_count} empty sessions",
                "cleanup_count": cleanup_count
            })
        except Exception as e:
            self.send_json_response({
                "error": f"Failed to cleanup sessions: {str(e)}"
            }, status_code=500)
    
    def handle_get_session(self, session_id: str):
        """Get a specific session with its messages"""
        try:
            session = db.get_session(session_id)
            if not session:
                self.send_json_response({
                    "error": "Session not found"
                }, status_code=404)
                return
            
            messages = db.get_messages(session_id)
            
            self.send_json_response({
                "session": session,
                "messages": messages
            })
        except Exception as e:
            self.send_json_response({
                "error": f"Failed to get session: {str(e)}"
            }, status_code=500)
    
    def handle_get_session_documents(self, session_id: str):
        """Return documents and basic info for a session."""
        try:
            session = db.get_session(session_id)
            if not session:
                self.send_json_response({"error": "Session not found"}, status_code=404)
                return

            docs = db.get_documents_for_session(session_id)

            # Extract original filenames from stored paths
            filenames = [os.path.basename(p).split('_', 1)[-1] if '_' in os.path.basename(p) else os.path.basename(p) for p in docs]

            self.send_json_response({
                "session": session,
                "files": filenames,
                "file_count": len(docs)
            })
        except Exception as e:
            self.send_json_response({"error": f"Failed to get documents: {str(e)}"}, status_code=500)
    
    def handle_create_session(self):
        """Create a new chat session"""
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))
            
            title = data.get('title', 'New Chat')
            model = data.get('model', 'llama3.2:latest')
            
            session_id = db.create_session(title, model)
            session = db.get_session(session_id)
            
            self.send_json_response({
                "session": session,
                "session_id": session_id
            }, status_code=201)
            
        except json.JSONDecodeError:
            self.send_json_response({
                "error": "Invalid JSON"
            }, status_code=400)
        except Exception as e:
            self.send_json_response({
                "error": f"Failed to create session: {str(e)}"
            }, status_code=500)
    
    def handle_session_chat(self, session_id: str):
        """
        Handle chat within a specific session.
        Intelligently routes between direct LLM (fast) and RAG pipeline (document-aware).
        """
        try:
            session = db.get_session(session_id)
            if not session:
                self.send_json_response({"error": "Session not found"}, status_code=404)
                return
            
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))
            message = data.get('message', '')

            if not message:
                self.send_json_response({"error": "Message is required"}, status_code=400)
                return

            if session['message_count'] == 0:
                title = generate_session_title(message)
                db.update_session_title(session_id, title)

            # Add user message to database first
            user_message_id = db.add_message(session_id, message, "user")
            
            # 🎯 SMART ROUTING: Decide between direct LLM vs RAG
            idx_ids = db.get_indexes_for_session(session_id)
            force_rag = bool(data.get("force_rag", False))
            use_rag = True if force_rag else self._should_use_rag(message, idx_ids)
            
            if use_rag:
                # 🔍 --- Use RAG Pipeline for Document-Related Queries ---
                print(f"🔍 Using RAG pipeline for document query: '{message[:50]}...'")
                response_text, source_docs = self._handle_rag_query(session_id, message, data, idx_ids)
            else:
                # ⚡ --- Use Direct LLM for General Queries (FAST) ---
                print(f"⚡ Using direct LLM for general query: '{message[:50]}...'")
                response_text, source_docs = self._handle_direct_llm_query(session_id, message, session)

            # Add AI response to database
            ai_message_id = db.add_message(session_id, response_text, "assistant")
            
            updated_session = db.get_session(session_id)
            
            # Send response with proper error handling
            self.send_json_response({
                "response": response_text,
                "session": updated_session,
                "source_documents": source_docs,
                "used_rag": use_rag
            })
            
        except BrokenPipeError:
            # Client disconnected - this is normal for long queries, just log it
            print(f"⚠️  Client disconnected during RAG processing for query: '{message[:30]}...'")
        except json.JSONDecodeError:
            self.send_json_response({
                "error": "Invalid JSON"
            }, status_code=400)
        except Exception as e:
            print(f"❌ Server error in session chat: {str(e)}")
            try:
                self.send_json_response({
                    "error": f"Server error: {str(e)}"
                }, status_code=500)
            except BrokenPipeError:
                print(f"⚠️  Client disconnected during error response")
    
    def _should_use_rag(self, message: str, idx_ids: List[str]) -> bool:
        """
        🧠 ENHANCED: Determine if a query should use RAG pipeline using document overviews.
        
        Args:
            message: The user's query
            idx_ids: List of index IDs associated with the session
            
        Returns:
            bool: True if should use RAG, False for direct LLM
        """
        # No indexes = definitely no RAG needed
        if not idx_ids:
            return False

        # Load document overviews for intelligent routing
        try:
            doc_overviews = self._load_document_overviews(idx_ids)
            if doc_overviews:
                return self._route_using_overviews(message, doc_overviews)
        except Exception as e:
            print(f"⚠️ Overview-based routing failed, falling back to simple routing: {e}")
        
        # Fallback to simple pattern matching if overviews unavailable
        return self._simple_pattern_routing(message, idx_ids)

    def _load_document_overviews(self, idx_ids: List[str]) -> List[str]:
        """Load and aggregate overviews for the given index IDs.
        
        Strategy:
        1. Attempt to load each index's dedicated overview file.
        2. Aggregate all overviews found across available files (deduplicated).
        3. If none of the index files exist, fall back to the legacy global overview file.
        """
        import os, json

        aggregated: list[str] = []

        # 1️⃣  Collect overviews from per-index files
        for idx in idx_ids:
            candidate_paths = [
                f"../index_store/overviews/{idx}.jsonl",
                f"index_store/overviews/{idx}.jsonl",
                f"./index_store/overviews/{idx}.jsonl",
            ]
            for p in candidate_paths:
                if os.path.exists(p):
                    print(f"📖 Loading overviews from: {p}")
                    try:
                        with open(p, "r", encoding="utf-8") as f:
                            for line in f:
                                if not line.strip():
                                    continue
                                try:
                                    record = json.loads(line)
                                    overview = record.get("overview", "").strip()
                                    if overview:
                                        aggregated.append(overview)
                                except json.JSONDecodeError:
                                    continue  # skip malformed lines
                        break  # Stop after the first existing path for this idx
                    except Exception as e:
                        print(f"⚠️ Error reading {p}: {e}")
                        break  # Don't keep trying other paths for this idx if read failed

        # 2️⃣  Fall back to legacy global file if no per-index overviews found
        if not aggregated:
            legacy_paths = [
                "../index_store/overviews/overviews.jsonl",
                "index_store/overviews/overviews.jsonl",
                "./index_store/overviews/overviews.jsonl",
            ]
            for p in legacy_paths:
                if os.path.exists(p):
                    print(f"⚠️ Falling back to legacy overviews file: {p}")
                    try:
                        with open(p, "r", encoding="utf-8") as f:
                            for line in f:
                                if not line.strip():
                                    continue
                                try:
                                    record = json.loads(line)
                                    overview = record.get("overview", "").strip()
                                    if overview:
                                        aggregated.append(overview)
                                except json.JSONDecodeError:
                                    continue
                    except Exception as e:
                        print(f"⚠️ Error reading legacy overviews file {p}: {e}")
                    break

        # Limit for performance
        if aggregated:
            print(f"✅ Loaded {len(aggregated)} document overviews from {len(idx_ids)} index(es)")
        else:
            print(f"⚠️ No overviews found for indices {idx_ids}")
        return aggregated[:40]

    def _route_using_overviews(self, query: str, overviews: List[str]) -> bool:
        """
        🎯 Use document overviews and LLM to make intelligent routing decisions.
        
        Returns True if RAG should be used, False for direct LLM.
        """
        if not overviews:
            return False
        
        # Format overviews for the routing prompt
        overviews_block = "\n".join(f"[{i+1}] {ov}" for i, ov in enumerate(overviews))
        
        router_prompt = f"""You are an AI router deciding whether a user question should be answered via:
• "USE_RAG" – search the user's private documents (described below)  
• "DIRECT_LLM" – reply from general knowledge (greetings, public facts, unrelated topics)

CRITICAL PRINCIPLE: When documents exist in the KB, strongly prefer USE_RAG unless the query is purely conversational or completely unrelated to any possible document content.

RULES:
1. If ANY overview clearly relates to the question (entities, numbers, addresses, dates, amounts, companies, technical terms) → USE_RAG
2. For document operations (summarize, analyze, explain, extract, find) → USE_RAG  
3. For greetings only ("Hi", "Hello", "Thanks") → DIRECT_LLM
4. For pure math/world knowledge clearly unrelated to documents → DIRECT_LLM
5. When in doubt → USE_RAG

DOCUMENT OVERVIEWS:
{overviews_block}

DECISION EXAMPLES:
• "What invoice amounts are mentioned?" → USE_RAG (document-specific)
• "Who is PromptX AI LLC?" → USE_RAG (entity in documents)  
• "What is the DeepSeek model?" → USE_RAG (mentioned in documents)
• "Summarize the research paper" → USE_RAG (document operation)
• "What is 2+2?" → DIRECT_LLM (pure math)
• "Hi there" → DIRECT_LLM (greeting only)

USER QUERY: "{query}"

Respond with exactly one word: USE_RAG or DIRECT_LLM"""

        try:
            # Use Ollama to make the routing decision
            response = self.ollama_client.chat(
                message=router_prompt,
                model="qwen3:0.6b",  # Fast model for routing
                enable_thinking=False  # Fast routing
            )
            
            # The response is directly the text, not a dict
            decision = response.strip().upper()
            
            # Parse decision
            if "USE_RAG" in decision:
                print(f"🎯 Overview-based routing: USE_RAG for query: '{query[:50]}...'")
                return True
            elif "DIRECT_LLM" in decision:
                print(f"⚡ Overview-based routing: DIRECT_LLM for query: '{query[:50]}...'")
                return False
            else:
                print(f"⚠️ Unclear routing decision '{decision}', defaulting to RAG")
                return True  # Default to RAG when uncertain
                
        except Exception as e:
            print(f"❌ LLM routing failed: {e}, falling back to pattern matching")
            return self._simple_pattern_routing(query, [])

    def _simple_pattern_routing(self, message: str, idx_ids: List[str]) -> bool:
        """
        📝 FALLBACK: Simple pattern-based routing (original logic).
        """
        message_lower = message.lower()
        
        # Always use Direct LLM for greetings and casual conversation
        greeting_patterns = [
            'hello', 'hi', 'hey', 'greetings', 'good morning', 'good afternoon', 'good evening',
            'how are you', 'how do you do', 'nice to meet', 'pleasure to meet',
            'thanks', 'thank you', 'bye', 'goodbye', 'see you', 'talk to you later',
            'test', 'testing', 'check', 'ping', 'just saying', 'nevermind',
            'ok', 'okay', 'alright', 'got it', 'understood', 'i see'
        ]
        
        # Check for greeting patterns
        for pattern in greeting_patterns:
            if pattern in message_lower:
                return False  # Use Direct LLM for greetings
        
        # Keywords that strongly suggest document-related queries
        rag_indicators = [
            'document', 'doc', 'file', 'pdf', 'text', 'content', 'page',
            'according to', 'based on', 'mentioned', 'states', 'says',
            'what does', 'summarize', 'summary', 'analyze', 'analysis',
            'quote', 'citation', 'reference', 'source', 'evidence',
            'explain from', 'extract', 'find in', 'search for'
        ]
        
        # Check for strong RAG indicators
        for indicator in rag_indicators:
            if indicator in message_lower:
                return True
        
        # Question words + substantial length might benefit from RAG
        question_words = ['what', 'how', 'when', 'where', 'why', 'who', 'which']
        starts_with_question = any(message_lower.startswith(word) for word in question_words)
        
        if starts_with_question and len(message) > 40:
            return True
        
        # Very short messages - use direct LLM
        if len(message.strip()) < 20:
            return False
        
        # Default to Direct LLM unless there's clear indication of document query
        return False
    
    def _handle_direct_llm_query(self, session_id: str, message: str, session: dict):
        """
        Handle query using direct Ollama client with thinking disabled for speed.
        
        Returns:
            tuple: (response_text, empty_source_docs)
        """
        try:
            # Get conversation history for context
            conversation_history = db.get_conversation_history(session_id)
            
            # Use the session's model or default
            model = session.get('model', 'qwen3:8b')  # Default to fast model
            
            # Direct Ollama call with thinking disabled for speed
            response_text = self.ollama_client.chat(
                message=message,
                model=model,
                conversation_history=conversation_history,
                enable_thinking=False  # ⚡ DISABLE THINKING FOR SPEED
            )
            
            return response_text, []  # No source docs for direct LLM
            
        except Exception as e:
            print(f"❌ Direct LLM error: {e}")
            return f"Error processing query: {str(e)}", []
    
    def _handle_rag_query(self, session_id: str, message: str, data: dict, idx_ids: List[str]):
        """
        Handle query using the full RAG pipeline (delegates to the advanced RAG API running on port 8001).

        Returns:
            tuple[str, List[dict]]: (response_text, source_documents)
        """
        # Defaults
        response_text = ""
        source_docs: List[dict] = []

        # Build payload for RAG API
        rag_api_url = "http://localhost:8001/chat"
        table_name = f"text_pages_{idx_ids[-1]}" if idx_ids else None
        payload: Dict[str, Any] = {
            "query": message,
            "session_id": session_id,
        }
        if table_name:
            payload["table_name"] = table_name

        # Copy optional parameters from the incoming request
        optional_params: Dict[str, tuple[type, str]] = {
            "compose_sub_answers": (bool, "compose_sub_answers"),
            "query_decompose": (bool, "query_decompose"),
            "ai_rerank": (bool, "ai_rerank"),
            "context_expand": (bool, "context_expand"),
            "verify": (bool, "verify"),
            "retrieval_k": (int, "retrieval_k"),
            "context_window_size": (int, "context_window_size"),
            "reranker_top_k": (int, "reranker_top_k"),
            "search_type": (str, "search_type"),
            "dense_weight": (float, "dense_weight"),
            "provence_prune": (bool, "provence_prune"),
            "provence_threshold": (float, "provence_threshold"),
        }
        for key, (caster, payload_key) in optional_params.items():
            val = data.get(key)
            if val is not None:
                try:
                    payload[payload_key] = caster(val)  # type: ignore[arg-type]
                except Exception:
                    payload[payload_key] = val

        try:
            rag_response = requests.post(rag_api_url, json=payload)
            if rag_response.status_code == 200:
                rag_data = rag_response.json()
                response_text = rag_data.get("answer", "No answer found.")
                source_docs = rag_data.get("source_documents", [])
            else:
                response_text = f"Error from RAG API ({rag_response.status_code}): {rag_response.text}"
                print(f"❌ RAG API error: {response_text}")
        except requests.exceptions.ConnectionError:
            response_text = "Could not connect to the RAG API server. Please ensure it is running."
            print("❌ Connection to RAG API failed (port 8001).")
        except Exception as e:
            response_text = f"Error processing RAG query: {str(e)}"
            print(f"❌ RAG processing error: {e}")

        # Strip any <think>/<thinking> tags that might slip through
        response_text = re.sub(r'<(think|thinking)>.*?</\\1>', '', response_text, flags=re.DOTALL | re.IGNORECASE).strip()

        return response_text, source_docs

    def handle_delete_session(self, session_id: str):
        """
        Delete a session and its messages.

        Responds with {'deleted': <db result>} on success, 404 when the database
        reports nothing was deleted, and 500 with the error text on any exception.
        """
        try:
            outcome = db.delete_session(session_id)
            if not outcome:
                self.send_json_response({'error': 'Session not found'}, status_code=404)
                return
            self.send_json_response({'deleted': outcome})
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)
    
    def handle_file_upload(self, session_id: str):
        """
        Handle file uploads, save them, and associate with the session.

        Expects a multipart form whose ``files`` field holds one or more files.
        Each file is written to ``shared_uploads/<uuid>_<name>`` and its absolute
        path is registered for the session.

        The client-supplied filename is reduced to its final path component before
        it is used, so names containing directory parts (``../x``, ``a\\b``) can
        neither escape the upload directory nor point at a missing sub-directory.

        Responses:
            200: summary of the stored files.
            400: no file was uploaded.
            500: the upload could not be processed.
        """
        uploaded_files = []
        try:
            # NOTE(review): the `cgi` module is deprecated (PEP 594) and removed in Python 3.13.
            form = cgi.FieldStorage(
                fp=self.rfile,
                headers=self.headers,
                environ={'REQUEST_METHOD': 'POST', 'CONTENT_TYPE': self.headers['Content-Type']}
            )

            if 'files' in form:
                files = form['files']
                if not isinstance(files, list):
                    files = [files]

                upload_dir = "shared_uploads"
                os.makedirs(upload_dir, exist_ok=True)

                for file_item in files:
                    if not file_item.filename:
                        continue

                    # The filename is client-controlled: keep only the last path
                    # component, treating both "/" and "\" as separators.
                    safe_name = os.path.basename(file_item.filename.replace("\\", "/"))
                    if not safe_name:
                        continue

                    # Create a unique filename to avoid overwrites
                    unique_filename = f"{uuid.uuid4()}_{safe_name}"
                    file_path = os.path.join(upload_dir, unique_filename)

                    with open(file_path, 'wb') as f:
                        f.write(file_item.file.read())

                    # Store the absolute path for the indexing service
                    absolute_file_path = os.path.abspath(file_path)
                    db.add_document_to_session(session_id, absolute_file_path)
                    uploaded_files.append({"filename": safe_name, "stored_path": absolute_file_path})
        except Exception as e:
            # Without this the client would get no response at all on failure
            print(f"❌ File upload failed: {e}")
            self.send_json_response({"error": f"File upload failed: {str(e)}"}, status_code=500)
            return

        if not uploaded_files:
            self.send_json_response({"error": "No files were uploaded"}, status_code=400)
            return

        self.send_json_response({
            "message": f"Successfully uploaded {len(uploaded_files)} files.",
            "uploaded_files": uploaded_files
        })

    def handle_index_documents(self, session_id: str):
        """
        Triggers indexing for all documents in a session.

        The session's document paths are posted to the RAG API's /index endpoint.
        On success the index metadata is updated (best effort) and the RAG API's
        JSON reply is relayed; otherwise a 500 with the error text is returned.
        A session without documents gets a 200 with an informational message.
        """
        print(f"🔥 Received request to index documents for session {session_id[:8]}...")
        try:
            session_docs = db.get_documents_for_session(session_id)
            if not session_docs:
                self.send_json_response({"message": "No documents to index for this session."}, status_code=200)
                return

            print(f"Found {len(session_docs)} documents to index. Sending to RAG API...")

            index_request = {"file_paths": session_docs, "session_id": session_id}
            rag_response = requests.post("http://localhost:8001/index", json=index_request)

            if rag_response.status_code != 200:
                error_info = rag_response.text
                print(f"❌ RAG API indexing failed ({rag_response.status_code}): {error_info}")
                self.send_json_response({"error": f"Indexing failed: {error_info}"}, status_code=500)
                return

            print("✅ RAG API successfully indexed documents.")
            try:
                # session_id doubles as the index_id here; a metadata failure must not fail the request
                db.update_index_metadata(session_id, {
                    "session_linked": True,
                    "retrieval_mode": "hybrid",
                })
            except Exception as e:
                print(f"⚠️ Failed to update index metadata for session index: {e}")
            self.send_json_response(rag_response.json())

        except Exception as e:
            print(f"❌ Exception during indexing: {str(e)}")
            self.send_json_response({"error": f"An unexpected error occurred: {str(e)}"}, status_code=500)
            
    def handle_pdf_upload(self, session_id: str):
        """
        DEPRECATED legacy PDF upload endpoint; use handle_file_upload instead.

        Kept only so old clients get a clear answer: nothing is processed and the
        request is answered with HTTP 410 (Gone) plus a deprecation notice.
        """
        deprecation_notice = {
            "warning": "This upload method is deprecated. Use the new file upload and indexing flow.",
            "message": "No action taken."
        }
        self.send_json_response(deprecation_notice, status_code=410)

    def handle_get_models(self):
        """
        List available models from Ollama and HuggingFace, grouped by capability.

        Ollama models are only queried when Ollama is running. A model whose name
        contains 'embed', 'bge', 'embedding' or 'text' is classified as an embedding
        model, everything else as a generation model. A fixed set of HuggingFace
        embedding models is always included. Both lists are returned sorted; any
        failure yields a 500.
        """
        try:
            generation_models = []
            embedding_models = []

            if self.ollama_client.is_ollama_running():
                # Very naive classification by substring of the model name
                embedding_markers = ['embed', 'bge', 'embedding', 'text']
                for model_name in self.ollama_client.list_models():
                    looks_like_embedder = any(marker in model_name for marker in embedding_markers)
                    target = embedding_models if looks_like_embedder else generation_models
                    target.append(model_name)

            # Supported HuggingFace embedding models are offered regardless of Ollama
            embedding_models += [
                "Qwen/Qwen3-Embedding-0.6B",
                "Qwen/Qwen3-Embedding-4B",
                "Qwen/Qwen3-Embedding-8B"
            ]

            # Sorted output keeps the ordering stable between calls
            self.send_json_response({
                "generation_models": sorted(generation_models),
                "embedding_models": sorted(embedding_models)
            })
        except Exception as e:
            self.send_json_response({
                "error": f"Could not list models: {str(e)}"
            }, status_code=500)

    def handle_get_indexes(self):
        """Return every index known to the database together with the total count (500 on error)."""
        try:
            all_indexes = db.list_indexes()
            listing = {'indexes': all_indexes, 'total': len(all_indexes)}
            self.send_json_response(listing)
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)
    
    def handle_get_index(self, index_id: str):
        """Return a single index record, 404 if the database has none for this ID, 500 on error."""
        try:
            record = db.get_index(index_id)
            if record:
                self.send_json_response(record)
            else:
                self.send_json_response({'error': 'Index not found'}, status_code=404)
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)
    
    def handle_create_index(self):
        """
        Create a new index from a JSON request body.

        Body fields:
            name (required), description (optional), metadata (optional object).

        When the RAG system is available and has a 'default' pipeline config, the
        stored metadata starts from a set of default values and is then overridden
        by any metadata the client supplied.

        Responses:
            201: {'index_id': <new id>}
            400: body is not valid JSON, is not a JSON object, or has no name.
            500: any other failure.
        """
        try:
            content_length = int(self.headers.get('Content-Length', 0))
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))
            if not isinstance(data, dict):
                self.send_json_response({'error': 'Request body must be a JSON object'}, status_code=400)
                return

            name = data.get('name')
            description = data.get('description')
            # An explicit null is treated like a missing metadata field
            metadata = data.get('metadata') or {}

            if not name:
                self.send_json_response({'error': 'Name required'}, status_code=400)
                return

            # Add complete metadata from RAG system configuration if available
            if RAG_SYSTEM_AVAILABLE and PIPELINE_CONFIGS.get('default'):
                complete_metadata = {
                    'status': 'created',
                    'metadata_source': 'rag_system_config',
                    'created_at': datetime.now().isoformat(),
                    'chunk_size': 512,  # From default config
                    'chunk_overlap': 64,  # From default config
                    'retrieval_mode': 'hybrid',  # From default config
                    'window_size': 5,  # From default config
                    'embedding_model': 'Qwen/Qwen3-Embedding-0.6B',  # From default config
                    'enrich_model': 'qwen3:0.6b',  # From default config
                    'overview_model': 'qwen3:0.6b',  # From default config
                    'enable_enrich': True,  # From default config
                    'latechunk': True,  # From default config
                    'docling_chunk': True,  # From default config
                    'note': 'Default configuration from RAG system'
                }
                # Client-provided metadata wins over the defaults
                complete_metadata.update(metadata)
                metadata = complete_metadata

            idx_id = db.create_index(name, description, metadata)
            self.send_json_response({'index_id': idx_id}, status_code=201)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # A malformed body is a client error, as in handle_rename_session
            self.send_json_response({'error': 'Invalid JSON'}, status_code=400)
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)
    
    def handle_index_file_upload(self, index_id: str):
        """
        Store uploaded files and attach them to an index.

        Expects a multipart form whose ``files`` field holds one or more files.
        Each file is written to ``shared_uploads/<uuid>_<name>`` and registered
        for the index with its name and absolute path.

        The client-supplied filename is reduced to its final path component before
        it is used, so names containing directory parts (``../x``, ``a\\b``) can
        neither escape the upload directory nor point at a missing sub-directory.

        Responses:
            200: summary of the stored files.
            400: no file was uploaded.
            500: the upload could not be processed.
        """
        uploaded_files = []
        try:
            # NOTE(review): the `cgi` module is deprecated (PEP 594) and removed in Python 3.13.
            form = cgi.FieldStorage(
                fp=self.rfile,
                headers=self.headers,
                environ={'REQUEST_METHOD': 'POST', 'CONTENT_TYPE': self.headers['Content-Type']}
            )
            if 'files' in form:
                files = form['files']
                if not isinstance(files, list):
                    files = [files]
                upload_dir = 'shared_uploads'
                os.makedirs(upload_dir, exist_ok=True)
                for f in files:
                    if not f.filename:
                        continue
                    # The filename is client-controlled: keep only the last path
                    # component, treating both "/" and "\" as separators.
                    safe_name = os.path.basename(f.filename.replace("\\", "/"))
                    if not safe_name:
                        continue
                    # The UUID prefix avoids overwriting files that share a name
                    stored_path = os.path.abspath(os.path.join(upload_dir, f"{uuid.uuid4()}_{safe_name}"))
                    with open(stored_path, 'wb') as out:
                        out.write(f.file.read())
                    db.add_document_to_index(index_id, safe_name, stored_path)
                    uploaded_files.append({'filename': safe_name, 'stored_path': stored_path})
        except Exception as e:
            # Without this the client would get no response at all on failure
            print(f"❌ Index file upload failed: {e}")
            self.send_json_response({'error': f"File upload failed: {str(e)}"}, status_code=500)
            return

        if not uploaded_files:
            self.send_json_response({'error': 'No files uploaded'}, status_code=400)
            return
        self.send_json_response({'message': f"Uploaded {len(uploaded_files)} files", "uploaded_files": uploaded_files})
    
    def handle_build_index(self, index_id: str):
        """
        Build an index by sending its documents to the RAG API's /index endpoint.

        The optional JSON request body may carry build options using camelCase
        keys (``latechunk``, ``doclingChunk``, ``chunkSize``, ``chunkOverlap``,
        ``retrievalMode``, ``windowSize``, ``enableEnrich``, ``embeddingModel``,
        ``enrichModel``, ``batchSizeEmbed``, ``batchSizeEnrich``, ``overviewModel``).
        If the body cannot be parsed or any option has an invalid value, a warning
        is logged and every option falls back to its default, so a build never
        runs with a half-applied option set.

        Responses:
            200: the RAG API result plus the settings used, or a notice that the
                 index had already been built.
            400: the index has no documents.
            404: unknown index.
            500: RAG API failure or unexpected error.
        """
        try:
            index = db.get_index(index_id)
            if not index:
                self.send_json_response({'error': 'Index not found'}, status_code=404)
                return
            file_paths = [d['stored_path'] for d in index.get('documents', [])]
            if not file_paths:
                self.send_json_response({'error': 'No documents to index'}, status_code=400)
                return

            # Defaults used when no (valid) options are supplied
            settings = {
                'latechunk': False,
                'docling_chunk': False,
                'chunk_size': 512,
                'chunk_overlap': 64,
                'retrieval_mode': 'hybrid',
                'window_size': 2,
                'enable_enrich': True,
                'embedding_model': None,
                'enrich_model': None,
                'batch_size_embed': 50,
                'batch_size_enrich': 25,
                'overview_model': None,
            }

            if 'Content-Length' in self.headers and int(self.headers['Content-Length']) > 0:
                try:
                    length = int(self.headers['Content-Length'])
                    body = self.rfile.read(length)
                    opts = json.loads(body.decode('utf-8'))
                    # Assigned in one step: either every option parses or none is applied
                    settings = {
                        'latechunk': bool(opts.get('latechunk', False)),
                        'docling_chunk': bool(opts.get('doclingChunk', False)),
                        'chunk_size': int(opts.get('chunkSize', 512)),
                        'chunk_overlap': int(opts.get('chunkOverlap', 64)),
                        'retrieval_mode': str(opts.get('retrievalMode', 'hybrid')),
                        'window_size': int(opts.get('windowSize', 2)),
                        'enable_enrich': bool(opts.get('enableEnrich', True)),
                        'embedding_model': opts.get('embeddingModel'),
                        'enrich_model': opts.get('enrichModel'),
                        'batch_size_embed': int(opts.get('batchSizeEmbed', 50)),
                        'batch_size_enrich': int(opts.get('batchSizeEnrich', 25)),
                        'overview_model': opts.get('overviewModel'),
                    }
                except Exception as e:
                    print(f"⚠️ Ignoring invalid build options for index {index_id}, using defaults: {e}")

            # Delegate to advanced RAG API same as session indexing
            rag_api_url = "http://localhost:8001/index"
            payload = {
                "file_paths": file_paths,
                "session_id": index_id,  # reuse index_id for progress tracking
                # Use the index's dedicated LanceDB table so retrieval matches
                "table_name": index.get("vector_table_name"),
                "chunk_size": settings['chunk_size'],
                "chunk_overlap": settings['chunk_overlap'],
                "retrieval_mode": settings['retrieval_mode'],
                "window_size": settings['window_size'],
                "enable_enrich": settings['enable_enrich'],
                "batch_size_embed": settings['batch_size_embed'],
                "batch_size_enrich": settings['batch_size_enrich']
            }
            # Optional flags and models are only sent when set
            if settings['latechunk']:
                payload["enable_latechunk"] = True
            if settings['docling_chunk']:
                payload["enable_docling_chunk"] = True
            if settings['embedding_model']:
                payload["embedding_model"] = settings['embedding_model']
            if settings['enrich_model']:
                payload["enrich_model"] = settings['enrich_model']
            if settings['overview_model']:
                payload["overview_model_name"] = settings['overview_model']

            rag_resp = requests.post(rag_api_url, json=payload)
            if rag_resp.status_code == 200:
                meta_updates = {
                    "chunk_size": settings['chunk_size'],
                    "chunk_overlap": settings['chunk_overlap'],
                    "retrieval_mode": settings['retrieval_mode'],
                    "window_size": settings['window_size'],
                    "enable_enrich": settings['enable_enrich'],
                    "latechunk": settings['latechunk'],
                    "docling_chunk": settings['docling_chunk'],
                }
                for model_key in ("embedding_model", "enrich_model", "overview_model"):
                    if settings[model_key]:
                        meta_updates[model_key] = settings[model_key]
                try:
                    db.update_index_metadata(index_id, meta_updates)
                except Exception as e:
                    # Metadata is informational; the build itself succeeded
                    print(f"⚠️ Failed to update index metadata: {e}")

                self.send_json_response({
                    "response": rag_resp.json(),
                    **meta_updates
                })
            else:
                # Gracefully handle scenario where table already exists (idempotent build)
                try:
                    err_json = rag_resp.json()
                except Exception:
                    err_json = {}
                err_text = err_json.get('error') if isinstance(err_json, dict) else rag_resp.text
                if isinstance(err_text, str) and 'already exists' in err_text:
                    # Treat as non-fatal; return message indicating index previously built
                    self.send_json_response({
                        "message": "Index already built – skipping rebuild.",
                        "note": err_text
                    })
                else:
                    self.send_json_response({"error": f"RAG indexing failed: {rag_resp.text}"}, status_code=500)
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)
    
    def handle_link_index_to_session(self, session_id: str, index_id: str):
        """Link an existing index to a session; answers 500 with the error text if the database call fails."""
        try:
            db.link_index_to_session(session_id, index_id)
            confirmation = {'message': 'Index linked to session'}
            self.send_json_response(confirmation)
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)

    def handle_get_session_indexes(self, session_id: str):
        """
        Return the index records linked to a session, with a total count.

        Index IDs that no longer resolve to a record are skipped. For records with
        empty metadata the database is asked to infer and store metadata; when
        that reports a result, the record is re-read before being returned.
        """
        try:
            resolved = []
            for linked_id in db.get_indexes_for_session(session_id):
                record = db.get_index(linked_id)
                if not record:
                    continue

                has_metadata = record.get('metadata') and len(record['metadata']) != 0
                if not has_metadata:
                    # Older indexes may have been stored without metadata
                    print(f"🔍 Attempting to infer metadata for index {linked_id[:8]}...")
                    if db.inspect_and_populate_index_metadata(linked_id):
                        # Pick up the freshly stored metadata
                        record = db.get_index(linked_id)

                resolved.append(record)

            self.send_json_response({'indexes': resolved, 'total': len(resolved)})
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)

    def handle_delete_index(self, index_id: str):
        """
        Delete an index through the database layer.

        Answers with a confirmation carrying the index ID, 404 when the database
        reports nothing was deleted, or 500 with the error text on failure.
        """
        try:
            if not db.delete_index(index_id):
                self.send_json_response({'error': 'Index not found'}, status_code=404)
                return
            self.send_json_response({'message': 'Index deleted successfully', 'index_id': index_id})
        except Exception as e:
            self.send_json_response({'error': str(e)}, status_code=500)

    def handle_rename_session(self, session_id: str):
        """
        Rename an existing session.

        Reads a JSON body with a 'title' field, trims it and stores it as the new
        session title.

        Responses:
            200: confirmation plus the updated session record.
            400: missing body, invalid JSON, or an empty title.
            404: unknown session.
            500: any other failure.
        """
        try:
            if not db.get_session(session_id):
                self.send_json_response({"error": "Session not found"}, status_code=404)
                return

            body_length = int(self.headers.get('Content-Length', 0))
            if body_length == 0:
                self.send_json_response({"error": "Request body required"}, status_code=400)
                return

            raw_body = self.rfile.read(body_length)
            payload = json.loads(raw_body.decode('utf-8'))
            new_title: str = payload.get('title', '').strip()
            if not new_title:
                self.send_json_response({"error": "Title cannot be empty"}, status_code=400)
                return

            db.update_session_title(session_id, new_title)
            # Re-read so the response reflects what is actually stored
            self.send_json_response({
                "message": "Session renamed successfully",
                "session": db.get_session(session_id)
            })

        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Failed to rename session: {str(e)}"}, status_code=500)

    def send_json_response(self, data, status_code: int = 200):
        """
        Send a JSON (UTF-8) response with CORS headers. Safe against client disconnects.

        The payload is serialized before the status line is written. If ``data``
        cannot be serialized, the client receives a 500 with a JSON error body
        instead of the requested status followed by an empty body.

        Args:
            data: JSON-serializable payload.
            status_code: HTTP status to send (default 200).

        This method never raises; send failures are only logged.
        """
        try:
            try:
                response_bytes = json.dumps(data, indent=2).encode('utf-8')
            except (TypeError, ValueError) as e:
                # TypeError: unserializable object; ValueError: circular reference
                print(f"❌ Error serializing response: {e}")
                status_code = 500
                response_bytes = json.dumps({"error": "Failed to serialize response"}, indent=2).encode('utf-8')

            self.send_response(status_code)
            self.send_header('Content-Type', 'application/json')
            self.send_header('Content-Length', str(len(response_bytes)))
            self.send_header('Access-Control-Allow-Origin', '*')
            self.send_header('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS')
            self.send_header('Access-Control-Allow-Headers', 'Content-Type, Authorization')
            self.send_header('Access-Control-Allow-Credentials', 'true')
            self.end_headers()

            self.wfile.write(response_bytes)
        except BrokenPipeError:
            # Client disconnected before we could finish sending
            print("⚠️  Client disconnected during response – ignoring.")
        except Exception as e:
            print(f"❌ Error sending response: {e}")
    
    def log_message(self, format, *args):
        """Print a log line as "[<timestamp>] <message>", where the message is `format % args`."""
        timestamp = self.date_time_string()
        rendered = format % args
        print(f"[{timestamp}] {rendered}")

def main():
    """Initialize backend services and run the HTTP server until interrupted.

    Steps, in order:
      1. Initialize the simple PDF processor. Failure is logged and the
         server keeps going without it.
      2. Publish the processor as the module-level ``pdf_processor``.
      3. Remove empty chat sessions via ``db.cleanup_empty_sessions()``.
      4. Bind ``ReusableTCPServer`` on ``PORT`` with ``ChatHandler``, report
         whether Ollama is reachable, and serve forever.

    A ``KeyboardInterrupt`` (Ctrl+C) stops the server with a short message;
    any other exception propagates to the caller.
    """
    global pdf_processor

    PORT = 8000  # 🆕 Define port
    try:
        # Nothing is initialized here; this only reports status.
        # NOTE(review): presumably `db` is created at import time - confirm.
        print("✅ Database initialized successfully")

        # Initialize the PDF processor. The progress message is printed
        # *before* the work starts so the log reads in chronological order.
        try:
            print("📄 Initializing simple PDF processing...")
            pdf_module.initialize_simple_pdf_processor()
            if pdf_module.simple_pdf_processor:
                print("✅ Simple PDF processor initialized")
            else:
                print("⚠️ PDF processing could not be initialized.")
        except Exception as e:
            print(f"❌ Error initializing PDF processor: {e}")
            print("⚠️ PDF processing disabled - server will run without RAG functionality")

        # Set a global reference to the initialized processor if needed elsewhere
        pdf_processor = pdf_module.simple_pdf_processor
        if pdf_processor:
            print("✅ Global PDF processor initialized")
        else:
            print("⚠️ PDF processing disabled - server will run without RAG functionality")

        # Cleanup empty sessions on startup
        print("🧹 Cleaning up empty sessions...")
        cleanup_count = db.cleanup_empty_sessions()
        if cleanup_count > 0:
            print(f"✨ Cleaned up {cleanup_count} empty sessions")
        else:
            print("✨ No empty sessions to clean up")

        # Start the server
        with ReusableTCPServer(("", PORT), ChatHandler) as httpd:
            print(f"🚀 Starting localGPT backend server on port {PORT}")
            print(f"📍 Chat endpoint: http://localhost:{PORT}/chat")
            print(f"🔍 Health check: http://localhost:{PORT}/health")

            # Test Ollama connection (informational only; the server starts either way)
            client = OllamaClient()
            if client.is_ollama_running():
                models = client.list_models()
                print(f"✅ Ollama is running with {len(models)} models")
                # Show at most three model names to keep the banner short.
                print(f"📋 Available models: {', '.join(models[:3])}{'...' if len(models) > 3 else ''}")
            else:
                print("⚠️  Ollama is not running. Please start Ollama:")
                print("   Install: https://ollama.ai")
                print("   Run: ollama serve")

            print(f"\n🌐 Frontend should connect to: http://localhost:{PORT}")
            print("💬 Ready to chat!\n")

            httpd.serve_forever()
    except KeyboardInterrupt:
        print("\n🛑 Server stopped")

# Start the backend server only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main() 

================================================
FILE: backend/simple_pdf_processor.py
================================================
"""
Simple PDF Processing Service
Handles PDF upload and text extraction for RAG functionality
"""

import uuid
from typing import List, Dict, Any
import PyPDF2
from io import BytesIO
import sqlite3
from datetime import datetime

class SimplePDFProcessor:
    """Extract text from uploaded PDFs and keep it per session in SQLite.

    Each public method opens its own short-lived SQLite connection to
    ``db_path`` and closes it again, including when a statement fails.
    Database errors in the query/store methods are logged and reported
    through the return value rather than raised.
    """

    def __init__(self, db_path: str = "chat_data.db"):
        """Initialize simple PDF processor with SQLite storage.

        Args:
            db_path: Path of the SQLite database file to use.
        """
        self.db_path = db_path
        self.init_database()
        print("✅ Simple PDF processor initialized")

    def init_database(self):
        """Create the ``pdf_documents`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS pdf_documents (
                    id TEXT PRIMARY KEY,
                    session_id TEXT NOT NULL,
                    filename TEXT NOT NULL,
                    content TEXT NOT NULL,
                    created_at TEXT NOT NULL
                )
            ''')
            conn.commit()
        finally:
            # Always release the connection, even if table creation fails.
            conn.close()

    def extract_text_from_pdf(self, pdf_bytes: bytes) -> str:
        """Extract text from PDF bytes.

        Pages are processed independently: a page that fails to extract is
        logged and skipped. Pages with only whitespace are omitted. Each
        kept page is prefixed with a ``--- Page N ---`` marker.

        Args:
            pdf_bytes: Raw bytes of the PDF file.

        Returns:
            The combined, stripped text, or ``""`` if the PDF could not be
            read at all.
        """
        try:
            print(f"📄 Starting PDF text extraction ({len(pdf_bytes)} bytes)")
            pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))

            print(f"📖 PDF has {len(pdf_reader.pages)} pages")

            parts = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                print(f"📄 Processing page {page_num}")
                try:
                    page_text = page.extract_text()
                    if page_text.strip():
                        parts.append(f"\n--- Page {page_num} ---\n{page_text}\n")
                    print(f"✅ Page {page_num}: extracted {len(page_text)} characters")
                except Exception as page_error:
                    print(f"❌ Error on page {page_num}: {str(page_error)}")
                    continue

            text = "".join(parts)
            print(f"📄 Total extracted text: {len(text)} characters")
            return text.strip()

        except Exception as e:
            print(f"❌ Error extracting text from PDF: {str(e)}")
            print(f"❌ Error type: {type(e).__name__}")
            return ""

    def process_pdf(self, pdf_bytes: bytes, filename: str, session_id: str) -> Dict[str, Any]:
        """Process a PDF file and store its text in the database.

        Args:
            pdf_bytes: Raw bytes of the PDF file.
            filename: Name to store alongside the extracted text.
            session_id: Session the document belongs to.

        Returns:
            On success: ``{"success": True, "filename", "file_id", "text_length"}``.
            On failure: ``{"success": False, "error", "filename"}``.
        """
        print(f"📄 Processing PDF: {filename}")

        # Extract text
        text = self.extract_text_from_pdf(pdf_bytes)
        if not text:
            return {
                "success": False,
                "error": "Could not extract text from PDF",
                "filename": filename
            }

        print(f"📝 Extracted {len(text)} characters from {filename}")

        # Store in database
        document_id = str(uuid.uuid4())
        now = datetime.now().isoformat()

        try:
            conn = sqlite3.connect(self.db_path)
            try:
                conn.execute('''
                    INSERT INTO pdf_documents (id, session_id, filename, content, created_at)
                    VALUES (?, ?, ?, ?, ?)
                ''', (document_id, session_id, filename, text, now))
                conn.commit()
            finally:
                # Close on failure too, so a failed insert does not leak the handle.
                conn.close()

            print(f"💾 Stored document {filename} in database")

            return {
                "success": True,
                "filename": filename,
                "file_id": document_id,
                "text_length": len(text)
            }

        except Exception as e:
            print(f"❌ Error storing in database: {str(e)}")
            return {
                "success": False,
                "error": f"Database storage failed: {str(e)}",
                "filename": filename
            }

    def get_session_documents(self, session_id: str) -> List[Dict[str, Any]]:
        """Get all documents for a session, newest first.

        Returns:
            A list of dicts with ``id``, ``filename`` and ``created_at``;
            an empty list when there are none or the query fails.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                conn.row_factory = sqlite3.Row

                cursor = conn.execute('''
                    SELECT id, filename, created_at
                    FROM pdf_documents
                    WHERE session_id = ?
                    ORDER BY created_at DESC
                ''', (session_id,))

                return [dict(row) for row in cursor.fetchall()]
            finally:
                conn.close()

        except Exception as e:
            print(f"❌ Error getting session documents: {str(e)}")
            return []

    def get_document_content(self, session_id: str) -> str:
        """Get all document content for a session (for LLM context).

        Documents are concatenated oldest first, each preceded by a
        ``=== Document: <filename> ===`` header.

        Returns:
            The combined, stripped text, or ``""`` when the session has no
            documents or the query fails.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.execute('''
                    SELECT filename, content
                    FROM pdf_documents
                    WHERE session_id = ?
                    ORDER BY created_at ASC
                ''', (session_id,))

                rows = cursor.fetchall()
            finally:
                conn.close()

            if not rows:
                return ""

            # Combine all document content
            combined_content = "".join(
                f"\n\n=== Document: {filename} ===\n\n{content}"
                for filename, content in rows
            )

            return combined_content.strip()

        except Exception as e:
            print(f"❌ Error getting document content: {str(e)}")
            return ""

    def delete_session_documents(self, session_id: str) -> bool:
        """Delete all documents for a session.

        Returns:
            ``True`` if at least one row was deleted, ``False`` if nothing
            matched or the delete failed.
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.execute('''
                    DELETE FROM pdf_documents
                    WHERE session_id = ?
                ''', (session_id,))

                deleted_count = cursor.rowcount
                conn.commit()
            finally:
                conn.close()

            if deleted_count > 0:
                print(f"🗑️ Deleted {deleted_count} documents for session {session_id[:8]}...")

            return deleted_count > 0

        except Exception as e:
            print(f"❌ Error deleting session documents: {str(e)}")
            return False


# Global instance
simple_pdf_processor = None

def initialize_simple_pdf_processor():
    """Build the module-wide ``SimplePDFProcessor`` instance.

    On any construction error the failure is printed and the global is
    reset to ``None``; nothing is raised.
    """
    global simple_pdf_processor
    try:
        processor = SimplePDFProcessor()
        simple_pdf_processor = processor
        print("✅ Global PDF processor initialized")
    except Exception as init_error:
        print(f"❌ Failed to initialize PDF processor: {init_error!s}")
        simple_pdf_processor = None

def get_simple_pdf_processor():
    """Return the shared PDF processor, creating it on first use.

    The result is still ``None`` when initialization fails.
    """
    global simple_pdf_processor
    if simple_pdf_processor is not None:
        return simple_pdf_processor
    # Not built yet (or a previous attempt failed): try to create it now.
    initialize_simple_pdf_processor()
    return simple_pdf_processor

if __name__ == "__main__":
    # Smoke test when run as a script: constructing the processor creates the
    # pdf_documents table (if missing) in the default "chat_data.db" file.
    print("🧪 Testing simple PDF processor...")
    
    processor = SimplePDFProcessor()
    print("✅ Simple PDF processor test completed!") 

================================================
FILE: backend/test_backend.py
================================================
#!/usr/bin/env python3
"""
Simple test script for the localGPT backend
"""

import requests

def test_health_endpoint():
    """Call GET /health on the local backend and report the outcome.

    Returns:
        True when the endpoint answers 200, False on any other status or
        when the request itself fails.
    """
    print("🔍 Testing health endpoint...")
    try:
        reply = requests.get("http://localhost:8000/health", timeout=5)
        if reply.status_code != 200:
            print(f"❌ Health check failed: {reply.status_code}")
            return False

        payload = reply.json()
        print("✅ Health check passed")
        print(f"   Ollama running: {payload['ollama_running']}")
        print(f"   Models available: {len(payload['available_models'])}")
        return True
    except requests.exceptions.RequestException as request_error:
        print(f"❌ Health check failed: {request_error}")
        return False

def test_chat_endpoint():
    """Send one fixed prompt to POST /chat and report the outcome.

    Returns:
        True when the endpoint answers 200, False on any other status or
        when the request itself fails.
    """
    print("\n💬 Testing chat endpoint...")

    payload = {
        "message": "Say 'Hello World' and nothing else.",
        "model": "llama3.2:latest"
    }

    try:
        reply = requests.post(
            "http://localhost:8000/chat",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=30
        )

        if reply.status_code != 200:
            print(f"❌ Chat test failed: {reply.status_code}")
            print(f"   Response: {reply.text}")
            return False

        body = reply.json()
        print("✅ Chat test passed")
        print(f"   Model: {body['model']}")
        print(f"   Response: {body['response']}")
        print(f"   Message count: {body['message_count']}")
        return True

    except requests.exceptions.RequestException as request_error:
        print(f"❌ Chat test failed: {request_error}")
        return False

def test_conversation_history():
    """Run a two-turn chat and check that the history is accepted.

    The first turn states a name; the second asks for it while passing the
    first exchange as ``conversation_history``. Whether the model actually
    recalls the name is only reported, it does not affect the result.

    Returns:
        True when both requests answer 200, False otherwise.
    """
    print("\n🗨️  Testing conversation history...")

    chat_url = "http://localhost:8000/chat"
    history = []
    opening_line = "My name is Alice. Remember this."

    first_payload = {
        "message": opening_line,
        "model": "llama3.2:latest",
        "conversation_history": history
    }

    try:
        first_reply = requests.post(
            chat_url,
            headers={"Content-Type": "application/json"},
            json=first_payload,
            timeout=30
        )
        if first_reply.status_code != 200:
            print(f"❌ First message failed: {first_reply.status_code}")
            return False

        first_data = first_reply.json()

        # Record the first exchange so the second request carries it.
        history.append({"role": "user", "content": opening_line})
        history.append({"role": "assistant", "content": first_data["response"]})

        second_payload = {
            "message": "What is my name?",
            "model": "llama3.2:latest",
            "conversation_history": history
        }
        second_reply = requests.post(
            chat_url,
            headers={"Content-Type": "application/json"},
            json=second_payload,
            timeout=30
        )
        if second_reply.status_code != 200:
            print(f"❌ Second message failed: {second_reply.status_code}")
            return False

        second_data = second_reply.json()
        print("✅ Conversation history test passed")
        print(f"   First response: {first_data['response']}")
        print(f"   Second response: {second_data['response']}")

        # Check if the AI remembered the name (informational only)
        remembered = "alice" in second_data['response'].lower()
        if remembered:
            print("✅ AI correctly remembered the name!")
        else:
            print("⚠️  AI might not have remembered the name")
        return True

    except requests.exceptions.RequestException as request_error:
        print(f"❌ Conversation test failed: {request_error}")
        return False

def main():
    """Run the backend smoke tests in order, stopping at the first failed prerequisite.

    The health check must pass before the chat test runs, and the chat test
    must pass before the conversation-history test runs.
    """
    divider = "=" * 40
    print("🧪 Testing localGPT Backend")
    print(divider)

    # Test health endpoint
    if not test_health_endpoint():
        print("\n❌ Backend server is not running or not healthy")
        print("   Make sure to run: python server.py")
        return

    # Test basic chat
    if not test_chat_endpoint():
        print("\n❌ Chat functionality is not working")
        return

    # Health and chat are known to have passed here, so the overall verdict
    # depends on the conversation test alone.
    conversation_ok = test_conversation_history()

    print("\n" + divider)
    if conversation_ok:
        print("🎉 All tests passed! Backend is ready for frontend integration.")
    else:
        print("⚠️  Some tests failed. Check the issues above.")

    print("\n🔗 Ready to connect to frontend at http://localhost:3000")

# Run the smoke tests only when this file is executed as a script.
if __name__ == "__main__":
    main() 

================================================
FILE: backend/test_ollama_connectivity.py
================================================
#!/usr/bin/env python3

import os
import sys

def test_ollama_connectivity():
    """Check that Ollama is reachable through ``OllamaClient`` and print the findings.

    Returns:
        True when the client reports Ollama as running, False when it does
        not or when anything (including the client import) raises.
    """
    print("🧪 Testing Ollama Connectivity")
    print("=" * 40)

    configured_host = os.environ.get('OLLAMA_HOST', 'Not set')
    print(f"OLLAMA_HOST environment variable: {configured_host}")

    try:
        # Imported here so that an import failure is reported like any other error.
        from ollama_client import OllamaClient
        client = OllamaClient()
        print(f"OllamaClient base_url: {client.base_url}")

        is_running = client.is_ollama_running()
        print(f"Ollama running: {is_running}")

        if not is_running:
            print("❌ Ollama connectivity test failed!")
            return False

        available = client.list_models()
        print(f"Available models: {available}")
        print("✅ Ollama connectivity test passed!")
        return True

    except Exception as error:
        print(f"❌ Error testing Ollama connectivity: {error}")
        return False

# When run as a script, map the test outcome to the process exit code:
# 0 when the connectivity test passed, 1 otherwise.
if __name__ == "__main__":
    success = test_ollama_connectivity()
    sys.exit(0 if success else 1)


================================================
FILE: batch_indexing_config.json
================================================
{
  "index_name": "Sample Batch Index",
  "index_description": "Example batch index configuration",
  "documents": [
    "./rag_system/documents/invoice_1039.pdf",
    "./rag_system/documents/invoice_1041.pdf"
  ],
  "processing": {
    "chunk_size": 512,
    "chunk_overlap": 64,
    "enable_enrich": true,
    "enable_latechunk": true,
    "enable_docling": true,
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",
    "generation_model": "qwen3:0.6b",
    "retrieval_mode": "hybrid",
    "window_size": 2
  }
}

================================================
FILE: create_index_script.py
================================================
#!/usr/bin/env python3
"""
Interactive Index Creation Script for LocalGPT RAG System

This script provides a user-friendly interface for creating document indexes
using the LocalGPT RAG system. It supports both single documents and batch
processing of multiple documents.

Usage:
    python create_index_script.py
    python create_index_script.py --batch batch_indexing_config.json
    python create_index_script.py --config custom_config.json
"""

import os
import sys
import json
import argparse
from typing import List, Optional
from pathlib import Path

# Add the project root to the path so we can import rag_system modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

try:
    from rag_system.main import PIPELINE_CONFIGS, get_agent
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline
    from rag_system.utils.ollama_client import OllamaClient
    from backend.database import ChatDatabase
except ImportError as e:
    print(f"❌ Error importing required modules: {e}")
    print("Please ensure you're running this script from the project root directory.")
    sys.exit(1)


class IndexCreator:
    """Interactive index creation utility.

    Holds a ``ChatDatabase``, an ``OllamaClient`` and an ``IndexingPipeline``
    built from the loaded configuration, and offers two entry points:
    ``create_index_interactive`` (prompt-driven) and
    ``batch_create_from_config`` (driven by a JSON file).
    """

    # File extensions picked up when a whole directory is added.
    SUPPORTED_EXTENSIONS = ('.pdf', '.txt', '.docx', '.md', '.html', '.htm')

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the index creator with optional custom configuration.

        Args:
            config_path: Optional path to a JSON pipeline configuration; the
                "default" entry of ``PIPELINE_CONFIGS`` is used otherwise.
        """
        self.db = ChatDatabase()
        self.config = self._load_config(config_path)

        # Initialize Ollama client
        self.ollama_client = OllamaClient()
        self.ollama_config = {
            "generation_model": "qwen3:0.6b",
            "embedding_model": "qwen3:0.6b"
        }

        # Initialize indexing pipeline
        self.pipeline = IndexingPipeline(
            self.config,
            self.ollama_client,
            self.ollama_config
        )

    def _load_config(self, config_path: Optional[str] = None) -> dict:
        """Load configuration from file or use default.

        Falls back to ``PIPELINE_CONFIGS["default"]`` (or ``{}``) when no path
        is given, the file is missing, or it cannot be parsed. A missing or
        broken file is reported so the fallback is never silent.
        """
        if config_path:
            if os.path.exists(config_path):
                try:
                    with open(config_path, 'r') as f:
                        return json.load(f)
                except Exception as e:
                    print(f"⚠️  Error loading config from {config_path}: {e}")
                    print("Using default configuration...")
            else:
                print(f"⚠️  Config file not found: {config_path}")
                print("Using default configuration...")

        return PIPELINE_CONFIGS.get("default", {})

    def get_user_input(self, prompt: str, default: str = "") -> str:
        """Get user input with optional default value.

        An empty answer returns ``default`` when one is given.
        """
        if default:
            user_input = input(f"{prompt} [{default}]: ").strip()
            return user_input if user_input else default
        return input(f"{prompt}: ").strip()

    def _get_int_input(self, prompt: str, default: str, minimum: int = 0) -> int:
        """Prompt until the user enters an integer that is at least ``minimum``."""
        while True:
            raw = self.get_user_input(prompt, default)
            try:
                value = int(raw)
            except ValueError:
                print(f"❌ Please enter a whole number (got: {raw!r}).")
                continue
            if value < minimum:
                print(f"❌ Value must be at least {minimum}.")
                continue
            return value

    def _find_documents(self, dir_path: str) -> List[Path]:
        """Return every supported file under ``dir_path`` (recursive), each exactly once."""
        found_docs: List[Path] = []
        for ext in self.SUPPORTED_EXTENSIONS:
            # "**/*ext" already covers the top-level directory, so a separate
            # "*ext" glob would list those files twice.
            found_docs.extend(
                doc for doc in sorted(Path(dir_path).glob(f"**/*{ext}"))
                if doc.is_file()
            )
        return found_docs

    def _register_index(self, index_name: str, index_description: str,
                        processing_config: dict, documents: List[str]):
        """Create the index record, attach ``documents`` to it and return the index id."""
        index_id = self.db.create_index(
            name=index_name,
            description=index_description,
            metadata=processing_config
        )
        for doc_path in documents:
            filename = os.path.basename(doc_path)
            self.db.add_document_to_index(index_id, filename, doc_path)
        return index_id

    def select_documents(self) -> List[str]:
        """Interactive document selection.

        Loops over a small menu until the user finishes with at least one
        document selected.

        Returns:
            Absolute paths of the selected documents, in selection order.
        """
        print("\n📁 Document Selection")
        print("=" * 50)

        documents = []

        while True:
            print("\nOptions:")
            print("1. Add a single document")
            print("2. Add all documents from a directory")
            print("3. Finish and proceed with selected documents")
            print("4. Show selected documents")

            choice = self.get_user_input("Select an option (1-4)", "1")

            if choice == "1":
                doc_path = self.get_user_input("Enter document path")
                # isfile (not exists): a directory is not a document.
                if os.path.isfile(doc_path):
                    documents.append(os.path.abspath(doc_path))
                    print(f"✅ Added: {doc_path}")
                else:
                    print(f"❌ File not found: {doc_path}")

            elif choice == "2":
                dir_path = self.get_user_input("Enter directory path")
                if os.path.isdir(dir_path):
                    found_docs = self._find_documents(dir_path)

                    if found_docs:
                        print(f"Found {len(found_docs)} documents:")
                        for doc in found_docs:
                            print(f"  - {doc}")

                        if self.get_user_input("Add all these documents? (y/n)", "y").lower() == 'y':
                            documents.extend([str(doc.absolute()) for doc in found_docs])
                            print(f"✅ Added {len(found_docs)} documents")
                    else:
                        print("❌ No supported documents found in directory")
                else:
                    print(f"❌ Directory not found: {dir_path}")

            elif choice == "3":
                if documents:
                    break
                else:
                    print("❌ No documents selected. Please add at least one document.")

            elif choice == "4":
                if documents:
                    print(f"\n📄 Selected documents ({len(documents)}):")
                    for i, doc in enumerate(documents, 1):
                        print(f"  {i}. {doc}")
                else:
                    print("No documents selected yet.")

            else:
                print("Invalid choice. Please select 1-4.")

        return documents

    def configure_processing(self) -> dict:
        """Interactive processing configuration.

        Numeric answers are re-prompted until they are valid integers, so a
        typo does not abort the whole session.

        Returns:
            A dict with chunking, enrichment and model settings;
            ``retrieval_mode`` and ``window_size`` are fixed to
            ``"hybrid"`` and ``2``.
        """
        print("\n⚙️  Processing Configuration")
        print("=" * 50)

        print("Configure how documents will be processed:")

        # Basic settings
        chunk_size = self._get_int_input("Chunk size", "512", minimum=1)
        chunk_overlap = self._get_int_input("Chunk overlap", "64", minimum=0)

        # Advanced settings
        print("\nAdvanced options:")
        enable_enrich = self.get_user_input("Enable contextual enrichment? (y/n)", "y").lower() == 'y'
        enable_latechunk = self.get_user_input("Enable late chunking? (y/n)", "y").lower() == 'y'
        enable_docling = self.get_user_input("Enable Docling chunking? (y/n)", "y").lower() == 'y'

        # Model selection
        print("\nModel Configuration:")
        embedding_model = self.get_user_input("Embedding model", "Qwen/Qwen3-Embedding-0.6B")
        generation_model = self.get_user_input("Generation model", "qwen3:0.6b")

        return {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "enable_enrich": enable_enrich,
            "enable_latechunk": enable_latechunk,
            "enable_docling": enable_docling,
            "embedding_model": embedding_model,
            "generation_model": generation_model,
            "retrieval_mode": "hybrid",
            "window_size": 2
        }

    def create_index_interactive(self) -> None:
        """Run the interactive index creation process.

        Collects the index name, documents and processing options, asks for
        confirmation, then registers the index in the database and runs the
        documents through the pipeline. Errors during creation are printed
        with a traceback instead of being raised.
        """
        print("🚀 LocalGPT Index Creation Tool")
        print("=" * 50)

        # Get index details; an index without a name cannot be identified later.
        index_name = self.get_user_input("Enter index name")
        while not index_name:
            print("❌ Index name cannot be empty.")
            index_name = self.get_user_input("Enter index name")
        index_description = self.get_user_input("Enter index description (optional)")

        # Select documents
        documents = self.select_documents()

        # Configure processing
        processing_config = self.configure_processing()

        # Confirm creation
        print("\n📋 Index Summary")
        print("=" * 50)
        print(f"Name: {index_name}")
        print(f"Description: {index_description or 'None'}")
        print(f"Documents: {len(documents)}")
        print(f"Chunk size: {processing_config['chunk_size']}")
        print(f"Enrichment: {'Enabled' if processing_config['enable_enrich'] else 'Disabled'}")
        print(f"Embedding model: {processing_config['embedding_model']}")

        if self.get_user_input("\nProceed with index creation? (y/n)", "y").lower() != 'y':
            print("❌ Index creation cancelled.")
            return

        # Create the index
        try:
            print("\n🔥 Creating index...")

            # Create index record in database and attach the documents
            index_id = self._register_index(
                index_name, index_description, processing_config, documents
            )

            # Process documents through pipeline
            print("📚 Processing documents...")
            self.pipeline.process_documents(documents)

            print(f"\n✅ Index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")
            print(f"Processed {len(documents)} documents")

            # Test the index
            if self.get_user_input("\nTest the index with a sample query? (y/n)", "y").lower() == 'y':
                self.test_index(index_id)

        except Exception as e:
            print(f"❌ Error creating index: {e}")
            import traceback
            traceback.print_exc()

    def test_index(self, index_id: str) -> None:
        """Test the created index with a sample query.

        Runs one user-supplied query through the "default" agent against the
        table ``text_pages_<index_id>`` and prints the response. Errors are
        printed, not raised.
        """
        try:
            print("\n🧪 Testing Index")
            print("=" * 50)

            # Get agent for testing
            agent = get_agent("default")

            # Test query
            test_query = self.get_user_input("Enter a test query", "What is this document about?")

            print(f"\nProcessing query: {test_query}")
            response = agent.run(test_query, table_name=f"text_pages_{index_id}")

            print("\n🤖 Response:")
            print(response)

        except Exception as e:
            print(f"❌ Error testing index: {e}")

    def batch_create_from_config(self, config_file: str) -> None:
        """Create index from batch configuration file.

        The JSON file may contain ``index_name``, ``index_description``,
        ``documents`` (list of paths) and ``processing`` (options dict).
        Listed paths that are not existing files are reported and skipped.
        Errors are printed with a traceback instead of being raised.
        """
        try:
            with open(config_file, 'r') as f:
                batch_config = json.load(f)

            index_name = batch_config.get("index_name", "Batch Index")
            index_description = batch_config.get("index_description", "")
            documents = batch_config.get("documents", [])
            processing_config = batch_config.get("processing", {})

            if not documents:
                print("❌ No documents specified in batch configuration")
                return

            # Validate documents exist (directories do not count as documents)
            valid_documents = []
            for doc_path in documents:
                if os.path.isfile(doc_path):
                    valid_documents.append(doc_path)
                else:
                    print(f"⚠️  Document not found: {doc_path}")

            if not valid_documents:
                print("❌ No valid documents found")
                return

            print(f"🚀 Creating batch index: {index_name}")
            print(f"📄 Processing {len(valid_documents)} documents...")

            # Create index and add documents
            index_id = self._register_index(
                index_name, index_description, processing_config, valid_documents
            )

            # Process documents
            self.pipeline.process_documents(valid_documents)

            print(f"✅ Batch index '{index_name}' created successfully!")
            print(f"Index ID: {index_id}")

        except Exception as e:
            print(f"❌ Error creating batch index: {e}")
            import traceback
            traceback.print_exc()


def create_sample_batch_config():
    """Write an example batch configuration to ``batch_indexing_config.json``.

    The file is created in the current working directory and overwrites any
    existing file of that name.
    """
    output_name = "batch_indexing_config.json"

    processing_options = {
        "chunk_size": 512,
        "chunk_overlap": 64,
        "enable_enrich": True,
        "enable_latechunk": True,
        "enable_docling": True,
        "embedding_model": "Qwen/Qwen3-Embedding-0.6B",
        "generation_model": "qwen3:0.6b",
        "retrieval_mode": "hybrid",
        "window_size": 2
    }
    sample_documents = [
        "./rag_system/documents/invoice_1039.pdf",
        "./rag_system/documents/invoice_1041.pdf"
    ]
    sample_config = {
        "index_name": "Sample Batch Index",
        "index_description": "Example batch index configuration",
        "documents": sample_documents,
        "processing": processing_options
    }

    serialized = json.dumps(sample_config, indent=2)
    with open(output_name, "w") as handle:
        handle.write(serialized)

    print("📄 Sample batch configuration created: batch_indexing_config.json")


def main():
    """Main entry point for the script.

    ``--create-sample`` writes a sample batch config and exits. Otherwise an
    ``IndexCreator`` is built (optionally from ``--config``) and runs either
    the batch flow (``--batch FILE``) or the interactive flow. Ctrl+C and
    unexpected errors are reported instead of propagating.
    """
    cli = argparse.ArgumentParser(description="LocalGPT Index Creation Tool")
    argument_specs = (
        ("--batch", {"help": "Batch configuration file", "type": str}),
        ("--config", {"help": "Custom pipeline configuration file", "type": str}),
        ("--create-sample", {"action": "store_true", "help": "Create sample batch config"}),
    )
    for flag, spec in argument_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    if options.create_sample:
        create_sample_batch_config()
        return

    try:
        creator = IndexCreator(config_path=options.config)

        if not options.batch:
            creator.create_index_interactive()
        else:
            creator.batch_create_from_config(options.batch)

    except KeyboardInterrupt:
        print("\n\n❌ Operation cancelled by user.")
    except Exception as unexpected:
        print(f"❌ Unexpected error: {unexpected}")
        import traceback
        traceback.print_exc()


# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()  

================================================
FILE: demo_batch_indexing.py
================================================
#!/usr/bin/env python3
"""
Demo Batch Indexing Script for LocalGPT RAG System

This script demonstrates how to perform batch indexing of multiple documents
using configuration files. It's designed to showcase the full capabilities
of the indexing pipeline with various configuration options.

Usage:
    python demo_batch_indexing.py --config batch_indexing_config.json
    python demo_batch_indexing.py --create-sample-config
    python demo_batch_indexing.py --help
"""

import os
import sys
import json
import argparse
import time
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path
from datetime import datetime

# Add the project root to the path so we can import rag_system modules.
# The directory containing this script is put first on sys.path, so the
# imports below resolve against it regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Import project modules defensively: if any of them is missing, print a
# hint and exit with status 1 instead of showing a raw traceback.
try:
    from rag_system.main import PIPELINE_CONFIGS
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline
    from rag_system.utils.ollama_client import OllamaClient
    from backend.database import ChatDatabase
except ImportError as e:
    print(f"❌ Error importing required modules: {e}")
    print("Please ensure you're running this script from the project root directory.")
    sys.exit(1)

# Configure logging
# NOTE(review): importing rag_system above already calls logging.basicConfig
# when the root logger has no handlers (see rag_system/__init__.py), and
# basicConfig does nothing once handlers exist, so this call likely has no
# effect on the format — confirm which format is intended.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
)


class BatchIndexingDemo:
    """Demonstration of batch indexing capabilities.

    Loads a JSON batch configuration, merges its ``pipeline_settings`` over the
    ``"default"`` entry of ``PIPELINE_CONFIGS``, and creates one index per
    entry of the configuration's ``indexes`` list.
    """

    # Lower-cased file suffixes accepted by validate_documents().
    SUPPORTED_EXTENSIONS = ('.pdf', '.txt', '.docx', '.md', '.html', '.htm')

    def __init__(self, config_path: str):
        """Initialize the batch indexing demo.

        Args:
            config_path: Path to the JSON batch configuration file.

        Note:
            Loading the configuration exits the process with status 1 when the
            file is missing or is not valid JSON (see ``_load_config``).
        """
        self.config_path = config_path
        self.config = self._load_config()
        self.db = ChatDatabase()
        
        # Initialize Ollama client
        self.ollama_client = OllamaClient()
        
        # Initialize pipeline with merged configuration
        self.pipeline_config = self._merge_configurations()
        self.pipeline = IndexingPipeline(
            self.pipeline_config,
            self.ollama_client,
            # Used only when the batch config has no "ollama_config" section.
            self.config.get("ollama_config", {
                "generation_model": "qwen3:0.6b",
                "embedding_model": "qwen3:0.6b"
            })
        )
    
    def _load_config(self) -> Dict[str, Any]:
        """Load batch indexing configuration from file.

        Returns:
            The parsed JSON document.

        Exits the process with status 1 (after printing a message) when the
        file does not exist or contains invalid JSON.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default encoding (e.g. cp1252 on Windows).
            with open(self.config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"✅ Loaded configuration from {self.config_path}")
            return config
        except FileNotFoundError:
            print(f"❌ Configuration file not found: {self.config_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in configuration file: {e}")
            sys.exit(1)
    
    def _merge_configurations(self) -> Dict[str, Any]:
        """Merge batch config with default pipeline config.

        Returns:
            A new dict: the ``"default"`` pipeline configuration with the batch
            file's ``pipeline_settings`` applied on top. Nested dicts present
            on both sides are merged key by key; any other value from the
            batch file replaces the default.
        """
        # Start with default pipeline configuration
        merged_config = PIPELINE_CONFIGS.get("default", {}).copy()
        
        # Override with batch-specific settings
        batch_settings = self.config.get("pipeline_settings", {})
        
        # Deep merge for nested dictionaries
        def deep_merge(base: dict, override: dict) -> dict:
            result = base.copy()
            for key, value in override.items():
                if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                    result[key] = deep_merge(result[key], value)
                else:
                    result[key] = value
            return result
        
        return deep_merge(merged_config, batch_settings)
    
    def validate_documents(self, documents: List[str]) -> List[str]:
        """Validate and filter document paths.

        Args:
            documents: Candidate paths; relative paths are resolved against
                the current working directory.

        Returns:
            Absolute paths of the entries that exist and whose suffix is in
            ``SUPPORTED_EXTENSIONS``, in input order. Every entry's outcome is
            printed.
        """
        valid_documents = []
        
        print(f"📋 Validating {len(documents)} documents...")
        
        for doc_path in documents:
            # Handle relative paths
            if not os.path.isabs(doc_path):
                doc_path = os.path.abspath(doc_path)
            
            if os.path.exists(doc_path):
                # Check file extension
                ext = Path(doc_path).suffix.lower()
                if ext in self.SUPPORTED_EXTENSIONS:
                    valid_documents.append(doc_path)
                    print(f"  ✅ {doc_path}")
                else:
                    print(f"  ⚠️  Unsupported file type: {doc_path}")
            else:
                print(f"  ❌ File not found: {doc_path}")
        
        print(f"📊 {len(valid_documents)} valid documents found")
        return valid_documents
    
    def create_indexes(self) -> List[str]:
        """Create multiple indexes based on configuration.

        Returns:
            IDs of the indexes that were created; entries that were skipped or
            failed are left out.
        """
        indexes = self.config.get("indexes", [])
        created_indexes = []
        
        for index_config in indexes:
            index_id = self.create_single_index(index_config)
            if index_id:
                created_indexes.append(index_id)
        
        return created_indexes
    
    def create_single_index(self, index_config: Dict[str, Any]) -> Optional[str]:
        """Create a single index from configuration.

        Args:
            index_config: One entry of the batch file's ``indexes`` list, with
                optional ``name``, ``description``, ``documents`` and
                ``processing_options`` keys.

        Returns:
            The new index ID, or ``None`` when the entry is malformed, lists
            no valid documents, or any step raises (the error is printed).
        """
        # Bound before the try block so the error handler below can always
        # name the index, even when the failure happens on the very first line.
        index_name = "Unnamed Index"
        try:
            if not isinstance(index_config, dict):
                print(f"⚠️  Skipping malformed index entry (expected an object): {index_config!r}")
                return None

            # Extract index metadata
            index_name = index_config.get("name", "Unnamed Index")
            index_description = index_config.get("description", "")
            documents = index_config.get("documents", [])
            
            if not documents:
                print(f"⚠️  No documents specified for index '{index_name}', skipping...")
                return None
            
            # Validate documents
            valid_documents = self.validate_documents(documents)
            if not valid_documents:
                print(f"❌ No valid documents found for index '{index_name}'")
                return None
            
            print(f"\n🚀 Creating index: {index_name}")
            print(f"📄 Processing {len(valid_documents)} documents")
            
            # Create index record in database.
            # Per-index "processing_options" are only recorded here; the shared
            # pipeline built in __init__ is what processes the documents.
            index_metadata = {
                "created_by": "demo_batch_indexing.py",
                "created_at": datetime.now().isoformat(),
                "document_count": len(valid_documents),
                "config_used": index_config.get("processing_options", {})
            }
            
            # NOTE(review): the index record and its document rows are written
            # before the pipeline runs, so they remain in the database if
            # process_documents raises — confirm whether cleanup is wanted.
            index_id = self.db.create_index(
                name=index_name,
                description=index_description,
                metadata=index_metadata
            )
            
            # Add documents to index
            for doc_path in valid_documents:
                filename = os.path.basename(doc_path)
                self.db.add_document_to_index(index_id, filename, doc_path)
            
            # Process documents through pipeline
            start_time = time.time()
            self.pipeline.process_documents(valid_documents)
            processing_time = time.time() - start_time
            
            print(f"✅ Index '{index_name}' created successfully!")
            print(f"   Index ID: {index_id}")
            print(f"   Processing time: {processing_time:.2f} seconds")
            print(f"   Documents processed: {len(valid_documents)}")
            
            return index_id
            
        except Exception as e:
            print(f"❌ Error creating index '{index_name}': {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def demonstrate_features(self):
        """Print a summary of the loaded configuration.

        Shows the configuration path, the number of indexes to create, and the
        ``pipeline_settings`` / ``ollama_config`` sections when present.
        """
        print("\n🎯 Batch Indexing Demo Features:")
        print("=" * 50)
        
        # Show configuration
        print(f"📋 Configuration file: {self.config_path}")
        print(f"📊 Number of indexes to create: {len(self.config.get('indexes', []))}")
        
        # Show pipeline settings
        pipeline_settings = self.config.get("pipeline_settings", {})
        if pipeline_settings:
            print("\n⚙️  Pipeline Settings:")
            for key, value in pipeline_settings.items():
                print(f"   {key}: {value}")
        
        # Show model configuration
        ollama_config = self.config.get("ollama_config", {})
        if ollama_config:
            print("\n🤖 Model Configuration:")
            for key, value in ollama_config.items():
                print(f"   {key}: {value}")
    
    def run_demo(self):
        """Run the complete batch indexing demo.

        Prints the configuration summary, creates every configured index, then
        prints the elapsed time and a listing of the created indexes.
        """
        print("🚀 LocalGPT Batch Indexing Demo")
        print("=" * 50)
        
        # Show demo features
        self.demonstrate_features()
        
        # Create indexes
        print(f"\n📚 Starting batch indexing process...")
        start_time = time.time()
        
        created_indexes = self.create_indexes()
        
        total_time = time.time() - start_time
        
        # Summary
        print(f"\n📊 Batch Indexing Summary")
        print("=" * 50)
        print(f"✅ Successfully created {len(created_indexes)} indexes")
        print(f"⏱️  Total processing time: {total_time:.2f} seconds")
        
        if created_indexes:
            print(f"\n📋 Created Indexes:")
            for i, index_id in enumerate(created_indexes, 1):
                index_info = self.db.get_index(index_id)
                if index_info:
                    # Only the first 8 characters of the ID are shown.
                    print(f"   {i}. {index_info['name']} ({index_id[:8]}...)")
                    print(f"      Documents: {len(index_info.get('documents', []))}")
        
        print(f"\n🎉 Demo completed successfully!")
        print(f"💡 You can now use these indexes in the LocalGPT interface.")


def create_sample_config():
    """Write a sample batch configuration to ``batch_indexing_config.json``.

    The file is created in the current working directory (overwriting any
    existing one) and holds pipeline settings, Ollama model names and two
    example index definitions. Follow-up instructions are printed afterwards.
    """
    # Settings that are merged over the default pipeline configuration.
    pipeline_settings = {
        "embedding_model_name": "Qwen/Qwen3-Embedding-0.6B",
        "indexing": {
            "embedding_batch_size": 50,
            "enrichment_batch_size": 25,
            "enable_progress_tracking": True,
        },
        "contextual_enricher": {
            "enabled": True,
            "window_size": 2,
            "model_name": "qwen3:0.6b",
        },
        "chunking": {
            "chunk_size": 512,
            "chunk_overlap": 64,
            "enable_latechunk": True,
            "enable_docling": True,
        },
        "retrievers": {
            "dense": {"enabled": True, "lancedb_table_name": "demo_text_pages"},
            "bm25": {"enabled": True, "index_name": "demo_bm25_index"},
        },
        "storage": {
            "lancedb_uri": "./index_store/lancedb",
            "bm25_path": "./index_store/bm25",
        },
    }

    ollama_models = {
        "generation_model": "qwen3:0.6b",
        "embedding_model": "qwen3:0.6b",
    }

    # Two example indexes, each with its own documents and options.
    invoice_index = {
        "name": "Sample Invoice Collection",
        "description": "Demo index containing sample invoice documents",
        "documents": [
            "./rag_system/documents/invoice_1039.pdf",
            "./rag_system/documents/invoice_1041.pdf",
        ],
        "processing_options": {
            "chunk_size": 512,
            "enable_enrichment": True,
            "retrieval_mode": "hybrid",
        },
    }
    papers_index = {
        "name": "Research Papers Demo",
        "description": "Demo index for research papers and whitepapers",
        "documents": ["./rag_system/documents/Newwhitepaper_Agents2.pdf"],
        "processing_options": {
            "chunk_size": 1024,
            "enable_enrichment": True,
            "retrieval_mode": "dense",
        },
    }

    # Key order is kept as-is because it determines the layout of the JSON file.
    sample_config = {
        "description": "Demo batch indexing configuration showcasing various features",
        "pipeline_settings": pipeline_settings,
        "ollama_config": ollama_models,
        "indexes": [invoice_index, papers_index],
    }

    config_filename = "batch_indexing_config.json"
    serialized = json.dumps(sample_config, indent=2)
    with open(config_filename, "w") as out_file:
        out_file.write(serialized)

    for line in (
        f"✅ Sample configuration created: {config_filename}",
        "📝 Edit this file to customize your batch indexing setup",
        f"🚀 Run: python demo_batch_indexing.py --config {config_filename}",
    ):
        print(line)


def main():
    """Command-line entry point of the batch indexing demo.

    ``--create-sample-config`` writes the sample configuration and returns.
    Otherwise the file given by ``--config`` (default
    ``batch_indexing_config.json``) must exist, or the process exits with
    status 1; the demo is then run, with Ctrl-C and unexpected errors reported
    by printed messages rather than propagated.
    """
    # Shown verbatim below the option list (RawDescriptionHelpFormatter).
    usage_examples = """
Examples:
  python demo_batch_indexing.py --config batch_indexing_config.json
  python demo_batch_indexing.py --create-sample-config
  
This demo showcases the advanced batch indexing capabilities of LocalGPT,
including multi-index creation, advanced configuration options, and
comprehensive processing pipelines.
        """
    cli = argparse.ArgumentParser(
        description="LocalGPT Batch Indexing Demo",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "--config",
        default="batch_indexing_config.json",
        type=str,
        help="Path to batch indexing configuration file",
    )
    cli.add_argument(
        "--create-sample-config",
        action="store_true",
        help="Create a sample configuration file",
    )
    options = cli.parse_args()

    if options.create_sample_config:
        create_sample_config()
        return

    config_file = options.config
    if not os.path.exists(config_file):
        # Fail early with a hint instead of constructing the demo.
        print(f"❌ Configuration file not found: {config_file}")
        print(f"💡 Create a sample config with: python {sys.argv[0]} --create-sample-config")
        sys.exit(1)

    try:
        BatchIndexingDemo(config_file).run_demo()
    except KeyboardInterrupt:
        print("\n\n❌ Demo cancelled by user.")
    except Exception as error:
        print(f"❌ Demo failed: {error}")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()


# Run the demo CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()  

================================================
FILE: docker-compose.local-ollama.yml
================================================
services:
  # RAG API server (connects to host Ollama)
  rag-api:
    build:
      context: .
      dockerfile: Dockerfile.rag-api
    container_name: rag-api
    ports:
      - "8001:8001"
    environment:
      - OLLAMA_HOST=http://host.docker.internal:11434
      - NODE_ENV=production
    # host.docker.internal is only predefined by Docker Desktop. Mapping it to
    # the special host-gateway value makes the name resolve on plain Linux
    # Docker Engine (20.10+) as well, so the OLLAMA_HOST above works there too.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./lancedb:/app/lancedb
      - ./index_store:/app/index_store
      - ./shared_uploads:/app/shared_uploads
    # Healthy once the API answers GET /models; the backend service waits on this.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/models"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

  # Backend API server
  backend:
    build:
      context: .
      dockerfile: Dockerfile.backend
    container_name: rag-backend
    ports:
      - "8000:8000"
    environment:
      - NODE_ENV=production
      # Reaches the RAG API by its compose service name on rag-network.
      - RAG_API_URL=http://rag-api:8001
    volumes:
      # NOTE(review): this is a single-file bind mount. If
      # ./backend/chat_data.db does not exist on the host beforehand, Docker
      # typically creates a directory at that path instead — confirm the setup
      # instructions have users create the file first.
      - ./backend/chat_data.db:/app/backend/chat_data.db
      - ./shared_uploads:/app/shared_uploads
    depends_on:
      # Start only after the rag-api healthcheck passes.
      rag-api:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

  # Frontend Next.js application
  frontend:
    build:
      context: .
      dockerfile: Dockerfile.frontend
    container_name: rag-frontend
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      # NOTE(review): Next.js inlines NEXT_PUBLIC_* values into the client
      # bundle at build time, so this runtime value may not reach browser code
      # — confirm Dockerfile.frontend also sets it during the build.
      - NEXT_PUBLIC_API_URL=http://localhost:8000
    depends_on:
      # Start only after the backend healthcheck passes.
      backend:
        condition: service_healthy
    # NOTE(review): assumes curl is installed in the frontend image — verify.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

networks:
  rag-network:
    driver: bridge 

================================================
FILE: docker-compose.yml
================================================
services:
  # Ollama service for LLM inference (optional - can use host Ollama instead)
  ollama:
    image: ollama/ollama:latest
    container_name: rag-ollama
    ports:
      - "11434:11434"
    volumes:
      # Named volume so pulled models survive container re-creation.
      - ollama_data:/root/.ollama
    environment:
      # Listen on all interfaces so other containers on rag-network can connect.
      - OLLAMA_HOST=0.0.0.0
    healthcheck:
      # The ollama/ollama image does not ship curl, so a curl-based probe always
      # fails. The bundled CLI talks to the local server and exits non-zero
      # when it is unreachable.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network
    profiles:
      - with-ollama  # Optional service - enable with --profile with-ollama

  # RAG API server
  rag-api:
    build:
      context: .
      dockerfile: Dockerfile.rag-api
    container_name: rag-api
    ports:
      - "8001:8001"
    environment:
      # Use host Ollama by default, or containerized Ollama if enabled
      - OLLAMA_HOST=${OLLAMA_HOST:-http://host.docker.internal:11434}
      - NODE_ENV=production
    # host.docker.internal is only predefined by Docker Desktop. Mapping it to
    # the special host-gateway value makes the default OLLAMA_HOST above
    # resolve on plain Linux Docker Engine (20.10+) as well.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./lancedb:/app/lancedb
      - ./index_store:/app/index_store
      - ./shared_uploads:/app/shared_uploads
    # Healthy once the API answers GET /models; the backend service waits on this.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8001/models"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

  # Backend API server
  backend:
    build:
      context: .
      dockerfile: Dockerfile.backend
    container_name: rag-backend
    ports:
      - "8000:8000"
    environment:
      - NODE_ENV=production
      # Reaches the RAG API by its compose service name on rag-network.
      - RAG_API_URL=http://rag-api:8001
      # NOTE(review): this default (172.18.0.1) differs from the rag-api
      # service's default (host.docker.internal). The address is only the
      # host gateway when the compose network happens to be assigned
      # 172.18.0.0/16 — confirm the two defaults should not be the same.
      - OLLAMA_HOST=${OLLAMA_HOST:-http://172.18.0.1:11434}
    volumes:
      # Mounts the host's ./backend directory over /app/backend in the
      # container, so the host copy of that directory is what the container sees.
      - ./backend:/app/backend
      - ./shared_uploads:/app/shared_uploads
    depends_on:
      # Start only after the rag-api healthcheck passes.
      rag-api:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

  # Frontend Next.js application
  frontend:
    build:
      context: .
      dockerfile: Dockerfile.frontend
    container_name: rag-frontend
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      # NOTE(review): Next.js inlines NEXT_PUBLIC_* values into the client
      # bundle at build time, so this runtime value may not reach browser code
      # — confirm Dockerfile.frontend also sets it during the build.
      - NEXT_PUBLIC_API_URL=http://localhost:8000
    depends_on:
      # Start only after the backend healthcheck passes.
      backend:
        condition: service_healthy
    # NOTE(review): assumes curl is installed in the frontend image — verify.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3000"]
      interval: 30s
      timeout: 10s
      retries: 3
    restart: unless-stopped
    networks:
      - rag-network

volumes:
  ollama_data:
    driver: local

networks:
  rag-network:
    driver: bridge    

================================================
FILE: docker.env
================================================
# Docker environment configuration
# Set this to use local Ollama instance running on host
# Note: Using Docker gateway IP instead of host.docker.internal for Linux compatibility
OLLAMA_HOST=http://172.18.0.1:11434

# Alternative: Use containerized Ollama (uncomment and run with --profile with-ollama)
# OLLAMA_HOST=http://ollama:11434

# Other configuration
NODE_ENV=production
NEXT_PUBLIC_API_URL=http://localhost:8000
RAG_API_URL=http://rag-api:8001   

================================================
FILE: env.example.watsonx
================================================
# ====================================================================
# LocalGPT Watson X Configuration Example
# ====================================================================
# This file shows how to configure LocalGPT to use IBM Watson X AI
# with Granite models instead of local Ollama.
#
# Copy this file to .env and fill in your credentials:
#   cp env.example.watsonx .env
# ====================================================================

# LLM Backend Selection
# Options: "ollama" (default) or "watsonx"
LLM_BACKEND=watsonx

# ====================================================================
# Watson X Credentials
# ====================================================================
# Get these from your IBM Cloud Watson X project:
# 1. Go to https://cloud.ibm.com/
# 2. Navigate to Watson X AI service
# 3. Create or select a project
# 4. Get API key from IBM Cloud IAM
# 5. Copy project ID from project settings

# Your IBM Cloud API key
WATSONX_API_KEY=your_api_key_here

# Your Watson X project ID
WATSONX_PROJECT_ID=your_project_id_here

# Watson X service URL (default: us-south region)
# Options:
#   - https://us-south.ml.cloud.ibm.com (US South)
#   - https://eu-de.ml.cloud.ibm.com (Frankfurt)
#   - https://eu-gb.ml.cloud.ibm.com (London)
#   - https://jp-tok.ml.cloud.ibm.com (Tokyo)
WATSONX_URL=https://us-south.ml.cloud.ibm.com

# ====================================================================
# Model Configuration
# ====================================================================
# Granite models available on Watson X

# Main generation model for answering queries
# Options:
#   - ibm/granite-13b-chat-v2 (recommended for chat)
#   - ibm/granite-13b-instruct-v2 (for instructions)
#   - ibm/granite-20b-multilingual (for multilingual)
#   - ibm/granite-3b-code-instruct (for code)
WATSONX_GENERATION_MODEL=ibm/granite-13b-chat-v2

# Lightweight model for enrichment and routing
# Use a smaller model for better performance on simple tasks
WATSONX_ENRICHMENT_MODEL=ibm/granite-8b-japanese

# ====================================================================
# Optional: Ollama Configuration (fallback)
# ====================================================================
# These settings are used if LLM_BACKEND=ollama

OLLAMA_HOST=http://localhost:11434


================================================
FILE: eslint.config.mjs
================================================
import { dirname } from "path";
import { fileURLToPath } from "url";
import { FlatCompat } from "@eslint/eslintrc";

// ES modules have no __dirname, so the directory of this config file is
// derived from import.meta.url; FlatCompat resolves its entries relative to it.
const compat = new FlatCompat({
  baseDirectory: dirname(fileURLToPath(import.meta.url)),
});

// The Next.js shareable configs are loaded through the FlatCompat layer and
// spread into the flat-config array that ESLint consumes.
export default [
  ...compat.extends("next/core-web-vitals", "next/typescript"),
];


================================================
FILE: next.config.ts
================================================
import type { NextConfig } from "next";

// Both switches below relax `next build`: lint and type errors are not
// treated as build failures.
const nextConfig: NextConfig = {
  // Warning: production builds complete even if the project has ESLint errors.
  eslint: { ignoreDuringBuilds: true },
  // Warning: production builds complete even if the project has type errors.
  typescript: { ignoreBuildErrors: true },
};

export default nextConfig;


================================================
FILE: package.json
================================================
{
  "name": "multimodal_rag",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "dev": "next dev",
    "build": "next build",
    "start": "next start",
    "lint": "next lint"
  },
  "dependencies": {
    "@radix-ui/react-avatar": "^1.1.10",
    "@radix-ui/react-dropdown-menu": "^2.1.15",
    "@radix-ui/react-scroll-area": "^1.2.9",
    "@radix-ui/react-separator": "^1.1.7",
    "@radix-ui/react-slot": "^1.2.3",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "framer-motion": "^12.16.0",
    "lucide-react": "^0.513.0",
    "next": "15.3.3",
    "react": "^19.0.0",
    "react-dom": "^19.0.0",
    "react-markdown": "^10.1.0",
    "remark-gfm": "^4.0.1",
    "tailwind-merge": "^3.3.0"
  },
  "devDependencies": {
    "@eslint/eslintrc": "^3",
    "@tailwindcss/postcss": "^4",
    "@types/node": "^20",
    "@types/react": "^19",
    "@types/react-dom": "^19",
    "eslint": "^9",
    "eslint-config-next": "15.3.3",
    "tailwindcss": "^4",
    "tw-animate-css": "^1.3.4",
    "typescript": "^5"
  }
}


================================================
FILE: postcss.config.mjs
================================================
// PostCSS configuration: the only plugin registered is Tailwind CSS's
// PostCSS plugin, referenced by package name.
const config = {
  plugins: ["@tailwindcss/postcss"],
};

export default config;


================================================
FILE: rag_system/DOCUMENTATION.md
================================================
# RAG System Documentation

This document provides a detailed overview of the RAG (Retrieval-Augmented Generation) system, its architecture, and how to use it.

## System Overview

This RAG system is a sophisticated, multimodal question-answering system designed to work with a variety of documents. It can understand and process both the text and the visual layout of documents, and it uses a knowledge graph to understand the relationships between the entities in the documents.

The system is built around an agentic workflow that allows it to:

*   **Decompose complex questions** into smaller, more manageable sub-questions.
*   **Triage queries** to determine if they can be answered directly or if they require retrieval from the knowledge base.
*   **Verify answers** against the retrieved context to ensure they are accurate and supported by the documents.

## Architecture

The system is composed of two main pipelines: an indexing pipeline and a retrieval pipeline.

### Indexing Pipeline

The indexing pipeline is responsible for processing the documents and building the knowledge base. It performs the following steps:

1.  **Text Extraction**: The pipeline uses `PyMuPDF` to extract the text from each page of the PDF documents, preserving the original layout.
2.  **Text Embedding**: The extracted text is then passed to a text embedding model (`Qwen/Qwen3-Embedding-0.6B`) to create numerical vector representations of the text.
3.  **Knowledge Graph Creation**: The text is also passed to a `GraphExtractor` that uses a large language model (`qwen2.5vl:7b`) to extract entities and their relationships. This information is then used to build a knowledge graph, which is stored as a `.gml` file.
4.  **Indexing**: The text embeddings and the knowledge graph are then stored in a LanceDB database.

### Retrieval Pipeline

The retrieval pipeline is responsible for answering user queries. It uses an agentic workflow that includes the following steps:

1.  **Triage**: The agent first triages the user's query to determine if it can be answered directly or if it requires retrieval from the knowledge base.
2.  **Query Decomposition**: If the query is complex, the agent uses a `QueryDecomposer` to break it down into smaller, more manageable sub-questions.
3.  **Retrieval**: The agent then uses a `MultiVectorRetriever` and a `GraphRetriever` to retrieve relevant information from the knowledge base.
4.  **Verification**: The retrieved context is then passed to a `Verifier` that uses an LLM to check if the context is sufficient to answer the query.
5.  **Synthesis**: Finally, the agent uses an LLM to synthesize a final answer from the verified context.

## API Endpoints

The system exposes the following commands on the command line, each invoked as `python rag_system/main.py <command>` (see Usage below):

*   `index`: Runs the indexing pipeline to process the documents and build the knowledge base.
*   `chat`: Runs the retrieval pipeline to answer a user's query.
*   `show_graph`: Displays the knowledge graph in a human-readable format and also provides a visual representation of the graph.

### Usage

To run the system, use the following commands:

```bash
# Activate the virtual environment
source rag_system/rag_venv/bin/activate

# Index the documents
python rag_system/main.py index

# Ask a question
python rag_system/main.py chat "Your question here"

# Show the knowledge graph
python rag_system/main.py show_graph
```


================================================
FILE: rag_system/README.md
================================================
# Multimodal RAG System

This document provides a detailed overview of the multimodal Retrieval-Augmented Generation (RAG) system implemented in this directory. The system is designed to process and understand information from PDF documents, combining both textual and visual data to answer complex queries.

## 1. Overview

This RAG system is a sophisticated pipeline that leverages state-of-the-art open-source models to provide accurate, context-aware answers from a document corpus. Unlike traditional RAG systems that only process text, this implementation is fully multimodal. It extracts and indexes both text and images from PDFs, allowing a Vision Language Model (VLM) to reason over both modalities when generating a final answer.

The core capabilities include:
-   **Multimodal Indexing**: Extracts text and images from PDFs and creates separate vector embeddings for each.
-   **Hybrid Retrieval**: Combines dense vector search (for semantic similarity) with traditional keyword-based search (BM25) for robust retrieval.
-   **Advanced Reranking**: Utilizes a powerful reranker model to improve the relevance of retrieved documents before they are passed to the generator.
-   **VLM-Powered Synthesis**: Employs a Vision Language Model to synthesize the final answer, allowing it to analyze both the text and the images from the retrieved document chunks.

## 2. Architecture

The system is composed of several key Python modules that work together to form the RAG pipeline.

### Key Modules:

-   `main.py`: The main entry point for the application. It contains the configuration for all models and pipelines and orchestrates the indexing and retrieval processes.
-   `rag_system/pipelines/`: Contains the high-level orchestration for indexing and retrieval.
    -   `indexing_pipeline.py`: Manages the process of converting raw PDFs into indexed, searchable data.
    -   `retrieval_pipeline.py`: Handles the end-to-end process of taking a user query, retrieving relevant information, and generating a final answer.
-   `rag_system/indexing/`: Contains all modules related to data processing and indexing.
    -   `multimodal.py`: Responsible for extracting text and images from PDFs and generating embeddings using the configured vision model (`colqwen2-v1.0`).
    -   `representations.py`: Defines the text embedding model (`Qwen2-7B-instruct`) and other data representation generators.
    -   `embedders.py`: Manages the connection to the **LanceDB** vector database and handles the indexing of vector embeddings.
-   `rag_system/retrieval/`: Contains modules for retrieving and ranking documents.
    -   `retrievers.py`: Implements the logic for searching the vector database to find relevant text and image chunks.
    -   `reranker.py`: Contains the `QwenReranker` class, which re-ranks the retrieved documents for improved relevance.
-   `rag_system/agent/`: Contains the `Agent` loop that interacts with the user and the RAG pipelines.
-   `rag_system/utils/`: Contains utility clients, such as the `OllamaClient` for interacting with the Ollama server.

### Data Flow:

1.  **Indexing**:
    -   The `MultimodalProcessor` reads a PDF and splits it into pages.
    -   For each page, it extracts the raw text and a full-page image.
    -   The `QwenEmbedder` generates a vector embedding for the text.
    -   The `LocalVisionModel` (using `colqwen2-v1.0`) generates a vector embedding for the image.
    -   The `VectorIndexer` stores these embeddings in separate tables within a **LanceDB** database.
2.  **Retrieval**:
    -   A user submits a query to the `Agent`.
    -   The `RetrievalPipeline`'s `MultiVectorRetriever` searches both the text and image tables in LanceDB for relevant chunks.
    -   The retrieved documents are passed to the `QwenReranker`, which re-orders them based on relevance to the query.
    -   The top-ranked documents (containing both text and image references) are passed to the Vision Language Model (`qwen-vl`).
    -   The VLM analyzes the text and images to extract key facts.
    -   A final text generation model (`llama3`) synthesizes these facts into a coherent, human-readable answer.

## 3. Models

This system relies on a suite of powerful, open-source models.

| Component             | Model                               | Framework      | Purpose                                     |
| --------------------- | ----------------------------------- | -------------- | ------------------------------------------- |
| **Image Embedding**   | `vidore/colqwen2-v1.0`              | `colpali`      | Generates vector embeddings from images.    |
| **Text Embedding**    | `Qwen/Qwen2-7B-instruct`            | `transformers` | Generates vector embeddings from text.      |
| **Reranker**          | `Qwen/Qwen-reranker`                | `transformers` | Re-ranks retrieved documents for relevance. |
| **Vision Language Model** | `qwen2.5vl:7b`                      | `Ollama`       | Extracts facts from text and images.        |
| **Text Generation**   | `llama3`                            | `Ollama`       | Synthesizes the final answer.               |

## 4. Configuration

All system configurations are centralized in `main.py`.

-   **`OLLAMA_CONFIG`**: Defines the models that will be run via the Ollama server. This includes the final text generation model and the Vision Language Model.
-   **`PIPELINE_CONFIGS`**: Contains the configurations for both the `indexing` and `retrieval` pipelines. Here you can specify:
    -   The paths for the LanceDB database and source documents.
    -   The names of the tables to be used for text and image embeddings.
    -   The Hugging Face model names for the text embedder, vision model, and reranker.
    -   Parameters for the reranker and retrieval process (e.g., `top_k`, `retrieval_k`).

To change a model, simply update the corresponding model name in this configuration file.

## 5. Usage

To run the system, you first need to ensure the required models are available.

### Prerequisites:

1.  **Install Dependencies**:
    ```bash
    pip install -r requirements.txt
    ```
2.  **Download Ollama Models**:
    ```bash
    ollama pull llama3
    ollama pull qwen2.5vl:7b
    ```
3.  **Hugging Face Models**: The `transformers` and `colpali` libraries will automatically download the required models the first time they are used. Ensure you have a stable internet connection.

### Running the System:

1.  **Execute the Main Script**:
    ```bash
    python rag_system/main.py
    ```
2.  **Indexing**: The script will first run the indexing pipeline, processing any documents in the `rag_system/documents` directory and storing their embeddings in LanceDB.
3.  **Querying**: Once indexing is complete, the RAG agent will be ready. You can ask questions about the documents you have indexed.
    ```
    > What was the revenue growth in Q3?
    ```
4.  **Exit**: To stop the agent, type `quit`.


================================================
FILE: rag_system/__init__.py
================================================
import logging
import os

# ---------------------------------------------------------
# Package-wide logging bootstrap for `rag_system`.
# ---------------------------------------------------------
# Verbosity is taken from the RAG_LOG_LEVEL environment variable,
# e.g. `export RAG_LOG_LEVEL=DEBUG` (or INFO / WARNING / ERROR).
# The value is upper-cased before lookup; when the variable is unset,
# or names something the `logging` module does not define, INFO is used.
# ---------------------------------------------------------
_level_str = os.environ.get("RAG_LOG_LEVEL", "INFO").upper()
_level = getattr(logging, _level_str, logging.INFO)

if logging.getLogger().handlers:
    # The root logger already has handlers: keep them and only adjust its level.
    logging.getLogger().setLevel(_level)
else:
    # Nothing configured yet: install the default handler with our format.
    logging.basicConfig(
        format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        level=_level,
    )

logging.getLogger(__name__).debug("Initialized rag_system logging (level=%s)", _level_str)

# ---------------------------------------------------------
# Authenticate to Hugging Face Hub if a token is provided
# ---------------------------------------------------------
from typing import Optional


def _hf_auto_login() -> None:
    """Attempt to authenticate with Hugging Face Hub using an env token.

    We support both the new canonical env var name (HF_TOKEN) and the two
    historical variants to avoid breaking user setups. The login call is
    idempotent: if a cached token already exists, the hub library will simply
    reuse it, so it is safe to run on every import.
    """

    import os

    token: Optional[str] = (
        os.getenv("HF_TOKEN")
        or os.getenv("HUGGINGFACE_HUB_TOKEN")
        or os.getenv("HUGGING_FACE_HUB_TOKEN")
    )

    if not token:
        logging.getLogger(__name__).debug("No Hugging Face token found in env; proceeding anonymously.")
        return

    try:
        from huggingface_hub import login as hf_login

        hf_login(token=token, add_to_git_credential=False)  # type: ignore
        logging.getLogger(__name__).info("Authenticated to Hugging Face Hub via env token.")
    except Exception as exc:  # pragma: no cover – best-effort login
        logging.getLogger(__name__).warning(
            "Failed to login to Hugging Face Hub automatically: %s", exc
        )


# Run on module import. `_hf_auto_login` catches exceptions raised while
# importing `huggingface_hub` or logging in, and reports them as a warning.
_hf_auto_login() 

================================================
FILE: rag_system/agent/__init__.py
================================================


================================================
FILE: rag_system/agent/loop.py
================================================
from typing import Dict, Any, Optional
import json
import time, asyncio, os
import numpy as np
import concurrent.futures
from cachetools import TTLCache, LRUCache
from rag_system.utils.ollama_client import OllamaClient
from rag_system.pipelines.retrieval_pipeline import RetrievalPipeline
from rag_system.agent.verifier import Verifier
from rag_system.retrieval.query_transformer import QueryDecomposer, GraphQueryTranslator
from rag_system.retrieval.retrievers import GraphRetriever

class Agent:
    """
    The main agent, now fully wired to use a live Ollama client.
    """
    def __init__(self, pipeline_configs: Dict[str, Dict], llm_client: OllamaClient, ollama_config: Dict[str, str]):
        """Build the agent's collaborators, caches and routing data.

        Args:
            pipeline_configs: Settings mapping. It is handed as-is to
                ``RetrievalPipeline``; this constructor itself reads the
                optional keys ``semantic_cache_threshold``, ``cache_scope``
                and ``graph_strategy``.
            llm_client: Client passed to the retrieval pipeline, the verifier,
                the query decomposer and, when GraphRAG is enabled, the graph
                query translator.
            ollama_config: Model settings; the ``"generation_model"`` key is
                required.
        """
        self.pipeline_configs = pipeline_configs
        self.llm_client = llm_client
        self.ollama_config = ollama_config

        generation_model = ollama_config["generation_model"]

        # A single retrieval pipeline instance is kept for the agent's lifetime.
        self.retrieval_pipeline = RetrievalPipeline(pipeline_configs, llm_client, ollama_config)

        self.verifier = Verifier(llm_client, generation_model)
        self.query_decomposer = QueryDecomposer(llm_client, generation_model)

        # Query cache (TTLCache, ttl=300). `_cache_max_size` doubles as the
        # limit used by the manual eviction in `_cache_result`.
        self._cache_max_size = 100
        self._query_cache: TTLCache = TTLCache(maxsize=self._cache_max_size, ttl=300)
        self.semantic_cache_threshold = pipeline_configs.get("semantic_cache_threshold", 0.98)
        # "session": semantic-cache hits are limited to the same chat session.
        # "global" (default): cached answers may be reused across sessions.
        self.cache_scope = pipeline_configs.get("cache_scope", "global")

        # Conversation history per session id, bounded to 100 sessions.
        self.chat_histories: LRUCache = LRUCache(maxsize=100)

        graph_settings = pipeline_configs.get("graph_strategy", {})
        if not graph_settings.get("enabled"):
            print("Agent initialized (GraphRAG disabled).")
        else:
            self.graph_query_translator = GraphQueryTranslator(llm_client, generation_model)
            self.graph_retriever = GraphRetriever(graph_settings["graph_path"])
            print("Agent initialized with live GraphRAG capabilities.")

        # ---- Document overviews used for fast routing ----
        self.doc_overviews: list[str] = []
        # Records which overview set is currently loaded.
        self._current_overview_session: str | None = None
        self._global_overview_path = os.path.join("index_store", "overviews", "overviews.jsonl")
        self._load_overviews(self._global_overview_path)

    def _load_overviews(self, path: str):
        """Helper to load overviews from a .jsonl file into self.doc_overviews."""
        import json, os
        self.doc_overviews.clear()
        if not os.path.exists(path):
            return
        try:
            with open(path, encoding="utf-8") as fh:
                for line in fh:
                    try:
                        rec = json.loads(line)
                        if isinstance(rec, dict) and rec.get("overview"):
                            self.doc_overviews.append(rec["overview"].strip())
                    except Exception:
                        continue
            print(f"📖 Loaded {len(self.doc_overviews)} overviews from {path}")
        except Exception as e:
            print(f"⚠️  Failed to load document overviews from {path}: {e}")

    def load_overviews_for_indexes(self, idx_ids: list[str]):
        """Aggregate overviews for the given indexes or fall back to global file."""
        import os, json
        aggregated: list[str] = []
        for idx in idx_ids:
            path = os.path.join("index_store", "overviews", f"{idx}.jsonl")
            if os.path.exists(path):
                try:
                    with open(path, encoding="utf-8") as fh:
                        for line in fh:
                            if not line.strip():
                                continue
                            try:
                                rec = json.loads(line)
                                ov = rec.get("overview", "").strip()
                                if ov:
                                    aggregated.append(ov)
                            except json.JSONDecodeError:
                                continue
                except Exception as e:
                    print(f"⚠️  Error reading {path}: {e}")
        if aggregated:
            self.doc_overviews = aggregated
            self._current_overview_session = "|".join(idx_ids)  # cache composite key so no overwrite
            print(f"📖 Loaded {len(aggregated)} overviews for indexes {[i[:8] for i in idx_ids]}")
        else:
            print(f"⚠️  No per-index overviews found for {idx_ids}. Using global overview file.")
            self._load_overviews(self._global_overview_path)
            self._current_overview_session = "GLOBAL"

    def _cosine_similarity(self, v1: np.ndarray, v2: np.ndarray) -> float:
        """Computes cosine similarity between two vectors."""
        if not isinstance(v1, np.ndarray): v1 = np.array(v1)
        if not isinstance(v2, np.ndarray): v2 = np.array(v2)
        
        if v1.shape != v2.shape:
            raise ValueError("Vectors must have the same shape for cosine similarity.")

        if np.all(v1 == 0) or np.all(v2 == 0):
            return 0.0
            
        dot_product = np.dot(v1, v2)
        norm_v1 = np.linalg.norm(v1)
        norm_v2 = np.linalg.norm(v2)
        
        # Avoid division by zero
        if norm_v1 == 0 or norm_v2 == 0:
            return 0.0
        
        return dot_product / (norm_v1 * norm_v2)

    def _find_in_semantic_cache(self, query_embedding: np.ndarray, session_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """Finds a semantically similar query in the cache."""
        if not self._query_cache or query_embedding is None:
            return None

        for key, cached_item in self._query_cache.items():
            cached_embedding = cached_item.get('embedding')
            if cached_embedding is None:
                continue

            # Respect cache scoping: if scope is session-level, skip results from other sessions
            if self.cache_scope == "session" and session_id is not None:
                if cached_item.get("session_id") != session_id:
                    continue

            try:
                similarity = self._cosine_similarity(query_embedding, cached_embedding)

                if similarity >= self.semantic_cache_threshold:
                    print(f"🚀 Semantic cache hit! Similarity: {similarity:.3f} with cached query '{key}'")
                    return cached_item.get('result')
            except ValueError:
                # In case of shape mismatch, just skip
                continue

        return None

    def _format_query_with_history(self, query: str, history: list) -> str:
        """Formats the user query with conversation history for context."""
        if not history:
            return query
        
        formatted_history = "\n".join([f"User: {turn['query']}\nAssistant: {turn['answer']}" for turn in history])
        
        prompt = f"""
Given the following conversation history, answer the user's latest query. The history provides context for resolving pronouns or follow-up questions.

--- Conversation History ---
{formatted_history}
---

Latest User Query: "{query}"
"""
        return prompt

    # ---------------- Asynchronous triage using Ollama ----------------
    async def _triage_query_async(self, query: str, history: list) -> str:
        
        print(f"🔍 ROUTING DEBUG: Starting triage for query: '{query[:100]}...'")
        
        # 1️⃣ Fast routing using precomputed overviews (if available)
        print(f"📖 ROUTING DEBUG: Attempting overview-based routing...")
        routed = self._route_via_overviews(query)
        if routed:
            print(f"✅ ROUTING DEBUG: Overview routing decided: '{routed}'")
            return routed
        else:
            print(f"❌ ROUTING DEBUG: Overview routing returned None, falling back to LLM triage")

        if history:
            # If there's history, the query is likely a follow-up, so we default to RAG.
            # A more advanced implementation could use an LLM to see if the new query
            # changes the topic entirely.
            print(f"📜 ROUTING DEBUG: History exists, defaulting to 'rag_query'")
            return "rag_query"

        print(f"🤖 ROUTING DEBUG: No history, using LLM fallback triage...")
        prompt = f"""
You are a query routing expert. Analyze the user's question and decide which backend should handle it.

Choose **exactly one** category:

1. "rag_query" – Questions about the user's uploaded documents or specific document content that should be searched. Examples: "What is the invoice amount?", "Summarize the research paper", "What companies are mentioned?"

2. "direct_answer" – General knowledge questions, greetings, or queries unrelated to uploaded documents. Examples: "Who are the CEOs of Tesla and Amazon?", "What is the capital of France?", "Hello", "Explain quantum physics"

3. "graph_query" – Specific factual relations for knowledge-graph lookup (currently limited use)

IMPORTANT: For general world knowledge about well-known companies, people, or facts NOT related to uploaded documents, choose "direct_answer".

User query: "{query}"

Respond with JSON: {{"category": "<your_choice>"}}
"""
        resp = self.llm_client.generate_completion(
            model=self.ollama_config["generation_model"], prompt=prompt, format="json"
        )
        try:
            data = json.loads(resp.get("response", "{}"))
            decision = data.get("category", "rag_query")
            print(f"🤖 ROUTING DEBUG: LLM fallback triage decided: '{decision}'")
            return decision
        except json.JSONDecodeError:
            print(f"❌ ROUTING DEBUG: LLM fallback triage JSON parsing failed, defaulting to 'rag_query'")
            return "rag_query"

    def _run_graph_query(self, query: str, history: list) -> Dict[str, Any]:
        contextual_query = self._format_query_with_history(query, history)
        structured_query = self.graph_query_translator.translate(contextual_query)
        if not structured_query.get("start_node"):
            return self.retrieval_pipeline.run(contextual_query, window_size_override=0)
        results = self.graph_retriever.retrieve(structured_query)
        if not results:
            return self.retrieval_pipeline.run(contextual_query, window_size_override=0)
        answer = ", ".join([res['details']['node_id'] for res in results])
        return {"answer": f"From the knowledge graph: {answer}", "source_documents": results}

    def _get_cache_key(self, query: str, query_type: str) -> str:
        """Generate a cache key for the query"""
        # Simple cache key based on query and type
        return f"{query_type}:{query.strip().lower()}"
    
    def _cache_result(self, cache_key: str, result: Dict[str, Any], session_id: Optional[str] = None):
        """Cache a result with size limit"""
        if len(self._query_cache) >= self._cache_max_size:
            # Remove oldest entry (simple FIFO eviction)
            oldest_key = next(iter(self._query_cache))
            del self._query_cache[oldest_key]
        
        self._query_cache[cache_key] = {
            'result': result,
            'timestamp': time.time(),
            'session_id': session_id
        }

    # ---------------- Public sync API (kept for backwards compatibility) --------------
    def run(self, query: str, table_name: str = None, session_id: str = None, compose_sub_answers: Optional[bool] = None, query_decompose: Optional[bool] = None, ai_rerank: Optional[bool] = None, context_expand: Optional[bool] = None, verify: Optional[bool] = None, retrieval_k: Optional[int] = None, context_window_size: Optional[int] = None, reranker_top_k: Optional[int] = None, search_type: Optional[str] = None, dense_weight: Optional[float] = None, max_retries: int = 1, event_callback: Optional[callable] = None) -> Dict[str, Any]:
        """Synchronous helper. If *event_callback* is supplied, important
        milestones will be forwarded to that callable as

            event_callback(phase:str, payload:Any)
        """
        return asyncio.run(self._run_async(query, table_name, session_id, compose_sub_answers, query_decompose, ai_rerank, context_expand, verify, retrieval_k, context_window_size, reranker_top_k, search_type, dense_weight, max_retries, event_callback))

    # ---------------- Main async implementation --------------------------------------
    async def _run_async(self, query: str, table_name: str = None, session_id: str = None, compose_sub_answers: Optional[bool] = None, query_decompose: Optional[bool] = None, ai_rerank: Optional[bool] = None, context_expand: Optional[bool] = None, verify: Optional[bool] = None, retrieval_k: Optional[int] = None, context_window_size: Optional[int] = None, reranker_top_k: Optional[int] = None, search_type: Optional[str] = None, dense_weight: Optional[float] = None, max_retries: int = 1, event_callback: Optional[callable] = None) -> Dict[str, Any]:
        start_time = time.time()
        
        # Emit analyze event at the start
        if event_callback:
            event_callback("analyze", {"query": query})
        
        # 🚀 NEW: Get conversation history
        history = self.chat_histories.get(session_id, []) if session_id else []
        
        # 🔄 Refresh overviews for this session if available
        # if session_id and session_id != getattr(self, "_current_overview_session", None):
        #     candidate_path = os.path.join("index_store", "overviews", f"{session_id}.jsonl")
        #     if os.path.exists(candidate_path):
        #         self._load_overviews(candidate_path)
        #         self._current_overview_session = session_id
        #     else:
        #         # Fall back to global overviews if per-session file not found
        #         if self._current_overview_session != "GLOBAL":
        #             self._load_overviews(self._global_overview_path)
        #             self._current_overview_session = "GLOBAL"
        
        query_type = await self._triage_query_async(query, history)
        print(f"🎯 ROUTING DEBUG: Final triage decision: '{query_type}'")
        print(f"Agent Triage Decision: '{query_type}'")
        
        # Create a contextual query that includes history for most operations
        contextual_query = self._format_query_with_history(query, history)
        raw_query = query.strip()
        
        # --- Apply runtime AI reranker override (must happen before any retrieval calls) ---
        if ai_rerank is not None:
            rr_cfg = self.retrieval_pipeline.config.setdefault("reranker", {})
            rr_cfg["enabled"] = bool(ai_rerank)
            if ai_rerank:
                # Ensure the pipeline knows to use the external ColBERT reranker
                rr_cfg.setdefault("type", "ai")
                rr_cfg.setdefault("strategy", "rerankers-lib")
                rr_cfg.setdefault(
                    "model_name",
                    # Falls back to ColBERT-small if the caller did not supply one
                    self.ollama_config.get("rerank_model", "answerai-colbert-small-v1"),
                )

        # --- Apply runtime retrieval configuration overrides ---
        if retrieval_k is not None:
            self.retrieval_pipeline.config["retrieval_k"] = retrieval_k
            print(f"🔍 Retrieval K set to: {retrieval_k}")
            
        if context_window_size is not None:
            self.retrieval_pipeline.config["context_window_size"] = context_window_size
            print(f"🔍 Context window size set to: {context_window_size}")
            
        if reranker_top_k is not None:
            rr_cfg = self.retrieval_pipeline.config.setdefault("reranker", {})
            rr_cfg["top_k"] = reranker_top_k
            print(f"🔍 Reranker top K set to: {reranker_top_k}")
            
        if search_type is not None:
            retrieval_cfg = self.retrieval_pipeline.config.setdefault("retrieval", {})
            retrieval_cfg["search_type"] = search_type
            print(f"🔍 Search type set to: {search_type}")
            
        if dense_weight is not None:
            dense_cfg = self.retrieval_pipeline.config.setdefault("retrieval", {}).setdefault("dense", {})
            dense_cfg["weight"] = dense_weight
            print(f"🔍 Dense search weight set to: {dense_weight}")

        query_embedding = None
        # 🚀 OPTIMIZED: Semantic Cache Check
        if query_type != "direct_answer":
            text_embedder = self.retrieval_pipeline._get_text_embedder()
            if text_embedder:
                # The embedder expects a list, so we wrap the *raw* query only.
                query_embedding_list = text_embedder.create_embeddings([raw_query])
                if isinstance(query_embedding_list, np.ndarray):
                    query_embedding = query_embedding_list[0]
                else:
                    # Some embedders return a list – convert if necessary
                    query_embedding = np.array(query_embedding_list[0])

                cached_result = self._find_in_semantic_cache(query_embedding, session_id)

                if cached_result:
                    # Update history even on cache hit
                    if session_id:
                        history.append({"query": query, "answer": cached_result.get('answer', 'Cached answer not found.')})
                        self.chat_histories[session_id] = history
                    return cached_result

        if query_type == "direct_answer":
            print(f"✅ ROUTING DEBUG: Executing DIRECT_ANSWER path")
            if event_callback:
                event_callback("direct_answer", {})

            prompt = (
                "You are a helpful assistant. Read the conversation history below. "
                "If the answer to the user's latest question is already present in the history, quote it concisely. "
                "Otherwise answer from your general world knowledge. Provide a short, factual reply (1‒2 sentences).\n\n"
                f"Conversation + Latest Question:\n{contextual_query}\n\nAssistant:"
            )

            async def _run_stream():
                answer_parts: list[str] = []

                def _blocking_stream():
                    for tok in self.llm_client.stream_completion(
                        model=self.ollama_config["generation_model"], prompt=prompt
                    ):
                        answer_parts.append(tok)
                        if event_callback:
                            event_callback("token", {"text": tok})

                # Run the blocking generator in a thread so the event loop stays responsive
                await asyncio.to_thread(_blocking_stream)
                return "".join(answer_parts)

            final_answer = await _run_stream()
            result = {"answer": final_answer, "source_documents": []}
        
        elif query_type == "graph_query" and hasattr(self, 'graph_retriever'):
            print(f"✅ ROUTING DEBUG: Executing GRAPH_QUERY path")
            result = self._run_graph_query(query, history)

        # --- RAG Query Processing with Optional Query Decomposition ---
        else: # Default to rag_query
            print(f"✅ ROUTING DEBUG: Executing RAG_QUERY path (query_type='{query_type}')")
            query_decomp_config = self.pipeline_configs.get("query_decomposition", {})
            decomp_enabled = query_decomp_config.get("enabled", False)
            if query_decompose is not None:
                decomp_enabled = query_decompose

            if decomp_enabled:
                print(f"\n--- Query Decomposition Enabled ---")
                # Use the raw user query (without conversation history) for decomposition to avoid leakage of prior context
                # Pass the last 5 conversation turns for context resolution within the decomposer
                recent_history = history[-5:] if history else []
                sub_queries = self.query_decomposer.decompose(raw_query, recent_history)
                if event_callback:
                    event_callback("decomposition", {"sub_queries": sub_queries})
                print(f"Original query: '{query}' (Contextual: '{contextual_query}')")
                print(f"Decomposed into {len(sub_queries)} sub-queries: {sub_queries}")
                
                # Emit retrieval_started event before any retrievals
                if event_callback:
                    event_callback("retrieval_started", {"count": len(sub_queries)})
                
                # If decomposition produced only a single sub-query, skip the
                # parallel/composition machinery for efficiency.
                if len(sub_queries) == 1:
                    print("--- Only one sub-query after decomposition; using direct retrieval path ---")
                    result = self.retrieval_pipeline.run(
                        sub_queries[0],
                        table_name,
                        0 if context_expand is False else None,
                        event_callback=event_callback
                    )
                    if event_callback:
                        event_callback("single_query_result", result)
                    # Emit retrieval_done and rerank_done for single sub-query
                    if event_callback:
                        event_callback("retrieval_done", {"count": 1})
                        event_callback("rerank_started", {"count": 1})
                        event_callback("rerank_done", {"count": 1})
                else:
                    compose_from_sub_answers = query_decomp_config.get("compose_from_sub_answers", True)
                    if compose_sub_answers is not None:
                        compose_from_sub_answers = compose_sub_answers

                    print(f"\n--- Processing {len(sub_queries)} sub-queries in parallel ---")
                    start_time_inner = time.time()

                    # Shared containers
                    sub_answers = []  # For two-stage composition
                    all_source_docs = []  # For single-stage aggregation
                    citations_seen = set()

                    # Emit rerank_started event before parallel retrievals (since each sub-query will rerank)
                    if event_callback:
                        event_callback("rerank_started", {"count": len(sub_queries)})

                    # Emit token chunks as soon as we receive them. The UI
                    # keeps answers separated by `index`, so interleaving is
                    # harmless and gives continuous feedback.

                    def make_cb(idx: int):
                        def _cb(ev_type: str, payload):
                            if event_callback is None:
                                return
                            if ev_type == "token":
                                event_callback("sub_query_token", {"index": idx, "text": payload.get("text", ""), "question": sub_queries[idx]})
                            else:
                                event_callback(ev_type, payload)
                        return _cb

                    with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sub_queries))) as executor:
                        future_to_query = {
                            executor.submit(
                                self.retrieval_pipeline.run,
                                sub_query,
                                table_name,
                                0 if context_expand is False else None,
                                make_cb(i),
                            ): (i, sub_query)
                            for i, sub_query in enumerate(sub_queries)
                        }

                        for future in concurrent.futures.as_completed(future_to_query):
                            i, sub_query = future_to_query[future]
                            try:
                                sub_result = future.result()
                                print(f"✅ Sub-Query {i+1} completed: '{sub_query}'")

                                if event_callback:
                                    event_callback("sub_query_result", {
                                        "index": i,
                                        "query": sub_query,
                                        "answer": sub_result.get("answer", ""),
                                        "source_documents": sub_result.get("source_documents", []),
                                    })

                                if compose_from_sub_answers:
                                    sub_answers.append({
                                        "question": sub_query,
                                        "answer": sub_result.get("answer", "")
                                    })
                                    # Keep up to 5 citations per sub-query for traceability
                                    for doc in sub_result.get("source_documents", [])[:5]:
                                        if doc['chunk_id'] not in citations_seen:
                                            all_source_docs.append(doc)
                                            citations_seen.add(doc['chunk_id'])
                                else:
                                    # Aggregate unique docs (single-stage path)
                                    for doc in sub_result.get('source_documents', []):
                                        if doc['chunk_id'] not in citations_seen:
                                            all_source_docs.append(doc)
                                            citations_seen.add(doc['chunk_id'])
                            except Exception as e:
                                print(f"❌ Sub-Query {i+1} failed: '{sub_query}' - {e}")

                    parallel_time = time.time() - start_time_inner
                    print(f"🚀 Parallel processing completed in {parallel_time:.2f}s")

                    # Emit retrieval_done and rerank_done after all sub-queries are processed
                    if event_callback:
                        event_callback("retrieval_done", {"count": len(sub_queries)})
                        event_callback("rerank_done", {"count": len(sub_queries)})

                    if compose_from_sub_answers:
                        print("\n--- Composing final answer from sub-answers ---")
                        compose_prompt = f"""
You are an expert answer composer for a Retrieval-Augmented Generation (RAG) system.

Context:
• The ORIGINAL QUESTION from the user is shown below.
• That question was automatically decomposed into simpler SUB-QUESTIONS.
• Each sub-question has already been answered by an earlier step and the resulting Question→Answer pairs are provided to you in JSON.

Your task:
1. Read every sub-answer carefully.
2. Write a single, final answer to the ORIGINAL QUESTION **using only the information contained in the sub-answers**. Do NOT invent facts that are not present.
3. If the original question includes a comparison (e.g., "Which, A or B, …") clearly state the outcome (e.g., "A > B"). Quote concrete numbers when available.
4. If any aspect of the original question cannot be answered with the given sub-answers, explicitly say so (e.g., "The provided context does not mention …").
5. Keep the answer concise (≤ 5 sentences) and use a factual, third-person tone.

Input
------
ORIGINAL QUESTION:
"{contextual_query}"

SUB-ANSWERS (JSON):
{json.dumps(sub_answers, indent=2)}

------
FINAL ANSWER:
"""
                        # --- Stream composition answer token-by-token ---
                        answer_parts: list[str] = []

                        for tok in self.llm_client.stream_completion(
                            model=self.ollama_config["generation_model"],
                            prompt=compose_prompt,
                        ):
                            answer_parts.append(tok)
                            if event_callback:
                                event_callback("token", {"text": tok})

                        final_answer = "".join(answer_parts) or "Unable to generate an answer."

                        result = {
                            "answer": final_answer,
                            "source_documents": all_source_docs
                        }
                        if event_callback:
                            event_callback("final_answer", result)
                    else:
                        print(f"\n--- Aggregated {len(all_source_docs)} unique documents from all sub-queries ---")

                        if all_source_docs:
                            aggregated_context = "\n\n".join([doc['text'] for doc in all_source_docs])
                            final_answer = self.retrieval_pipeline._synthesize_final_answer(contextual_query, aggregated_context)
                            result = {
                                "answer": final_answer,
                                "source_documents": all_source_docs
                            }
                            if event_callback:
                                event_callback("final_answer", result)
                        else:
                            result = {
                                "answer": "I could not find relevant information to answer your question.",
                                "source_documents": []
                            }
                            if event_callback:
                                event_callback("final_answer", result)
            else:
                # Standard retrieval (single-query)
                retrieved_docs = (self.retrieval_pipeline.retriever.retrieve(
                    text_query=contextual_query,
                    table_name=table_name or self.retrieval_pipeline.storage_config["text_table_name"],
                    k=self.retrieval_pipeline.config.get("retrieval_k", 10),
                ) if hasattr(self.retrieval_pipeline, "retriever") and self.retrieval_pipeline.retriever else [])

                print("\n=== DEBUG: Original retrieval order ===")
                for i, d in enumerate(retrieved_docs[:10]):
                    snippet = (d.get('text','') or '')[:200].replace('\n',' ')
                    print(f"Orig[{i}] id={d.get('chunk_id')} dist={d.get('_distance','') or d.get('score','')}  {snippet}")

                result = self.retrieval_pipeline.run(contextual_query, table_name, 0 if context_expand is False else None, event_callback=event_callback)

                # After run, result['source_documents'] is reranked list
                reranked_docs = result.get('source_documents', [])
                print("\n=== DEBUG: Reranked docs order ===")
                for i, d in enumerate(reranked_docs[:10]):
                    snippet = (d.get('text','') or '')[:200].replace('\n',' ')
                    print(f"ReRank[{i}] id={d.get('chunk_id')} score={d.get('rerank_score','')} {snippet}")
        
        # Verification step (simplified for now) - Skip in fast mode
        verification_enabled = self.pipeline_configs.get("verification", {}).get("enabled", True)
        if verify is not None:
            verification_enabled = verify
            
        if verification_enabled and result.get("source_documents"):
            context_str = "\n".join([doc['text'] for doc in result['source_documents']])
            verification = await self.verifier.verify_async(contextual_query, context_str, result['answer'])
            
            score = verification.confidence_score

            # Only include confidence details if we received a non-zero score (0 usually means JSON parse failure)
            if score > 0:
                result['answer'] += f" [Confidence: {score}%]"
                # Add warning only when the verifier explicitly reported low confidence / not grounded
                if (not verification.is_grounded) or score < 50:
                    result['answer'] += f" [Warning: Low confidence. Groundedness: {verification.is_grounded}]"
            else:
                # Skip appending any verifier note – 0 likely indicates a parser error
                print("⚠️  Verifier returned 0 confidence – likely JSON parse error; omitting tags.")
        else:
            print("🚀 Skipping verification for speed or lack of sources")
        
        # 🚀 NEW: Update history
        if session_id:
            history.append({"query": query, "answer": result['answer']})
            self.chat_histories[session_id] = history
            
        # 🚀 OPTIMIZED: Cache the result for future queries
        if query_type != "direct_answer" and query_embedding is not None:
            cache_key = raw_query  # Key is for logging/debugging
            self._query_cache[cache_key] = {
                "embedding": query_embedding,
                "result": result,
                "session_id": session_id,
            }
        
        total_time = time.time() - start_time
        print(f"🚀 Total query processing time: {total_time:.2f}s")
        
        return result

    # ------------------------------------------------------------------
    def _route_via_overviews(self, query: str) -> str | None:
        """Use document overviews and a small model to decide routing.
        Returns 'rag_query', 'direct_answer', or None if unsure/disabled."""
        if not self.doc_overviews:
            print(f"📖 ROUTING DEBUG: No document overviews available, returning None")
            return None
        
        print(f"📖 ROUTING DEBUG: Found {len(self.doc_overviews)} document overviews, using LLM routing...")

        # Keep prompt concise: if more than 40 overviews, take first 40
        overviews_snip = self.doc_overviews[:40]
        overviews_block = "\n".join(f"[{i+1}] {ov}" for i, ov in enumerate(overviews_snip))

        router_prompt = f"""Task: Route query to correct system.

Documents available: Invoices, DeepSeek-V3 research papers

Query: "{query}"

Is this query asking about:
A) Greetings/social: "Hi", "Hello", "Thanks", "What's up", "How are you"
B) General knowledge: "CEO of Tesla", "capital of France", "what is 2+2"  
C) Document content: invoice amounts, DeepSeek-V3 details, companies mentioned

If A or B → {{"category": "direct_answer"}}
If C → {{"category": "rag_query"}}

Response:"""
        
        resp = self.llm_client.generate_completion(
            model=self.ollama_config["generation_model"], prompt=router_prompt, format="json"
        )
        try:
            raw_response = resp.get("response", "{}")
            print(f"📖 ROUTING DEBUG: Overview LLM raw response: '{raw_response[:200]}...'")
            data = json.loads(raw_response)
            decision = data.get("category", "rag_query")
            print(f"📖 ROUTING DEBUG: Overview routing final decision: '{decision}'")
            return decision
        except json.JSONDecodeError as e:
            print(f"❌ ROUTING DEBUG: Overview routing JSON parsing failed: {e}, defaulting to 'rag_query'")
            return "rag_query"


================================================
FILE: rag_system/agent/verifier.py
================================================
import json
from rag_system.utils.ollama_client import OllamaClient

class VerificationResult:
    """Plain container for the outcome of one grounding check.

    The four constructor arguments are stored verbatim as same-named
    attributes; no validation or conversion is performed here.
    """

    def __init__(self, is_grounded: bool, reasoning: str, verdict: str, confidence_score: int):
        (
            self.is_grounded,
            self.reasoning,
            self.verdict,
            self.confidence_score,
        ) = (is_grounded, reasoning, verdict, confidence_score)

class Verifier:
    """
    Verifies if a generated answer is grounded in the provided context using Ollama.
    """
    def __init__(self, llm_client: OllamaClient, llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model
        print(f"Initialized Verifier with Ollama model '{self.llm_model}'.")

    # Synchronous verify() method removed – async version is used everywhere.

    # --- Async wrapper ------------------------------------------------
    async def verify_async(self, query: str, context: str, answer: str) -> VerificationResult:
        """Async variant that calls the Ollama client asynchronously."""
        prompt = f"""
        You are an automated fact-checker. Determine whether the ANSWER is fully supported by the CONTEXT and output a single line of JSON.

        # EXAMPLES

        <QUERY>
        What color is the sky?
        </QUERY>
        <CONTEXT>
        During the day, the sky appears blue due to Rayleigh scattering.
        </CONTEXT>
        <ANSWER>
        The sky is blue during the day.
        </ANSWER>
        <OUTPUT>
        {{"verdict": "SUPPORTED", "is_grounded": true, "reasoning": "The context explicitly supports that the sky is blue during the day.", "confidence_score": 100}}
        </OUTPUT>

        <QUERY>
        Where are apples and oranges grown?
        </QUERY>
        <CONTEXT>
        Apples are grown in orchards.
        </CONTEXT>
        <ANSWER>
        Apples are grown in orchards and oranges are grown in groves.
        </ANSWER>
        <OUTPUT>
        {{"verdict": "NOT_SUPPORTED", "is_grounded": false, "reasoning": "The context mentions orchards, but not oranges or groves.", "confidence_score": 80}}
        </OUTPUT>

        <QUERY>
        How long is the process?
        </QUERY>
        <CONTEXT>
        The first step takes 3 days. The second step takes 5 days.
        </CONTEXT>
        <ANSWER>
        The process takes 3 days.
        </ANSWER>
        <OUTPUT>
        {{"verdict": "NEEDS_CLARIFICATION", "is_grounded": false, "reasoning": "The answer omits the 5 days required for the second step.", "confidence_score": 70}}
        </OUTPUT>

        # TASK

        <QUERY>
        "{query}"
        </QUERY>
        <CONTEXT>
        """
        prompt += context[:4000]  # Clamp to avoid huge prompts
        prompt += """
        </CONTEXT>
        <ANSWER>
        """
        prompt += answer
        prompt += """
        </ANSWER>
        <OUTPUT>
        """
        resp = await self.llm_client.generate_completion_async(self.llm_model, prompt, format="json")
        try:
            data = json.loads(resp.get("response", "{}"))
            return VerificationResult(
                is_grounded=data.get("is_grounded", False),
                reasoning=data.get("reasoning", "async parse error"),
                verdict=data.get("verdict", "NOT_SUPPORTED"),
                confidence_score=data.get('confidence_score', 0)
            )
        except (json.JSONDecodeError, AttributeError):
            return VerificationResult(False, "Failed async parse", "NOT_SUPPORTED", 0)


================================================
FILE: rag_system/api_server.py
================================================
import json
import http.server
import socketserver
from urllib.parse import urlparse, parse_qs
import os
import requests
import sys
import logging

# Add backend directory to path for database imports
backend_dir = os.path.join(os.path.dirname(__file__), '..', 'backend')
if backend_dir not in sys.path:
    sys.path.append(backend_dir)

from backend.database import ChatDatabase, generate_session_title
from rag_system.main import get_agent
from rag_system.factory import get_indexing_pipeline

# Initialize database connection once at module level
# Use auto-detection for environment-appropriate path
db = ChatDatabase()

# Get the desired agent mode from environment variables, defaulting to 'default'
# This allows us to easily switch between 'default', 'fast', 'react', etc.
AGENT_MODE = os.getenv("RAG_CONFIG_MODE", "default")

# --- Global Singleton for the RAG Agent ---
# The agent is initialized once when the server starts.
# This avoids reloading all the models on every request.
# The banner is printed *before* the agent is built so the log reflects what
# the process is actually doing while it appears to hang at start-up.
print("🧠 Initializing RAG Agent with MAXIMUM ACCURACY... (This may take a moment)")
RAG_AGENT = get_agent(AGENT_MODE)
if RAG_AGENT is None:
    print("❌ Critical error: RAG Agent could not be initialized. Exiting.")
    # sys.exit instead of the bare exit() helper, which is injected by the
    # `site` module and is not guaranteed to exist in every interpreter setup.
    sys.exit(1)
print("✅ RAG Agent initialized successfully with MAXIMUM ACCURACY.")

# Built only once the agent is known to be usable.
INDEXING_PIPELINE = get_indexing_pipeline(AGENT_MODE)
# ---

# Add helper near top after db & agent init
# -------------- Helper ----------------

def _apply_index_embedding_model(idx_ids):
    """Ensure retrieval pipeline uses the embedding model stored with the first index."""
    debug_info = f"🔧 _apply_index_embedding_model called with idx_ids: {idx_ids}\n"
    
    if not idx_ids:
        debug_info += "⚠️ No index IDs provided\n"
        with open("logs/embedding_debug.log", "a") as f:
            f.write(debug_info)
        return
    try:
        idx = db.get_index(idx_ids[0])
        debug_info += f"🔧 Retrieved index: {idx.get('id')} with metadata: {idx.get('metadata', {})}\n"
        model = (idx.get("metadata") or {}).get("embedding_model")
        debug_info += f"🔧 Embedding model from metadata: {model}\n"
        if model:
            rp = RAG_AGENT.retrieval_pipeline
            current_model = rp.config.get("embedding_model_name")
            debug_info += f"🔧 Current embedding model: {current_model}\n"
            rp.update_embedding_model(model)
            debug_info += f"🔧 Updated embedding model to: {model}\n"
        else:
            debug_info += "⚠️ No embedding model found in metadata\n"
    except Exception as e:
        debug_info += f"⚠️ Could not apply index embedding model: {e}\n"
    
    # Write debug info to file
    with open("logs/embedding_debug.log", "a") as f:
        f.write(debug_info)

def _get_table_name_for_session(session_id):
    """Get the correct vector table name for a session by looking up its linked indexes."""
    logger = logging.getLogger(__name__)
    
    if not session_id:
        logger.info("❌ No session_id provided")
        return None
    
    try:
        # Get indexes linked to this session
        idx_ids = db.get_indexes_for_session(session_id)
        logger.info(f"🔍 Session {session_id[:8]}... has {len(idx_ids)} indexes: {idx_ids}")
        
        if not idx_ids:
            logger.warning(f"⚠️ No indexes found for session {session_id}")
            # Use the default table name from config instead of session-specific name
            from rag_system.main import PIPELINE_CONFIGS
            default_table = PIPELINE_CONFIGS["default"]["storage"]["text_table_name"]
            logger.info(f"📊 Using default table '{default_table}' for session {session_id[:8]}...")
            return default_table
        
        # Use the first index's vector table name
        idx = db.get_index(idx_ids[0])
        if idx and idx.get('vector_table_name'):
            table_name = idx['vector_table_name']
            logger.info(f"📊 Using table '{table_name}' for session {session_id[:8]}...")
            print(f"📊 RAG API: Using table '{table_name}' for session {session_id[:8]}...")
            return table_name
        else:
            logger.warning(f"⚠️ Index found but no vector table name for session {session_id}")
            # Use the default table name from config instead of session-specific name
            from rag_system.main import PIPELINE_CONFIGS
            default_table = PIPELINE_CONFIGS["default"]["storage"]["text_table_name"]
            logger.info(f"📊 Using default table '{default_table}' for session {session_id[:8]}...")
            return default_table
            
    except Exception as e:
        logger.error(f"❌ Error getting table name for session {session_id}: {e}")
        # Use the default table name from config instead of session-specific name
        from rag_system.main import PIPELINE_CONFIGS
        default_table = PIPELINE_CONFIGS["default"]["storage"]["text_table_name"]
        logger.info(f"📊 Using default table '{default_table}' for session {session_id[:8]}...")
        return default_table

class AdvancedRagApiHandler(http.server.BaseHTTPRequestHandler):
    def do_OPTIONS(self):
        """Reply to a CORS preflight: 200 with permissive CORS headers and no body."""
        cors_headers = (
            ('Access-Control-Allow-Origin', '*'),
            ('Access-Control-Allow-Methods', 'POST, OPTIONS'),
            ('Access-Control-Allow-Headers', 'Content-Type'),
        )
        self.send_response(200)
        for header_name, header_value in cors_headers:
            self.send_header(header_name, header_value)
        self.end_headers()

    def do_POST(self):
        """Dispatch a POST by URL path to the chat, chat-stream or index handler; 404 otherwise."""
        # Handler names are resolved lazily so only the matched one is looked up.
        post_routes = {
            '/chat': 'handle_chat',
            '/chat/stream': 'handle_chat_stream',
            '/index': 'handle_index',
        }
        handler_name = post_routes.get(urlparse(self.path).path)
        if handler_name is None:
            self.send_json_response({"error": "Not Found"}, status_code=404)
            return
        getattr(self, handler_name)()

    def do_GET(self):
        """Serve GET /models via handle_models(); every other path gets a JSON 404."""
        requested_path = urlparse(self.path).path
        if requested_path != '/models':
            self.send_json_response({"error": "Not Found"}, status_code=404)
            return
        self.handle_models()

    def handle_chat(self):
        """Handle ``POST /chat``: run one query through the RAG stack and reply with JSON.

        The request body must be a JSON object. Keys read here:
        ``query`` (required), ``session_id``, ``table_name``, ``model``,
        ``force_rag``, ``compose_sub_answers``, ``query_decompose``,
        ``ai_rerank``, ``context_expand``, ``verify``, ``retrieval_k``,
        ``context_window_size``, ``reranker_top_k``, ``search_type``,
        ``dense_weight``, ``provence_prune`` and ``provence_threshold``.

        Responses:
            400 for a bad ``Content-Length``, an undecodable / non-object body
            or a missing query; 500 with the error text for any other failure;
            otherwise the result dict of the retrieval pipeline (``force_rag``)
            or of the agent.

        Side effects: mutates the shared ``RAG_AGENT`` configuration with the
        per-request overrides, and — when a session id is given — stores the
        user message and the answer through ``db`` on a best-effort basis.
        """
        # Upper bound (seconds) on the best-effort session lookup, so a stalled
        # backend cannot hang the chat request indefinitely.
        backend_timeout_s = 5

        try:
            try:
                content_length = int(self.headers.get('Content-Length') or 0)
            except (TypeError, ValueError):
                content_length = -1
            if content_length < 0:
                # A negative length would make rfile.read() block until EOF.
                self.send_json_response({"error": "Invalid Content-Length header"}, status_code=400)
                return

            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))
            if not isinstance(data, dict):
                self.send_json_response({"error": "Request body must be a JSON object"}, status_code=400)
                return

            # Validate before touching any shared state.
            query = data.get('query')
            if not query:
                self.send_json_response({"error": "Query is required"}, status_code=400)
                return

            session_id = data.get('session_id')
            compose_flag = data.get('compose_sub_answers')
            decomp_flag = data.get('query_decompose')
            ai_rerank_flag = data.get('ai_rerank')
            ctx_expand_flag = data.get('context_expand')
            verify_flag = data.get('verify')
            
            # ✨ NEW RETRIEVAL PARAMETERS
            retrieval_k = data.get('retrieval_k', 20)
            context_window_size = data.get('context_window_size', 1)
            reranker_top_k = data.get('reranker_top_k', 10)
            search_type = data.get('search_type', 'hybrid')
            dense_weight = data.get('dense_weight', 0.7)
            
            # 🚩 NEW: Force RAG override from frontend
            force_rag = bool(data.get('force_rag', False))
            
            # 🌿 Provence sentence pruning
            provence_prune = data.get('provence_prune')
            provence_threshold = data.get('provence_threshold')
            
            # User-selected generation model
            requested_model = data.get('model')
            if isinstance(requested_model, str) and requested_model:
                RAG_AGENT.ollama_config['generation_model'] = requested_model

            # 🔄 UPDATE SESSION TITLE: If this is the first message in the session, update the title
            if session_id:
                try:
                    # Check if this is the first message by calling the backend server
                    backend_url = f"http://localhost:8000/sessions/{session_id}"
                    session_resp = requests.get(backend_url, timeout=backend_timeout_s)
                    if session_resp.status_code == 200:
                        session_data = session_resp.json()
                        session = session_data.get('session', {})
                        # If message_count is 0, this is the first message
                        if session.get('message_count', 0) == 0:
                            # Generate a title from the first message
                            title = generate_session_title(query)
                            # Direct database call: a temporary solution until the
                            # backend exposes a proper title-update endpoint.
                            db.update_session_title(session_id, title)
                            print(f"📝 Updated session title to: {title}")

                        # 💾 STORE USER MESSAGE: stored whether or not this was the first message
                        user_message_id = db.add_message(session_id, query, "user")
                        print(f"💾 Stored user message: {user_message_id}")
                except Exception as e:
                    print(f"⚠️ Failed to update session title or store user message: {e}")
                    # Continue with the request even if title update fails

            # Allow explicit table_name override
            table_name = data.get('table_name')
            if not table_name and session_id:
                table_name = _get_table_name_for_session(session_id)

            rp_cfg = RAG_AGENT.retrieval_pipeline.config

            # Provence overrides apply on both execution paths
            if provence_prune is not None:
                rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune)
            if provence_threshold is not None:
                rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold)

            # Decide execution path
            print(f"🔧 Force RAG flag: {force_rag}")
            if force_rag:
                # --- Apply runtime overrides manually because we skip Agent.run()
                if retrieval_k is not None:
                    rp_cfg["retrieval_k"] = retrieval_k
                if reranker_top_k is not None:
                    rp_cfg.setdefault("reranker", {})["top_k"] = reranker_top_k
                if search_type is not None:
                    rp_cfg.setdefault("retrieval", {})["search_type"] = search_type
                if dense_weight is not None:
                    rp_cfg.setdefault("retrieval", {}).setdefault("dense", {})["weight"] = dense_weight

                # 🔄 Apply embedding model for this session (same as in agent path)
                if session_id:
                    idx_ids = db.get_indexes_for_session(session_id)
                    _apply_index_embedding_model(idx_ids)

                # Directly invoke retrieval pipeline to bypass triage
                result = RAG_AGENT.retrieval_pipeline.run(
                    query,
                    table_name=table_name,
                    window_size_override=context_window_size,
                )
            else:
                # Use full agent with smart routing
                if session_id:
                    # 🔄 Refresh document overviews for this session
                    idx_ids = db.get_indexes_for_session(session_id)
                    _apply_index_embedding_model(idx_ids)
                    RAG_AGENT.load_overviews_for_indexes(idx_ids)

                    # 🔧 Set index-specific overview path
                    rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl"

                # 🔧 Configure late chunking
                rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True

                result = RAG_AGENT.run(
                    query,
                    table_name=table_name,
                    session_id=session_id,
                    compose_sub_answers=compose_flag,
                    query_decompose=decomp_flag,
                    ai_rerank=ai_rerank_flag,
                    context_expand=ctx_expand_flag,
                    verify=verify_flag,
                    retrieval_k=retrieval_k,
                    context_window_size=context_window_size,
                    reranker_top_k=reranker_top_k,
                    search_type=search_type,
                    dense_weight=dense_weight,
                )
            
            # The result is a dict, so we need to dump it to a JSON string
            self.send_json_response(result)
            
            # 💾 STORE AI RESPONSE: Add the AI response to the database
            if session_id and result and result.get("answer"):
                try:
                    ai_message_id = db.add_message(session_id, result["answer"], "assistant")
                    print(f"💾 Stored AI response: {ai_message_id}")
                except Exception as e:
                    print(f"⚠️ Failed to store AI response: {e}")
                    # Continue even if storage fails

        except (json.JSONDecodeError, UnicodeDecodeError):
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500)

    def handle_chat_stream(self):
        """Stream internal phases and final answer using SSE (text/event-stream)."""
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))

            query = data.get('query')
            session_id = data.get('session_id')
            compose_flag = data.get('compose_sub_answers')
            decomp_flag = data.get('query_decompose')
            ai_rerank_flag = data.get('ai_rerank')
            ctx_expand_flag = data.get('context_expand')
            verify_flag = data.get('verify')
            
            # ✨ NEW RETRIEVAL PARAMETERS
            retrieval_k = data.get('retrieval_k', 20)
            context_window_size = data.get('context_window_size', 1)
            reranker_top_k = data.get('reranker_top_k', 10)
            search_type = data.get('search_type', 'hybrid')
            dense_weight = data.get('dense_weight', 0.7)

            # 🚩 NEW: Force RAG override from frontend
            force_rag = bool(data.get('force_rag', False))

            # 🌿 Provence sentence pruning
            provence_prune = data.get('provence_prune')
            provence_threshold = data.get('provence_threshold')

            # User-selected generation model
            requested_model = data.get('model')
            if isinstance(requested_model,str) and requested_model:
                RAG_AGENT.ollama_config['generation_model']=requested_model

            if not query:
                self.send_json_response({"error": "Query is required"}, status_code=400)
                return

            # 🔄 UPDATE SESSION TITLE: If this is the first message in the session, update the title
            if session_id:
                try:
                    # Check if this is the first message by calling the backend server
                    backend_url = f"http://localhost:8000/sessions/{session_id}"
                    session_resp = requests.get(backend_url)
                    if session_resp.status_code == 200:
                        session_data = session_resp.json()
                        session = session_data.get('session', {})
                        # If message_count is 0, this is the first message
                        if session.get('message_count', 0) == 0:
                            # Generate a title from the first message
                            title = generate_session_title(query)
                            # Update the session title via backend API
                            # We'll need to add this endpoint to the backend, for now let's make a direct database call
                            # This is a temporary solution until we add a proper API endpoint
                            db.update_session_title(session_id, title)
                            print(f"📝 Updated session title to: {title}")
                            
                            # 💾 STORE USER MESSAGE: Add the user message to the database
                            user_message_id = db.add_message(session_id, query, "user")
                            print(f"💾 Stored user message: {user_message_id}")
                        else:
                            # Not the first message, but still store the user message
                            user_message_id = db.add_message(session_id, query, "user")
                            print(f"💾 Stored user message: {user_message_id}")
                except Exception as e:
                    print(f"⚠️ Failed to update session title or store user message: {e}")
                    # Continue with the request even if title update fails

            # Allow explicit table_name override
            table_name = data.get('table_name')
            if not table_name and session_id:
                table_name = _get_table_name_for_session(session_id)

            # Prepare response headers for SSE
            self.send_response(200)
            self.send_header('Content-Type', 'text/event-stream')
            self.send_header('Cache-Control', 'no-cache')
            # Keep connection alive for SSE; no manual chunked encoding (Python http.server
            # does not add chunk sizes automatically, so declaring it breaks clients).
            self.send_header('Connection', 'keep-alive')
            self.send_header('Access-Control-Allow-Origin', '*')
            self.end_headers()

            def emit(event_type: str, payload):
                """Send a single SSE event."""
                try:
                    data_str = json.dumps({"type": event_type, "data": payload})
                    self.wfile.write(f"data: {data_str}\n\n".encode('utf-8'))
                    self.wfile.flush()
                except BrokenPipeError:
                    # Client disconnected
                    raise

            # Run the agent synchronously, emitting checkpoints
            try:
                if force_rag:
                    # Apply overrides same as above since we bypass Agent.run
                    rp_cfg = RAG_AGENT.retrieval_pipeline.config
                    if retrieval_k is not None:
                        rp_cfg["retrieval_k"] = retrieval_k
                    if reranker_top_k is not None:
                        rp_cfg.setdefault("reranker", {})["top_k"] = reranker_top_k
                    if search_type is not None:
                        rp_cfg.setdefault("retrieval", {})["search_type"] = search_type
                    if dense_weight is not None:
                        rp_cfg.setdefault("retrieval", {}).setdefault("dense", {})["weight"] = dense_weight

                    # Provence overrides
                    if provence_prune is not None:
                        rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune)
                    if provence_threshold is not None:
                        rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold)

                    # 🔄 Apply embedding model for this session (same as in agent path)
                    if session_id:
                        idx_ids = db.get_indexes_for_session(session_id)
                        _apply_index_embedding_model(idx_ids)

                    # 🔧 Set index-specific overview path so each index writes separate file
                    if session_id:
                        rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl"

                    # 🔧 Configure late chunking
                    rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True

                    # Straight retrieval pipeline with streaming events
                    final_result = RAG_AGENT.retrieval_pipeline.run(
                        query,
                        table_name=table_name,
                        window_size_override=context_window_size,
                        event_callback=emit,
                    )
                else:
                    # Provence overrides
                    rp_cfg = RAG_AGENT.retrieval_pipeline.config
                    if provence_prune is not None:
                        rp_cfg.setdefault("provence", {})["enabled"] = bool(provence_prune)
                    if provence_threshold is not None:
                        rp_cfg.setdefault("provence", {})["threshold"] = float(provence_threshold)

                    # 🔄 Refresh overviews for this session
                    if session_id:
                        idx_ids = db.get_indexes_for_session(session_id)
                        _apply_index_embedding_model(idx_ids)
                        RAG_AGENT.load_overviews_for_indexes(idx_ids)

                    # 🔧 Set index-specific overview path
                    if session_id:
                        rp_cfg["overview_path"] = f"index_store/overviews/{session_id}.jsonl"

                    # 🔧 Configure late chunking
                    rp_cfg.setdefault("retrievers", {}).setdefault("latechunk", {})["enabled"] = True

                    final_result = RAG_AGENT.run(
                        query,
                        table_name=table_name,
                        session_id=session_id,
                        compose_sub_answers=compose_flag,
                        query_decompose=decomp_flag,
                        ai_rerank=ai_rerank_flag,
                        context_expand=ctx_expand_flag,
                        verify=verify_flag,
                        # ✨ NEW RETRIEVAL PARAMETERS
                        retrieval_k=retrieval_k,
                        context_window_size=context_window_size,
                        reranker_top_k=reranker_top_k,
                        search_type=search_type,
                        dense_weight=dense_weight,
                        event_callback=emit,
                    )

                # Ensure the final answer is sent (in case callback missed it)
                emit("complete", final_result)
                
                # 💾 STORE AI RESPONSE: Add the AI response to the database
                if session_id and final_result and final_result.get("answer"):
                    try:
                        ai_message_id = db.add_message(session_id, final_result["answer"], "assistant")
                        print(f"💾 Stored AI response: {ai_message_id}")
                    except Exception as e:
                        print(f"⚠️ Failed to store AI response: {e}")
                        # Continue even if storage fails
            except BrokenPipeError:
                print("🔌 Client disconnected from SSE stream.")
            except Exception as e:
                # Send error event then close
                error_payload = {"error": str(e)}
                try:
                    emit("error", error_payload)
                finally:
                    print(f"❌ Stream error: {e}")

        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500)

    def handle_index(self):
        """Run the document indexing pipeline synchronously for the requested files.

        Reads a JSON body from the request. ``file_paths`` (a non-empty list)
        is required. Every other key is optional and is applied to a deep copy
        of ``INDEXING_PIPELINE.config``, so the shared pipeline configuration
        is never mutated by a request:

        * ``table_name`` / ``session_id`` - target table; when no explicit
          table is given the name is derived from the session.
        * ``enable_latechunk``, ``enable_docling_chunk``, ``enable_enrich``
        * ``chunk_size``, ``chunk_overlap``, ``window_size``
        * ``batch_size_embed``, ``batch_size_enrich``
        * ``embeddingModel``, ``enrichModel``,
          ``overviewModel`` / ``overview_model_name``
        * ``retrieval_mode`` - only echoed back in the response; it is not
          written into the indexing config here.

        Responds with 200 and a summary of the effective settings once the
        pipeline run returns, 400 for malformed JSON or a missing
        ``file_paths`` list, and 500 for any other failure.
        """
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))

            file_paths = data.get('file_paths')
            session_id = data.get('session_id')
            enable_latechunk = bool(data.get("enable_latechunk", False))
            enable_docling_chunk = bool(data.get("enable_docling_chunk", False))

            # Tunable indexing options, each with a server-side default.
            chunk_size = int(data.get("chunk_size", 512))
            chunk_overlap = int(data.get("chunk_overlap", 64))
            retrieval_mode = data.get("retrieval_mode", "hybrid")
            window_size = int(data.get("window_size", 2))
            enable_enrich = bool(data.get("enable_enrich", True))
            embedding_model = data.get('embeddingModel')
            enrich_model = data.get('enrichModel')
            overview_model = data.get('overviewModel') or data.get('overview_model_name')
            batch_size_embed = int(data.get("batch_size_embed", 50))
            batch_size_enrich = int(data.get("batch_size_enrich", 25))

            if not file_paths or not isinstance(file_paths, list):
                self.send_json_response({
                    "error": "A 'file_paths' list is required."
                }, status_code=400)
                return

            # Allow explicit table_name override
            table_name = data.get('table_name')
            if not table_name and session_id:
                table_name = _get_table_name_for_session(session_id)

            # Work on a private copy so per-request overrides never leak into
            # the shared INDEXING_PIPELINE configuration.
            import copy
            config_override = copy.deepcopy(INDEXING_PIPELINE.config)
            retrievers_cfg = config_override.setdefault("retrievers", {})

            # Point both the text store and the dense retriever at the
            # session/explicit table when one is known.
            if table_name:
                config_override["storage"]["text_table_name"] = table_name
                retrievers_cfg.setdefault("dense", {})["lancedb_table_name"] = table_name

            # Late chunking follows the request flag exactly. It is set in one
            # place only, so the value reported in the response is the value
            # the pipeline actually runs with.
            retrievers_cfg.setdefault("latechunk", {})["enabled"] = enable_latechunk

            # Docling chunking is opt-in; the configured chunker mode is left
            # untouched when the flag is not set.
            if enable_docling_chunk:
                config_override["chunker_mode"] = "docling"

            # Contextual enrichment
            enricher_cfg = config_override.setdefault("contextual_enricher", {})
            enricher_cfg["enabled"] = enable_enrich
            enricher_cfg["window_size"] = window_size

            # Indexing batch sizes
            indexing_cfg = config_override.setdefault("indexing", {})
            indexing_cfg["embedding_batch_size"] = batch_size_embed
            indexing_cfg["enrichment_batch_size"] = batch_size_enrich

            # Chunking parameters
            chunking_cfg = config_override.setdefault("chunking", {})
            chunking_cfg["chunk_size"] = chunk_size
            chunking_cfg["chunk_overlap"] = chunk_overlap

            # Model overrides are applied only when the caller supplied them.
            if embedding_model:
                config_override["embedding_model_name"] = embedding_model
            if enrich_model:
                config_override["enrich_model"] = enrich_model
            if overview_model:
                # The overview model can differ from the enrichment model.
                config_override["overview_model_name"] = overview_model

            print(f"🔧 INDEXING CONFIG: Contextual Enrichment: {enable_enrich}, Window Size: {window_size}")
            print(f"🔧 CHUNKING CONFIG: Size: {chunk_size}, Overlap: {chunk_overlap}")
            print(f"🔧 MODEL CONFIG: Embedding: {embedding_model or 'default'}, Enrichment: {enrich_model or 'default'}")

            # Set index-specific overview path so each index writes a separate file
            if session_id:
                config_override["overview_path"] = f"index_store/overviews/{session_id}.jsonl"

            # Create a temporary pipeline instance with the overridden config,
            # reusing the already-initialised LLM client and Ollama settings.
            temp_pipeline = INDEXING_PIPELINE.__class__(
                config_override,
                INDEXING_PIPELINE.llm_client,
                INDEXING_PIPELINE.ollama_config
            )
            temp_pipeline.run(file_paths)

            # Record the embedding model before acknowledging, so a client that
            # disconnects early cannot cause the metadata write to be skipped.
            # Best-effort: a failure here is logged and does not fail the request.
            if embedding_model and session_id:
                try:
                    db.update_index_metadata(session_id, {"embedding_model": embedding_model})
                except Exception as e:
                    print(f"⚠️ Could not update embedding_model metadata: {e}")

            self.send_json_response({
                "message": f"Indexing process for {len(file_paths)} file(s) completed successfully.",
                "table_name": table_name or "default_text_table",
                "latechunk": enable_latechunk,
                "docling_chunk": enable_docling_chunk,
                "indexing_config": {
                    "chunk_size": chunk_size,
                    "chunk_overlap": chunk_overlap,
                    "retrieval_mode": retrieval_mode,
                    "window_size": window_size,
                    "enable_enrich": enable_enrich,
                    "embedding_model": embedding_model,
                    "enrich_model": enrich_model,
                    "batch_size_embed": batch_size_embed,
                    "batch_size_enrich": batch_size_enrich
                }
            })

        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Failed to start indexing: {str(e)}"}, status_code=500)

    def handle_models(self):
        """List available models, split into generation and embedding groups.

        Queries the Ollama ``/api/tags`` endpoint on the host configured in
        ``RAG_AGENT.ollama_config`` and classifies each installed model by a
        simple substring match on its name. A fixed set of supported
        HuggingFace embedding models is always appended. If Ollama cannot be
        reached the failure is printed and only the HuggingFace entries are
        returned. Both lists are sorted before being sent as JSON; any other
        failure yields a 500 response.
        """
        # Substrings that mark a model name as an embedding model (naive heuristic).
        embedding_keywords = ['embed', 'bge', 'embedding', 'text']

        try:
            generation_models = []
            embedding_models = []

            # Ollama is optional: any failure in this section is logged and ignored.
            try:
                tags_url = f"{RAG_AGENT.ollama_config['host']}/api/tags"
                resp = requests.get(tags_url, timeout=5)
                resp.raise_for_status()
                installed = [entry.get('name') for entry in resp.json().get('models', [])]

                # Partition the installed models in a single pass.
                ollama_embedders = []
                ollama_generators = []
                for model_name in installed:
                    if any(keyword in model_name for keyword in embedding_keywords):
                        ollama_embedders.append(model_name)
                    else:
                        ollama_generators.append(model_name)

                generation_models += ollama_generators
                embedding_models += ollama_embedders
            except Exception as e:
                print(f"⚠️ Could not get Ollama models: {e}")

            # Supported HuggingFace embedding models are always offered.
            embedding_models += [
                "Qwen/Qwen3-Embedding-0.6B",
                "Qwen/Qwen3-Embedding-4B",
                "Qwen/Qwen3-Embedding-8B",
            ]

            # Sorted output keeps the ordering stable between calls.
            self.send_json_response({
                "generation_models": sorted(generation_models),
                "embedding_models": sorted(embedding_models)
            })
        except Exception as e:
            self.send_json_response({"error": f"Could not list models: {e}"}, status_code=500)

    def send_json_response(self, data, status_code=200):
        """Send ``data`` as an indented JSON body with permissive CORS headers.

        Args:
            data: Any JSON-serialisable object.
            status_code: HTTP status to send (default 200).

        Raises:
            TypeError / ValueError: if ``data`` cannot be serialised. Nothing
                has been written to the client at that point, so the caller
                can still send a clean error response.
        """
        # Serialise first: end_headers() flushes the status line and headers,
        # so a serialisation failure after that point would leave a half-sent
        # response that a later error reply would be appended to.
        body = json.dumps(data, indent=2).encode('utf-8')

        self.send_response(status_code)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(body)

def start_server(port=8001):
    """Bind the Advanced RAG API handler to ``port`` and serve until interrupted.

    Args:
        port: TCP port to listen on, on all interfaces (default 8001).
    """
    class ReusableTCPServer(socketserver.TCPServer):
        # Address reuse avoids "address in use" errors when restarting quickly.
        allow_reuse_address = True

    httpd = ReusableTCPServer(("", port), AdvancedRagApiHandler)
    with httpd:
        startup_banner = (
            f"🚀 Starting Advanced RAG API server on port {port}",
            f"💬 Chat endpoint: http://localhost:{port}/chat",
            f"✨ Indexing endpoint: http://localhost:{port}/index",
        )
        for banner_line in startup_banner:
            print(banner_line)
        httpd.serve_forever()

if __name__ == "__main__":
    # Script entry point: start the API server on its default port (8001).
    # To run this server: python -m rag_system.api_server
    start_server() 

================================================
FILE: rag_system/api_server_with_progress.py
================================================
import json
import threading
import time
from typing import Dict, List, Any
import logging
from urllib.parse import urlparse, parse_qs
import http.server
import socketserver

# Import the core logic and batch processing utilities
from rag_system.main import get_agent
from rag_system.utils.batch_processor import ProgressTracker, timer

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global progress tracking storage: session_id -> latest progress snapshot.
# Written by RealtimeProgressTracker and read by the /progress endpoint.
ACTIVE_PROGRESS_SESSIONS: Dict[str, Dict[str, Any]] = {}

# --- Global Singleton for the RAG Agent ---
# Built once at import time and shared by every request handler.
print("🧠 Initializing RAG Agent... (This may take a moment)")
RAG_AGENT = get_agent()
if RAG_AGENT is None:
    print("❌ Critical error: RAG Agent could not be initialized. Exiting.")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    raise SystemExit(1)
print("✅ RAG Agent initialized successfully.")

class ServerSentEventsHandler:
    """Registry of open Server-Sent Events connections, keyed by session id.

    All state lives on the class, so every caller shares one registry. Each
    value is the request handler whose ``wfile`` the events are written to.
    """

    active_connections: Dict[str, Any] = {}

    @classmethod
    def add_connection(cls, session_id: str, response_handler):
        """Register (or replace) the handler that receives events for ``session_id``."""
        cls.active_connections[session_id] = response_handler
        logger.info(f"SSE connection added for session: {session_id}")

    @classmethod
    def remove_connection(cls, session_id: str):
        """Forget the connection for ``session_id``; a no-op if none is registered."""
        try:
            del cls.active_connections[session_id]
        except KeyError:
            return
        logger.info(f"SSE connection removed for session: {session_id}")

    @classmethod
    def send_event(cls, session_id: str, event_type: str, data: Dict[str, Any]):
        """Write one SSE frame to the session's connection, if it has one.

        The frame is ``event: <event_type>`` followed by ``data: <json>`` and a
        blank line. Any failure while building or writing the frame is logged
        and the connection is dropped from the registry; nothing is raised.
        """
        if session_id not in cls.active_connections:
            return

        try:
            payload = json.dumps(data)
            frame = f"event: {event_type}\ndata: {payload}\n\n".encode('utf-8')
            stream = cls.active_connections[session_id].wfile
            stream.write(frame)
            stream.flush()
        except Exception as e:
            logger.error(f"Failed to send SSE event: {e}")
            cls.remove_connection(session_id)

class RealtimeProgressTracker(ProgressTracker):
    """ProgressTracker that mirrors its state into ACTIVE_PROGRESS_SESSIONS
    and pushes it to the session's Server-Sent Events connection.

    NOTE(review): relies on the base class providing ``start_time``,
    ``total_items``, ``processed_items`` and ``errors_encountered`` - confirm
    against ``rag_system.utils.batch_processor.ProgressTracker``.
    """

    def __init__(self, total_items: int, operation_name: str, session_id: str):
        """Create the tracker, publish its initial snapshot and send it.

        Args:
            total_items: Number of items the operation is expected to process.
            operation_name: Human-readable label stored in the snapshot.
            session_id: Key used both for the snapshot and the SSE connection.
        """
        super().__init__(total_items, operation_name)
        self.session_id = session_id
        # Starting at 0 guarantees the first update() is always pushed.
        self.last_update = 0
        self.update_interval = 1  # Minimum seconds between pushed updates

        # Initialize session progress
        ACTIVE_PROGRESS_SESSIONS[session_id] = {
            "operation_name": operation_name,
            "total_items": total_items,
            "processed_items": 0,
            "errors_encountered": 0,
            "start_time": self.start_time,
            "status": "running",
            "current_step": "",
            "eta_seconds": 0,
            "throughput": 0,
            "progress_percentage": 0
        }

        # Send initial progress update
        self._send_progress_update()

    def update(self, items_processed: int, errors: int = 0, current_step: str = ""):
        """Record progress, refresh the session snapshot and maybe push it.

        Args:
            items_processed: Forwarded to the base class ``update``.
            errors: Forwarded to the base class ``update``.
            current_step: Free-text description stored in the snapshot.

        The snapshot is refreshed on every call, but an SSE event is only
        sent when at least ``update_interval`` seconds have passed since the
        previous one.
        """
        super().update(items_processed, errors)

        # Update session data
        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id)
        if session_data:
            # Guard the division: a tracker created for zero items must not
            # raise ZeroDivisionError while reporting progress.
            if self.total_items:
                percentage = (self.processed_items / self.total_items) * 100
            else:
                percentage = 0

            session_data.update({
                "processed_items": self.processed_items,
                "errors_encountered": self.errors_encountered,
                "current_step": current_step,
                "progress_percentage": percentage,
            })

            # Calculate throughput (items/second) and the ETA derived from it
            elapsed = time.time() - self.start_time
            if elapsed > 0:
                session_data["throughput"] = self.processed_items / elapsed
                remaining = self.total_items - self.processed_items
                session_data["eta_seconds"] = remaining / session_data["throughput"] if session_data["throughput"] > 0 else 0

        # Send update if enough time has passed
        current_time = time.time()
        if current_time - self.last_update >= self.update_interval:
            self._send_progress_update()
            self.last_update = current_time

    def finish(self):
        """Mark the session snapshot as completed and push a final event."""
        super().finish()

        # Update session status
        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id)
        if session_data:
            session_data.update({
                "status": "completed",
                "progress_percentage": 100,
                "eta_seconds": 0
            })

        # Send final update (not subject to the update_interval throttle)
        self._send_progress_update(final=True)

    def _send_progress_update(self, final: bool = False):
        """Push a copy of the current snapshot as a ``progress`` SSE event.

        Args:
            final: Included in the event so clients can tell the last update
                apart from intermediate ones.
        """
        session_data = ACTIVE_PROGRESS_SESSIONS.get(self.session_id, {})

        event_data = {
            "session_id": self.session_id,
            # Copy so later snapshot mutations cannot alter the event payload.
            "progress": session_data.copy(),
            "final": final,
            "timestamp": time.time()
        }

        ServerSentEventsHandler.send_event(self.session_id, "progress", event_data)

def run_indexing_with_progress(file_paths: List[str], session_id: str):
    """Index ``file_paths`` while reporting progress to the session's SSE stream.

    Loads ``batch_indexing_config.json`` from the working directory (falling
    back to a built-in default config when the file does not exist), builds an
    ``IndexingPipeline`` and runs it. Status, progress, completion and error
    events are sent through ``ServerSentEventsHandler`` under ``session_id``.

    Args:
        file_paths: Paths of the documents to index.
        session_id: Session whose SSE connection and progress snapshot are used.

    Raises:
        Exception: any failure is logged, recorded on the session's progress
            snapshot (status ``"failed"``), reported once as an ``error``
            event, and then re-raised to the caller.
    """
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline
    from rag_system.utils.ollama_client import OllamaClient

    try:
        # Send initial status
        ServerSentEventsHandler.send_event(session_id, "status", {
            "message": "Initializing indexing pipeline...",
            "session_id": session_id
        })

        # Load configuration
        config_file = "batch_indexing_config.json"
        try:
            with open(config_file, 'r') as f:
                config = json.load(f)
        except FileNotFoundError:
            # Fallback to default config
            config = {
                "embedding_model_name": "Qwen/Qwen3-Embedding-0.6B",
                "indexing": {
                    "embedding_batch_size": 50,
                    "enrichment_batch_size": 10,
                    "enable_progress_tracking": True
                },
                "contextual_enricher": {"enabled": True, "window_size": 1},
                "retrievers": {
                    "dense": {"enabled": True, "lancedb_table_name": "default_text_table"},
                    "bm25": {"enabled": True, "index_name": "default_bm25_index"}
                },
                "storage": {
                    "chunk_store_path": "./index_store/chunks/chunks.pkl",
                    "lancedb_uri": "./index_store/lancedb",
                    "bm25_path": "./index_store/bm25"
                }
            }

        # Initialize components
        ollama_client = OllamaClient()
        ollama_config = {
            "generation_model": "llama3.2:1b",
            "embedding_model": "mxbai-embed-large"
        }

        # Create enhanced pipeline
        pipeline = IndexingPipeline(config, ollama_client, ollama_config)

        # Create progress tracker for the overall process
        total_steps = 6  # Rough estimate of pipeline steps
        step_tracker = RealtimeProgressTracker(total_steps, "Document Indexing", session_id)

        with timer("Complete Indexing Pipeline"):
            # Step 1: Document Processing
            step_tracker.update(1, current_step="Processing documents...")

            # Run the indexing pipeline (blocking; it does not report
            # intermediate progress back to this function).
            pipeline.run(file_paths)

            # The remaining steps are reported only after run() has returned.
            step_tracker.update(1, current_step="Chunking completed...")
            step_tracker.update(1, current_step="BM25 indexing completed...")
            step_tracker.update(1, current_step="Contextual enrichment completed...")
            step_tracker.update(1, current_step="Vector embeddings completed...")
            step_tracker.update(1, current_step="Indexing finalized...")

            step_tracker.finish()

            # Send completion notification
            ServerSentEventsHandler.send_event(session_id, "completion", {
                "message": f"Successfully indexed {len(file_paths)} file(s)",
                "file_count": len(file_paths),
                "session_id": session_id
            })

    except Exception as e:
        logger.error(f"Indexing failed for session {session_id}: {e}")

        # Reflect the failure in the polled snapshot so /progress does not
        # keep reporting "running" for a job that has already died.
        session_data = ACTIVE_PROGRESS_SESSIONS.get(session_id)
        if session_data is not None:
            session_data.update({"status": "failed", "error": str(e)})

        # Single error notification: this is the only handler that emits it,
        # so clients no longer receive the same error twice.
        ServerSentEventsHandler.send_event(session_id, "error", {
            "message": str(e),
            "session_id": session_id
        })
        raise

class EnhancedRagApiHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler exposing chat, background indexing and progress reporting.

    Routes:
        POST /chat      - run a query through the shared RAG agent.
        POST /index     - start indexing in a background thread.
        GET  /progress  - latest progress snapshot for a session.
        GET  /stream    - Server-Sent Events stream of progress for a session.
    """

    def do_OPTIONS(self):
        """Handle CORS preflight requests for frontend integration."""
        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', 'POST, GET, OPTIONS')
        self.send_header('Access-Control-Allow-Headers', 'Content-Type')
        self.end_headers()

    def do_GET(self):
        """Dispatch GET requests: progress snapshot or SSE stream, else 404."""
        parsed_path = urlparse(self.path)

        if parsed_path.path == '/progress':
            self.handle_progress_status()
        elif parsed_path.path == '/stream':
            self.handle_progress_stream()
        else:
            self.send_json_response({"error": "Not Found"}, status_code=404)

    def do_POST(self):
        """Dispatch POST requests: chat or indexing, else 404."""
        parsed_path = urlparse(self.path)

        if parsed_path.path == '/chat':
            self.handle_chat()
        elif parsed_path.path == '/index':
            self.handle_index_with_progress()
        else:
            self.send_json_response({"error": "Not Found"}, status_code=404)

    def handle_chat(self):
        """Run the ``query`` from the JSON body through the RAG agent.

        Responds with the agent's result as JSON, 400 when the body is not
        valid JSON or has no ``query``, and 500 on any other failure.
        """
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))

            query = data.get('query')
            if not query:
                self.send_json_response({"error": "Query is required"}, status_code=400)
                return

            # Use the single, persistent agent instance to run the query
            result = RAG_AGENT.run(query)

            # The result is a dict; send_json_response serialises it
            self.send_json_response(result)

        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Server error: {str(e)}"}, status_code=500)

    def handle_index_with_progress(self):
        """Start indexing in a background thread and reply immediately.

        The JSON body must contain a non-empty ``file_paths`` list and a
        ``session_id``; otherwise a 400 is returned. The response includes
        the URL of the SSE stream on which progress will be published.
        """
        try:
            content_length = int(self.headers['Content-Length'])
            post_data = self.rfile.read(content_length)
            data = json.loads(post_data.decode('utf-8'))

            file_paths = data.get('file_paths')
            session_id = data.get('session_id')

            if not file_paths or not isinstance(file_paths, list):
                self.send_json_response({
                    "error": "A 'file_paths' list is required."
                }, status_code=400)
                return

            if not session_id:
                self.send_json_response({
                    "error": "A 'session_id' is required for progress tracking."
                }, status_code=400)
                return

            # Start indexing in a separate thread to avoid blocking
            def run_indexing_thread():
                try:
                    run_indexing_with_progress(file_paths, session_id)
                except Exception as e:
                    # Already reported to the client inside
                    # run_indexing_with_progress; just keep the thread quiet.
                    logger.error(f"Indexing thread failed: {e}")

            thread = threading.Thread(target=run_indexing_thread)
            thread.daemon = True
            thread.start()

            # Build the stream URL from the address the client actually used
            # (Host header), falling back to the port this server is bound to,
            # instead of a hard-coded port that may not match.
            host = self.headers.get('Host')
            if not host:
                host = f"localhost:{self.server.server_address[1]}"

            # Return immediate response
            self.send_json_response({
                "message": f"Indexing started for {len(file_paths)} file(s)",
                "session_id": session_id,
                "status": "started",
                "progress_stream_url": f"http://{host}/stream?session_id={session_id}"
            })

        except json.JSONDecodeError:
            self.send_json_response({"error": "Invalid JSON"}, status_code=400)
        except Exception as e:
            self.send_json_response({"error": f"Failed to start indexing: {str(e)}"}, status_code=500)

    def handle_progress_status(self):
        """Return the stored progress snapshot for ``?session_id=...``.

        Responds 400 when the parameter is missing and 404 when no snapshot
        exists for that session.
        """
        parsed_url = urlparse(self.path)
        params = parse_qs(parsed_url.query)
        session_id = params.get('session_id', [None])[0]

        if not session_id:
            self.send_json_response({"error": "session_id is required"}, status_code=400)
            return

        progress_data = ACTIVE_PROGRESS_SESSIONS.get(session_id)
        if not progress_data:
            self.send_json_response({"error": "No active progress for this session"}, status_code=404)
            return

        self.send_json_response({
            "session_id": session_id,
            "progress": progress_data
        })

    def handle_progress_stream(self):
        """Serve a Server-Sent Events stream for ``?session_id=...``.

        Registers this handler with ``ServerSentEventsHandler`` so progress
        events can be written to it, then emits a heartbeat every second for
        as long as the registration exists. The registration is always removed
        when the loop ends.
        """
        parsed_url = urlparse(self.path)
        params = parse_qs(parsed_url.query)
        session_id = params.get('session_id', [None])[0]

        if not session_id:
            # Same JSON error shape (and CORS header) as the other endpoints.
            self.send_json_response({"error": "session_id is required"}, status_code=400)
            return

        # Set up SSE headers
        self.send_response(200)
        self.send_header('Content-Type', 'text/event-stream')
        self.send_header('Cache-Control', 'no-cache')
        self.send_header('Connection', 'keep-alive')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()

        # Add this connection to the SSE handler
        ServerSentEventsHandler.add_connection(session_id, self)

        # Send initial connection message
        initial_message = json.dumps({
            "session_id": session_id,
            "message": "Progress stream connected",
            "timestamp": time.time()
        })
        self.wfile.write(f"event: connected\ndata: {initial_message}\n\n".encode('utf-8'))
        self.wfile.flush()

        # Keep connection alive. The loop ends when the registration is
        # removed (a failed send does that) or when a heartbeat write raises.
        try:
            while session_id in ServerSentEventsHandler.active_connections:
                time.sleep(1)
                # Send heartbeat
                heartbeat = json.dumps({"type": "heartbeat", "timestamp": time.time()})
                self.wfile.write(f"event: heartbeat\ndata: {heartbeat}\n\n".encode('utf-8'))
                self.wfile.flush()
        except Exception as e:
            logger.info(f"SSE connection closed for session {session_id}: {e}")
        finally:
            ServerSentEventsHandler.remove_connection(session_id)

    def send_json_response(self, data, status_code=200):
        """Send ``data`` as an indented JSON body with permissive CORS headers.

        Raises:
            TypeError / ValueError: if ``data`` cannot be serialised. Nothing
                has been written to the client at that point.
        """
        # Serialise before emitting headers so a serialisation failure cannot
        # leave a half-sent response on the wire.
        body = json.dumps(data, indent=2).encode('utf-8')

        self.send_response(status_code)
        self.send_header('Content-Type', 'application/json')
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(body)

def start_enhanced_server(port=8000):
    """Start the enhanced API server and block serving requests forever.

    The server handles each request on its own thread. This is required
    because the SSE stream handler in this file keeps its request open in a
    heartbeat loop for as long as the client is connected; on a
    single-threaded ``TCPServer`` one open stream would prevent every other
    request (including the indexing call whose progress is being streamed)
    from being served.

    Args:
        port: TCP port to listen on, bound on all interfaces (default 8000).
    """

    class ReusableTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
        # Allow quick restarts without "address already in use" errors.
        allow_reuse_address = True
        # Do not let long-lived SSE handler threads block interpreter shutdown.
        daemon_threads = True

    with ReusableTCPServer(("", port), EnhancedRagApiHandler) as httpd:
        print(f"🚀 Starting Enhanced RAG API server on port {port}")
        print(f"💬 Chat endpoint: http://localhost:{port}/chat")
        print(f"✨ Indexing endpoint: http://localhost:{port}/index")
        print(f"📊 Progress endpoint: http://localhost:{port}/progress")
        print(f"🌊 Progress stream: http://localhost:{port}/stream")
        print("📈 Real-time progress tracking enabled via Server-Sent Events!")
        httpd.serve_forever()

if __name__ == '__main__':
    # Serve from a background daemon thread; the main thread only waits for
    # Ctrl+C, and the daemon flag lets the process exit once it returns.
    server_thread = threading.Thread(target=start_enhanced_server, daemon=True)
    server_thread.start()

    for banner_line in (
        "🚀 Enhanced RAG API server with progress tracking is running.",
        "Press Ctrl+C to stop.",
    ):
        print(banner_line)

    # Park the main thread until the user interrupts.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nStopping server...")

================================================
FILE: rag_system/factory.py
================================================
from dotenv import load_dotenv

def get_agent(mode: str = "default"):
    """
    Factory function to get an instance of the RAG agent based on the specified mode.
    This uses local imports to prevent circular dependencies.

    Args:
        mode: Key into ``PIPELINE_CONFIGS``; unknown modes fall back to the
            ``'default'`` entry.

    Returns:
        An ``Agent`` built with the selected pipeline config and LLM client.

    Raises:
        ValueError: If the watsonx backend is selected but its ``api_key`` or
            ``project_id`` setting is empty.
    """
    # Load .env before importing the configuration module so that any
    # environment lookups performed when it is first imported can see the
    # values from the file.
    load_dotenv()

    from rag_system.agent.loop import Agent
    from rag_system.utils.ollama_client import OllamaClient
    from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG

    # Initialize the appropriate LLM client based on backend configuration
    if LLM_BACKEND.lower() == "watsonx":
        from rag_system.utils.watsonx_client import WatsonXClient

        if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]:
            raise ValueError(
                "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID "
                "environment variables."
            )

        llm_client = WatsonXClient(
            api_key=WATSONX_CONFIG["api_key"],
            project_id=WATSONX_CONFIG["project_id"],
            url=WATSONX_CONFIG["url"]
        )
        llm_config = WATSONX_CONFIG
    else:
        llm_client = OllamaClient(host=OLLAMA_CONFIG["host"])
        llm_config = OLLAMA_CONFIG

    config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default'])

    # `config` is the object held in PIPELINE_CONFIGS, so this default is
    # written into that shared entry (same as before) and is only applied
    # when no storage section is configured.
    config.setdefault('storage', {
        'db_path': 'lancedb',
        'text_table_name': 'text_pages_default',
        'image_table_name': 'image_pages'
    })

    # The Agent parameter is named `ollama_config` but receives whichever
    # backend config was selected above.
    agent = Agent(
        pipeline_configs=config,
        llm_client=llm_client,
        ollama_config=llm_config
    )
    return agent

def get_indexing_pipeline(mode: str = "default"):
    """
    Factory function to get an instance of the Indexing Pipeline.

    Args:
        mode: Key into ``PIPELINE_CONFIGS``; unknown modes fall back to the
            ``'default'`` entry.

    Returns:
        An ``IndexingPipeline`` constructed from the selected pipeline config,
        the LLM client for the configured backend, and that backend's config.

    Raises:
        ValueError: If the watsonx backend is selected but its ``api_key`` or
            ``project_id`` setting is empty.
    """
    # Load .env before importing the configuration module so that any
    # environment lookups performed when it is first imported can see the
    # values from the file.
    load_dotenv()

    from rag_system.pipelines.indexing_pipeline import IndexingPipeline
    from rag_system.main import PIPELINE_CONFIGS, OLLAMA_CONFIG, LLM_BACKEND, WATSONX_CONFIG
    from rag_system.utils.ollama_client import OllamaClient

    # Initialize the appropriate LLM client based on backend configuration
    if LLM_BACKEND.lower() == "watsonx":
        from rag_system.utils.watsonx_client import WatsonXClient

        if not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]:
            raise ValueError(
                "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID "
                "environment variables."
            )

        llm_client = WatsonXClient(
            api_key=WATSONX_CONFIG["api_key"],
            project_id=WATSONX_CONFIG["project_id"],
            url=WATSONX_CONFIG["url"]
        )
        llm_config = WATSONX_CONFIG
    else:
        llm_client = OllamaClient(host=OLLAMA_CONFIG["host"])
        llm_config = OLLAMA_CONFIG

    config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default'])

    return IndexingPipeline(config, llm_client, llm_config)

================================================
FILE: rag_system/indexing/__init__.py
================================================


================================================
FILE: rag_system/indexing/contextualizer.py
================================================
from typing import List, Dict, Any
from rag_system.utils.ollama_client import OllamaClient
from rag_system.ingestion.chunking import create_contextual_window
import logging
import re

# Set up logging.
# Note: basicConfig is invoked when this module is imported, so importing the
# module configures the root logger at INFO level if it has no handlers yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the structured prompt templates, adapted from the example.
# SYSTEM_PROMPT is prepended to every request; the two templates below are
# filled in with str.format() and joined to form the user part of the prompt.
SYSTEM_PROMPT = "You are an expert at summarizing and providing context for document sections based on their local surroundings."

# Placeholder: {local_context_text} - the text window surrounding the chunk.
LOCAL_CONTEXT_PROMPT_TEMPLATE = """<local_context>
{local_context_text}
</local_context>"""

# Placeholder: {chunk_content} - the original text of the chunk being situated.
CHUNK_PROMPT_TEMPLATE = """Here is the specific chunk we want to situate within the local context provided:
<chunk>
{chunk_content}
</chunk>

Based *only* on the local context provided, give a very short (2-5 sentence) context summary to situate this specific chunk. 
Focus on the chunk's topic and its relation to the immediately surrounding text shown in the local context. 
Focus on the the overall theme of the context, make sure to include topics, concepts, and other relevant information.
Answer *only* with the succinct context and nothing else."""

class ContextualEnricher:
    """
    Enriches chunks with a prepended summary of their surrounding context using an LLM,
    while preserving the original text.

    The only method called on ``llm_client`` is
    ``generate_completion(model, prompt, enable_thinking=False)``, whose result
    is read as a mapping with a ``'response'`` key.
    """
    def __init__(self, llm_client: "OllamaClient", llm_model: str, batch_size: int = 10):
        """
        Args:
            llm_client: Client used to generate the summaries.
            llm_model: Model name passed to ``generate_completion``.
            batch_size: Number of chunks handed to the batch processor per batch.
        """
        self.llm_client = llm_client
        self.llm_model = llm_model
        self.batch_size = batch_size
        logger.info(f"Initialized ContextualEnricher with Ollama model '{self.llm_model}' (batch_size={batch_size}).")

    def _generate_summary(self, local_context_text: str, chunk_text: str) -> str:
        """Generates a contextual summary using a structured, multi-part prompt.

        Args:
            local_context_text: Text window surrounding the chunk.
            chunk_text: Original text of the chunk to situate.

        Returns:
            The first non-empty line of the sanitized model output, or ``""``
            when the call fails or the result is shorter than 5 characters.
        """
        # Combine the templates to form the final content for the HumanMessage equivalent
        human_prompt_content = (
            f"{LOCAL_CONTEXT_PROMPT_TEMPLATE.format(local_context_text=local_context_text)}\n\n"
            f"{CHUNK_PROMPT_TEMPLATE.format(chunk_content=chunk_text)}"
        )

        try:
            # The client takes a single prompt string, so the system + human
            # message structure is simulated by concatenation.
            full_prompt = f"{SYSTEM_PROMPT}\n\n{human_prompt_content}"

            response = self.llm_client.generate_completion(self.llm_model, full_prompt, enable_thinking=False)
            summary_raw = response.get('response', '').strip()

            # --- Sanitize the summary to remove chain-of-thought markers ---
            # Many Qwen models wrap reasoning in <think>...</think> or similar tags.
            cleaned = re.sub(r'<think[^>]*>.*?</think>', '', summary_raw, flags=re.IGNORECASE | re.DOTALL)
            # Remove any assistant role tags that may appear
            cleaned = re.sub(r'<assistant[^>]*>|</assistant>', '', cleaned, flags=re.IGNORECASE)
            # If the model used an explicit "Answer:" delimiter keep only the part after it
            if 'Answer:' in cleaned:
                cleaned = cleaned.split('Answer:', 1)[1]

            # Take the first non-empty line to avoid leftover blank lines
            summary = next((ln.strip() for ln in cleaned.splitlines() if ln.strip()), '')

            # Fallback to raw if cleaning removed everything.
            # NOTE(review): the raw text may still contain the <think> block
            # that was just stripped - confirm this fallback is intended.
            if not summary:
                summary = summary_raw

            if not summary or len(summary) < 5:
                logger.warning("Generated context summary is too short or empty. Skipping enrichment for this chunk.")
                return ""

            return summary

        except Exception as e:
            logger.error(f"LLM invocation failed during contextualization: {e}", exc_info=True)
            return "" # Gracefully fail by returning no summary

    def _enrich_chunk(self, chunks: List[Dict[str, Any]], index: int, window_size: int) -> Dict[str, Any]:
        """Return an enriched copy of ``chunks[index]`` without modifying the input.

        The copy's metadata always carries ``original_text`` and
        ``contextual_summary`` (``"N/A"`` when no summary was produced); its
        ``text`` is prefixed with the summary only when one was generated.
        """
        chunk = chunks[index]
        local_context_text = create_contextual_window(chunks, chunk_index=index, window_size=window_size)

        # The summary is generated based on the original, unmodified text
        original_text = chunk['text']
        summary = self._generate_summary(local_context_text, original_text)

        new_chunk = chunk.copy()

        # chunk.copy() is shallow: copy the metadata dict as well so that the
        # writes below do not leak into the caller's chunk. Missing or
        # non-dict metadata is replaced by a fresh dict.
        existing_metadata = chunk.get('metadata')
        metadata = dict(existing_metadata) if isinstance(existing_metadata, dict) else {}

        # Store original text and summary in metadata
        metadata['original_text'] = original_text
        metadata['contextual_summary'] = summary if summary else "N/A"
        new_chunk['metadata'] = metadata

        # Prepend the context summary ONLY if it was successfully generated
        if summary:
            new_chunk['text'] = f"Context: {summary}\n\n---\n\n{original_text}"

        return new_chunk

    def enrich_chunks(self, chunks: List[Dict[str, Any]], window_size: int = 1) -> List[Dict[str, Any]]:
        """Enrich all chunks via the batch processor.

        Args:
            chunks: Chunk dicts; each must provide a ``'text'`` entry.
            window_size: Passed to ``create_contextual_window`` to size the
                surrounding context.

        Returns:
            Whatever ``BatchProcessor.process_in_batches`` returns for the
            per-batch results; a chunk whose enrichment raises is passed
            through unchanged. Returns ``[]`` for empty input.
        """
        if not chunks:
            return []

        logger.info(f"Enriching {len(chunks)} chunks with contextual summaries (window_size={window_size}) using Ollama...")

        # Import batch processor
        from rag_system.utils.batch_processor import BatchProcessor, estimate_memory_usage

        # Estimate memory usage
        memory_mb = estimate_memory_usage(chunks)
        logger.info(f"Estimated memory usage for contextual enrichment: {memory_mb:.1f}MB")

        # Use batch processing for better performance and progress tracking
        batch_processor = BatchProcessor(batch_size=self.batch_size)

        def process_chunk_batch(chunk_indices):
            """Process a batch of chunk indices for contextual enrichment"""
            batch_results = []
            for i in chunk_indices:
                try:
                    batch_results.append(self._enrich_chunk(chunks, i, window_size))
                except Exception as e:
                    logger.error(f"Error enriching chunk {i}: {e}")
                    # Return original chunk if enrichment fails
                    batch_results.append(chunks[i])
            return batch_results

        # Indices (not chunks) are batched so each item can still look at its
        # neighbours in the full list when building the context window.
        chunk_indices = list(range(len(chunks)))

        # Process chunks in batches
        enriched_chunks = batch_processor.process_in_batches(
            chunk_indices,
            process_chunk_batch,
            "Contextual Enrichment"
        )

        return enriched_chunks

    def enrich_chunks_sequential(self, chunks: List[Dict[str, Any]], window_size: int = 1) -> List[Dict[str, Any]]:
        """Sequential enrichment method (legacy) - kept for comparison.

        Unlike ``enrich_chunks`` this does not catch per-chunk errors: an
        exception raised while preparing a chunk propagates to the caller.
        """
        if not chunks:
            return []

        logger.info(f"Enriching {len(chunks)} chunks sequentially (window_size={window_size})...")
        enriched_chunks = []
        total = len(chunks)

        for i in range(total):
            enriched_chunks.append(self._enrich_chunk(chunks, i, window_size))

            if (i + 1) % 10 == 0 or i == total - 1:
                logger.info(f"  ...processed {i+1}/{len(chunks)} chunks.")

        return enriched_chunks

================================================
FILE: rag_system/indexing/embedders.py
================================================
# from rag_system.indexing.representations import BM25Generator
import lancedb
import pyarrow as pa
from typing import List, Dict, Any
import numpy as np
import json

class LanceDBManager:
    """Holds a LanceDB connection opened at ``db_path`` and exposes table helpers."""

    def __init__(self, db_path: str):
        """Connect to the LanceDB database located at ``db_path``."""
        self.db_path = db_path
        connection = lancedb.connect(db_path)
        self.db = connection
        print(f"LanceDB connection established at: {db_path}")

    def get_table(self, table_name: str):
        """Open and return the existing table called ``table_name``."""
        table = self.db.open_table(table_name)
        return table

    def create_table(self, table_name: str, schema: pa.Schema, mode: str = "overwrite"):
        """Create ``table_name`` with ``schema``; ``mode`` is forwarded to LanceDB."""
        print(f"Creating table '{table_name}' with mode '{mode}'...")
        table = self.db.create_table(table_name, schema=schema, mode=mode)
        return table

class VectorIndexer:
    """
    Handles the indexing of vector embeddings and rich metadata into LanceDB.
    The 'text' field is the content that gets embedded (which can be enriched).
    The original, clean text is stored in the metadata.
    """
    def __init__(self, db_manager: LanceDBManager):
        self.db_manager = db_manager

    def index(self, table_name: str, chunks: List[Dict[str, Any]], embeddings: np.ndarray):
        """Write one row per (chunk, embedding) pair into ``table_name``.

        Rows are appended when the table already exists, otherwise the table
        is created first. Pairs whose embedding contains NaN or infinite
        values are skipped.

        Args:
            table_name: Target LanceDB table.
            chunks: Chunk dicts; each needs ``'chunk_id'``. ``'metadata'`` is
                normalised to a dict and gains ``'original_text'`` in place
                when it is missing.
            embeddings: One vector per chunk, in the same order.

        Raises:
            ValueError: If ``chunks`` and ``embeddings`` differ in length.
            Exception: Re-raised from LanceDB if both insert attempts fail.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("The number of chunks and embeddings must be the same.")
        if not chunks:
            print("No chunks to index.")
            return

        vector_dim = embeddings[0].shape[0]

        # The schema stores the text that was used for the embedding (potentially enriched)
        # and the whole chunk (text, ids and metadata) as a JSON string.
        schema = pa.schema([
            pa.field("vector", pa.list_(pa.float32(), vector_dim)),
            pa.field("text", pa.string(), nullable=False),
            pa.field("chunk_id", pa.string()),
            pa.field("document_id", pa.string()),
            pa.field("chunk_index", pa.int32()),
            pa.field("metadata", pa.string())
        ])

        data = []
        skipped_count = 0

        for chunk, vector in zip(chunks, embeddings):
            # Check for NaN values in the vector
            if np.isnan(vector).any():
                print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to NaN values in embedding")
                skipped_count += 1
                continue

            # Check for infinite values in the vector
            if np.isinf(vector).any():
                print(f"⚠️ Skipping chunk '{chunk.get('chunk_id', 'unknown')}' due to infinite values in embedding")
                skipped_count += 1
                continue

            # Normalise metadata once so a chunk without a (dict) 'metadata'
            # entry is handled the same way by every access below instead of
            # raising on the first one.
            metadata = chunk.get('metadata')
            if not isinstance(metadata, dict):
                metadata = {}
                chunk['metadata'] = metadata

            # Ensure original_text is in metadata if not already present
            if 'original_text' not in metadata:
                metadata['original_text'] = chunk['text']

            # Extract document_id and chunk_index for top-level storage
            doc_id = metadata.get("document_id", "unknown")
            chunk_idx = metadata.get("chunk_index", -1)

            # The 'text' column is non-nullable: anything that is not a
            # non-empty string is stored as "".
            text_content = chunk.get('text', '')
            if not text_content or not isinstance(text_content, str):
                text_content = ""

            data.append({
                "vector": vector.tolist(),
                "text": text_content,
                "chunk_id": chunk['chunk_id'],
                "document_id": doc_id,
                "chunk_index": chunk_idx,
                "metadata": json.dumps(chunk)
            })

        if skipped_count > 0:
            print(f"⚠️ Skipped {skipped_count} chunks due to invalid embeddings (NaN or infinite values)")

        if not data:
            print("❌ No valid embeddings to index after filtering out NaN/infinite values")
            return

        # Incremental indexing: append to existing table if present, otherwise create it
        db = self.db_manager.db  # underlying LanceDB connection

        if hasattr(db, "table_names") and table_name in db.table_names():
            tbl = self.db_manager.get_table(table_name)
            print(f"Appending {len(data)} vectors to existing table '{table_name}'.")
        else:
            print(f"Creating table '{table_name}' (new) and adding {len(data)} vectors...")
            tbl = self.db_manager.create_table(table_name, schema=schema, mode="create")

        # Add data with NaN handling configuration
        try:
            tbl.add(data, on_bad_vectors='drop')
            print(f"✅ Indexed {len(data)} vectors into table '{table_name}'.")
        except Exception as e:
            print(f"❌ Failed to add data to table: {e}")
            # Fallback: try with fill strategy
            try:
                print("🔄 Retrying with NaN fill strategy...")
                tbl.add(data, on_bad_vectors='fill', fill_value=0.0)
                print(f"✅ Indexed {len(data)} vectors into table '{table_name}' (with NaN fill).")
            except Exception as e2:
                print(f"❌ Failed to add data even with NaN fill: {e2}")
                raise

# BM25Indexer is no longer needed as we are moving to LanceDB's native FTS.
# class BM25Indexer:
#     ...

if __name__ == '__main__':
    # Manual smoke test: index one enriched chunk, then read it back.
    print("embedders.py updated for contextual enrichment.")

    DB_PATH = "./rag_system/index_store/lancedb"
    TABLE_NAME = "enriched_text_embeddings"

    # This chunk has been "enriched". The 'text' field contains the context.
    enriched_chunk = dict(
        chunk_id='doc1_0',
        text='Context: Discusses animals.\n\n---\n\nOriginal: The cat sat on the mat.',
        metadata=dict(
            original_text='The cat sat on the mat.',
            contextual_summary='Discusses animals.',
            document_id='doc1',
            title='Pet Stories',
        ),
    )
    sample_embeddings = np.random.rand(1, 128).astype('float32')

    db_manager = LanceDBManager(db_path=DB_PATH)
    vector_indexer = VectorIndexer(db_manager=db_manager)
    vector_indexer.index(table_name=TABLE_NAME, chunks=[enriched_chunk], embeddings=sample_embeddings)

    # Read the first row back and show both the embedded and the original text.
    try:
        tbl = db_manager.get_table(TABLE_NAME)
        df = tbl.limit(1).to_pandas()
        df['metadata'] = df['metadata'].apply(json.loads)
        print("\n--- Verification ---")
        embedded_text = df['text'].iloc[0]
        print("Embedded Text:", embedded_text)
        stored_metadata = df['metadata'].iloc[0]
        print("Original Text from Metadata:", stored_metadata['original_text'])
    except Exception as e:
        print(f"Could not verify LanceDB table. Error: {e}")


================================================
FILE: rag_system/indexing/graph_extractor.py
================================================
from typing import List, Dict, Any
import json
from rag_system.utils.ollama_client import OllamaClient

class GraphExtractor:
    """
    Extracts entities and relationships from text chunks using a live LLM.

    The only method called on ``llm_client`` is
    ``generate_completion(model, prompt, format="json")``, whose result is
    read as a mapping with a ``'response'`` key holding JSON text.
    """
    def __init__(self, llm_client: "OllamaClient", llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model
        print(f"Initialized GraphExtractor with Ollama model '{self.llm_model}'.")

    @staticmethod
    def _parse_json_object(raw_text):
        """Decode ``raw_text`` and return it only if it is a JSON object, else ``None``."""
        try:
            parsed = json.loads(raw_text)
        except (json.JSONDecodeError, TypeError):
            return None
        return parsed if isinstance(parsed, dict) else None

    def extract(self, chunks: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
        """Build a de-duplicated entity/relationship graph from ``chunks``.

        Two LLM calls are made per chunk: one for entities, and (only if any
        usable entity was found) one for relationships between them.
        Malformed model output is skipped rather than raised.

        Args:
            chunks: Chunk dicts; each must provide a ``'text'`` entry.

        Returns:
            ``{"entities": [{"id", "type"}...], "relationships":
            [{"source", "target", "label"}...]}``.
        """
        all_entities = {}
        all_relationships = set()

        print(f"Extracting graph from {len(chunks)} chunks with Ollama...")
        for i, chunk in enumerate(chunks):
            # Step 1: Extract Entities
            entity_prompt = f"""
            From the following text, extract key entities (people, companies, locations).
            Return the answer as a JSON object with a single key 'entities', which is a list of strings.
            Each entity should be a short, specific name, not a long string of text.

            Text: "{chunk['text']}"
            """

            entity_response = self.llm_client.generate_completion(
                self.llm_model,
                entity_prompt,
                format="json"
            )

            entity_data = self._parse_json_object(entity_response.get('response', '{}'))
            if entity_data is None:
                print(f"Warning: Could not decode JSON from LLM for chunk {i+1}.")
                continue

            entities = entity_data.get('entities', [])
            if not isinstance(entities, list):
                # e.g. a bare string would otherwise be iterated per character
                continue

            # Keep only short, plain-string names without bracket characters.
            cleaned_entities = [
                entity for entity in entities
                if isinstance(entity, str)
                and len(entity) < 50
                and not any(c in entity for c in "[]{}()")
            ]
            if not cleaned_entities:
                continue

            # Record entities right away so that a bad relationship response
            # below does not discard entities that were extracted correctly.
            for entity_name in cleaned_entities:
                all_entities[entity_name] = {"id": entity_name, "type": "Unknown"} # Placeholder type

            # Step 2: Extract Relationships
            relationship_prompt = f"""
                Given the following entities: {cleaned_entities}
                And the following text: "{chunk['text']}"
                Extract the relationships between the entities.
                Return the answer as a JSON object with a single key 'relationships', which is a list of objects, each with 'source', 'target', and 'label'.
                """

            relationship_response = self.llm_client.generate_completion(
                self.llm_model,
                relationship_prompt,
                format="json"
            )

            relationship_data = self._parse_json_object(relationship_response.get('response', '{}'))
            if relationship_data is None:
                print(f"Warning: Could not decode JSON from LLM for chunk {i+1}.")
                continue

            relationships = relationship_data.get("relationships", [])
            if not isinstance(relationships, list):
                continue

            for rel in relationships:
                if not isinstance(rel, dict):
                    continue
                if 'source' in rel and 'target' in rel and 'label' in rel:
                    try:
                        all_relationships.add(
                            (rel['source'], rel['target'], rel['label'])
                        )
                    except TypeError:
                        # A field holds an unhashable value (list/dict); skip it.
                        continue

        return {
            "entities": list(all_entities.values()),
            "relationships": [{"source": s, "target": t, "label": l} for s, t, l in all_relationships]
        }


================================================
FILE: rag_system/indexing/latechunk.py
================================================
from __future__ import annotations

"""Late Chunking encoder.

This helper feeds the *entire* document to the embedding model, collects
per-token hidden-states and then mean-pools those vectors inside pre-defined
chunk spans.  The end result is one vector per chunk – but each vector has
been produced with knowledge of the *whole* document, alleviating context-loss
issues of vanilla chunking.

We purposefully keep this class lightweight and free of LanceDB/Chunking
logic so it can be re-used elsewhere (e.g. notebook experiments).
"""

from typing import List, Tuple

import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np

class LateChunkEncoder:
    """Generate late-chunked embeddings given character-offset spans."""

    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B", *, max_tokens: int = 8192) -> None:
        """Load the tokenizer and model and move the model to GPU when available.

        Args:
            model_name: Hugging Face repo id, or a known short alias of one.
            max_tokens: Maximum sequence length; longer documents are truncated.
        """
        self.model_name = model_name
        self.max_len = max_tokens
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Back-compat: allow short alias without repo namespace
        repo_id = model_name
        if "/" not in model_name:
            # map common alias to official repo
            alias_map = {
                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
            }
            repo_id = alias_map.get(model_name.lower(), model_name)

        self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
        self.model.to(self.device)
        self.model.eval()

    @torch.inference_mode()
    def encode(self, text: str, chunk_spans: List[Tuple[int, int]]) -> List[np.ndarray]:
        """Return one vector *per* span.

        Args:
            text: Full document text.
            chunk_spans: List of (char_start, char_end) offsets for each chunk.

        Returns:
            List of numpy float32 arrays – one per chunk. A span that covers no
            token (for instance because it lies beyond the truncation limit)
            gets the hidden state of the first token as a fallback.
        """
        if not chunk_spans:
            return []

        # Tokenise and obtain per-token hidden states
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_len,
        )
        # Offsets are only needed on the host, so take them out before the
        # remaining tensors are moved to the model device.
        offsets = encoded["offset_mapping"].squeeze(0).tolist()  # (seq_len, 2)
        model_inputs = {k: v.to(self.device) for k, v in encoded.items() if k != "offset_mapping"}

        out = self.model(**model_inputs)
        last_hidden = out.last_hidden_state.squeeze(0).cpu()  # (seq_len, dim)

        # Zero-width offsets (start == end) carry no text; tokens added around
        # the input are reported this way as (0, 0), which would otherwise
        # satisfy the containment test of every span that starts at 0 and be
        # averaged into that chunk.
        content_tokens = [(i, s, e) for i, (s, e) in enumerate(offsets) if e > s]

        # For each chunk span, gather token indices belonging to it
        vectors: List[np.ndarray] = []
        for start_char, end_char in chunk_spans:
            token_indices = [i for i, s, e in content_tokens if s >= start_char and e <= end_char]
            if not token_indices:
                # Fallback: the span has no tokens (e.g. cut off by truncation);
                # use the first token's hidden state.
                token_indices = [0]
            chunk_vec = last_hidden[token_indices].mean(dim=0).numpy().astype("float32")

            # Check for NaN or infinite values
            if np.isnan(chunk_vec).any() or np.isinf(chunk_vec).any():
                print(f"⚠️ Warning: Invalid values detected in late chunk embedding for span ({start_char}, {end_char})")
                # Replace invalid values with zeros
                chunk_vec = np.nan_to_num(chunk_vec, nan=0.0, posinf=0.0, neginf=0.0)
                print(f"🔄 Replaced invalid values with zeros")

            vectors.append(chunk_vec)
        return vectors

================================================
FILE: rag_system/indexing/multimodal.py
================================================
import fitz  # PyMuPDF
from PIL import Image
import torch
import os
from typing import List, Dict, Any

from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
from rag_system.indexing.representations import QwenEmbedder


from transformers import ColPaliForRetrieval, ColPaliProcessor, Qwen2TokenizerFast

class LocalVisionModel:
    """
    Wraps a locally loaded ColPali retrieval model from the transformers library
    together with the processor needed to prepare its inputs.
    """
    def __init__(self, model_name: str = "vidore/colqwen2-v1.0", device: str = "cpu"):
        """Load model, tokenizer and image processor for ``model_name`` onto ``device``."""
        print(f"Initializing local vision model '{model_name}' on device '{device}'.")
        self.device = device

        pretrained_model = ColPaliForRetrieval.from_pretrained(model_name)
        self.model = pretrained_model.to(self.device).eval()

        self.tokenizer = Qwen2TokenizerFast.from_pretrained(model_name)

        # Only the image processor of the pretrained processor is reused; the
        # final processor is rebuilt around the tokenizer loaded above.
        pretrained_processor = ColPaliProcessor.from_pretrained(model_name)
        self.image_processor = pretrained_processor.image_processor
        self.processor = ColPaliProcessor(tokenizer=self.tokenizer, image_processor=self.image_processor)
        print("Local vision model loaded successfully.")

    def embed_image(self, image: Image.Image) -> torch.Tensor:
        """
        Runs one image through the model (gradients disabled) and returns
        the result of ``get_image_features``.
        """
        model_inputs = self.processor(text="", images=image, return_tensors="pt")
        model_inputs = model_inputs.to(self.device)
        with torch.no_grad():
            return self.model.get_image_features(**model_inputs)


class MultimodalProcessor:
    """
    Processes PDFs into separate text and image embeddings using local models.
    """
    def __init__(self, vision_model: LocalVisionModel, text_embedder: QwenEmbedder, db_manager: LanceDBManager):
        self.vision_model = vision_model
        self.text_embedder = text_embedder
        # Both indexers share the same database manager; they only differ in
        # the table each one is asked to write to.
        self.text_vector_indexer = VectorIndexer(db_manager)
        self.image_vector_indexer = VectorIndexer(db_manager)

    def process_and_index(
        self,
        pdf_path: str,
        text_table_name: str,
        image_table_name: str
    ):
        """Index every page of ``pdf_path`` as one text row and one image row.

        Args:
            pdf_path: Path of the PDF to process; its base name is used as
                the document id.
            text_table_name: Table receiving the per-page text embeddings.
            image_table_name: Table receiving the per-page image embeddings.
        """
        print(f"\n--- Processing PDF for multimodal indexing: {os.path.basename(pdf_path)} ---")
        document_id = os.path.basename(pdf_path)

        all_pages_text_chunks = []
        all_pages_images = []

        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)

                # 1. Extract Text
                text = page.get_text("text")
                if not text.strip():
                    # Keep a row for the page even when it has no text layer.
                    text = f"Page {page_num + 1} contains no extractable text."

                all_pages_text_chunks.append({
                    "chunk_id": f"{document_id}_page_{page_num+1}",
                    "text": text,
                    "metadata": {"document_id": document_id, "page_number": page_num + 1}
                })

                # 2. Extract Image
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                all_pages_images.append(img)
        finally:
            # Release the file handle even if a page fails to render; the
            # text and images collected above no longer need the document.
            doc.close()

        # --- Batch Indexing ---
        # Index all text chunks
        if all_pages_text_chunks:
            text_embeddings = self.text_embedder.create_embeddings([c['text'] for c in all_pages_text_chunks])
            self.text_vector_indexer.index(text_table_name, all_pages_text_chunks, text_embeddings)
            print(f"Indexed {len(all_pages_text_chunks)} text pages into '{text_table_name}'.")

        # Index all images
        if all_pages_images:
            # NOTE(review): LocalVisionModel as defined in this module only
            # exposes embed_image(); create_image_embeddings() is not defined
            # there - confirm which vision model is expected here.
            image_embeddings = self.vision_model.create_image_embeddings(all_pages_images)
            # We use the text chunks as placeholders for metadata
            self.image_vector_indexer.index(image_table_name, all_pages_text_chunks, image_embeddings)
            print(f"Indexed {len(all_pages_images)} image pages into '{image_table_name}'.")

if __name__ == '__main__':
    # Manual smoke test. Model weights are downloaded on first use, so this
    # needs an internet connection.
    try:
        # 1. Setup models and dependencies
        text_embedder = QwenEmbedder()
        vision_model = LocalVisionModel()
        db_manager = LanceDBManager(db_path="./rag_system/index_store/lancedb")

        # 2. Create a dummy one-page PDF
        dummy_pdf_path = "multimodal_test.pdf"
        doc = fitz.open()
        page = doc.new_page()
        page.insert_text((50, 72), "This is a test page with text and an image.")
        doc.save(dummy_pdf_path)

        # 3. Run the processor
        text_table, image_table = "test_text_pages", "test_image_pages"
        processor = MultimodalProcessor(vision_model, text_embedder, db_manager)
        processor.process_and_index(
            pdf_path=dummy_pdf_path,
            text_table_name=text_table,
            image_table_name=image_table,
        )

        # 4. Verify: report the row count of both tables
        print("\n--- Verification ---")
        text_tbl = db_manager.get_table(text_table)
        img_tbl = db_manager.get_table(image_table)
        print(f"Text table has {len(text_tbl)} rows.")
        print(f"Image table has {len(img_tbl)} rows.")

    except Exception as e:
        print(f"\nAn error occurred during the multimodal test: {e}")
        print("Please ensure you have an internet connection for model downloads.")

================================================
FILE: rag_system/indexing/overview_builder.py
================================================
from __future__ import annotations

import os, json, logging, re
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

class OverviewBuilder:
    """Generates and stores a one-paragraph overview for each document.
    The overview is derived from the first *n* chunks of the document.
    """

    DEFAULT_PROMPT = (
        "You will receive the beginning of a document. "
        "In no more than 120 tokens, describe what the document is about, "
        "state its type (e.g. invoice, slide deck, policy, research paper, receipt) "
        "and mention 3-5 important entities, numbers or dates it contains.\n\n"
        "DOCUMENT_START:\n{text}\n\nOVERVIEW:"
    )

    def __init__(self, llm_client, model: str = "qwen3:0.6b", first_n_chunks: int = 5,
                 out_path: str | None = None):
        if out_path is None:
            out_path = "index_store/overviews/overviews.jsonl"
        self.llm_client = llm_client
        self.model = model
        self.first_n = first_n_chunks
        self.out_path = out_path
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    def build_and_store(self, doc_id: str, chunks: List[Dict[str, Any]]):
        if not chunks:
            return
        head_text = "\n".join(c["text"] for c in chunks[: self.first_n] if c.get("text"))
        prompt = self.DEFAULT_PROMPT.format(text=head_text[:5000])  # safety cap
        try:
            resp = self.llm_client.generate_completion(model=self.model, prompt=prompt, enable_thinking=False)
            summary_raw = resp.get("response", "")
            # Remove any lingering <think>...</think> blocks just in case
            summary = re.sub(r'<think[^>]*>.*?</think>', '', summary_raw, flags=re.IGNORECASE | re.DOTALL).strip()
        except Exception as e:
            summary = f"Failed to generate overview: {e}"
        record = {"doc_id": doc_id, "overview": summary.strip()}
        with open(self.out_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

        logger.info(f"📄 Overview generated for {doc_id} (stored in {self.out_path})") 

================================================
FILE: rag_system/indexing/representations.py
================================================
from typing import List, Dict, Any, Protocol
import numpy as np
from transformers import AutoModel, AutoTokenizer
import torch
import os

# We keep the protocol to ensure a consistent interface
class EmbeddingModel(Protocol):
    """Structural interface shared by the embedding back-ends in this module."""

    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` and return the vectors as a NumPy array."""
        ...

# Global cache for models - use dict to cache by model name
_MODEL_CACHE = {}

# --- Hugging Face (local transformers) embedder ---
class QwenEmbedder(EmbeddingModel):
    """
    An embedding model that uses a local Hugging Face transformer model.

    Tokenizer and weights are cached per model name in ``_MODEL_CACHE`` so that
    repeated instantiation does not reload them.
    """
    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-0.6B"):
        self.model_name = model_name
        # Auto-select the best available device: CUDA > MPS > CPU
        if torch.cuda.is_available():
            self.device = "cuda"
        elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
            self.device = "mps"
        else:
            self.device = "cpu"

        # Use model-specific cache
        if model_name not in _MODEL_CACHE:
            print(f"Initializing HF Embedder with model '{model_name}' on device '{self.device}'. (first load)")
            tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
            model = AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.float16 if self.device != "cpu" else None,
            ).to(self.device).eval()
            _MODEL_CACHE[model_name] = (tokenizer, model)
            print(f"QwenEmbedder weights loaded and cached for {model_name}.")
        else:
            print(f"Reusing cached QwenEmbedder weights for {model_name}.")

        self.tokenizer, self.model = _MODEL_CACHE[model_name]

    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` by pooling the hidden state of each sequence's last real token.

        Returns:
            Array with one row per input text. NaN and infinite values are
            replaced with zeros (a warning is printed when that happens).
        """
        print(f"Generating {len(texts)} embeddings with {self.model_name} model...")
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            last_hidden = outputs.last_hidden_state  # [B, seq, dim]
            attention_mask = inputs["attention_mask"]
            # The tokenizer is created with padding_side="left", so the last real
            # token of every sequence sits in the final position. Indexing with
            # `mask.sum - 1` is only valid for right padding; under left padding it
            # would select a padding position for the shorter sequences.
            left_padded = bool(attention_mask[:, -1].sum() == attention_mask.shape[0])
            if left_padded:
                embeddings = last_hidden[:, -1]
            else:
                last_token_idx = attention_mask.sum(dim=1) - 1  # index of last real token
                batch_indices = torch.arange(last_hidden.size(0), device=last_hidden.device)
                embeddings = last_hidden[batch_indices, last_token_idx]

        # Convert to numpy and validate
        embeddings_np = embeddings.cpu().numpy()

        # nan_to_num clears NaN and +/-inf in one pass, so a single branch runs.
        if np.isnan(embeddings_np).any():
            print(f"⚠️ Warning: NaN values detected in embeddings from {self.model_name}")
            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
            print(f"🔄 Replaced NaN values with zeros")
        elif np.isinf(embeddings_np).any():
            print(f"⚠️ Warning: Infinite values detected in embeddings from {self.model_name}")
            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
            print(f"🔄 Replaced infinite values with zeros")

        return embeddings_np

class EmbeddingGenerator:
    """Runs an embedding model over chunk texts in fixed-size batches."""

    def __init__(self, embedding_model: EmbeddingModel, batch_size: int = 50):
        self.batch_size = batch_size
        self.model = embedding_model

    def generate(self, chunks: List[Dict[str, Any]]) -> List[np.ndarray]:
        """Embed the ``'text'`` field of every chunk.

        Returns:
            Whatever ``BatchProcessor.process_in_batches`` returns for the
            per-batch embedding lists; an empty list when ``chunks`` is empty.
        """
        texts = [item['text'] for item in chunks]
        if len(texts) == 0:
            return []

        from rag_system.utils.batch_processor import BatchProcessor, estimate_memory_usage

        estimated_mb = estimate_memory_usage(chunks)
        print(f"Estimated memory usage for {len(chunks)} chunks: {estimated_mb:.1f}MB")

        def _embed_batch(batch):
            # An empty batch short-circuits without touching the model.
            return list(self.model.create_embeddings(batch)) if batch else []

        runner = BatchProcessor(batch_size=self.batch_size)
        return runner.process_in_batches(texts, _embed_batch, "Embedding Generation")

class OllamaEmbedder(EmbeddingModel):
    """Call Ollama's /api/embeddings endpoint for each text."""
    def __init__(self, model_name: str, host: str | None = None, timeout: int = 60):
        self.model_name = model_name
        self.host = (host or os.getenv("OLLAMA_HOST") or "http://localhost:11434").rstrip("/")
        self.timeout = timeout

    def _embed_single(self, text: str):
        import requests, numpy as np, json
        payload = {"model": self.model_name, "prompt": text}
        r = requests.post(f"{self.host}/api/embeddings", json=payload, timeout=self.timeout)
        r.raise_for_status()
        data = r.json()
        # Ollama may return {"embedding": [...]} or {"data": [...]} depending on version
        vec = data.get("embedding") or data.get("data")
        if vec is None:
            raise ValueError("Unexpected Ollama embeddings response format")
        return np.array(vec, dtype="float32")

    def create_embeddings(self, texts: List[str]):
        import numpy as np
        vectors = [self._embed_single(t) for t in texts]
        embeddings_np = np.vstack(vectors)
        
        # Check for NaN or infinite values
        if np.isnan(embeddings_np).any():
            print(f"⚠️ Warning: NaN values detected in Ollama embeddings from {self.model_name}")
            # Replace NaN values with zeros
            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
            print(f"🔄 Replaced NaN values with zeros")
        
        if np.isinf(embeddings_np).any():
            print(f"⚠️ Warning: Infinite values detected in Ollama embeddings from {self.model_name}")
            # Replace infinite values with zeros
            embeddings_np = np.nan_to_num(embeddings_np, nan=0.0, posinf=0.0, neginf=0.0)
            print(f"🔄 Replaced infinite values with zeros")
        
        return embeddings_np

def select_embedder(model_name: str, ollama_host: str | None = None):
    """Return appropriate EmbeddingModel implementation for the given name."""
    if "/" in model_name or model_name.startswith("http"):
        # Treat as HF model path
        return QwenEmbedder(model_name=model_name)
    # Otherwise assume it's an Ollama tag
    return OllamaEmbedder(model_name=model_name, host=ollama_host)

if __name__ == '__main__':
    # Manual smoke test: embeds two short texts with the default Hugging Face model.
    print("representations.py cleaned up.")
    try:
        generator = EmbeddingGenerator(embedding_model=QwenEmbedder())
        vectors = generator.generate([{'text': 'Hello world'}, {'text': 'This is a test'}])

        print(f"\nSuccessfully generated {len(vectors)} embeddings.")
        first_vector = vectors[0]
        print(f"Shape of first embedding: {first_vector.shape}")

    except Exception as e:
        print(f"\nAn error occurred during the QwenEmbedder test: {e}")
        print("Please ensure you have an internet connection for model downloads.")

================================================
FILE: rag_system/ingestion/__init__.py
================================================


================================================
FILE: rag_system/ingestion/chunking.py
================================================
from typing import List, Dict, Any, Optional
import re
from transformers import AutoTokenizer

class MarkdownRecursiveChunker:
    """
    A recursive chunker that splits Markdown text based on its semantic structure
    and embeds document-level metadata into each chunk.
    """

    def __init__(self, max_chunk_size: int = 1500, min_chunk_size: int = 200, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
        """
        Args:
            max_chunk_size: Token budget above which a piece of text is split further.
            min_chunk_size: Pieces below this token count are merged into the next
                piece even if the merge exceeds ``max_chunk_size``.
            tokenizer_model: Hugging Face repo id (or a known bare tag) of the
                tokenizer used for counting tokens. If loading fails, token counts
                are approximated as ``len(text) // 4``.
        """
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = min_chunk_size
        self.split_priority = ["\n## ", "\n### ", "\n#### ", "```", "\n\n"]

        # Map bare model tags to their Hugging Face repo id.
        repo_id = tokenizer_model
        if "/" not in tokenizer_model:
            repo_id = {
                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
            }.get(tokenizer_model.lower(), tokenizer_model)

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        except Exception as e:
            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
            self.tokenizer = None

    def _token_len(self, text: str) -> int:
        """Get token count for text using the tokenizer."""
        if self.tokenizer is not None:
            return len(self.tokenizer.tokenize(text))
        else:
            return max(1, len(text) // 4)

    @staticmethod
    def _split_keep_separator(text: str, sep: str) -> List[str]:
        """Split ``text`` on ``sep``, attaching each separator to the piece that follows it.

        No characters are dropped: joining the returned pieces reproduces ``text``.
        """
        # With one capture group re.split returns [t0, sep, t1, sep, t2, ...].
        parts = re.split(f"({re.escape(sep)})", text)
        pieces = [parts[0]] if parts[0] else []
        for idx in range(1, len(parts), 2):
            pieces.append(parts[idx] + parts[idx + 1])
        return pieces

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` into pieces that fit ``max_chunk_size``.

        Separators are applied in order, and only to pieces still over budget.
        Pieces that remain too large after all separators are packed word by word
        (whitespace inside those pieces is normalised to single spaces).
        """
        chunks_to_process = [text]

        for sep in separators:
            new_chunks = []
            for chunk in chunks_to_process:
                if self._token_len(chunk) > self.max_chunk_size:
                    new_chunks.extend(self._split_keep_separator(chunk, sep))
                else:
                    new_chunks.append(chunk)
            chunks_to_process = new_chunks

        final_chunks = []
        for chunk in chunks_to_process:
            if self._token_len(chunk) > self.max_chunk_size:
                # Last resort: greedy word packing.
                words = chunk.split()
                current_chunk = ""
                for word in words:
                    test_chunk = current_chunk + " " + word if current_chunk else word
                    if self._token_len(test_chunk) <= self.max_chunk_size:
                        current_chunk = test_chunk
                    else:
                        if current_chunk:
                            final_chunks.append(current_chunk)
                        current_chunk = word
                if current_chunk:
                    final_chunks.append(current_chunk)
            else:
                final_chunks.append(chunk)

        return final_chunks

    def chunk(self, text: str, document_id: str, document_metadata: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Chunks the Markdown text and injects metadata.

        Args:
            text: The Markdown text to chunk.
            document_id: The identifier for the source document.
            document_metadata: A dictionary of metadata for the source document.

        Returns:
            A list of dictionaries, where each dictionary is a chunk with metadata.
        """
        if not text:
            return []

        raw_chunks = self._split_text(text, self.split_priority)

        # Greedily merge neighbouring pieces back together while they fit.
        merged_chunks_text = []
        current_chunk = ""
        for chunk_text in raw_chunks:
            test_chunk = current_chunk + chunk_text if current_chunk else chunk_text
            if not current_chunk or self._token_len(test_chunk) <= self.max_chunk_size:
                current_chunk = test_chunk
            elif self._token_len(current_chunk) < self.min_chunk_size:
                # Too small to stand alone: merge even though it exceeds the budget.
                current_chunk = test_chunk
            else:
                merged_chunks_text.append(current_chunk)
                current_chunk = chunk_text
        if current_chunk:
            merged_chunks_text.append(current_chunk)

        final_chunks = []
        for i, chunk_text in enumerate(merged_chunks_text):
            # Combine document-level metadata with chunk-specific metadata
            combined_metadata = (document_metadata or {}).copy()
            combined_metadata.update({
                "document_id": document_id,
                "chunk_number": i,
            })

            final_chunks.append({
                "chunk_id": f"{document_id}_{i}", # Create a more unique ID
                "text": chunk_text.strip(),
                "metadata": combined_metadata
            })

        return final_chunks

def create_contextual_window(all_chunks: List[Dict[str, Any]], chunk_index: int, window_size: int = 1) -> str:
    """Join the text of a chunk with up to ``window_size`` neighbours on each side.

    Raises:
        ValueError: if ``chunk_index`` does not address an element of ``all_chunks``.
    """
    total = len(all_chunks)
    if chunk_index < 0 or chunk_index >= total:
        raise ValueError("chunk_index is out of bounds.")
    first = max(0, chunk_index - window_size)
    stop = min(total, chunk_index + window_size + 1)
    return " ".join(entry['text'] for entry in all_chunks[first:stop])

if __name__ == '__main__':
    # Manual demo: chunk a tiny Markdown document and print every resulting chunk.
    print("chunking.py updated to include document metadata in each chunk.")

    demo_metadata = {"title": "My Awesome Document", "author": "Jane Doe", "year": 2024}
    demo_text = "# Doc Title\n\nContent paragraph."

    results = MarkdownRecursiveChunker().chunk(
        text=demo_text,
        document_id="doc456",
        document_metadata=demo_metadata,
    )

    print(f"\n--- Created {len(results)} chunk(s) ---")
    for item in results:
        print(f"Chunk ID: {item['chunk_id']}")
        print(f"Text: '{item['text']}'")
        print(f"Metadata: {item['metadata']}")
        print("-" * 20)


================================================
FILE: rag_system/ingestion/docling_chunker.py
================================================
from __future__ import annotations

"""Docling-aware chunker (simplified).

For now we proxy the old MarkdownRecursiveChunker but add:
• sentence-aware packing to max_tokens with overlap
• breadcrumb metadata stubs so downstream code already handles them

In a follow-up we can replace the internals with true Docling element-tree
walking once the PDFConverter returns structured nodes.
"""
from typing import List, Dict, Any, Tuple
import math
import re
from itertools import islice
from rag_system.ingestion.chunking import MarkdownRecursiveChunker
from transformers import AutoTokenizer

class DoclingChunker:
    def __init__(self, *, max_tokens: int = 512, overlap: int = 1, tokenizer_model: str = "Qwen/Qwen3-Embedding-0.6B"):
        self.max_tokens = max_tokens
        self.overlap = overlap  # sentences of overlap
        repo_id = tokenizer_model
        if "/" not in tokenizer_model and not tokenizer_model.startswith("Qwen/"):
            repo_id = {
                "qwen3-embedding-0.6b": "Qwen/Qwen3-Embedding-0.6B",
            }.get(tokenizer_model.lower(), tokenizer_model)
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
        except Exception as e:
            print(f"Warning: Failed to load tokenizer {repo_id}: {e}")
            print("Falling back to character-based approximation (4 chars ≈ 1 token)")
            self.tokenizer = None
        # Fallback simple sentence splitter (period, question, exclamation, newline)
        self._sent_re = re.compile(r"(?<=[\.\!\?])\s+|\n+")
        self.legacy = MarkdownRecursiveChunker(max_chunk_size=10_000, min_chunk_size=100)

    # ------------------------------------------------------------------
    def _token_len(self, text: str) -> int:
        if self.tokenizer is not None:
            return len(self.tokenizer.tokenize(text))
        else:
            # Fallback: approximate 4 characters per token
            return max(1, len(text) // 4)

    def split_markdown(self, markdown: str, *, document_id: str, metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Split one Markdown doc into chunks with max_tokens limit."""
        base_chunks = self.legacy.chunk(markdown, document_id, metadata)
        new_chunks: List[Dict[str, Any]] = []
        global_idx = 0
        for ch in base_chunks:
            sentences = [s.strip() for s in self._sent_re.split(ch["text"]) if s.strip()]
            if not sentences:
                continue
            window: List[str] = []
            while sentences:
                # Add until over limit
                while sentences and self._token_len(" ".join(window + [sentences[0]])) <= self.max_tokens:
                    window.append(sentences.pop(0))
                if not window:  # single sentence > limit → hard cut
                    window.append(sentences.pop(0))
                chunk_text = " ".join(window)
                new_chunk = {
                    "chunk_id": f"{document_id}_{global_idx}",
                    "text": chunk_text,
                    "metadata": {
                        **metadata,
                        "document_id": document_id,
                        "chunk_index": global_idx,
                        "heading_path": metadata.get("heading_path", []),
                        "heading_level": len(metadata.get("heading_path", [])),
                        "block_type": metadata.get("block_type", "paragraph"),
                    },
                }
                new_chunks.append(new_chunk)
                global_idx += 1
                # Overlap: prepend last `overlap` sentences of the current window to the remaining queue
                if self.overlap and sentences:
                    back = window[-self.overlap:] if self.overlap <= len(window) else window[:]
                    sentences = back + sentences
                window = []
        return new_chunks

    # ------------------------------------------------------------------
    # Element-tree based chunking (true Docling path)
    # ------------------------------------------------------------------
    def chunk_document(self, doc, *, document_id: str, metadata: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
        """Walk a DoclingDocument and emit chunks.

        Tables / Code / Figures are emitted as atomic chunks.
        Paragraph-like nodes are sentence-packed to <= max_tokens.
        """
        metadata = metadata or {}

        def _token_len(txt: str) -> int:
            if self.tokenizer is not None:
                return len(self.tokenizer.tokenize(txt))
            else:
                # Fallback: approximate 4 characters per token
                return max(1, len(txt) // 4)

        chunks: List[Dict[str, Any]] = []
        global_idx = 0

        # Helper to create a chunk and append to list
        def _add_chunk(text: str, block_type: str, heading_path: List[str], page_no: int | None = None):
            nonlocal global_idx
            if not text.strip():
                return
            chunk_meta = {
                **metadata,
                "document_id": document_id,
                "chunk_index": global_idx,
                "heading_path": heading_path,
                "heading_level": len(heading_path),
                "block_type": block_type,
            }
            if page_no is not None:
                chunk_meta["page"] = page_no
            chunks.append({
                "chunk_id": f"{document_id}_{global_idx}",
                "text": text,
                "metadata": chunk_meta,
            })
            global_idx += 1

        # The Docling API exposes .body which is a tree of nodes; we fall back to .texts/.tables lists if available
        try:
            # We walk doc.texts (reading order). We'll buffer consecutive paragraph items
            current_heading_path: List[str] = []
            buffer: List[str] = []
            buffer_tokens = 0
            buffer_page = None

            def flush_buffer():
                nonlocal buffer, buffer_tokens, buffer_page
                if buffer:
                    _add_chunk(" ".join(buffer), "paragraph", heading_path=current_heading_path[:], page_no=buffer_page)
                buffer, buffer_tokens, buffer_page = [], 0, None

            # Create quick lookup for table items by id to preserve later insertion order if needed
            tables_by_anchor = {
                getattr(t, "anchor_text_id", None): t
                for t in getattr(doc, "tables", [])
                if getattr(t, "anchor_text_id", None) is not None
            }

            for txt_item in getattr(doc, "texts", []):
                # If this text item is a placeholder for a table anchor, emit table first
                anchor_id = getattr(txt_item, "id", None)
                if anchor_id in tables_by_anchor:
                    flush_buffer()
                    tbl = tables_by_anchor[anchor_id]
                    try:
                        tbl_md = tbl.export_to_markdown(doc)  # pass doc for deprecation compliance
                    except Exception:
                        tbl_md = tbl.export_to_markdown() if hasattr(tbl, "export_to_markdown") else str(tbl)
                    _add_chunk(tbl_md, "table", heading_path=current_heading_path[:], page_no=getattr(tbl, "page_no", None))

                role = getattr(txt_item, "role", None)
                if role == "heading":
                    flush_buffer()
                    level = getattr(txt_item, "level", 1)
                    current_heading_path = current_heading_path[: max(0, level - 1)]
                    current_heading_path.append(txt_item.text.strip())
                    continue  # skip heading as content

                text_piece = txt_item.text if hasattr(txt_item, "text") else str(txt_item)
                piece_tokens = _token_len(text_piece)
                if piece_tokens > self.max_tokens:  # very long paragraph
                    flush_buffer()
                    _add_chunk(text_piece, "paragraph", heading_path=current_heading_path[:], page_no=getattr(txt_item, "page_no", None))
                    continue

                if buffer_tokens + piece_tokens > self.max_tokens:
                    flush_buffer()

                buffer.append(text_piece)
                buffer_tokens += piece_tokens
                if buffer_page is None:
                    buffer_page = getattr(txt_item, "page_no", None)

            flush_buffer()

            # Emit any remaining tables that were not anchored
            for tbl in getattr(doc, "tables", []):
                if tbl in tables_by_anchor.values():
                    continue  # already emitted
                try:
                    tbl_md = tbl.export_to_markdown(doc)
                except Exception:
                    tbl_md = tbl.export_to_markdown() if hasattr(tbl, "export_to_markdown") else str(tbl)
                _add_chunk(tbl_md, "table", heading_path=current_heading_path[:], page_no=getattr(tbl, "page_no", None))
        except Exception as e:
            print(f"⚠️  Docling tree walk failed: {e}. Falling back to markdown splitter.")
            return self.split_markdown(doc.export_to_markdown(), document_id=document_id, metadata=metadata)

        # --------------------------------------------------------------
        # Second-pass consolidation: merge small consecutive paragraph
        # chunks that share heading & page into up-to-max_tokens blobs.
        # --------------------------------------------------------------
        consolidated: List[Dict[str, Any]] = []
        buf_txt: List[str] = []
        buf_meta: Dict[str, Any] | None = None

        def flush_paragraph_buffer():
            nonlocal buf_txt, buf_meta
            if not buf_txt:
                return
            merged_text = " ".join(buf_txt)
            # Re-use meta from first piece but update chunk_id later
            new_chunk = {
                "chunk_id": buf_meta["chunk_id"],
                "text": merged_text,
                "metadata": buf_meta["metadata"],
            }
            consolidated.append(new_chunk)
            buf_txt = []
            buf_meta = None

        for ch in chunks:
            if ch["metadata"].get("block_type") != "paragraph":
                flush_paragraph_buffer()
                consolidated.append(ch)
                continue

            if not buf_txt:
                buf_txt.append(ch["text"])
                buf_meta = ch
                continue

            same_page = ch["metadata"].get("page") == buf_meta["metadata"].get("page")
            same_heading = ch["metadata"].get("heading_path") == buf_meta["metadata"].get("heading_path")

            prospective_len = self._token_len(" ".join(buf_txt + [ch["text"]]))
            if same_page and same_heading and prospective_len <= self.max_tokens:
                buf_txt.append(ch["text"])
            else:
                flush_paragraph_buffer()
                buf_txt.append(ch["text"])
                buf_meta = ch

        flush_paragraph_buffer()

        return consolidated

    # Public API expected by IndexingPipeline --------------------------------
    def chunk(self, text: str, document_id: str, document_metadata: Dict[str, Any] | None = None) -> List[Dict[str, Any]]:
        return self.split_markdown(text, document_id=document_id, metadata=document_metadata or {})    

================================================
FILE: rag_system/ingestion/document_converter.py
================================================
from typing import List, Tuple, Dict, Any
from docling.document_converter import DocumentConverter as DoclingConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, OcrMacOptions
from docling.datamodel.base_models import InputFormat
import fitz  # PyMuPDF for quick text inspection
import os

class DocumentConverter:
    """
    A class to convert various document formats to structured Markdown using the docling library.
    Supports PDF, DOCX, HTML, and other formats.
    """

    # Mapping of file extensions to InputFormat
    SUPPORTED_FORMATS = {
        '.pdf': InputFormat.PDF,
        '.docx': InputFormat.DOCX,
        '.html': InputFormat.HTML,
        '.htm': InputFormat.HTML,
        '.md': InputFormat.MD,
        '.txt': 'TXT',  # Special handling for plain text files
    }

    def __init__(self):
        """Initializes the docling document converter with forced OCR enabled for macOS.

        Three converters are built: PDF without OCR, PDF with full-page OCR
        (``OcrMacOptions``) and a general one for non-PDF formats. If any of them
        fails to initialise, all three are set to ``None`` and
        ``convert_to_markdown`` returns an empty list.
        """
        try:
            # --- Converter WITHOUT OCR (fast path) ---
            pipeline_no_ocr = PdfPipelineOptions()
            pipeline_no_ocr.do_ocr = False
            format_no_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_no_ocr)
            }
            self.converter_no_ocr = DoclingConverter(format_options=format_no_ocr)

            # --- Converter WITH OCR (fallback) ---
            pipeline_ocr = PdfPipelineOptions()
            pipeline_ocr.do_ocr = True
            ocr_options = OcrMacOptions(force_full_page_ocr=True)
            pipeline_ocr.ocr_options = ocr_options
            format_ocr = {
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_ocr)
            }
            self.converter_ocr = DoclingConverter(format_options=format_ocr)

            self.converter_general = DoclingConverter()

            print("docling DocumentConverter(s) initialized (OCR + no-OCR + general).")
        except Exception as e:
            print(f"Error initializing docling DocumentConverter(s): {e}")
            self.converter_no_ocr = None
            self.converter_ocr = None
            self.converter_general = None

    def convert_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
        """
        Converts a document to a single Markdown string, preserving layout and tables.
        Supports PDF, DOCX, HTML, and other formats.

        Returns:
            A list with one tuple per converted document, or an empty list when
            the converters are unavailable, the extension is unsupported, or
            conversion fails. TXT inputs yield ``(markdown, metadata)``; inputs
            converted through docling yield ``(markdown, metadata, document)``.
        """
        if not (self.converter_no_ocr and self.converter_ocr and self.converter_general):
            print("docling converters not available. Skipping conversion.")
            return []

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.SUPPORTED_FORMATS:
            print(f"Unsupported file format: {file_ext}")
            return []

        input_format = self.SUPPORTED_FORMATS[file_ext]

        if input_format == InputFormat.PDF:
            return self._convert_pdf_to_markdown(file_path)
        elif input_format == 'TXT':
            return self._convert_txt_to_markdown(file_path)
        else:
            return self._convert_general_to_markdown(file_path, input_format)

    def _convert_pdf_to_markdown(self, pdf_path: str) -> List[Tuple[str, Dict[str, Any]]]:
        """Convert PDF with OCR detection logic."""
        # Quick heuristic: if the PDF already contains a text layer, skip OCR for speed
        def _pdf_has_text(path: str) -> bool:
            try:
                # Context manager closes the PyMuPDF document (and its file handle)
                # even when we return early or a page fails to parse.
                with fitz.open(path) as doc:
                    return any(page.get_text("text").strip() for page in doc)
            except Exception as e:
                # Best effort: an unreadable PDF is treated as having no text layer.
                print(f"Warning: could not inspect text layer of {path}: {e}")
                return False

        use_ocr = not _pdf_has_text(pdf_path)
        converter = self.converter_ocr if use_ocr else self.converter_no_ocr
        ocr_msg = "(OCR enabled)" if use_ocr else "(no OCR)"

        print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
        return self._perform_conversion(pdf_path, converter, ocr_msg)

    def _convert_txt_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
        """Convert plain text files to markdown by reading content directly.

        The file is read as UTF-8 and wrapped in a fenced code block. Returns an
        empty list if the file cannot be read.
        """
        print(f"Converting {file_path} (TXT) to Markdown...")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            markdown_content = f"```\n{content}\n```"
            metadata = {"source": file_path}

            print(f"Successfully converted {file_path} (TXT) to Markdown.")
            return [(markdown_content, metadata)]
        except Exception as e:
            print(f"Error processing TXT file {file_path}: {e}")
            return []

    def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any]]]:
        """Convert non-PDF formats using general converter."""
        print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
        return self._perform_conversion(file_path, self.converter_general, f"({input_format.name})")

    def _perform_conversion(self, file_path: str, converter, format_msg: str) -> List[Tuple[Any, ...]]:
        """Perform the actual conversion using the specified converter.

        Returns:
            ``[(markdown, metadata, document)]`` on success, ``[]`` on failure.
        """
        pages_data = []
        try:
            result = converter.convert(file_path)
            markdown_content = result.document.export_to_markdown()

            metadata = {"source": file_path}
            # Return the *DoclingDocument* object as third tuple element so downstream
            # chunkers that understand the element tree can use it. Callers must
            # handle both 2- and 3-element tuples (TXT inputs produce only two).
            pages_data.append((markdown_content, metadata, result.document))
            print(f"Successfully converted {file_path} with docling {format_msg}.")
            return pages_data
        except Exception as e:
            print(f"Error processing {file_path} with docling: {e}")
            return []


================================================
FILE: rag_system/main.py
================================================
import os
import json
import sys
import argparse
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# The sys.path manipulation has been removed to prevent import conflicts.
# This script should be run as a module from the project root, e.g.:
# python -m rag_system.main api

from rag_system.agent.loop import Agent
from rag_system.utils.ollama_client import OllamaClient
# Configuration is now defined in this file - no import needed

# Advanced RAG System Configuration
# ==================================
# This file contains the MASTER configuration for all models used in the RAG system.
# All components should reference these configurations to ensure consistency.

# ============================================================================
# 🎯 MASTER MODEL CONFIGURATION
# ============================================================================
# All model configurations are centralized here to prevent conflicts

# LLM Backend Configuration
# Read once at import time. get_agent() selects the Watson X client when this
# value equals "watsonx" (case-insensitive) and the Ollama client otherwise.
LLM_BACKEND = os.getenv("LLM_BACKEND", "ollama")

# Ollama Models Configuration (for inference via Ollama)
# "host" can be overridden with the OLLAMA_HOST environment variable; the model
# names are fixed here.
OLLAMA_CONFIG = {
    "host": os.getenv("OLLAMA_HOST", "http://localhost:11434"),
    "generation_model": "qwen3:8b",  # Main text generation model
    "enrichment_model": "qwen3:0.6b",  # Lightweight model for routing/enrichment
}

# Watson X Configuration (used when LLM_BACKEND is "watsonx")
# Every value comes from the environment. get_agent() raises ValueError when
# "api_key" or "project_id" is empty.
WATSONX_CONFIG = {
    "api_key": os.getenv("WATSONX_API_KEY", ""),
    "project_id": os.getenv("WATSONX_PROJECT_ID", ""),
    "url": os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com"),
    "generation_model": os.getenv("WATSONX_GENERATION_MODEL", "ibm/granite-13b-chat-v2"),
    "enrichment_model": os.getenv("WATSONX_ENRICHMENT_MODEL", "ibm/granite-8b-japanese"),  # Lightweight model
}

# External Model Configuration (HuggingFace models used directly)
# PIPELINE_CONFIGS references these entries, and validate_model_config() checks
# that the "default" profile still agrees with them.
EXTERNAL_MODELS = {
    "embedding_model": "Qwen/Qwen3-Embedding-0.6B",  # HuggingFace embedding model (1024 dims - fresh start)
    "reranker_model": "answerdotai/answerai-colbert-small-v1",  # ColBERT reranker
    "vision_model": "Qwen/Qwen-VL-Chat",  # Vision model for multimodal
    "fallback_reranker": "BAAI/bge-reranker-base",  # Backup reranker
}

# ============================================================================
# 🔧 PIPELINE CONFIGURATIONS
# ============================================================================

# Named pipeline profiles. get_agent(mode) looks a profile up by name and falls
# back to "default" for unknown names; run_indexing() indexes with
# PIPELINE_CONFIGS[config_mode] directly.
PIPELINE_CONFIGS = {
    "default": {
        "description": "Production-ready pipeline with hybrid search, AI reranking, and verification",
        # Where the indexes live on disk (paths are relative to the working directory)
        "storage": {
            "lancedb_uri": "./lancedb",
            "text_table_name": "text_pages_v3", 
            "image_table_name": "image_pages_v3",
            "bm25_path": "./index_store/bm25",
            "graph_path": "./index_store/graph/knowledge_graph.gml"
        },
        "retrieval": {
            "retriever": "multivector",
            "search_type": "hybrid",
            # NOTE(review): IndexingPipeline reads retrieval["latechunk"] (and its
            # "lancedb_table_name"), not "late_chunking"/"table_suffix" - presumably
            # this block is consumed by another component; confirm the key names.
            "late_chunking": {
                "enabled": True,
                "table_suffix": "_lc_v3"
        },
            "dense": { 
                "enabled": True,
                "weight": 0.7
            },
            "bm25": { 
                "enabled": True,
                "index_name": "rag_bm25_index"
            },
            # Graph retrieval is off by default; the path matches storage["graph_path"]
            "graph": { 
                "enabled": False,
                "graph_path": "./index_store/graph/knowledge_graph.gml"
            }
        },
        # 🎯 EMBEDDING MODEL: Uses HuggingFace Qwen model directly
        "embedding_model_name": EXTERNAL_MODELS["embedding_model"],
        # 🎯 VISION MODEL: For multimodal capabilities  
        "vision_model_name": EXTERNAL_MODELS["vision_model"],
        # 🎯 RERANKER: AI-powered reranking with ColBERT
        "reranker": {
            "enabled": True, 
            "type": "ai",
            "strategy": "rerankers-lib",
            "model_name": EXTERNAL_MODELS["reranker_model"],
            "top_k": 10
        },
        "query_decomposition": {
            "enabled": True,
            "max_sub_queries": 3,
            "compose_from_sub_answers": True
        },
        "verification": {"enabled": True},
        "retrieval_k": 20,
        "context_window_size": 0,
        "semantic_cache_threshold": 0.98,
        "cache_scope": "global",
        # 🔧 Contextual enrichment configuration
        "contextual_enricher": {
            "enabled": True,
            "window_size": 1
        },
        # 🔧 Indexing configuration (batch sizes read by IndexingPipeline)
        "indexing": {
            "embedding_batch_size": 50,
            "enrichment_batch_size": 10,
            "enable_progress_tracking": True
        }
    },
    "fast": {
        "description": "Speed-optimized pipeline with minimal overhead",
        # Same tables as "default", but no graph path is configured
        "storage": {
            "lancedb_uri": "./lancedb",
            "text_table_name": "text_pages_v3",
            "image_table_name": "image_pages_v3", 
            "bm25_path": "./index_store/bm25"
        },
        "retrieval": {
            "retriever": "multivector",
            "search_type": "vector_only",
            "late_chunking": {"enabled": False},
            "dense": {"enabled": True}
        },
        "embedding_model_name": EXTERNAL_MODELS["embedding_model"],
        "reranker": {"enabled": False},
        "query_decomposition": {"enabled": False},
        "verification": {"enabled": False},
        "retrieval_k": 10,
        "context_window_size": 0,
        # 🔧 Contextual enrichment (disabled for speed)
        "contextual_enricher": {
            "enabled": False,
            "window_size": 1
        },
        # 🔧 Indexing configuration
        "indexing": {
            "embedding_batch_size": 100,
            "enrichment_batch_size": 50,
            "enable_progress_tracking": False
        }
    },
    # NOTE(review): the two entries below sit at profile level but are not full
    # pipeline profiles (no "storage", "retrieval", ...). get_agent("bm25") or
    # get_agent("graph_rag") would hand them to Agent as a pipeline config -
    # presumably they were meant to live inside a profile; confirm before relying on them.
    "bm25": {
        "enabled": True,
        "index_name": "rag_bm25_index"
    },
    "graph_rag": {
        "enabled": False, # Keep disabled for now unless specified
    }
}

# ============================================================================
# 🏭 FACTORY FUNCTIONS
# ============================================================================

def get_agent(mode: str = "default") -> Agent:
    """
    Build a RAG ``Agent`` for the requested configuration profile.

    The LLM client is chosen from ``LLM_BACKEND``: Watson X when it equals
    "watsonx" (case-insensitive), Ollama otherwise.

    Args:
        mode: Name of a profile in ``PIPELINE_CONFIGS`` (e.g. "default", "fast").
            Unknown names fall back to the "default" profile.

    Returns:
        Configured Agent instance

    Raises:
        ValueError: If the Watson X backend is selected but its API key or
            project id is not configured.
    """
    load_dotenv()

    use_watsonx = LLM_BACKEND.lower() == "watsonx"

    if not use_watsonx:
        llm_client = OllamaClient(host=OLLAMA_CONFIG["host"])
        llm_config = OLLAMA_CONFIG
        print(f"🔧 Using Ollama backend")
    else:
        # Imported lazily so the Watson X client is only needed when selected.
        from rag_system.utils.watsonx_client import WatsonXClient

        credentials_missing = not WATSONX_CONFIG["api_key"] or not WATSONX_CONFIG["project_id"]
        if credentials_missing:
            raise ValueError(
                "Watson X configuration incomplete. Please set WATSONX_API_KEY and WATSONX_PROJECT_ID "
                "environment variables."
            )

        llm_client = WatsonXClient(
            api_key=WATSONX_CONFIG["api_key"],
            project_id=WATSONX_CONFIG["project_id"],
            url=WATSONX_CONFIG["url"]
        )
        llm_config = WATSONX_CONFIG
        print(f"🔧 Using Watson X backend with granite models")

    # Unknown modes silently use the default profile.
    pipeline_config = PIPELINE_CONFIGS.get(mode, PIPELINE_CONFIGS['default'])

    return Agent(
        pipeline_configs=pipeline_config,
        llm_client=llm_client,
        ollama_config=llm_config
    )

def validate_model_config():
    """
    Check that the "default" pipeline profile agrees with ``EXTERNAL_MODELS``.

    Compares the profile's embedding model and reranker model against the
    corresponding ``EXTERNAL_MODELS`` entries.

    Returns:
        True when both checks pass.

    Raises:
        ValueError: If configuration conflicts are detected
    """
    print("🔍 Validating model configuration...")

    default_profile = PIPELINE_CONFIGS["default"]

    # Embedding model must match the externally configured one.
    configured_embedding = default_profile["embedding_model_name"]
    expected_embedding = EXTERNAL_MODELS["embedding_model"]
    if configured_embedding != expected_embedding:
        raise ValueError(f"Embedding model mismatch: {configured_embedding} != {expected_embedding}")

    # Same check for the reranker.
    configured_reranker = default_profile["reranker"]["model_name"]
    expected_reranker = EXTERNAL_MODELS["reranker_model"]
    if configured_reranker != expected_reranker:
        raise ValueError(f"Reranker model mismatch: {configured_reranker} != {expected_reranker}")

    print("✅ Model configuration validation passed!")
    return True

# ============================================================================
# 🚀 UTILITY FUNCTIONS  
# ============================================================================

def run_indexing(docs_path: str, config_mode: str = "default"):
    """
    Runs the indexing pipeline for the specified documents.

    Args:
        docs_path: Either a directory (every ``*.pdf`` directly inside it is
            indexed), the path of a single file, or a list/tuple of explicit
            file paths (indexed as given).
        config_mode: Key of the profile in ``PIPELINE_CONFIGS`` to index with.

    Raises:
        KeyError: If ``config_mode`` is not a key of ``PIPELINE_CONFIGS``.
        ValueError: If ``validate_model_config()`` detects a conflict.
        OSError: If ``docs_path`` is a string that is neither an existing file
            nor a listable directory.
    """
    print(f"📚 Starting indexing for documents in: {docs_path}")
    validate_model_config()

    # Fail fast on an unknown profile, before any model is loaded.
    pipeline_config = PIPELINE_CONFIGS[config_mode]

    # Resolve the list of files to index.
    if not docs_path:
        pdf_files = []
    elif isinstance(docs_path, (list, tuple)):
        pdf_files = [str(path) for path in docs_path]
    elif os.path.isfile(docs_path):
        pdf_files = [docs_path]
    else:
        pdf_files = [os.path.join(docs_path, f) for f in os.listdir(docs_path) if f.endswith(".pdf")]

    if not pdf_files:
        print("No PDF files found to index.")
        return

    # Local import to avoid circular dependencies
    from rag_system.pipelines.indexing_pipeline import IndexingPipeline

    # IndexingPipeline needs an LLM client and the model configuration in
    # addition to the pipeline profile, and its entry point is run().
    ollama_client = OllamaClient(host=OLLAMA_CONFIG["host"])
    indexing_pipeline = IndexingPipeline(pipeline_config, ollama_client, OLLAMA_CONFIG)

    # Process all documents through the pipeline
    indexing_pipeline.run(pdf_files)
    print("✅ Indexing complete.")

def run_chat(query: str):
    """
    Answer ``query`` with the agentic RAG pipeline ("default" profile, Ollama client).

    Returns:
        A JSON string: the agent's result on success, or ``{"error": ...}`` when
        configuration validation or client construction fails.
    """
    def _error_json(message):
        # Error payloads use the same 2-space indentation as successful results.
        return json.dumps({"error": message}, indent=2)

    try:
        validate_model_config()
        client = OllamaClient(OLLAMA_CONFIG["host"])
    except ConnectionError as conn_err:
        print(conn_err)
        return _error_json(str(conn_err))
    except ValueError as cfg_err:
        message = f"Configuration Error: {cfg_err}"
        print(message)
        return _error_json(message)

    rag_agent = Agent(PIPELINE_CONFIGS['default'], client, OLLAMA_CONFIG)
    answer = rag_agent.run(query)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    return json.dumps(answer, indent=2, ensure_ascii=False)

def show_graph():
    """
    Loads and displays the knowledge graph of the "default" pipeline profile.

    The graph location is taken from the profile's ``retrieval.graph.graph_path``
    (the path the indexing pipeline writes to), falling back to
    ``storage.graph_path``.  Nodes and edges are printed; a matplotlib
    visualisation is attempted afterwards and skipped with a message if
    matplotlib is missing or plotting fails.
    """
    import networkx as nx

    default_profile = PIPELINE_CONFIGS["default"]
    graph_path = (
        default_profile.get("retrieval", {}).get("graph", {}).get("graph_path")
        or default_profile.get("storage", {}).get("graph_path")
    )
    if not graph_path or not os.path.exists(graph_path):
        print("Knowledge graph not found. Please run the 'index' command first.")
        return

    G = nx.read_gml(graph_path)
    print("--- Knowledge Graph ---")
    print("Nodes:", G.nodes(data=True))
    print("Edges:", G.edges(data=True))
    print("---------------------")

    # Optional: Visualize the graph
    try:
        # Imported here so a missing matplotlib only disables the plot, not the
        # textual dump above.
        import matplotlib.pyplot as plt

        pos = nx.spring_layout(G)
        nx.draw(G, pos, with_labels=True, node_size=2000, node_color="skyblue", font_size=10, font_weight="bold")
        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
        plt.title("Knowledge Graph Visualization")
        plt.show()
    except Exception as e:
        print(f"\nCould not visualize the graph. Matplotlib might not be installed or configured for your environment.")
        print(f"Error: {e}")

def run_api_server():
    """Import the RAG API server module on demand and call its ``start_server()``."""
    import rag_system.api_server as api_server_module

    api_server_module.start_server()

def main():
    """
    Dispatch a positional sub-command taken from ``sys.argv``.

    Supported commands:
        index <docs_dir> [<docs_dir> ...]  index each given path with run_indexing()
        chat <query words...>              answer a query and print the JSON result
        show_graph                         print / plot the knowledge graph
        api                                start the API server

    A usage message is printed (and nothing else happens) when the command or
    one of its required arguments is missing.
    """
    if len(sys.argv) < 2:
        print("Usage: python main.py [index|chat|show_graph|api] [query]")
        return

    command = sys.argv[1]
    if command == "index":
        docs_paths = sys.argv[2:]
        if not docs_paths:
            print("Usage: python main.py index <docs_dir> [<docs_dir> ...]")
            return
        # run_indexing() takes one documents path per call, so index each
        # command-line path separately.
        for docs_path in docs_paths:
            run_indexing(docs_path)
    elif command == "chat":
        if len(sys.argv) < 3:
            print("Usage: python main.py chat <query>")
            return
        query = " ".join(sys.argv[2:])
        # Print the result for command-line usage
        print(run_chat(query))
    elif command == "show_graph":
        show_graph()
    elif command == "api":
        run_api_server()
    else:
        print(f"Unknown command: {command}")

if __name__ == "__main__":
    # Two invocation styles are supported:
    #   python -m rag_system.main <index|chat|show_graph|api> [...]   -> main()
    #   python -m rag_system.main --index <dir> [--config <profile>]  -> argparse below
    # Positional sub-commands are checked first because argparse would reject them
    # as unrecognized arguments.
    positional_commands = {"index", "chat", "show_graph", "api"}

    if len(sys.argv) > 1 and sys.argv[1] in positional_commands:
        main()
    else:
        parser = argparse.ArgumentParser(description="Main entry point for the RAG system.")
        parser.add_argument(
            '--index',
            type=str,
            help='Path to the directory containing documents to index.'
        )
        parser.add_argument(
            '--config',
            type=str,
            default='default',
            help='The configuration profile to use (e.g., "default", "fast").'
        )

        args = parser.parse_args()

        # Load environment variables
        load_dotenv()

        if args.index:
            run_indexing(args.index, args.config)
        else:
            # This is where you might start a server or interactive session
            print("No action specified. Use --index to process documents.")
            # Example of how to get an agent instance
            # agent = get_agent(args.config)
            # print(f"Agent loaded with '{args.config}' config.")


================================================
FILE: rag_system/pipelines/__init__.py
================================================


================================================
FILE: rag_system/pipelines/indexing_pipeline.py
================================================
from typing import List, Dict, Any
import os
import networkx as nx
from rag_system.ingestion.document_converter import DocumentConverter
from rag_system.ingestion.chunking import MarkdownRecursiveChunker
from rag_system.indexing.representations import EmbeddingGenerator, select_embedder
from rag_system.indexing.embedders import LanceDBManager, VectorIndexer
from rag_system.indexing.graph_extractor import GraphExtractor
from rag_system.utils.ollama_client import OllamaClient
from rag_system.indexing.contextualizer import ContextualEnricher
from rag_system.indexing.overview_builder import OverviewBuilder

class IndexingPipeline:
    def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_config: Dict[str, str]):
        self.config = config
        self.llm_client = ollama_client
        self.ollama_config = ollama_config
        self.document_converter = DocumentConverter()
        # Chunker selection: docling (token-based) or legacy (character-based)
        chunker_mode = config.get("chunker_mode", "docling")
        
        # 🔧 Get chunking configuration from frontend parameters
        chunking_config = config.get("chunking", {})
        chunk_size = chunking_config.get("chunk_size", config.get("chunk_size", 1500))
        chunk_overlap = chunking_config.get("chunk_overlap", config.get("chunk_overlap", 200))
        
        print(f"🔧 CHUNKING CONFIG: Size: {chunk_size}, Overlap: {chunk_overlap}, Mode: {chunker_mode}")
        
        if chunker_mode == "docling":
            try:
                from rag_system.ingestion.docling_chunker import DoclingChunker
                self.chunker = DoclingChunker(
                    max_tokens=config.get("max_tokens", chunk_size),
                    overlap=config.get("overlap_sentences", 1),
                    tokenizer_model=config.get("embedding_model_name", "qwen3-embedding-0.6b"),
                )
                print("🪄 Using DoclingChunker for high-recall sentence packing.")
            except Exception as e:
                print(f"⚠️  Failed to initialise DoclingChunker: {e}. Falling back to legacy chunker.")
                self.chunker = MarkdownRecursiveChunker(
                    max_chunk_size=chunk_size,
                    min_chunk_size=min(chunk_overlap, chunk_size // 4),  # Sensible minimum
                    tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
                )
        else:
            self.chunker = MarkdownRecursiveChunker(
                max_chunk_size=chunk_size,
                min_chunk_size=min(chunk_overlap, chunk_size // 4),  # Sensible minimum
                tokenizer_model=config.get("embedding_model_name", "Qwen/Qwen3-Embedding-0.6B")
            )

        retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})
        storage_config = self.config["storage"]
        
        # Get batch processing configuration
        indexing_config = self.config.get("indexing", {})
        self.embedding_batch_size = indexing_config.get("embedding_batch_size", 50)
        self.enrichment_batch_size = indexing_config.get("enrichment_batch_size", 10)
        self.enable_progress_tracking = indexing_config.get("enable_progress_tracking", True)

        # Treat dense retrieval as enabled by default unless explicitly disabled
        dense_cfg = retriever_configs.setdefault("dense", {})
        dense_cfg.setdefault("enabled", True)

        if dense_cfg.get("enabled"):
            # Accept modern keys: db_path or lancedb_path; fall back to legacy lancedb_uri
            db_path = (
                storage_config.get("db_path")
                or storage_config.get("lancedb_path")
                or storage_config.get("lancedb_uri")
            )
            if not db_path:
                raise KeyError(
                    "Storage config must include 'db_path', 'lancedb_path', or 'lancedb_uri' for LanceDB."
                )
            self.lancedb_manager = LanceDBManager(db_path=db_path)
            self.vector_indexer = VectorIndexer(self.lancedb_manager)
            embedding_model = select_embedder(
                self.config.get("embedding_model_name", "BAAI/bge-small-en-v1.5"),
                self.ollama_config.get("host") if isinstance(self.ollama_config, dict) else None,
            )
            self.embedding_generator = EmbeddingGenerator(
                embedding_model=embedding_model, 
                batch_size=self.embedding_batch_size
            )

        if retriever_configs.get("graph", {}).get("enabled"):
            self.graph_extractor = GraphExtractor(
                llm_client=self.llm_client,
                llm_model=self.ollama_config["generation_model"]
            )

        if self.config.get("contextual_enricher", {}).get("enabled"):
            # 🔧 Use frontend enrich_model parameter if provided
            enrichment_model = (
                self.config.get("enrich_model") or  # Frontend parameter
                self.config.get("enrichment_model_name") or  # Alternative config key
                self.ollama_config.get("enrichment_model") or  # Default from ollama config
                self.ollama_config["generation_model"]  # Final fallback
            )
            print(f"🔧 ENRICHMENT MODEL: Using '{enrichment_model}' for contextual enrichment")
            
            self.contextual_enricher = ContextualEnricher(
                llm_client=self.llm_client,
                llm_model=enrichment_model,
                batch_size=self.enrichment_batch_size
            )

        # Overview builder always enabled for triage routing
        ov_path = self.config.get("overview_path")
        self.overview_builder = OverviewBuilder(
            llm_client=self.llm_client,
            model=self.config.get("overview_model_name", self.ollama_config.get("enrichment_model", "qwen3:0.6b")),
            first_n_chunks=self.config.get("overview_first_n_chunks", 5),
            out_path=ov_path if ov_path else None,
        )

        # ------------------------------------------------------------------
        # Late-Chunk encoder initialisation (optional)
        # ------------------------------------------------------------------
        self.latechunk_enabled = retriever_configs.get("latechunk", {}).get("enabled", False)
        if self.latechunk_enabled:
            try:
                from rag_system.indexing.latechunk import LateChunkEncoder
                self.latechunk_cfg = retriever_configs["latechunk"]
                self.latechunk_encoder = LateChunkEncoder(model_name=self.config.get("embedding_model_name", "qwen3-embedding-0.6b"))
            except Exception as e:
                print(f"⚠️  Failed to initialise LateChunkEncoder: {e}. Disabling latechunk retrieval.")
                self.latechunk_enabled = False

    def run(self, file_paths: List[str] | None = None, *, documents: List[str] | None = None):
        """
        Processes and indexes documents based on the pipeline's configuration.
        Accepts legacy keyword *documents* as an alias for *file_paths* so that
        older callers (backend/index builder) keep working.
        """
        # Back-compat shim ---------------------------------------------------
        if file_paths is None and documents is not None:
            file_paths = documents
        if file_paths is None:
            raise TypeError("IndexingPipeline.run() expects 'file_paths' (or alias 'documents') argument")

        print(f"--- Starting indexing process for {len(file_paths)} files. ---")
        
        # Import progress tracking utilities
        from rag_system.utils.batch_processor import timer, ProgressTracker, estimate_memory_usage
        
        with timer("Complete Indexing Pipeline"):
            # Step 1: Document Processing and Chunking
            all_chunks = []
            doc_chunks_map = {}
            with timer("Document Processing & Chunking"):
                file_tracker = ProgressTracker(len(file_paths), "Document Processing")
                
                for file_path in file_paths:
                    try:
                        document_id = os.path.basename(file_path)
                        print(f"Processing: {document_id}")
                        
                        pages_data = self.document_converter.convert_to_markdown(file_path)
                        file_chunks = []
                        
                        for tpl in pages_data:
                            if len(tpl) == 3:
                                markdown_text, metadata, doc_obj = tpl
                                if hasattr(self.chunker, "chunk_document"):
                                    chunks = self.chunker.chunk_document(doc_obj, document_id=document_id, metadata=metadata)
                                else:
                                    chunks = self.chunker.chunk(markdown_text, document_id, metadata)
                            else:
                                markdown_text, metadata = tpl
                                chunks = self.chunker.chunk(markdown_text, document_id, metadata)
                            file_chunks.extend(chunks)
                        
                        # Add a sequential chunk_index to each chunk within the document
                        for i, chunk in enumerate(file_chunks):
                            if 'metadata' not in chunk:
                                chunk['metadata'] = {}
                            chunk['metadata']['chunk_index'] = i
                        
                        # Build and persist document overview (non-blocking errors)
                        try:
                            self.overview_builder.build_and_store(document_id, file_chunks)
                        except Exception as e:
                            print(f"  ⚠️  Failed to create overview for {document_id}: {e}")
                        
                        all_chunks.extend(file_chunks)
                        doc_chunks_map[document_id] = file_chunks  # save for late-chunk step
                        print(f"  Generated {len(file_chunks)} chunks from {document_id}")
                        file_tracker.update(1)
                        
                    except Exception as e:
                        print(f"  ❌ Error processing {file_path}: {e}")
                        file_tracker.update(1, errors=1)
                        continue
                
                file_tracker.finish()

            if not all_chunks:
                print("No text chunks were generated. Skipping indexing.")
                return

            print(f"\n✅ Generated {len(all_chunks)} text chunks total.")
            memory_mb = estimate_memory_usage(all_chunks)
            print(f"📊 Estimated memory usage: {memory_mb:.1f}MB")

            retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})

            # Step 3: Optional Contextual Enrichment (before indexing for consistency)
            enricher_config = self.config.get("contextual_enricher", {})
            enricher_enabled = enricher_config.get("enabled", False)
            
            print(f"\n🔍 CONTEXTUAL ENRICHMENT DEBUG:")
            print(f"   Config present: {bool(enricher_config)}")
            print(f"   Enabled: {enricher_enabled}")
            print(f"   Has enricher object: {hasattr(self, 'contextual_enricher')}")
            
            if hasattr(self, 'contextual_enricher') and enricher_enabled:
                with timer("Contextual Enrichment"):
                    window_size = enricher_config.get("window_size", 1)
                    print(f"\n🚀 CONTEXTUAL ENRICHMENT ACTIVE!")
                    print(f"   Window size: {window_size}")
                    print(f"   Model: {self.contextual_enricher.llm_model}")
                    print(f"   Batch size: {self.contextual_enricher.batch_size}")
                    print(f"   Processing {len(all_chunks)} chunks...")
                    
                    # Show before/after example
                    if all_chunks:
                        print(f"   Example BEFORE: '{all_chunks[0]['text'][:100]}...'")
                    
                    # This modifies the 'text' field in each chunk dictionary
                    all_chunks = self.contextual_enricher.enrich_chunks(all_chunks, window_size=window_size)
                    
                    if all_chunks:
                        print(f"   Example AFTER: '{all_chunks[0]['text'][:100]}...'")
                    
                    print(f"✅ Enriched {len(all_chunks)} chunks with context for indexing.")
            else:
                print(f"⚠️  CONTEXTUAL ENRICHMENT SKIPPED:")
                if not hasattr(self, 'contextual_enricher'):
                    print(f"   Reason: No enricher object (config enabled={enricher_enabled})")
                elif not enricher_enabled:
                    print(f"   Reason: Disabled in config")
                print(f"   Chunks will be indexed without contextual enrichment.")

            # Step 4: Create BM25 Index from enriched chunks (for consistency with vector index)
            if hasattr(self, 'vector_indexer') and hasattr(self, 'embedding_generator'):
                with timer("Vector Embedding & Indexing"):
                    table_name = self.config["storage"].get("text_table_name") or retriever_configs.get("dense", {}).get("lancedb_table_name", "default_text_table")
                    print(f"\n--- Generating embeddings with {self.config.get('embedding_model_name')} ---")
                    
                    embeddings = self.embedding_generator.generate(all_chunks)
                    
                    print(f"\n--- Indexing {len(embeddings)} vectors into LanceDB table: {table_name} ---")
                    self.vector_indexer.index(table_name, all_chunks, embeddings)
                    print("✅ Vector embeddings indexed successfully")

                    # Create FTS index on the 'text' field after adding data
                    print(f"\n--- Ensuring Full-Text Search (FTS) index on table '{table_name}' ---")
                    try:
                        tbl = self.lancedb_manager.get_table(table_name)
                        # LanceDB's default index name is "text_idx" while older
                        # revisions of this pipeline used our own name "fts_text".
                        # Guard against both so we don't attempt to create a     
                        # duplicate index and trigger a LanceError.
                        existing_indices = [idx.name for idx in tbl.list_indices()]
                        if not any(name in existing_indices for name in ("text_idx", "fts_text")):
                            # Use LanceDB default index naming ("text_idx")
                            tbl.create_fts_index(
                                "text",
                                use_tantivy=False,
                                replace=False,
                            )
                            print("✅ FTS index created successfully (using Lance native FTS).")
                        else:
                            print("ℹ️  FTS index already exists – skipped creation.")
                    except Exception as e:
                        print(f"❌ Failed to create/verify FTS index: {e}")

                    # ---------------------------------------------------
                    # Late-Chunk Embedding + Indexing (optional)
                    # ---------------------------------------------------
                    if self.latechunk_enabled:
                        with timer("Late-Chunk Embedding & Indexing"):
                            lc_table_name = self.latechunk_cfg.get("lancedb_table_name", f"{table_name}_lc")
                            print(f"\n--- Generating late-chunk embeddings (table={lc_table_name}) ---")

                            total_lc_vecs = 0
                            for doc_id, doc_chunks in doc_chunks_map.items():
                                # Build full text and span list
                                full_text_parts = []
                                spans = []
                                current_pos = 0
                                for ch in doc_chunks:
                                    ch_text = ch["text"]
                                    full_text_parts.append(ch_text)
                                    start = current_pos
                                    end = start + len(ch_text)
                                    spans.append((start, end))
                                    current_pos = end + 1  # +1 for newline to join later
                                full_doc = "\n".join(full_text_parts)

                                try:
                                    lc_vecs = self.latechunk_encoder.encode(full_doc, spans)
                                except Exception as e:
                                    print(f"⚠️  LateChunk encode failed for {doc_id}: {e}")
                                    continue

                                if len(doc_chunks) == 0 or len(lc_vecs) == 0:
                                    # Nothing to index for this document
                                    continue
                                if len(lc_vecs) != len(doc_chunks):
                                    print(f"⚠️  Mismatch LC vecs ({len(lc_vecs)}) vs chunks ({len(doc_chunks)}) for {doc_id}. Skipping.")
                                    continue

                                self.vector_indexer.index(lc_table_name, doc_chunks, lc_vecs)
                                total_lc_vecs += len(lc_vecs)

                            print(f"✅ Late-chunk vectors indexed: {total_lc_vecs}")
                
            # Step 6: Knowledge Graph Extraction (Optional)
            if hasattr(self, 'graph_extractor'):
                with timer("Knowledge Graph Extraction"):
                    graph_path = retriever_configs.get("graph", {}).get("graph_path", "./index_store/graph/default_graph.gml")
                    print(f"\n--- Building and saving knowledge graph to: {graph_path} ---")
                    
                    graph_data = self.graph_extractor.extract(all_chunks)
                    G = nx.DiGraph()
                    for entity in graph_data['entities']:
                        G.add_node(entity['id'], type=entity.get('type', 'Unknown'), properties=entity.get('properties', {}))
                    for rel in graph_data['relationships']:
                        G.add_edge(rel['source'], rel['target'], label=rel['label'])
                    
                    os.makedirs(os.path.dirname(graph_path), exist_ok=True)
                    nx.write_gml(G, graph_path)
                    print(f"✅ Knowledge graph saved successfully.")
                    
        print("\n--- ✅ Indexing Complete ---")
        self._print_final_statistics(len(file_paths), len(all_chunks))
    
    def _print_final_statistics(self, num_files: int, num_chunks: int):
        """Print final indexing statistics"""
        print(f"\n📈 Final Statistics:")
        print(f"  Files processed: {num_files}")
        print(f"  Chunks generated: {num_chunks}")
        print(f"  Average chunks per file: {num_chunks/num_files:.1f}")
        
        # Component status
        components = []
        if hasattr(self, 'contextual_enricher'):
            components.append("✅ Contextual Enrichment")
        if hasattr(self, 'vector_indexer'):
            components.append("✅ Vector & FTS Index")
        if hasattr(self, 'graph_extractor'):
            components.append("✅ Knowledge Graph")
            
        print(f"  Components: {', '.join(components)}")
        print(f"  Batch sizes: Embeddings={self.embedding_batch_size}, Enrichment={self.enrichment_batch_size}")


================================================
FILE: rag_system/pipelines/retrieval_pipeline.py
================================================
import pymupdf
from typing import List, Dict, Any, Tuple, Optional
from PIL import Image
import concurrent.futures
import time
import json
import lancedb
import logging
import math
import numpy as np
from threading import Lock

from rag_system.utils.ollama_client import OllamaClient
from rag_system.retrieval.retrievers import MultiVectorRetriever, GraphRetriever
from rag_system.indexing.multimodal import LocalVisionModel
from rag_system.indexing.representations import select_embedder
from rag_system.indexing.embedders import LanceDBManager
from rag_system.rerankers.reranker import QwenReranker
from rag_system.rerankers.sentence_pruner import SentencePruner
# from rag_system.indexing.chunk_store import ChunkStore

import os
from PIL import Image

# ---------------------------------------------------------------------------
# Thread-safety helpers
# ---------------------------------------------------------------------------
# All three locks are module-level, so they are shared by every
# RetrievalPipeline instance in the process.

# 1. ColBERT (via `rerankers` lib) is not thread-safe.  We protect the actual
#    `.rank()` call with `_rerank_lock` (taken in `RetrievalPipeline.run` on
#    the "rerankers-lib" strategy path).
_rerank_lock: Lock = Lock()

# 2. Loading a large cross-encoder or ColBERT model can easily take >1 GB of
#    RAM.  When multiple sub-queries are processed in parallel they may try to
#    instantiate the reranker simultaneously, which results in PyTorch meta
#    tensor errors.  We therefore guard the *initialisation* with its own
#    lock so only one thread carries out the heavy `from_pretrained()` call.
#    Taken in `RetrievalPipeline._get_ai_reranker`.
_ai_reranker_init_lock: Lock = Lock()

# 3. Lock to serialise first-time Provence model load; taken in
#    `RetrievalPipeline._get_sentence_pruner` around `SentencePruner()`.
_sentence_pruner_lock: Lock = Lock()

class RetrievalPipeline:
    """
    Orchestrates the state-of-the-art multimodal RAG pipeline.
    """
    def __init__(self, config: Dict[str, Any], ollama_client: OllamaClient, ollama_config: Dict[str, Any]):
        self.config = config
        self.ollama_config = ollama_config
        self.ollama_client = ollama_client
        
        # Support both legacy "retrievers" key and newer "retrieval" key
        self.retriever_configs = self.config.get("retrievers") or self.config.get("retrieval", {})
        self.storage_config = self.config["storage"]
        
        # Defer initialization to just-in-time methods
        self.db_manager = None
        self.text_embedder = None
        self.dense_retriever = None
        self.bm25_retriever = None
        # Use a private attribute to avoid clashing with the public property
        self._graph_retriever = None
        self.reranker = None
        self.ai_reranker = None

    def _get_db_manager(self):
        if self.db_manager is None:
            # Accept either "db_path" (preferred) or legacy "lancedb_uri"
            db_path = self.storage_config.get("db_path") or self.storage_config.get("lancedb_uri")
            if not db_path:
                raise ValueError("Storage config must contain 'db_path' or 'lancedb_uri'.")
            self.db_manager = LanceDBManager(db_path=db_path)
        return self.db_manager

    def _get_text_embedder(self):
        if self.text_embedder is None:
            from rag_system.indexing.representations import select_embedder
            self.text_embedder = select_embedder(
                self.config.get("embedding_model_name", "BAAI/bge-small-en-v1.5"),
                self.ollama_config.get("host") if isinstance(self.ollama_config, dict) else None,
            )
        return self.text_embedder

    def _get_dense_retriever(self):
        """Ensure a dense MultiVectorRetriever is always available unless explicitly disabled."""
        if self.dense_retriever is None:
            # If the config explicitly sets dense.enabled to False, respect it
            if self.retriever_configs.get("dense", {}).get("enabled", True) is False:
                return None

            try:
                db_manager = self._get_db_manager()
                text_embedder = self._get_text_embedder()
                fusion_cfg = self.config.get("fusion", {})
                self.dense_retriever = MultiVectorRetriever(
                    db_manager,
                    text_embedder,
                    vision_model=None,
                    fusion_config=fusion_cfg,
                )
            except Exception as e:
                print(f"❌ Failed to initialise dense retriever: {e}")
                self.dense_retriever = None
        return self.dense_retriever

    def _get_bm25_retriever(self):
        if self.bm25_retriever is None and self.retriever_configs.get("bm25", {}).get("enabled"):
            try:
                print(f"🔧 Lazily initializing BM25 retriever...")
                self.bm25_retriever = BM25Retriever(
                    index_path=self.storage_config["bm25_path"],
                    index_name=self.retriever_configs["bm25"]["index_name"]
                )
                print("✅ BM25 retriever initialized successfully")
            except Exception as e:
                print(f"❌ Failed to initialize BM25 retriever on demand: {e}")
                # Keep it None so we don't try again
        return self.bm25_retriever

    def _get_graph_retriever(self):
        if self._graph_retriever is None and self.retriever_configs.get("graph", {}).get("enabled"):
            self._graph_retriever = GraphRetriever(graph_path=self.storage_config["graph_path"])
        return self._graph_retriever

    def _get_reranker(self):
        """Initializes the reranker for hybrid search score fusion."""
        reranker_config = self.config.get("reranker", {})
        # This is for the LanceDB internal reranker, not the AI one.
        if self.reranker is None and reranker_config.get("type") == "linear_combination":
            rerank_weight = reranker_config.get("weight", 0.5) 
            self.reranker = lancedb.rerankers.LinearCombinationReranker(weight=rerank_weight)
            print(f"✅ Initialized LinearCombinationReranker with weight {rerank_weight}")
        return self.reranker

    def _get_ai_reranker(self):
        """Initializes a dedicated AI-based reranker."""
        reranker_config = self.config.get("reranker", {})
        if self.ai_reranker is None and reranker_config.get("enabled"):
            # Serialise first-time initialisation so only one thread attempts
            # to load the (very large) model.  Other threads will wait and use
            # the instance once ready, preventing the meta-tensor crash.
            with _ai_reranker_init_lock:
                # Another thread may have completed init while we waited
                if self.ai_reranker is None:
                    try:
                        model_name = reranker_config.get("model_name")
                        strategy = reranker_config.get("strategy", "qwen")

                        if strategy == "rerankers-lib":
                            print(f"🔧 Initialising Answer.AI ColBERT reranker ({model_name}) via rerankers lib…")
                            from rerankers import Reranker
                            self.ai_reranker = Reranker(model_name, model_type="colbert")
                        else:
                            print(f"🔧 Lazily initializing Qwen reranker ({model_name})…")
                            self.ai_reranker = QwenReranker(model_name=model_name)

                        print("✅ AI reranker initialized successfully.")
                    except Exception as e:
                        # Leave as None so the pipeline can proceed without reranking
                        print(f"❌ Failed to initialize AI reranker: {e}")
        return self.ai_reranker

    def _get_sentence_pruner(self):
        if getattr(self, "_sentence_pruner", None) is None:
            with _sentence_pruner_lock:
                if getattr(self, "_sentence_pruner", None) is None:
                    self._sentence_pruner = SentencePruner()
        return self._sentence_pruner

    def _get_surrounding_chunks_lancedb(self, chunk: Dict[str, Any], window_size: int) -> List[Dict[str, Any]]:
        """
        Retrieves a window of chunks around a central chunk using LanceDB.
        """
        db_manager = self._get_db_manager()
        if not db_manager:
            return [chunk]

        # Extract identifiers needed for the query
        document_id = chunk.get("document_id")
        chunk_index = chunk.get("chunk_index")

        # If essential identifiers are missing, return the chunk itself
        if document_id is None or chunk_index is None or chunk_index == -1:
            return [chunk]

        table_name = self.config["storage"]["text_table_name"]
        try:
            tbl = db_manager.get_table(table_name)
        except Exception:
            # If the table can't be opened, we can't get surrounding chunks
            return [chunk]

        # Define the window for the search
        start_index = max(0, chunk_index - window_size)
        end_index = chunk_index + window_size
        
        # Construct the SQL filter for an efficient metadata-based search
        sql_filter = f"document_id = '{document_id}' AND chunk_index >= {start_index} AND chunk_index <= {end_index}"
        
        try:
            # Execute a filter-only search, which is very fast on indexed metadata
            results = tbl.search().where(sql_filter).to_list()
            
            # The results must be sorted by chunk_index to maintain logical order
            results.sort(key=lambda c: c['chunk_index'])

            # The 'metadata' field is a JSON string and needs to be parsed
            for res in results:
                if isinstance(res.get('metadata'), str):
                    try:
                        res['metadata'] = json.loads(res['metadata'])
                    except json.JSONDecodeError:
                        res['metadata'] = {} # Handle corrupted metadata gracefully
            return results
        except Exception:
            # If the query fails for any reason, fall back to the single chunk
            return [chunk]

    def _synthesize_final_answer(self, query: str, facts: str, *, event_callback=None) -> str:
        """Uses a text LLM to synthesize a final answer from extracted facts."""
        prompt = f"""
You are an AI assistant specialised in answering questions from retrieved context.

Context you receive
• VERIFIED FACTS – text snippets retrieved from the user's documents. Some may be irrelevant noise.  
• ORIGINAL QUESTION – the user's actual query.

Instructions
1. Evaluate each snippet for relevance to the ORIGINAL QUESTION; ignore those that do not help answer it.  
2. Synthesise an answer **using only information from the relevant snippets**.  
3. If snippets contradict one another, mention the contradiction explicitly.  
4. If the snippets do not contain the needed information, reply exactly with:  
   "I could not find that information in the provided documents."  
5. Provide a thorough, well-structured answer. Use paragraphs or bullet points where helpful, and include any relevant numbers/names exactly as they appear. There is **no strict sentence limit**, but aim for clarity over brevity.  
6. Do **not** introduce external knowledge unless step 4 applies; in that case you may add a clearly-labelled "General knowledge" sentence after the required statement.

Output format
Answer:
<your answer here>

–––––  Retrieved Snippets  –––––
{facts}
––––––––––––––––––––––––––––––

ORIGINAL QUESTION: "{query}"
"""
        # Stream the answer token-by-token so the caller can forward them as SSE
        answer_parts: list[str] = []
        for tok in self.ollama_client.stream_completion(
            model=self.ollama_config["generation_model"],
            prompt=prompt,
        ):
            answer_parts.append(tok)
            if event_callback:
                event_callback("token", {"text": tok})

        return "".join(answer_parts)

    def run(self, query: str, table_name: str = None, window_size_override: Optional[int] = None, event_callback=None) -> Dict[str, Any]:
        """Answer one query: retrieve, optionally rerank, expand, prune, then synthesise.

        Stages, in order:
          1. Dense/hybrid retrieval of ``retrieval_k`` chunks (config, default 10).
          2. Optional late-chunk retrieval and ±1 sibling merging.
          3. Optional AI reranking (``reranker.enabled``).
          4. Optional context-window expansion around each surviving chunk.
          5. Optional Provence sentence pruning (``provence.enabled``).
          6. Answer synthesis from the concatenated chunk texts.

        Args:
            query: The user question.
            table_name: Optional text table to search. When given it is also
                written into ``self.storage_config["text_table_name"]``, so it
                remains in effect for later calls on this instance.
            window_size_override: When not ``None``, replaces the configured
                ``context_window_size`` (default 1). A value of 0 or less
                skips context expansion.
            event_callback: Optional callable invoked as
                ``event_callback(event_name, payload_dict)`` at stage
                boundaries ("retrieval_started", "retrieval_done",
                "rerank_started", "rerank_done", "context_expand_started",
                "context_expand_done", "prune_started", "prune_done") and,
                via ``_synthesize_final_answer``, once per generated token.

        Returns:
            A dict with ``"answer"`` (str) and ``"source_documents"`` (the
            final chunk dicts, with ``vector`` and ``_distance`` removed).
            When no chunks survive, a fixed "could not find" answer and an
            empty document list are returned without calling the LLM.
        """
        start_time = time.time()
        retrieval_k = self.config.get("retrieval_k", 10)

        logger = logging.getLogger(__name__)
        logger.debug("--- Running Hybrid Search for query '%s' (table=%s) ---", query, table_name or self.storage_config.get("text_table_name"))

        # If a custom table_name is provided, propagate it to storage config so helper methods use it
        # (this mutates shared instance state; the override is not undone afterwards)
        if table_name:
            self.storage_config["text_table_name"] = table_name

        if event_callback:
            event_callback("retrieval_started", {})
        # Unified retrieval using the refactored MultiVectorRetriever
        dense_retriever = self._get_dense_retriever()
        # Get the LanceDB reranker for initial score fusion
        lancedb_reranker = self._get_reranker()

        retrieved_docs = []
        if dense_retriever:
            retrieved_docs = dense_retriever.retrieve(
                text_query=query,
                table_name=table_name or self.storage_config["text_table_name"],
                k=retrieval_k,
                reranker=lancedb_reranker # Pass the reranker to enable hybrid search
            )

        # ---------------------------------------------------------------
        # Late-Chunk retrieval (optional)
        # ---------------------------------------------------------------
        if self.retriever_configs.get("latechunk", {}).get("enabled"):
            lc_table = self.retriever_configs["latechunk"].get("lancedb_table_name")
            if lc_table:
                try:
                    # NOTE(review): `dense_retriever` may be None here (dense
                    # disabled or failed to initialise); the resulting
                    # AttributeError is caught below and reported as a
                    # late-chunk failure.
                    lc_docs = dense_retriever.retrieve(
                        text_query=query,
                        table_name=lc_table,
                        k=retrieval_k,
                        reranker=lancedb_reranker,
                    )
                    retrieved_docs.extend(lc_docs)
                except Exception as e:
                    print(f"⚠️  Late-chunk retrieval failed: {e}")

        if event_callback:
            event_callback("retrieval_done", {"count": len(retrieved_docs)})

        retrieval_time = time.time() - start_time
        logger.debug("Retrieved %s chunks in %.2fs", len(retrieved_docs), retrieval_time)

        # -----------------------------------------------------------
        #  LATE-CHUNK MERGING (merge ±1 sub-vector into central hit)
        # -----------------------------------------------------------
        if self.retriever_configs.get("latechunk", {}).get("enabled") and retrieved_docs:
            merged_count = 0
            for doc in retrieved_docs:
                try:
                    cid = doc.get("chunk_id")  # not used below
                    # NOTE(review): when the doc has no "metadata" key, `meta`
                    # is a throwaway dict and the merged flag set below is not
                    # stored on the doc.
                    meta = doc.get("metadata", {})
                    if meta.get("latechunk_merged"):
                        continue  # already processed
                    doc_id = doc.get("document_id")
                    cidx = doc.get("chunk_index")
                    if doc_id is None or cidx is None or cidx == -1:
                        continue
                    # Fetch neighbouring late-chunks inside same document (±1)
                    siblings = self._get_surrounding_chunks_lancedb(doc, window_size=1)
                    # Keep only same document_id and ordered by chunk_index
                    siblings = [s for s in siblings if s.get("document_id") == doc_id]
                    siblings.sort(key=lambda s: s.get("chunk_index", 0))
                    merged_text = " \n".join(s.get("text", "") for s in siblings)
                    if merged_text:
                        # The hit's own text is replaced in place by the merged window
                        doc["text"] = merged_text
                        meta["latechunk_merged"] = True
                        merged_count += 1
                except Exception as e:
                    print(f"⚠️  Late-chunk merge failed for chunk {doc.get('chunk_id')}: {e}")
            if merged_count:
                print(f"🪄 Late-chunk merging applied to {merged_count} retrieved chunks.")

        # --- AI Reranking Step ---
        ai_reranker = self._get_ai_reranker()
        if ai_reranker and retrieved_docs:
            if event_callback:
                event_callback("rerank_started", {"count": len(retrieved_docs)})
            print(f"\n--- Reranking top {len(retrieved_docs)} docs with AI model... ---")
            start_rerank_time = time.time()

            rerank_cfg = self.config.get("reranker", {})
            top_k_cfg = rerank_cfg.get("top_k")
            top_percent = rerank_cfg.get("top_percent")  # value in range 0–1

            # `top_percent` takes precedence over `top_k`; with neither set,
            # every retrieved doc is kept.
            if top_percent is not None:
                try:
                    pct = float(top_percent)
                    # Range check via assert; the AssertionError is handled by
                    # the except below. NOTE(review): asserts are stripped
                    # under `python -O`, so the check would vanish there.
                    assert 0 < pct <= 1
                    top_k = max(1, int(len(retrieved_docs) * pct))
                except Exception:
                    print("⚠️  Invalid top_percent value; falling back to top_k")
                    top_k = top_k_cfg or len(retrieved_docs)
            else:
                top_k = top_k_cfg or len(retrieved_docs)

            strategy = self.config.get("reranker", {}).get("strategy", "qwen")

            if strategy == "rerankers-lib":
                texts = [d['text'] for d in retrieved_docs]
                # ColBERT's Rust backend isn't Sync; serialise calls.
                with _rerank_lock:
                    ranked = ai_reranker.rank(query=query, docs=texts)
                # ranked is RankedResults; convert to list of (score, idx)
                try:
                    pairs = [(r.score, r.document.doc_id) for r in ranked.results]
                    if any(p[1] is None for p in pairs):
                        # NOTE(review): this fallback uses the rank position as
                        # the index into `retrieved_docs` — confirm that is the
                        # intended mapping when doc ids are missing.
                        pairs = [(r.score, i) for i, r in enumerate(ranked.results)]
                except Exception:
                    pairs = ranked
                # Keep only top_k results if requested
                if top_k is not None and len(pairs) > top_k:
                    pairs = pairs[:top_k]
                # Dict union builds a new dict per doc; the originals are not mutated
                reranked_docs = [retrieved_docs[idx] | {"rerank_score": score} for score, idx in pairs]
            else:
                try:
                    reranked_docs = ai_reranker.rerank(query, retrieved_docs, top_k=top_k)
                except TypeError:
                    # Fallback for rerankers exposing `rank(query, texts, top_k=...)`
                    # that yields (score, index) pairs
                    texts = [d['text'] for d in retrieved_docs]
                    pairs = ai_reranker.rank(query, texts, top_k=top_k)
                    reranked_docs = [retrieved_docs[idx] | {"rerank_score": score} for score, idx in pairs]

            rerank_time = time.time() - start_rerank_time
            print(f"✅ Reranking completed in {rerank_time:.2f}s. Refined to {len(reranked_docs)} docs.")
            if event_callback:
                event_callback("rerank_done", {"count": len(reranked_docs)})
        else:
            # If no AI reranker, proceed with the initially retrieved docs
            reranked_docs = retrieved_docs

        # --- Context window expansion ---
        window_size = self.config.get("context_window_size", 1)
        if window_size_override is not None:
            window_size = window_size_override
        if window_size > 0 and reranked_docs:
            if event_callback:
                event_callback("context_expand_started", {"count": len(reranked_docs)})
            print(f"\n--- Expanding context for {len(reranked_docs)} top documents (window size: {window_size})... ---")
            # chunk_id -> chunk; de-duplicates neighbours shared by several seeds
            expanded_chunks = {}
            with concurrent.futures.ThreadPoolExecutor() as executor:
                # One neighbour lookup per seed chunk, run in worker threads
                future_to_chunk = {executor.submit(self._get_surrounding_chunks_lancedb, chunk, window_size): chunk for chunk in reranked_docs}
                for future in concurrent.futures.as_completed(future_to_chunk):
                    try:
                        seed_chunk = future_to_chunk[future]
                        surrounding_chunks = future.result()
                        for surrounding_chunk in surrounding_chunks:
                            cid = surrounding_chunk['chunk_id']
                            if cid not in expanded_chunks:
                                # If this is the *central* chunk we already reranked, carry over its score
                                if cid == seed_chunk.get('chunk_id') and 'rerank_score' in seed_chunk:
                                    surrounding_chunk['rerank_score'] = seed_chunk['rerank_score']
                                expanded_chunks[cid] = surrounding_chunk
                    except Exception as e:
                        print(f"Error expanding context for a chunk: {e}")

            final_docs = list(expanded_chunks.values())
            # Sort by reranker score if present, otherwise by raw score/distance
            if any('rerank_score' in d for d in final_docs):
                final_docs.sort(key=lambda c: c.get('rerank_score', -1), reverse=True)
            elif any('_distance' in d for d in final_docs):
                # For vector search smaller distance is better
                final_docs.sort(key=lambda c: c.get('_distance', 1e9))
            elif any('score' in d for d in final_docs):
                final_docs.sort(key=lambda c: c.get('score', 0), reverse=True)
            else:
                # Fallback to document order
                final_docs.sort(key=lambda c: (c.get('document_id', ''), c.get('chunk_index', 0)))

            print(f"Expanded to {len(final_docs)} unique chunks for synthesis.")
            if event_callback:
                event_callback("context_expand_done", {"count": len(final_docs)})
        else:
            final_docs = reranked_docs

        # Optionally hide non-reranked chunks: if any chunk carries a
        # `rerank_score`, we assume the caller wants to focus on those.
        # Neighbour chunks added by the expansion step that carry no
        # `rerank_score` are dropped here.
        if any('rerank_score' in d for d in final_docs):
            final_docs = [d for d in final_docs if 'rerank_score' in d]

        # ------------------------------------------------------------------
        # Sentence-level pruning (Provence)
        # ------------------------------------------------------------------
        prov_cfg = self.config.get("provence", {})
        if prov_cfg.get("enabled"):
            if event_callback:
                event_callback("prune_started", {"count": len(final_docs)})
            thresh = float(prov_cfg.get("threshold", 0.1))
            print(f"\n--- Provence pruning enabled (threshold={thresh}) ---")
            pruner = self._get_sentence_pruner()
            final_docs = pruner.prune_documents(query, final_docs, threshold=thresh)
            # Remove any chunks that were fully pruned (empty text)
            final_docs = [d for d in final_docs if d.get('text', '').strip()]
            if event_callback:
                event_callback("prune_done", {"count": len(final_docs)})

        print("\n--- Final Documents for Synthesis ---")
        if not final_docs:
            print("No documents to synthesize.")
        else:
            for i, doc in enumerate(final_docs):
                print(f"  [{i+1}] Chunk ID: {doc.get('chunk_id')}")
                print(f"      Score: {doc.get('score', 'N/A')}")
                if 'rerank_score' in doc:
                    print(f"      Rerank Score: {doc.get('rerank_score'):.4f}")
                print(f"      Text: \"{doc.get('text', '').strip()}\"")
        print("------------------------------------")

        # Nothing to ground an answer on: return the fixed reply without calling the LLM
        if not final_docs:
            return {"answer": "I could not find an answer in the documents.", "source_documents": []}

        # --- Sanitize docs for JSON serialization (no NaN/Inf types) ---
        def _clean_val(v):
            # Map NaN/Inf to None and numpy floats to plain Python floats
            if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
                return None
            if isinstance(v, (np.floating,)):
                try:
                    f = float(v)
                    if math.isnan(f) or math.isinf(f):
                        return None
                    return f
                except Exception:
                    return None
            return v

        for doc in final_docs:
            # Remove heavy or internal-only fields before serialising
            doc.pop("vector", None)
            doc.pop("_distance", None)
            # Clean numeric fields
            # ('_distance' was popped just above, so only 'score' and
            # 'rerank_score' can actually match here)
            for key in ['score', '_distance', 'rerank_score']:
                if key in doc:
                    doc[key] = _clean_val(doc[key])

        # Chunk texts are joined with a blank line between them, in final order
        context = "\n\n".join([doc['text'] for doc in final_docs])

        # 👀 DEBUG: Show the exact context passed to the LLM after pruning
        print("\n=== Context passed to LLM (post-pruning) ===")
        if len(context) > 2000:
            print(context[:2000] + "…\n[truncated] (total {} chars)".format(len(context)))
        else:
            print(context)
        print("=== End of context ===\n")

        final_answer = self._synthesize_final_answer(query, context, event_callback=event_callback)

        return {"answer": final_answer, "source_documents": final_docs}

    # ------------------------------------------------------------------
    # Public utility
    # ------------------------------------------------------------------
    def list_document_titles(self, max_items: int = 25) -> List[str]:
        """Return up to *max_items* distinct document titles (or IDs).

        This is used only for prompt-routing, so we favour robustness over
        perfect recall. If anything goes wrong we return an empty list so
        the caller can degrade gracefully.
        """
        try:
            tbl_name = self.storage_config.get("text_table_name")
            if not tbl_name:
                return []

            tbl = self._get_db_manager().get_table(tbl_name)

            field_name = "document_title" if "document_title" in tbl.schema.names else "document_id"

            # Use a cheap SQL filter to grab distinct values; fall back to a
            # simple scan if the driver lacks DISTINCT support.
            try:
                sql = f"SELECT DISTINCT {field_name} FROM tbl LIMIT {max_items}"
                rows = tbl.search().where("true").sql(sql).to_list()  # type: ignore
                titles = [r[field_name] for r in rows if r.get(field_name)]
            except Exception:
                # Fallback: scan first N rows
                rows = tbl.search().select(field_name).limit(max_items * 4).to_list()
                seen = set()
                titles = []
                for r in rows:
                    val = r.get(field_name)
                    if val and val not in seen:
                        titles.append(val)
                        seen.add(val)
                        if len(titles) >= max_items:
                            break

            # Ensure we don't exceed max_items
            return titles[:max_items]
        except Exception:
            # Any issues (missing table, bad schema, etc.) –> just return []
            return []

    # -------------------- Public helper properties --------------------
    @property
    def retriever(self):
        """Lazily exposes the main (dense) retriever so external components
        like the ReAct agent tools can call `.retrieve()` directly without
        reaching into private helpers. If the retriever has not yet been
        instantiated, it is created on first access via `_get_dense_retriever`."""
        return self._get_dense_retriever()

    def update_embedding_model(self, model_name: str):
        """Switch embedding model at runtime and clear cached objects so they re-initialize."""
        if self.config.get("embedding_model_name") == model_name:
            return  # nothing to do
        print(f"🔧 RetrievalPipeline switching embedding model to '{model_name}' (was '{self.config.get('embedding_model_name')}')")
        self.config["embedding_model_name"] = model_name
        # Reset caches so new instances are built on demand
        self.text_embedder = None
        self.dense_retriever = None

================================================
FILE: rag_system/requirements.txt
================================================
colpali-engine
PyMuPDF
Pillow
transformers==4.51.0
torch==2.4.1
torchvision==0.19.1
lancedb
rank_bm25
fuzzywuzzy
python-Levenshtein
torchaudio
transformers
sentencepiece
accelerate
docling
ocrmac
ibm-watsonx-ai>=1.3.39


================================================
FILE: rag_system/rerankers/__init__.py
================================================


================================================
FILE: rag_system/rerankers/reranker.py
================================================
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from typing import List, Dict, Any

class QwenReranker:
    """
    Cross-encoder reranker backed by a local Hugging Face
    sequence-classification model.
    """
    def __init__(self, model_name: str = "BAAI/bge-reranker-base"):
        """Load tokenizer and model for ``model_name`` on the best available device."""
        # Device preference: CUDA, then Apple MPS, then CPU.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            self.device = "cuda"
        elif mps_backend and torch.backends.mps.is_available():
            self.device = "mps"
        else:
            self.device = "cpu"
        print(f"Initializing BGE Reranker with model '{model_name}' on device '{self.device}'.")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Half precision on accelerators; library default dtype on CPU.
        dtype = None if self.device == "cpu" else torch.float16
        loaded = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            torch_dtype=dtype,
        )
        self.model = loaded.to(self.device).eval()

        print("BGE Reranker loaded successfully.")

    def _format_instruction(self, query: str, doc: str):
        """Build the instruction-style prompt for a (query, document) pair."""
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        parts = [f"<Instruct>: {instruction}", f"<Query>: {query}", f"<Document>: {doc}"]
        return "\n".join(parts)

    def rerank(self, query: str, documents: List[Dict[str, Any]], top_k: int = 5, *, early_exit: bool = True, margin: float = 0.4, min_scored: int = 8, batch_size: int = 8) -> List[Dict[str, Any]]:
        """
        Score documents against the query and return the best ``top_k``.

        Documents are first ordered by their upstream ``score`` (missing
        scores count as 0.0) and scored in mini-batches of ``batch_size``.
        With *early_exit* enabled, scoring stops after a batch once at least
        *min_scored* documents have been scored and the spread between the
        best and worst score so far is at least *margin*; documents not yet
        scored at that point are left out of the result.

        Returns copies of the selected documents with an added
        ``rerank_score``, sorted from highest to lowest score.
        """
        if not documents:
            return []

        # Strongest upstream candidates go first so an early exit keeps them.
        candidates = sorted(documents, key=lambda d: d.get('score', 0.0), reverse=True)

        scored: List[tuple[float, Dict[str, Any]]] = []

        with torch.no_grad():
            for offset in range(0, len(candidates), batch_size):
                batch = candidates[offset : offset + batch_size]
                pairs = [[query, item['text']] for item in batch]

                encoded = self.tokenizer(
                    pairs,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512,
                ).to(self.device)

                raw_logits = self.model(**encoded).logits.view(-1)
                scored.extend(zip(raw_logits.float().cpu().tolist(), batch))

                # Early-exit check over everything scored so far.
                if early_exit and len(scored) >= min_scored:
                    seen_scores = [entry[0] for entry in scored]
                    if max(seen_scores) - min(seen_scores) >= margin:
                        break

        # Highest score first; ties keep their scoring order (stable sort).
        scored.sort(key=lambda entry: entry[0], reverse=True)
        return [{**doc, 'rerank_score': score} for score, doc in scored[:top_k]]

if __name__ == '__main__':
    # Manual smoke test: loads the default model (downloading it if needed,
    # which requires an internet connection) and reranks three sample docs.
    try:
        demo_reranker = QwenReranker(model_name="BAAI/bge-reranker-base")

        demo_query = "What is the capital of France?"
        sample_texts = (
            ("a", "Paris is the capital of France."),
            ("b", "The Eiffel Tower is in Paris."),
            ("c", "France is a country in Europe."),
        )
        demo_documents = [
            {'text': text, 'metadata': {'doc_id': doc_id}}
            for doc_id, text in sample_texts
        ]

        ranked = demo_reranker.rerank(demo_query, demo_documents)

        print("\n--- Verification ---")
        print(f"Query: {demo_query}")
        print("Reranked documents:")
        for item in ranked:
            print(f"  - Score: {item['rerank_score']:.4f}, Text: {item['text']}")

    except Exception as e:
        print(f"\nAn error occurred during the QwenReranker test: {e}")
        print("Please ensure you have an internet connection for model downloads.")


================================================
FILE: rag_system/rerankers/sentence_pruner.py
================================================
from __future__ import annotations

"""Sentence-level context pruning using the Provence model (ICLR 2025).

This lightweight helper wraps the HuggingFace model hosted at
`naver/provence-reranker-debertav3-v1` and exposes a thread-safe
`prune_documents()` method that converts a list of RAG chunks into their
pruned variants.

The module fails gracefully – if the model weights cannot be downloaded
(or the `transformers` / `nltk` deps are missing) we simply return the
original documents unchanged so the upstream pipeline continues
unaffected.
"""

from threading import Lock
from typing import List, Dict, Any


class SentencePruner:
    """Lightweight singleton wrapper around the Provence model.

    The loaded model is stored on the class (``_model``) and shared by every
    instance. Consequently the ``model_name`` of the *first* instance that
    loads successfully wins; later instances reuse that model regardless of
    the name they were given.
    """

    _model = None  # shared across all instances
    _init_lock: Lock = Lock()

    def __init__(self, model_name: str = "naver/provence-reranker-debertav3-v1") -> None:
        """Remember *model_name* and try to load the shared model if needed."""
        self.model_name = model_name
        self._ensure_model()

    # ---------------------------------------------------------------------
    # Internal helpers
    # ---------------------------------------------------------------------
    def _ensure_model(self) -> None:
        """Load the Provence model into the class-level slot if it is empty.

        The slot is checked once without the lock and once more while holding
        ``_init_lock`` so that concurrent callers do not load the model twice.
        A failed load leaves the slot as ``None``; the next instance created
        will try again.
        """
        if SentencePruner._model is not None:
            return

        with SentencePruner._init_lock:
            if SentencePruner._model is not None:
                return  # another thread beat us
            try:
                from transformers import AutoModel  # local import to keep base deps light

                print("🔧 Loading Provence sentence-pruning model …")
                SentencePruner._model = AutoModel.from_pretrained(
                    self.model_name,
                    trust_remote_code=True,
                )
                print("✅ Provence model loaded successfully.")
            except Exception as e:
                # Any failure leaves the singleton as None so callers can skip pruning.
                print(f"❌ Failed to load Provence model: {e}. Context pruning will be skipped.")
                SentencePruner._model = None

    @staticmethod
    def _with_pruned_text(doc: Dict[str, Any], out: Any) -> Dict[str, Any]:
        """Return a shallow copy of *doc* whose ``text`` comes from model output *out*.

        *out* is expected to be a dict carrying ``pruned_context``; anything
        else (or a dict without that key) keeps the original text. A non-string
        ``pruned_context`` is treated as a sequence of sentences and joined
        with single spaces.
        """
        original_text = doc.get("text", "")
        raw = out.get("pruned_context", original_text) if isinstance(out, dict) else original_text
        new_text = raw if isinstance(raw, str) else " ".join(raw)
        return {**doc, "text": new_text}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def prune_documents(
        self,
        question: str,
        docs: List[Dict[str, Any]],
        *,
        threshold: float = 0.1,
    ) -> List[Dict[str, Any]]:
        """Return *docs* with their `text` field pruned sentence-wise.

        Parameters
        ----------
        question : str
            The query the chunks are pruned against.
        docs : list[dict]
            Chunks to prune; only the ``text`` key is read. Input dicts are
            never mutated – pruned results are shallow copies.
        threshold : float
            Forwarded unchanged to the model's ``process`` call.

        Returns
        -------
        list[dict]
            One entry per input document, in the same order. If the model
            could not be initialised the input list itself is returned.
            If the batched call fails, each document is retried on its own
            and any document that still fails is passed through unchanged.
        """
        model = SentencePruner._model
        if model is None:
            return docs  # model unavailable – no-op

        if not docs:
            # Nothing to prune: avoid sending an empty batch to the model.
            return []

        # Batch texts for efficiency when >1 doc
        texts = [d.get("text", "") for d in docs]

        try:
            if len(texts) == 1:
                # Single-context call returns one dict
                outputs = [model.process(question, texts[0], threshold=threshold)]
            else:
                # Batch call expects list[list[str]] with same outer length as questions list (1)
                batched_out = model.process(question, [texts], threshold=threshold)
                # Unwrap the per-question level when the model returns a list
                outputs = batched_out[0] if isinstance(batched_out, list) else batched_out
                if isinstance(outputs, dict):
                    outputs = [outputs]
                if len(outputs) != len(texts):
                    print("⚠️ Provence batch size mismatch; falling back to per-doc loop")
                    raise ValueError(
                        f"expected {len(texts)} outputs, got {len(outputs)}"
                    )

            pruned: List[Dict[str, Any]] = [
                self._with_pruned_text(doc, out) for doc, out in zip(docs, outputs)
            ]
        except Exception as e:
            print(f"⚠️ Provence batch pruning failed ({e}); falling back to individual calls")
            pruned = []
            for doc in docs:
                text = doc.get("text", "")
                if not text:
                    # Empty chunks have nothing to prune; skip the model call.
                    pruned.append(doc)
                    continue
                try:
                    res = model.process(question, text, threshold=threshold)
                    pruned.append(self._with_pruned_text(doc, res))
                except Exception as err:
                    print(f"⚠️ Provence pruning failed for chunk {doc.get('chunk_id')}: {err}")
                    pruned.append(doc)

        return pruned

================================================
FILE: rag_system/retrieval/__init__.py
================================================


================================================
FILE: rag_system/retrieval/query_transformer.py
================================================
from typing import List, Any, Dict
import json
from rag_system.utils.ollama_client import OllamaClient

class QueryDecomposer:
    def __init__(self, llm_client: OllamaClient, llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model

    def decompose(self, query: str, chat_history: List[Dict[str, Any]] | None = None) -> List[str]:
        """Decompose *query* into standalone sub-queries.

        Parameters
        ----------
        query : str
            The latest user message.
        chat_history : list[dict] | None
            Recent conversation turns (each item should contain at least the original
            user query under the key ``"query"``). Only the **last 5** turns are
            included to keep the prompt short.
        """

        # ---- Limit history to last 5 user turns and extract the queries ----
        history_snippets: List[str] = []
        if chat_history:
            # Keep only the last 5 turns
            recent_turns = chat_history[-5:]
            # Extract user queries (fallback: full dict as string if key missing)
            for turn in recent_turns:
                history_snippets.append(str(turn.get("query", turn)))

        # Serialize chat_history for the prompt (single string)
        chat_history_text = " | ".join(history_snippets)

        # ---- Build the new SYSTEM prompt with added legacy examples ----
        system_prompt = """
You are an expert at query decomposition for a Retrieval-Augmented Generation (RAG) system.

Return one RFC-8259-compliant JSON object and nothing else.
Schema:
{
“requires_decomposition”: <bool>,
“reasoning”:              <string>,  // ≤ 50 words
“resolved_query”:         <string>,  // query after context resolution
“sub_queries”:            <string[]> // 1–10 standalone items
}

Think step-by-step internally, but reveal only the concise reasoning.

⸻

Context Resolution  (perform FIRST)

You will receive:
	•	query – the current user message
	•	chat_history – the most recent user turns (may be empty)

If query contains pronouns, ellipsis, or shorthand that can be unambiguously linked to something in chat_history, rewrite it to a fully self-contained question and place the result in resolved_query.
Otherwise, copy query into resolved_query unchanged.

⸻

When is decomposition REQUIRED?
	•	MULTI-PART questions joined by “and”, “or”, “also”, list commas, etc.
	•	COMPARATIVE / SUPERLATIVE questions (two or more entities, e.g. “bigger, better, fastest”).
	•	TEMPORAL / SEQUENTIAL questions (changes over time, event timelines).
	•	ENUMERATIONS (pros, cons, impacts).
	•	ENTITY-SET COMPARISONS (A, B, C revenue…).

When is decomposition NOT REQUIRED?
	•	A single, factual information need.
	•	Ambiguous queries needing clarification rather than splitting.

⸻

Output rules
	1.	Use resolved_query—not the raw query—to decide on decomposition.
	2.	If requires_decomposition is false, sub_queries must contain exactly resolved_query.
	3.	Otherwise, produce 2–10 self-contained questions; avoid pronouns and shared context.

⸻
"""

        # ---- Append NEW examples provided by the user ----
        new_examples = """

Normalise pronouns and references: turn “this paper” into the explicit title if it can be inferred, otherwise leave as-is.
chat_history: “What is the email address of the computer vision consultants?”
query: “What is their revenue?”

{
  "requires_decomposition": false,
  "reasoning": "Pronoun resolved; single information need.",
  "resolved_query": "What is the revenue of the computer vision consultants?",
  "sub_queries": [
    "What is the revenue of the computer vision consultants?"
  ]
}

Context resolution (single info need)
chat_history: “What is the email address of the computer vision consultants?”
query: “What is the address?”

{
  "requires_decomposition": false,
  "reasoning": "Pronoun resolved; single information need.",
  "resolved_query": "What is the physical address of the computer vision consultants?",
  "sub_queries": [
    "What is the physical address of the computer vision consultants?"
  ]
}

Context resolution (single info need)
chat_history: “ComputeX has a revenue of 100M?”
query: “Who is the CEO?”

{
  "requires_decomposition": false,
  "reasoning": "entities normalization.",
  "resolved_query": "who is the CEO of ComputeX",
  "sub_queries": [
    "who is the CEO of ComputeX"
  ]
}

No unique antecedent → leave unresolved
chat_history: “Tell me about the paper.”
query: “What is the address?”

{
  "requires_decomposition": false,
  "reasoning": "Ambiguous reference; cannot resolve safely.",
  "resolved_query": "What is the address?",
  "sub_queries": ["What is the address?"]
}

Temporal + Comparative
chat_history: ""
query: “How did Nvidia’s 2024 revenue compare with 2023?”

{
  "requires_decomposition": true,
  "reasoning": "Needs revenue for two separate years before comparison.",
  "resolved_query": "How did Nvidia’s 2024 revenue compare with 2023?",
  "sub_queries": [
    "What was Nvidia’s revenue in 2024?",
    "What was Nvidia’s revenue in 2023?"
  ]
}

Enumeration (pros / cons / cost)
chat_history: ""
query: “List the pros, cons, and estimated implementation cost of adopting a vector database.”

{
  "requires_decomposition": true,
  "reasoning": "Three distinct information needs: pros, cons, cost.",
  "resolved_query": "List the pros, cons, and estimated implementation cost of adopting a vector database.",
  "sub_queries": [
    "What are the pros of adopting a vector database?",
    "What are the cons of adopting a vector database?",
    "What is the estimated implementation cost of adopting a vector database?"
  ]
}

Entity-set comparison (multiple companies)
chat_history: ""
query: “How did Nvidia, AMD, and Intel perform in Q2 2025 in terms of revenue?”

{
  "requires_decomposition": true,
  "reasoning": "Need revenue for each of three entities before comparison.",
  "resolved_query": "How did Nvidia, AMD, and Intel perform in Q2 2025 in terms of revenue?",
  "sub_queries": [
    "What was Nvidia's revenue in Q2 2025?",
    "What was AMD's revenue in Q2 2025?",
    "What was Intel's revenue in Q2 2025?"
  ]
}

Multi-part question (limitations + mitigations)
chat_history: ""
query: “What are the limitations of GPT-4o and what are the recommended mitigations?”

{
  "requires_decomposition": true,
  "reasoning": "Two distinct pieces of information: limitations and mitigations.",
  "resolved_query": "What are the limitations of GPT-4o and what are the recommended mitigations?",
  "sub_queries": [
    "What are the known limitations of GPT-4o?",
    "What are the recommended mitigations for the limitations of GPT-4o?"
  ]
}
"""

        # ---- Append legacy examples that already existed in the old prompt ----
        legacy_examples_header = """
⸻

Additional legacy examples
"""

        legacy_examples_body = """
**Example 1: Multi-Part Query**
Query: "What were the main findings of the aiconfig report and how do they compare to the results from the RAG paper?"
JSON Output:
{
  "reasoning": "The query asks for two distinct pieces of information: the findings from one report and a comparison to another. This requires two separate retrieval steps.",
  "sub_queries": [
    "What were the main findings of the aiconfig report?",
    "How do the findings of the aiconfig report compare to the results from the RAG paper?"
  ]
}

**Example 2: Simple Query**
Query: "Summarize the contributions of the DeepSeek-V3 paper."
JSON Output:
{
  "reasoning": "This is a direct request for a summary of a single document and does not contain multiple parts.",
  "sub_queries": [
    "Summarize the contributions of the DeepSeek-V3 paper."
  ]
}

**Example 3: Comparative Query**
Query: "Did Microsoft or Google make more money last year?"
JSON Output:
{
  "reasoning": "This is a comparative query that requires fetching the profit for each company before a comparison can be made.",
  "sub_queries": [
    "How much profit did Microsoft make last year?",
    "How much profit did Google make last year?"
  ]
}

**Example 4: Comparative Query with different phrasing**
Query: "Who has more siblings, Jamie or Sansa?"
JSON Output:
{
  "reasoning": "This comparative query needs the sibling count for both individuals to be answered.",
  "sub_queries": [
    "How many siblings does Jamie have?",
    "How many siblings does Sansa have?"
  ]
}
"""

        full_prompt = (
            system_prompt
            + new_examples
            # + legacy_examples_header
            # + legacy_examples_body
            + """

⸻

Now process

Input payload:

""" + json.dumps({"query": query, "chat_history": chat_history_text}, indent=2) + """
"""
        )

        # ---- Call the LLM ----
        response = self.llm_client.generate_completion(self.llm_model, full_prompt, format="json")

        response_text = response.get('response', '{}')
        try:
            # Handle potential markdown code blocks in the response
            if response_text.strip().startswith("```json"):
                response_text = response_text.strip()[7:-3].strip()

            data = json.loads(response_text)

            sub_queries = data.get('sub_queries') or [query]
            reasoning = data.get('reasoning', 'No reasoning provided.')

            print(f"Query Decomposition Reasoning: {reasoning}")

            # Fallback: ensure at least the resolved_query if sub_queries empty
            if not sub_queries:
                sub_queries = [data.get('resolved_query', query)]

            # Deduplicate while preserving order
            sub_queries = list(dict.fromkeys(sub_queries))

            # Enforce 10 sub-query limit per new requirements
            return sub_queries[:10]
        except json.JSONDecodeError:
            print(f"Failed to decode JSON from query decomposer: {response_text}")
            return [query]

class HyDEGenerator:
    """Asks the LLM for a hypothetical document that would answer a query."""

    def __init__(self, llm_client: OllamaClient, llm_model: str):
        self.llm_model = llm_model
        self.llm_client = llm_client

    def generate(self, query: str) -> str:
        """Return the LLM's hypothetical answer document for *query*.

        An empty string is returned when the completion dict has no
        ``'response'`` key.
        """
        hyde_prompt = (
            "Generate a short, hypothetical document that answers the following question. "
            "The document should be dense with keywords and concepts related to the query."
            f"\n\nQuery: {query}\n\nHypothetical Document:"
        )
        completion = self.llm_client.generate_completion(self.llm_model, hyde_prompt)
        return completion.get('response', '')

class GraphQueryTranslator:
    """Uses an LLM to turn a natural-language question into a graph query dict."""

    def __init__(self, llm_client: "OllamaClient", llm_model: str):
        self.llm_client = llm_client
        self.llm_model = llm_model

    def _generate_translation_prompt(self, query: str) -> str:
        """Build the instruction prompt that embeds the user's *query*."""
        return f"""
You are an expert query planner. Convert the user's question into a structured JSON query for a knowledge graph.
The JSON should contain a 'start_node' (the known entity in the query) and an 'edge_label' (the relationship being asked about).
The graph has nodes (entities) and directed edges (relationships). For example, (Tim Cook) -[IS_CEO_OF]-> (Apple).
Return ONLY the JSON object.

User Question: "{query}"

JSON Output:
"""

    def translate(self, query: str) -> Dict[str, Any]:
        """Translate *query* into a dict as requested by the prompt.

        Returns an empty dict when the LLM reply is missing, is not valid
        JSON, or decodes to something other than a JSON object – so callers
        can always treat the result as a dict.
        """
        prompt = self._generate_translation_prompt(query)
        response = self.llm_client.generate_completion(self.llm_model, prompt, format="json")
        # ``or`` also covers a present-but-None / empty response value.
        raw = response.get('response') or '{}'
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

================================================
FILE: rag_system/retrieval/retrievers.py
================================================
import lancedb
import pickle
import json
from typing import List, Dict, Any
import numpy as np
import networkx as nx
import os
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import logging
import pandas as pd
import math
import concurrent.futures
from functools import lru_cache

from rag_system.indexing.embedders import LanceDBManager
from rag_system.indexing.representations import QwenEmbedder
from rag_system.indexing.multimodal import LocalVisionModel
from rag_system.utils.logging_utils import log_retrieval_results

# BM25Retriever is no longer needed.
# class BM25Retriever: ...

from fuzzywuzzy import process

class GraphRetriever:
    """Retrieves graph neighbours of entities fuzzily matched from a query."""

    def __init__(self, graph_path: str):
        """Load the knowledge graph stored in GML format at *graph_path*."""
        self.graph = nx.read_gml(graph_path)

    def retrieve(self, query: str, k: int = 5, score_cutoff: int = 80) -> List[Dict[str, Any]]:
        """Return up to *k* pseudo-documents describing entity/neighbour pairs.

        Each whitespace-separated token of *query* is fuzzy-matched against
        the graph's node names; a token contributes an entity only when its
        best match scores at least *score_cutoff*. For every matched entity,
        one document is produced per neighbour.

        Entities are processed in the order they first appear in the query,
        so the documents kept by the ``[:k]`` cut are the same on every run.
        """
        print(f"\n--- Performing Graph Retrieval for query: '{query}' ---")

        query_parts = query.split()
        # dict keys act as an insertion-ordered set (a plain ``set`` would make
        # the result order, and therefore the top-k cut, vary between runs).
        entities: Dict[str, None] = {}
        if query_parts:
            # Pass the node names as a plain list so the fuzzy matcher compares
            # against the names themselves.
            # NOTE(review): graph.nodes() is mapping-like; fuzzywuzzy appears to
            # treat mapping inputs as key -> value choices – confirm.
            node_names = list(self.graph.nodes())
            for part in query_parts:
                match = process.extractOne(part, node_names, score_cutoff=score_cutoff)
                if match and isinstance(match[0], str):
                    entities[match[0]] = None

        retrieved_docs = []
        for entity in entities:
            for neighbor in self.graph.neighbors(entity):
                retrieved_docs.append({
                    'chunk_id': f"graph_{entity}_{neighbor}",
                    'text': f"Entity: {entity}, Neighbor: {neighbor}",
                    'score': 1.0,
                    'metadata': {'source': 'graph'}
                })

        print(f"Retrieved {len(retrieved_docs)} documents from the graph.")
        return retrieved_docs[:k]

# region === MultiVectorRetriever ===
class MultiVectorRetriever:
    """
    Performs hybrid (vector + FTS) retrieval on a LanceDB table.

    Every query runs a full-text search and a vector search, merges the two
    result frames (FTS rows first), de-duplicates them and keeps the first
    ``k`` rows.
    """
    def __init__(self, db_manager: LanceDBManager, text_embedder: QwenEmbedder, vision_model: LocalVisionModel = None, *, fusion_config: Dict[str, Any] | None = None):
        self.db_manager = db_manager
        self.text_embedder = text_embedder
        self.vision_model = vision_model
        self.fusion_config = fusion_config or {"method": "linear", "bm25_weight": 0.5, "vec_weight": 0.5}

        # Lightweight in-memory LRU cache for single-query embeddings (256 entries)
        @lru_cache(maxsize=256)
        def _embed_single(q: str):
            return self.text_embedder.create_embeddings([q])[0]

        self._embed_single = _embed_single

    @staticmethod
    def _finite_or_none(value: Any) -> float | None:
        """Return *value* as a Python float, or ``None`` if missing, NaN or non-numeric.

        Going through ``float()`` means numpy scalars that are not ``float``
        subclasses are handled like plain floats.
        """
        if value is None:
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if math.isnan(number) else number

    def _fuse_scores(self, row) -> float:
        """Compute the ``score`` reported for one result *row*.

        * Only one of ``_distance`` / ``score`` is a column: that raw value is
          returned (``0.0`` when it is missing or NaN).
        * Both are columns (the merged hybrid frame): linear fusion
          ``bm25_weight * bm25 + vec_weight * 1 / (1 + distance)``. A row that
          came from only one search leg has NaN in the other column; that leg
          contributes ``0`` instead of turning the whole score into NaN.
        """
        has_distance = '_distance' in row
        has_bm25 = 'score' in row
        distance = self._finite_or_none(row.get('_distance')) if has_distance else None
        bm25 = self._finite_or_none(row.get('score')) if has_bm25 else None

        # Un-fused fallback: prefer the distance column when it exists.
        raw_score = distance if has_distance else bm25
        if raw_score is None:
            raw_score = 0.0

        if not (has_distance and has_bm25):
            return raw_score

        try:
            w_bm25 = float(self.fusion_config.get('bm25_weight', 0.5))
            w_vec = float(self.fusion_config.get('vec_weight', 0.5))
            # convert distance to similarity; absent vector leg contributes nothing
            vec_sim = 1.0 / (1.0 + distance) if distance is not None else 0.0
            return w_bm25 * (bm25 if bm25 is not None else 0.0) + w_vec * vec_sim
        except (TypeError, ValueError, ZeroDivisionError):
            # Bad weights or a distance of exactly -1: keep the un-fused score.
            return raw_score

    def retrieve(self, text_query: str, table_name: str, k: int, reranker=None) -> List[Dict[str, Any]]:
        """
        Performs a hybrid (FTS + vector) search on a single LanceDB table.

        Args:
            text_query: The user query; used verbatim for FTS and embedded for
                the vector search.
            table_name: Table to search; ``None`` selects ``"default_text_table"``.
            k: Maximum number of results. Roughly half are requested from FTS
                and the rest from the vector search.
            reranker: Only logged; no reranking is performed here.

        Returns:
            A list of result dicts (``chunk_id``, ``text``, ``score``, ``bm25``,
            ``_distance``, ``document_id``, ``chunk_index``, ``metadata``).
            Any exception is reported via ``print`` and yields an empty list.
        """
        print(f"\n--- Performing Retrieval for query: '{text_query}' on table '{table_name}' ---")
        
        try:
            if table_name is None:
                table_name = "default_text_table"
            tbl = self.db_manager.get_table(table_name)
            
            # Create / fetch cached text embedding for the query
            text_query_embedding = self._embed_single(text_query)
            
            logger = logging.getLogger(__name__)

            # Always perform hybrid lexical + vector search
            logger.debug(
                "Running hybrid search on table '%s' (k=%s, have_reranker=%s)",
                table_name,
                k,
                bool(reranker),
            )

            if reranker:
                logger.debug("Hybrid + reranker path not yet implemented with manual fusion; proceeding without extra reranker.")

            # Manual two-leg hybrid: take half from each modality
            fts_k = k // 2
            vec_k = k - fts_k

            # Run FTS and vector search in parallel to cut latency
            def _run_fts():
                # Very short queries often underperform → add fuzzy wildcard
                fts_query = text_query
                if len(text_query.split()) == 1:
                    fts_query = f"{text_query}* OR {text_query}~"
                return (
                     tbl.search(query=fts_query, query_type="fts")
                        .limit(fts_k)
                        .to_df()
                 )

            def _run_vec():
                if vec_k == 0:
                    return None
                return (
                    tbl.search(text_query_embedding)
                       .limit(vec_k * 2)  # fetch extra to allow for dedup
                       .to_df()
                )

            with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
                fts_future = executor.submit(_run_fts)
                vec_future = executor.submit(_run_vec)
                fts_df = fts_future.result()
                vec_df = vec_future.result()

            if vec_df is not None:
                # Columns present in only one frame are NaN-filled for the other's rows.
                combined = pd.concat([fts_df, vec_df])
            else:
                combined = fts_df

            # Remove duplicates preserving first occurrence, then trim to k
            dedup_subset = ["_rowid"] if "_rowid" in combined.columns else (["chunk_id"] if "chunk_id" in combined.columns else None)
            if dedup_subset:
                combined = combined.drop_duplicates(subset=dedup_subset, keep="first")
            combined = combined.head(k)

            results_df = combined
            logger.debug(
                "Hybrid (fts=%s, vec=%s) → %s unique chunks",
                len(fts_df),
                0 if vec_df is None else len(vec_df),
                len(results_df),
            )
            
            retrieved_docs = []
            for _, row in results_df.iterrows():
                metadata = json.loads(row.get('metadata', '{}'))
                # Add top-level fields back into metadata for consistency if they don't exist
                metadata.setdefault('document_id', row.get('document_id'))
                metadata.setdefault('chunk_index', row.get('chunk_index'))

                retrieved_docs.append({
                    'chunk_id': row.get('chunk_id'),
                    'text': metadata.get('original_text', row.get('text')),
                    # NaN-safe fused (or raw) score; see _fuse_scores
                    'score': self._fuse_scores(row),
                    'bm25': row.get('score'),
                    '_distance': row.get('_distance'),
                    'document_id': row.get('document_id'),
                    'chunk_index': row.get('chunk_index'),
                    'metadata': metadata
                })

            logger.debug("Hybrid search returned %s results", len(retrieved_docs))
            log_retrieval_results(retrieved_docs, k)
            print(f"Retrieved {len(retrieved_docs)} documents.")
            return retrieved_docs
        
        except Exception as e:
            print(f"Could not search table '{table_name}': {e}")
            return []
# endregion

if __name__ == '__main__':
    # Running this module directly only prints a status line; no retrieval is performed.
    print("retrievers.py updated for LanceDB FTS Hybrid Search.")


================================================
FILE: rag_system/utils/batch_processor.py
================================================
import time
import logging
from typing import List, Dict, Any, Callable, Optional, Iterator
from contextlib import contextmanager
import gc

# Set up logging: calls logging.basicConfig at INFO level when this module is
# imported, then creates the module-level logger used by every helper below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@contextmanager
def timer(operation_name: str):
    """Context manager that logs how long the wrapped block took.

    The duration is logged at INFO level as
    ``"<operation_name> completed in <seconds>s"`` when the block exits,
    whether it finished normally or raised (the exception still propagates).

    Args:
        operation_name: Label used in the log line.
    """
    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments the way time.time() is.
    start = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start
        logger.info(f"{operation_name} completed in {duration:.2f}s")

class ProgressTracker:
    """Tracks progress and performance metrics for batch operations.

    Counts processed items and errors, and logs a progress line (rate, ETA,
    percentage) at most once every ``report_interval`` seconds.
    """
    
    def __init__(self, total_items: int, operation_name: str = "Processing"):
        """
        Args:
            total_items: Number of items expected in total (may be 0).
            operation_name: Label used as prefix in log lines.
        """
        self.total_items = total_items
        self.operation_name = operation_name
        self.processed_items = 0
        self.errors_encountered = 0
        self.start_time = time.time()
        self.last_report_time = time.time()
        self.report_interval = 10  # Report every 10 seconds
        
    def update(self, items_processed: int, errors: int = 0):
        """Add to the processed/error counters and log progress if it is due.

        Args:
            items_processed: Items handled since the previous call
                (failed items included).
            errors: How many of those items failed.
        """
        self.processed_items += items_processed
        self.errors_encountered += errors
        
        current_time = time.time()
        if current_time - self.last_report_time >= self.report_interval:
            self._report_progress()
            self.last_report_time = current_time
            
    def _report_progress(self):
        """Log current progress; silent when no time has elapsed yet."""
        elapsed = time.time() - self.start_time
        if elapsed <= 0:
            return

        rate = self.processed_items / elapsed
        remaining = self.total_items - self.processed_items
        eta = remaining / rate if rate > 0 else 0

        # A tracker created for zero items must not divide by zero.
        if self.total_items:
            progress_pct = (self.processed_items / self.total_items) * 100
        else:
            progress_pct = 0.0

        logger.info(
            f"{self.operation_name}: {self.processed_items}/{self.total_items} "
            f"({progress_pct:.1f}%) - {rate:.2f} items/sec - "
            f"ETA: {eta/60:.1f}min - Errors: {self.errors_encountered}"
        )
            
    def finish(self):
        """Log final totals: items processed, elapsed time, rate and error count."""
        elapsed = time.time() - self.start_time
        rate = self.processed_items / elapsed if elapsed > 0 else 0
        
        logger.info(
            f"{self.operation_name} completed: {self.processed_items}/{self.total_items} items "
            f"in {elapsed:.2f}s ({rate:.2f} items/sec) - {self.errors_encountered} errors"
        )

class BatchProcessor:
    """Generic batch processor with progress tracking and error handling"""
    
    def __init__(self, batch_size: int = 50, enable_gc: bool = True):
        """
        Args:
            batch_size: Number of items per batch; must be a positive number.
            enable_gc: When true, ``gc.collect()`` runs after every 5th
                successfully processed batch.

        Raises:
            ValueError: If ``batch_size`` is not positive. (Zero would fail
                later inside ``range()`` and a negative size would silently
                process nothing.)
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        self.batch_size = batch_size
        self.enable_gc = enable_gc
        
    def process_in_batches(
        self,
        items: List[Any],
        process_func: Callable,
        operation_name: str = "Processing",
        **kwargs
    ) -> List[Any]:
        """
        Process items in batches with progress tracking
        
        Args:
            items: List of items to process
            process_func: Function to process each batch; called as
                ``process_func(batch, **kwargs)`` and expected to return an
                iterable of results
            operation_name: Name for progress reporting
            **kwargs: Additional arguments passed to process_func
            
        Returns:
            List of results from all batches, in batch order. A batch whose
            ``process_func`` call raises is logged, counted as errors and
            contributes no results; processing continues with the next batch.
        """
        if not items:
            logger.info(f"{operation_name}: No items to process")
            return []
            
        tracker = ProgressTracker(len(items), operation_name)
        results = []
        # Ceiling division; constant for the whole run.
        total_batches = (len(items) + self.batch_size - 1) // self.batch_size
        
        logger.info(f"Starting {operation_name} for {len(items)} items in batches of {self.batch_size}")
        
        with timer(f"{operation_name} (total)"):
            for batch_num, batch in enumerate(self.batch_iterator(items), start=1):
                try:
                    with timer(f"Batch {batch_num}/{total_batches}"):
                        batch_results = process_func(batch, **kwargs)
                        results.extend(batch_results)
                        
                    tracker.update(len(batch))
                    
                except Exception as e:
                    logger.error(f"Error in batch {batch_num}: {e}")
                    tracker.update(len(batch), errors=len(batch))
                    # Continue processing other batches
                    continue
                
                # Optional garbage collection to manage memory
                if self.enable_gc and batch_num % 5 == 0:
                    gc.collect()
                    
        tracker.finish()
        return results
        
    def batch_iterator(self, items: List[Any]) -> Iterator[List[Any]]:
        """Generate batches as an iterator for memory-efficient processing"""
        for i in range(0, len(items), self.batch_size):
            yield items[i:i + self.batch_size]

class StreamingProcessor:
    """Handles items individually, one call to the worker function per item."""

    def __init__(self, enable_gc_interval: int = 100):
        # A falsy interval (0 / None) turns periodic garbage collection off.
        self.enable_gc_interval = enable_gc_interval

    def process_streaming(
        self,
        items: List[Any],
        process_func: Callable,
        operation_name: str = "Streaming Processing",
        **kwargs
    ) -> List[Any]:
        """
        Run ``process_func(item, **kwargs)`` for every item, in order.

        Args:
            items: Items to process
            process_func: Callable applied to each single item
            operation_name: Label used in progress / timing log lines
            **kwargs: Extra keyword arguments forwarded to process_func

        Returns:
            Results of the successful calls, in input order. An item whose
            call raises is logged and counted as an error, and yields no result.
        """
        if not items:
            logger.info(f"{operation_name}: No items to process")
            return []

        collected: List[Any] = []
        progress = ProgressTracker(len(items), operation_name)

        logger.info(f"Starting {operation_name} for {len(items)} items (streaming)")

        with timer(f"{operation_name} (streaming)"):
            for position, item in enumerate(items, start=1):
                try:
                    collected.append(process_func(item, **kwargs))
                    progress.update(1)
                except Exception as e:
                    logger.error(f"Error processing item {position - 1}: {e}")
                    progress.update(1, errors=1)
                else:
                    # Periodic garbage collection, only after a successful item
                    gc_every = self.enable_gc_interval
                    if gc_every and position % gc_every == 0:
                        gc.collect()

        progress.finish()
        return collected

# Utility functions for common batch operations
def batch_chunks_by_document(chunks: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Bucket chunks by ``metadata['document_id']``, preserving input order.

    Chunks without a ``metadata`` dict or without a ``document_id`` entry are
    collected under the key ``'unknown'``.
    """
    grouped = {}
    for item in chunks:
        key = item.get('metadata', {}).get('document_id', 'unknown')
        grouped.setdefault(key, []).append(item)
    return grouped

def estimate_memory_usage(chunks: List[Dict[str, Any]]) -> float:
    """Return a rough size estimate, in MB, for a list of chunks.

    The average ``text`` length of the first (up to) ten chunks is extrapolated
    to the whole list and doubled as an allowance for overhead.
    """
    if not chunks:
        return 0.0

    sample = chunks[:10]
    total_sample_chars = sum(len(entry.get('text', '')) for entry in sample)
    avg_text_length = total_sample_chars / len(sample)

    # characters * chunk count * 2 (overhead factor), then bytes -> MB
    return avg_text_length * len(chunks) * 2 / (1024 * 1024)

if __name__ == '__main__':
    # Manual smoke test: push 100 integers through BatchProcessor in batches of 10.
    def dummy_process_func(batch):
        """Pretend to work for 100 ms, then tag every item of the batch."""
        time.sleep(0.1)  # Simulate processing time
        return [f"processed_{item}" for item in batch]

    results = BatchProcessor(batch_size=10).process_in_batches(
        list(range(100)),
        dummy_process_func,
        "Test Processing",
    )

    print(f"Processed {len(results)} items")

================================================
FILE: rag_system/utils/logging_utils.py
================================================
import logging
from typing import List, Dict
from textwrap import shorten

# Logger shared by the helper functions in this module.
logger = logging.getLogger("rag-system")

# Global log format – only set if user has not configured logging
# The guard inspects handlers attached directly to the "rag-system" logger;
# when there are none, logging.basicConfig() is asked to set up the root logger
# with an INFO threshold and a pipe-separated "time | level | name | message" layout.
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s",
    )


def log_query(query: str, sub_queries: List[str] | None = None) -> None:
    """Emit a nicely-formatted block describing the incoming query and any
    decomposition."""
    border = "=" * 60
    logger.info("\n%s\nUSER QUERY: %s", border, query)
    if sub_queries:
        for i, q in enumerate(sub_queries, 1):
            logger.info("  sub-%d → %s", i, q)
    logger.info("%s", border)


def log_retrieval_results(results: List[Dict], k: int) -> None:
    """Show chunk_id, score and truncated text for the first *k* rows.

    Args:
        results: Retrieved rows; each is read via ``row.get`` for the keys
            ``chunk_id``, ``score`` and ``text``.
        k: Maximum number of rows to print. Values below zero are treated as 0
            and values above ``len(results)`` are capped.

    Rows are padded to the header's column widths (14 for the id, 7 for the
    score) so the table lines up. A missing or ``None`` score is shown as
    0.000 and a missing or ``None`` text as an empty preview.
    """
    if not results:
        logger.info("Retrieval returned 0 documents.")
        return

    # Clamp so a negative k cannot yield a negative count or a "all but last" slice.
    shown = max(0, min(k, len(results)))
    logger.info("Top %d results:", shown)
    header = f"{'chunk_id':<14} {'score':<7} preview"
    logger.info(header)
    logger.info("-" * len(header))
    for row in results[:shown]:
        # shorten() needs a string; tolerate None / non-string text values.
        preview = shorten(str(row.get("text") or ""), width=60, placeholder="…")
        chunk_id = str(row.get("chunk_id"))[:12]
        score = row.get("score")
        if score is None:
            score = 0.0
        # %-14s keeps the id column as wide as the header's 'chunk_id' column.
        logger.info("%-14s %-7.3f %s", chunk_id, score, preview)

================================================
FILE: rag_system/utils/ollama_client.py
================================================
import requests
import json
from typing import List, Dict, Any
import base64
from io import BytesIO
from PIL import Image
import httpx, asyncio

class OllamaClient:
    """
    An enhanced client for Ollama that now handles image data for VLM models.

    Requests are sent to ``{host}/api``. The blocking methods use ``requests``
    and ``generate_completion_async`` uses ``httpx``. The non-streaming
    methods report failures by printing a message and returning an empty
    container (``[]`` or ``{}``) instead of raising.
    """
    def __init__(self, host: str = "http://localhost:11434"):
        """
        Args:
            host: Base URL of the Ollama server. Stored unchanged in
                ``self.host``; no connection check is performed here.
        """
        self.host = host
        # Drop trailing slashes so "http://host:11434/" does not yield "//api/..." URLs.
        self.api_url = f"{host.rstrip('/')}/api"

    def _image_to_base64(self, image: Image.Image) -> str:
        """Converts a Pillow Image to a base64 string (PNG-encoded)."""
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def generate_embedding(self, model: str, text: str) -> List[float]:
        """
        Request an embedding for ``text`` from the ``/embeddings`` endpoint.

        Returns:
            The ``"embedding"`` field of the JSON reply, or ``[]`` when the
            field is absent or the request fails.
        """
        try:
            response = requests.post(
                f"{self.api_url}/embeddings",
                json={"model": model, "prompt": text}
            )
            response.raise_for_status()
            return response.json().get("embedding", [])
        except requests.exceptions.RequestException as e:
            print(f"Error generating embedding: {e}")
            return []

    def generate_completion(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: List[Image.Image] | None = None,
        enable_thinking: bool | None = None,
    ) -> Dict[str, Any]:
        """
        Generates a completion, now with optional support for images.

        Args:
            model: The name of the generation model (e.g., 'llava', 'qwen-vl').
            prompt: The text prompt for the model.
            format: The format for the response, e.g., "json".
            images: A list of Pillow Image objects to send to the VLM.
            enable_thinking: Optional flag to disable chain-of-thought for Qwen models.

        Returns:
            The decoded JSON object found on the last line of the response
            body, or ``{}`` if the request fails or that line is not valid JSON.
        """
        try:
            payload = {
                "model": model,
                "prompt": prompt,
                "stream": False
            }
            if format:
                payload["format"] = format

            if images:
                payload["images"] = [self._image_to_base64(img) for img in images]

            # Optional: disable thinking mode for Qwen3 / DeepSeek models
            if enable_thinking is not None:
                payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking}

            response = requests.post(
                f"{self.api_url}/generate",
                json=payload
            )
            response.raise_for_status()
            # Only the final line is decoded, so a multi-line body resolves to its last object.
            response_lines = response.text.strip().split('\n')
            final_response = json.loads(response_lines[-1])
            return final_response

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            # An empty or malformed body is reported the same way as a transport error.
            print(f"Error generating completion: {e}")
            return {}

    # -------------------------------------------------------------
    # Async variant – uses httpx so the caller can await multiple
    # LLM calls concurrently (triage, verification, etc.).
    # -------------------------------------------------------------
    async def generate_completion_async(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: List[Image.Image] | None = None,
        enable_thinking: bool | None = None,
        timeout: int = 60,
    ) -> Dict[str, Any]:
        """Asynchronous version of generate_completion using httpx.

        Args:
            timeout: Passed to ``httpx.AsyncClient(timeout=...)``.

        Returns:
            The decoded JSON object from the last line of the response body,
            or ``{}`` on an httpx error or an undecodable body.

        Cancellation (``asyncio.CancelledError``) is deliberately NOT caught:
        swallowing it would turn a cancelled task into one that "succeeds"
        with ``{}`` and would defeat callers that cancel or time out the task.
        """

        payload = {"model": model, "prompt": prompt, "stream": False}
        if format:
            payload["format"] = format
        if images:
            payload["images"] = [self._image_to_base64(img) for img in images]

        if enable_thinking is not None:
            payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking}

        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                resp = await client.post(f"{self.api_url}/generate", json=payload)
                resp.raise_for_status()
                return json.loads(resp.text.strip().split("\n")[-1])
        except (httpx.HTTPError, json.JSONDecodeError) as e:
            print(f"Async Ollama completion error: {e}")
            return {}

    # -------------------------------------------------------------
    # Streaming variant – yields token chunks in real time
    # -------------------------------------------------------------
    def stream_completion(
        self,
        model: str,
        prompt: str,
        *,
        images: List[Image.Image] | None = None,
        enable_thinking: bool | None = None,
    ):
        """Generator that yields partial *response* strings as they arrive.

        Unlike the non-streaming methods, HTTP errors are not caught here:
        ``raise_for_status()`` and connection failures propagate to the caller.
        Lines that are empty or not valid JSON are skipped.

        Example:

            for tok in client.stream_completion("qwen2", "Hello"):
                print(tok, end="", flush=True)
        """
        payload: Dict[str, Any] = {"model": model, "prompt": prompt, "stream": True}
        if images:
            payload["images"] = [self._image_to_base64(img) for img in images]
        if enable_thinking is not None:
            payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking}

        with requests.post(f"{self.api_url}/generate", json=payload, stream=True) as resp:
            resp.raise_for_status()
            for raw_line in resp.iter_lines():
                if not raw_line:
                    # Keep-alive newline
                    continue
                try:
                    data = json.loads(raw_line.decode())
                except json.JSONDecodeError:
                    continue
                # The Ollama streaming API sends objects like {"response":"Hi","done":false}
                chunk = data.get("response", "")
                if chunk:
                    yield chunk
                if data.get("done"):
                    break

if __name__ == '__main__':
    # Manual check; it needs a VLM model such as 'llava' or 'qwen-vl' to be pulled.
    print("Ollama client updated for multimodal (VLM) support.")
    try:
        # Send a plain 100x100 black image and ask the model about its colour.
        vlm_response = OllamaClient().generate_completion(
            model="llava",  # Make sure you have run 'ollama pull llava'
            prompt="What color is this image?",
            images=[Image.new('RGB', (100, 100), 'black')],
        )

        if not vlm_response or 'response' not in vlm_response:
            print("\nFailed to get VLM response. Is 'llava' model pulled and running?")
        else:
            print("\n--- VLM Test Response ---")
            print(vlm_response['response'])

    except Exception as e:
        print(f"An error occurred: {e}")

================================================
FILE: rag_system/utils/validate_model_config.py
================================================
#!/usr/bin/env python3
"""
Model Configuration Validation Script
=====================================

This script validates the consolidated model configuration system to ensure:
1. No configuration conflicts exist
2. All model names are consistent across components
3. Models are accessible and properly configured
4. The configuration validation system works correctly

Run this after making configuration changes to catch issues early.
"""

import sys
import os
# Add parent directories to path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from rag_system.main import (
    PIPELINE_CONFIGS, 
    OLLAMA_CONFIG, 
    EXTERNAL_MODELS,
    validate_model_config
)

def print_header(title: str):
    """Write a top-level banner: a blank line, then the title framed by two 60-character '=' rules."""
    rule = "=" * 60
    print(f"\n{rule}", f"🔍 {title}", rule, sep="\n")

def print_section(title: str):
    """Write a sub-section banner: a blank line, then the title framed by two 40-character '─' rules."""
    rule = "─" * 40
    print(f"\n{rule}", f"📋 {title}", rule, sep="\n")

def validate_configuration_consistency():
    """Validate that all configurations are consistent.

    Compares the model names in ``PIPELINE_CONFIGS`` against ``EXTERNAL_MODELS``
    for the embedding, reranker and vision models, printing the values seen.

    Returns:
        A list of human-readable error strings, one per mismatch found
        (empty when everything agrees). Every mismatch is reported in a single
        run, so fixing the configuration does not take several passes.
    """
    print_header("CONFIGURATION CONSISTENCY VALIDATION")

    errors = []

    # 1. Check embedding model consistency
    print_section("Embedding Model Consistency")
    default_embedding = PIPELINE_CONFIGS["default"]["embedding_model_name"]
    external_embedding = EXTERNAL_MODELS["embedding_model"]
    fast_embedding = PIPELINE_CONFIGS["fast"]["embedding_model_name"]

    print(f"Default Config: {default_embedding}")
    print(f"External Models: {external_embedding}")
    print(f"Fast Config: {fast_embedding}")

    # The two comparisons are independent: a default/external mismatch must
    # not hide a default/fast mismatch (or vice versa).
    embedding_errors = []
    if default_embedding != external_embedding:
        embedding_errors.append(f"❌ Embedding model mismatch: default={default_embedding}, external={external_embedding}")
    if default_embedding != fast_embedding:
        embedding_errors.append(f"❌ Embedding model mismatch: default={default_embedding}, fast={fast_embedding}")

    if embedding_errors:
        errors.extend(embedding_errors)
    else:
        print("✅ Embedding models are consistent")

    # 2. Check reranker model consistency
    print_section("Reranker Model Consistency")
    default_reranker = PIPELINE_CONFIGS["default"]["reranker"]["model_name"]
    external_reranker = EXTERNAL_MODELS["reranker_model"]

    print(f"Default Config: {default_reranker}")
    print(f"External Models: {external_reranker}")

    if default_reranker != external_reranker:
        errors.append(f"❌ Reranker model mismatch: default={default_reranker}, external={external_reranker}")
    else:
        print("✅ Reranker models are consistent")

    # 3. Check vision model consistency
    print_section("Vision Model Consistency")
    default_vision = PIPELINE_CONFIGS["default"]["vision_model_name"]
    external_vision = EXTERNAL_MODELS["vision_model"]

    print(f"Default Config: {default_vision}")
    print(f"External Models: {external_vision}")

    if default_vision != external_vision:
        errors.append(f"❌ Vision model mismatch: default={default_vision}, external={external_vision}")
    else:
        print("✅ Vision models are consistent")

    return errors

def print_model_usage_map():
    """Print which model is configured for each role and where it is used.

    Three sections are written: the Ollama models (every ``OLLAMA_CONFIG``
    entry except ``host``), the external models, and a per-component table.
    """
    print_header("MODEL USAGE MAP")

    print_section("🤖 Ollama Models (Local Inference)")
    for key, configured in OLLAMA_CONFIG.items():
        if key == "host":
            # The host URL is connection info, not a model.
            continue
        print(f"  {key.replace('_', ' ').title()}: {configured}")

    print_section("🔗 External Models (HuggingFace/Direct)")
    for key, configured in EXTERNAL_MODELS.items():
        print(f"  {key.replace('_', ' ').title()}: {configured}")

    print_section("📍 Model Usage by Component")
    component_usage = {
        "🔤 Text Embedding": {
            "Model": EXTERNAL_MODELS["embedding_model"],
            "Used In": ["Retrieval Pipeline", "Semantic Cache", "Dense Retrieval", "Late Chunking"],
            "Component": "QwenEmbedder (representations.py)"
        },
        "🧠 Text Generation": {
            "Model": OLLAMA_CONFIG["generation_model"],
            "Used In": ["Agent Loop", "Answer Synthesis", "Query Decomposition", "Verification"],
            "Component": "OllamaClient"
        },
        "🚀 Enrichment/Routing": {
            "Model": OLLAMA_CONFIG["enrichment_model"],
            "Used In": ["Query Routing", "Document Overview Analysis"],
            "Component": "Agent Loop (_route_via_overviews)"
        },
        "🔀 Reranking": {
            "Model": EXTERNAL_MODELS["reranker_model"],
            "Used In": ["Hybrid Search", "Document Reranking", "AI Reranker"],
            "Component": "ColBERT (rerankers-lib) or QwenReranker"
        },
        "👁️ Vision": {
            "Model": EXTERNAL_MODELS["vision_model"],
            "Used In": ["Multimodal Processing", "Image Embeddings"],
            "Component": "Vision Pipeline (when enabled)"
        }
    }

    for label, info in component_usage.items():
        used_in = ', '.join(info['Used In'])
        print(f"\n{label}")
        print(f"  Model: {info['Model']}")
        print(f"  Component: {info['Component']}")
        print(f"  Used In: {used_in}")

def test_validation_function():
    """Test the built-in validation function."""
    print_header("VALIDATION FUNCTION TEST")
    
    try:
        result = validate_model_config()
        if result:
            print("✅ validate_model_config() passed successfully!")
        else:
            print("❌ validate_model_config() returned False")
    except Exception as e:
        print(f"❌ validate_model_config() failed with error: {e}")
        return False
    
    return True

def check_pipeline_configurations():
    """Verify that the 'default' and 'fast' pipeline configs contain their required top-level keys.

    Returns:
        A list with one error string per missing key (empty when complete).
        A pipeline config that is absent altogether is treated as an empty
        dict, so each of its required keys is reported.
    """
    print_header("PIPELINE CONFIGURATION COMPLETENESS")

    expected_keys = {
        "default": ["storage", "retrieval", "embedding_model_name", "reranker"],
        "fast": ["storage", "retrieval", "embedding_model_name"]
    }

    problems = []

    for profile, needed in expected_keys.items():
        print_section(f"{profile.title()} Configuration")
        settings = PIPELINE_CONFIGS.get(profile, {})

        for key in needed:
            if key not in settings:
                error_msg = f"❌ Missing required key '{key}' in {profile} config"
                problems.append(error_msg)
                print(f"  {error_msg}")
                continue
            # Present: show the key together with the type name of its value.
            print(f"  ✅ {key}: {type(settings[key]).__name__}")

    return problems

def main():
    """Run every validation step and return a process exit code (0 = success, 1 = failure)."""
    print("🚀 Starting Model Configuration Validation")
    print(f"Python Path: {sys.path[0]}")

    # Collect the findings of both structural checks, in execution order.
    all_errors = [
        *validate_configuration_consistency(),
        *check_pipeline_configurations(),
    ]

    # Informational output, then the built-in validator.
    print_model_usage_map()
    validation_passed = test_validation_function()

    print_header("VALIDATION SUMMARY")

    if all_errors:
        print("❌ VALIDATION FAILED - Issues Found:")
        for error in all_errors:
            print(f"  {error}")
        return 1

    if not validation_passed:
        print("❌ VALIDATION FAILED - validate_model_config() function failed")
        return 1

    print("✅ ALL VALIDATIONS PASSED!")
    print("\n🎉 Your model configuration is consistent and properly structured!")
    print("\n📋 Summary:")
    print(f"   • Embedding Model: {EXTERNAL_MODELS['embedding_model']}")
    print(f"   • Generation Model: {OLLAMA_CONFIG['generation_model']}")
    print(f"   • Enrichment Model: {OLLAMA_CONFIG['enrichment_model']}")
    print(f"   • Reranker Model: {EXTERNAL_MODELS['reranker_model']}")
    print(f"   • Vision Model: {EXTERNAL_MODELS['vision_model']}")
    return 0

# Script entry point: main() returns 0 or 1 and that value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main()) 

================================================
FILE: rag_system/utils/watsonx_client.py
================================================
import json
from typing import List, Dict, Any, Optional
import base64
from io import BytesIO
from PIL import Image


class WatsonXClient:
    """
    A client for IBM Watson X AI that provides similar interface to OllamaClient
    for seamless integration with the RAG system.

    The ``ibm_watsonx_ai`` SDK is imported lazily in ``__init__`` so that merely
    importing this module does not require the package. All public methods
    report failures by printing a message and returning an empty/err value
    rather than raising.
    """
    def __init__(
        self,
        api_key: str,
        project_id: str,
        url: str = "https://us-south.ml.cloud.ibm.com",
    ):
        """
        Initialize the Watson X client.

        Args:
            api_key: IBM Cloud API key for authentication
            project_id: Watson X project ID
            url: Watson X service URL (default: us-south region)

        Raises:
            ImportError: If the ``ibm-watsonx-ai`` package is not installed.
        """
        self.api_key = api_key
        self.project_id = project_id
        self.url = url

        try:
            from ibm_watsonx_ai import APIClient
            from ibm_watsonx_ai import Credentials
            from ibm_watsonx_ai.foundation_models import ModelInference
            from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
        except ImportError as exc:
            # Chain the original error so the failing submodule stays visible.
            raise ImportError(
                "ibm-watsonx-ai package is required. "
                "Install it with: pip install ibm-watsonx-ai"
            ) from exc

        # Keep the SDK classes on the instance; other methods use them without re-importing.
        self._APIClient = APIClient
        self._Credentials = Credentials
        self._ModelInference = ModelInference
        self._TextGenParameters = TextGenParameters

        self.credentials = self._Credentials(
            api_key=self.api_key,
            url=self.url
        )

        self.client = self._APIClient(self.credentials)
        self.client.set.default_project(self.project_id)

    def _image_to_base64(self, image: Image.Image) -> str:
        """Converts a Pillow Image to a base64 string."""
        buffered = BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    def _build_parameters(self, options: Dict[str, Any]):
        """Translate caller keyword options into a ``TextGenParameters`` object.

        ``max_tokens`` is mapped to ``max_new_tokens``; ``temperature``,
        ``top_p`` and ``top_k`` keep their names. Returns ``None`` when no
        recognised option was supplied.
        """
        gen_params: Dict[str, Any] = {}

        if options.get('max_tokens'):
            gen_params['max_new_tokens'] = options['max_tokens']
        # A temperature of 0 is a deliberate request, so only None / absent
        # counts as "not provided" (a truthiness test would drop it).
        if options.get('temperature') is not None:
            gen_params['temperature'] = options['temperature']
        if options.get('top_p'):
            gen_params['top_p'] = options['top_p']
        if options.get('top_k'):
            gen_params['top_k'] = options['top_k']

        return self._TextGenParameters(**gen_params) if gen_params else None

    @staticmethod
    def _extract_generated_text(result: Any) -> str:
        """Pull the generated text out of a ``generate()`` result.

        Dict results are read as ``result['results'][0]['generated_text']``
        (with empty-string fallback for missing keys); anything else is
        converted with ``str()``.
        """
        if isinstance(result, dict):
            return result.get('results', [{}])[0].get('generated_text', '')
        return str(result)

    def generate_embedding(self, model: str, text: str) -> List[float]:
        """
        Generate embeddings using Watson X embedding models.
        Note: This requires using Watson X embedding models through the embeddings API.

        Returns:
            The embedding list, or ``[]`` if the SDK returns a non-list value
            or any error occurs.
        """
        try:
            from ibm_watsonx_ai.foundation_models import Embeddings

            embedding_model = Embeddings(
                model_id=model,
                credentials=self.credentials,
                project_id=self.project_id
            )

            result = embedding_model.embed_query(text)
            return result if isinstance(result, list) else []

        except Exception as e:
            print(f"Error generating embedding: {e}")
            return []

    def generate_completion(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: Optional[List[Image.Image]] = None,
        enable_thinking: Optional[bool] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Generates a completion using Watson X foundation models.

        Args:
            model: The name/ID of the Watson X model (e.g., 'ibm/granite-13b-chat-v2')
            prompt: The text prompt for the model
            format: The format for the response (e.g., "json"); accepted for
                interface compatibility and not used by this method
            images: List of Pillow Image objects (for multimodal models);
                currently only triggers a warning, the images are not sent
            enable_thinking: Optional flag (not used in Watson X, kept for compatibility)
            **kwargs: Additional parameters for text generation
                (``max_tokens``, ``temperature``, ``top_p``, ``top_k``)

        Returns:
            Dictionary with response in Ollama-compatible format:
            ``{'response', 'model', 'done'}`` on success, or
            ``{'response': '', 'error': <message>}`` on any failure.
        """
        try:
            model_inference = self._ModelInference(
                model_id=model,
                credentials=self.credentials,
                project_id=self.project_id,
                params=self._build_parameters(kwargs)
            )

            if images:
                # The prompt-only call below is used either way; images are not forwarded.
                print("Warning: Image support in Watson X may vary by model")
            result = model_inference.generate(prompt=prompt)

            return {
                'response': self._extract_generated_text(result),
                'model': model,
                'done': True
            }

        except Exception as e:
            print(f"Error generating completion: {e}")
            return {'response': '', 'error': str(e)}

    async def generate_completion_async(
        self,
        model: str,
        prompt: str,
        *,
        format: str = "",
        images: Optional[List[Image.Image]] = None,
        enable_thinking: Optional[bool] = None,
        timeout: int = 60,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Asynchronous version of generate_completion.

        Note: IBM Watson X SDK may not have native async support,
        so this is a wrapper around the sync version, executed in the event
        loop's default executor. ``timeout`` is accepted for signature
        compatibility with OllamaClient but is not applied here.
        """
        import asyncio

        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            lambda: self.generate_completion(
                model, prompt, format=format, images=images,
                enable_thinking=enable_thinking, **kwargs
            )
        )

    def stream_completion(
        self,
        model: str,
        prompt: str,
        *,
        images: Optional[List[Image.Image]] = None,
        enable_thinking: Optional[bool] = None,
        **kwargs
    ):
        """
        Generator that yields partial response strings as they arrive.

        Generation options in ``kwargs`` are translated exactly as in
        ``generate_completion``. If the inference object has no
        ``generate_text_stream`` (AttributeError), the full text from a
        non-streaming call is yielded as a single chunk. Any other error is
        printed and a single empty string is yielded.

        Note: Watson X streaming support depends on the SDK version and model.
        """
        try:
            model_inference = self._ModelInference(
                model_id=model,
                credentials=self.credentials,
                project_id=self.project_id,
                params=self._build_parameters(kwargs)
            )

            try:
                for chunk in model_inference.generate_text_stream(prompt=prompt):
                    if chunk:
                        yield chunk
            except AttributeError:
                result = model_inference.generate(prompt=prompt)
                yield self._extract_generated_text(result)

        except Exception as e:
            print(f"Error in stream_completion: {e}")
            yield ""


# Running this module directly only prints setup requirements and a usage
# example; it does not create a client or contact the Watson X service.
if __name__ == '__main__':
    print("Watson X Client for IBM watsonx.ai integration")
    print("This client provides Ollama-compatible interface for Watson X granite models")
    print("\nTo use this client, you need:")
    print("1. IBM Cloud API key")
    print("2. Watson X project ID")
    print("3. ibm-watsonx-ai package installed")
    print("\nExample usage:")
    print("""
    from rag_system.utils.watsonx_client import WatsonXClient
    
    client = WatsonXClient(
        api_key="your-api-key",
        project_id="your-project-id"
    )
    
    response = client.generate_completion(
        model="ibm/granite-13b-chat-v2",
        prompt="What is AI?"
    )
    print(response['response'])
    """)


================================================
FILE: requirements-docker.txt
================================================
requests
python-dotenv
PyPDF2
colpali-engine
PyMuPDF
Pillow
transformers==4.51.0
torch==2.4.1
torchvision==0.19.1
lancedb
rank_bm25
fuzzywuzzy
python-Levenshtein
torchaudio
sentencepiece
accelerate
docling
cachetools
numpy
networkx
matplotlib
psutil
httpx
scikit-learn
pandas
sentence_transformers
rerankers
nltk
# Standard library modules (no need to install)
# asyncio, logging, json, os, sys, typing, threading, itertools, math, re
# ocrmac - removed for Docker compatibility (macOS-specific) 


================================================
FILE: requirements.txt
================================================
requests
python-dotenv
PyPDF2
colpali-engine
PyMuPDF
Pillow
transformers==4.51.0
torch==2.4.1
torchvision==0.19.1
lancedb
rank_bm25
fuzzywuzzy
python-Levenshtein
torchaudio
sentencepiece
accelerate
docling
cachetools
numpy
networkx
matplotlib
psutil
httpx
scikit-learn
pandas
sentence_transformers
rerankers
nltk


================================================
FILE: run_system.py
================================================
#!/usr/bin/env python3
"""
RAG System Unified Launcher
===========================

A comprehensive launcher that starts all RAG system components:
- Ollama server
- RAG API server (port 8001)
- Backend server (port 8000)  
- Frontend server (port 3000)

Features:
- Single command startup
- Real-time log aggregation
- Process health monitoring
- Graceful shutdown
- Production-ready deployment support

Usage:
    python run_system.py [--mode dev|prod] [--logs-only] [--no-frontend]
"""

import subprocess
import threading
import time
import signal
import sys
import os
import argparse
import json
import requests
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, TextIO
import logging
from dataclasses import dataclass
import psutil

@dataclass
class ServiceConfig:
    """Static description of one service started by the launcher."""
    # Short identifier of the service (e.g. 'ollama', 'rag-api').
    name: str
    # Command line as an argument list (program followed by its arguments).
    command: List[str]
    # TCP port associated with the service.
    port: int
    # Optional working directory — presumably used when spawning the process; confirm at the call site.
    cwd: Optional[str] = None
    # Optional environment variables — presumably for the spawned process; confirm at the call site.
    env: Optional[Dict[str, str]] = None
    # URL path of the health endpoint; defaults to "/health".
    health_check_path: str = "/health"
    # Delay after start-up — unit looks like seconds, TODO confirm against the code that reads it.
    startup_delay: int = 2
    # Whether the service is mandatory; NOTE(review): exact effect depends on the launcher logic outside this block.
    required: bool = True

class ColoredFormatter(logging.Formatter):
    """Formatter that renders ``HH:MM:SS [SERVICE] LEVEL: message`` with ANSI colors."""

    # ANSI color per log level name
    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
    }

    # ANSI color per known service tag
    SERVICE_COLORS = {
        'ollama': '\033[94m',     # Blue
        'rag-api': '\033[95m',    # Magenta
        'backend': '\033[96m',    # Cyan
        'frontend': '\033[93m',   # Yellow
        'system': '\033[92m',     # Green
    }

    # Sequence that switches coloring off again
    RESET = '\033[0m'

    def format(self, record):
        """Build the colored line for ``record``.

        The service tag comes from the record's optional ``service`` attribute
        (default ``'system'``). Unknown services fall back to the level color.
        """
        level = record.levelname
        level_color = self.COLORS.get(level, '')

        service = getattr(record, 'service', 'system')
        service_color = self.SERVICE_COLORS.get(service, level_color)

        clock = datetime.fromtimestamp(record.created).strftime('%H:%M:%S')
        service_tag = f"{service_color}[{service.upper()}]{self.RESET}"
        level_tag = f"{level_color}{level}{self.RESET}"

        return f"{clock} {service_tag} {level_tag}: {record.getMessage()}"

class ServiceManager:
    """Manages multiple system services with logging and health monitoring."""
    
    def __init__(self, mode: str = "dev", logs_dir: str = "logs"):
        self.mode = mode
        self.logs_dir = Path(logs_dir)
        self.logs_dir.mkdir(exist_ok=True)
        
        self.processes: Dict[str, subprocess.Popen] = {}
        self.log_threads: Dict[str, threading.Thread] = {}
        self.running = False
        
        # Setup logging
        self.setup_logging()
        
        # Service configurations
        self.services = self._get_service_configs()
        
        # Register signal handlers for graceful shutdown
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
    
    def setup_logging(self):
        """Setup centralized logging with colors."""
        # Create main logger
        self.logger = logging.getLogger('system')
        self.logger.setLevel(logging.INFO)
        
        # Console handler with colors
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(ColoredFormatter())
        self.logger.addHandler(console_handler)
        
        # File handler for system logs
        file_handler = logging.FileHandler(self.logs_dir / 'system.log')
        file_handler.setFormatter(logging.Formatter(
            '%(asctime)s [%(levelname)s] %(message)s'
        ))
        self.logger.addHandler(file_handler)
    
    def _get_service_configs(self) -> Dict[str, ServiceConfig]:
        """Define service configurations based on mode."""
        base_configs = {
            'ollama': ServiceConfig(
                name='ollama',
                command=['ollama', 'serve'],
                port=11434,
                startup_delay=5,
                required=True
            ),
            'rag-api': ServiceConfig(
                name='rag-api',
                command=[sys.executable, '-m', 'rag_system.api_server'],
                port=8001,
                startup_delay=3,
                required=True
            ),
            'backend': ServiceConfig(
                name='backend',
                command=[sys.executable, 'backend/server.py'],
                port=8000,
                startup_delay=2,
                required=True
            ),
            'frontend': ServiceConfig(
                name='frontend',
                command=['npm', 'run', 'dev' if self.mode == 'dev' else 'start'],
                port=3000,
                startup_delay=5,
                required=False  # Optional in case Node.js not available
            )
        }
        
        # Production mode adjustments
        if self.mode == 'prod':
            # Use production build for frontend
            base_configs['frontend'].command = ['npm', 'run', 'start']
            # Add production environment variables
            base_configs['rag-api'].env = {'NODE_ENV': 'production'}
            base_configs['backend'].env = {'NODE_ENV': 'production'}
        
        return base_configs
    
    def _signal_handler(self, signum, frame):
        """Handle shutdown signals gracefully."""
        self.logger.info(f"Received signal {signum}, shutting down...")
        self.shutdown()
        sys.exit(0)
    
    def is_port_in_use(self, port: int) -> bool:
        """Check if a port is already in use."""
        try:
            for conn in psutil.net_connections():
                if conn.laddr.port == port and conn.status == 'LISTEN':
                    return True
            return False
        except (psutil.AccessDenied, AttributeError):
            # Fallback method
            import socket
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                return s.connect_ex(('localhost', port)) == 0
    
    def check_prerequisites(self) -> bool:
        """Check if all required tools are available."""
        self.logger.info("🔍 Checking prerequisites...")
        
        missing_tools = []
        
        # Check Ollama
        if not self._command_exists('ollama'):
            missing_tools.append('ollama (https://ollama.ai)')
        
        # Check Python
        if not self._command_exists('python') and not self._command_exists('python3'):
            missing_tools.append('python')
        
        # Check Node.js (optional)
        if not self._command_exists('npm'):
            self.logger.warning("⚠️  npm not found - frontend will be disabled")
            self.services['frontend'].required = False
        
        if missing_tools:
            self.logger.error(f"❌ Missing required tools: {', '.join(missing_tools)}")
            return False
        
        self.logger.info("✅ All prerequisites satisfied")
        return True
    
    def _command_exists(self, command: str) -> bool:
        """Check if a command exists in PATH."""
        try:
            subprocess.run([command, '--version'], 
                         capture_output=True, check=True, timeout=5)
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
            return False
    
    def ensure_models(self):
        """Ensure required Ollama models are available."""
        self.logger.info("📥 Checking required models...")
        
        required_models = ['qwen3:8b', 'qwen3:0.6b']
        
        try:
            # Get list of installed models
            result = subprocess.run(['ollama', 'list'], 
                                  capture_output=True, text=True, timeout=10)
            installed_models = result.stdout
            
            for model in required_models:
                if model not in installed_models:
                    self.logger.info(f"📥 Pulling {model}...")
                    subprocess.run(['ollama', 'pull', model], 
                                 check=True, timeout=300)  # 5 min timeout
                    self.logger.info(f"✅ {model} ready")
                else:
                    self.logger.info(f"✅ {model} already available")
                    
        except subprocess.TimeoutExpired:
            self.logger.warning("⚠️  Model check timed out - continuing anyway")
        except subprocess.CalledProcessError as e:
            self.logger.warning(f"⚠️  Could not check/pull models: {e}")
    
    def start_service(self, service_name: str, config: ServiceConfig) -> bool:
        """Start a single service."""
        if service_name in self.processes:
            self.logger.warning(f"⚠️  {service_name} already running")
            return True
        
        # Check if port is in use
        if self.is_port_in_use(config.port):
            self.logger.warning(f"⚠️  Port {config.port} already in use, skipping {service_name}")
            return not config.required
        
        self.logger.info(f"🔄 Starting {service_name} on port {config.port}...")
        
        try:
            # Setup environment
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            
            # Start process
            process = subprocess.Popen(
                config.command,
                cwd=config.cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                universal_newlines=True
            )
            
            self.processes[service_name] = process
            
            # Start log monitoring thread
            log_thread = threading.Thread(
                target=self._monitor_service_logs,
                args=(service_name, process),
                daemon=True
            )
            log_thread.start()
            self.log_threads[service_name] = log_thread
            
            # Wait for startup
            time.sleep(config.startup_delay)
            
            # Check if process is still running
            if process.poll() is None:
                self.logger.info(f"✅ {service_name} started successfully (PID: {process.pid})")
                return True
            else:
                self.logger.error(f"❌ {service_name} failed to start")
                return False
                
        except Exception as e:
            self.logger.error(f"❌ Failed to start {service_name}: {e}")
            return False
    
    def _monitor_service_logs(self, service_name: str, process: subprocess.Popen):
        """Monitor service logs and forward to main logger."""
        service_logger = logging.getLogger(service_name)
        service_logger.setLevel(logging.INFO)
        
        # Add file handler for this service
        file_handler = logging.FileHandler(self.logs_dir / f'{service_name}.log')
        file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
        service_logger.addHandler(file_handler)
        
        try:
            for line in iter(process.stdout.readline, ''):
                if line.strip():
                    # Create log record with service context
                    record = logging.LogRecord(
                        name=service_name,
                        level=logging.INFO,
                        pathname='',
                        lineno=0,
                        msg=line.strip(),
                        args=(),
                        exc_info=None
                    )
                    record.service = service_name
                    
                    # Log to both service file and main console
                    service_logger.handle(record)
                    self.logger.handle(record)
                    
        except Exception as e:
            self.logger.error(f"Error monitoring {service_name} logs: {e}")
    
    def health_check(self, service_name: str, config: ServiceConfig) -> bool:
        """Perform health check on a service."""
        try:
            url = f"http://localhost:{config.port}{config.health_check_path}"
            response = requests.get(url, timeout=5)
            return response.status_code == 200
        except:
            return False
    
    def start_all(self, skip_frontend: bool = False) -> bool:
        """Start all services in order."""
        self.logger.info("🚀 Starting RAG System Components...")
        
        if not self.check_prerequisites():
            return False
        
        self.running = True
        failed_services = []
        
        # Start services in dependency order
        service_order = ['ollama', 'rag-api', 'backend']
        if not skip_frontend and 'frontend' in self.services:
            service_order.append('frontend')
        
        for service_name in service_order:
            if service_name not in self.services:
                continue
                
            config = self.services[service_name]
            
            # Special handling for Ollama
            if service_name == 'ollama':
                if not self._start_ollama():
                    if config.required:
                        failed_services.append(service_name)
                        continue
                    else:
                        self.logger.warning(f"⚠️  Skipping optional service: {service_name}")
                        continue
            else:
                if not self.start_service(service_name, config):
                    if config.required:
                        failed_services.append(service_name)
                    else:
                        self.logger.warning(f"⚠️  Skipping optional service: {service_name}")
        
        if failed_services:
            self.logger.error(f"❌ Failed to start required services: {', '.join(failed_services)}")
            return False
        
        # Print status summary
        self._print_status_summary()
        return True
    
    def _start_ollama(self) -> bool:
        """Special handling for Ollama startup."""
        # Check if Ollama is already running
        if self.is_port_in_use(11434):
            self.logger.info("✅ Ollama already running")
            self.ensure_models()
            return True
        
        # Start Ollama
        if self.start_service('ollama', self.services['ollama']):
            self.ensure_models()
            return True
        
        return False
    
    def _print_status_summary(self):
        """Print system status summary."""
        self.logger.info("")
        self.logger.info("🎉 RAG System Started!")
        self.logger.info("📊 Services Status:")
        
        for service_name, config in self.services.items():
            if service_name in self.processes or self.is_port_in_use(config.port):
                status = "✅ Running"
                url = f"http://localhost:{config.port}"
                self.logger.info(f"   • {service_name.capitalize():<10}: {status:<10} {url}")
            else:
                self.logger.info(f"   • {service_name.capitalize():<10}: ❌ Stopped")
        
        self.logger.info("")
        self.logger.info("🌐 Access your RAG system at: http://localhost:3000")
        self.logger.info("")
        self.logger.info("📋 Useful commands:")
        self.logger.info("   • Stop system:  Ctrl+C")
        self.logger.info("   • Check logs:   tail -f logs/*.log")
        self.logger.info("   • Health check: python run_system.py --health")
    
    def shutdown(self):
        """Gracefully shutdown all services."""
        if not self.running:
            return
        
        self.logger.info("🛑 Shutting down RAG system...")
        self.running = False
        
        # Stop services in reverse order
        for service_name in reversed(list(self.processes.keys())):
            self._stop_service(service_name)
        
        self.logger.info("✅ All services stopped")
    
    def _stop_service(self, service_name: str):
        """Stop a single service."""
        if service_name not in self.processes:
            return
        
        process = self.processes[service_name]
        self.logger.info(f"🔄 Stopping {service_name}...")
        
        try:
            # Try graceful shutdown first
            process.terminate()
            
            # Wait up to 10 seconds for graceful shutdown
            try:
                process.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # Force kill if graceful shutdown fails
                process.kill()
                process.wait()
            
            self.logger.info(f"✅ {service_name} stopped")
            
        except Exception as e:
            self.logger.error(f"❌ Error stopping {service_name}: {e}")
        finally:
            del self.processes[service_name]
    
    def monitor(self):
        """Monitor running services and restart if needed."""
        self.logger.info("👁️  Monitoring services... (Press Ctrl+C to stop)")
        
        try:
            while self.running:
                time.sleep(30)  # Check every 30 seconds
                
                for service_name, process in list(self.processes.items()):
                    if process.poll() is not None:
                        self.logger.warning(f"⚠️  {service_name} has stopped unexpectedly")
                        
                        # Restart the service
                        config = self.services[service_name]
                        if config.required:
                            self.logger.info(f"🔄 Restarting {service_name}...")
                            del self.processes[service_name]
                            self.start_service(service_name, config)
                        
        except KeyboardInterrupt:
            self.logger.info("Monitoring stopped by user")

def main():
    """Command-line entry point: parse flags and dispatch to the ServiceManager.

    Exactly one action runs, checked in this order: --health, --stop,
    --logs-only, otherwise normal startup followed by monitoring. The manager's
    shutdown() is always invoked on the way out. A failed startup exits with
    status 1.
    """
    parser = argparse.ArgumentParser(description='RAG System Unified Launcher')
    cli_options = (
        ('--mode', dict(choices=['dev', 'prod'], default='dev',
                        help='Run mode (default: dev)')),
        ('--logs-only', dict(action='store_true',
                             help='Only show aggregated logs from running services')),
        ('--no-frontend', dict(action='store_true',
                               help='Skip frontend startup')),
        ('--health', dict(action='store_true',
                          help='Check health of running services')),
        ('--stop', dict(action='store_true',
                        help='Stop all running services')),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()

    # Create service manager
    manager = ServiceManager(mode=args.mode)

    try:
        if args.health:
            # Health check mode
            manager._print_status_summary()
        elif args.stop:
            # Stop mode - kill any running processes
            manager.logger.info("🛑 Stopping all RAG system processes...")
            # Implementation for stopping would go here
        elif args.logs_only:
            # Logs only mode - just tail existing logs
            manager.logger.info("📋 Showing aggregated logs... (Press Ctrl+C to stop)")
            manager.monitor()
        elif manager.start_all(skip_frontend=args.no_frontend):
            # Normal startup succeeded: keep watching the services.
            manager.monitor()
        else:
            manager.logger.error("❌ System startup failed")
            sys.exit(1)

    except KeyboardInterrupt:
        manager.logger.info("Received interrupt signal")
    finally:
        manager.shutdown()

# Run the launcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

================================================
FILE: setup_rag_system.sh
================================================
#!/bin/bash
# setup_rag_system.sh - Complete RAG System Setup Script
# This script handles Docker installation, system setup, and initial configuration

# Abort the script as soon as any command exits with a non-zero status.
set -e

# Colors for output (ANSI escape sequences, interpreted by `echo -e` in the
# logging helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Logging helpers: each prints one timestamped, colorized line to stdout.
#   $1 - message text
# printf '%b' expands backslash escapes the same way `echo -e` does.

# Progress message (green, no level tag)
log() {
    printf '%b\n' "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

# Non-fatal problem (yellow)
warn() {
    printf '%b\n' "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}

# Failure message (red); does not exit by itself
error() {
    printf '%b\n' "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}"
}

# Informational detail (blue)
info() {
    printf '%b\n' "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}"
}

# Check if running as root
if [[ $EUID -eq 0 ]]; then
    error "This script should not be run as root (except for package installation steps)"
    exit 1
fi

echo "================================================================"
echo "🚀 RAG System Complete Setup Script"
echo "================================================================"
echo ""

# Step 1: System Requirements Check
log "Step 1: Checking system requirements..."

# Check OS
if [[ "$OSTYPE" == "darwin"* ]]; then
    OS="macos"
    info "Detected macOS"
elif [[ -f /etc/os-release ]]; then
    . /etc/os-release
    OS=$ID
    info "Detected Linux: $OS"
else
    error "Unsupported operating system"
    exit 1
fi

# Check available memory.
# The tool is chosen explicitly: in `free ... | awk ... || fallback` the exit
# status is awk's, so the fallback never ran when `free` was missing (macOS)
# and the empty result was then compared as 0.
MEMORY_GB="unknown"
if command -v free &> /dev/null; then
    MEMORY_GB=$(free -g | awk '/^Mem:/ {print $2}')
elif command -v sysctl &> /dev/null; then
    MEM_BYTES=$(sysctl -n hw.memsize 2>/dev/null || true)
    if [[ "$MEM_BYTES" =~ ^[0-9]+$ ]]; then
        MEMORY_GB=$((MEM_BYTES / 1024 / 1024 / 1024))
    fi
fi
# Anything that is not a plain integer is treated as "could not detect".
[[ "$MEMORY_GB" =~ ^[0-9]+$ ]] || MEMORY_GB="unknown"

if [[ "$MEMORY_GB" == "unknown" ]]; then
    warn "Could not determine system memory. Recommended: 16GB+ for optimal performance"
elif [[ "$MEMORY_GB" -lt 8 ]]; then
    warn "System has ${MEMORY_GB}GB RAM. Recommended: 16GB+ for optimal performance"
else
    info "Memory check passed: ${MEMORY_GB}GB RAM"
fi

# Check available disk space.
# `df -Pk` (POSIX output, 1024-byte blocks) works on both Linux and macOS;
# `df -BG` is GNU-only. Column 4 of the data row is the available space.
DISK_GB=$(df -Pk . 2>/dev/null | awk 'NR==2 {print int($4 / 1024 / 1024)}')
[[ "$DISK_GB" =~ ^[0-9]+$ ]] || DISK_GB="unknown"

if [[ "$DISK_GB" == "unknown" ]]; then
    warn "Could not determine available disk space. Recommended: 50GB+ free space"
elif [[ "$DISK_GB" -lt 50 ]]; then
    warn "Available disk space: ${DISK_GB}GB. Recommended: 50GB+ free space"
else
    info "Disk space check passed: ${DISK_GB}GB available"
fi

# Step 2: Install Dependencies
log "Step 2: Installing system dependencies..."

# Install one package on RHEL-family systems, preferring dnf over yum.
#   $1 - package name
install_rpm_package() {
    if command -v dnf &> /dev/null; then
        sudo dnf install -y "$1"
    else
        sudo yum install -y "$1"
    fi
}

# Git: install only when it is not already on PATH
if command -v git &> /dev/null; then
    info "Git is already installed: $(git --version)"
else
    info "Installing Git..."
    case $OS in
        "macos")
            # Homebrew is the only supported installer on macOS
            if ! command -v brew &> /dev/null; then
                error "Git not found. Please install Git first or install Homebrew"
                exit 1
            fi
            brew install git
            ;;
        "ubuntu"|"debian")
            sudo apt-get update
            sudo apt-get install -y git
            ;;
        "centos"|"rhel"|"fedora")
            install_rpm_package git
            ;;
    esac
fi

# curl: install only when it is not already on PATH
if command -v curl &> /dev/null; then
    info "curl is already installed"
else
    info "Installing curl..."
    case $OS in
        "macos")
            # curl is usually pre-installed on macOS
            ;;
        "ubuntu"|"debian")
            sudo apt-get install -y curl
            ;;
        "centos"|"rhel"|"fedora")
            install_rpm_package curl
            ;;
    esac
fi

# Step 3: Install Docker
# Installs Docker for the detected OS when it is missing, then verifies that
# the CLI, the Compose plugin and the daemon are all usable. After a fresh
# install the script exits so the user can start Docker / re-login first.
log "Step 3: Installing Docker..."

if command -v docker &> /dev/null; then
    info "Docker is already installed: $(docker --version)"
else
    info "Docker not found. Installing Docker..."
    
    case $OS in
        "macos")
            # Check if Homebrew is installed
            if ! command -v brew &> /dev/null; then
                info "Installing Homebrew..."
                /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
            fi
            
            # Install Docker Desktop
            info "Installing Docker Desktop..."
            brew install --cask docker
            
            warn "Docker Desktop installed. Please:"
            warn "1. Start Docker Desktop from Applications"
            warn "2. Wait for Docker to start completely"
            warn "3. Run this script again"
            exit 0
            ;;
            
        "ubuntu"|"debian")
            # Update package index
            sudo apt-get update
            
            # Install dependencies
            sudo apt-get install -y \
                ca-certificates \
                curl \
                gnupg \
                lsb-release
            
            # Add Docker's official GPG key
            sudo mkdir -p /etc/apt/keyrings
            curl -fsSL https://download.docker.com/linux/$OS/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
            
            # Set up repository
            echo \
              "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/$OS \
              $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
            
            # Install Docker Engine
            sudo apt-get update
            sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
            
            # Add user to docker group
            sudo usermod -aG docker "$USER"
            
            # Start Docker service
            sudo systemctl enable docker
            sudo systemctl start docker
            
            info "Docker installed successfully!"
            warn "Please log out and log back in for group changes to take effect, then run this script again"
            warn "Or run: newgrp docker && $0"
            exit 0
            ;;
            
        "centos"|"rhel"|"fedora")
            # Fedora has its own Docker repository; the CentOS repo file does
            # not match Fedora releases. CentOS and RHEL keep the CentOS repo.
            DOCKER_REPO_DISTRO="centos"
            if [[ "$OS" == "fedora" ]]; then
                DOCKER_REPO_DISTRO="fedora"
            fi
            DOCKER_REPO_URL="https://download.docker.com/linux/${DOCKER_REPO_DISTRO}/docker-ce.repo"

            # Install required packages
            if command -v dnf &> /dev/null; then
                sudo dnf install -y yum-utils
                sudo dnf config-manager --add-repo "$DOCKER_REPO_URL"
                sudo dnf install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
            else
                sudo yum install -y yum-utils
                sudo yum-config-manager --add-repo "$DOCKER_REPO_URL"
                sudo yum install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
            fi
            
            # Add user to docker group
            sudo usermod -aG docker "$USER"
            
            # Start Docker service
            sudo systemctl enable docker
            sudo systemctl start docker
            
            info "Docker installed successfully!"
            warn "Please log out and log back in for group changes to take effect, then run this script again"
            exit 0
            ;;

        *)
            # Say why nothing was installed instead of failing later with a
            # generic "Docker is not working" message.
            error "Automatic Docker installation is not supported on '$OS'. Please install Docker manually and run this script again"
            exit 1
            ;;
    esac
fi

# Verify Docker is working
if ! docker --version &> /dev/null; then
    error "Docker is not working properly. Please check Docker installation"
    exit 1
fi

if ! docker compose version &> /dev/null; then
    error "Docker Compose is not working properly. Please check Docker Compose installation"
    exit 1
fi

info "Docker verification passed: $(docker --version)"
info "Docker Compose verification passed: $(docker compose version)"

# Test Docker daemon
if ! docker ps &> /dev/null; then
    error "Cannot connect to Docker daemon. Please ensure Docker is running"
    exit 1
fi

# Step 4: Setup RAG System
log "Step 4: Setting up RAG System..."

# Create project directory structure
info "Creating directory structure..."
mkdir -p {lancedb,shared_uploads,logs,ollama_data}
mkdir -p index_store/{overviews,bm25,graph}
mkdir -p backups

# Set proper permissions
chmod 755 {lancedb,shared_uploads,logs,ollama_data}
chmod 755 index_store/{overviews,bm25,graph}
chmod 755 backups

# Create environment file
if [[ ! -f ".env" ]]; then
    info "Creating environment configuration..."
    cat > .env << 'EOF'
# System Configuration
NODE_ENV=production
LOG_LEVEL=info
DEBUG=false

# Service URLs
FRONTEND_URL=http://localhost:3000
BACKEND_URL=http://localhost:8000
RAG_API_URL=http://localhost:8001
OLLAMA_URL=http://localhost:11434

# Database Configuration
DATABASE_PATH=./backend/chat_data.db
LANCEDB_PATH=./lancedb
UPLOADS_PATH=./shared_uploads
INDEX_STORE_PATH=./index_store

# Model Configuration
DEFAULT_EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
# Default model names - updated to current versions
DEFAULT_GENERATION_MODEL=qwen3:8b
DEFAULT_RERANKER_MODEL=answerdotai/answerai-colbert-small-v1
DEFAULT_ENRICHMENT_MODEL=qwen3:0.6b

# Performance Configuration
MAX_CONCURRENT_REQUESTS=5
REQUEST_TIMEOUT=300
EMBEDDING_BATCH_SIZE=32
MAX_CONTEXT_LENGTH=4096

# Security Configuration
CORS_ORIGINS=http://localhost:3000
API_KEY_REQUIRED=false
RATE_LIMIT_REQUESTS=100
RATE_LIMIT_WINDOW=60

# Storage Configuration
MAX_FILE_SIZE=50MB
MAX_UPLOAD_FILES=10
CLEANUP_INTERVAL=3600
BACKUP_RETENTION_DAYS=30
EOF
    info "Environment file created: .env"
else
    info "Environment file already exists: .env"
fi

# Step 5: Build and Start Services
log "Step 5: Building and starting services..."

info "Building Docker containers (this may take 10-15 minutes)..."
docker compose build --no-cache

info "Starting services..."
docker compose up -d

# Wait for services to start
info "Waiting for services to initialize..."
sleep 30

# Check service status
info "Checking service status..."
docker compose ps

# Step 6: Install AI Models
log "Step 6: Installing AI models..."

# Wait for Ollama to be ready: retry `ollama list` inside the container every
# 10 seconds and give up after max_attempts retries.
info "Waiting for Ollama to be ready..."
max_attempts=30
attempt=0
while ! docker compose exec ollama ollama list &> /dev/null; do
    if [ "$attempt" -ge "$max_attempts" ]; then
        error "Ollama failed to start after $max_attempts attempts"
        exit 1
    fi
    info "Waiting for Ollama... (attempt $((attempt+1))/$max_attempts)"
    sleep 10
    # Plain assignment on purpose: `((attempt++))` evaluates to the old value,
    # so it returns status 1 when attempt is 0 and `set -e` would abort the
    # script on the very first retry.
    attempt=$((attempt + 1))
done

# Download Ollama models
info "Downloading required Ollama models..."
docker compose exec ollama ollama pull qwen3:8b
docker compose exec ollama ollama pull qwen3:0.6b

info "Verifying model installation..."
docker compose exec ollama ollama list

# Step 7: System Verification
log "Step 7: Verifying system installation..."

# A service counts as healthy when any of the probe paths answers with a
# successful HTTP status on its port. Paths are tried in order and probing
# stops at the first success.
info "Checking service health..."
services=("frontend:3000" "backend:8000" "rag-api:8001" "ollama:11434")
probe_paths=("" "/health" "/api/tags" "/models")
for entry in "${services[@]}"; do
    # Entries are "name:port"
    name="${entry%:*}"
    port="${entry#*:}"

    healthy=false
    for probe_path in "${probe_paths[@]}"; do
        if curl -s -f "http://localhost:${port}${probe_path}" &> /dev/null; then
            healthy=true
            break
        fi
    done

    if [[ "$healthy" == true ]]; then
        info "✅ $name service is healthy"
    else
        warn "⚠️ $name service may not be ready yet"
    fi
done

# Step 8: Create Helper Scripts
# Each helper is written with a quoted heredoc ('EOF'), so nothing inside is
# expanded now; the text is stored verbatim and evaluated when the helper runs.
log "Step 8: Creating helper scripts..."

# Create start script
cat > start_rag_system.sh << 'EOF'
#!/bin/bash
# Start RAG System
echo "Starting RAG System..."
docker compose up -d
echo "RAG System started. Access at: http://localhost:3000"
EOF
chmod +x start_rag_system.sh

# Create stop script
cat > stop_rag_system.sh << 'EOF'
#!/bin/bash
# Stop RAG System
echo "Stopping RAG System..."
docker compose down
echo "RAG System stopped."
EOF
chmod +x stop_rag_system.sh

# Create status script.
# Response bodies go to /dev/null so the report shows one OK/FAIL line per
# service instead of dumping HTML/JSON to the terminal.
cat > status_rag_system.sh << 'EOF'
#!/bin/bash
# Check RAG System Status
echo "=== RAG System Status ==="
docker compose ps
echo ""
echo "=== Service Health ==="
curl -s -f -o /dev/null http://localhost:3000 && echo "✅ Frontend: OK" || echo "❌ Frontend: FAIL"
curl -s -f -o /dev/null http://localhost:8000/health && echo "✅ Backend: OK" || echo "❌ Backend: FAIL"
curl -s -f -o /dev/null http://localhost:8001/models && echo "✅ RAG API: OK" || echo "❌ RAG API: FAIL"
curl -s -f -o /dev/null http://localhost:11434/api/tags && echo "✅ Ollama: OK" || echo "❌ Ollama: FAIL"
EOF
chmod +x status_rag_system.sh

# Create backup script (stops the stack, copies data and config, restarts it)
cat > backup_rag_system.sh << 'EOF'
#!/bin/bash
# Backup RAG System Data
BACKUP_DIR="./backups/$(date +%Y%m%d_%H%M%S)"
mkdir -p "$BACKUP_DIR"

echo "Creating backup in $BACKUP_DIR..."

# Stop services
docker compose down

# Backup data
cp -r ./backend/chat_data.db "$BACKUP_DIR/" 2>/dev/null || true
cp -r ./lancedb "$BACKUP_DIR/" 2>/dev/null || true
cp -r ./shared_uploads "$BACKUP_DIR/" 2>/dev/null || true
cp -r ./index_store "$BACKUP_DIR/" 2>/dev/null || true

# Backup configuration
cp .env "$BACKUP_DIR/"
cp docker-compose.yml "$BACKUP_DIR/"

# Restart services
docker compose up -d

echo "Backup completed: $BACKUP_DIR"
EOF
chmod +x backup_rag_system.sh

# Create update script (backup, pull, rebuild, restart)
cat > update_rag_system.sh << 'EOF'
#!/bin/bash
# Update RAG System
echo "Updating RAG System..."

# Backup first
./backup_rag_system.sh

# Pull latest changes
git pull origin main

# Rebuild containers
docker compose build --no-cache

# Restart services
docker compose up -d

echo "Update completed!"
EOF
chmod +x update_rag_system.sh

info "Helper scripts created:"
info "  - start_rag_system.sh: Start the system"
info "  - stop_rag_system.sh: Stop the system"
info "  - status_rag_system.sh: Check system status"
info "  - backup_rag_system.sh: Backup system data"
info "  - update_rag_system.sh: Update the system"

# Step 9: Final Setup
log "Step 9: Final setup and verification..."

# Create initial database if it doesn't exist
# NOTE(review): the check looks at the host path ./backend/chat_data.db while the
# Python snippet writes /app/backend/chat_data.db inside the container; this only
# lines up if the compose file bind-mounts ./backend there — confirm.
if [[ ! -f "./backend/chat_data.db" ]]; then
    info "Creating initial database..."
    # -T disables pseudo-TTY allocation so the exec also works when this script
    # runs without a terminal attached (CI, piped ssh, cron).
    docker compose exec -T backend python -c "
import sqlite3
conn = sqlite3.connect('/app/backend/chat_data.db')
conn.execute('CREATE TABLE IF NOT EXISTS sessions (id TEXT PRIMARY KEY, title TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)')
conn.execute('CREATE TABLE IF NOT EXISTS messages (id INTEGER PRIMARY KEY, session_id TEXT, content TEXT, role TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)')
conn.execute('CREATE TABLE IF NOT EXISTS indexes (id TEXT PRIMARY KEY, name TEXT, metadata TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)')
conn.execute('CREATE TABLE IF NOT EXISTS session_indexes (session_id TEXT, index_id TEXT, PRIMARY KEY (session_id, index_id))')
conn.commit()
conn.close()
print('Database initialized')
" 2>/dev/null || warn "Database initialization may have failed"
fi

# Final health check: give the containers a moment, then run the status helper
info "Performing final health check..."
sleep 10
./status_rag_system.sh

# Closing banner. The heredoc is unquoted on purpose so that $OS, ${MEMORY_GB},
# ${DISK_GB} and the docker version commands are expanded when it is printed.
cat << SUMMARY

================================================================
🎉 RAG System Setup Complete!
================================================================

✅ System Status:
   - Frontend: http://localhost:3000
   - Backend API: http://localhost:8000
   - RAG API: http://localhost:8001
   - Ollama: http://localhost:11434

📚 Documentation:
   - System Overview: Documentation/system_overview.md
   - Deployment Guide: Documentation/deployment_guide.md
   - Docker Usage: Documentation/docker_usage.md
   - Installation Guide: Documentation/installation_guide.md

🔧 Helper Scripts:
   - Start system: ./start_rag_system.sh
   - Stop system: ./stop_rag_system.sh
   - Check status: ./status_rag_system.sh
   - Backup data: ./backup_rag_system.sh
   - Update system: ./update_rag_system.sh

🚀 Next Steps:
   1. Open http://localhost:3000 in your browser
   2. Create a new chat session
   3. Upload some PDF documents
   4. Start asking questions about your documents!

📋 System Information:
   - OS: $OS
   - Memory: ${MEMORY_GB}GB
   - Disk Space: ${DISK_GB}GB available
   - Docker: $(docker --version)
   - Docker Compose: $(docker compose version)

For support and troubleshooting, check the documentation in the
Documentation/ folder or run ./status_rag_system.sh to check system health.

SUMMARY

================================================
FILE: simple_create_index.sh
================================================
#!/bin/bash

# Simple Index Creation Script for LocalGPT RAG System
# Usage: ./simple_create_index.sh "Index Name" "path/to/document.pdf" [additional_files...]

set -e  # Exit on any error

# Colors for output
# These hold backslash escape sequences as literal text; they only become real
# ANSI colour codes when printed through `echo -e` in the print_* helpers.
RED='\033[0;31m'     # used by print_error
GREEN='\033[0;32m'   # used by print_success
YELLOW='\033[1;33m'  # used by print_warning
BLUE='\033[0;34m'    # used by print_status
NC='\033[0m' # No Color

# Print an informational message, prefixed with a blue [INFO] tag.
#   $1 - message text (backslash escapes in it are interpreted by echo -e)
print_status() {
    local message="$1"
    echo -e "${BLUE}[INFO]${NC} ${message}"
}

# Print a success message, prefixed with a green [SUCCESS] tag.
#   $1 - message text
print_success() {
    local message="$1"
    echo -e "${GREEN}[SUCCESS]${NC} ${message}"
}

# Print a warning, prefixed with a yellow [WARNING] tag.
#   $1 - message text
# Written to stderr so warnings never end up in output that a caller captures
# with $(...) (validate_documents is consumed that way).
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}

# Print an error, prefixed with a red [ERROR] tag.
#   $1 - message text
# Written to stderr so errors stay visible and never end up in output that a
# caller captures with $(...).
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Succeed (exit status 0) when the given name resolves to something runnable
# according to `command -v`; stay silent either way.
#   $1 - command name to look up
command_exists() {
    local candidate="$1"
    command -v "$candidate" &>/dev/null
}

# Verify the environment before doing any work; exits the script with status 1
# on the first unmet requirement:
#   1. python3 is available
#   2. the current directory contains run_system.py and rag_system/
#   3. an Ollama server answers on localhost:11434
check_prerequisites() {
    print_status "Checking prerequisites..."

    # Requirement 1: Python
    command_exists python3 || {
        print_error "Python 3 is required but not installed."
        exit 1
    }

    # Requirement 2: must be invoked from the project root
    if [[ ! -f "run_system.py" || ! -d "rag_system" ]]; then
        print_error "This script must be run from the LocalGPT project root directory."
        exit 1
    fi

    # Requirement 3: Ollama must respond to a tags request
    if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
        print_success "Prerequisites check passed"
        return
    fi

    print_error "Ollama is not running. Please start Ollama first:"
    echo "  ollama serve"
    exit 1
}

# Function to validate documents
#
# Arguments: candidate document paths.
# Stdout:    the accepted paths only, one per line. Callers capture this with
#            $(...), so every progress/warning message is sent to stderr;
#            otherwise the log text would be mixed into the document list.
# Exit:      calls `exit 1` when no path is accepted. Inside a command
#            substitution that only ends the subshell, so callers must still
#            check for an empty result.
# Accepted:  existing regular files whose extension is exactly one of
#            pdf, txt, docx, md, html, htm (case-sensitive).
# NOTE: a caller that word-splits the captured output cannot handle paths
#       containing whitespace.
validate_documents() {
    local documents=("$@")
    local valid_docs=()
    local doc

    print_status "Validating documents..." >&2

    for doc in "${documents[@]}"; do
        if [ ! -f "$doc" ]; then
            print_warning "File not found: $doc (skipping)" >&2
            continue
        fi

        # Check file extension
        case "${doc##*.}" in
            pdf|txt|docx|md|html|htm)
                valid_docs+=("$doc")
                print_status "✓ Valid document: $doc" >&2
                ;;
            *)
                print_warning "Unsupported file type: $doc (skipping)" >&2
                ;;
        esac
    done

    if [ ${#valid_docs[@]} -eq 0 ]; then
        print_error "No valid documents found." >&2
        exit 1
    fi

    printf '%s\n' "${valid_docs[@]}"
}

# Function to create index using Python
#
# Arguments:
#   $1   - index name
#   $2.. - document paths to register and process
# Returns: the exit status of the Python helper (0 on success, non-zero when
#          index creation failed), so the caller does not report success after
#          a failure.
#
# The index name and document paths are handed to Python as command-line
# arguments instead of being pasted into the generated source. That keeps
# names/paths containing quotes or spaces from breaking (or injecting into)
# the Python code. The heredoc is quoted, so the shell expands nothing in it.
create_index() {
    local index_name="$1"
    shift
    local documents=("$@")

    print_status "Creating index: $index_name"
    print_status "Documents: ${documents[*]}"

    # Unique temporary file instead of a fixed, predictable /tmp path
    local temp_script
    if ! temp_script="$(mktemp "${TMPDIR:-/tmp}/create_index_XXXXXX")"; then
        print_error "Could not create a temporary file for the index script."
        return 1
    fi

    cat > "$temp_script" << 'PYEOF'
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.getcwd())

from rag_system.main import PIPELINE_CONFIGS
from rag_system.pipelines.indexing_pipeline import IndexingPipeline
from rag_system.utils.ollama_client import OllamaClient
from backend.database import ChatDatabase

def create_index_simple(index_name, documents):
    try:
        # Initialize database
        db = ChatDatabase()
        
        # Create index record
        index_id = db.create_index(
            name=index_name,
            description="Created with simple_create_index.sh",
            metadata={
                "chunk_size": 512,
                "chunk_overlap": 64,
                "enable_enrich": True,
                "enable_latechunk": True,
                "retrieval_mode": "hybrid",
                "created_by": "simple_create_index.sh"
            }
        )
        
        # Add documents to index
        for doc_path in documents:
            filename = os.path.basename(doc_path)
            db.add_document_to_index(index_id, filename, os.path.abspath(doc_path))
        
        # Initialize pipeline
        config = PIPELINE_CONFIGS.get("default", {})
        ollama_client = OllamaClient()
        ollama_config = {
            "generation_model": "qwen3:0.6b",
            "embedding_model": "qwen3:0.6b"
        }
        
        pipeline = IndexingPipeline(config, ollama_client, ollama_config)
        
        # Process documents
        valid_docs = [doc for doc in documents if os.path.exists(doc)]
        if valid_docs:
            pipeline.process_documents(valid_docs)
        
        print(f"✅ Index '{index_name}' created successfully!")
        print(f"Index ID: {index_id}")
        print(f"Processed {len(valid_docs)} documents")
        
        return index_id
        
    except Exception as e:
        print(f"❌ Error creating index: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    # argv[1] is the index name, the remaining arguments are document paths
    name = sys.argv[1]
    docs = [path for path in sys.argv[2:] if path]
    sys.exit(0 if create_index_simple(name, docs) is not None else 1)
PYEOF

    # Run the Python script; remember its status so cleanup always happens
    local status=0
    python3 "$temp_script" "$index_name" "${documents[@]}" || status=$?

    # Clean up
    rm -f "$temp_script"

    return "$status"
}

# Print the command-line synopsis, three example invocations and the list of
# supported file types. $0 expands to the name the script was invoked with.
show_usage() {
    cat << USAGE
Usage: $0 "Index Name" "path/to/document.pdf" [additional_files...]

Examples:
  $0 "My Documents" "document.pdf"
  $0 "Research Papers" "paper1.pdf" "paper2.pdf" "notes.txt"
  $0 "Invoice Collection" ./invoices/*.pdf

Supported file types: PDF, TXT, DOCX, MD, HTML
USAGE
}

# Entry point: parse arguments, check the environment, validate the document
# list and build the index.
#   $1   - index name
#   $2.. - one or more document paths
main() {
    # At least an index name and one document are required
    if (( $# < 2 )); then
        print_error "Insufficient arguments provided."
        show_usage
        exit 1
    fi

    local index_name="$1"
    local documents=("${@:2}")

    # Environment checks (exits on failure)
    check_prerequisites

    # Keep only the documents accepted by validate_documents.
    # The captured output is word-split into the array.
    local valid_documents
    valid_documents=($(validate_documents "${documents[@]}"))

    if (( ${#valid_documents[@]} == 0 )); then
        print_error "No valid documents to process."
        exit 1
    fi

    # Build the index
    print_status "Starting index creation process..."
    create_index "$index_name" "${valid_documents[@]}"

    print_success "Index creation completed!"
    print_status "You can now use the index in the LocalGPT interface."
}

# Run main function with all arguments
main "$@"

================================================
FILE: src/app/globals.css
================================================
@import "tailwindcss";
@import "tw-animate-css";

@custom-variant dark (&:is(.dark *));

/*
 * Theme token mapping. Each --color-*, --font-* and --radius-* token declared
 * here is an alias for a plain custom property (--background, --primary, …)
 * whose value is set in the :root and .dark rules below, so the tokens follow
 * whichever of those two rule sets applies.
 */
@theme inline {
  --color-background: var(--background);
  --color-foreground: var(--foreground);
  /* Font variables referenced here are the ones named in src/app/layout.tsx */
  --font-sans: var(--font-geist-sans);
  --font-mono: var(--font-geist-mono);
  /* Sidebar palette */
  --color-sidebar-ring: var(--sidebar-ring);
  --color-sidebar-border: var(--sidebar-border);
  --color-sidebar-accent-foreground: var(--sidebar-accent-foreground);
  --color-sidebar-accent: var(--sidebar-accent);
  --color-sidebar-primary-foreground: var(--sidebar-primary-foreground);
  --color-sidebar-primary: var(--sidebar-primary);
  --color-sidebar-foreground: var(--sidebar-foreground);
  --color-sidebar: var(--sidebar);
  /* Chart palette */
  --color-chart-5: var(--chart-5);
  --color-chart-4: var(--chart-4);
  --color-chart-3: var(--chart-3);
  --color-chart-2: var(--chart-2);
  --color-chart-1: var(--chart-1);
  /* General UI colours */
  --color-ring: var(--ring);
  --color-input: var(--input);
  --color-border: var(--border);
  --color-destructive: var(--destructive);
  --color-accent-foreground: var(--accent-foreground);
  --color-accent: var(--accent);
  --color-muted-foreground: var(--muted-foreground);
  --color-muted: var(--muted);
  --color-secondary-foreground: var(--secondary-foreground);
  --color-secondary: var(--secondary);
  --color-primary-foreground: var(--primary-foreground);
  --color-primary: var(--primary);
  --color-popover-foreground: var(--popover-foreground);
  --color-popover: var(--popover);
  --color-card-foreground: var(--card-foreground);
  --color-card: var(--card);
  /* Radius scale, derived from the single --radius value set in :root */
  --radius-sm: calc(var(--radius) - 4px);
  --radius-md: calc(var(--radius) - 2px);
  --radius-lg: var(--radius);
  --radius-xl: calc(var(--radius) + 4px);
}

/*
 * Default values for the design variables (used when no ancestor has the
 * .dark class). Colours are written in oklch(); most have zero chroma, i.e.
 * they are shades of grey.
 */
:root {
  /* Base corner radius; the --radius-sm/md/lg/xl tokens are computed from it */
  --radius: 0.625rem;
  --background: oklch(1 0 0);
  --foreground: oklch(0.145 0 0);
  --card: oklch(1 0 0);
  --card-foreground: oklch(0.145 0 0);
  --popover: oklch(1 0 0);
  --popover-foreground: oklch(0.145 0 0);
  --primary: oklch(0.205 0 0);
  --primary-foreground: oklch(0.985 0 0);
  --secondary: oklch(0.97 0 0);
  --secondary-foreground: oklch(0.205 0 0);
  --muted: oklch(0.97 0 0);
  --muted-foreground: oklch(0.556 0 0);
  --accent: oklch(0.97 0 0);
  --accent-foreground: oklch(0.205 0 0);
  --destructive: oklch(0.577 0.245 27.325);
  --border: oklch(0.922 0 0);
  --input: oklch(0.922 0 0);
  --ring: oklch(0.708 0 0);
  /* Chart series colours */
  --chart-1: oklch(0.646 0.222 41.116);
  --chart-2: oklch(0.6 0.118 184.704);
  --chart-3: oklch(0.398 0.07 227.392);
  --chart-4: oklch(0.828 0.189 84.429);
  --chart-5: oklch(0.769 0.188 70.08);
  /* Sidebar colours */
  --sidebar: oklch(0.985 0 0);
  --sidebar-foreground: oklch(0.145 0 0);
  --sidebar-primary: oklch(0.205 0 0);
  --sidebar-primary-foreground: oklch(0.985 0 0);
  --sidebar-accent: oklch(0.97 0 0);
  --sidebar-accent-foreground: oklch(0.205 0 0);
  --sidebar-border: oklch(0.922 0 0);
  --sidebar-ring: oklch(0.708 0 0);
}

/*
 * Dark-theme overrides: the same variable names as :root with dark values,
 * applied to an element carrying the .dark class. --radius is not redefined
 * here.
 */
.dark {
  --background: oklch(0.145 0 0);
  --foreground: oklch(0.985 0 0);
  --card: oklch(0.205 0 0);
  --card-foreground: oklch(0.985 0 0);
  --popover: oklch(0.205 0 0);
  --popover-foreground: oklch(0.985 0 0);
  --primary: oklch(0.922 0 0);
  --primary-foreground: oklch(0.205 0 0);
  --secondary: oklch(0.269 0 0);
  --secondary-foreground: oklch(0.985 0 0);
  --muted: oklch(0.269 0 0);
  --muted-foreground: oklch(0.708 0 0);
  --accent: oklch(0.269 0 0);
  --accent-foreground: oklch(0.985 0 0);
  --destructive: oklch(0.704 0.191 22.216);
  /* Border and input use white at 10% / 15% alpha instead of a solid grey */
  --border: oklch(1 0 0 / 10%);
  --input: oklch(1 0 0 / 15%);
  --ring: oklch(0.556 0 0);
  /* Chart series colours */
  --chart-1: oklch(0.488 0.243 264.376);
  --chart-2: oklch(0.696 0.17 162.48);
  --chart-3: oklch(0.769 0.188 70.08);
  --chart-4: oklch(0.627 0.265 303.9);
  --chart-5: oklch(0.645 0.246 16.439);
  /* Sidebar colours */
  --sidebar: oklch(0.205 0 0);
  --sidebar-foreground: oklch(0.985 0 0);
  --sidebar-primary: oklch(0.488 0.243 264.376);
  --sidebar-primary-foreground: oklch(0.985 0 0);
  --sidebar-accent: oklch(0.269 0 0);
  --sidebar-accent-foreground: oklch(0.985 0 0);
  --sidebar-border: oklch(1 0 0 / 10%);
  --sidebar-ring: oklch(0.556 0 0);
}

/* Global element defaults, placed in the `base` layer. */
@layer base {
  /* Every element picks up the theme border colour and a half-opacity ring outline colour */
  * {
    @apply border-border outline-ring/50;
  }
  /* Black page background; overflow is hidden on both axes at the document root */
  html {
    @apply bg-black overflow-x-hidden overflow-y-hidden;
    /* Root font size: every rem-based size scales from 17px */
    font-size: 17px;
  }
  /* White text on black; horizontal overflow is hidden on the body */
  body {
    @apply bg-black text-white overflow-x-hidden;
  }
}

/*
 * Style for <think> tokens.
 * .thinking-block is matched with [open] and contains a <summary>, i.e. it is
 * used on a <details> element. The browser's own disclosure marker is hidden
 * and replaced by a "▸" after the summary text that rotates 90° when open.
 */

/* Hide the built-in marker in WebKit-based browsers */
.thinking-block summary::-webkit-details-marker {
  display: none;
}
/* Custom marker appended after the summary text */
.thinking-block summary::after {
  content: "▸";
  display: inline-block;
  margin-left: 4px;
  transform-origin: center;
  transition: transform 0.15s ease-out;
}
/* Point the marker downwards while the block is expanded */
.thinking-block[open] summary::after {
  transform: rotate(90deg);
}
.thinking-block summary {
  /* Standard way to hide the built-in marker (the summary is a list item);
     without it Firefox shows its own triangle next to the custom one */
  list-style: none;
  outline: none;
}

/* Body of the thinking block: muted grey, italic */
.thinking-block div {
  color: #9ca3af;
  font-style: italic;
}


================================================
FILE: src/app/layout.tsx
================================================
import type { Metadata } from "next";
import { Geist, Geist_Mono } from "next/font/google";
import "./globals.css";

// Geist sans-serif face loaded through next/font. The CSS custom property
// name given here is the one globals.css reads for --font-sans.
const geistSans = Geist({
  variable: "--font-geist-sans",
  subsets: ["latin"],
});

// Geist monospace face loaded through next/font. The CSS custom property
// name given here is the one globals.css reads for --font-mono.
const geistMono = Geist_Mono({
  variable: "--font-geist-mono",
  subsets: ["latin"],
});

// Document metadata exported for Next.js (page title and description).
// Replaces the create-next-app placeholder text with the product's own name.
export const metadata: Metadata = {
  title: "LocalGPT",
  description: "Chat with your documents locally using the LocalGPT RAG system",
};

/**
 * Root layout wrapping every page.
 *
 * Renders the <html>/<body> shell: black background on <html>, and on <body>
 * the two Geist font variable classes plus a full-height, non-scrolling
 * flex column that hosts the page content.
 */
export default function RootLayout({
  children,
}: Readonly<{
  children: React.ReactNode;
}>) {
  // Font variable classes first, then the fixed layout utilities
  const bodyClassName = [
    geistSans.variable,
    geistMono.variable,
    "antialiased h-screen overflow-hidden flex flex-col",
  ].join(" ");

  return (
    <html lang="en" className="bg-black">
      <body className={bodyClassName}>{children}</body>
    </html>
  );
}


================================================
FILE: src/app/page.tsx
================================================
import { Demo } from "@/components/demo";

/**
 * Home page: a flex column that fills the remaining height of the layout and
 * renders the Demo component inside it.
 */
export default function Home() {
  const content = <Demo />;
  return <main className="flex flex-col flex-1 min-h-0">{content}</main>;
}


================================================
FILE: src/components/IndexForm.tsx
================================================
"use client";
import { useState } from 'react';
import { GlassInput } from '@/components/ui/GlassInput';
import { GlassToggle } from '@/components/ui/GlassToggle';
import { AccordionGroup } from '@/components/ui/AccordionGroup';
import { ModelSelect } from '@/components/ModelSelect';
import { chatAPI, ChatSession } from '@/lib/api';
import { InfoTooltip } from '@/components/ui/InfoTooltip';

/** Props accepted by IndexForm. */
interface Props {
  /** Invoked when the user presses the Cancel button. */
  onClose: () => void;
  /**
   * Invoked with the newly created chat session after the index has been
   * built and linked to that session. Optional.
   */
  onIndexed?: (session: ChatSession) => void;
}

/** Parse the raw text of a number input as a base-10 integer (NaN when empty or invalid). */
const parseNumberField = (raw: string): number => Number.parseInt(raw, 10);

/**
 * Value to hand to a controlled number input: an empty string instead of NaN,
 * so the user can clear the field while typing without React receiving NaN.
 */
const fieldValue = (n: number): number | '' => (Number.isNaN(n) ? '' : n);

/**
 * Form for creating a new index.
 *
 * Collects an index name, the documents to upload and the indexing options,
 * then on submit: creates the index record, uploads the files, builds the
 * index, creates a chat session named after the index, links the two and
 * reports the session through `onIndexed`.
 *
 * Submission is disabled while a run is in progress, when no file is
 * selected, when the name is blank, or when any numeric option is empty or
 * invalid.
 */
export function IndexForm({ onClose, onIndexed }: Props) {
  const [files, setFiles] = useState<FileList | null>(null);
  const [indexName, setIndexName] = useState('');
  const [chunkSize, setChunkSize] = useState(512);
  const [chunkOverlap, setChunkOverlap] = useState(64);
  const [windowSize, setWindowSize] = useState(5);
  const [enableEnrich, setEnableEnrich] = useState(true);
  const [retrievalMode, setRetrievalMode] = useState<'hybrid' | 'vector' | 'fts'>('hybrid');
  const [embeddingModel, setEmbeddingModel] = useState<string>();
  const DEFAULT_LLM = 'qwen3:0.6b';
  const [enrichModel, setEnrichModel] = useState<string>(DEFAULT_LLM);
  const [overviewModel, setOverviewModel] = useState<string>(DEFAULT_LLM);
  const [batchSizeEmbed, setBatchSizeEmbed] = useState(64);
  const [batchSizeEnrich, setBatchSizeEnrich] = useState(64);
  const [loading, setLoading] = useState(false);
  const [enableLateChunk, setEnableLateChunk] = useState(false);
  const [enableDoclingChunk, setEnableDoclingChunk] = useState(true);

  // An empty FileList is truthy (e.g. after cancelling the file dialog), so check its length too.
  const hasFiles = files !== null && files.length > 0;
  const trimmedName = indexName.trim();
  // A cleared number field is held as NaN; block submission until it is filled in again.
  const hasInvalidNumber = [chunkSize, chunkOverlap, windowSize, batchSizeEmbed, batchSizeEnrich]
    .some((n) => Number.isNaN(n));
  const canSubmit = !loading && hasFiles && trimmedName !== '' && !hasInvalidNumber;

  const handleSubmit = async () => {
    if (!files || files.length === 0 || trimmedName === '' || hasInvalidNumber) return;
    setLoading(true);
    try {
      // 1. create index record
      const { index_id } = await chatAPI.createIndex(trimmedName);

      // 2. upload files to index
      await chatAPI.uploadFilesToIndex(index_id, Array.from(files));

      // 3. build index (run pipeline) with ALL OPTIONS
      await chatAPI.buildIndex(index_id, {
        latechunk: enableLateChunk,
        doclingChunk: enableDoclingChunk,
        chunkSize: chunkSize,
        chunkOverlap: chunkOverlap,
        // the UI calls this mode "fts"; the API is sent "bm25" for it
        retrievalMode: retrievalMode==='fts' ? 'bm25' : retrievalMode,
        windowSize: windowSize,
        enableEnrich: enableEnrich,
        embeddingModel: embeddingModel,
        enrichModel: enrichModel,
        overviewModel: overviewModel,
        batchSizeEmbed: batchSizeEmbed,
        batchSizeEnrich: batchSizeEnrich
      });

      // 4. create chat session and link index
      const session = await chatAPI.createSession(trimmedName);
      await chatAPI.linkIndexToSession(session.id, index_id);

      // 5. callback
      if (onIndexed) onIndexed(session);
    } catch (e) {
      console.error('Indexing failed', e);
      alert('Indexing failed. See console for details.');
    } finally {
      // Always clear the spinner, on success as well as on failure.
      setLoading(false);
    }
  };

  return (
    <div className="relative bg-white/5 backdrop-blur rounded-xl p-6 w-[640px] text-white space-y-6">
      {/* Loading overlay */}
      {loading && (
        <div className="absolute inset-0 bg-black/60 backdrop-blur-sm flex flex-col items-center justify-center rounded-xl z-20">
          <div className="w-10 h-10 border-4 border-white/30 border-t-transparent rounded-full animate-spin"></div>
          <p className="mt-4 text-sm text-gray-200">Indexing… this may take a moment</p>
        </div>
      )}

      <h2 className="text-lg font-semibold">Create new index</h2>

      {/* Index name */}
      <div>
        <label className="block text-xs uppercase tracking-wide text-gray-300 mb-1">Index name</label>
        <GlassInput placeholder="My project docs" value={indexName} onChange={(e)=>setIndexName(e.target.value)} />
      </div>

      {/* Upload & defaults */}
      <div className="space-y-4">
        <div>
          {/* The picker accepts more than PDFs (see `accept` below), so the label says so */}
          <label className="block text-xs uppercase tracking-wide text-gray-300 mb-1">Document files</label>
          <label
            htmlFor="file-upload"
            className="flex flex-col items-center justify-center w-full h-32 border border-dashed border-white/20 rounded cursor-pointer hover:border-white/40 transition"
            onDragOver={(e)=>e.preventDefault()}
            onDrop={(e)=>{e.preventDefault(); if(e.dataTransfer.files) setFiles(e.dataTransfer.files)}}
          >
            <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round" className="mb-2 text-white/80"><path d="M4 16v2a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2v-2"/><polyline points="7 10 12 5 17 10"/><line x1="12" y1="5" x2="12" y2="16"/></svg>
            <span className="text-xs text-gray-400">Drag & Drop documents here or click to browse</span>
            <input id="file-upload" type="file" accept="application/pdf,.docx,.doc,.html,.htm,.md,.txt" multiple className="hidden" onChange={(e)=>setFiles(e.target.files)} />
          </label>
          {files && files.length > 0 && <p className="mt-1 text-xs text-green-400">{files.length} file(s) selected</p>}
        </div>

        {/* Retrieval mode & Late-chunk toggle */}
        <div>
          <label className="flex items-center gap-1 text-xs uppercase tracking-wide text-gray-300 mb-1">Retrieval mode <InfoTooltip text="Choose how chunks are found. Hybrid combines full-text search with vectors; FTS uses textual matching only; Vector relies purely on dense similarity." /></label>
          <div className="flex gap-3">
            {(['hybrid','vector','fts'] as const).map((m)=>(
              <button key={m} onClick={()=>setRetrievalMode(m)} className={`px-3 py-1 rounded text-xs font-sans ${retrievalMode===m?'bg-white/20':'bg-white/10 hover:bg-white/20'}`}>{m==='fts' ? 'FTS' : m}</button>
            ))}
          </div>
          <div className="grid grid-cols-2 gap-4 mt-3">
            <div className="flex items-center gap-2">
              <span className="text-xs text-gray-400">Late-chunk vectors <InfoTooltip text="Split chunks into sub-vectors to improve recall, then merge them back after retrieval." size={12} /></span>
              <GlassToggle checked={enableLateChunk} onChange={setEnableLateChunk} />
            </div>
            <div className="flex items-center gap-2">
              <span className="text-xs text-gray-400">High-recall chunking <InfoTooltip text="Advanced sentence-level packing with Docling features for maximum recall. Both modes use token-based sizing." size={12} /></span>
              <GlassToggle checked={enableDoclingChunk} onChange={setEnableDoclingChunk} />
            </div>
          </div>
          <div className="grid grid-cols-2 gap-4 mt-4">
            <div>
              <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk size <InfoTooltip text="Maximum token length for each chunk. Both legacy and high-recall modes now use token-based sizing." size={12} /></label>
              <GlassInput type="number" value={fieldValue(chunkSize)} onChange={(e) => setChunkSize(parseNumberField(e.target.value))} />
            </div>
            <div>
              <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Chunk overlap <InfoTooltip text="Tokens reused between adjacent chunks to preserve context." size={12} /></label>
              <GlassInput
                type="number"
                value={fieldValue(chunkOverlap)}
                onChange={(e) => setChunkOverlap(parseNumberField(e.target.value))}
              />
            </div>
          </div>

          {/* Embedding & Overview models */}
          <div className="grid grid-cols-2 gap-4 mt-4">
            <div>
              <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Embedding model <InfoTooltip text="Model used to generate dense vectors stored in the index." size={12} /></label>
              <ModelSelect
                value={embeddingModel}
                onChange={setEmbeddingModel}
                type="embedding"
                placeholder="Select embedding model"
              />
            </div>
            <div>
              <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Overview LLM <InfoTooltip text="LLM that writes the short overview paragraph per document." size={12} /></label>
              <ModelSelect
                value={overviewModel}
                onChange={setOverviewModel}
                type="generation"
                placeholder="Select overview LLM"
              />
            </div>
          </div>
        </div>

        {/* Contextual retrieval section */}
        <AccordionGroup title={<><span>Contextual Retrieval</span> <InfoTooltip text="Adds neighbour chunks into each original chunk then enriches with LLM – improves semantic continuity but increases indexing latency." /></>}>
          <div className="flex items-center gap-3">
            <span className="text-xs text-gray-400">Enable</span>
            <GlassToggle checked={enableEnrich} onChange={setEnableEnrich} />
          </div>
          <div className="grid grid-cols-2 gap-4 mt-3">
            <div>
              <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Context window <InfoTooltip text="Number of neighbour chunks included when enriching context." size={12} /></label>
              <GlassInput type="number" value={fieldValue(windowSize)} onChange={(e)=>setWindowSize(parseNumberField(e.target.value))} />
            </div>
            <div>
              <label className="block text-xs mb-1 text-gray-400">Retrieval LLM</label>
              <ModelSelect
                value={enrichModel}
                onChange={setEnrichModel}
                type="generation"
                placeholder="Select retrieval LLM"
              />
            </div>
          </div>
        </AccordionGroup>
      </div>

      {/* Advanced */}
      <AccordionGroup title={<><span>Batch Size</span> <InfoTooltip text="Control the number of chunks processed per batch. Larger values speed up indexing but require more memory." /></>}>
        <div className="grid grid-cols-2 gap-4">
          <div>
            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Embedding batch size <InfoTooltip text="Chunks processed per batch when producing embeddings." size={12} /></label>
            <GlassInput
              type="number"
              value={fieldValue(batchSizeEmbed)}
              onChange={(e) => setBatchSizeEmbed(parseNumberField(e.target.value))}
            />
          </div>
          <div>
            <label className="flex items-center gap-1 text-xs mb-1 text-gray-400">Context retrieval batch size <InfoTooltip text="Chunks sent per request during contextual enrichment." size={12} /></label>
            <GlassInput
              type="number"
              value={fieldValue(batchSizeEnrich)}
              onChange={(e) => setBatchSizeEnrich(parseNumberField(e.target.value))}
            />
          </div>
        </div>
      </AccordionGroup>

      <div className="flex justify-end gap-3 pt-4 border-t border-white/10">
        <button onClick={onClose} className="px-4 py-2 bg-gray-700 rounded hover:bg-gray-600 text-sm">
          Cancel
        </button>
        <button
          disabled={!canSubmit}
          onClick={handleSubmit}
          className="px-4 py-2 bg-green-600 rounded disabled:opacity-40 text-sm"
        >
          {loading ? 'Indexing…' : 'Start indexing'}
        </button>
      </div>
    </div>
  );
}

================================================
FILE: src/components/IndexPicker.tsx
================================================
import { useEffect, useState } from 'react';
import { chatAPI } from '@/lib/api';

/** Props accepted by IndexPicker. */
interface Props {
  /** Invoked with the id of the index the user picked (row click or "Open" in the row menu). */
  onSelect: (indexId: string) => void;
  /** Invoked when the user presses the Close button. */
  onClose: () => void;
}

/**
 * Modal dialog listing the available indexes.
 *
 * Loads the list once on mount, filters it by the search box (case-insensitive
 * match on the index name), lets the user pick an index, and offers a per-row
 * "…" menu with Open and Delete actions.
 */
export default function IndexPicker({ onSelect, onClose }: Props) {
  const [indexes, setIndexes] = useState<any[]>([]);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  const [search, setSearch] = useState('');

  // id of the row whose action menu is open, or null when none is open
  const [menuOpenId, setMenuOpenId] = useState<string | null>(null);

  useEffect(() => {
    // Ignore the response if the picker was closed before the request finished.
    let cancelled = false;
    (async () => {
      try {
        const data = await chatAPI.listIndexes();
        // Fall back to an empty list so the filter below never runs on undefined.
        if (!cancelled) setIndexes(data.indexes ?? []);
      } catch (e: any) {
        if (!cancelled) setError(e?.message || 'Failed to load indexes');
      } finally {
        if (!cancelled) setLoading(false);
      }
    })();
    return () => {
      cancelled = true;
    };
  }, []);

  // Case-insensitive name filter; a record without a name is treated as an empty name.
  const query = search.toLowerCase();
  const filtered = indexes.filter(i => String(i.name ?? '').toLowerCase().includes(query));

  async function handleDelete(idxId: string, name: string) {
    if (!confirm(`Delete index "${name}"? This cannot be undone.`)) return;
    try {
      await chatAPI.deleteIndex(idxId);
      setIndexes(prev => prev.filter(i => i.id!==idxId));
      setMenuOpenId(null);
    } catch (e:any){
      alert(e?.message || 'Failed to delete index');
    }
  }

  // While a row menu is open, close it on any click that lands outside the menu.
  useEffect(() => {
    function handleOutside(e: MouseEvent) {
      const target = e.target;
      // A target that is not an Element cannot be inside the menu.
      if (!(target instanceof Element) || target.closest('.index-row-menu') === null) {
        setMenuOpenId(null);
      }
    }
    if (menuOpenId) {
      document.addEventListener('click', handleOutside);
    }
    return () => document.removeEventListener('click', handleOutside);
  }, [menuOpenId]);

  return (
    <div className="fixed inset-0 bg-black/60 backdrop-blur-sm flex items-center justify-center z-50 p-4">
      <div className="bg-white/5 backdrop-blur rounded-xl w-full max-w-xl max-h-full overflow-y-auto p-6 text-white space-y-6">
        <h2 className="text-lg font-semibold">Select an index</h2>
        <input value={search} onChange={e=>setSearch(e.target.value)} placeholder="Search…" className="w-full px-3 py-2 rounded bg-black/30 border border-white/20 focus:outline-none" />
        {loading && <p className="text-sm text-gray-300">Loading…</p>}
        {error && <p className="text-sm text-red-400">{error}</p>}
        {!loading && !error && (
          <ul className="space-y-2">
            {filtered.map(idx => (
              <li key={idx.id}>
                <div className="relative group">
                  <button onClick={()=>onSelect(idx.id)} className="w-full px-4 py-3 bg-white/10 hover:bg-white/20 rounded transition flex justify-between items-center pr-10">
                    <span className="font-medium truncate max-w-[60%]">{idx.name}</span>
                    <span className="text-xs text-gray-400">{idx.documents?.length || 0} files</span>
                  </button>

                  {/* stopPropagation keeps this click from reaching the outside-click handler */}
                  <button onClick={(e)=>{e.stopPropagation(); setMenuOpenId(menuOpenId===idx.id?null:idx.id);}} title="More actions" className="absolute right-4 top-1/2 -translate-y-1/2 opacity-0 group-hover:opacity-100 text-gray-400 hover:text-white transition text-lg leading-none font-bold">
                    …
                  </button>

                  {menuOpenId===idx.id && (
                    <div className="index-row-menu absolute right-0 top-full mt-1 bg-black/80 backdrop-blur border border-white/10 rounded shadow-lg py-1 w-32 text-sm z-50">
                      <button onClick={()=>{onSelect(idx.id); setMenuOpenId(null);}} className="block w-full text-left px-4 py-2 hover:bg-white/10">Open</button>
                      <button onClick={()=>handleDelete(idx.id, idx.name)} className="block w-full text-left px-4 py-2 hover:bg-white/10 text-red-400 hover:text-red-500">Delete</button>
                    </div>
                  )}
                </div>
              </li>
            ))}
            {/* Rendered as a list item: <ul> may only contain <li> children */}
            {filtered.length===0 && <li className="text-sm text-gray-400">No indexes found.</li>}
          </ul>
        )}
        <div className="pt-4 border-t border-white/10 flex justify-end">
          <button onClick={onClose} className="px-4 py-2 bg-gray-700 rounded hover:bg-gray-600 text-sm">Close</button>
        </div>
      </div>
    </div>
  );
}

================================================
FILE: src/components/IndexWizard.tsx
================================================
"use client";
import { useState } from 'react';
import { ModelSelect } from '@/components/ModelSelect';

/** Props accepted by IndexWizard. */
interface Props {
  /** Callback supplied by the parent; presumably invoked to dismiss the wizard — confirm against the component body. */
  onClose: () => void;
}

/**
 * Modal form that collects the inputs for a new index: the document files,
 * chunk size, chunk overlap and the embedding model.
 *
 * The "Start indexing" button is enabled once at least one file and an
 * embedding model are chosen. It has no click handler yet, so the component
 * currently only gathers the values.
 *
 * @param onClose Called when the user clicks Cancel.
 */
export function IndexWizard({ onClose }: Props) {
  const [files, setFiles] = useState<FileList | null>(null);
  const [chunkSize, setChunkSize] = useState(512);
  const [chunkOverlap, setChunkOverlap] = useState(64);
  const [embeddingModel, setEmbeddingModel] = useState<string>();
  // TODO: more params

  const handleFile = (e: React.ChangeEvent<HTMLInputElement>) => {
    setFiles(e.target.files);
  };

  // A cleared number field parses to NaN; storing NaN would feed an invalid
  // `value` back into the controlled input, so fall back to 0 instead.
  const toInt = (raw: string): number => {
    const parsed = Number.parseInt(raw, 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  // Cancelling the file dialog leaves an empty (but truthy) FileList, so the
  // length has to be checked as well as the null case.
  const hasFiles = files !== null && files.length > 0;
  const canStart = hasFiles && Boolean(embeddingModel);

  return (
    <div className="fixed inset-0 bg-black/60 backdrop-blur flex items-center justify-center z-50">
      <div className="bg-gray-900 w-[600px] max-h-[90vh] overflow-auto rounded-xl p-6 text-white space-y-6">
        <h2 className="text-lg font-semibold">Create new index</h2>

        <div className="space-y-4">
          <div>
            <label className="block text-sm mb-1">Document files</label>
            <input type="file" accept="application/pdf,.docx,.doc,.html,.htm,.md,.txt" multiple onChange={handleFile} className="text-sm" />
          </div>

          <div className="grid grid-cols-2 gap-4">
            <div>
              <label className="block text-sm mb-1">Chunk size</label>
              <input
                type="number"
                value={chunkSize}
                onChange={(e) => setChunkSize(toInt(e.target.value))}
                className="w-full bg-gray-800 rounded px-2 py-1"
              />
            </div>
            <div>
              <label className="block text-sm mb-1">Chunk overlap</label>
              <input
                type="number"
                value={chunkOverlap}
                onChange={(e) => setChunkOverlap(toInt(e.target.value))}
                className="w-full bg-gray-800 rounded px-2 py-1"
              />
            </div>
          </div>

          <div>
            <label className="block text-sm mb-1">Embedding model</label>
            <ModelSelect type="embedding" value={embeddingModel} onChange={setEmbeddingModel} />
          </div>
        </div>

        <div className="flex justify-end gap-3 pt-4 border-t border-white/10">
          <button onClick={onClose} className="px-4 py-2 bg-gray-700 rounded hover:bg-gray-600 text-sm">
            Cancel
          </button>
          <button
            disabled={!canStart}
            className="px-4 py-2 bg-green-600 rounded disabled:opacity-40 text-sm"
          >
            Start indexing
          </button>
        </div>
      </div>
    </div>
  );
}

================================================
FILE: src/components/LandingMenu.tsx
================================================
"use client";

import React from 'react';

/** Props accepted by the LandingMenu component. */
interface Props {
  /** Invoked with the mode of the tile the user clicked. */
  onSelect: (mode: 'INDEX' | 'CHAT_EXISTING' | 'QUICK_CHAT') => void;
}

/** Mode identifier passed to LandingMenu's `onSelect` callback. */
type LandingMode = Parameters<Props['onSelect']>[0];

interface LandingTileProps {
  label: string;
  mode: LandingMode;
  icon: React.ReactNode;
  onSelect: Props['onSelect'];
}

/**
 * One clickable tile of the landing menu.
 *
 * Defined at module level on purpose: a component declared inside another
 * component's body gets a new identity on every render, which makes React
 * unmount and remount it (losing focus and any DOM state) each time.
 */
function LandingTile({ label, mode, icon, onSelect }: LandingTileProps) {
  return (
    <button
      onClick={() => onSelect(mode)}
      className="w-56 h-44 rounded-xl bg-white/5 backdrop-blur border border-white/10 hover:border-white/30 text-white flex flex-col items-center justify-center gap-2 transition"
    >
      {icon}
      <span className="text-sm font-medium">{label}</span>
    </button>
  );
}

// The icons are static, so they are created once instead of on every render.
const FileIcon = (
  <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
    <path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z" />
    <polyline points="14 2 14 8 20 8" />
  </svg>
);

const DbIcon = (
  <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
    <ellipse cx="12" cy="5" rx="9" ry="3" />
    <path d="M3 5v6c0 1.7 4 3 9 3s9-1.3 9-3V5" />
    <path d="M3 11v6c0 1.7 4 3 9 3s9-1.3 9-3v-6" />
  </svg>
);

const ChatIcon = (
  <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round">
    <path d="M21 15a2 2 0 0 1-2 2H7l-4 4V5a2 2 0 0 1 2-2h14a2 2 0 0 1 2 2z" />
  </svg>
);

/**
 * Landing screen selector with three tiles: create a new index, chat with an
 * existing index, or start a plain LLM chat.
 *
 * @param onSelect Called with 'INDEX', 'CHAT_EXISTING' or 'QUICK_CHAT'
 *                 depending on which tile was clicked.
 */
export function LandingMenu({ onSelect }: Props) {
  return (
    <div className="flex gap-8">
      <LandingTile label="Create new index" mode="INDEX" icon={FileIcon} onSelect={onSelect} />
      <LandingTile label="Chat with index" mode="CHAT_EXISTING" icon={DbIcon} onSelect={onSelect} />
      <LandingTile label="LLM Chat" mode="QUICK_CHAT" icon={ChatIcon} onSelect={onSelect} />
    </div>
  );
}

================================================
FILE: src/components/Markdown.tsx
================================================
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-nocheck
'use client'

import dynamic from 'next/dynamic'
import React, { useMemo } from 'react'
import remarkGfm from 'remark-gfm'

// Dynamically import react-markdown with server-side rendering disabled
// (`ssr: false`) to avoid SSR issues. The result is typed as `any`.
const ReactMarkdown: any = dynamic(() => import('react-markdown') as any, { ssr: false })

/** Props accepted by the Markdown component. */
interface MarkdownProps {
  /** Markdown source passed to react-markdown as its content. */
  text: string
  /** Extra CSS classes appended to the wrapper element's class list. */
  className?: string
}

/**
 * Renders `text` with react-markdown (plus the remark-gfm plugin) inside a
 * `prose prose-invert` wrapper. Every link is rendered with
 * `target="_blank"` and `rel="noopener noreferrer"`.
 */
export default function Markdown({ text, className = '' }: MarkdownProps) {
  // Memoised so the plugin array keeps the same identity across renders.
  const remarkPlugins = useMemo(() => [remarkGfm], [])

  // `node` is destructured away so it is not forwarded onto the <a> element.
  const renderers = {
    a: ({ node, ...anchorProps }) => (
      <a {...anchorProps} target="_blank" rel="noopener noreferrer" />
    ),
  }

  const wrapperClass = `prose prose-invert max-w-none ${className}`

  return (
    <div className={wrapperClass}>
      {/* @ts-ignore – react-markdown type doesn't recognise remarkPlugins array */}
      <ReactMarkdown remarkPlugins={remarkPlugins} components={renderers}>
        {text}
      </ReactMarkdown>
    </div>
  )
}

================================================
FILE: src/components/ModelSelect.tsx
================================================
import { useEffect, useState } from 'react';
import { chatAPI, ModelsResponse } from '@/lib/api';

/** Props accepted by the ModelSelect dropdown. */
interface Props {
  /** Currently selected model name; when undefined or empty the placeholder option is shown. */
  value: string | undefined;
  /** Called with the model name when the user picks one, and when the default model is auto-selected. */
  onChange: (v: string) => void;
  /** Which list from the models response to offer: generation models or embedding models. */
  type: 'generation' | 'embedding';
  /** Extra CSS classes for the <select> element. */
  className?: string;
  /** Text of the disabled placeholder option; a type-specific default is used when omitted. */
  placeholder?: string;
}

/**
 * Dropdown listing the models returned by `chatAPI.getModels()` for the
 * requested `type`.
 *
 * While the request is in flight a disabled "Loading…" select is shown; if the
 * request fails or the list is empty a disabled "No models" select is shown.
 * When no value is selected yet and the list contains `qwen3:0.6b`, that model
 * is reported through `onChange` as the default.
 */
export function ModelSelect({ value, onChange, type, className, placeholder }: Props) {
  const [models, setModels] = useState<string[]>([]);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    let mounted = true;

    // Start every fetch from a clean slate. Without this, an error from an
    // earlier fetch would survive a `type` change and keep the control stuck
    // on "No models" even after a successful reload.
    setLoading(true);
    setError(null);

    chatAPI
      .getModels()
      .then((res: ModelsResponse) => {
        if (!mounted) return;
        // Tolerate a response that omits the requested list.
        const list = (type === 'generation' ? res.generation_models : res.embedding_models) ?? [];
        setModels(list);
        // Auto-select default qwen3:0.6b if available and not chosen yet
        if (!value && list.includes('qwen3:0.6b')) {
          onChange('qwen3:0.6b');
        }
        setLoading(false);
      })
      .catch((e) => {
        if (!mounted) return;
        setError(String(e));
        setLoading(false);
      });

    return () => {
      mounted = false;
    };
    // Only `type` triggers a refetch; `value` and `onChange` are read as they
    // were when the effect started.
  }, [type]);

  if (loading) {
    return (
      <select className={className} disabled>
        <option>Loading…</option>
      </select>
    );
  }
  if (error || models.length === 0) {
    return (
      <select className={className} disabled>
        <option>No models</option>
      </select>
    );
  }

  return (
    <select
      className={`w-full px-3 py-2 bg-gray-700 border border-gray-600 rounded-lg text-white text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent ${className || ''}`}
      value={value || ''}
      onChange={(e) => onChange(e.target.value)}
    >
      <option value="" disabled>
        {placeholder || `Select ${type === 'generation' ? 'LLM' : 'embed model'}`}
      </option>
      {models.map((m) => (
        <option key={m} value={m}>
          {m}
        </option>
      ))}
    </select>
  );
}

================================================
FILE: src/components/SessionIndexInfo.tsx
================================================
import { useEffect, useState } from 'react';
import { chatAPI, ChatSession } from '@/lib/api';

/** Props accepted by the SessionIndexInfo dialog. */
interface Props {
  /** Session id passed to `chatAPI.getSessionIndexes` to load the linked indexes. */
  sessionId: string;
  /** Invoked when the user clicks the Close button. */
  onClose: () => void;
}

/** Banner shown above the index details to describe the index's state. */
interface StatusMessage {
  type: 'error' | 'warning' | 'info';
  title: string;
  message: string;
}

/** Banner container / title / body classes for each severity. */
const STATUS_STYLES: Record<StatusMessage['type'], { box: string; title: string; body: string }> = {
  error: { box: 'bg-red-900/20 border border-red-600/30', title: 'text-red-200', body: 'text-red-300' },
  warning: { box: 'bg-yellow-900/20 border border-yellow-600/30', title: 'text-yellow-200', body: 'text-yellow-300' },
  info: { box: 'bg-blue-900/20 border border-blue-600/30', title: 'text-blue-200', body: 'text-blue-300' },
};

/** Label styling shared by every field in the dialog. */
const FIELD_LABEL_CLASS = 'block text-xs uppercase tracking-wide text-gray-300 mb-1';

/**
 * True when a metadata value should be displayed. Numbers always count
 * (including 0); everything else uses plain truthiness. Using `value && <jsx/>`
 * directly would make React print a stray "0" for a zero value.
 */
const isPresent = (value: unknown): boolean => typeof value === 'number' || Boolean(value);

/** A single labelled value: an uppercase caption above a paragraph. */
function MetaField({
  label,
  valueClassName = 'text-sm',
  children,
}: {
  label: string;
  valueClassName?: string;
  children: React.ReactNode;
}) {
  return (
    <div>
      <span className={FIELD_LABEL_CLASS}>{label}</span>
      <p className={valueClassName}>{children}</p>
    </div>
  );
}

/**
 * Chooses the status banner for an index from its metadata, or returns null
 * when no banner should be shown. Checks run in priority order: missing
 * metadata, then the `incomplete` / `empty` / `legacy` statuses, then inferred
 * metadata, then a functional index with limited configuration data.
 */
function describeIndexStatus(indexMeta: any): StatusMessage | null {
  const hasMetadata = Boolean(indexMeta) && Object.keys(indexMeta).length > 0;
  if (!hasMetadata) {
    return {
      type: 'warning',
      title: '⚠️ No Configuration Data',
      message: 'This index was created before metadata tracking was implemented. Configuration details are not available.'
    };
  }

  const indexStatus = indexMeta.status;

  if (indexStatus === 'incomplete') {
    return {
      type: 'error',
      title: '❌ Index Incomplete',
      message: indexMeta.issue || 'The index appears to be incomplete or was never properly built.'
    };
  }

  if (indexStatus === 'empty') {
    return {
      type: 'error',
      title: '❌ Index Empty',
      message: 'The vector table exists but contains no data. The index may need to be rebuilt.'
    };
  }

  if (indexStatus === 'legacy') {
    return {
      type: 'warning',
      title: '⚠️ Legacy Index',
      message: indexMeta.issue || 'This index was created before metadata tracking was implemented. Configuration details are not available.'
    };
  }

  if (indexMeta.metadata_source === 'lancedb_inspection') {
    return {
      type: 'info',
      title: '🔍 Metadata Inferred',
      message: 'This metadata was inferred from the vector database structure. Some configuration details may be incomplete.'
    };
  }

  if (indexStatus === 'functional') {
    // Check if we have complete configuration metadata
    const hasCompleteConfig = indexMeta.chunk_size &&
                              indexMeta.chunk_overlap !== undefined &&
                              indexMeta.retrieval_mode &&
                              indexMeta.embedding_model;

    // Only show limited message if we truly have limited data
    if (indexMeta.inspection_limitation && !hasCompleteConfig) {
      return {
        type: 'info',
        title: '🔍 Limited Configuration Data',
        message: 'This index is functional but detailed configuration inspection requires direct RAG system access. Basic information is shown below.'
      };
    }
  }

  // Functional indexes with complete metadata, and unknown statuses, get no banner.
  return null;
}

/**
 * Modal dialog showing details of the first index linked to a chat session:
 * its name, a status banner, the stored or inferred configuration, and the
 * list of indexed files.
 *
 * @param sessionId Session whose linked indexes are fetched.
 * @param onClose   Called when the user clicks Close.
 */
export default function SessionIndexInfo({ sessionId, onClose }: Props) {
  const [files, setFiles] = useState<string[]>([]);
  const [indexMeta, setIndexMeta] = useState<any | null>(null);
  const [session, setSession] = useState<ChatSession | null>(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    // Guards against a slow response for a previous sessionId (or one that
    // arrives after unmount) overwriting the state of the current request.
    let cancelled = false;

    // Reset so a changed sessionId shows "Loading…" instead of stale data or
    // a stale error.
    setLoading(true);
    setError(null);

    (async () => {
      try {
        const data = await chatAPI.getSessionIndexes(sessionId);
        if (cancelled) return;
        const first = data.indexes[0];
        if (first) {
          setSession(first.session ?? { ...first, title: first.name, model_used: first.model_used || '' });
          setFiles(first.documents?.map((d: any) => d.filename) || []);
          setIndexMeta(first.metadata || {});
        } else {
          setError('No indexes linked to this chat');
        }
      } catch (e: any) {
        if (!cancelled) setError(e?.message || 'Failed to load');
      } finally {
        if (!cancelled) setLoading(false);
      }
    })();

    return () => {
      cancelled = true;
    };
  }, [sessionId]);

  const hasMetadata = Boolean(indexMeta) && Object.keys(indexMeta).length > 0;
  const isInferredMetadata = indexMeta?.metadata_source === 'lancedb_inspection';
  const indexStatus = indexMeta?.status;

  const statusMessage = describeIndexStatus(indexMeta);
  const statusStyle = statusMessage ? STATUS_STYLES[statusMessage.type] : null;

  // The configuration section is shown for functional / freshly created
  // indexes and for metadata that carries no status at all.
  const showConfiguration =
    hasMetadata && (indexStatus === 'functional' || indexStatus === 'created' || !indexStatus);

  return (
    <div className="fixed inset-0 flex items-center justify-center bg-black/60 backdrop-blur-sm z-50 p-4">
      <div className="relative bg-white/5 backdrop-blur rounded-xl p-8 w-full max-w-2xl text-white space-y-6 overflow-y-auto max-h-full">
        <h2 className="text-lg font-semibold">Index details</h2>

        {loading && <p className="text-sm text-gray-300">Loading…</p>}
        {error && <p className="text-sm text-red-400">{error}</p>}

        {(!loading && !error) && (
          <>
            <MetaField label="Name">{session?.title}</MetaField>

            {statusMessage && statusStyle && (
              <div className={`rounded-lg p-4 ${statusStyle.box}`}>
                <p className={`text-sm font-medium mb-1 ${statusStyle.title}`}>
                  {statusMessage.title}
                </p>
                <p className={`text-sm ${statusStyle.body}`}>
                  {statusMessage.message}
                </p>
              </div>
            )}

            {showConfiguration && (
              <>
                {/* Basic Information */}
                <div className="grid grid-cols-2 gap-4">
                  {(indexMeta.embedding_model || indexMeta.embedding_model_inferred) && (
                    <MetaField label="Embedding model" valueClassName="text-sm break-words">
                      {indexMeta.embedding_model || indexMeta.embedding_model_inferred}
                      {indexMeta.embedding_model_inferred && <span className="text-gray-400"> (inferred)</span>}
                    </MetaField>
                  )}
                  {(indexMeta.retrieval_mode || indexMeta.retrieval_mode_inferred) && (
                    <MetaField label="Retrieval mode" valueClassName="text-sm capitalize">
                      {indexMeta.retrieval_mode || indexMeta.retrieval_mode_inferred}
                      {indexMeta.retrieval_mode_inferred && <span className="text-gray-400"> (inferred)</span>}
                    </MetaField>
                  )}
                  {isPresent(indexMeta.vector_dimensions) && (
                    <MetaField label="Vector dimensions">{indexMeta.vector_dimensions}</MetaField>
                  )}
                  {isPresent(indexMeta.total_chunks) && (
                    <MetaField label="Total chunks">{indexMeta.total_chunks.toLocaleString()}</MetaField>
                  )}
                </div>

                {/* Chunk Configuration */}
                <div className="grid grid-cols-2 gap-4">
                  {(typeof indexMeta.chunk_size === 'number' || indexMeta.chunk_size_inferred) && (
                    <MetaField label="Chunk size">
                      {typeof indexMeta.chunk_size === 'number' ? `${indexMeta.chunk_size} tokens` : indexMeta.chunk_size_inferred}
                      {indexMeta.chunk_size_inferred && <span className="text-gray-400"> (estimated)</span>}
                    </MetaField>
                  )}
                  {typeof indexMeta.chunk_overlap === 'number' && (
                    <MetaField label="Chunk overlap">{indexMeta.chunk_overlap} tokens</MetaField>
                  )}
                </div>

                {/* Context and Features */}
                <div className="grid grid-cols-2 gap-4">
                  {typeof indexMeta.window_size === 'number' && (
                    <MetaField label="Context window">{indexMeta.window_size}</MetaField>
                  )}
                  {typeof indexMeta.enable_enrich === 'boolean' && (
                    <MetaField label="Contextual enrichment">
                      {indexMeta.enable_enrich ? '✅ Enabled' : '❌ Disabled'}
                    </MetaField>
                  )}
                  {indexMeta.has_contextual_enrichment && (
                    <MetaField label="Contextual enrichment">🔍 Detected</MetaField>
                  )}
                </div>

                {/* Advanced features */}
                <div className="grid grid-cols-2 gap-4">
                  {typeof indexMeta.latechunk === 'boolean' && (
                    <MetaField label="Late-chunk vectors">
                      {indexMeta.latechunk ? '✅ Enabled' : '❌ Disabled'}
                    </MetaField>
                  )}
                  {typeof indexMeta.docling_chunk === 'boolean' && (
                    <MetaField label="High-recall chunking">
                      {indexMeta.docling_chunk ? '✅ Enabled' : '❌ Disabled'}
                    </MetaField>
                  )}
                  {indexMeta.has_fts_index && (
                    <MetaField label="Full-text search">🔍 Available</MetaField>
                  )}
                  {indexMeta.has_document_structure && (
                    <MetaField label="Document structure">🔍 Organized</MetaField>
                  )}
                </div>

                {/* LLM Models section */}
                {(indexMeta.enrich_model || indexMeta.overview_model) && (
                  <div className="border-t border-white/10 pt-4">
                    <h3 className="text-sm font-medium text-gray-300 mb-3">LLM Models</h3>
                    <div className="grid grid-cols-2 gap-4">
                      {indexMeta.enrich_model && (
                        <MetaField label="Enrichment LLM" valueClassName="text-sm break-words">
                          {indexMeta.enrich_model}
                        </MetaField>
                      )}
                      {indexMeta.overview_model && (
                        <MetaField label="Overview LLM" valueClassName="text-sm break-words">
                          {indexMeta.overview_model}
                        </MetaField>
                      )}
                    </div>
                  </div>
                )}

                {/* Batch sizes section */}
                {(typeof indexMeta.batch_size_embed === 'number' || typeof indexMeta.batch_size_enrich === 'number') && (
                  <div className="border-t border-white/10 pt-4">
                    <h3 className="text-sm font-medium text-gray-300 mb-3">Batch Configuration</h3>
                    <div className="grid grid-cols-2 gap-4">
                      {typeof indexMeta.batch_size_embed === 'number' && (
                        <MetaField label="Embedding batch size">{indexMeta.batch_size_embed}</MetaField>
                      )}
                      {typeof indexMeta.batch_size_enrich === 'number' && (
                        <MetaField label="Enrichment batch size">{indexMeta.batch_size_enrich}</MetaField>
                      )}
                    </div>
                  </div>
                )}

                {/* Metadata info */}
                {isInferredMetadata && indexMeta.metadata_inferred_at && (
                  <div className="border-t border-white/10 pt-4">
                    <h3 className="text-sm font-medium text-gray-300 mb-3">Metadata Information</h3>
                    <div className="text-xs text-gray-400 space-y-1">
                      <p>Inferred at: {new Date(indexMeta.metadata_inferred_at).toLocaleString()}</p>
                      <p>Source: LanceDB table inspection</p>
                      {isPresent(indexMeta.sample_chunk_length) && (
                        <p>Sample chunk length: {indexMeta.sample_chunk_length} characters</p>
                      )}
                    </div>
                  </div>
                )}
              </>
            )}

            {/* Legacy index information */}
            {hasMetadata && indexStatus === 'legacy' && (
              <>
                <div className="grid grid-cols-2 gap-4">
                  {typeof indexMeta.documents_count === 'number' && (
                    <MetaField label="Documents">{indexMeta.documents_count}</MetaField>
                  )}
                  {indexMeta.created_at && (
                    <MetaField label="Created">{new Date(indexMeta.created_at).toLocaleDateString()}</MetaField>
                  )}
                  {indexMeta.vector_table_name && (
                    <MetaField label="Vector table" valueClassName="text-sm text-gray-400 text-xs break-all">
                      {indexMeta.vector_table_name}
                    </MetaField>
                  )}
                </div>

                {indexMeta.note && (
                  <div className="border-t border-white/10 pt-4">
                    <h3 className="text-sm font-medium text-gray-300 mb-3">Technical Note</h3>
                    <p className="text-xs text-gray-400">{indexMeta.note}</p>
                  </div>
                )}
              </>
            )}

            {/* Debug info for incomplete indexes */}
            {indexStatus === 'incomplete' && Array.isArray(indexMeta.available_tables) && (
              <div className="border-t border-white/10 pt-4">
                <h3 className="text-sm font-medium text-gray-300 mb-3">Debug Information</h3>
                <div className="text-xs text-gray-400 space-y-1">
                  <p>Expected table: {indexMeta.vector_table_expected}</p>
                  <p>Available tables: {indexMeta.available_tables.join(', ') || 'None'}</p>
                </div>
              </div>
            )}

            <div className="border-t border-white/10 pt-4">
              <span className={FIELD_LABEL_CLASS}>Files ({files.length})</span>
              <ul className="list-disc list-inside space-y-1 text-sm max-h-32 overflow-y-auto">
                {files.map((f, i) => (
                  // The position is part of the key because file names may repeat.
                  <li key={`${i}-${f}`}>{f}</li>
                ))}
              </ul>
            </div>
          </>
        )}

        <div className="flex justify-end pt-4 border-t border-white/10">
          <button onClick={onClose} className="px-4 py-2 bg-gray-700 rounded hover:bg-gray-600 text-sm">Close</button>
        </div>
      </div>
    </div>
  );
}

================================================
FILE: src/components/demo.tsx
================================================
"use client";

import { useState, useEffect } from "react"
import { LocalGPTChat } from "@/components/ui/localgpt-chat"
import { SessionSidebar } from "@/components/ui/session-sidebar"
import { SessionChat } from '@/components/ui/session-chat'
import { chatAPI, ChatSession } from "@/lib/api"
import { LandingMenu } from "@/components/LandingMenu";
import { IndexForm } from "@/components/IndexForm";
import SessionIndexInfo from "@/components/SessionIndexInfo";
import IndexPicker from "@/components/IndexPicker";
import { QuickChat } from '@/components/ui/quick-chat'

/**
 * Top-level application shell.
 *
 * Shows the landing page (mode selector plus backend status) while `homeMode`
 * is 'HOME', and otherwise the chat view for the selected mode with an
 * optional session sidebar. Also hosts the index creation form, the index
 * picker and the index info dialog as overlays.
 */
export function Demo() {
    const [currentSessionId, setCurrentSessionId] = useState<string | undefined>()
    const [currentSession, setCurrentSession] = useState<ChatSession | null>(null)
    const [showConversation, setShowConversation] = useState(false)
    const [backendStatus, setBackendStatus] = useState<'checking' | 'connected' | 'error'>('checking')
    const [sidebarRef, setSidebarRef] = useState<{ refreshSessions: () => Promise<void> } | null>(null)
    const [homeMode, setHomeMode] = useState<'HOME' | 'INDEX' | 'CHAT_EXISTING' | 'QUICK_CHAT'>('HOME')
    const [showIndexInfo, setShowIndexInfo] = useState(false)
    const [showIndexPicker, setShowIndexPicker] = useState(false)
    const [sidebarOpen, setSidebarOpen] = useState(true)

    // Probe the backend once on mount and reflect the result in the status line.
    useEffect(() => {
        const checkBackendHealth = async () => {
            try {
                const health = await chatAPI.checkHealth()
                setBackendStatus('connected')
                console.log('Backend connected:', health)
            } catch (error) {
                console.error('Backend health check failed:', error)
                setBackendStatus('error')
            }
        }
        checkBackendHealth()
    }, [])

    // Opens the chat view for an existing session.
    const handleSessionSelect = (sessionId: string) => {
        setCurrentSessionId(sessionId)
        setShowConversation(true)
        setHomeMode('CHAT_EXISTING') // Ensure we're in the right mode to show SessionChat
    }

    const handleNewSession = () => {
        // Reset state and return to landing page so user can choose chat type
        setCurrentSessionId(undefined)
        setCurrentSession(null)
        setShowConversation(false)  // Hide conversation view & sidebar
        setHomeMode('HOME')         // Show landing selector (Create index / Chat with index / LLM Chat)
    }

    const handleSessionChange = async (session: ChatSession) => {
        setCurrentSession(session)

        // Update the current session ID if it changed (e.g., brand-new session)
        if (session.id !== currentSessionId) {
            setCurrentSessionId(session.id)
        }

        // Always refresh the sidebar so that updated titles / message counts are displayed
        if (sidebarRef) {
            await sidebarRef.refreshSessions()
        }
    }

    const handleSessionDelete = (deletedSessionId: string) => {
        if (currentSessionId === deletedSessionId) {
            // Stay in conversation mode but show empty state
            setCurrentSessionId(undefined)
            setCurrentSession(null)
        }
    }

    return (
        <div className="flex h-full w-full flex-col bg-black">
            {/* Top App Bar */}
            <header className="h-12 relative flex items-center justify-center border-b border-gray-800 flex-shrink-0">
                <button onClick={()=>setSidebarOpen(o=>!o)} className="absolute left-4 p-1 rounded hover:bg-gray-800 text-gray-200 focus:outline-none" title="Toggle sidebar">
                    {sidebarOpen ? <span className="text-xl leading-none">◀</span> : <span className="text-xl leading-none">▶</span>}
                </button>
                {homeMode !== 'HOME' && (
                    <h1 className="text-lg font-semibold text-white">localGPT</h1>
                )}
            </header>
            {/* Main content row */}
            <div className="flex flex-1 flex-row min-h-0">
                {/* Session Sidebar */}
                {sidebarOpen && showConversation && (homeMode === 'CHAT_EXISTING' || homeMode === 'QUICK_CHAT') && (
                    <SessionSidebar
                        currentSessionId={currentSessionId}
                        onSessionSelect={handleSessionSelect}
                        onNewSession={handleNewSession}
                        onSessionDelete={handleSessionDelete}
                        onSessionCreated={setSidebarRef}
                    />
                )}

                <main className="flex flex-1 flex-col transition-all duration-200 bg-black min-h-0 overflow-hidden">
                    {homeMode === 'HOME' ? (
                        <div className="flex items-center justify-center h-full">
                            <div className="space-y-8">
                                <div className="text-center space-y-2">
                                    <h1 className="text-4xl font-bold text-white">LocalGPT</h1>
                                    <p className="text-lg text-gray-400">What can I help you find today?</p>
                                </div>

                                <LandingMenu onSelect={(m)=>{
                                    if(m==='CHAT_EXISTING'){ setShowIndexPicker(true); return; }
                                    if(m==='QUICK_CHAT'){
                                        setHomeMode('QUICK_CHAT');
                                        setShowConversation(true);
                                        return;
                                    }
                                    setHomeMode('INDEX');
                                }} />
                                <div className="flex flex-col items-center gap-3 mt-12">
                                    <div className="flex items-center gap-2 text-sm">
                                        {backendStatus === 'checking' && (
                                            <div className="flex items-center gap-2 text-gray-400">
                                                <div className="w-2 h-2 bg-yellow-500 rounded-full animate-pulse"></div>
                                                Connecting to backend...
                                            </div>
                                        )}
                                        {backendStatus === 'connected' && (
                                            <div className="flex items-center gap-2 text-green-400">
                                                <div className="w-2 h-2 bg-green-500 rounded-full"></div>
                                                Backend connected • Session-based chat ready
                                            </div>
                                        )}
                                        {backendStatus === 'error' && (
                                            <div className="flex items-center gap-2 text-red-400">
                                                <div className="w-2 h-2 bg-red-500 rounded-full"></div>
                                                Backend offline • Start backend server to enable chat
                                            </div>
                                        )}
                                    </div>
                                </div>
                            </div>
                        </div>
                    ) : homeMode==='CHAT_EXISTING' ? (
                        <SessionChat
                            sessionId={currentSessionId}
                            onSessionChange={handleSessionChange}
                            className="flex-1"
                        />
                    ) : homeMode==='QUICK_CHAT' ? (
                        <QuickChat sessionId={currentSessionId} onSessionChange={handleSessionChange} className="flex-1" />
                    ) : null}
                </main>

                {homeMode==='INDEX' && (
                  <div className="fixed inset-0 flex items-center justify-center bg-black/50 backdrop-blur-sm z-50">
                    {/* handleSessionSelect already switches to CHAT_EXISTING */}
                    <IndexForm onClose={()=>setHomeMode('HOME')} onIndexed={(s)=>handleSessionSelect(s.id)} />
                  </div>
                )}

                {showIndexInfo && currentSessionId && (
                  <SessionIndexInfo sessionId={currentSessionId} onClose={()=>setShowIndexInfo(false)} />
                )}

                {showIndexPicker && (
                  <IndexPicker onClose={()=>setShowIndexPicker(false)} onSelect={async (idxId)=>{
                    // create session and link index then open chat
                    try {
                      const session = await chatAPI.createSession()
                      await chatAPI.linkIndexToSession(session.id, idxId)
                      setShowIndexPicker(false)
                      handleSessionSelect(session.id)
                    } catch (error) {
                      // Leave the picker open so the user can retry, rather than
                      // letting the rejection go unhandled.
                      console.error('Failed to open chat for index:', error)
                    }
                  }} />
                )}
            </div>
        </div>
    );
}

================================================
FILE: src/components/ui/AccordionGroup.tsx
================================================
"use client";
import React from 'react';

/** Props accepted by AccordionGroup. */
interface Props {
  /** Content rendered inside the clickable summary row. */
  title: React.ReactNode;
  /** Content rendered inside the collapsible body. */
  children: React.ReactNode;
  /** Forwarded to the `open` attribute of the underlying `<details>` element. */
  defaultOpen?: boolean;
}

/**
 * Collapsible section built on the native `<details>` / `<summary>` elements.
 * The chevron in the summary rotates through the `group-open` variant while
 * the section is expanded.
 */
export function AccordionGroup(props: Props) {
  const { title, children, defaultOpen } = props;

  // Chevron pushed to the right edge of the summary row by `ml-auto`.
  const chevron = (
    <svg
      className="w-3 h-3 text-gray-400 ml-auto transition-transform group-open:rotate-90"
      viewBox="0 0 20 20"
      fill="none"
      stroke="currentColor"
      strokeWidth="2"
    >
      <path d="M6 6l6 4-6 4V6z" />
    </svg>
  );

  const summaryRow = (
    <summary className="cursor-pointer select-none list-none text-xs uppercase tracking-wide text-gray-400 mb-3 flex items-center gap-2">
      {title}
      {chevron}
    </summary>
  );

  return (
    <details open={defaultOpen} className="border-t border-white/10 py-4 group">
      {summaryRow}
      <div className="space-y-4 pl-1">{children}</div>
    </details>
  );
}

================================================
FILE: src/components/ui/GlassInput.tsx
================================================
"use client";
import React, { InputHTMLAttributes } from 'react';

/**
 * `<input>` with the translucent "glass" look.
 * Every native input attribute is forwarded; a caller-supplied `className`
 * is appended after the base classes.
 */
export function GlassInput(props: InputHTMLAttributes<HTMLInputElement>) {
  const { className, ...inputProps } = props;
  const baseClasses =
    'w-full rounded bg-white/5 hover:bg-white/10 focus:bg-white/10 px-2 py-1 text-sm font-sans text-white outline-none focus:ring-2 focus:ring-white/20 transition';

  return <input {...inputProps} className={`${baseClasses} ${className || ''}`} />;
}

================================================
FILE: src/components/ui/GlassSelect.tsx
================================================
"use client";
import React, { SelectHTMLAttributes } from 'react';

/**
 * `<select>` with the translucent "glass" look.
 * Every native select attribute is forwarded; a caller-supplied `className`
 * is appended after the base classes and `children` become the options.
 */
export function GlassSelect(props: SelectHTMLAttributes<HTMLSelectElement>) {
  const { className, children, ...selectProps } = props;
  const baseClasses =
    'w-full rounded bg-white/5 hover:bg-white/10 focus:bg-white/10 px-2 py-1 text-sm font-sans text-white outline-none focus:ring-2 focus:ring-white/20 transition';

  return (
    <select {...selectProps} className={`${baseClasses} ${className || ''}`}>
      {children}
    </select>
  );
}

================================================
FILE: src/components/ui/GlassToggle.tsx
================================================
"use client";
import React from 'react';

/** Props accepted by GlassToggle. */
interface Props {
  /** Current on/off state. */
  checked: boolean;
  /** Called with the negated state when the toggle is clicked. */
  onChange: (v: boolean) => void;
}

/**
 * Pill-shaped on/off switch.
 *
 * The component is controlled: it renders `checked` and reports the inverted
 * value through `onChange` on click.
 */
export function GlassToggle({ checked, onChange }: Props) {
  return (
    <button
      // Explicit type: a bare <button> defaults to "submit" and would submit
      // any form the toggle happens to be placed in.
      type="button"
      // Expose switch semantics so assistive technology announces the state.
      role="switch"
      aria-checked={checked}
      onClick={() => onChange(!checked)}
      className={`w-10 h-5 rounded-full transition relative ${checked ? 'bg-green-500/70' : 'bg-white/20'} font-sans`}
    >
      <span
        className={`absolute top-0.5 left-0.5 w-4 h-4 rounded-full bg-white transition-transform ${checked ? 'translate-x-5' : ''}`}
      />
    </button>
  );
}

================================================
FILE: src/components/ui/InfoTooltip.tsx
================================================
import { useState } from "react";
import { Info } from "lucide-react";

/** Props accepted by InfoTooltip. */
interface Props {
  /** Text shown inside the popover. */
  text: string;
  /** Extra classes appended to the wrapper element. */
  className?: string;
  /** Pixel size passed to the Info icon. */
  size?: number;
}

// Small hover / focus tooltip meant to sit next to form labels.
// Renders an Info icon; while the wrapper is hovered or focused a dark
// popover with the text is displayed below it.
export function InfoTooltip({ text, className = "", size = 14 }: Props) {
  const [open, setOpen] = useState(false);

  const show = () => setOpen(true);
  const hide = () => setOpen(false);

  const popover = open && (
    <div className="absolute left-1/2 -translate-x-1/2 top-full mt-2 w-56 bg-black/80 backdrop-blur-sm text-gray-200 text-xs px-3 py-2 rounded shadow-lg z-50 normal-case whitespace-normal break-words">
      {text}
    </div>
  );

  return (
    <span
      className={`relative inline-block align-middle ${className}`}
      onMouseEnter={show}
      onMouseLeave={hide}
      onFocus={show}
      onBlur={hide}
      tabIndex={0}
    >
      <Info size={size} className="text-gray-400 hover:text-white cursor-pointer" />
      {popover}
    </span>
  );
}

================================================
FILE: src/components/ui/avatar.tsx
================================================
"use client"

import * as React from "react"
import * as AvatarPrimitive from "@radix-ui/react-avatar"

import { cn } from "@/lib/utils"

/**
 * Round avatar container wrapping the Radix avatar root.
 * Caller classes are merged after the defaults; remaining props are forwarded.
 */
function Avatar(props: React.ComponentProps<typeof AvatarPrimitive.Root>) {
  const { className, ...rootProps } = props
  const classes = cn(
    "relative flex size-8 shrink-0 overflow-hidden rounded-full",
    className
  )

  return <AvatarPrimitive.Root data-slot="avatar" className={classes} {...rootProps} />
}

/**
 * Image slot of the avatar, wrapping the Radix avatar image.
 * Caller classes are merged after the defaults; remaining props are forwarded.
 */
function AvatarImage(props: React.ComponentProps<typeof AvatarPrimitive.Image>) {
  const { className, ...imageProps } = props
  const classes = cn("aspect-square size-full", className)

  return <AvatarPrimitive.Image data-slot="avatar-image" className={classes} {...imageProps} />
}

/**
 * Fallback slot of the avatar, wrapping the Radix avatar fallback.
 * Caller classes are merged after the defaults; remaining props are forwarded.
 */
function AvatarFallback(props: React.ComponentProps<typeof AvatarPrimitive.Fallback>) {
  const { className, ...fallbackProps } = props
  const classes = cn(
    "bg-muted flex size-full items-center justify-center rounded-full text-black",
    className
  )

  return <AvatarPrimitive.Fallback data-slot="avatar-fallback" className={classes} {...fallbackProps} />
}

export { Avatar, AvatarImage, AvatarFallback }


================================================
FILE: src/components/ui/badge.tsx
================================================
import * as React from "react"
import { Slot } from "@radix-ui/react-slot"
import { cva, type VariantProps } from "class-variance-authority"

import { cn } from "@/lib/utils"

// Class generator for Badge: the first argument holds the classes shared by
// every badge, `variants.variant` holds the per-variant classes, and
// `defaultVariants` selects "default" when no variant is supplied.
const badgeVariants = cva(
  "inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden",
  {
    variants: {
      variant: {
        default:
          "border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90",
        secondary:
          "border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90",
        destructive:
          "border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60",
        outline:
          "text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground",
      },
    },
    defaultVariants: {
      variant: "default",
    },
  }
)

/**
 * Small status label styled through `badgeVariants`.
 * Renders a `<span>` by default; with `asChild` the props are handed to
 * Radix `Slot` instead. Caller classes are merged after the variant classes.
 */
function Badge(
  props: React.ComponentProps<"span"> &
    VariantProps<typeof badgeVariants> & { asChild?: boolean }
) {
  const { className, variant, asChild = false, ...rest } = props
  const Comp = asChild ? Slot : "span"
  const classes = cn(badgeVariants({ variant }), className)

  return <Comp data-slot="badge" className={classes} {...rest} />
}

export { Badge, badgeVariants }


================================================
FILE: src/components/ui/button.tsx
================================================
import * as React from "react"
import { Slot } from "@radix-ui/react-slot"
import { cva, type VariantProps } from "class-variance-authority"

import { cn } from "@/lib/utils"

// Class generator for Button: the first argument holds the classes shared by
// every button, `variants.variant` controls the colour treatment and
// `variants.size` the dimensions. `defaultVariants` selects "default" for
// both axes when the caller supplies none.
const buttonVariants = cva(
  "inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium transition-all disabled:pointer-events-none disabled:opacity-50 [&_svg]:pointer-events-none [&_svg:not([class*='size-'])]:size-4 shrink-0 [&_svg]:shrink-0 outline-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive",
  {
    variants: {
      variant: {
        default:
          "bg-primary text-primary-foreground shadow-xs hover:bg-primary/90",
        destructive:
          "bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60",
        outline:
          "border bg-background shadow-xs hover:bg-accent hover:text-accent-foreground dark:bg-input/30 dark:border-input dark:hover:bg-input/50",
        secondary:
          "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80",
        ghost:
          "hover:bg-accent hover:text-accent-foreground dark:hover:bg-accent/50",
        link: "text-primary underline-offset-4 hover:underline",
      },
      size: {
        default: "h-9 px-4 py-2 has-[>svg]:px-3",
        sm: "h-8 rounded-md gap-1.5 px-3 has-[>svg]:px-2.5",
        lg: "h-10 rounded-md px-6 has-[>svg]:px-4",
        icon: "size-9",
      },
    },
    defaultVariants: {
      variant: "default",
      size: "default",
    },
  }
)

/**
 * Button styled through `buttonVariants`.
 * Renders a `<button>` by default; with `asChild` the props are handed to
 * Radix `Slot` instead. `className` is passed into `buttonVariants` together
 * with `variant` and `size`.
 */
function Button(
  props: React.ComponentProps<"button"> &
    VariantProps<typeof buttonVariants> & {
      asChild?: boolean
    }
) {
  const { className, variant, size, asChild = false, ...rest } = props
  const Comp = asChild ? Slot : "button"
  const classes = cn(buttonVariants({ variant, size, className }))

  return <Comp data-slot="button" className={classes} {...rest} />
}

export { Button, buttonVariants }


================================================
FILE: src/components/ui/chat-bubble-demo.tsx
================================================
"use client"

import {
  ChatBubble,
  ChatBubbleAvatar,
  ChatBubbleMessage
} from "@/components/ui/chat-bubble"
import { Copy, RefreshCcw } from "lucide-react"

// Static sample conversation rendered by ChatBubbleAiLayout.
// `sender` is compared against "user" and "bot" when choosing the avatar
// and deciding whether to show the action buttons.
const messages = [
  {
    id: 1,
    message: "Help me with my essay.",
    sender: "user",
  },
  {
    id: 2,
    message: "I can help you with that. What do you need help with?",
    sender: "bot",
  },
]

// Icon buttons shown under bot messages in ChatBubbleAiLayout.
// `type` doubles as the React key and as the name written to the console on click.
const actionIcons = [
  { icon: Copy, type: "Copy" },
  { icon: RefreshCcw, type: "Regenerate" },
]

/** Demo: one "sent" and one "received" bubble, each with an avatar image. */
export function ChatBubbleVariants() {
  const userAvatarUrl = "https://images.unsplash.com/photo-1534528741775-53994a69daeb?w=64&h=64&q=80&crop=faces&fit=crop"
  const aiAvatarUrl = "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=64&h=64&q=80&crop=faces&fit=crop"

  const sentBubble = (
    <ChatBubble variant="sent">
      <ChatBubbleAvatar fallback="US" src={userAvatarUrl} />
      <ChatBubbleMessage variant="sent">
        I have a question about the library.
      </ChatBubbleMessage>
    </ChatBubble>
  )

  const receivedBubble = (
    <ChatBubble variant="received">
      <ChatBubbleAvatar fallback="AI" src={aiAvatarUrl} />
      <ChatBubbleMessage>
        Sure, I&apos;d be happy to help!
      </ChatBubbleMessage>
    </ChatBubble>
  )

  return (
    <div className="max-w-md space-y-4 p-4">
      {sentBubble}

      {receivedBubble}
    </div>
  )
}

/**
 * Demo: renders the sample `messages` as a divided list. Each row shows an
 * avatar and the message text; bot rows also get the `actionIcons` buttons,
 * which only log the clicked action to the console.
 */
export function ChatBubbleAiLayout() {
  const userAvatarUrl = "https://images.unsplash.com/photo-1534528741775-53994a69daeb?w=64&h=64&q=80&crop=faces&fit=crop"
  const botAvatarUrl = "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=64&h=64&q=80&crop=faces&fit=crop"

  // Action buttons for the row at the given position in `messages`.
  const renderActions = (position: number) => (
    <div className="flex gap-2 mt-2">
      {actionIcons.map(({ icon: Icon, type }) => (
        <button
          key={type}
          onClick={() => console.log(`Action ${type} clicked for message ${position}`)}
          className="p-1 hover:bg-muted rounded-md transition-colors"
        >
          <Icon className="size-3" />
        </button>
      ))}
    </div>
  )

  return (
    <div className="max-w-md divide-y">
      {messages.map((entry, position) => {
        const fromUser = entry.sender === "user"
        const fromBot = entry.sender === "bot"

        return (
          <div key={entry.id} className="py-6 first:pt-0 last:pb-0">
            <div className="flex gap-3">
              <ChatBubbleAvatar
                src={fromUser ? userAvatarUrl : botAvatarUrl}
                fallback={fromUser ? "US" : "L"}
              />
              <div className="flex-1">
                {entry.message}
                {fromBot && renderActions(position)}
              </div>
            </div>
          </div>
        )
      })}
    </div>
  )
}

/** Demo: a received bubble in its loading state and one styled as an error. */
export function ChatBubbleStates() {
  const loadingBubble = (
    <ChatBubble variant="received">
      <ChatBubbleAvatar fallback="L" />
      <ChatBubbleMessage isLoading />
    </ChatBubble>
  )

  const errorBubble = (
    <ChatBubble variant="received">
      <ChatBubbleAvatar fallback="L" />
      <ChatBubbleMessage className="bg-destructive/10 text-destructive">
        Error processing request
      </ChatBubbleMessage>
    </ChatBubble>
  )

  return (
    <div className="max-w-md space-y-4 p-4">
      {loadingBubble}

      {errorBubble}
    </div>
  )
}

================================================
FILE: src/components/ui/chat-bubble.tsx
================================================
"use client"

import * as React from "react"
import { cn } from "@/lib/utils"
import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar"
import { Button } from "@/components/ui/button"
import { MessageLoading } from "@/components/ui/message-loading";

/** Props accepted by ChatBubble. */
interface ChatBubbleProps {
  /** "sent" reverses the row direction; defaults to "received". */
  variant?: "sent" | "received"
  /** Accepted but not read by the component. */
  layout?: "default" | "ai"
  /** Extra classes merged after the defaults. */
  className?: string
  /** Row content (avatar, message, ...). */
  children: React.ReactNode
}

/**
 * Horizontal row that hosts one chat bubble and its avatar.
 * A "sent" bubble flips the row so its content is laid out in reverse.
 */
export function ChatBubble({
  variant = "received",
  layout = "default", // eslint-disable-line @typescript-eslint/no-unused-vars
  className,
  children,
}: ChatBubbleProps) {
  const rowClasses = cn(
    "flex items-start gap-2 mb-4",
    variant === "sent" && "flex-row-reverse",
    className,
  )

  return <div className={rowClasses}>{children}</div>
}

/** Props accepted by ChatBubbleMessage. */
interface ChatBubbleMessageProps {
  /** Selects the colour treatment; defaults to "received". */
  variant?: "sent" | "received"
  /** When truthy the loading indicator replaces `children`. */
  isLoading?: boolean
  /** Extra classes merged after the defaults. */
  className?: string
  /** Message content. */
  children?: React.ReactNode
}

/**
 * Rounded message body of a chat bubble.
 * Shows `MessageLoading` while `isLoading` is set, otherwise the children.
 */
export function ChatBubbleMessage({
  variant = "received",
  isLoading,
  className,
  children,
}: ChatBubbleMessageProps) {
  const toneClasses =
    variant === "sent" ? "bg-primary text-primary-foreground" : "bg-muted"

  const content = isLoading ? (
    <div className="flex items-center space-x-2">
      <MessageLoading />
    </div>
  ) : (
    children
  )

  return (
    <div className={cn("rounded-lg p-3", toneClasses, className)}>
      {content}
    </div>
  )
}

/** Props accepted by ChatBubbleAvatar. */
interface ChatBubbleAvatarProps {
  /** Image URL; the image element is only rendered when this is truthy. */
  src?: string
  /** Text placed in the avatar fallback; defaults to "AI". */
  fallback?: string
  /** Extra classes merged after the defaults. */
  className?: string
}

/** Avatar for a chat row: optional image plus a text fallback. */
export function ChatBubbleAvatar({
  src,
  fallback = "AI",
  className,
}: ChatBubbleAvatarProps) {
  const image = src && <AvatarImage src={src} />

  return (
    <Avatar className={cn("h-8 w-8", className)}>
      {image}
      <AvatarFallback>{fallback}</AvatarFallback>
    </Avatar>
  )
}

/** Props accepted by ChatBubbleAction. */
interface ChatBubbleActionProps {
  /** Node rendered inside the button. */
  icon?: React.ReactNode
  /** Click handler forwarded to the button. */
  onClick?: () => void
  /** Extra classes merged after the defaults. */
  className?: string
}

/** Compact ghost icon button used for per-message actions. */
export function ChatBubbleAction(props: ChatBubbleActionProps) {
  const { icon, onClick, className } = props
  const classes = cn("h-6 w-6", className)

  return (
    <Button variant="ghost" size="icon" className={classes} onClick={onClick}>
      {icon}
    </Button>
  )
}

/** Horizontal container that lays out a group of message action buttons. */
export function ChatBubbleActionWrapper(props: {
  className?: string
  children: React.ReactNode
}) {
  const { className, children } = props
  const classes = cn("flex items-center gap-1 mt-2", className)

  return <div className={classes}>{children}</div>
}

================================================
FILE: src/components/ui/chat-input.tsx
================================================
"use client"

import * as React from "react"
import { useState, useRef } from "react"
import { ArrowUp, Settings as SettingsIcon, Plus, X, FileText } from "lucide-react"
import { Button } from "@/components/ui/button"
import { AttachedFile } from "@/lib/types"

/** Props accepted by ChatInput. */
interface ChatInputProps {
  /**
   * Called on submit with the trimmed message and the attached files.
   * The returned promise is awaited; if it rejects, the text and files are
   * restored into the input.
   */
  onSendMessage: (message: string, attachedFiles?: AttachedFile[]) => Promise<void>
  /** Disables the textarea and the buttons when true. */
  disabled?: boolean
  /** Placeholder shown while no files are attached. */
  placeholder?: string
  /** Extra classes appended to the outer container. */
  className?: string
  /** Invoked when the settings button is clicked. */
  onOpenSettings?: () => void
  /** Accepted by the component but not used in its current implementation. */
  onAddIndex?: () => void
  /** Extra content rendered after the settings button in the action row. */
  leftExtras?: React.ReactNode
}

// MIME types accepted as chat attachments.
const SUPPORTED_ATTACHMENT_MIME_TYPES = new Set<string>([
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/msword',
  'text/html',
  'text/markdown',
  'text/plain',
])

// File-name suffixes accepted when the MIME type is not one of the above.
const SUPPORTED_ATTACHMENT_EXTENSIONS = ['.pdf', '.docx', '.doc', '.html', '.htm', '.md', '.txt']

// Units used by formatFileSize, smallest first.
const FILE_SIZE_UNITS = ['Bytes', 'KB', 'MB', 'GB']

// Upper bound, in pixels, for the auto-growing textarea.
const MAX_TEXTAREA_HEIGHT_PX = 120

/** True when the file matches a supported MIME type or file-name extension. */
function isSupportedAttachment(file: File): boolean {
  if (SUPPORTED_ATTACHMENT_MIME_TYPES.has(file.type)) return true
  const lowerName = file.name.toLowerCase()
  return SUPPORTED_ATTACHMENT_EXTENSIONS.some((ext) => lowerName.endsWith(ext))
}

/**
 * Formats a byte count as a short human-readable string (e.g. "1.5 KB").
 * The unit index is clamped to the known units, so values of 1 TB and above
 * are expressed in GB instead of producing an "undefined" unit.
 */
function formatFileSize(bytes: number): string {
  if (bytes === 0) return '0 Bytes'
  const k = 1024
  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k))
  const unitIndex = Math.min(Math.max(rawIndex, 0), FILE_SIZE_UNITS.length - 1)
  return `${parseFloat((bytes / Math.pow(k, unitIndex)).toFixed(2))} ${FILE_SIZE_UNITS[unitIndex]}`
}

/**
 * Chat composer: auto-growing textarea, list of attached files, settings
 * button and submit button.
 *
 * On submit the input is cleared immediately and `onSendMessage` is awaited;
 * if it rejects, the message text and attachments are put back so nothing is
 * lost. Enter submits, Shift+Enter inserts a newline.
 */
export function ChatInput({ 
  onSendMessage, 
  disabled = false,
  placeholder = "Message localGPT...",
  className = "",
  onOpenSettings,
  onAddIndex,
  leftExtras
}: ChatInputProps) {
  const [message, setMessage] = useState("")
  const [attachedFiles, setAttachedFiles] = useState<AttachedFile[]>([])
  const [isLoading, setIsLoading] = useState(false)
  const textareaRef = useRef<HTMLTextAreaElement>(null)
  const fileInputRef = useRef<HTMLInputElement>(null)

  // Resize the textarea whenever the text changes for any reason: typing,
  // clearing after a send, or restoring the text after a failed send.
  // Doing this only in the change handler left the box expanded after a send.
  React.useEffect(() => {
    const textarea = textareaRef.current
    if (!textarea) return
    textarea.style.height = 'auto'
    textarea.style.height = `${Math.min(textarea.scrollHeight, MAX_TEXTAREA_HEIGHT_PX)}px`
  }, [message])

  const nothingToSend = !message.trim() && attachedFiles.length === 0

  const handleSubmit = async (e: React.FormEvent) => {
    e.preventDefault()
    if (nothingToSend || disabled || isLoading) return

    const messageToSend = message.trim()
    const filesToSend = [...attachedFiles]
    setMessage("")
    setAttachedFiles([])
    setIsLoading(true)

    try {
      await onSendMessage(messageToSend, filesToSend)
    } catch (error) {
      console.error("Failed to send message:", error)
      // Restore message and files on error
      setMessage(messageToSend)
      setAttachedFiles(filesToSend)
    } finally {
      setIsLoading(false)
    }
  }

  const handleKeyDown = (e: React.KeyboardEvent) => {
    // While an IME composition is active, Enter confirms the candidate text
    // and must not send the message.
    if (e.nativeEvent.isComposing) return

    if (e.key === 'Enter' && !e.shiftKey) {
      e.preventDefault()
      handleSubmit(e as unknown as React.FormEvent)
    }
  }

  const handleInput = (e: React.ChangeEvent<HTMLTextAreaElement>) => {
    setMessage(e.target.value)
  }

  // Opens the hidden file picker. Not wired to any control at the moment;
  // retained alongside the hidden input for future use.
  const handleFileAttach = () => {
    fileInputRef.current?.click()
  }

  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const files = e.target.files
    if (!files) return

    const newFiles: AttachedFile[] = []
    for (const file of Array.from(files)) {
      console.log('🔧 Frontend: File selected:', {
        name: file.name,
        size: file.size,
        type: file.type,
        lastModified: file.lastModified
      });

      if (isSupportedAttachment(file)) {
        newFiles.push({
          id: crypto.randomUUID(),
          name: file.name,
          size: file.size,
          type: file.type,
          file: file,
        })
      } else {
        console.log('🔧 Frontend: File rejected - unsupported format:', file.type);
      }
    }

    setAttachedFiles(prev => [...prev, ...newFiles])

    // Reset the input so selecting the same file again fires a change event
    if (fileInputRef.current) {
      fileInputRef.current.value = ''
    }
  }

  const removeFile = (fileId: string) => {
    setAttachedFiles(prev => prev.filter(f => f.id !== fileId))
  }

  return (
    <div className={`border-t border-white/10 bg-black/60 backdrop-blur-sm p-4 ${className}`}>
      <form onSubmit={handleSubmit} className="max-w-4xl mx-auto">
        {/* Attached Files Display */}
        {attachedFiles.length > 0 && (
          <div className="mb-3 space-y-2">
            <div className="text-sm text-gray-400 font-medium">Attached Files:</div>
            <div className="space-y-2">
              {attachedFiles.map((file) => (
                <div key={file.id} className="flex items-center gap-3 bg-gray-800 rounded-lg p-3">
                  <FileText className="w-5 h-5 text-red-400" />
                  <div className="flex-1 min-w-0">
                    <div className="text-sm text-white truncate">{file.name}</div>
                    <div className="text-xs text-gray-400">{formatFileSize(file.size)}</div>
                  </div>
                  <button
                    type="button"
                    onClick={() => removeFile(file.id)}
                    aria-label={`Remove ${file.name}`}
                    className="p-1 hover:bg-gray-700 rounded transition-colors"
                  >
                    <X className="w-4 h-4 text-gray-400 hover:text-white" />
                  </button>
                </div>
              ))}
            </div>
          </div>
        )}

        <div className="bg-white/5 backdrop-blur border border-white/10 rounded-2xl px-5 pt-4 pb-3 space-y-2">
          {/* Hidden file input (kept for future use) */}
          <input ref={fileInputRef} type="file" accept=".pdf,.docx,.doc,.html,.htm,.md,.txt" multiple onChange={handleFileChange} className="hidden" />

          {/* Textarea */}
          <textarea
            ref={textareaRef}
            value={message}
            onChange={handleInput}
            onKeyDown={handleKeyDown}
            placeholder={attachedFiles.length > 0 ? "Ask questions about your attached files..." : placeholder}
            disabled={disabled || isLoading}
            rows={1}
            className="w-full bg-transparent border-none text-white placeholder-gray-400 resize-none overflow-y-hidden focus:outline-none focus:ring-0 disabled:opacity-50 disabled:cursor-not-allowed text-base"
            style={{ maxHeight: '120px', minHeight: '44px' }}
          />

          {/* Action row */}
          <div className="mt-1 flex items-center justify-between">
            <div className="flex items-center gap-4">
              <button
                type="button"
                onClick={()=>onOpenSettings && onOpenSettings()}
                disabled={disabled || isLoading}
                className="flex items-center gap-1 p-2 text-gray-400 hover:text-white hover:bg-gray-800 rounded-full transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
                title="Chat settings"
              >
                <SettingsIcon className="w-5 h-5" />
                <span className="text-xs hidden sm:inline">Settings</span>
              </button>
              {leftExtras}
            </div>
            <Button
              type="submit"
              size="sm"
              aria-label="Send message"
              disabled={nothingToSend || disabled || isLoading}
              className="w-8 h-8 p-0 rounded-full bg-white hover:bg-gray-100 text-black disabled:bg-gray-600 disabled:text-gray-400"
            >
              {isLoading ? (
                <div className="w-4 h-4 border-2 border-gray-400 border-t-transparent rounded-full animate-spin" />
              ) : (
                <ArrowUp className="w-4 h-4" />
              )}
            </Button>
          </div>
        </div>
      </form>
    </div>
  )
}

================================================
FILE: src/components/ui/chat-settings-modal.tsx
================================================
"use client";

import { GlassToggle } from '@/components/ui/GlassToggle';
import { InfoTooltip } from '@/components/ui/InfoTooltip';

/** Boolean setting rendered as a GlassToggle. */
export interface ToggleOption {
  type: 'toggle';
  /** Identifies the option; also used as the React key and to look up its display name. */
  label: string;
  /** Current state of the toggle. */
  checked: boolean;
  /** Receives the new state when the toggle changes. */
  setter: (v: boolean) => void;
}

/** Numeric setting rendered as a range input. */
export interface SliderOption {
  type: 'slider';
  /** Identifies the option; also used as the React key and to look up its display name. */
  label: string;
  /** Current value. */
  value: number;
  /** Receives the new value, converted with `Number(...)`, on change. */
  setter: (v: number) => void;
  /** Lower bound of the range input. */
  min: number;
  /** Upper bound of the range input. */
  max: number;
  /** Step of the range input; 1 is used when omitted. */
  step?: number;
  /** Suffix appended to the displayed value and bounds. */
  unit?: string;
}

/** String setting rendered as a `<select>`. */
export interface DropdownOption {
  type: 'dropdown';
  /** Identifies the option; also used as the React key and to look up its display name. */
  label: string;
  /** Currently selected value. */
  value: string;
  /** Receives the newly selected value on change. */
  setter: (v: string) => void;
  /** Entries of the dropdown. */
  options: { value: string; label: string }[];
}

/** Any setting the modal can render; discriminated by `type`. */
export type SettingOption = ToggleOption | SliderOption | DropdownOption;

/** Props accepted by ChatSettingsModal. */
interface Props {
  /** Settings to render; the modal picks them out by `label`. */
  options: SettingOption[];
  /** Invoked when the Close button is clicked. */
  onClose: () => void;
}

// Tooltip text for each setting, keyed by the option's DISPLAY name (the
// label after the modal's display-name mapping, e.g. 'Pruning' rather than
// 'Prune irrelevant sentences'). Options without an entry get no tooltip.
const optionHelp: Record<string,string> = {
  'Query decomposition':'Breaks a complex question into sub-queries to improve recall (adds latency).',
  'Compose sub-answers':'Merges answers from decomposed sub-queries into a single response.',
  'Pruning':'Removes sentences deemed irrelevant by a lightweight model before synthesis.',
  'RAG (no-triage)':'Force retrieval on every query; disables index-selection triage.',
  'Verify answer':'Runs an extra LLM pass to self-critique the draft answer.',
  'Streaming':'Send tokens to the UI as they are generated.',
  'AI reranker':'Re-orders retrieved chunks with a cross-encoder (higher quality, more latency).',
  'Expand context window':'Adds neighbour chunks around each top chunk to provide more context.',
  'Context window size':'How many neighbour chunks to include on each side.',
  'Retrieval chunks':'Number of chunks fetched before reranking.',
  'LLM':'Select which model generates the final answer.',
  'Search type':'Choose retrieval strategy (Hybrid recommended).',
  'Reranker top chunks':'Limit how many chunks are re-ranked to speed up processing.'
};

// Toggles shown in the two-column "General Settings" grid, in display order.
const GENERAL_TOGGLE_LABELS: string[] = [
  'Query decomposition',
  'Compose sub-answers',
  'Prune irrelevant sentences',
  'Always search documents', // displayed as "RAG (no-triage)"
  'Verify answer',
  'Stream phases',
];

// Dropdowns shown side by side at the top of "Retrieval Settings".
const RETRIEVAL_GRID_LABELS: string[] = ['LLM model', 'Search type'];

// Sliders listed below the retrieval dropdowns.
const RETRIEVAL_SLIDER_LABELS: string[] = ['Retrieval chunks'];

// Options listed in the "Reranking & Context" section.
const RERANKING_LABELS: string[] = [
  'AI reranker',
  'Reranker top chunks',
  'Expand context window',
  'Context window size',
];

// Option labels whose on-screen name differs from the label itself.
const DISPLAY_NAME_OVERRIDES = new Map<string, string>([
  ['Always search documents', 'RAG (no-triage)'],
  ['LLM model', 'LLM'],
  ['Prune irrelevant sentences', 'Pruning'],
  ['Stream phases', 'Streaming'],
]);

/** Returns the name shown in the UI for an option label. */
function toDisplayName(label: string): string {
  return DISPLAY_NAME_OVERRIDES.get(label) ?? label;
}

/**
 * Position of `value` inside [min, max] as a percentage clamped to 0..100.
 * Returns 0 for an empty or inverted range, where the plain formula would
 * divide by zero and yield an invalid `NaN%` CSS value.
 */
function sliderFillPercent(value: number, min: number, max: number): number {
  const span = max - min;
  if (!(span > 0)) return 0;
  return Math.min(100, Math.max(0, ((value - min) / span) * 100));
}

/**
 * Modal listing the chat settings in three sections: general toggles,
 * retrieval options, and reranking / context options.
 *
 * Options are looked up in `options` by label; labels that are not present
 * are simply not rendered. Each control calls the option's own `setter`.
 *
 * @param options Settings available to the modal.
 * @param onClose Invoked when the Close button is clicked.
 */
export function ChatSettingsModal({ options, onClose }: Props) {
  // Display name followed by an info tooltip when help text exists for it.
  const renderLabelContent = (label: string) => {
    const name = toDisplayName(label);
    const help = optionHelp[name];
    return (
      <>
        {name}
        {help && <InfoTooltip text={help} size={12} />}
      </>
    );
  };

  const renderOption = (opt: SettingOption) => {
    switch (opt.type) {
      case 'toggle':
        return (
          <div key={opt.label} className="flex items-center justify-between">
            <span className="text-sm text-gray-300 flex items-center gap-1 whitespace-nowrap">
              {renderLabelContent(opt.label)}
            </span>
            <GlassToggle checked={opt.checked} onChange={opt.setter} />
          </div>
        );

      case 'slider': {
        const fill = sliderFillPercent(opt.value, opt.min, opt.max);
        const unit = opt.unit || '';
        return (
          <div key={opt.label} className="space-y-2">
            <div className="flex items-center justify-between">
              <span className="text-sm text-gray-300 flex items-center gap-1">{renderLabelContent(opt.label)}</span>
              <span className="text-sm text-gray-400">
                {opt.value}{unit}
              </span>
            </div>
            <input
              type="range"
              min={opt.min}
              max={opt.max}
              step={opt.step || 1}
              value={opt.value}
              onChange={(e) => opt.setter(Number(e.target.value))}
              className="w-full h-2 bg-gray-700 rounded-lg appearance-none cursor-pointer slider"
              style={{
                // Blue up to the thumb position, grey for the remainder of the track.
                background: `linear-gradient(to right, #3b82f6 0%, #3b82f6 ${fill}%, #374151 ${fill}%, #374151 100%)`
              }}
            />
            <div className="flex justify-between text-xs text-gray-500">
              <span>{opt.min}{unit}</span>
              <span>{opt.max}{unit}</span>
            </div>
          </div>
        );
      }

      case 'dropdown':
        return (
          <div key={opt.label} className="space-y-2">
            <span className="text-sm text-gray-300 flex items-center gap-1">{renderLabelContent(opt.label)}</span>
            <select
              value={opt.value}
              onChange={(e) => opt.setter(e.target.value)}
              className="w-full px-3 py-2 bg-gray-700 border border-gray-600 rounded-lg text-white text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent"
            >
              {opt.options.map((option) => (
                <option key={option.value} value={option.value}>
                  {option.label}
                </option>
              ))}
            </select>
          </div>
        );

      default:
        return null;
    }
  };

  // Renders the option with the given label, or nothing when it is absent.
  const renderByLabel = (label: string) => {
    const opt = options.find((o) => o.label === label);
    return opt ? renderOption(opt) : null;
  };

  // Renders, in `options` order, every option whose label is in `labels`.
  const renderMatching = (labels: string[]) =>
    options.filter((opt) => labels.includes(opt.label)).map((opt) => renderOption(opt));

  return (
    <div
      role="dialog"
      aria-modal="true"
      aria-labelledby="chat-settings-title"
      className="fixed inset-0 bg-black/60 backdrop-blur-sm flex items-center justify-center z-50 p-4"
    >
      <div className="bg-white/5 backdrop-blur rounded-xl w-full max-w-xl max-h-full overflow-y-auto p-6 text-white space-y-6">
        <h2 id="chat-settings-title" className="text-lg font-semibold mb-6">Chat Settings</h2>

        <div className="space-y-6">
          {/* High-level Settings */}
          <div>
            <h3 className="text-md font-medium text-gray-200 mb-4 flex items-center gap-1">General Settings <InfoTooltip text="High-level toggles that affect how the assistant thinks and whether it always performs RAG." /></h3>
            {/* Two-column grid for key toggles */}
            <div className="grid grid-cols-2 gap-4 mb-4">
              {GENERAL_TOGGLE_LABELS.map((label) => renderByLabel(label))}
            </div>
            {/* No additional general options after grid */}
          </div>

          {/* Retrieval Settings */}
          <div>
            <h3 className="text-md font-medium text-gray-200 mb-4 flex items-center gap-1">Retrieval Settings <InfoTooltip text="Configure which LLM answers and how the system searches your indexes." /></h3>
            {/* LLM + Search type grid */}
            <div className="grid grid-cols-2 gap-4 mb-4">
              {RETRIEVAL_GRID_LABELS.map((label) => renderByLabel(label))}
            </div>
            {/* Sliders */}
            <div className="space-y-4">
              {renderMatching(RETRIEVAL_SLIDER_LABELS)}
            </div>
          </div>

          {/* Reranking Settings */}
          <div>
            <h3 className="text-md font-medium text-gray-200 mb-4 flex items-center gap-1">Reranking & Context <InfoTooltip text="Controls post-retrieval reordering, context window expansion and pruning (may add latency)." /></h3>
            <div className="space-y-4">
              {renderMatching(RERANKING_LABELS)}
            </div>
          </div>
        </div>

        <div className="flex justify-end pt-6 border-t border-white/10 mt-6">
          <button
            type="button"
            onClick={onClose}
            className="px-4 py-2 bg-gray-700 rounded hover:bg-gray-600 text-sm"
          >
            Close
          </button>
        </div>
      </div>
    </div>
  );
}

================================================
FILE: src/components/ui/conversation-page.tsx
================================================
"use client"

import * as React from "react"
import { useRef, useEffect, useState } from "react"
import {
  ChatBubbleAvatar,
} from "@/components/ui/chat-bubble"
import { Copy, RefreshCcw, ThumbsUp, ThumbsDown, Volume2, MoreHorizontal, ChevronDown, Loader2, CheckCircle, XOctagon } from "lucide-react"
import { ScrollArea } from "@/components/ui/scroll-area"
import { ChatMessage } from "@/lib/api"
import { cn } from "@/lib/utils"
import Markdown from "@/components/Markdown"
import { normalizeWhitespace } from "@/utils/textNormalization"

/** Props accepted by the ConversationPage component. */
interface ConversationPageProps {
  /** Messages to render, one bubble per entry, in array order. */
  messages: ChatMessage[]
  /** When true, an extra typing-indicator bubble is rendered after the messages. */
  isLoading?: boolean
  /** Additional class names appended to the root container. */
  className?: string
  /** Optional handler for the per-message action buttons; when supplied it is called instead of the built-in handling. */
  onAction?: (action: string, messageId: string, messageContent: string) => void
}

// Action buttons rendered under each non-user message.
// `icon` is the lucide component, `type` is used as the button title,
// and `action` is the identifier handed to the action handler.
const actionIcons = [
  { icon: Copy, type: "Copy", action: "copy" },
  { icon: ThumbsUp, type: "Like", action: "like" },
  { icon: ThumbsDown, type: "Dislike", action: "dislike" },
  { icon: Volume2, type: "Speak", action: "speak" },
  { icon: RefreshCcw, type: "Regenerate", action: "regenerate" },
  { icon: MoreHorizontal, type: "More", action: "more" },
]

/**
 * Single citation entry that toggles between a short preview and the full
 * source text when clicked.
 *
 * @param doc - Source document; only `doc.text` is read here.
 * @param idx - Zero-based position in the list; displayed as `[idx + 1]`.
 */
function Citation({doc, idx}: {doc:any, idx:number}){
  const [open,setOpen]=React.useState(false);
  // Collapse whitespace once and decide on the ellipsis from the collapsed
  // text, so the "…" only appears when the preview was actually truncated.
  const collapsed = (doc.text||'').replace(/\s+/g,' ').trim();
  const preview = collapsed.length > 160 ? collapsed.slice(0,160) + '…' : collapsed;
  return (
    <div onClick={()=>setOpen(prev => !prev)} className="text-xs text-gray-300 bg-gray-900/60 rounded p-2 cursor-pointer hover:bg-gray-800 transition">
      <span className="font-semibold mr-1">[{idx+1}]</span>{open?doc.text:preview}
    </div>
  );
}

/**
 * "Sources" list for one assistant message.
 *
 * Keeps only documents that carry a truthy `rerank_score`, `score` or
 * `_distance`, orders them by descending rank and shows the first five until
 * the user expands the list. Renders nothing when no document qualifies.
 */
function CitationsBlock({docs}:{docs:any[]}){
  // Rank value used for ordering: rerank score, else score, else inverse distance.
  const rankOf = (d: any) => d.rerank_score ?? d.score ?? 1/d._distance;
  const scored = docs
    .filter(d => d.rerank_score || d.score || d._distance)
    .sort((a, b) => rankOf(b) - rankOf(a));
  const [expanded, setExpanded] = useState(false);

  if (!scored.length) {
    return null;
  }

  const hiddenCount = scored.length - 5;
  const visibleDocs = expanded ? scored : scored.slice(0, 5);
  const toggleLabel = expanded ? 'Show less' : `Show ${hiddenCount} more`;

  return (
    <div className="mt-2 text-xs text-gray-400">
      <p className="font-semibold mb-1">Sources:</p>
      <div className="grid grid-cols-1 gap-2">
        {visibleDocs.map((doc, i) => (
          <Citation key={doc.chunk_id || i} doc={doc} idx={i} />
        ))}
      </div>
      {hiddenCount > 0 && (
        <button
          onClick={() => setExpanded(!expanded)}
          className="text-blue-400 hover:text-blue-300 mt-2 text-xs"
        >
          {toggleLabel}
        </button>
      )}
    </div>
  );
}

/**
 * Small status glyph for a timeline step.
 * Returns null for any status outside the four known values.
 */
function StepIcon({ status }: { status: 'pending' | 'active' | 'done' | 'error' }) {
  if (status === 'pending') {
    return <MoreHorizontal className="w-4 h-4 text-neutral-600" />
  }
  if (status === 'active') {
    return <Loader2 className="w-4 h-4 text-blue-400 animate-spin" />
  }
  if (status === 'done') {
    return <CheckCircle className="w-4 h-4 text-green-400" />
  }
  if (status === 'error') {
    return <XOctagon className="w-4 h-4 text-red-400" />
  }
  return null
}

// Border classes for a timeline card, keyed by step status.
// Callers fall back to the 'pending' entry when a status has no entry here.
const statusBorder: Record<string, string> = {
  pending: 'border-neutral-800',
  active: 'border-blue-400 animate-pulse',
  done: 'border-green-400',
  error: 'border-red-400'
}

/**
 * Renders model output that may contain `<think>…</think>` sections.
 *
 * Each closed think section is pulled out of the text and listed inside a
 * collapsed "Thinking" disclosure; whatever remains is rendered as Markdown.
 */
function ThinkingText({ text }: { text: string }) {
  const thinkPattern = /<think>([\s\S]*?)<\/think>/g;
  // Collect the trimmed inner text of every think section, in order.
  const thoughts: string[] = Array.from(text.matchAll(thinkPattern), (m) => m[1].trim());
  // Strip the think sections from the text that is shown as the answer.
  const answerText = text.replace(thinkPattern, "");
  const hasThoughts = thoughts.length > 0;

  return (
    <>
      {hasThoughts && (
        <details className="thinking-block inline-block align-baseline mr-2" open={false}>
          <summary className="cursor-pointer text-xs text-gray-400 uppercase select-none">Thinking</summary>
          <div className="mt-1 space-y-1 text-xs text-gray-400 italic">
            {thoughts.map((thought, i) => (
              <div key={i}>{thought}</div>
            ))}
          </div>
        </details>
      )}
      {answerText.trim() && (
        <Markdown text={normalizeWhitespace(answerText)} className="whitespace-pre-wrap" />
      )}
    </>
  );
}

/**
 * Renders a structured (multi-step) assistant message as a vertical timeline.
 *
 * `content` is either the step array itself or an object with a `steps` array.
 * Only steps up to the last one whose status is set and not 'pending' are
 * shown, so stages appear one after another instead of all at once. Steps
 * lacking `key` or `label` are skipped.
 *
 * Any other content shape (null, undefined, object without a `steps` array)
 * is treated as an empty step list instead of throwing during render.
 */
function StructuredMessageBlock({ content }: { content: Array<Record<string, any>> | { steps: any[] } }) {
  const rawSteps = Array.isArray(content) ? content : (content as any)?.steps;
  // Guard against malformed content so one bad message cannot crash the page.
  const steps: any[] = Array.isArray(rawSteps) ? rawSteps : [];
  // Determine if sub-query answers are present
  const hasSubAnswers = steps.some((s: any) => s.key === 'answer' && Array.isArray(s.details) && s.details.length > 0);
  // Compute the last index that has started (status !== 'pending') so we only
  // render steps that are in progress or completed. This avoids showing the
  // whole plan upfront and reveals each stage sequentially.
  const lastRevealedIdx = (() => {
    for (let i = steps.length - 1; i >= 0; i--) {
      if (steps[i].status && steps[i].status !== 'pending') {
        return i;
      }
    }
    return -1; // nothing started yet
  })();

  const visibleSteps = lastRevealedIdx >= 0 ? steps.slice(0, lastRevealedIdx + 1) : [];

  return (
    <div className="flex flex-col">
      {visibleSteps.map((step: any, index: number) => {
        if (step.key && step.label) {
          const borderCls = statusBorder[step.status] || statusBorder['pending']
          const statusClass = `timeline-card card my-1 py-2 pl-3 pr-2 bg-[#0d0d0d] rounded border-l-2 ${borderCls}`
          
          return (
            <div key={step.key} className={statusClass}>
              <div className="flex items-center gap-2 mb-1">
                <StepIcon status={step.status} />
                <span className="text-sm font-medium text-neutral-100">{step.label}</span>
              </div>
              {/* Details for each step */}
              {step.key === 'final' && step.details && typeof step.details === 'object' && !Array.isArray(step.details) ? (
                // Final step with an object payload: answer text plus optional sources.
                <div className="space-y-3">
                  <div className="whitespace-pre-wrap text-gray-100">
                    <ThinkingText text={normalizeWhitespace(step.details.answer)} />
                  </div>
                  {/* Sources are shown here only when no sub-answers are present */}
                  {!hasSubAnswers && step.details.source_documents && step.details.source_documents.length > 0 && (
                    <CitationsBlock docs={step.details.source_documents} />
                  )}
                </div>
              ) : step.key === 'final' && step.details && typeof step.details === 'string' ? (
                // Final step whose details are a plain string.
                <div className="whitespace-pre-wrap text-gray-100">
                  <ThinkingText text={normalizeWhitespace(step.details)} />
                </div>
              ) : Array.isArray(step.details) ? (
                step.key === 'decompose' && step.details.every((d: any)=> typeof d === 'string') ? (
                  // Render list of sub-query strings
                  <ul className="list-disc list-inside space-y-1 text-neutral-200">
                    {step.details.map((q: string, idx:number)=>(
                      <li key={idx}>{q}</li>
                    ))}
                  </ul>
                ) : (
                  // Handle array of sub-answers
                  <div className="space-y-2">
                    {step.details.map((detail: any, idx: number) => (
                      <div key={idx} className="border-l-2 border-blue-400 pl-2">
                        <div className="font-semibold">{detail.question}</div>
                        <div><ThinkingText text={normalizeWhitespace(detail.answer)} /></div>
                        {detail.source_documents && detail.source_documents.length > 0 && (
                          <CitationsBlock docs={detail.source_documents} />
                        )}
                      </div>
                    ))}
                  </div>
                )
              ) : (
                // Handle string details
                <ThinkingText text={normalizeWhitespace(step.details as string)} />
              )}
            </div>
          );
        }
        return null;
      })}
    </div>
  );
}

/**
 * Scrollable chat transcript with per-message action buttons, citations and a
 * "scroll to bottom" shortcut.
 *
 * @param messages  Messages to render in order.
 * @param isLoading When true, a trailing typing indicator bubble is shown.
 * @param className Extra classes appended to the root container.
 * @param onAction  Optional handler for action buttons. It receives the action
 *                  id, the message id and the message content flattened to
 *                  plain text. When omitted, a built-in handler runs (only
 *                  "copy" does anything).
 */
export function ConversationPage({ 
  messages, 
  isLoading = false,
  className = "",
  onAction
}: ConversationPageProps) {
  const scrollAreaRef = useRef<HTMLDivElement>(null)
  const messagesEndRef = useRef<HTMLDivElement>(null)
  const [showScrollButton, setShowScrollButton] = useState(false)
  const [isUserNearBottom,setIsUserNearBottom]=useState(true)

  // Auto-scroll on new content, but only while the user is near the bottom so
  // manual scrolling is not interrupted. isUserNearBottom is read here without
  // being a dependency, so the effect re-runs only when messages/isLoading change.
  useEffect(() => {
    if(isUserNearBottom){
    scrollToBottom()
    }
  }, [messages, isLoading])

  // Monitor scroll position to show/hide scroll button
  useEffect(() => {
    const scrollContainer = scrollAreaRef.current?.querySelector('[data-radix-scroll-area-viewport]')
    if (!scrollContainer) return

    const handleScroll = () => {
      const { scrollTop, scrollHeight, clientHeight } = scrollContainer
      // "Near bottom" means fewer than 100px remain below the viewport.
      const isNearBottom = scrollHeight - scrollTop - clientHeight < 100
      setShowScrollButton(!isNearBottom)
      setIsUserNearBottom(isNearBottom)
    }

    scrollContainer.addEventListener('scroll', handleScroll)
    handleScroll() // Check initial state

    return () => scrollContainer.removeEventListener('scroll', handleScroll)
  }, [])

  const scrollToBottom = () => {
    // Try multiple methods to ensure scrolling works
    if (messagesEndRef.current) {
      messagesEndRef.current.scrollIntoView({ behavior: 'smooth' })
    }
    
    // Fallback: scroll the container directly
    setTimeout(() => {
      if (scrollAreaRef.current) {
        const scrollContainer = scrollAreaRef.current.querySelector('[data-radix-scroll-area-viewport]') || scrollAreaRef.current
        if (scrollContainer) {
          scrollContainer.scrollTop = scrollContainer.scrollHeight
        }
      }
    }, 100)
  }

  // Receives the raw message content (string, array of parts, or {steps})
  // and flattens it to plain text once, so both the onAction path and the
  // built-in fallback work with the same string.
  const handleAction = (action: string, messageId: string, messageContent: unknown) => {
    let contentToPass: string;
    if (typeof messageContent === 'string') {
      contentToPass = messageContent;
    } else if (Array.isArray(messageContent)) {
      contentToPass = (messageContent as any[]).map((s: any) => s.text || s.answer || '').join('\n');
    } else if (messageContent && typeof messageContent === 'object' && Array.isArray((messageContent as any).steps)) {
      // For {steps: Step[]} structure: one line per step, with string details appended
      contentToPass = (messageContent as any).steps.map((s: any) => s.label + (s.details ? (typeof s.details === 'string' ? (': ' + s.details) : '') : '')).join('\n');
    } else {
      contentToPass = '';
    }

    if (onAction) {
      onAction(action, messageId, contentToPass)
      return
    }
    
    console.log(`Action ${action} clicked for message ${messageId}`)
    // Handle different actions here
    switch (action) {
      case 'copy':
        // The Clipboard API can be missing (e.g. insecure context) and
        // writeText can reject; neither should surface as an unhandled error.
        navigator.clipboard?.writeText(contentToPass).catch((err) => {
          console.error('Failed to copy message to clipboard', err)
        })
        break
      case 'regenerate':
        // Regenerate AI response
        break
      case 'like':
        // Add like reaction
        break
      case 'dislike':
        // Add dislike reaction
        break
      case 'speak':
        // Text to speech
        break
      case 'more':
        // Show more options
        break
    }
  }

  return (
    <div className={`flex flex-col h-full bg-black relative overflow-hidden ${className}`}>
      <ScrollArea ref={scrollAreaRef} className="flex-1 h-full px-4 pt-4 pb-6 min-h-0">
        <div className="max-w-4xl mx-auto space-y-6">
          {messages.map((message) => {
            const isUser = message.sender === "user"
            
            return (
              <div key={message.id} className="w-full group">
                <div className={`flex gap-3 ${isUser ? 'justify-end' : 'justify-start'}`}>
                  {!isUser && (
                    <ChatBubbleAvatar 
                      fallback="AI" 
                      className="mt-1 flex-shrink-0 text-black"
                    />
                  )}
                  
                  <div className={`flex flex-col space-y-2 ${isUser ? 'items-end' : 'items-start'} max-w-full md:max-w-3xl`}>
                    <div
                      className={`rounded-2xl px-5 py-4 ${
                        isUser 
                          ? "bg-white text-black" 
                          : "bg-gray-800 text-gray-100"
                      }`}
                    >
                      {message.isLoading ? (
                        <div className="flex items-center space-x-2">
                          <div className="flex space-x-1">
                            <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce"></div>
                            <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.1s'}}></div>
                            <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.2s'}}></div>
                          </div>
                        </div>
                      ) : (
                        <div className="whitespace-pre-wrap text-base leading-relaxed">
                          {typeof message.content === 'string' 
                              ? <ThinkingText text={normalizeWhitespace(message.content)} />
                              : <StructuredMessageBlock content={message.content} />
                          }
                        </div>
                      )}
                    </div>
                    
                    {!isUser && !message.isLoading && (
                      <div className="flex items-center gap-1 opacity-0 group-hover:opacity-100 transition-opacity duration-200">
                        {actionIcons.map(({ icon: Icon, type, action }) => (
                          <button
                            key={action}
                            onClick={() => {
                              // Pass the raw content; handleAction flattens every supported shape.
                              handleAction(action, message.id, message.content)
                            }}
                            className="p-1.5 hover:bg-gray-700 rounded-md transition-colors text-gray-400 hover:text-gray-200"
                            title={type}
                          >
                            <Icon className="w-3.5 h-3.5" />
                          </button>
                        ))}
                      </div>
                    )}

                    {/* Global citations only for plain-string messages */}
                    {(!isUser &&
                      !message.isLoading &&
                      typeof message.content === 'string' &&
                      Array.isArray((message as any).metadata?.source_documents) &&
                      (message as any).metadata.source_documents.length > 0) && (
                        <CitationsBlock docs={(message as any).metadata.source_documents} />
                    )}
                  </div>

                  {isUser && (
                    <ChatBubbleAvatar 
                      className="mt-1 flex-shrink-0 text-black"
                      src="https://i.pravatar.cc/40?u=user"
                      fallback="User"
                    />
                  )}
                </div>
              </div>
            )
          })}
          
          {/* Loading indicator for new message */}
          {isLoading && (
            <div className="w-full group">
              <div className="flex gap-3 justify-start">
                <ChatBubbleAvatar fallback="AI" className="mt-1 flex-shrink-0 text-black" />
                <div className="flex flex-col space-y-2 items-start max-w-[80%]">
                  <div className="rounded-2xl px-4 py-3 bg-gray-800 text-gray-100">
                    <div className="flex items-center space-x-2">
                      <div className="flex space-x-1">
                        <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce"></div>
                        <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.1s'}}></div>
                        <div className="w-2 h-2 bg-gray-400 rounded-full animate-bounce" style={{animationDelay: '0.2s'}}></div>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          )}
          
          {/* Invisible element to scroll to */}
          <div ref={messagesEndRef} />
        </div>
      </ScrollArea>
      
      {/* Scroll to bottom button - only show when not at bottom */}
      {showScrollButton && (
        <div className="absolute bottom-20 left-1/2 transform -translate-x-1/2 z-10">
          <button
            onClick={scrollToBottom}
            className="p-2 bg-gray-800 border border-gray-700 rounded-full hover:bg-gray-700 transition-all duration-200 shadow-lg group animate-in fade-in slide-in-from-bottom-2"
            title="Scroll to bottom"
          >
            <ChevronDown className="w-4 h-4 text-gray-400 group-hover:text-gray-200 transition-colors" />
          </button>
        </div>
      )}
    </div>
  )
}  

================================================
FILE: src/components/ui/dropdown-menu.tsx
================================================
"use client"

import * as React from "react"
import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu"
import { CheckIcon, ChevronRightIcon, CircleIcon } from "lucide-react"

import { cn } from "@/lib/utils"

/** Menu root: forwards all props to the Radix Root, tagged with a data-slot. */
function DropdownMenu(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Root>
) {
  return <DropdownMenuPrimitive.Root data-slot="dropdown-menu" {...props} />
}

/** Portal wrapper: forwards all props to the Radix Portal, tagged with a data-slot. */
function DropdownMenuPortal(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Portal>
) {
  return <DropdownMenuPrimitive.Portal data-slot="dropdown-menu-portal" {...props} />
}

/** Trigger wrapper: forwards all props to the Radix Trigger, tagged with a data-slot. */
function DropdownMenuTrigger(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Trigger>
) {
  return <DropdownMenuPrimitive.Trigger data-slot="dropdown-menu-trigger" {...props} />
}

/**
 * Menu content panel, always rendered inside a Radix Portal.
 * `sideOffset` defaults to 4; `className` is merged after the base classes.
 */
function DropdownMenuContent(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Content>
) {
  const { className, sideOffset = 4, ...contentProps } = props
  const classes = cn(
    "bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 max-h-(--radix-dropdown-menu-content-available-height) min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border p-1 shadow-md",
    className
  )

  return (
    <DropdownMenuPrimitive.Portal>
      <DropdownMenuPrimitive.Content
        data-slot="dropdown-menu-content"
        sideOffset={sideOffset}
        className={classes}
        {...contentProps}
      />
    </DropdownMenuPrimitive.Portal>
  )
}

/** Group wrapper: forwards all props to the Radix Group, tagged with a data-slot. */
function DropdownMenuGroup(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Group>
) {
  return <DropdownMenuPrimitive.Group data-slot="dropdown-menu-group" {...props} />
}

/**
 * Single menu item.
 * `inset` and `variant` (default "default") are exposed as data attributes
 * that the base classes select on; `className` is merged after the base classes.
 */
function DropdownMenuItem(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Item> & {
    inset?: boolean
    variant?: "default" | "destructive"
  }
) {
  const { className, inset, variant = "default", ...itemProps } = props
  const classes = cn(
    "focus:bg-accent focus:text-accent-foreground data-[variant=destructive]:text-destructive data-[variant=destructive]:focus:bg-destructive/10 dark:data-[variant=destructive]:focus:bg-destructive/20 data-[variant=destructive]:focus:text-destructive data-[variant=destructive]:*:[svg]:!text-destructive [&_svg:not([class*='text-'])]:text-muted-foreground relative flex cursor-default items-center gap-2 rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 data-[inset]:pl-8 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
    className
  )

  return (
    <DropdownMenuPrimitive.Item
      data-slot="dropdown-menu-item"
      data-inset={inset}
      data-variant={variant}
      className={classes}
      {...itemProps}
    />
  )
}

/**
 * Checkbox menu item: renders a check mark indicator in the left gutter,
 * followed by the children.
 */
function DropdownMenuCheckboxItem(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.CheckboxItem>
) {
  const { className, children, checked, ...itemProps } = props
  const classes = cn(
    "focus:bg-accent focus:text-accent-foreground relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
    className
  )
  // Indicator slot pinned to the left edge of the item.
  const indicator = (
    <span className="pointer-events-none absolute left-2 flex size-3.5 items-center justify-center">
      <DropdownMenuPrimitive.ItemIndicator>
        <CheckIcon className="size-4" />
      </DropdownMenuPrimitive.ItemIndicator>
    </span>
  )

  return (
    <DropdownMenuPrimitive.CheckboxItem
      data-slot="dropdown-menu-checkbox-item"
      className={classes}
      checked={checked}
      {...itemProps}
    >
      {indicator}
      {children}
    </DropdownMenuPrimitive.CheckboxItem>
  )
}

/** Radio group wrapper: forwards all props to the Radix RadioGroup, tagged with a data-slot. */
function DropdownMenuRadioGroup(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.RadioGroup>
) {
  return <DropdownMenuPrimitive.RadioGroup data-slot="dropdown-menu-radio-group" {...props} />
}

/**
 * Radio menu item: renders a filled-circle indicator in the left gutter,
 * followed by the children.
 */
function DropdownMenuRadioItem(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.RadioItem>
) {
  const { className, children, ...itemProps } = props
  const classes = cn(
    "focus:bg-accent focus:text-accent-foreground relative flex cursor-default items-center gap-2 rounded-sm py-1.5 pr-2 pl-8 text-sm outline-hidden select-none data-[disabled]:pointer-events-none data-[disabled]:opacity-50 [&_svg]:pointer-events-none [&_svg]:shrink-0 [&_svg:not([class*='size-'])]:size-4",
    className
  )
  // Indicator slot pinned to the left edge of the item.
  const indicator = (
    <span className="pointer-events-none absolute left-2 flex size-3.5 items-center justify-center">
      <DropdownMenuPrimitive.ItemIndicator>
        <CircleIcon className="size-2 fill-current" />
      </DropdownMenuPrimitive.ItemIndicator>
    </span>
  )

  return (
    <DropdownMenuPrimitive.RadioItem
      data-slot="dropdown-menu-radio-item"
      className={classes}
      {...itemProps}
    >
      {indicator}
      {children}
    </DropdownMenuPrimitive.RadioItem>
  )
}

/**
 * Menu label. `inset` is exposed as a data attribute that the base classes
 * select on; `className` is merged after the base classes.
 */
function DropdownMenuLabel(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Label> & {
    inset?: boolean
  }
) {
  const { className, inset, ...labelProps } = props
  const classes = cn("px-2 py-1.5 text-sm font-medium data-[inset]:pl-8", className)

  return (
    <DropdownMenuPrimitive.Label
      data-slot="dropdown-menu-label"
      data-inset={inset}
      className={classes}
      {...labelProps}
    />
  )
}

/** Separator line between menu sections; `className` is merged after the base classes. */
function DropdownMenuSeparator(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Separator>
) {
  const { className, ...separatorProps } = props
  const classes = cn("bg-border -mx-1 my-1 h-px", className)

  return (
    <DropdownMenuPrimitive.Separator
      data-slot="dropdown-menu-separator"
      className={classes}
      {...separatorProps}
    />
  )
}

/** Plain span for a shortcut hint, pushed to the right via `ml-auto`. */
function DropdownMenuShortcut(props: React.ComponentProps<"span">) {
  const { className, ...spanProps } = props
  const classes = cn("text-muted-foreground ml-auto text-xs tracking-widest", className)

  return (
    <span
      data-slot="dropdown-menu-shortcut"
      className={classes}
      {...spanProps}
    />
  )
}

/** Sub-menu root: forwards all props to the Radix Sub, tagged with a data-slot. */
function DropdownMenuSub(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.Sub>
) {
  return <DropdownMenuPrimitive.Sub data-slot="dropdown-menu-sub" {...props} />
}

/**
 * Item that opens a sub-menu: renders its children followed by a
 * right-pointing chevron. `inset` is exposed as a data attribute.
 */
function DropdownMenuSubTrigger(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.SubTrigger> & {
    inset?: boolean
  }
) {
  const { className, inset, children, ...triggerProps } = props
  const classes = cn(
    "focus:bg-accent focus:text-accent-foreground data-[state=open]:bg-accent data-[state=open]:text-accent-foreground flex cursor-default items-center rounded-sm px-2 py-1.5 text-sm outline-hidden select-none data-[inset]:pl-8",
    className
  )

  return (
    <DropdownMenuPrimitive.SubTrigger
      data-slot="dropdown-menu-sub-trigger"
      data-inset={inset}
      className={classes}
      {...triggerProps}
    >
      {children}
      <ChevronRightIcon className="ml-auto size-4" />
    </DropdownMenuPrimitive.SubTrigger>
  )
}

/** Sub-menu content panel; `className` is merged after the base classes. */
function DropdownMenuSubContent(
  props: React.ComponentProps<typeof DropdownMenuPrimitive.SubContent>
) {
  const { className, ...contentProps } = props
  const classes = cn(
    "bg-popover text-popover-foreground data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2 z-50 min-w-[8rem] origin-(--radix-dropdown-menu-content-transform-origin) overflow-hidden rounded-md border p-1 shadow-lg",
    className
  )

  return (
    <DropdownMenuPrimitive.SubContent
      data-slot="dropdown-menu-sub-content"
      className={classes}
      {...contentProps}
    />
  )
}

export {
  DropdownMenu,
  DropdownMenuPortal,
  DropdownMenuTrigger,
  DropdownMenuContent,
  DropdownMenuGroup,
  DropdownMenuLabel,
  DropdownMenuItem,
  DropdownMenuCheckboxItem,
  DropdownMenuRadioGroup,
  DropdownMenuRadioItem,
  DropdownMenuSeparator,
  DropdownMenuShortcut,
  DropdownMenuSub,
  DropdownMenuSubTrigger,
  DropdownMenuSubContent,
}


================================================
FILE: src/components/ui/empty-chat-state.tsx
================================================
"use client";

import { useEffect, useRef, useCallback } from "react";
import { useState } from "react";
import { Textarea } from "@/components/ui/textarea";
import { cn } from "@/lib/utils";
import {
    ArrowUpIcon,
    Paperclip,
    PlusIcon,
    X,
    FileText,
} from "lucide-react";
import { AttachedFile } from "@/lib/types";

/** Options for the useAutoResizeTextarea hook. */
interface UseAutoResizeTextareaProps {
    /** Height in pixels used as the initial, reset and minimum height. */
    minHeight: number;
    /** Optional upper bound in pixels; when omitted the height is not capped. */
    maxHeight?: number;
}

/**
 * Hook that keeps a textarea's height in step with its content.
 *
 * Returns the ref to attach to the textarea and an `adjustHeight` callback.
 * `adjustHeight()` sizes the element to its scroll height, clamped to
 * [minHeight, maxHeight]; `adjustHeight(true)` resets it to `minHeight`.
 * The height is also recomputed on window resize.
 */
function useAutoResizeTextarea({
    minHeight,
    maxHeight,
}: UseAutoResizeTextareaProps) {
    const textareaRef = useRef<HTMLTextAreaElement>(null);

    const adjustHeight = useCallback(
        (reset?: boolean) => {
            const el = textareaRef.current;
            if (!el) {
                return;
            }

            // Both paths start from the minimum height: a reset stops there,
            // and a measurement needs the element shrunk so scrollHeight
            // reflects the content rather than the previous height.
            el.style.height = `${minHeight}px`;
            if (reset) {
                return;
            }

            const ceiling = maxHeight ?? Number.POSITIVE_INFINITY;
            const capped = Math.min(el.scrollHeight, ceiling);
            el.style.height = `${Math.max(minHeight, capped)}px`;
        },
        [minHeight, maxHeight]
    );

    // Apply the starting height whenever minHeight changes.
    useEffect(() => {
        const el = textareaRef.current;
        if (!el) {
            return;
        }
        el.style.height = `${minHeight}px`;
    }, [minHeight]);

    // Re-measure when the window is resized; the wrapper keeps the resize
    // event from being passed through as the `reset` argument.
    useEffect(() => {
        const onResize = () => adjustHeight();
        window.addEventListener("resize", onResize);
        return () => {
            window.removeEventListener("resize", onResize);
        };
    }, [adjustHeight]);

    return { textareaRef, adjustHeight };
}

/** Props accepted by the EmptyChatState component. */
interface EmptyChatStateProps {
    /** Called on send with the trimmed message text and the currently attached files. */
    onSendMessage: (message: string, attachedFiles?: AttachedFile[]) => void;
    /** When true, sending is blocked. */
    disabled?: boolean;
    /** Placeholder text; presumably shown in the message textarea (usage is outside this excerpt). */
    placeholder?: string;
}

/**
 * Landing view shown when a chat has no messages yet: a heading, an
 * auto-resizing textarea, a file-attach button and a send button.
 *
 * Enter (without Shift) sends the trimmed text through `onSendMessage`;
 * Shift+Enter is left to the textarea's default handling. Selecting files
 * immediately forwards the accepted ones to `onSendMessage` with an empty
 * message.
 *
 * @param onSendMessage - Receives the message text and attached files.
 * @param disabled - Disables the textarea and buttons; sending becomes a no-op.
 * @param placeholder - Placeholder shown while no files are attached.
 */
export function EmptyChatState({
    onSendMessage,
    disabled = false,
    placeholder = "Ask localgpt a question..."
}: EmptyChatStateProps) {
    const [value, setValue] = useState("");
    const [attachedFiles, setAttachedFiles] = useState<AttachedFile[]>([]);
    const fileInputRef = useRef<HTMLInputElement>(null);
    const { textareaRef, adjustHeight } = useAutoResizeTextarea({
        minHeight: 60,
        maxHeight: 200,
    });

    // True when there is something to send and the component is enabled.
    const canSend = Boolean(value.trim() || attachedFiles.length > 0) && !disabled;

    const handleSend = () => {
        if (!canSend) return;
        onSendMessage(value.trim(), attachedFiles);
        setValue("");
        setAttachedFiles([]);
        adjustHeight(true);
    };

    const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
        // While an IME composition is active, Enter confirms the candidate
        // text and must not submit the half-composed message.
        if (e.nativeEvent.isComposing) return;
        if (e.key === "Enter" && !e.shiftKey) {
            e.preventDefault();
            handleSend();
        }
    };

    const handleFileAttach = () => {
        fileInputRef.current?.click();
    };

    const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
        const files = e.target.files;
        if (!files) return;

        // A file is accepted when either its MIME type or its (lower-cased)
        // file name extension matches one of the supported document formats.
        const acceptedMimeTypes = new Set([
            'application/pdf',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'application/msword',
            'text/html',
            'text/markdown',
            'text/plain',
        ]);
        const acceptedExtensions = ['.pdf', '.docx', '.doc', '.html', '.htm', '.md', '.txt'];
        const isAccepted = (file: File) => {
            if (acceptedMimeTypes.has(file.type)) return true;
            const lowerName = file.name.toLowerCase();
            return acceptedExtensions.some((ext) => lowerName.endsWith(ext));
        };

        const newFiles: AttachedFile[] = Array.from(files)
            .filter(isAccepted)
            .map((file) => ({
                id: crypto.randomUUID(),
                name: file.name,
                size: file.size,
                type: file.type,
                file: file,
            }));

        // Reset the input so selecting the same file again fires onChange.
        if (fileInputRef.current) {
            fileInputRef.current.value = '';
        }

        // Immediately hand accepted files to the parent. The local list is
        // cleared rather than appended to, because the parent now owns them.
        if (newFiles.length > 0) {
            onSendMessage("", newFiles);
            setAttachedFiles([]);
        }
    };

    // Only referenced by the remove button that is commented out below.
    const removeFile = (fileId: string) => {
        setAttachedFiles(prev => prev.filter(f => f.id !== fileId));
    };

    // Formats a byte count with at most two decimals, e.g. 1536 -> "1.5 KB".
    const formatFileSize = (bytes: number) => {
        if (bytes === 0) return '0 Bytes';
        const k = 1024;
        const sizes = ['Bytes', 'KB', 'MB', 'GB'];
        // Clamp the unit index so very large (or sub-byte) values never index
        // outside the table and render as "undefined".
        const i = Math.min(
            sizes.length - 1,
            Math.max(0, Math.floor(Math.log(bytes) / Math.log(k)))
        );
        return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
    };

    return (
        <div className="flex flex-col items-center justify-center h-full w-full max-w-4xl mx-auto p-4 space-y-8">
            <h1 className="text-4xl font-bold text-white">
                What can I help you find?
            </h1>

            <div className="w-full">
                {/* Attached Files Display */}
                {attachedFiles.length > 0 && (
                    <div className="mb-4 space-y-2">
                        <div className="text-sm text-gray-400 font-medium">Attached Files:</div>
                        <div className="space-y-2">
                            {attachedFiles.map((file) => (
                                <div key={file.id} className="flex items-center gap-3 bg-gray-800 rounded-lg p-3">
                                    <FileText className="w-5 h-5 text-red-400" />
                                    <div className="flex-1 min-w-0">
                                        <div className="text-sm text-white truncate">{file.name}</div>
                                        <div className="text-xs text-gray-400">{formatFileSize(file.size)}</div>
                                    </div>
                                    {/* The remove button is commented out as the parent will manage the state now */}
                                    {/* <button
                                        onClick={() => removeFile(file.id)}
                                        className="p-1 hover:bg-gray-700 rounded transition-colors"
                                    >
                                        <X className="w-4 h-4 text-gray-400 hover:text-white" />
                                    </button> */}
                                </div>
                            ))}
                        </div>
                    </div>
                )}

                <div className="relative bg-neutral-900 rounded-xl border border-neutral-800">
                    <div className="overflow-y-auto">
                        <Textarea
                            ref={textareaRef}
                            value={value}
                            onChange={(e) => {
                                setValue(e.target.value);
                                adjustHeight();
                            }}
                            onKeyDown={handleKeyDown}
                            placeholder={attachedFiles.length > 0 ? "Ask questions about your attached files..." : placeholder}
                            disabled={disabled}
                            className={cn(
                                "w-full px-4 py-3",
                                "resize-none",
                                "bg-transparent",
                                "border-none",
                                "text-white text-sm",
                                "focus:outline-none",
                                "focus-visible:ring-0 focus-visible:ring-offset-0",
                                "placeholder:text-neutral-500 placeholder:text-sm",
                                "min-h-[60px]",
                                disabled && "opacity-50 cursor-not-allowed"
                            )}
                            style={{
                                overflow: "hidden",
                            }}
                        />
                    </div>

                    {/* Hidden file input */}
                    <input
                        ref={fileInputRef}
                        type="file"
                        accept=".pdf,.docx,.doc,.html,.htm,.md,.txt"
                        multiple
                        onChange={handleFileChange}
                        className="hidden"
                    />

                    <div className="flex items-center justify-between p-3">
                        <div className="flex items-center gap-2">
                            <button
                                type="button"
                                onClick={handleFileAttach}
                                disabled={disabled}
                                className="group p-2 hover:bg-neutral-800 rounded-lg transition-colors flex items-center gap-1 disabled:opacity-50 disabled:cursor-not-allowed"
                                title="Attach PDF files"
                            >
                                <Paperclip className="w-4 h-4 text-white" />
                                <span className="text-xs text-zinc-400 hidden group-hover:inline transition-opacity">
                                    Attach PDF
                                </span>
                            </button>
                        </div>
                        <div className="flex items-center gap-2">
                            <button
                                type="button"
                                disabled={disabled}
                                className="px-2 py-1 rounded-lg text-sm text-zinc-400 transition-colors border border-dashed border-zinc-700 hover:border-zinc-600 hover:bg-zinc-800 flex items-center justify-between gap-1 disabled:opacity-50 disabled:cursor-not-allowed"
                            >
                                <PlusIcon className="w-4 h-4" />
                                Project
                            </button>
                            <button
                                type="button"
                                onClick={handleSend}
                                disabled={!canSend}
                                className={cn(
                                    "px-1.5 py-1.5 rounded-lg text-sm transition-colors border border-zinc-700 hover:border-zinc-600 hover:bg-zinc-800 flex items-center justify-between gap-1",
                                    canSend
                                        ? "bg-white text-black hover:bg-gray-200"
                                        : "text-zinc-400",
                                    "disabled:opacity-50 disabled:cursor-not-allowed"
                                )}
                            >
                                <ArrowUpIcon
                                    className={cn(
                                        "w-4 h-4",
                                        canSend
                                            ? "text-black"
                                            : "text-zinc-400"
                                    )}
                                />
                                <span className="sr-only">Send</span>
                            </button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    );
}

================================================
FILE: src/components/ui/localgpt-chat.tsx
================================================
"use client";

import { useEffect, useRef, useCallback } from "react";
import { useState } from "react";
import { Textarea } from "@/components/ui/textarea";
import { cn } from "@/lib/utils";
import {
    ArrowUpIcon,
    Paperclip,
    PlusIcon,
} from "lucide-react";

/** Options accepted by `useAutoResizeTextarea`. */
interface UseAutoResizeTextareaProps {
    /** Height in pixels applied on mount and on reset; also the lower bound of the computed height. */
    minHeight: number;
    /** Optional upper bound in pixels for the computed height; unbounded when omitted. */
    maxHeight?: number;
}

/**
 * Hook that sizes a textarea to fit its content between two bounds.
 *
 * @param minHeight - Height in pixels used initially, on reset, and as the lower bound.
 * @param maxHeight - Optional upper bound in pixels; unbounded when omitted.
 * @returns `textareaRef` for the textarea element, and `adjustHeight`, which
 *          re-measures the element (or restores `minHeight` when passed `true`).
 */
function useAutoResizeTextarea({
    minHeight,
    maxHeight,
}: UseAutoResizeTextareaProps) {
    const textareaRef = useRef<HTMLTextAreaElement>(null);

    const adjustHeight = useCallback(
        (reset?: boolean) => {
            const node = textareaRef.current;
            if (!node) return;

            // Shrinking to the minimum is both the reset result and the
            // precondition for measuring scrollHeight from the collapsed box.
            node.style.height = `${minHeight}px`;
            if (reset) return;

            const upperBound = maxHeight ?? Number.POSITIVE_INFINITY;
            const nextHeight = Math.max(
                minHeight,
                Math.min(node.scrollHeight, upperBound)
            );
            node.style.height = `${nextHeight}px`;
        },
        [minHeight, maxHeight]
    );

    // Start at the minimum height (re-applied if minHeight changes).
    useEffect(() => {
        if (textareaRef.current) {
            textareaRef.current.style.height = `${minHeight}px`;
        }
    }, [minHeight]);

    // Keep the height in sync when the window is resized.
    useEffect(() => {
        const onWindowResize = () => adjustHeight();
        window.addEventListener("resize", onWindowResize);
        return () => {
            window.removeEventListener("resize", onWindowResize);
        };
    }, [adjustHeight]);

    return { textareaRef, adjustHeight };
}

/**
 * Standalone chat prompt box: a heading, an auto-resizing textarea and
 * Attach / Project / Send buttons.
 *
 * The component keeps the draft text locally. Pressing Enter without Shift
 * clears a non-empty draft and resets the textarea height; the buttons have
 * no click handlers attached here.
 */
export function LocalGPTChat() {
    const [value, setValue] = useState("");
    const { textareaRef, adjustHeight } = useAutoResizeTextarea({
        minHeight: 60,
        maxHeight: 200,
    });

    const hasText = Boolean(value.trim());

    const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
        // While an IME composition is active, Enter confirms the candidate
        // text; treating it as "submit" would wipe the half-composed draft.
        if (e.nativeEvent.isComposing) return;
        if (e.key === "Enter" && !e.shiftKey) {
            e.preventDefault();
            if (hasText) {
                setValue("");
                adjustHeight(true);
            }
        }
    };

    return (
        <div className="flex flex-col items-center w-full max-w-4xl mx-auto p-4 space-y-8">
            <h1 className="text-4xl font-bold text-white">
                What can I help you find?
            </h1>

            <div className="w-full">
                <div className="relative bg-neutral-900 rounded-xl border border-neutral-800">
                    <div className="overflow-y-auto">
                        <Textarea
                            ref={textareaRef}
                            value={value}
                            onChange={(e) => {
                                setValue(e.target.value);
                                adjustHeight();
                            }}
                            onKeyDown={handleKeyDown}
                            placeholder="Ask localgpt a question..."
                            className={cn(
                                "w-full px-4 py-3",
                                "resize-none",
                                "bg-transparent",
                                "border-none",
                                "text-white text-sm",
                                "focus:outline-none",
                                "focus-visible:ring-0 focus-visible:ring-offset-0",
                                "placeholder:text-neutral-500 placeholder:text-sm",
                                "min-h-[60px]"
                            )}
                            style={{
                                overflow: "hidden",
                            }}
                        />
                    </div>

                    <div className="flex items-center justify-between p-3">
                        <div className="flex items-center gap-2">
                            <button
                                type="button"
                                className="group p-2 hover:bg-neutral-800 rounded-lg transition-colors flex items-center gap-1"
                            >
                                <Paperclip className="w-4 h-4 text-white" />
                                <span className="text-xs text-zinc-400 hidden group-hover:inline transition-opacity">
                                    Attach
                                </span>
                            </button>
                        </div>
                        <div className="flex items-center gap-2">
                            <button
                                type="button"
                                className="px-2 py-1 rounded-lg text-sm text-zinc-400 transition-colors border border-dashed border-zinc-700 hover:border-zinc-600 hover:bg-zinc-800 flex items-center justify-between gap-1"
                            >
                                <PlusIcon className="w-4 h-4" />
                                Project
                            </button>
                            <button
                                type="button"
                                className={cn(
                                    "px-1.5 py-1.5 rounded-lg text-sm transition-colors border border-zinc-700 hover:border-zinc-600 hover:bg-zinc-800 flex items-center justify-between gap-1",
                                    hasText
                                        ? "bg-white text-black"
                                        : "text-zinc-400"
                                )}
                            >
                                <ArrowUpIcon
                                    className={cn(
                                        "w-4 h-4",
                                        hasText
                                            ? "text-black"
                                            : "text-zinc-400"
                                    )}
                                />
                                <span className="sr-only">Send</span>
                            </button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    );
}

 
================================================
FILE: src/components/ui/message-loading.tsx
================================================
"use client"

/**
 * Three-dot loading indicator drawn as a 24x24 inline SVG.
 *
 * Each dot animates its `cy` through 12 -> 6 -> 12 over 0.6s. The second and
 * third dots start 0.1s and 0.2s after the first one begins, and the first
 * dot restarts 0.25s after the third one ends.
 */
function MessageLoading() {
  // Geometry and paint shared by every dot; only `cx` differs.
  const dot = { cy: "12", r: "2", fill: "currentColor" };

  // Animation settings shared by every dot; only `id` / `begin` differ.
  const bounce = {
    attributeName: "cy",
    calcMode: "spline",
    dur: "0.6s",
    values: "12;6;12",
    keySplines: ".33,.66,.66,1;.33,0,.66,.33",
  };

  return (
    <svg
      width="24"
      height="24"
      viewBox="0 0 24 24"
      xmlns="http://www.w3.org/2000/svg"
      className="text-foreground"
    >
      <circle cx="4" {...dot}>
        <animate
          id="spinner_qFRN"
          begin="0;spinner_OcgL.end+0.25s"
          {...bounce}
        />
      </circle>
      <circle cx="12" {...dot}>
        <animate begin="spinner_qFRN.begin+0.1s" {...bounce} />
      </circle>
      <circle cx="20" {...dot}>
        <animate
          id="spinner_OcgL"
          begin="spinner_qFRN.begin+0.2s"
          {...bounce}
        />
      </circle>
    </svg>
  );
}

export { MessageLoading }; 

================================================
FILE: src/components/ui/quick-chat.tsx
================================================
"use client";

import React, { useState, useEffect } from 'react';
import { ChatInput } from '@/components/ui/chat-input';
import { chatAPI, ChatMessage } from '@/lib/api';
import { ConversationPage } from '@/components/ui/conversation-page';
import { ChatSettingsModal } from '@/components/ui/chat-settings-modal';

/** Props for the `QuickChat` component. */
interface QuickChatProps {
  /**
   * Externally selected session id. When it changes to a value different
   * from the locally tracked one, that session's messages are fetched.
   */
  sessionId?: string;
  /** Invoked with the session object when QuickChat creates a backend session itself. */
  onSessionChange?: (s: any) => void;
  /** Extra class names appended to the root container; defaults to an empty string. */
  className?: string;
}

/**
 * Lightweight chat panel that talks to the backend through `chatAPI`.
 *
 * Shows a centred prompt while there are no messages, and the conversation
 * with an input underneath once a message was sent. A settings modal lets
 * the user pick the generation model.
 *
 * @param sessionId - Externally selected session; switching it loads that session's messages.
 * @param onSessionChange - Called with the new session when one is created on first send.
 * @param className - Extra class names for the root container.
 */
export function QuickChat({ sessionId: externalSessionId, onSessionChange, className="" }: QuickChatProps) {
  const [messages, setMessages] = useState<ChatMessage[]>([]);
  const [isLoading, setIsLoading] = useState(false);
  const [sessionId, setSessionId] = useState<string | undefined>(externalSessionId);
  const [generationModels, setGenerationModels] = useState<string[]>([]);
  const [selectedModel, setSelectedModel] = useState<string>('');
  const [showSettings, setShowSettings] = useState(false);
  const api = chatAPI;

  // Sync prop -> state: when a different session is selected externally,
  // adopt its id and load its stored messages.
  useEffect(() => {
    if (!externalSessionId || externalSessionId === sessionId) return;

    // Set by the cleanup so that a response for a session the user has
    // already navigated away from cannot overwrite the current messages.
    let cancelled = false;

    setSessionId(externalSessionId);
    (async () => {
      try {
        const data = await api.getSession(externalSessionId);
        if (cancelled) return;
        // Convert DB messages to the ChatMessage format expected by the UI
        const msgs: ChatMessage[] = data.messages.map((m: any) => api.convertDbMessage(m));
        setMessages(msgs);
      } catch (err) {
        console.error('Failed to load messages for session', err);
        if (!cancelled) {
          setMessages([]);
        }
      }
    })();

    return () => {
      cancelled = true;
    };
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [externalSessionId]);

  // Fetch available models and preselect 'qwen3:8b' when present,
  // otherwise the first model returned.
  useEffect(() => {
    (async () => {
      try {
        const resp = await api.getModels();
        setGenerationModels(resp.generation_models || []);
        if (resp.generation_models && resp.generation_models.length > 0) {
          const def = resp.generation_models.find((m: string) => m === 'qwen3:8b');
          setSelectedModel(def || resp.generation_models[0]);
        }
      } catch (e) {
        console.warn('Failed to load models', e);
      }
    })();
  }, [api]);

  // Appends the user's message, makes sure a backend session exists, then
  // requests a reply and appends it. Blank input is ignored.
  const sendMessage = async (content: string, _files?: any) => {
    if (!content.trim()) return;

    const userMsg: ChatMessage = {
      id: crypto.randomUUID(),
      content,
      sender: 'user',
      timestamp: new Date().toISOString(),
    };
    setMessages((prev) => [...prev, userMsg]);

    setIsLoading(true);

    // Ensure we have a backend session to preserve history on the agent side.
    // A failure here is logged and the message is still sent.
    if (!sessionId) {
      try {
        const newSess = await api.createSession('Quick Chat');
        setSessionId(newSess.id);
        onSessionChange?.(newSess);
      } catch (err) {
        console.error('Failed to create quick-chat session', err);
      }
    }

    try {
      // `messages` is the list from before this send, so the history does
      // not contain the message passed separately as `message`.
      const history = api.messagesToHistory(messages);
      // NOTE(review): the session id is not part of this request — confirm
      // whether the backend needs it to associate the exchange with the session.
      const resp = await api.sendMessage({ message: content, conversation_history: history, model: selectedModel });

      const assistantMsg: ChatMessage = {
        id: crypto.randomUUID(),
        content: resp.response,
        sender: 'assistant',
        timestamp: new Date().toISOString(),
      };
      setMessages((prev) => [...prev, assistantMsg]);
    } catch (err) {
      console.error('Quick chat failed', err);
    } finally {
      setIsLoading(false);
    }
  };

  const showEmptyState = messages.length === 0 && !isLoading;

  // Same input element for both layouts.
  const chatInput = (
    <ChatInput onSendMessage={sendMessage} disabled={isLoading} placeholder="Ask anything…" onOpenSettings={()=>setShowSettings(true)} />
  );

  return (
    <div className={`flex flex-col h-full ${className}`}>
      {showEmptyState ? (
        <div className="flex-1 flex flex-col items-center justify-center gap-6">
          <div className="text-center text-2xl font-semibold text-gray-300 select-none">What can I help you find today?</div>
          <div className="w-full max-w-2xl px-4">
            {chatInput}
          </div>
        </div>
      ) : (
        <>
          <ConversationPage messages={messages} isLoading={isLoading} className="flex-1 overflow-y-auto" />
          <div className="flex-shrink-0">
            {chatInput}
          </div>
        </>
      )}
      {showSettings && (
        <ChatSettingsModal
          onClose={()=>setShowSettings(false)}
          options={[
            { type:'dropdown', label:'LLM model', value:selectedModel, setter:setSelectedModel, options:generationModels.map(m=>({value:m,label:m})) }
          ]}
        />
      )}
    </div>
  );
}

================================================
FILE: src/components/ui/scroll-area.tsx
================================================
"use client"

import * as React from "react"
import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area"

import { cn } from "@/lib/utils"

/**
 * Scroll container built on the Radix scroll-area primitives: a root, a
 * viewport holding `children`, the default `ScrollBar`, and a corner.
 *
 * `className` is merged onto the root; all other props are forwarded to it.
 */
function ScrollArea(
  props: React.ComponentProps<typeof ScrollAreaPrimitive.Root>
) {
  const { className, children, ...rootProps } = props
  const rootClassName = cn("relative h-full w-full", className)

  return (
    <ScrollAreaPrimitive.Root
      data-slot="scroll-area"
      className={rootClassName}
      {...rootProps}
    >
      <ScrollAreaPrimitive.Viewport
        data-slot="scroll-area-viewport"
        className="focus-visible:ring-ring/50 size-full rounded-[inherit] transition-[color,box-shadow] outline-none focus-visible:ring-[3px] focus-visible:outline-1"
      >
        {children}
      </ScrollAreaPrimitive.Viewport>
      <ScrollBar />
      <ScrollAreaPrimitive.Corner />
    </ScrollAreaPrimitive.Root>
  )
}

/**
 * Scrollbar track and thumb for `ScrollArea`.
 *
 * `orientation` defaults to "vertical" and selects the axis-specific sizing
 * classes; `className` is merged last and remaining props are forwarded.
 */
function ScrollBar(
  props: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>
) {
  const { className, orientation = "vertical", ...scrollbarProps } = props

  // Axis-specific sizing; stays `false` (ignored by `cn`) for any other value.
  let axisClasses: string | false = false
  if (orientation === "vertical") {
    axisClasses = "h-full w-2.5 border-l border-l-transparent"
  } else if (orientation === "horizontal") {
    axisClasses = "h-2.5 flex-col border-t border-t-transparent"
  }

  return (
    <ScrollAreaPrimitive.ScrollAreaScrollbar
      data-slot="scroll-area-scrollbar"
      orientation={orientation}
      className={cn(
        "flex touch-none p-px transition-colors select-none",
        axisClasses,
        className
      )}
      {...scrollbarProps}
    >
      <ScrollAreaPrimitive.ScrollAreaThumb
        data-slot="scroll-area-thumb"
        className="bg-border relative flex-1 rounded-full"
      />
    </ScrollAreaPrimitive.ScrollAreaScrollbar>
  )
}

export { ScrollArea, ScrollBar }


================================================
FILE: src/components/ui/separator.tsx
================================================
"use client"

import * as React from "react"
import * as SeparatorPrimitive from "@radix-ui/react-separator"

import { cn } from "@/lib/utils"

/**
 * Thin divider built on the Radix separator primitive.
 *
 * Defaults to a horizontal, decorative separator. `className` is merged
 * after the base classes and any remaining props are forwarded to the root.
 */
function Separator(
  props: React.ComponentProps<typeof SeparatorPrimitive.Root>
) {
  const {
    className,
    orientation = "horizontal",
    decorative = true,
    ...rootProps
  } = props

  const separatorClassName = cn(
    "bg-border shrink-0 data-[orientation=horizontal]:h-px data-[orientation=horizontal]:w-full data-[orientation=vertical]:h-full data-[orientation=vertical]:w-px",
    className
  )

  return (
    <SeparatorPrimitive.Root
      data-slot="separator"
      decorative={decorative}
      orientation={orientation}
      className={separatorClassName}
      {...rootProps}
    />
  )
}

export { Separator }


================================================
FILE: src/components/ui/session-chat.tsx
================================================
"use client"

import * as React from "react"
import { ConversationPage } from "./conversation-page"
import { ChatInput } from "./chat-input"
import { EmptyChatState } from "./empty-chat-state"
import { ChatMessage, ChatSession, chatAPI, generateUUID } from "@/lib/api"
import { AttachedFile } from "@/lib/types"
import { useEffect, useState, forwardRef, useImperativeHandle, useCallback } from "react"
import { normalizeStreamingToken } from "@/utils/textNormalization"
import { Button } from "./button"
import type { Step } from '@/lib/api'
import { ChatSettingsModal } from '@/components/ui/chat-settings-modal'
import { IndexForm } from '@/components/IndexForm'
import SessionIndexInfo from '@/components/SessionIndexInfo'
import { Database } from 'lucide-react'

/**
 * Props accepted by `SessionChat`.
 * NOTE(review): the component body is not fully visible from this excerpt, so
 * the descriptions below are limited to what the names and types state.
 */
interface SessionChatProps {
  /** Optional id of the chat session. */
  sessionId?: string
  /** Optional callback receiving a `ChatSession` — presumably fired when the active session changes; confirm against the component body. */
  onSessionChange?: (session: ChatSession) => void
  /** Optional callback receiving a `ChatMessage` — presumably fired for each new message; confirm against the component body. */
  onNewMessage?: (message: ChatMessage) => void
  /** Optional class names; the component defaults this to an empty string. */
  className?: string
}

/**
 * Imperative handle type that `SessionChat` exposes through `forwardRef`, so
 * parent components can send messages and read the current session.
 */
export interface SessionChatRef {
  /** Sends `content`, optionally with attached files; resolves when the send completes. */
  sendMessage: (content: string, attachedFiles?: AttachedFile[]) => Promise<void>
  /** The session currently held by the component, or `null` when there is none. */
  currentSession: ChatSession | null
}

/**
 * Shortens a long title to at most `n` characters followed by an ellipsis.
 *
 * Length is counted in Unicode code points rather than UTF-16 code units, so
 * a character outside the BMP (e.g. an emoji) is never cut in half.
 *
 * @param str - Text to shorten.
 * @param n - Maximum number of characters to keep (default 18).
 * @returns `str` unchanged when it fits, otherwise its first `n` characters plus '…'.
 */
const truncate = (str: string, n: number = 18) => {
  // Code units >= code points, so a string that fits by code units always fits.
  if (str.length <= n) return str;
  const chars = Array.from(str);
  return chars.length > n ? chars.slice(0, n).join('') + '…' : str;
};

export const SessionChat = forwardRef<SessionChatRef, SessionChatProps>(({ 
  sessionId,
  onSessionChange,
  onNewMessage,
  className = ""
}, ref) => {
  // --- Conversation state ---
  const [messages, setMessages] = useState<ChatMessage[]>([])
  // True while a send / upload / indexing request is in flight; used to disable the inputs.
  const [isLoading, setIsLoading] = useState(false)
  const [currentSession, setCurrentSession] = useState<ChatSession | null>(null)
  // Text shown in the error banner at the top of the component.
  const [error, setError] = useState<string | null>(null)
  // Files returned by the last upload; chat is blocked while this is non-empty and isIndexed is false.
  const [uploadedFiles, setUploadedFiles] = useState<{filename: string, stored_path: string}[]>([])
  const [isIndexed, setIsIndexed] = useState(false)
  // --- Pipeline toggles, forwarded as request options by sendMessage ---
  const [composeSubAnswers, setComposeSubAnswers] = useState<boolean>(true)
  const [enableDecompose, setEnableDecompose] = useState<boolean>(true)
  const [enableAiRerank, setEnableAiRerank] = useState<boolean>(true)
  const [enableContextExpand, setEnableContextExpand] = useState<boolean>(true)
  // Chooses between streamSessionMessage (true) and sendSessionMessage (false).
  const [enableStream, setEnableStream] = useState<boolean>(true)
  const [enableVerify, setEnableVerify] = useState<boolean>(true)
  // Force RAG toggle
  const [forceDocs, setForceDocs] = useState<boolean>(false)
  // Provence pruning toggle
  const [provencePrune, setProvencePrune] = useState<boolean>(false)
  
  // ✨ NEW RETRIEVAL PARAMETERS
  const [retrievalK, setRetrievalK] = useState<number>(20)
  const [contextWindowSize, setContextWindowSize] = useState<number>(1)
  const [rerankerTopK, setRerankerTopK] = useState<number>(10)
  const [searchType, setSearchType] = useState<string>('hybrid')
  // Model names offered in the settings dropdown; filled by the models effect.
  const [generationModels,setGenerationModels]=useState<string[]>([])
  const [selectedModel,setSelectedModel]=useState<string>('qwen3:8b')
  // Index linked to the session; the id is used to build the `text_pages_<id>` table name when streaming.
  const [currentIndexId, setCurrentIndexId] = useState<string | null>(null)
  const [currentIndexName, setCurrentIndexName] = useState<string | null>(null)
  // --- Modal visibility flags ---
  const [showSettings, setShowSettings] = useState(false)
  const [showIndexForm, setShowIndexForm] = useState(false)
  const [showIndexInfo, setShowIndexInfo] = useState(false)
  
  const apiService = chatAPI

  // Define loadSession with useCallback before useEffect
  /**
   * Load a session and its messages from the backend into component state,
   * notify the parent, then resolve the index linked to that session.
   *
   * The index state is always brought in line with the loaded session: when
   * the session has no linked index (or the lookup fails) it is reset to
   * null, so an index belonging to a previously viewed session is never
   * carried over into queries for this one.
   *
   * @param id - id of the session to load
   */
  const loadSession = useCallback(async (id: string) => {
    try {
      setError(null)
      const { session, messages: sessionMessages } = await apiService.getSession(id)
      
      const convertedMessages = sessionMessages.map((msg: unknown) => apiService.convertDbMessage(msg as Record<string, unknown>))
      setMessages(convertedMessages)
      setCurrentSession(session)
      
      if (onSessionChange) {
        onSessionChange(session)
      }

      // Fetch linked indexes to know table name for streaming
      try {
        const idxResp = await apiService.getSessionIndexes(id)
        if (idxResp.indexes && idxResp.indexes.length > 0) {
          // The most recently listed index is treated as the active one.
          const lastIdxObj = idxResp.indexes[idxResp.indexes.length - 1] as any
          const idxId = (lastIdxObj.index_id ?? lastIdxObj.id ?? null) as string | null
          setCurrentIndexId(idxId)
          // Fall back to a short id prefix only when an id actually exists.
          setCurrentIndexName(lastIdxObj.name ?? lastIdxObj.title ?? (idxId ? String(idxId).slice(0, 8) : null))
        } else {
          // No index linked to this session: drop whatever the previous session had.
          setCurrentIndexId(null)
          setCurrentIndexName(null)
        }
      } catch (idxError) {
        // Best-effort lookup: keep the session usable, but do not keep a stale index.
        // sendMessage re-fetches the index when currentIndexId is null.
        console.warn('Failed to load session indexes:', idxError)
        setCurrentIndexId(null)
        setCurrentIndexName(null)
      }
    } catch (error) {
      console.error('Failed to load session:', error)
      setError('Failed to load session')
    }
  }, [apiService, onSessionChange])

  // Keep the displayed conversation in sync with the sessionId prop
  useEffect(() => {
    if (!sessionId) {
      // No session selected: reset to an empty conversation
      setMessages([])
      setCurrentSession(null)
      return
    }

    // Skip the fetch when the session already held in state is the requested
    // one; this avoids overwriting messages right after a session is created.
    const alreadyLoaded = currentSession?.id === sessionId
    if (!alreadyLoaded) {
      loadSession(sessionId)
    }
  }, [sessionId, currentSession, loadSession])

  // Populate the model dropdown once the API service is available
  useEffect(() => {
    const PREFERRED_MODEL = 'qwen3:8b'

    const fetchModels = async () => {
      try {
        const resp = await apiService.getModels()
        const available = resp.generation_models
        setGenerationModels(available || [])

        if (available && available.length > 0) {
          // Prefer the default model when the backend offers it, else take the first entry
          const preferred = available.find((name: string) => name === PREFERRED_MODEL)
          setSelectedModel(preferred || available[0])
        }
      } catch (e) {
        console.warn('Failed to load models', e)
      }
    }

    fetchModels()
  }, [apiService])

  /**
   * Send a chat message, or upload files when attachments are present.
   *
   * Flow:
   *  1. Refuse to act while uploaded files are waiting to be indexed, or when
   *     there is neither text nor attachments.
   *  2. Create a session first when no `sessionId` prop is set.
   *  3. With attachments: upload them and stop (any text is ignored).
   *  4. Otherwise append the user message, resolve the linked index id, and
   *     either stream the answer (updating a step-by-step placeholder message
   *     from the stream events) or send a single non-streaming request.
   *
   * Errors are reported through the `error` state / console; the returned
   * promise does not reject for request failures.
   *
   * @param content - message text typed by the user
   * @param attachedFiles - optional files; when non-empty this call is an upload
   */
  const sendMessage = async (content: string, attachedFiles?: AttachedFile[]) => {
    // --- Guard Clauses ---
    // If files are being indexed, do nothing.
    if (uploadedFiles.length > 0 && !isIndexed) {
      console.warn("sendMessage called while waiting for indexing. Action blocked.");
      return;
    }
    // If no content and no files, do nothing.
    if (!content.trim() && (!attachedFiles || attachedFiles.length === 0)) return;

    try {
      setError(null)
      
      let activeSessionId = sessionId
      if (!activeSessionId) {
        try {
          const newSession = await apiService.createSession()
          activeSessionId = newSession.id
          setCurrentSession(newSession)
          if (onSessionChange) {
            onSessionChange(newSession)
          }
        } catch (error) {
          console.error('Failed to create session:', error)
          setError('Failed to create session')
          return
        }
      }

      // --- Action Router: Decide if this is an upload or a chat message ---
      
      // A) UPLOAD ACTION: If files are attached, this action's priority is to upload. Ignore any text content.
      if (attachedFiles && attachedFiles.length > 0) {
        setIsLoading(true)
        try {
          const files = attachedFiles.map(af => af.file)
          const uploadResult = await apiService.uploadFiles(activeSessionId, files)
          console.log('✅ Files uploaded successfully:', uploadResult)
          
          setUploadedFiles(uploadResult.uploaded_files)
          setIsIndexed(false)

          const uploadMessage = apiService.createMessage(
            `📎 Uploaded ${uploadResult.uploaded_files.length} file(s): ${uploadResult.uploaded_files.map(f => f.filename).join(', ')}. Please click 'Index Documents' to chat with them.`,
            'assistant'
          )
          setMessages(prev => [...prev, uploadMessage])
        } catch (error) {
          console.error('❌ Failed to upload files:', error)
          const errorMessage = apiService.createMessage('❌ Failed to upload files. Please try again.', 'assistant')
          setMessages(prev => [...prev, errorMessage])
        } finally {
          setIsLoading(false)
        }
        return; // End the function here.
      }

      // B) CHAT ACTION: If no files, it's a standard chat message.
      if (!content.trim()) return;

      const userMessage = apiService.createMessage(content, 'user')
      setMessages(prev => [...prev, userMessage])
      if (onNewMessage) onNewMessage(userMessage)

      setIsLoading(true)

      // Ensure we know the index id for table_name; fetch if missing
      let idxId = currentIndexId;
      if (!idxId) {
        try {
          const idxResp = await apiService.getSessionIndexes(activeSessionId as string);
          if (idxResp.indexes && idxResp.indexes.length > 0) {
            const lastIdxObj = idxResp.indexes[idxResp.indexes.length - 1] as any;
            idxId = (lastIdxObj.index_id ?? lastIdxObj.id ?? null) as string | null;
            setCurrentIndexId(idxId);
            // Only derive a name from the id when an id is actually present.
            setCurrentIndexName(lastIdxObj.name ?? lastIdxObj.title ?? (idxId ? String(idxId).slice(0, 8) : null));
          }
        } catch (idxError) {
          // Best-effort: without an index id the request is sent with no table_name.
          console.warn('Failed to resolve session index before sending:', idxError);
        }
      }

      if (enableStream) {
        // Stepwise progress structure
        const steps: Step[] = [
          { key: 'analyze', label: 'Analyzing user question', status: 'pending' as const, details: '' },
          { key: 'decompose', label: 'Generating sub-queries', status: 'pending' as const, details: '' },
          { key: 'retrieval', label: 'Retrieving context', status: 'pending' as const, details: '' },
          { key: 'rerank', label: 'Reranking results', status: 'pending' as const, details: '' },
          { key: 'expand', label: 'Expanding context window', status: 'pending' as const, details: '' },
          { key: 'answer', label: 'Answering sub-queries', status: 'pending' as const, details: [] },
          { key: 'synthesize', label: 'Putting everything together', status: 'pending' as const, details: '' },
          { key: 'final', label: 'Final answer', status: 'pending' as const, details: '' },
        ];
        const placeholder: ChatMessage = {
          id: generateUUID(),
          content: { steps },
          sender: 'assistant',
          timestamp: new Date().toISOString(),
          isLoading: false,
          metadata: { message_type: 'in_progress' }
        }
        setMessages(prev => {
          const withoutLoaders = prev.filter(m => m.metadata?.message_type !== 'in_progress' && !m.isLoading)
          return [...withoutLoaders, placeholder]
        })
        // keep global isLoading true so input disabled until completion

        // The post-completion session refresh is scheduled from inside a state
        // updater; this flag guarantees it is scheduled at most once per stream
        // even if React invokes the updater more than once.
        let sessionRefreshScheduled = false;

        await apiService.streamSessionMessage(
          {
            query: content,
            session_id: activeSessionId,
            table_name: idxId ? `text_pages_${idxId}` : undefined,
            composeSubAnswers,
            decompose: enableDecompose,
            aiRerank: enableAiRerank,
            contextExpand: enableContextExpand,
            verify: enableVerify,
            model: selectedModel,
            // ✨ NEW RETRIEVAL PARAMETERS
            retrievalK,
            contextWindowSize,
            rerankerTopK,
            searchType,
            forceRag: forceDocs,
            provencePrune,
          },
          (evt) => {
            console.log('STREAM EVENT:', evt.type, evt.data); // Debug log for SSE events
            setMessages(prev => prev.map(m => {
              if (m.id !== placeholder.id) return m;
              // Clone every step object: the assignments below must not touch
              // objects that belong to the previous state snapshot.
              const steps: any[] = ((m.content as any).steps as any[]).map(s => ({ ...s }));
              if (evt.type === 'analyze') {
                steps[0].status = 'active';
                steps[0].details = 'Analyzing your question...';
                return { ...m, content: { steps } };
              }
              if (evt.type === 'decomposition') {
                steps[0].status = 'done';
                steps[1].status = 'active';
                steps[1].details = (evt.data.sub_queries || []);
                return { ...m, content: { steps } };
              }
              if (evt.type === 'retrieval_started') {
                steps[1].status = 'done';
                steps[2].status = 'active';
                steps[2].details = 'Retrieving relevant documents...';
                return { ...m, content: { steps } };
              }
              if (evt.type === 'retrieval_done') {
                const ridx = steps.findIndex(s => s.key === 'retrieval');
                if (ridx !== -1) {
                  steps[ridx].status = 'done';
                  steps[ridx].details = 'Retrieval complete.';
                }
                const rrxIdx = steps.findIndex(s => s.key === 'rerank');
                if (rrxIdx !== -1) {
                  steps[rrxIdx].status = 'active';
                  steps[rrxIdx].details = 'Reranking results...';
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'rerank_started') {
                const rrxIdx = steps.findIndex(s => s.key === 'rerank');
                if (rrxIdx !== -1) {
                  steps[rrxIdx].status = 'active';
                  steps[rrxIdx].details = 'Reranking results...';
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'rerank_done') {
                const rrxIdx = steps.findIndex(s => s.key === 'rerank');
                if (rrxIdx !== -1) {
                  steps[rrxIdx].status = 'done';
                  steps[rrxIdx].details = 'Reranking complete.';
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'context_expand_started') {
                const eidx = steps.findIndex(s => s.key === 'expand');
                if (eidx !== -1) {
                  steps[eidx].status = 'active';
                  steps[eidx].details = 'Expanding context window...';
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'context_expand_done') {
                const eidx = steps.findIndex(s => s.key === 'expand');
                if (eidx !== -1) {
                  steps[eidx].status = 'done';
                  steps[eidx].details = 'Context expansion complete.';
                }
                // Activate answering sub-queries stage to show spinner while we wait
                const ansIdx = steps.findIndex(s => s.key === 'answer');
                if (ansIdx !== -1 && steps[ansIdx].status === 'pending') {
                  steps[ansIdx].status = 'active';
                  steps[ansIdx].details = 'Answering sub-queries...';
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'sub_query_result') {
                steps[5].status = 'active';
                const existing = Array.isArray(steps[5].details) ? steps[5].details : [];
                if (!existing.some((d: any) => d.question === evt.data.query)) {
                  steps[5].details = [...existing, {
                    question: evt.data.query,
                    answer: evt.data.answer,
                    source_documents: evt.data.source_documents || []
                  }];
                } else {
                  steps[5].details = existing; // no change if duplicate
                }
                return { ...m, content: { steps } };
              }
              if (evt.type === 'final_answer' || evt.type === 'single_query_result') {
                steps[5].status = 'done';
                steps[6].status = 'active';
                steps[6].details = 'Synthesizing final answer...';
                // NOTE(review): `isLoading` here is the value captured when this
                // sendMessage closure was created, not the live state — confirm intent.
                if (isLoading) setIsLoading(false);
                return { ...m, content: { steps } };
              }
              if (evt.type === 'token') {
                // Determine final step index dynamically (7 for RAG, 0 for direct)
                const finalIdx = steps.findIndex(s => s.key === 'final' || s.key === 'direct');
                if (finalIdx === -1) return m;
                if (steps[finalIdx].key !== 'direct') {
                  steps[6].status = 'done';
                  steps[7].status = 'active';
                } else {
                  steps[0].status = 'active';
                }
                // Text accumulated so far may be stored as a string or as { answer }
                let current = '' as string;
                const detHolder = steps[finalIdx].details;
                if (detHolder && typeof detHolder === 'object' && !Array.isArray(detHolder)) {
                  current = (detHolder as any).answer || '';
                } else if (typeof detHolder === 'string') {
                  current = detHolder;
                }
                const tok: string = (evt.data.text || '') as string;
                if (!tok.trim()) {
                  return m; // skip empty/whitespace-only chunks
                }
                // NOTE(review): the endsWith check also drops a token that legitimately
                // repeats the current tail — kept as-is; confirm whether it is still needed.
                let updated = current.endsWith(tok) ? current : current + tok;
                updated = normalizeStreamingToken('', updated);
                // While streaming, the running answer is kept as a plain string
                steps[finalIdx].details = updated;
                // Mark "Putting everything together" step as done once tokens start
                const synthIdx = steps.findIndex(s => s.key === 'synthesize');
                if (synthIdx !== -1 && steps[synthIdx].status !== 'done') {
                  steps[synthIdx].status = 'done';
                }
                if (isLoading) setIsLoading(false);
                return { ...m, content: { steps } };
              }
              if (evt.type === 'sub_query_token') {
                const idx = evt.data.index as number;
                const tok: string = evt.data.text || '';
                if (!tok.trim()) return m;
                steps[5].status = 'active';
                // Copy the array and its entries so the push / answer update below
                // do not modify the previous state's details.
                const detailsArr: any[] = Array.isArray(steps[5].details)
                  ? (steps[5].details as any[]).map(d => ({ ...d }))
                  : [];
                while (detailsArr.length <= idx) {
                  detailsArr.push({ question: evt.data.question || `Sub-query ${idx+1}`, answer: '' });
                }
                const curAns: string = detailsArr[idx].answer || '';
                if (!curAns.endsWith(tok)) {
                  let updatedAnswer = curAns + tok;
                  updatedAnswer = normalizeStreamingToken('', updatedAnswer);
                  detailsArr[idx].answer = updatedAnswer;
                }
                steps[5].details = detailsArr;
                if (isLoading) setIsLoading(false);
                return { ...m, content: { steps } };
              }
              if (evt.type === 'complete') {
                const finalIdx = steps.findIndex(s => s.key === 'final' || s.key === 'direct');
                if (finalIdx === -1) return m;
                steps[finalIdx].status = 'done';

                if (steps[finalIdx].key === 'direct') {
                  // Direct answer: details is plain string
                  steps[finalIdx].details = evt.data.answer;
                } else {
                  steps[finalIdx].details = {
                    answer: evt.data.answer,
                    source_documents: evt.data.source_documents || []
                  };
                }

                setIsLoading(false);
                // Make sure any lingering steps are marked done
                steps.forEach(s => {
                  if (s.status !== 'done') s.status = 'done';
                });
                
                // 🔄 REFRESH SESSION: After completion, refresh session data to get updated title
                if (activeSessionId && !sessionRefreshScheduled) {
                  sessionRefreshScheduled = true;
                  // Always refresh session data so updated title & message count are reflected in the UI
                  setTimeout(async () => {
                    try {
                      const { session } = await apiService.getSession(activeSessionId as string);
                      setCurrentSession(session);
                      if (onSessionChange) {
                        onSessionChange(session);
                      }
                    } catch (error) {
                      console.error('Failed to refresh session after completion:', error);
                    }
                  }, 100); // Small delay to ensure backend has processed the title update
                }
                
                return { ...m, content: { steps }, metadata: { message_type: 'complete' } };
              }
              if (evt.type === 'direct_answer') {
                // Replace the multi-step RAG progress with a single direct-answer step
                const stepsDir: Step[] = [
                  { key: 'direct', label: 'Answering directly', status: 'active' as const, details: '' }
                ];
                return { ...m, content: { steps: stepsDir } };
              }
              return m;
            }));
          }
        )
      } else {
        const response = await apiService.sendSessionMessage(activeSessionId, content, { 
          composeSubAnswers, 
          decompose: enableDecompose, 
          aiRerank: enableAiRerank, 
          contextExpand: enableContextExpand, 
          verify: enableVerify,
          model: selectedModel,
          // ✨ NEW RETRIEVAL PARAMETERS
          retrievalK,
          contextWindowSize,
          rerankerTopK,
          searchType,
          forceRag: forceDocs,
          provencePrune,
        })

        const aiMessage: ChatMessage = {
          id: response.ai_message_id || generateUUID(),
          content: response.response,
          sender: 'assistant',
          timestamp: new Date().toISOString(),
          metadata: { 
            message_type: 'sub_answer',
            source_documents: (response as any).source_documents || [] 
          }
        }
        setMessages(prev => [...prev, aiMessage])

        // The response may carry an updated session object; propagate it when present
        if ((response as any).session) {
          const sess = (response as any).session as ChatSession
          setCurrentSession(sess)
          if (onSessionChange) onSessionChange(sess)
        }
        if (onNewMessage) onNewMessage(aiMessage)
      }

    } catch (error) {
      console.error('Failed to send message:', error)
      setError('Failed to send message')
    } finally {
      setIsLoading(false)
    }
  }

  /**
   * Index the documents uploaded to the current session and report the
   * outcome as an assistant message. Does nothing when no session is loaded.
   */
  const handleIndexDocuments = async () => {
    if (!currentSession) {
      return;
    }

    // Appends an assistant-authored note to the conversation.
    const appendAssistantNote = (text: string) => {
      const note = apiService.createMessage(text, 'assistant');
      setMessages(existing => [...existing, note]);
    };

    setIsLoading(true);
    setError(null);

    try {
      const outcome = await apiService.indexDocuments(currentSession.id);
      console.log('✅ Indexing complete:', outcome);

      appendAssistantNote(`✅ ${outcome.message}`);
      setIsIndexed(true);
      // Nothing is pending once indexing has succeeded
      setUploadedFiles([]);
    } catch (error) {
      console.error('❌ Failed to index documents:', error);
      appendAssistantNote('❌ Failed to index documents. Please try again.');
    } finally {
      setIsLoading(false);
    }
  }

  // Give the parent (via ref) access to sendMessage and the active session
  useImperativeHandle(ref, () => {
    const handle: SessionChatRef = { sendMessage, currentSession }
    return handle
  })

  /**
   * Handle an action triggered on a message.
   *
   * - `copy`: writes the message content to the clipboard (non-string content
   *   is serialized as pretty-printed JSON). Clipboard failures are logged
   *   rather than thrown, since the Clipboard API can be unavailable or denied.
   * - `regenerate`: removes the given assistant message and resends the user
   *   message directly before it.
   * - any other action is ignored.
   *
   * @param action - action identifier
   * @param messageId - id of the message the action was triggered on
   * @param messageContent - content of that message
   */
  const handleAction = async (action: string, messageId: string, messageContent: string | Record<string, any>[] | { steps: Step[] }) => {
    console.log(`Action ${action} on message ${messageId}`)
    
    switch (action) {
      case 'copy': {
        const text = typeof messageContent === 'string' ? messageContent : JSON.stringify(messageContent, null, 2)
        try {
          // Throws synchronously when navigator.clipboard is undefined and
          // rejects when permission is denied; both end up in the catch.
          await navigator.clipboard.writeText(text)
        } catch (error) {
          console.error('Failed to copy message to clipboard:', error)
        }
        break
      }
      case 'regenerate': {
        // Find the user message before this AI message and resend it
        const messageIndex = messages.findIndex(m => m.id === messageId)
        if (messageIndex > 0 && messages[messageIndex].sender === 'assistant') {
          const userMessage = messages[messageIndex - 1]
          // Only plain-text user messages can be resent through sendMessage
          if (userMessage.sender === 'user' && typeof userMessage.content === 'string') {
            // Remove the AI message and resend the user message
            setMessages(prev => prev.filter(m => m.id !== messageId))
            await sendMessage(userMessage.content)
          }
        }
        break
      }
      default:
        // Handle other actions
        break
    }
  }

  // Landing view is shown when there is nothing to display and nothing in flight
  const showEmptyState = (!sessionId || messages.length === 0) && !isLoading

  // Uploaded files exist but have not been indexed yet: chat input stays blocked
  const awaitingIndex = uploadedFiles.length > 0 && !isIndexed

  const openSettings = () => setShowSettings(true)
  const openIndexForm = () => setShowIndexForm(true)

  // Badge button shared by both chat inputs; only present when an index is known
  const indexInfoButton = currentIndexId && currentIndexName ? (
    <button
      type="button"
      onClick={()=>setShowIndexInfo(true)}
      title="View index info"
      className="flex items-center gap-1 p-2 text-gray-400 hover:text-white hover:bg-gray-800 rounded-full transition-colors"
    >
      <Database className="w-5 h-5" />
      <span className="text-xs hidden sm:inline">{truncate(currentIndexName,12)}</span>
    </button>
  ) : undefined

  return (
    <div className={`flex flex-col h-full ${className}`}>
      {error && (
        <div className="bg-red-900 text-red-200 px-4 py-2 text-sm flex-shrink-0">
          {error}
        </div>
      )}
      
      {showEmptyState ? (
        <div className="flex-1 flex flex-col items-center justify-center gap-6 min-h-0">
          <div className="text-center text-2xl font-semibold text-gray-300 select-none">What can I help you find today?</div>
          <div className="w-full max-w-2xl px-4">
            <ChatInput
              onSendMessage={sendMessage}
              disabled={isLoading}
              placeholder="Ask anything"
              onOpenSettings={openSettings}
              onAddIndex={openIndexForm}
              leftExtras={indexInfoButton}
            />
          </div>
        </div>
      ) : (
        <>
          <ConversationPage 
            messages={messages}
            isLoading={isLoading}
            onAction={handleAction}
            className="flex-1 overflow-y-auto"
          />

          {/* Bottom input when chat active */}
          <div className="flex-shrink-0">
            {awaitingIndex && (
              <div className="p-2 text-center bg-yellow-100 dark:bg-yellow-900 border-t border-b border-gray-200 dark:border-gray-700">
                <Button onClick={handleIndexDocuments} disabled={isLoading}>
                  {isLoading ? 'Indexing...' : 'Index Documents to Enable Chat'}
                </Button>
              </div>
            )}
            <ChatInput
              onSendMessage={sendMessage}
              disabled={isLoading || awaitingIndex}
              placeholder="Message localGPT..."
              onOpenSettings={openSettings}
              onAddIndex={openIndexForm}
              leftExtras={indexInfoButton}
            />
          </div>
        </>
      )}

      {/* Settings modal: each option binds one piece of state to its setter */}
      {showSettings && (
        <ChatSettingsModal
          onClose={()=>setShowSettings(false)}
          options={[
            // General Settings
            {type: 'toggle', label:'Query decomposition', checked: enableDecompose, setter: setEnableDecompose},
            {type: 'toggle', label:'Compose sub-answers', checked: composeSubAnswers, setter: setComposeSubAnswers},
            {type: 'toggle', label:'Verify answer', checked: enableVerify, setter: setEnableVerify},
            {type: 'toggle', label:'Stream phases', checked: enableStream, setter: setEnableStream},
            
            // Retrieval Settings
            {type: 'dropdown', label:'LLM model', value: selectedModel, setter: setSelectedModel, options: generationModels.map(m=>({value:m,label:m}))},
            {type: 'dropdown', label:'Search type', value: searchType, setter: setSearchType, options: [
              {value: 'hybrid', label: 'Hybrid (Vector + FTS)'},
              {value: 'vector_only', label: 'Vector Only'},
              {value: 'bm25_only', label: 'FTS Only'}
            ]},
            {type: 'slider', label:'Retrieval chunks', value: retrievalK, setter: setRetrievalK, min: 5, max: 50, unit: ' chunks'},
            
            // Reranking & Context
            {type: 'toggle', label:'AI reranker', checked: enableAiRerank, setter: setEnableAiRerank},
            {type: 'slider', label:'Reranker top chunks', value: rerankerTopK, setter: setRerankerTopK, min: 3, max: 20, unit: ' chunks'},
            {type: 'toggle', label:'Expand context window', checked: enableContextExpand, setter: setEnableContextExpand},
            {type: 'slider', label:'Context window size', value: contextWindowSize, setter: setContextWindowSize, min: 0, max: 5, unit: ' chunks'},
            {type: 'toggle', label:'Prune irrelevant sentences', checked: provencePrune, setter: setProvencePrune},
            {type: 'toggle', label:'Always search documents', checked: forceDocs, setter: setForceDocs},
          ]}
        />
      )}

      {/* Index creation modal: adopts the session returned once indexing finishes */}
      {showIndexForm && (
        <IndexForm
          onClose={()=>setShowIndexForm(false)}
          onIndexed={(s)=>{
            setShowIndexForm(false);
            setCurrentSession(s);
            if(onSessionChange) onSessionChange(s);
          }}
        />
      )}

      {/* Index info modal */}
      {showIndexInfo && currentSession && (
        <SessionIndexInfo sessionId={currentSession.id} onClose={()=>setShowIndexInfo(false)} />
      )}
    </div>
  )
})

SessionChat.displayName = "SessionChat"  

================================================
FILE: src/components/ui/session-sidebar.tsx
================================================
"use client"

import * as React from "react"
import { useState, useEffect } from "react"
import { Plus, MessageSquare, MoreVertical } from "lucide-react"
import { Button } from "@/components/ui/button"
import { ScrollArea } from "@/components/ui/scroll-area"
import { ChatSession, chatAPI } from "@/lib/api"

/** Handle passed to the parent through `onSessionCreated`. */
interface SessionSidebarRef {
  /** Reloads the session list from the backend. */
  refreshSessions: () => Promise<void>
}

/** Props accepted by the `SessionSidebar` component. */
interface SessionSidebarProps {
  /** Id of the session currently open; compared against a deleted session's id. */
  currentSessionId?: string
  /** Callback receiving a session id — presumably invoked when a session row is chosen; verify against the render code. */
  onSessionSelect: (sessionId: string) => void
  /** Called when a new conversation is requested; the sidebar does not create the session itself. */
  onNewSession: () => void
  /** Called after a successful delete, only when the deleted session is the current one. */
  onSessionDelete?: (sessionId: string) => void
  /** Receives an object exposing `refreshSessions` so the parent can trigger a reload. */
  onSessionCreated?: (ref: SessionSidebarRef) => void
  /** Extra class names; defaults to an empty string. */
  className?: string
}

/**
 * Sidebar that lists chat sessions and lets the user open, rename or delete them.
 *
 * Sessions are fetched with `chatAPI.getSessions()` when the component mounts.
 * The same loader is handed to the parent through `onSessionCreated` (wrapped as
 * `refreshSessions`) so the parent can request a reload.
 */
export function SessionSidebar({
  currentSessionId,
  onSessionSelect,
  onNewSession,
  onSessionDelete,
  onSessionCreated,
  className = ""
}: SessionSidebarProps) {
  const [sessions, setSessions] = useState<ChatSession[]>([])
  const [isLoading, setIsLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)
  // Id of the session whose overflow menu is open, or null when none is open
  const [menuOpenId, setMenuOpenId] = useState<string | null>(null)

  // Declared before the effects that use it so it can be listed as a dependency
  const loadSessions = React.useCallback(async () => {
    try {
      setError(null)
      const response = await chatAPI.getSessions()
      setSessions(response.sessions)
    } catch (error) {
      console.error('Failed to load sessions:', error)
      setError('Failed to load sessions')
    } finally {
      // Clears the skeleton placeholder whether the request succeeded or not
      setIsLoading(false)
    }
  }, [])

  // Load sessions on mount (loadSessions has a stable identity, so this runs once)
  useEffect(() => {
    loadSessions()
  }, [loadSessions])

  const handleNewSession = () => {
    // Don't create session immediately - just trigger empty state
    onNewSession()
  }

  // Refresh sessions when a new session is created
  const refreshSessions = React.useCallback(async () => {
    await loadSessions()
  }, [loadSessions])

  // Expose refresh function to parent
  React.useEffect(() => {
    if (onSessionCreated) {
      onSessionCreated({ refreshSessions })
    }
  }, [onSessionCreated, refreshSessions])

  const handleDeleteSession = async (sessionId: string, event: React.MouseEvent) => {
    event.stopPropagation() // Prevent session selection when clicking delete
    
    if (!confirm('Are you sure you want to delete this conversation? This action cannot be undone.')) {
      return
    }

    try {
      await chatAPI.deleteSession(sessionId)
      setSessions(prev => prev.filter(s => s.id !== sessionId))
      // The row (and its menu) no longer exists, so drop the stale menu state
      setMenuOpenId(null)
      
      // If the deleted session was currently selected, notify parent
      if (currentSessionId === sessionId && onSessionDelete) {
        onSessionDelete(sessionId)
      }
    } catch (error) {
      console.error('Failed to delete session:', error)
      setError('Failed to delete session')
    }
  }

  const handleRenameSession = async (sessionId: string, event: React.MouseEvent) => {
    event.stopPropagation();
    const current = sessions.find(s => s.id === sessionId);
    // prompt() returns null on cancel; trim first so whitespace-only edits are ignored
    const newTitle = prompt('Enter new title', current?.title || '')?.trim();
    if (!newTitle || newTitle === current?.title) {
      return;
    }
    try {
      const result = await chatAPI.renameSession(sessionId, newTitle);
      // Update local state with new session data
      setSessions(prev => prev.map(s => s.id === sessionId ? result.session : s));
      // If this is the currently open session, notify parent to refresh
      if (currentSessionId === sessionId && onSessionSelect) {
        onSessionSelect(sessionId);
      }
      setMenuOpenId(null);
    } catch (error) {
      console.error('Failed to rename session:', error);
      setError('Failed to rename session');
    }
  }

  // Shortens long titles; the row also applies CSS truncation as a second guard
  const truncateTitle = (title: string, maxLength: number = 25) => {
    return title.length > maxLength ? title.substring(0, maxLength) + '...' : title
  }

  return (
    <div className={`w-64 h-full min-h-0 bg-black border-r border-gray-800 flex flex-col ${className}`}>
      {/* Header */}
      <div className="p-4 border-b border-gray-800">
        <div className="flex items-center justify-between mb-3">
          <h2 className="text-lg font-semibold text-white">Chats</h2>
          <Button
            onClick={handleNewSession}
            size="sm"
            className="h-8 w-8 p-0 bg-gray-700 hover:bg-gray-600 text-white"
            title="New Chat"
          >
            <Plus className="h-4 w-4" />
          </Button>
        </div>
      </div>

      {/* Sessions List */}
      <ScrollArea className="flex-1 min-h-0 overflow-y-auto">
        <div className="p-2">
          {error && (
            <div className="mb-4 p-3 bg-red-900 text-red-200 text-sm rounded-lg">
              {error}
              <Button
                onClick={loadSessions}
                size="sm"
                className="ml-2 h-6 px-2 text-xs bg-red-800 hover:bg-red-700"
              >
                Retry
              </Button>
            </div>
          )}

          {isLoading ? (
            <div className="space-y-2">
              {[...Array(5)].map((_, i) => (
                <div key={i} className="h-12 bg-gray-900 rounded-lg animate-pulse" />
              ))}
            </div>
          ) : sessions.length === 0 ? (
            <div className="text-center py-8 text-gray-400">
              <MessageSquare className="w-8 h-8 mx-auto mb-2 opacity-50" />
              <p className="text-sm">No conversations yet</p>
              <p className="text-xs mt-1">Start a new chat to begin</p>
            </div>
          ) : (
            <div className="space-y-px">
              {sessions.map((session) => (
                <div
                  key={session.id}
                  className={`relative group pl-1 rounded transition-colors ${
                    currentSessionId === session.id
                      ? 'bg-gray-700/60 text-white border-l-2 border-white'
                      : 'hover:bg-gray-800 text-gray-300'
                  }`}
                >
                  <button
                    onClick={() => onSessionSelect(session.id)}
                    className="w-full pl-3 pr-8 py-2 text-left text-sm"
                  >
                    <p className="truncate">
                      {truncateTitle(session.title)}
                    </p>
                  </button>
                  
                  {/* Overflow menu */}
                  <div className="absolute right-2 top-2 index-row-menu">
                    <button onClick={(e)=>{e.stopPropagation(); setMenuOpenId(menuOpenId===session.id?null:session.id);}} className="p-1 text-gray-400 hover:text-white opacity-0 group-hover:opacity-100 transition">
                      <MoreVertical className="w-4 h-4" />
                    </button>
                    {menuOpenId===session.id && (
                      <div className="absolute right-0 top-full mt-1 bg-black/90 backdrop-blur border border-white/10 rounded shadow-lg py-1 w-32 text-sm z-50">
                        <button onClick={(e)=>{e.stopPropagation(); onSessionSelect(session.id); setMenuOpenId(null);}} className="block w-full text-left px-4 py-2 hover:bg-white/10">Open</button>
                        <button onClick={(e)=>handleRenameSession(session.id,e)} className="block w-full text-left px-4 py-2 hover:bg-white/10">Rename</button>
                        <button onClick={(e)=>handleDeleteSession(session.id,e)} className="block w-full text-left px-4 py-2 hover:bg-white/10 text-red-400 hover:text-red-500">Delete</button>
                      </div>
                    )}
                  </div>
                </div>
              ))}
            </div>
          )}
        </div>
      </ScrollArea>

      {/* Footer with stats */}
      {sessions.length > 0 && (
        <div className="p-4 border-t border-gray-800 text-xs text-gray-400 bg-black">
          <div className="flex justify-between">
            <span>{sessions.length} conversations</span>
            <span>
              {sessions.reduce((sum, s) => sum + s.message_count, 0)} messages
            </span>
          </div>
        </div>
      )}
    </div>
  )
}

================================================
FILE: src/components/ui/sidebar.tsx
================================================
"use client";

import { cn } from "@/lib/utils";
import { ScrollArea } from "@/components/ui/scroll-area";
import { motion } from "framer-motion";
import {
  ChevronsUpDown,
  LogOut,
  MessagesSquare,
  Plus,
  Settings,
  UserCircle,
} from "lucide-react";
import { Avatar, AvatarFallback } from "@/components/ui/avatar"
import { useState } from "react";
import { Button } from "@/components/ui/button";
import {
  DropdownMenu,
  DropdownMenuContent,
  DropdownMenuItem,
  DropdownMenuSeparator,
  DropdownMenuTrigger,
} from "@/components/ui/dropdown-menu";
import { Separator } from "@/components/ui/separator";

// Width of the outer sidebar container in its expanded / collapsed states.
const sidebarVariants = {
  open: { width: "15rem" },
  closed: { width: "3.05rem" },
};

// Inner content wrapper: both states are currently identical.
const contentVariants = {
  open: { display: "block", opacity: 1 },
  closed: { display: "block", opacity: 1 },
};

// Shared slide/fade variants for labels that only appear when expanded.
const variants = {
  open: {
    x: 0,
    opacity: 1,
    transition: { x: { stiffness: 1000, velocity: -100 } },
  },
  closed: {
    x: -20,
    opacity: 0,
    transition: { x: { stiffness: 100 } },
  },
};

// Transition applied to the outer container's width animation.
const transitionProps = {
  type: "tween",
  ease: "easeOut",
  duration: 0.2,
  staggerChildren: 0.1,
};

// Staggers child animations when the list switches to the "open" state.
const staggerVariants = {
  open: { transition: { staggerChildren: 0.03, delayChildren: 0.02 } },
};

// Hard-coded placeholder sessions rendered by SessionNavBar (not loaded from the API).
const chatSessions = [
  {
    id: 1,
    title: "React Component Help",
    lastMessage: "How to create a sidebar?",
    timestamp: "2 min ago",
    isActive: true,
  },
  {
    id: 2,
    title: "TypeScript Questions",
    lastMessage: "Interface vs Type",
    timestamp: "1 hour ago",
    isActive: false,
  },
  {
    id: 3,
    title: "Next.js Setup",
    lastMessage: "Setting up shadcn/ui",
    timestamp: "3 hours ago",
    isActive: false,
  },
  {
    id: 4,
    title: "Tailwind CSS",
    lastMessage: "Dark mode implementation",
    timestamp: "1 day ago",
    isActive: false,
  },
  {
    id: 5,
    title: "Database Design",
    lastMessage: "Schema optimization",
    timestamp: "2 days ago",
    isActive: false,
  },
];

/**
 * Hover-expandable navigation rail.
 *
 * Starts collapsed, expands while the pointer is over it, and renders the
 * hard-coded `chatSessions` list together with header and footer menus.
 * Text labels are only mounted while the rail is expanded.
 */
export function SessionNavBar() {
  const [isCollapsed, setIsCollapsed] = useState(true);

  // Name of the active framer-motion variant, shared by `initial` and `animate`
  const motionState = isCollapsed ? "closed" : "open";
  const isExpanded = !isCollapsed;

  const expand = () => setIsCollapsed(false);
  const collapse = () => setIsCollapsed(true);

  // One row per mock session; the text column is only rendered when expanded
  const sessionRows = chatSessions.map((session) => (
    <div
      key={session.id}
      className={cn(
        "flex h-auto w-full flex-col rounded-md px-2 py-2 transition hover:bg-neutral-800 cursor-pointer",
        session.isActive && "bg-neutral-800"
      )}
    >
      <div className="flex items-center gap-2">
        <MessagesSquare className="h-4 w-4 text-neutral-400 shrink-0" />
        <motion.div variants={variants} className="flex-1 min-w-0">
          {isExpanded && (
            <div className="flex flex-col gap-1">
              <p className="text-sm font-medium text-white truncate">
                {session.title}
              </p>
              <p className="text-xs text-neutral-400 truncate">
                {session.lastMessage}
              </p>
              <p className="text-xs text-neutral-500">
                {session.timestamp}
              </p>
            </div>
          )}
        </motion.div>
      </div>
    </div>
  ));

  return (
    <motion.div
      className={cn(
        "sidebar fixed left-0 z-40 h-full shrink-0 border-r border-neutral-800",
      )}
      initial={motionState}
      animate={motionState}
      variants={sidebarVariants}
      transition={transitionProps}
      onMouseEnter={expand}
      onMouseLeave={collapse}
    >
      <motion.div
        className="relative z-40 flex text-muted-foreground h-full shrink-0 flex-col bg-black transition-all"
        variants={contentVariants}
      >
        <motion.ul variants={staggerVariants} className="flex h-full flex-col">
          <div className="flex grow flex-col items-center">
            {/* Header: app badge with a small actions menu */}
            <div className="flex h-[54px] w-full shrink-0 border-b border-neutral-800 p-2">
              <div className="mt-[1.5px] flex w-full">
                <DropdownMenu modal={false}>
                  <DropdownMenuTrigger className="w-full" asChild>
                    <Button
                      variant="ghost"
                      size="sm"
                      className="flex w-fit items-center gap-2 px-2 text-white hover:bg-neutral-800"
                    >
                      <Avatar className="rounded size-4">
                        <AvatarFallback className="bg-blue-600 text-white">L</AvatarFallback>
                      </Avatar>
                      <motion.li
                        variants={variants}
                        className="flex w-fit items-center gap-2"
                      >
                        {isExpanded && (
                          <>
                            <p className="text-sm font-medium text-white">
                              localGPT
                            </p>
                            <ChevronsUpDown className="h-4 w-4 text-neutral-400" />
                          </>
                        )}
                      </motion.li>
                    </Button>
                  </DropdownMenuTrigger>
                  <DropdownMenuContent align="start" className="bg-neutral-900 border-neutral-800">
                    <DropdownMenuItem className="flex items-center gap-2 text-white hover:bg-neutral-800">
                      <Settings className="h-4 w-4" /> Preferences
                    </DropdownMenuItem>
                    <DropdownMenuItem className="flex items-center gap-2 text-white hover:bg-neutral-800">
                      <Plus className="h-4 w-4" /> New Chat
                    </DropdownMenuItem>
                  </DropdownMenuContent>
                </DropdownMenu>
              </div>
            </div>

            {/* Body: new-chat button followed by the session list */}
            <div className="flex h-full w-full flex-col">
              <div className="flex grow flex-col gap-4">
                <ScrollArea className="h-16 grow p-2">
                  <div className={cn("flex w-full flex-col gap-1")}>
                    <Button
                      variant="ghost"
                      className="flex h-8 w-full flex-row items-center justify-start rounded-md px-2 py-1.5 text-white hover:bg-neutral-800 mb-2"
                    >
                      <Plus className="h-4 w-4" />
                      <motion.span variants={variants} className="ml-2">
                        {isExpanded && (
                          <p className="text-sm font-medium">New Chat</p>
                        )}
                      </motion.span>
                    </Button>

                    <Separator className="w-full bg-neutral-800" />

                    {sessionRows}
                  </div>
                </ScrollArea>
              </div>

              {/* Footer: settings shortcut and user menu */}
              <div className="flex flex-col p-2 border-t border-neutral-800">
                <Button
                  variant="ghost"
                  className="mt-auto flex h-8 w-full flex-row items-center rounded-md px-2 py-1.5 text-white hover:bg-neutral-800"
                >
                  <Settings className="h-4 w-4 shrink-0" />
                  <motion.span variants={variants}>
                    {isExpanded && (
                      <p className="ml-2 text-sm font-medium">Settings</p>
                    )}
                  </motion.span>
                </Button>

                <DropdownMenu modal={false}>
                  <DropdownMenuTrigger className="w-full">
                    <div className="flex h-8 w-full flex-row items-center gap-2 rounded-md px-2 py-1.5 transition hover:bg-neutral-800">
                      <Avatar className="size-4">
                        <AvatarFallback className="bg-blue-600 text-white text-xs">
                          U
                        </AvatarFallback>
                      </Avatar>
                      <motion.div
                        variants={variants}
                        className="flex w-full items-center gap-2"
                      >
                        {isExpanded && (
                          <>
                            <p className="text-sm font-medium text-white">User</p>
                            <ChevronsUpDown className="ml-auto h-4 w-4 text-neutral-400" />
                          </>
                        )}
                      </motion.div>
                    </div>
                  </DropdownMenuTrigger>
                  <DropdownMenuContent sideOffset={5} className="bg-neutral-900 border-neutral-800">
                    <div className="flex flex-row items-center gap-2 p-2">
                      <Avatar className="size-6">
                        <AvatarFallback className="bg-blue-600 text-white">
                          U
                        </AvatarFallback>
                      </Avatar>
                      <div className="flex flex-col text-left">
                        <span className="text-sm font-medium text-white">
                          User
                        </span>
                        <span className="line-clamp-1 text-xs text-neutral-400">
                          user@example.com
                        </span>
                      </div>
                    </div>
                    <DropdownMenuSeparator className="bg-neutral-800" />
                    <DropdownMenuItem className="flex items-center gap-2 text-white hover:bg-neutral-800">
                      <UserCircle className="h-4 w-4" /> Profile
                    </DropdownMenuItem>
                    <DropdownMenuItem className="flex items-center gap-2 text-white hover:bg-neutral-800">
                      <LogOut className="h-4 w-4" /> Sign out
                    </DropdownMenuItem>
                  </DropdownMenuContent>
                </DropdownMenu>
              </div>
            </div>
          </div>
        </motion.ul>
      </motion.div>
    </motion.div>
  );
}

================================================
FILE: src/components/ui/skeleton.tsx
================================================
import { cn } from "@/lib/utils"

/**
 * Pulsing placeholder block shown while content is loading.
 * Accepts every `div` prop; `className` is merged with the default styles.
 */
function Skeleton(props: React.ComponentProps<"div">) {
  const { className, ...rest } = props
  const mergedClassName = cn("bg-accent animate-pulse rounded-md", className)

  return <div data-slot="skeleton" className={mergedClassName} {...rest} />
}

export { Skeleton }


================================================
FILE: src/components/ui/textarea.tsx
================================================
import * as React from "react"

import { cn } from "@/lib/utils"

/**
 * Styled `<textarea>` wrapper.
 * Accepts every native textarea prop; `className` is merged with the base styles.
 */
function Textarea(props: React.ComponentProps<"textarea">) {
  const { className, ...rest } = props
  const mergedClassName = cn(
    "border-input placeholder:text-muted-foreground focus-visible:border-ring focus-visible:ring-ring/50 aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive dark:bg-input/30 flex field-sizing-content min-h-16 w-full rounded-md border bg-transparent px-3 py-2 text-base shadow-xs transition-[color,box-shadow] outline-none focus-visible:ring-[3px] disabled:cursor-not-allowed disabled:opacity-50 md:text-sm",
    className
  )

  return <textarea data-slot="textarea" className={mergedClassName} {...rest} />
}

export { Textarea }


================================================
FILE: src/lib/api.ts
================================================
// Base URL prepended to every backend endpoint path requested by this module.
const API_BASE_URL = 'http://localhost:8000';

/**
 * Generates a version-4 style UUID string for client-side message IDs.
 *
 * Uses the platform's `crypto.randomUUID()` whenever one is reachable through
 * `globalThis`, which covers browsers, web workers and server-side rendering
 * instead of only environments that define `window`. When it is missing (older
 * browsers, non-secure contexts) a `Math.random()` based fallback is used; that
 * fallback is NOT cryptographically secure and is only suitable for UI identifiers.
 *
 * @returns A UUID of the form `xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx`.
 */
export const generateUUID = () => {
  // Looked up on every call so late-installed polyfills are honoured
  const cryptoApi = typeof globalThis !== 'undefined' ? globalThis.crypto : undefined;
  if (cryptoApi && typeof cryptoApi.randomUUID === 'function') {
    return cryptoApi.randomUUID();
  }
  // Fallback for older browsers or non-secure contexts
  return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
    const r = (Math.random() * 16) | 0;
    // 'y' positions carry the RFC 4122 variant bits (binary 10xx)
    const v = c === 'x' ? r : (r & 0x3) | 0x8;
    return v.toString(16);
  });
};

/** One entry of the step list that a `ChatMessage` may carry as its content. */
export interface Step {
  key: string;
  label: string;
  status: 'pending' | 'active' | 'done';
  details: any;
}

/** A single chat message as handled by the frontend. */
export interface ChatMessage {
  id: string;
  /** Plain text, an array of arbitrary records, or a `{ steps }` object. */
  content: string | Array<Record<string, any>> | { steps: Step[] };
  sender: 'user' | 'assistant';
  timestamp: string;
  /** Set by `createMessage` for placeholder messages. */
  isLoading?: boolean;
  metadata?: Record<string, unknown>;
}

/** Session record as returned by the `/sessions` endpoints. */
export interface ChatSession {
  id: string;
  title: string;
  created_at: string;
  updated_at: string;
  model_used: string;
  message_count: number;
}

/** Input accepted by `ChatAPI.sendMessage`. */
export interface ChatRequest {
  message: string;
  /** Optional; `sendMessage` substitutes 'llama3.2:latest' when omitted. */
  model?: string;
  /** Optional; `sendMessage` sends an empty array when omitted. */
  conversation_history?: Array<{
    role: 'user' | 'assistant';
    content: string;
  }>;
}

/** Return type of `ChatAPI.sendMessage` (`POST /chat`). */
export interface ChatResponse {
  response: string;
  model: string;
  message_count: number;
}

/** Return type of `ChatAPI.checkHealth` (`GET /health`). */
export interface HealthResponse {
  status: string;
  ollama_running: boolean;
  available_models: string[];
  database_stats?: {
    total_sessions: number;
    total_messages: number;
    most_used_model: string | null;
  };
}

/** Return type of `ChatAPI.getModels` (`GET /models`). */
export interface ModelsResponse {
  generation_models: string[];
  embedding_models: string[];
}

/** Return type of `ChatAPI.getSessions` (`GET /sessions`). */
export interface SessionResponse {
  sessions: ChatSession[];
  total: number;
}

/** Base return type of `ChatAPI.sendSessionMessage`. */
export interface SessionChatResponse {
  response: string;
  session: ChatSession;
  user_message_id: string;
  ai_message_id: string;
}

class ChatAPI {
  /**
   * Fetches the backend health report from `GET /health`.
   *
   * @returns The parsed JSON body.
   * @throws Error when the response status is not OK; any failure is logged and rethrown.
   */
  async checkHealth(): Promise<HealthResponse> {
    try {
      const res = await fetch(`${API_BASE_URL}/health`);
      if (res.ok) {
        return await res.json();
      }
      throw new Error(`Health check failed: ${res.status}`);
    } catch (err) {
      console.error('Health check failed:', err);
      throw err;
    }
  }

  /**
   * Sends a chat message to `POST /chat`.
   *
   * @param request Message text plus optional model and conversation history;
   *   a missing model defaults to 'llama3.2:latest' and missing history to `[]`.
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async sendMessage(request: ChatRequest): Promise<ChatResponse> {
    try {
      const payload = {
        message: request.message,
        model: request.model || 'llama3.2:latest',
        conversation_history: request.conversation_history || [],
      };
      const res = await fetch(`${API_BASE_URL}/chat`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
      });

      if (res.ok) {
        return await res.json();
      }

      // The error body may not be JSON; fall back to a generic message
      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Chat API error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Chat API failed:', err);
      throw err;
    }
  }

  /**
   * Converts chat messages into the `{ role, content }` history format.
   * Messages whose content is not a string, or is blank after trimming, are dropped.
   */
  messagesToHistory(messages: ChatMessage[]): Array<{ role: 'user' | 'assistant'; content: string }> {
    return messages.reduce<Array<{ role: 'user' | 'assistant'; content: string }>>((history, msg) => {
      const text = msg.content;
      if (typeof text === 'string' && text.trim()) {
        history.push({ role: msg.sender, content: text });
      }
      return history;
    }, []);
  }

  // Session Management
  /**
   * Lists sessions via `GET /sessions`.
   *
   * @returns The parsed JSON body.
   * @throws Error when the response status is not OK; any failure is logged and rethrown.
   */
  async getSessions(): Promise<SessionResponse> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions`);
      if (res.ok) {
        return await res.json();
      }
      throw new Error(`Failed to get sessions: ${res.status}`);
    } catch (err) {
      console.error('Get sessions failed:', err);
      throw err;
    }
  }

  /**
   * Creates a session via `POST /sessions`.
   *
   * @param title Session title sent to the backend.
   * @param model Model name sent to the backend.
   * @returns The `session` property of the response body.
   * @throws Error when the response status is not OK; any failure is logged and rethrown.
   */
  async createSession(title: string = 'New Chat', model: string = 'llama3.2:latest'): Promise<ChatSession> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ title, model }),
      });

      if (!res.ok) {
        throw new Error(`Failed to create session: ${res.status}`);
      }

      const { session } = await res.json();
      return session;
    } catch (err) {
      console.error('Create session failed:', err);
      throw err;
    }
  }

  /**
   * Loads one session and its messages via `GET /sessions/{sessionId}`.
   *
   * @returns The parsed JSON body.
   * @throws Error when the response status is not OK; any failure is logged and rethrown.
   */
  async getSession(sessionId: string): Promise<{ session: ChatSession; messages: ChatMessage[] }> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}`);
      if (res.ok) {
        return await res.json();
      }
      throw new Error(`Failed to get session: ${res.status}`);
    } catch (err) {
      console.error('Get session failed:', err);
      throw err;
    }
  }

  /**
   * Posts a message to `POST /sessions/{sessionId}/messages`.
   *
   * Only options that were actually supplied are sent: `model` when truthy, and
   * each remaining option when its value has the expected primitive type. The
   * camelCase option names are mapped to the snake_case keys used on the wire.
   *
   * @param sessionId Target session id.
   * @param message User message text.
   * @param opts Optional generation / retrieval settings.
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async sendSessionMessage(
    sessionId: string,
    message: string,
    opts: { 
      model?: string; 
      composeSubAnswers?: boolean; 
      decompose?: boolean; 
      aiRerank?: boolean; 
      contextExpand?: boolean; 
      verify?: boolean;
      // Retrieval parameters
      retrievalK?: number;
      contextWindowSize?: number;
      rerankerTopK?: number;
      searchType?: string;
      denseWeight?: number;
      forceRag?: boolean;
      provencePrune?: boolean;
    } = {}
  ): Promise<SessionChatResponse & { source_documents: any[] }> {
    try {
      const payload: Record<string, unknown> = { message };
      if (opts.model) {
        payload.model = opts.model;
      }

      // [option name, request key, required typeof]; order fixes the JSON key order
      const typedFields: Array<[keyof typeof opts, string, 'boolean' | 'number' | 'string']> = [
        ['composeSubAnswers', 'compose_sub_answers', 'boolean'],
        ['decompose', 'query_decompose', 'boolean'],
        ['aiRerank', 'ai_rerank', 'boolean'],
        ['contextExpand', 'context_expand', 'boolean'],
        ['verify', 'verify', 'boolean'],
        ['retrievalK', 'retrieval_k', 'number'],
        ['contextWindowSize', 'context_window_size', 'number'],
        ['rerankerTopK', 'reranker_top_k', 'number'],
        ['searchType', 'search_type', 'string'],
        ['denseWeight', 'dense_weight', 'number'],
        ['forceRag', 'force_rag', 'boolean'],
        ['provencePrune', 'provence_prune', 'boolean'],
      ];
      for (const [optKey, wireKey, expectedType] of typedFields) {
        const value = opts[optKey];
        if (typeof value === expectedType) {
          payload[wireKey] = value;
        }
      }

      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}/messages`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
      });

      if (res.ok) {
        return await res.json();
      }

      // The error body may not be JSON; fall back to a generic message
      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Session chat error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Session chat failed:', err);
      throw err;
    }
  }

  /**
   * Deletes a session via `DELETE /sessions/{sessionId}`.
   *
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async deleteSession(sessionId: string): Promise<{ message: string; deleted_session_id: string }> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}`, { method: 'DELETE' });

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Delete session error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Delete session failed:', err);
      throw err;
    }
  }

  /**
   * Renames a session via `POST /sessions/{sessionId}/rename`.
   *
   * @param sessionId Session to rename.
   * @param newTitle Title sent as `{ title }` in the request body.
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async renameSession(sessionId: string, newTitle: string): Promise<{ message: string; session: ChatSession }> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}/rename`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ title: newTitle }),
      });

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Rename session error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Rename session failed:', err);
      throw err;
    }
  }

  /**
   * Calls `/sessions/cleanup` using fetch's default request method.
   *
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async cleanupEmptySessions(): Promise<{ message: string; cleanup_count: number }> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions/cleanup`);

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Cleanup sessions error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Cleanup sessions failed:', err);
      throw err;
    }
  }

  /**
   * Uploads files to `POST /sessions/{sessionId}/upload` as multipart form data.
   * Every file is appended under the same `files` field name.
   *
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async uploadFiles(sessionId: string, files: File[]): Promise<{ 
    message: string; 
    uploaded_files: {filename: string, stored_path: string}[]; 
  }> {
    try {
      const body = new FormData();
      files.forEach((file) => body.append('files', file, file.name));

      // No Content-Type header: fetch derives the multipart boundary from the FormData body
      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}/upload`, {
        method: 'POST',
        body,
      });

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Upload failed' }));
      throw new Error(`Upload error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('File upload failed:', err);
      throw err;
    }
  }

  /**
   * Triggers indexing via `POST /sessions/{sessionId}/index` (no request body).
   *
   * @returns The parsed JSON body.
   * @throws Error carrying the server's `error` field (or the status text) on a
   *   non-OK response; any failure is logged and rethrown.
   */
  async indexDocuments(sessionId: string): Promise<{ message: string }> {
    try {
      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}/index`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
      });

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Indexing failed' }));
      throw new Error(`Indexing error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('Indexing failed:', err);
      throw err;
    }
  }

  /**
   * Legacy upload function - can be removed if no longer needed.
   *
   * Validates the files client-side (no empty files, at most 50MB in total),
   * logs their sizes, then posts them to `POST /sessions/{sessionId}/upload`
   * with each file under its own `file_<index>` form field.
   *
   * @returns The parsed JSON body.
   * @throws Error for an empty file, an oversized batch, or a non-OK response;
   *   any failure is logged and rethrown.
   */
  async uploadPDFs(sessionId: string, files: File[]): Promise<{ 
    message: string; 
    uploaded_files: any[]; 
    processing_results: any[];
    session_documents: any[];
    total_session_documents: number;
  }> {
    try {
      const BYTES_PER_MB = 1024 * 1024;

      // Reject empty files and report each file's size while summing the total
      let totalBytes = 0;
      for (const file of files) {
        if (file.size === 0) {
          throw new Error(`File ${file.name} is empty (0 bytes)`);
        }
        totalBytes += file.size;
        const sizeMB = (file.size / BYTES_PER_MB).toFixed(2);
        console.log(`📄 File ${file.name}: ${sizeMB}MB (${file.size} bytes), type: ${file.type}`);
      }

      const totalSizeMB = (totalBytes / BYTES_PER_MB).toFixed(2);
      console.log(`📄 Total upload size: ${totalSizeMB}MB`);

      // 50MB limit
      if (totalBytes > 50 * BYTES_PER_MB) {
        throw new Error(`Total file size ${totalSizeMB}MB exceeds 50MB limit`);
      }

      // Each file gets a distinct, index-based field name
      const body = new FormData();
      files.forEach((file, index) => {
        body.append(`file_${index}`, file, file.name);
      });

      const res = await fetch(`${API_BASE_URL}/sessions/${sessionId}/upload`, {
        method: 'POST',
        body,
      });

      if (res.ok) {
        return await res.json();
      }

      const errorData = await res.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Upload error: ${errorData.error || res.statusText}`);
    } catch (err) {
      console.error('PDF upload failed:', err);
      throw err;
    }
  }

  /**
   * Map a raw database message row onto the ChatMessage shape.
   * Fields are cast, not validated; missing fields come through as undefined.
   */
  convertDbMessage(dbMessage: Record<string, unknown>): ChatMessage {
    const { id, content, sender, timestamp, metadata } = dbMessage;
    return {
      id: id as string,
      content: content as string,
      sender: sender as 'user' | 'assistant',
      timestamp: timestamp as string,
      metadata: metadata as Record<string, unknown> | undefined,
    };
  }

  /**
   * Build a fresh client-side ChatMessage (e.g. a loading placeholder).
   * The id comes from generateUUID() and the timestamp is the current time in ISO-8601.
   */
  createMessage(
    content: string,
    sender: 'user' | 'assistant',
    isLoading = false
  ): ChatMessage {
    const message: ChatMessage = {
      id: generateUUID(),
      content,
      sender,
      timestamp: new Date().toISOString(),
      isLoading,
    };
    return message;
  }

  // ---------------- Models ----------------
  /**
   * Fetch the model list from `GET /models`.
   * @throws Error carrying the HTTP status when the response is not OK.
   */
  async getModels(): Promise<ModelsResponse> {
    const response = await fetch(`${API_BASE_URL}/models`);
    if (response.ok) {
      return response.json();
    }
    throw new Error(`Failed to fetch models list: ${response.status}`);
  }

  /**
   * Fetch the documents attached to a session from `GET /sessions/{sessionId}/documents`.
   * @throws Error carrying the HTTP status when the response is not OK.
   */
  async getSessionDocuments(sessionId: string): Promise<{ files: string[]; file_count: number; session: ChatSession }> {
    const endpoint = `${API_BASE_URL}/sessions/${sessionId}/documents`;
    const response = await fetch(endpoint);
    if (response.ok) {
      return response.json();
    }
    throw new Error(`Failed to fetch session documents: ${response.status}`);
  }

  // ---------- Index endpoints ----------

  /**
   * Create a new index via `POST /indexes`.
   *
   * @param name - Index name.
   * @param description - Optional description (omitted from the JSON body when undefined).
   * @param metadata - Arbitrary metadata sent with the request; defaults to `{}`.
   * @returns The parsed JSON response.
   * @throws Error with the server's `error` field, or the status text when the body has none.
   */
  async createIndex(name: string, description?: string, metadata: Record<string, unknown> = {}): Promise<{ index_id: string }> {
    const requestInit = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ name, description, metadata }),
    };
    const response = await fetch(`${API_BASE_URL}/indexes`, requestInit);
    if (response.ok) {
      return response.json();
    }
    // A non-JSON error body degrades to {} so the status text is used instead.
    const failure = await response.json().catch(() => ({}));
    throw new Error(`Create index error: ${failure.error || response.statusText}`);
  }

  /**
   * Upload files to an index via `POST /indexes/{indexId}/upload`.
   * Every file is appended under the repeated multipart field name `files`.
   *
   * @throws Error with the server's `error` field, or the status text when the body has none.
   */
  async uploadFilesToIndex(indexId: string, files: File[]): Promise<{ message: string; uploaded_files: any[] }> {
    const formData = new FormData();
    for (const file of files) {
      formData.append('files', file, file.name);
    }

    const response = await fetch(`${API_BASE_URL}/indexes/${indexId}/upload`, { method: 'POST', body: formData });
    if (response.ok) {
      return response.json();
    }
    const failure = await response.json().catch(() => ({}));
    throw new Error(`Upload to index error: ${failure.error || response.statusText}`);
  }

  /**
   * Trigger a build of an index via `POST /indexes/{indexId}/build`.
   *
   * Options left undefined (or null) fall back to the defaults below;
   * the three model names have no default and are dropped from the JSON
   * body when undefined.
   *
   * @param indexId - Index to build.
   * @param opts - Optional chunking, retrieval, enrichment and batching settings.
   * @returns The parsed JSON response.
   * @throws Error with the server's `error` field or the status text on a non-OK response.
   */
  async buildIndex(indexId: string, opts: {
    latechunk?: boolean;
    doclingChunk?: boolean;
    chunkSize?: number;
    chunkOverlap?: number;
    retrievalMode?: string;
    windowSize?: number;
    enableEnrich?: boolean;
    embeddingModel?: string;
    enrichModel?: string;
    overviewModel?: string;
    batchSizeEmbed?: number;
    batchSizeEnrich?: number;
  } = {}): Promise<{ message: string }> {
    // Resolve defaults up front so the request body reads as a plain record.
    const buildSettings = {
      latechunk: opts.latechunk ?? false,
      doclingChunk: opts.doclingChunk ?? false,
      chunkSize: opts.chunkSize ?? 512,
      chunkOverlap: opts.chunkOverlap ?? 64,
      retrievalMode: opts.retrievalMode ?? 'hybrid',
      windowSize: opts.windowSize ?? 2,
      enableEnrich: opts.enableEnrich ?? true,
      embeddingModel: opts.embeddingModel,
      enrichModel: opts.enrichModel,
      overviewModel: opts.overviewModel,
      batchSizeEmbed: opts.batchSizeEmbed ?? 50,
      batchSizeEnrich: opts.batchSizeEnrich ?? 25,
    };

    try {
      const response = await fetch(`${API_BASE_URL}/indexes/${indexId}/build`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(buildSettings),
      });

      if (response.ok) {
        return await response.json();
      }

      const errorData = await response.json().catch(() => ({ error: 'Unknown error' }));
      throw new Error(`Build index error: ${errorData.error || response.statusText}`);
    } catch (error) {
      console.error('Build index failed:', error);
      throw error;
    }
  }

  /**
   * Attach an index to a session via `POST /sessions/{sessionId}/indexes/{indexId}`.
   * @throws Error with the server's `error` field, or the status text when the body has none.
   */
  async linkIndexToSession(sessionId: string, indexId: string): Promise<{ message: string }> {
    const endpoint = `${API_BASE_URL}/sessions/${sessionId}/indexes/${indexId}`;
    const response = await fetch(endpoint, { method: 'POST' });
    if (response.ok) {
      return response.json();
    }
    const failure = await response.json().catch(() => ({}));
    throw new Error(`Link index error: ${failure.error || response.statusText}`);
  }

  /**
   * List all indexes via `GET /indexes`.
   * @throws Error carrying the HTTP status when the response is not OK.
   */
  async listIndexes(): Promise<{ indexes: any[]; total: number }> {
    const response = await fetch(`${API_BASE_URL}/indexes`);
    if (response.ok) {
      return response.json();
    }
    throw new Error(`Failed to list indexes: ${response.status}`);
  }

  /**
   * List the indexes linked to a session via `GET /sessions/{sessionId}/indexes`.
   * @throws Error carrying the HTTP status when the response is not OK.
   */
  async getSessionIndexes(sessionId: string): Promise<{ indexes: any[]; total: number }> {
    const response = await fetch(`${API_BASE_URL}/sessions/${sessionId}/indexes`);
    if (response.ok) {
      return response.json();
    }
    throw new Error(`Failed to get session indexes: ${response.status}`);
  }

  /**
   * Delete an index via `DELETE /indexes/{indexId}`.
   * @throws Error with the server's `error` field; a non-JSON error body yields 'Unknown error'.
   */
  async deleteIndex(indexId: string): Promise<{ message: string }> {
    const response = await fetch(`${API_BASE_URL}/indexes/${indexId}`, { method: 'DELETE' });
    if (response.ok) {
      return response.json();
    }
    const failure = await response.json().catch(() => ({ error: 'Unknown error'}));
    throw new Error(failure.error || `Failed to delete index: ${response.status}`);
  }

  // -------------------- Streaming (SSE-over-fetch) --------------------
  /**
   * Send a chat query and stream the answer back as server-sent-event
   * style frames read from a `fetch` response body.
   *
   * The request is POSTed to `http://localhost:8001/chat/stream`. Each
   * frame of the form `data: <json>` is parsed and handed to `onEvent`.
   * The method resolves when the server closes the stream or when an
   * event with `type === 'complete'` is seen (the reader is then
   * cancelled so the caller unblocks).
   *
   * Malformed frames and exceptions thrown by `onEvent` do not abort the
   * stream, but they are logged instead of being silently discarded.
   *
   * @param params - Query plus optional model/session/retrieval settings.
   *   camelCase options are translated to the snake_case keys the
   *   endpoint receives; options of the wrong type are omitted.
   * @param onEvent - Callback invoked once per parsed event.
   * @throws Error when the HTTP response is not OK or has no body.
   */
  async streamSessionMessage(
    params: {
      query: string;
      model?: string;
      session_id?: string;
      table_name?: string;
      composeSubAnswers?: boolean;
      decompose?: boolean;
      aiRerank?: boolean;
      contextExpand?: boolean;
      verify?: boolean;
      // Retrieval tuning parameters
      retrievalK?: number;
      contextWindowSize?: number;
      rerankerTopK?: number;
      searchType?: string;
      denseWeight?: number;
      forceRag?: boolean;
      provencePrune?: boolean;
    },
    onEvent: (event: { type: string; data: any }) => void,
  ): Promise<void> {
    const { query, model, session_id, table_name, composeSubAnswers, decompose, aiRerank, contextExpand, verify, retrievalK, contextWindowSize, rerankerTopK, searchType, denseWeight, forceRag, provencePrune } = params;

    // Only forward options the caller actually set (and set with the right type).
    const payload: Record<string, unknown> = { query };
    if (model) payload.model = model;
    if (session_id) payload.session_id = session_id;
    if (table_name) payload.table_name = table_name;
    if (typeof composeSubAnswers === 'boolean') payload.compose_sub_answers = composeSubAnswers;
    if (typeof decompose === 'boolean') payload.query_decompose = decompose;
    if (typeof aiRerank === 'boolean') payload.ai_rerank = aiRerank;
    if (typeof contextExpand === 'boolean') payload.context_expand = contextExpand;
    if (typeof verify === 'boolean') payload.verify = verify;
    if (typeof retrievalK === 'number') payload.retrieval_k = retrievalK;
    if (typeof contextWindowSize === 'number') payload.context_window_size = contextWindowSize;
    if (typeof rerankerTopK === 'number') payload.reranker_top_k = rerankerTopK;
    if (typeof searchType === 'string') payload.search_type = searchType;
    if (typeof denseWeight === 'number') payload.dense_weight = denseWeight;
    if (typeof forceRag === 'boolean') payload.force_rag = forceRag;
    if (typeof provencePrune === 'boolean') payload.provence_prune = provencePrune;

    const resp = await fetch('http://localhost:8001/chat/stream', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!resp.ok || !resp.body) {
      throw new Error(`Stream request failed: ${resp.status}`);
    }

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';

    let streamClosed = false;
    while (!streamClosed) {
      const { value, done } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });

      // Frames are separated by a blank line; accept LF as well as CRLF endings.
      const parts = buffer.split(/\r?\n\r?\n/);
      // The last piece may be an incomplete frame; keep it for the next chunk.
      buffer = parts.pop() ?? '';

      for (const part of parts) {
        const line = part.trim();
        if (!line.startsWith('data:')) continue;
        const jsonStr = line.replace(/^data:\s*/, '');

        let evt: { type: string; data: any };
        try {
          evt = JSON.parse(jsonStr);
        } catch (parseError) {
          // A single bad frame must not kill the stream, but it should be visible.
          console.warn('Skipping malformed stream event:', parseError);
          continue;
        }

        try {
          onEvent(evt);
        } catch (handlerError) {
          // Keep streaming (best effort) while surfacing the caller's bug.
          console.error('Stream event handler failed:', handlerError);
        }

        if (evt?.type === 'complete') {
          // Gracefully close the stream so the caller unblocks, even if the
          // handler above failed on this final event.
          try { await reader.cancel(); } catch { /* cancel is best-effort */ }
          streamClosed = true;
          break;
        }
      }
    }
  }
}

// Module-level ChatAPI instance exported for importers of this module.
export const chatAPI = new ChatAPI(); 

================================================
FILE: src/lib/types.ts
================================================
/**
 * A file attachment record: an identifier plus name/size/type fields
 * stored alongside the underlying browser `File` object.
 */
export interface AttachedFile {
  // Identifier for this attachment entry.
  id: string;
  // File name.
  name: string;
  // File size — presumably bytes, mirroring File.size; confirm against the code that builds this.
  size: number;
  // File type — presumably the MIME type from File.type; confirm against the code that builds this.
  type: string;
  // The underlying File object.
  file: File;
} 

================================================
FILE: src/lib/utils.ts
================================================
import { clsx, type ClassValue } from "clsx"
import { twMerge } from "tailwind-merge"

/**
 * Combine class-name inputs with clsx, then pass the result through twMerge.
 */
export function cn(...inputs: ClassValue[]) {
  const joined = clsx(inputs)
  return twMerge(joined)
}


================================================
FILE: src/test-upload.html
================================================
<!DOCTYPE html>
<html>
<head>
    <title>Test PDF Upload</title>
</head>
<body>
    <h1>Test PDF Upload</h1>
    <form id="uploadForm">
        <input type="file" id="fileInput" accept=".pdf,.docx,.doc,.html,.htm,.md,.txt" />
        <button type="submit">Upload PDF</button>
    </form>
    
    <div id="result"></div>
    
    <script>
        // Manual test harness: POST the selected file to a hard-coded session's
        // upload endpoint as multipart field `file_0` and show the JSON response.
        document.getElementById('uploadForm').addEventListener('submit', async (e) => {
            e.preventDefault();
            
            const fileInput = document.getElementById('fileInput');
            const resultEl = document.getElementById('result');
            const file = fileInput.files[0];
            
            if (!file) {
                alert('Please select a file');
                return;
            }
            
            console.log('Selected file:', {
                name: file.name,
                size: file.size,
                type: file.type,
                lastModified: file.lastModified
            });
            
            const formData = new FormData();
            formData.append('file_0', file);
            
            try {
                const response = await fetch('http://localhost:8000/sessions/4b545007-f13f-4bc8-be69-3f0633645e52/upload', {
                    method: 'POST',
                    body: formData
                });
                
                const result = await response.json();

                // Render through textContent, never innerHTML: the response can echo
                // user-controlled strings (e.g. the uploaded file name), which would
                // otherwise be parsed as markup.
                const pre = document.createElement('pre');
                pre.textContent = JSON.stringify(result, null, 2);
                resultEl.textContent = '';
                resultEl.appendChild(pre);
                console.log('Upload result:', result);
                
            } catch (error) {
                console.error('Upload failed:', error);
                resultEl.textContent = 'Upload failed: ' + error.message;
            }
        });
    </script>
</body>
</html>    

================================================
FILE: src/utils/textNormalization.ts
================================================
/**
 * Comprehensive text normalization utility for cleaning up excessive whitespace
 * in streaming markdown responses to prevent large visual gaps in the UI.
 */

/**
 * Collapse excessive whitespace in a block of text.
 *
 * - trailing spaces/tabs are removed from every line (whitespace-only
 *   lines become empty lines),
 * - runs of three or more newlines become exactly one blank line,
 * - runs of three or more spaces/tabs become a single space,
 * - the result is trimmed at both ends.
 *
 * @param text - Text to normalize.
 * @returns The normalized text, or '' for empty / non-string input.
 */
export function normalizeWhitespace(text: string): string {
  if (!text || typeof text !== 'string') {
    return '';
  }

  return text
    // Strip trailing whitespace FIRST: lines holding only spaces/tabs must be
    // empty before newline runs are collapsed, otherwise "\n \n \n \n" slips
    // past the newline rule and leaves more than one blank line behind.
    .replace(/[ \t]+$/gm, '')
    .replace(/\n{3,}/g, '\n\n')
    .replace(/[ \t]{3,}/g, ' ')
    .trim();
}

/**
 * Append a streamed token to the text accumulated so far and normalize
 * the result, so excessive whitespace does not pile up during real-time
 * generation.
 *
 * Unlike a plain `normalizeWhitespace(currentText + newToken)`, trailing
 * whitespace of the combined text is kept (collapsed to at most one
 * blank line, or a single space for long space runs). Trimming it would
 * glue the next token onto the previous word or heading, e.g.
 * "Title\n\n" followed by "Body" would become "TitleBody".
 *
 * @param currentText - Text accumulated so far.
 * @param newToken - Newly received token.
 * @returns The normalized accumulated text; `currentText` unchanged when
 *   the token is empty or not a string.
 */
export function normalizeStreamingToken(currentText: string, newToken: string): string {
  if (!newToken || typeof newToken !== 'string') {
    return currentText;
  }

  const combined = currentText + newToken;

  // Split off the trailing whitespace so it survives normalization.
  const body = combined.trimEnd();
  const tail = combined.slice(body.length);

  const newlineCount = tail.split('\n').length - 1;
  let keptTail = tail;
  if (newlineCount > 0) {
    // Keep the line/paragraph break, capped at one blank line.
    keptTail = '\n'.repeat(Math.min(newlineCount, 2));
  } else if (tail.length >= 3) {
    // Same rule normalizeWhitespace applies to long space/tab runs.
    keptTail = ' ';
  }

  return normalizeWhitespace(body) + keptTail;
}

/**
 * Report whether the text contains whitespace worth normalizing:
 * three or more consecutive newlines, three or more consecutive
 * spaces/tabs, or three line breaks separated only by spaces/tabs.
 * Empty or non-string input yields false.
 */
export function hasExcessiveWhitespace(text: string): boolean {
  if (typeof text !== 'string' || text.length === 0) {
    return false;
  }

  const excessivePatterns = [
    /\n{3,}/,
    /[ \t]{3,}/,
    /[ \t]*\n[ \t]*\n[ \t]*\n/,
  ];

  return excessivePatterns.some((pattern) => pattern.test(text));
}


================================================
FILE: start-docker.sh
================================================
#!/bin/bash

# LocalGPT Docker Startup Script
# This script provides easy options for running LocalGPT in Docker

set -e

echo "🐳 LocalGPT Docker Deployment"
echo "============================"

# Probe the local Ollama HTTP API on port 11434.
# Returns 0 (and prints a confirmation) when it answers, 1 otherwise.
check_local_ollama() {
    if ! curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
        echo "❌ No local Ollama detected on port 11434"
        return 1
    fi

    echo "✅ Local Ollama detected on port 11434"
    return 0
}

# Bring the stack up against an Ollama instance running on the host.
# Configuration is taken from docker.env.
start_with_local_ollama() {
    printf '%s\n' \
        "🚀 Starting LocalGPT containers (using local Ollama)..." \
        "📝 Note: Make sure your local Ollama is running on port 11434"
    
    docker compose --env-file docker.env up --build -d
    
    printf '%s\n' \
        "" \
        "🎉 LocalGPT is starting up!" \
        "📱 Frontend: http://localhost:3000" \
        "🔧 Backend API: http://localhost:8000" \
        "🧠 RAG API: http://localhost:8001" \
        "🤖 Ollama: http://localhost:11434 (local)" \
        "" \
        "📊 Check container status: docker compose ps" \
        "📝 View logs: docker compose logs -f" \
        "🛑 Stop services: docker compose down"
}

# Bring the stack up together with the Ollama container (compose profile
# "with-ollama"). OLLAMA_HOST is exported so it points at that container.
start_with_container_ollama() {
    echo "🚀 Starting LocalGPT containers (including Ollama container)..."
    
    export OLLAMA_HOST=http://ollama:11434
    
    docker compose --profile with-ollama up --build -d
    
    printf '%s\n' \
        "" \
        "🎉 LocalGPT is starting up!" \
        "📱 Frontend: http://localhost:3000" \
        "🔧 Backend API: http://localhost:8000" \
        "🧠 RAG API: http://localhost:8001" \
        "🤖 Ollama: http://localhost:11434 (containerized)" \
        "" \
        "⏳ Note: First startup may take longer as Ollama container initializes" \
        "📊 Check container status: docker compose --profile with-ollama ps" \
        "📝 View logs: docker compose --profile with-ollama logs -f" \
        "🛑 Stop services: docker compose --profile with-ollama down"
}

# Print the command-line help text.
show_usage() {
    printf '%s\n' \
        "Usage: $0 [option]" \
        "" \
        "Options:" \
        "  local     - Use local Ollama instance (default)" \
        "  container - Use containerized Ollama" \
        "  stop      - Stop all containers" \
        "  logs      - Show container logs" \
        "  status    - Show container status" \
        "  help      - Show this help message" \
        "" \
        "Examples:" \
        "  $0 local      # Use local Ollama (recommended)" \
        "  $0 container  # Use containerized Ollama" \
        "  $0 stop       # Stop all services"
}

# Stop the default services, then the "with-ollama" profile services.
# A failure of the profile teardown is ignored.
stop_containers() {
    echo "🛑 Stopping LocalGPT containers..."
    docker compose down
    if ! docker compose --profile with-ollama down 2>/dev/null; then
        :
    fi
    echo "✅ All containers stopped"
}

# Follow the compose logs; add the "with-ollama" profile when a
# rag-ollama container shows up in `docker compose ps`.
show_logs() {
    echo "📝 Showing container logs (Ctrl+C to exit)..."

    local -a profile_args=()
    if docker compose ps | grep -q "rag-ollama"; then
        profile_args=(--profile with-ollama)
    fi

    docker compose "${profile_args[@]}" logs -f
}

# Print the compose service status followed by every running container
# whose `docker ps` line mentions "rag-" (plus the header row).
show_status() {
    echo "📊 Container Status:"
    docker compose ps
    printf '%s\n' "" "🐳 All Docker containers:"
    if ! docker ps | grep -E "(rag-|CONTAINER)"; then
        echo "No LocalGPT containers running"
    fi
}

# Main script logic: dispatch on the first argument (defaults to "local").
case "${1:-local}" in
    "local")
        if check_local_ollama; then
            start_with_local_ollama
        else
            echo ""
            echo "⚠️  No local Ollama detected. Options:"
            echo "1. Start local Ollama: 'ollama serve'"
            echo "2. Use containerized Ollama: '$0 container'"
            echo ""
            # `read` returns non-zero at EOF or without a usable stdin (CI,
            # piped input). Under `set -e` that would abort the script silently,
            # so treat a failed read as "no" and fall through to the message.
            read -p "Start with containerized Ollama instead? (y/N): " -n 1 -r || REPLY=""
            echo
            if [[ $REPLY =~ ^[Yy]$ ]]; then
                start_with_container_ollama
            else
                echo "❌ Cancelled. Please start local Ollama or use '$0 container'"
                exit 1
            fi
        fi
        ;;
    "container")
        start_with_container_ollama
        ;;
    "stop")
        stop_containers
        ;;
    "logs")
        show_logs
        ;;
    "status")
        show_status
        ;;
    "help"|"-h"|"--help")
        show_usage
        ;;
    *)
        echo "❌ Unknown option: $1"
        echo ""
        show_usage
        exit 1
        ;;
esac 

================================================
FILE: system_health_check.py
================================================
#!/usr/bin/env python3
"""
System Health Check for RAG System
Quick validation of configurations, models, and data access.
"""

import sys
import traceback
from pathlib import Path

def print_status(message, success=None):
    """Print ``message`` prefixed with a status emoji.

    ``success`` selects the prefix: exactly ``True`` means passed,
    exactly ``False`` means failed, anything else is informational.
    """
    if success is True:
        prefix = "✅"
    elif success is False:
        prefix = "❌"
    else:
        prefix = "🔍"
    print(f"{prefix} {message}")

def check_imports():
    """Check that the names this script needs can be imported from rag_system.main.

    Returns True on success, False (after reporting the error) otherwise.
    """
    print_status("Testing basic imports...")
    try:
        # The names are imported only to prove they resolve.
        from rag_system.main import get_agent, EXTERNAL_MODELS, OLLAMA_CONFIG, PIPELINE_CONFIGS
        print_status("Basic imports successful", True)
    except Exception as exc:
        print_status(f"Import failed: {exc}", False)
        return False
    return True

def check_configurations():
    """Print the configuration dictionaries and comment on the embedding model.

    Returns True when the configuration could be read, False on any error.
    """
    print_status("Checking configurations...")
    try:
        from rag_system.main import EXTERNAL_MODELS, OLLAMA_CONFIG, PIPELINE_CONFIGS

        print(f"📊 External Models: {EXTERNAL_MODELS}")
        print(f"📊 Ollama Config: {OLLAMA_CONFIG}")
        print(f"📊 Pipeline Configs: {PIPELINE_CONFIGS}")

        # Known models get their dimensionality spelled out; anything else is
        # flagged so the operator checks it against the indexed data.
        embedding_model = EXTERNAL_MODELS.get("embedding_model", "Unknown")
        if "bge-small" in embedding_model:
            note, verdict = f"Embedding model: {embedding_model} (384 dims)", True
        elif "Qwen3-Embedding" in embedding_model:
            note, verdict = f"Embedding model: {embedding_model} (1024 dims) - Check data compatibility!", None
        else:
            note, verdict = f"Embedding model: {embedding_model} - Verify dimensions!", None
        print_status(note, verdict)

        print_status("Configuration check completed", True)
        return True
    except Exception as exc:
        print_status(f"Configuration check failed: {exc}", False)
        return False

def check_agent_initialization():
    """Create the 'default' agent.

    Returns the agent on success; on failure prints the traceback and returns None.
    """
    print_status("Testing agent initialization...")
    try:
        from rag_system.main import get_agent
        default_agent = get_agent('default')
        print_status("Agent initialization successful", True)
    except Exception as exc:
        print_status(f"Agent initialization failed: {exc}", False)
        traceback.print_exc()
        return None
    return default_agent

def check_embedding_model(agent):
    """Embed a one-word sample with the agent's text embedder and report its dimension.

    Returns True when an embedding was produced, False on any error.
    """
    print_status("Testing embedding model...")
    try:
        text_embedder = agent.retrieval_pipeline._get_text_embedder()
        sample_embeddings = text_embedder.create_embeddings(['test'])

        model_name = getattr(text_embedder.model, 'name_or_path', 'Unknown')
        # Second axis of the returned array is reported as the vector dimension.
        dimensions = sample_embeddings.shape[1]

        print_status(f"Embedding model: {model_name}", True)
        print_status(f"Vector dimension: {dimensions}", True)

        # Only the two known dimensionalities get an extra compatibility note.
        if dimensions == 384:
            print_status("Using 384-dim embeddings (bge-small compatible)", True)
        elif dimensions == 1024:
            print_status("Using 1024-dim embeddings (Qwen3 compatible) - Ensure data compatibility!", None)

        return True
    except Exception as exc:
        print_status(f"Embedding model test failed: {exc}", False)
        return False

def check_database_access():
    """Connect to the LanceDB directory ./lancedb and list its tables.

    Prints at most five table names. Returns True when the connection
    worked (even with zero tables), False on any error.
    """
    print_status("Testing database access...")
    try:
        import lancedb
        tables = lancedb.connect('./lancedb').table_names()

        print_status(f"LanceDB connected - {len(tables)} tables available", True)
        if not tables:
            print_status("No tables found - may need to index documents first", None)
        else:
            print("📋 Available tables:")
            for table_name in tables[:5]:
                print(f"   - {table_name}")
            hidden_count = len(tables) - 5
            if hidden_count > 0:
                print(f"   ... and {hidden_count} more")

        return True
    except Exception as exc:
        print_status(f"Database access failed: {exc}", False)
        return False

def check_sample_query(agent):
    """Run one sample question against the first LanceDB table, if any.

    Returns True when the query ran (or there was nothing to query),
    False when an exception was raised.
    """
    print_status("Testing sample query...")
    try:
        import lancedb
        tables = lancedb.connect('./lancedb').table_names()

        if not tables:
            print_status("No tables available for query test", None)
            return True

        # The first table returned is used as the query target.
        table_name = tables[0]
        print_status(f"Testing query on table: {table_name}")

        result = agent.run('what is this document about?', table_name=table_name)

        if not (result and 'answer' in result):
            print_status("Query returned empty result", None)
        else:
            print_status("Sample query successful", True)
            print(f"📝 Answer preview: {result['answer'][:100]}...")
            print(f"📊 Found {len(result.get('source_documents', []))} source documents")

        return True
    except Exception as exc:
        print_status(f"Sample query failed: {exc}", False)
        return False

def main():
    """Run every health check and return a process exit code.

    Returns 0 when at most one of the six checks failed, 1 otherwise.
    """
    print("🏥 RAG System Health Check")
    print("=" * 50)

    total_checks = 6

    # Checks that do not need an agent; the list literal keeps their order.
    outcomes = [
        bool(check_imports()),
        bool(check_configurations()),
        bool(check_database_access()),
    ]

    # The remaining checks only make sense once an agent exists.
    agent = check_agent_initialization()
    if agent:
        outcomes.append(True)
        outcomes.append(bool(check_embedding_model(agent)))
        outcomes.append(bool(check_sample_query(agent)))

    checks_passed = sum(outcomes)

    print("\n" + "=" * 50)
    print(f"🏥 Health Check Complete: {checks_passed}/{total_checks} checks passed")

    failed = total_checks - checks_passed
    if failed == 0:
        print_status("System is healthy! 🎉", True)
        return 0
    if failed <= 1:
        print_status("System mostly healthy with minor issues", None)
        return 0
    print_status("System has significant issues that need attention", False)
    return 1

# Script entry point: the return value of main() becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main()) 

================================================
FILE: tailwind.config.js
================================================
/** @type {import('tailwindcss').Config} */
// Tailwind CSS configuration: content globs, no theme extensions, no plugins.
module.exports = {
  // File globs listed as Tailwind content sources.
  content: [
    './src/**/*.{js,ts,jsx,tsx}',
    // NOTE(review): already matched by the glob above; kept as in the original.
    './src/components/**/*.{js,ts,jsx,tsx}',
  ],
  theme: {
    extend: {},
  },
  plugins: [],
} 

================================================
FILE: test_docker_build.sh
================================================
#!/bin/bash

# Test Docker builds individually
echo "🐳 Testing Docker builds individually..."

# Abort the whole script unless the Docker daemon answers `docker version`.
check_docker() {
    if docker version >/dev/null 2>&1; then
        echo "✅ Docker is running"
        return
    fi

    echo "❌ Docker is not running. Please start Docker Desktop."
    exit 1
}

# Build one image, start it as a throwaway container, probe it over HTTP,
# then remove the container again.
#   $1 - service name (image is tagged rag-<service>, container test-<service>)
#   $2 - Dockerfile path
#   $3 - port, published 1:1 on localhost
# Returns 1 when the build or the container start fails, 0 otherwise
# (a failed health probe only prints a warning and the last log lines).
build_and_test() {
    local service=$1
    local dockerfile=$2
    local port=$3
    local image="rag-$service"
    local container="test-$service"
    
    echo ""
    echo "🔨 Building $service..."
    # Quote the Dockerfile path so paths with spaces are passed as one argument.
    if ! docker build -f "$dockerfile" -t "$image" .; then
        echo "❌ Failed to build $service"
        return 1
    fi
    
    echo "✅ $service built successfully"
    
    # A container left behind by an aborted earlier run would make
    # `docker run --name` fail with a name conflict; remove it first.
    docker rm -f "$container" >/dev/null 2>&1
    
    # Test running the container
    echo "🚀 Testing $service container..."
    if ! docker run -d --name "$container" -p "$port:$port" "$image"; then
        echo "❌ Failed to run $service"
        # Do not leave a half-created container behind.
        docker rm -f "$container" >/dev/null 2>&1
        return 1
    fi
    
    echo "⏳ Waiting for $service to start..."
    sleep 10
    
    # Test health: record the probe result explicitly instead of relying on
    # $? surviving the branch. Services without a known probe count as healthy.
    local health_status=0
    case "$service" in
        frontend)
            curl -f "http://localhost:$port" >/dev/null 2>&1 || health_status=$?
            ;;
        backend)
            curl -f "http://localhost:$port/health" >/dev/null 2>&1 || health_status=$?
            ;;
        rag-api)
            curl -f "http://localhost:$port/models" >/dev/null 2>&1 || health_status=$?
            ;;
    esac
    
    if [ "$health_status" -eq 0 ]; then
        echo "✅ $service is healthy"
    else
        echo "⚠️ $service health check failed (but container is running)"
        docker logs "$container" | tail -10
    fi
    
    # Cleanup
    docker stop "$container" >/dev/null 2>&1
    docker rm "$container" >/dev/null 2>&1
    
    return 0
}

# Main execution: verify Docker, prune leftovers, then build and smoke-test
# each image, stopping at the first failure.
check_docker

echo "🧹 Cleaning up old containers and images..."
docker container prune -f >/dev/null 2>&1
docker image prune -f >/dev/null 2>&1

echo "📦 Building containers in dependency order..."

# 1. RAG API (no dependencies)
build_and_test "rag-api" "Dockerfile.rag-api" "8001" || {
    echo "❌ RAG API build failed, stopping"
    exit 1
}

# 2. Backend (depends on RAG API)
build_and_test "backend" "Dockerfile.backend" "8000" || {
    echo "❌ Backend build failed, stopping"
    exit 1
}

# 3. Frontend (depends on Backend)
build_and_test "frontend" "Dockerfile.frontend" "3000" || {
    echo "❌ Frontend build failed, stopping"
    exit 1
}

printf '%s\n' \
    "" \
    "🎉 All containers built and tested successfully!" \
    "🚀 You can now run: ./start-docker.sh" 

================================================
FILE: test_markdown_streaming.js
================================================

// Fixture: a complete markdown response whose paragraphs and headings are
// separated by runs of blank lines, fed to both cleanup functions below.
const testMarkdownWithExcessiveNewlines = `# Test Response

This is a test response with excessive newlines.


Here's some content after multiple empty lines.


## Section Header

More content here.


### Subsection

Final content with lots of spacing.


The end.`;

// Fixture: the same kind of response split into streaming tokens; several
// tokens end in long newline runs (4 to 7 consecutive "\n").
const testStreamingTokens = [
  "# Test Response\n\n",
  "This is a test response",
  " with excessive newlines.\n\n\n\n",
  "Here's some content after",
  " multiple empty lines.\n\n\n\n\n",
  "## Section Header\n\n",
  "More content here.\n\n\n\n\n\n\n",
  "### Subsection\n\n",
  "Final content with lots",
  " of spacing.\n\n\n\n\n",
  "The end."
];

// Baseline cleanup: collapse every run of three or more newlines into
// exactly two (one blank line). Nothing else is touched.
function currentCleanup(text) {
  const segments = text.split(/\n{3,}/);
  return segments.join('\n\n');
}

// Improved cleanup: strip trailing spaces/tabs from every line, collapse
// newline runs to one blank line, collapse runs of 3+ spaces/tabs to a
// single space, and trim both ends.
function improvedCleanup(text) {
  return text
    // Trailing whitespace goes first so that whitespace-only lines are empty
    // before newline runs are collapsed; in the opposite order an input like
    // "a\n \n \n \n \nb" keeps more than one blank line.
    .replace(/[ \t]+$/gm, '')
    .replace(/\n{3,}/g, '\n\n')
    .replace(/[ \t]{3,}/g, ' ')
    .trim();
}

// Print the fixture untouched and after each cleanup, JSON-encoded so the
// newlines are visible.
const fullTextReports = [
  ["=== ORIGINAL TEXT ===", (text) => text],
  ["\n=== CURRENT CLEANUP ===", currentCleanup],
  ["\n=== IMPROVED CLEANUP ===", improvedCleanup],
];
for (const [heading, transform] of fullTextReports) {
  console.log(heading);
  console.log(JSON.stringify(transform(testMarkdownWithExcessiveNewlines)));
}

// Replay the tokens one at a time and show what each cleanup makes of the
// text accumulated so far.
console.log("\n=== STREAMING SIMULATION ===");
let streamedText = "";
for (const [index, token] of testStreamingTokens.entries()) {
  streamedText += token;
  const tokenNumber = index + 1;
  console.log(`Token ${tokenNumber}: "${token}"`);
  console.log(`Accumulated (current): "${currentCleanup(streamedText)}"`);
  console.log(`Accumulated (improved): "${improvedCleanup(streamedText)}"`);
  console.log("---");
}


================================================
FILE: tsconfig.json
================================================
{
  "compilerOptions": {
    "target": "ES2017",
    "lib": ["dom", "dom.iterable", "esnext"],
    "allowJs": true,
    "skipLibCheck": true,
    "strict": true,
    "noEmit": true,
    "esModuleInterop": true,
    "module": "esnext",
    "moduleResolution": "bundler",
    "resolveJsonModule": true,
    "isolatedModules": true,
    "jsx": "preserve",
    "incremental": true,
    "plugins": [
      {
        "name": "next"
      }
    ],
    "paths": {
      "@/*": ["./src/*"]
    }
  },
  "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
  "exclude": ["node_modules"]
}