[
  {
    "path": ".gitignore",
    "content": "/audio_data.transcription.json\n/db.ipynb\n/elevenlabs.ipynb\n/main.ipynb\n/venv\n/stt/__init__.py\n/data\n/logs\n/.env\n/app/nlp/pos_tagger.model"
  },
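  {
    "path": "README.md",
    "content": "# STT Pipeline\n\n> Draft overview added for orientation; the details below are inferred from the code in this repo.\n\nSpeech-to-text pipeline for Issabel call recordings:\n\n1. **Ingestion** - fetch `.wav` files over SFTP (`app/ingestion/fetcher.py`) or scan `data/raw`, deduplicate by SHA-256 hash, and register each file in MongoDB.\n2. **Preprocessing** - resample to 16 kHz mono, denoise, and normalize (`app/audio/preprocess.py`).\n3. **Transcription** - ElevenLabs speech-to-text with speaker diarization, Persian (`app/stt/transcribe.py`).\n4. **Postprocessing** - speaker segmentation and word-level data (`app/process/`).\n5. **RAG segmentation** - split transcripts into retrieval-sized chunks (`app/stt/rag_segments.py`).\n\nEach stage updates the recording's status in MongoDB; failed jobs are retried with exponential backoff (`app/utils/retry.py`), and a state validator recovers stuck jobs on startup.\n\nRun the pipeline:\n\n    python -m app.main\n\nRun the Streamlit dashboard:\n\n    streamlit run app/dashboard/dashboard.py\n"
  },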
  {
    "path": "app/__init__.py",
    "content": ""
  },
  {
    "path": "app/audio/preprocess.py",
    "content": "import librosa\nimport soundfile as sf\nimport noisereduce as nr\nimport os\n\n\ndef preprocess_audio(input_path, output_path):\n\n\n    try:\n\n    #loading the audio, 16khz and in mono format\n        y, sr = librosa.load(input_path, sr=16000, mono = True)\n\n    #reducing the noise\n        y  = nr.reduce_noise(\n        y=y,\n        sr=sr,\n        prop_decrease=0.8\n        )\n\n    #removing the silent time in the voices\n        #y, _ = librosa.effects.trim(y, top_db=20)\n\n    #voice normalization\n        y = librosa.util.normalize(y)\n\n        #saving the outputs\n        os.makedirs(os.path.dirname(output_path), exist_ok=True)\n\n        sf.write(output_path, y, 16000)\n\n        return output_path\n    \n    except Exception as e:\n        raise RuntimeError(f\"Preprocessing failed: {str(e)}\")\n\n\n"
  },
  {
    "path": "app/config.py",
    "content": "import os\nfrom dotenv import load_dotenv\nfrom pathlib import Path\n\nBASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n\nload_dotenv()\n\nMONGO_URL = \"mongodb://admin:secret@localhost:27017\"\n\nELEVENLABS_API_KEY = os.getenv(\"ELEVENLABS_API_KEY\")\n\n#PATHS\nRECORDINGS_PATH = os.path.join(BASE_DIR, \"data\", \"raw\") #then we can change it to the server url\n\nTEMP_DIR = os.path.join(BASE_DIR, \"data\", \"temp\")\n\n#this one is for later for when i wanted to handle errors and api calls\nMAX_RETRIES = 3\n\n#SERVER CONFIG\nISSABEL_HOST = os.getenv(\"ISSABEL_HOST\")\nISSABEL_PORT = 22\nISSABEL_USER = os.getenv(\"ISSABEL_USER\")\nISSABEL_PASSWORD = os.getenv(\"ISSABEL_PASSWORD\")\n\nISSABEL_REMOTE_PATH = os.getenv(\"ISSABEL_REMOTE_PATH\")\nLOCAL_DOWNLOAD_PATH = os.path.join(BASE_DIR, \"data\", \"raw\")\n"
  },
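  {
    "path": ".env.example",
    "content": "# Illustrative template for the variables read in app/config.py; all values\n# below are placeholders. Copy to .env and fill in real credentials\n# (.env itself is gitignored).\nMONGO_URL=mongodb://admin:secret@localhost:27017\nELEVENLABS_API_KEY=your-elevenlabs-key\nISSABEL_HOST=issabel.example.local\nISSABEL_USER=your-user\nISSABEL_PASSWORD=your-password\nISSABEL_REMOTE_PATH=/path/to/remote/recordings\n"
  },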
  {
    "path": "app/dashboard/__init__.py",
    "content": ""
  },
  {
    "path": "app/dashboard/dashboard.py",
    "content": "import streamlit as st\nfrom collections import Counter\nfrom pymongo import MongoClient\nimport pandas as pd\n\nMONGO_URL = \"mongodb://admin:secret@localhost:27017\"\n\nclient = MongoClient(MONGO_URL)\ndb = client.audio_data\n\nrecordings = db['recordings']\ntranscription = db['transcription']\nnlp = db['nlp']\n\nst.set_page_config(layout=\"wide\")\nst.title(\"🎙️ STT Pipeline Dashboard\")\n\n# =========================\n# 1. PIPELINE OVERVIEW\n# =========================\n\nst.header(\"📊 Pipeline Overview\")\n\nrecords = list(recordings.find())\n\nif records:\n    df = pd.DataFrame(records)\n\n    status_counts = df[\"status\"].value_counts().to_dict()\n\n    col1, col2, col3, col4 = st.columns(4)\n\n    col1.metric(\"Total Files\", len(df))\n    col2.metric(\"Done\", status_counts.get(\"done\", 0))\n    col3.metric(\"Failed\", status_counts.get(\"failed\", 0))\n    col4.metric(\"Processing\", status_counts.get(\"processing\", 0))\n\n    success_rate = (\n        status_counts.get(\"done\", 0) / len(df)\n        if len(df) > 0 else 0\n    )\n\n    st.progress(success_rate)\n    st.write(f\"✅ Success Rate: {round(success_rate * 100, 2)}%\")\n\nelse:\n    st.warning(\"No data available\")\n\n# =========================\n# 2. CURRENT PIPELINE STATE\n# =========================\n\nst.header(\"⚙️ Current Pipeline State\")\n\nstate_counts = Counter([r.get(\"current_stage\", \"unknown\") for r in records])\nst.write(state_counts)\n\n# =========================\n# REAL TIME LOG SYSTEM\n#==========================\nimport json\nimport pandas as pd\n\nst.header(\"📡 Logs Explorer\")\n\nlog_file = \"logs/app.json\"\n\ndef load_logs():\n    logs = []\n    try:\n        with open(log_file, \"r\") as f:\n            for line in f:\n                logs.append(json.loads(line))\n    except:\n        pass\n    return logs\n\nlogs = load_logs()\n\nif logs:\n    df_logs = pd.DataFrame(logs)\n\n    # FILTERS\n    col1, col2, col3 = st.columns(3)\n\n    level_filter = col1.selectbox(\"Level\", [\"ALL\"] + sorted(df_logs[\"level\"].unique().tolist()))\n    stage_filter = col2.selectbox(\"Stage\", [\"ALL\"] + sorted(df_logs.get(\"stage\", []).dropna().unique().tolist()))\n    rec_filter = col3.text_input(\"Recording ID\")\n\n    filtered = df_logs.copy()\n\n    if level_filter != \"ALL\":\n        filtered = filtered[filtered[\"level\"] == level_filter]\n\n    if stage_filter != \"ALL\":\n        filtered = filtered[filtered[\"stage\"] == stage_filter]\n\n    if rec_filter:\n        filtered = filtered[filtered[\"recording_id\"].astype(str).str.contains(rec_filter)]\n\n    st.dataframe(filtered.sort_values(\"timestamp\", ascending=False))\nelse:\n    st.info(\"No logs yet\")\n\n# =========================\n# 3. RECENT ACTIVITY\n# =========================\n\nst.header(\"🕒 Recent Activity\")\n\nrecent = recordings.find().sort(\"updated_at\", -1).limit(10)\n\nfor r in recent:\n    st.write({\n        \"file\": r.get(\"filename\"),\n        \"status\": r.get(\"status\"),\n        \"stage\": r.get(\"current_stage\"),\n        \"retry\": r.get(\"retry_count\"),\n    })\n\n# =========================\n# 4. 
ERROR MONITORING\n# =========================\n\nst.header(\"❌ Recent Errors\")\n\nerrors = recordings.find({\n    \"status\": {\"$in\": [\"failed\", \"permanently_failed\"]},\n    \"error\": {\"$ne\": None}\n}).sort(\"updated_at\", -1).limit(10)\n\nfor e in errors:\n    st.error({\n        \"file\": e.get(\"filename\"),\n        \"error\": e.get(\"error\"),\n        \"retry\": e.get(\"retry_count\")\n    })\n\n#=====================\n#RETRY MONITORING\n#===================\n\nst.header(\"📈 Retry Monitoring\")\n\nretry_data = [\n    {\n        \"file\": r.get(\"filename\"),\n        \"retry\": r.get(\"retry_count\"),\n        \"status\": r.get(\"status\")\n    }\n    for r in recordings.find()\n]\n\nst.dataframe(retry_data)\n\n# =========================\n# 5. TRANSCRIPTION INSIGHTS\n# =========================\n\nst.header(\"📝 Transcription Insights\")\n\ntotal_transcripts = transcription.count_documents({})\nst.write(f\"Total transcriptions: {total_transcripts}\")\n\n# =========================\n# 6. NLP INSIGHTS (optional)\n# =========================\n\nst.header(\"🧠 NLP Insights\")\n\nall_tokens = []\nfor doc in nlp.find():\n    all_tokens.extend(doc.get(\"filtered_tokens\", []))\n\nif all_tokens:\n    top_keywords = Counter(all_tokens).most_common(10)\n    st.write(\"Top Keywords:\", top_keywords)\nelse:\n    st.info(\"No NLP data yet\")\n\n\n#==========\n#FILE EXPLORER\n#==============\nst.header(\"📂 File Explorer\")\n\nrecords = list(recordings.find())\n\nfile_map = {\n    f\"{r.get('filename')} | {str(r.get('_id'))[:6]}\": r\n    for r in records\n}\n\nselected_key = st.selectbox(\"Select a file\", list(file_map.keys()))\n\nif selected_key:\n    selected = file_map[selected_key]\n    rec_id = selected[\"_id\"]\n\n    st.subheader(\"📄 Recording Metadata\")\n    st.json(selected)\n\n    # =====================\n    # TRANSCRIPTION VIEW\n    # =====================\n    trans_doc = transcription.find_one({\"recording_id\": rec_id})\n\n    if trans_doc:\n        st.subheader(\"📝 Transcription\")\n\n        st.text_area(\"Full Text\", trans_doc.get(\"text\", \"\"), height=150)\n\n        st.subheader(\"🗣️ Diarized Text\")\n        st.text_area(\"Speakers\", trans_doc.get(\"diarized_text\", \"\"), height=150)\n\n        # =====================\n        # WORDS TABLE\n        # =====================\n        st.subheader(\"🔍 Word-level Data\")\n\n        words = trans_doc.get(\"words\", [])\n        if words:\n            st.dataframe(words[:100])  # limit for performance\n\n    else:\n        st.warning(\"No transcription found\")\n\n  "
  },
  {
    "path": "app/database/__init__.py",
    "content": ""
  },
  {
    "path": "app/database/mongo_db.py",
    "content": "from pymongo import MongoClient\nfrom app.config import MONGO_URL\nfrom app.utils.logger import logger\n\n#connecting to mongodb\nclient = MongoClient(MONGO_URL)\n\n#selecting the database\ndb = client.audio_data\n\nrecordings = db['recordings']\ntranscription = db['transcription']\nnlp = db['nlp']\nrag_segments = db['rag_segments']\nlog = db['log']\n\n\n\n"
  },
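  {
    "path": "scripts/create_indexes.py",
    "content": "# Illustrative sketch (not part of the original pipeline): creates the indexes\n# the recordings collection is queried by - exact-match dedup lookups on\n# file_hash, the status/retry_count scan in the orchestrator, and the\n# status/updated_at scan in stuck-job recovery.\n# Run once with: python -m scripts.create_indexes\nfrom app.database.mongo_db import recordings\n\n\ndef create_indexes():\n    # is_duplicate() looks up every ingested file by its hash\n    recordings.create_index(\"file_hash\")\n\n    # run_pipeline() filters on status + retry_count\n    recordings.create_index([(\"status\", 1), (\"retry_count\", 1)])\n\n    # recover_stuck_jobs() filters on status + updated_at\n    recordings.create_index([(\"status\", 1), (\"updated_at\", 1)])\n\n\nif __name__ == \"__main__\":\n    create_indexes()\n    print(\"indexes created\")\n"
  },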
  {
    "path": "app/database/operation.py",
    "content": "from datetime import datetime\nfrom app.utils.logger import logger\nfrom app.config import MAX_RETRIES\nfrom app.database.mongo_db import recordings, transcription, rag_segments\nfrom app.database.schemas import build_recording_doc, build_transcription_doc, build_rag_segment \nimport os\n\n\n#function to save the recordings metadata first\ndef create_recordings(file_path,file_hash):\n    doc = build_recording_doc(file_path, file_hash)\n    return recordings.insert_one(doc).inserted_id\n\n\ndef save_transcription(recording_id, result):\n    doc = build_transcription_doc(recording_id, result)\n    transcription.insert_one(doc)\n\n\ndef save_rag_segments(recording_id, segments):\n    doc = build_rag_segment(recording_id, segments)\n    rag_segments.insert_one(doc)\n\n\n#function to update the stage of the recordings\ndef update_stage(recording_id, stage):\n    recordings.update_one(\n        {\"_id\": recording_id},\n        {\"$set\": {\"current_stage\": stage, \"status\": \"processing\"}}\n    )\n\n#function to marke the done audio\ndef mark_done(recording_id):\n    recordings.update_one(\n        {\"_id\": recording_id},\n        {\"$set\": {\"status\": \"done\",\n                   \"current_stage\": \"done\",\n                   \"updated_at\": datetime.utcnow()}}\n    )\n\n#function to mark the failed transcripted audio\ndef mark_failed(recording_id, error):\n\n    rec = recordings.find_one({\"_id\": recording_id})\n\n    if rec[\"status\"] == \"done\":\n        return\n\n    new_retry = rec.get(\"retry_count\", 0) + 1\n\n    status = \"failed\"\n    if new_retry >= MAX_RETRIES:\n        status = \"permanently_failed\"\n\n    recordings.update_one(\n        {\"_id\": recording_id},\n        {\n            \"$set\": {\n                \"status\": status,\n                \"error\": error,\n                \"current_stage\": \"failed\",\n                \"updated_at\": datetime.utcnow()\n            },\n            \"$inc\": {\"retry_count\": 1}\n        }\n    )\n\n"
  },
  {
    "path": "app/database/schemas.py",
    "content": "from datetime import datetime\nimport os\n\n#recordings collection\ndef build_recording_doc(path, file_hash):\n    return{\n        \"filename\": os.path.basename(path),\n\n        \"raw_path\": path,\n        \"processed_path\": None,\n\n        \"file_hash\": file_hash,\n        \"file_size\": os.path.getsize(path),\n        \"last_modified\": os.path.getmtime(path),\n\n        #pipeline status\n        \"status\" : \"pending\",\n        \"current_stage\": \"ingestion\",\n\n        \"stages\":{\n            \"preprocessing\": \"pending\",\n            \"transcription\": \"pending\",\n            \"postprocessing\": \"pending\",\n            \"rag_segmentation\" : \"pending\"\n        },\n\n        \"retry_count\": 0,\n        \"error\": None,\n\n        \"created_at\": datetime.utcnow(),\n        \"updated_at\": datetime.utcnow()\n\n    }\n\n#transcription segments\ndef build_transcription_doc(recording_id, result): \n    return {\n        \"recording_id\": recording_id,\n\n        \"text\": result.get(\"text\"),\n        \"diarized_text\": result.get(\"diarized_text\"),\n        \"speaker_segments\": result.get(\"segments\"),\n        \"words\": result.get(\"words\"),\n\n        \"audio_duration_secs\": result.get(\"audio_duration_secs\"),\n\n        \"created_at\": datetime.utcnow()\n    }\n\n#rag_Segment collection\ndef build_rag_segment(recording_id, segments):\n\n    return{\n        \"recording_id\": recording_id,\n        \"segments\": segments,\n        \"created_at\": datetime.utcnow()\n    }\n\n"
  },
  {
    "path": "app/ingestion/__init__.py",
    "content": ""
  },
  {
    "path": "app/ingestion/deduplicator.py",
    "content": "from app.database.mongo_db import recordings\nfrom app.utils.logger import logger\n\ndef is_duplicate(file_hash):\n    return recordings.find_one({\"file_hash\": file_hash}) is not None"
  },
  {
    "path": "app/ingestion/fetcher.py",
    "content": "import os\nfrom app.ingestion.issabelle_client import create_sftp_client\nfrom app.config import ISSABEL_REMOTE_PATH, LOCAL_DOWNLOAD_PATH\n\n\ndef fetch_new_files():\n    sftp, transport = create_sftp_client()\n\n    os.makedirs(LOCAL_DOWNLOAD_PATH, exist_ok=True)\n\n    downloaded = []\n\n    try:\n        for file in sftp.listdir(ISSABEL_REMOTE_PATH):\n\n            if not file.endswith(\".wav\"):\n                continue\n\n            remote_path = f\"{ISSABEL_REMOTE_PATH}/{file}\"\n            local_path = os.path.join(LOCAL_DOWNLOAD_PATH, file)\n\n            if os.path.exists(local_path):\n                continue\n\n            sftp.get(remote_path, local_path)\n            downloaded.append(local_path)\n\n    finally:\n        sftp.close()\n        transport.close()\n\n    return downloaded"
  },
  {
    "path": "app/ingestion/hashing.py",
    "content": "import hashlib\nfrom app.utils.logger import logger\n\ndef compute_file_hash(file_path, chunk_size = 8192):\n\n    sha256 = hashlib.sha256()\n\n    with open(file_path, \"rb\") as f:\n        while chunk := f.read(chunk_size):\n            sha256.update(chunk)\n\n    return sha256.hexdigest()"
  },
  {
    "path": "app/ingestion/issabelle_client.py",
    "content": "import paramiko\nimport os\nfrom app.config import (\n    ISSABEL_HOST,\n    ISSABEL_PORT,\n    ISSABEL_USER,\n    ISSABEL_PASSWORD\n)\n\n\ndef create_sftp_client():\n    transport = paramiko.Transport((ISSABEL_HOST, ISSABEL_PORT))\n    transport.connect(username= ISSABEL_USER, password = ISSABEL_PASSWORD)\n\n    sftp = paramiko.SFTPClient.from_transport(transport)\n\n    return sftp, transport"
  },
  {
    "path": "app/ingestion/load_audio.py",
    "content": "import os\nfrom app.config import RECORDINGS_PATH\nfrom app.utils.logger import logger\n\nSUPPORTED_FORMATS = (\".wav\", \".mp3\", \".flac\", \".m4a\")\n\ndef get_audio_files():\n    audio_files = []\n\n    base_path = os.path.abspath(RECORDINGS_PATH)\n\n    for root, _, files in os.walk(base_path):\n        for file in files:\n            if file.lower().endswith(SUPPORTED_FORMATS):\n                full_path = os.path.abspath(os.path.join(root, file))\n                audio_files.append(full_path)\n\n    #sorting the files\n    audio_files.sort()\n\n    return audio_files\n\n\"\"\"\n#previous code \n    for file in os.listdir(RECORDINGS_PATH):\n        if file.endswith(\".wav\") or file.endswith(\".mp3\"):\n            audio_files.append(os.path.join(RECORDINGS_PATH, file))\n    return audio_files\n\"\"\""
  },
  {
    "path": "app/ingestion/runner.py",
    "content": "from app.database.mongo_db import recordings\nfrom app.database.operation import create_recordings\nfrom app.ingestion.hashing import compute_file_hash\nfrom app.ingestion.deduplicator import is_duplicate\nfrom app.ingestion.load_audio import get_audio_files\nfrom app.utils.logger import logger\nimport os\nfrom datetime import datetime\nimport hashlib\n\n\ndef ingest_new_files():\n\n\n    logger.info(\"Starting ingestion\")\n\n    audio_files = get_audio_files()\n\n    new_count =0\n    skipped_count = 0\n\n    for path in audio_files:\n\n        try:\n            file_hash = compute_file_hash(path)\n\n            if is_duplicate(file_hash):\n                skipped_count += 1\n                logger.warning(f\"Duplicate file skipped: {path}\")\n                continue\n\n            create_recordings(path, file_hash)\n            new_count +=1\n\n        except Exception as e:\n            logger.error(f\"Ingestion failed for {path}\",\n                         extra = {\"error\": str(e)})\n            \n    logger.info(f\"Ingestion done: {new_count} new, {skipped_count} skipped\")\n"
  },
  {
    "path": "app/main.py",
    "content": "import asyncio\nfrom app.pipeline.orchestrator import run_pipeline\nfrom app.ingestion.runner import ingest_new_files\nfrom app.utils.state_validator import run_state_validator\n\n\n\nif __name__ == \"__main__\":\n\n    #validate states before starting the pipeline\n    run_state_validator()\n\n    #detect and ingest new files\n    ingest_new_files()\n    \n    #processing the files\n    asyncio.run(run_pipeline())\n\n\n"
  },
  {
    "path": "app/maintanance/cleanup.py",
    "content": "import os\nfrom app.database.mongo_db import recordings\nfrom app.utils.logger import logger\n\nSAFE_TO_DELETE_STATUS = {\"done\", \"permanently_failed\"}\n\ndef cleanup_files(dry_run: bool = True):\n\n    logger.info(\"Starting cleanup job\")\n\n    records = recordings.find({\n        \"status\": {\"$in\": list(SAFE_TO_DELETE_STATUS)}\n    })\n\n    deleted = 0\n    skipped = 0\n\n    for rec in records:\n\n        raw_path = rec.get(\"raw_path\")\n        processed_path = rec.get(\"processed_path\")\n        recording_id = rec[\"_id\"]\n\n        \n        logger.info(\n            \"checking record\",\n            extra={\n            \"recording_id\": str(rec[\"_id\"]),\n            \"status\": rec.get(\"status\"),\n            \"raw_path\": rec.get(\"raw_path\")\n            }\n         )\n\n\n        for path in [raw_path, processed_path]:\n\n            if not path:\n                continue\n\n            if not os.path.exists(path):\n                skipped +=1\n                continue\n\n            if dry_run:\n                logger.info(\n                    \"DRY RUN - would delete file\",\n                    extra ={\n                        \"recording_id\": str(recording_id), \n                        \"path\": path\n                    }\n                )\n                continue\n\n            try:\n                os.remove(path)\n                deleted += 1\n\n                logger.info(\n                    \"File deleted safely\",\n                    extra = {\n                        \"recording_id\": str(recording_id),\n                        \"path\": path\n                    }\n                )\n\n            except Exception as e:\n                logger.error(\n                    \"Failed to delete file\",\n                    extra = {\n                        \"recording_id\": str(recording_id),\n                        \"path\": path,\n                        \"error\": str(e)\n                    }\n                )\n\n    logger.info(\n        \"Cleanup finished\",\n        extra ={\n            \"deleted\":deleted,\n            \"skipped\": skipped\n        }\n    )\n\n#to run seperatly\ncleanup_files()"
  },
  {
    "path": "app/pipeline/__init__.py",
    "content": ""
  },
  {
    "path": "app/pipeline/orchestrator.py",
    "content": "import asyncio\nfrom app.database.mongo_db import recordings\nfrom app.pipeline.worker import process_audio\nfrom app.config import MAX_RETRIES\nfrom app.utils.logger import logger\n\n\"\"\"\nfrom app.queue import queue\nfrom app.pipeline.worker import process_audio_sync\n\"\"\"\n\n\nasync def run_pipeline():\n    logger.info(\"Pipeline started\")\n\n\n    #fetching the pending records from the database\n    cursor = recordings.find({\n        \"status\": {\"$in\": [\"pending\", \"failed\"]},\n        \"retry_count\": {\"$lt\": MAX_RETRIES}\n    })\n\n    tasks = [process_audio(r) for r in cursor]\n\n    results =await asyncio.gather(*tasks, return_exceptions=True)\n\n    for r in results:\n        if isinstance(r, Exception):\n            logger.error(f\"Unhandled exception in pipeline: {r}\")\n    logger.info(\"Pipeline finished\")\n    \n\n    \"\"\"\n    # app/pipeline/orchestrator.py\n\nfrom app.queue import queue\nfrom app.pipeline.worker import process_audio_sync\n\ndef run_pipeline():\n    logger.info(\"Pipeline started\")\n\n    cursor = recordings.find({\n        \"status\": {\"$in\": [\"pending\", \"failed\"]},\n        \"retry_count\": {\"$lt\": MAX_RETRIES}\n    })\n\n    for record in cursor:\n        queue.enqueue(\n            process_audio_sync,\n            record,\n            job_timeout=600  # 10 min\n        )\n\n    logger.info(\"Jobs enqueued\")\n    \"\"\""
  },
  {
    "path": "app/pipeline/worker.py",
    "content": "import asyncio\nimport os\nfrom concurrent.futures import ProcessPoolExecutor\n\nfrom app.utils.logger import logger\nfrom app.utils.retry import retry_async\n\nfrom app.database.mongo_db import recordings\n\nfrom app.utils.stage_manager import (\n    is_stage_done,\n    mark_stage_done,\n    mark_stage_failed\n)\n\nfrom app.config import TEMP_DIR\nfrom app.audio.preprocess import preprocess_audio\nfrom app.stt.transcribe import transcribe\n\nfrom app.process.segmentation import segment_speakers\nfrom app.process.utils import words_to_dict, print_transcript_by_speaker\n\nfrom app.database.operation import (\n    update_stage,\n    mark_done,\n    mark_failed,\n    save_transcription,\n    save_rag_segments\n)\n\nfrom app.stt.rag_segments import build_rag_segments\n\nsemaphore = asyncio.Semaphore(3)\nexecutor = ProcessPoolExecutor(max_workers=os.cpu_count() // 2 or 1)\n\n\nasync def process_audio(record):\n    recording_id = record[\"_id\"]\n    success = False\n    processed_path = record.get(\"processed_path\")\n\n    logger.info(\"Worker started\", extra={\"recording_id\": recording_id})\n\n    try:\n        #to remove the temp file after the successfully done process\n        processed_path = None\n        raw_path = record.get(\"raw_path\")\n\n        if not raw_path or not os.path.exists(raw_path):\n            raise FileNotFoundError(f\"Raw file missing: {raw_path}\")\n\n        # ---------------- PREPROCESS ----------------\n        update_stage(recording_id, \"preprocessing\")\n\n        logger.info(\"Preprocessing started\",\n                    extra = {\"recording_Id\": recording_id})\n\n        temp_path = os.path.join(TEMP_DIR, f\"{recording_id}.wav\")\n\n        async def run_preprocessing():\n            loop = asyncio.get_event_loop()\n            return await loop.run_in_executor(\n                executor,\n                preprocess_audio,\n                raw_path,\n                temp_path\n            )\n\n        processed_path = await retry_async(\n            run_preprocessing,\n            retries=2,\n            recording_id=recording_id,\n            stage=\"preprocessing\"\n        )\n\n        recordings.update_one(\n            {\"_id\": recording_id},\n            {\"$set\": {\"processed_path\": processed_path}}\n        )\n\n        # ---------------- TRANSCRIPTION ----------------\n        update_stage(recording_id, \"transcription\")\n        \n        logger.info(\n            \"Transcription\",\n            extra = {\"recording_id\": recording_id}\n        )\n\n        async with semaphore:\n            result = await retry_async(\n                transcribe,\n                processed_path,\n                retries=3,\n                recording_id=recording_id,\n                stage=\"transcription\"\n            )\n\n        # ---------------- POST PROCESS ----------------\n        update_stage(recording_id, \"postprocessing\")\n\n        logger.info(\"Postprocessing started\",\n                    extra = {\"recording_id\": recording_id})\n\n        diarized = print_transcript_by_speaker(result)\n        words = words_to_dict(result)\n        segments = segment_speakers(result)\n\n        save_transcription(recording_id, {\n            \"text\": result.get(\"text\"),\n            \"diarized_text\": diarized,\n            \"words\": words,\n            \"segments\": segments,\n            \"audio_duration_secs\": result.get(\"audio_duration_secs\")\n        })\n\n        # ---------------- RAG ----------------\n        
update_stage(recording_id, \"rag_segmentation\")\n\n        logger.info(\n            \"RAG Segmentation started\",\n            extra = {\"recording_id\": recording_id}\n        )\n\n        rag = build_rag_segments(result)\n        save_rag_segments(recording_id, rag)\n\n        # ---------------- DONE ----------------\n        mark_done(recording_id)\n        success = True\n\n        logger.info(\"Processing completed\", extra={\"recording_id\": recording_id})\n\n    except Exception as e:\n        logger.error(\"Processing failed\", extra={\n            \"recording_id\": recording_id,\n            \"error\": str(e)\n        })\n        mark_failed(recording_id, str(e))\n\n    finally:\n        if success:\n            logger.info(\n                \"Cleaning up files(success)\",\n                extra = {\"recording_id\": recording_id}\n            )\n\n            #delete the raw file\n            if raw_path and os.path.exists(raw_path):\n                try:\n                    os.remove(raw_path)\n                except Exception as e:\n                    logger.warning(\n                        \"Failed to delete raw file\",\n                        extra = {\"recording_id\": recording_id, \"error\": str(e)}\n                    )\n\n            #deleting the TEMP \n            if processed_path and os.path.exists(processed_path):\n                try:\n                    os.remove(processed_path)\n                except Exception as e:\n                    logger.warning(\n                        \"Failed to delete temp file\",\n                        extra = {\"recording_id\": recording_id, \"error\": str(e)}\n                    )\n\n        else:\n            logger.info(\n                \"Skipping cleanup(failure)- files preserverd for retry\",\n                extra = {\"recording_id\": recording_id}\n            )\n\n\n#this part is to make the worker pipeline into a queue\n\"\"\"def process_audio_sync(record):\n    import asyncio\n    return asyncio.run(process_audio(record))\n\nimport rq\nfrom redis import Redis\n\nredis_conn = Redis(host=\"localhost\")\"\"\""
  },
  {
    "path": "app/process/__init__.py",
    "content": ""
  },
  {
    "path": "app/process/segmentation.py",
    "content": "#segmentation \n\ndef segment_speakers(result):\n    words = result.get(\"words\", [])\n\n    segments = []\n    current = None\n\n    for word in words:\n        if word.get(\"type\") != \"word\":\n            continue\n\n        if current is None:\n            current = {\n                \"speaker_id\": word.get(\"speaker_id\"),\n                \"start\": word.get(\"start\"),\n                \"end\": word.get(\"end\"),\n                \"text\": [word.get(\"text\", \"\")]\n            }\n            continue\n\n        if word.get(\"speaker_id\") != current[\"speaker_id\"]:\n            current[\"text\"] = \" \".join(current[\"text\"])\n            segments.append(current)\n\n            current = {\n                \"speaker_id\": word.get(\"speaker_id\"),\n                \"start\": word.get(\"start\"),\n                \"end\": word.get(\"end\"),\n                \"text\": [word.get(\"text\", \"\")]\n            }\n        else:\n            current[\"text\"].append(word.get(\"text\", \"\"))\n            current[\"end\"] = word.get(\"end\")\n\n    if current:\n        current[\"text\"] = \" \".join(current[\"text\"])\n        segments.append(current)\n\n    return segments"
  },
  {
    "path": "app/process/utils.py",
    "content": "#words to dict \n\ndef words_to_dict(result):\n    return result.get(\"words\", [])\n\n\n#speaker diarization function\ndef print_transcript_by_speaker(result):\n    words = result.get(\"words\", [])\n\n    current_speaker = None\n    current_text = []\n    lines = []\n\n    for word in words:\n        if word.get(\"type\") != \"word\":\n            continue\n\n        if word.get(\"speaker_id\") != current_speaker:\n            if current_text:\n                lines.append(f\"{current_speaker}: {' '.join(current_text)}\")\n\n            current_speaker = word.get(\"speaker_id\")\n            current_text = [word.get(\"text\", \"\")]\n        else:\n            current_text.append(word.get(\"text\", \"\"))\n\n    if current_text:\n        lines.append(f\"{current_speaker}: {' '.join(current_text)}\")\n\n    return \"\\n\".join(lines)"
  },
  {
    "path": "app/stt/__init__.py",
    "content": ""
  },
  {
    "path": "app/stt/rag_segments.py",
    "content": "#rag segment funcion\ndef build_rag_segments(result, max_duration = 15.0):\n\n    words = result.get(\"words\", [])\n\n    segments = []\n    current = None\n\n    for word in words:\n\n        if word.get(\"type\") != \"word\":\n            continue\n\n        if current is None:\n            current = {\n                \"speaker_id\": word.get(\"speaker_id\"),\n                \"start\": word.get(\"start\"),\n                \"end\": word.get(\"end\"),\n                \"text\": [word.get(\"text\", \"\")]\n            }\n            continue\n\n        duration = word.get(\"end\", 0 ) - current['start']\n\n        #condition to split\n        speaker_changed = word.get(\"speaker_id\") != current['speaker_id']\n        too_long = duration > max_duration\n\n        if speaker_changed or too_long:\n            current['text'] = \" \".join(current['text'])\n\n            segments.append({\n                \"speaker_id\": current['speaker_id'],\n                \"start\": current['start'],\n                \"end\": current['end'],\n                \"text\": \" \".join(current['text'])\n            })\n\n            current = {\n                \"speaker_id\":word.get(\"speaker_id\"),\n                \"start\": word.get(\"start\"),\n                \"end\": word.get(\"end\"),\n                \"text\": [word.get(\"text\", \"\")]\n            }\n\n\n        else:\n            current['text'].append(word.get(\"text\", ''))\n            current['end'] = word.get(\"end\")\n\n    if current:\n        segments.append({\n            \"speaker_id\": current['speaker_id'],\n            \"start\": current['start'],\n            \"end\": current['end'],\n            \"text\": \" \".join(current['text'])\n        })\n\n        \n\n    return segments\n\n"
  },
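  {
    "path": "tests/test_segmentation.py",
    "content": "# Illustrative sanity check (not part of the original pipeline): exercises\n# segment_speakers() and build_rag_segments() against a synthetic `result`\n# shaped like the stored word-level output (type, speaker_id, start, end,\n# text). Run from the repo root with: python -m tests.test_segmentation\nfrom app.process.segmentation import segment_speakers\nfrom app.stt.rag_segments import build_rag_segments\n\n# two speakers, plus a spacing token that both functions should skip\nresult = {\n    \"words\": [\n        {\"type\": \"word\", \"speaker_id\": \"s1\", \"start\": 0.0, \"end\": 0.4, \"text\": \"hello\"},\n        {\"type\": \"spacing\", \"speaker_id\": \"s1\", \"start\": 0.4, \"end\": 0.5, \"text\": \" \"},\n        {\"type\": \"word\", \"speaker_id\": \"s1\", \"start\": 0.5, \"end\": 0.9, \"text\": \"there\"},\n        {\"type\": \"word\", \"speaker_id\": \"s2\", \"start\": 1.0, \"end\": 1.4, \"text\": \"hi\"},\n        {\"type\": \"word\", \"speaker_id\": \"s2\", \"start\": 20.0, \"end\": 20.4, \"text\": \"again\"},\n    ]\n}\n\n# one segment per speaker turn\nsegments = segment_speakers(result)\nassert [s[\"speaker_id\"] for s in segments] == [\"s1\", \"s2\"]\nassert segments[0][\"text\"] == \"hello there\"\n\n# build_rag_segments also splits when a chunk would exceed max_duration\n# (15s by default), so speaker s2 is broken into two chunks here\nrag = build_rag_segments(result)\nassert len(rag) == 3\nassert rag[1][\"text\"] == \"hi\" and rag[2][\"text\"] == \"again\"\n\nprint(\"segmentation checks passed\")\n"
  },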
  {
    "path": "app/stt/transcribe.py",
    "content": "from elevenlabs import ElevenLabs\nfrom app.config import ELEVENLABS_API_KEY\nimport asyncio\n\n\n\n\nclient = ElevenLabs(api_key = ELEVENLABS_API_KEY)\n\nasync def transcribe(audio_path):\n    loop = asyncio.get_event_loop()\n\n    def _sync_call():\n        with open(audio_path, \"rb\") as audio_file:\n            result = client.speech_to_text.convert(\n                file=audio_file,\n                model_id=\"scribe_v2\",\n                diarize=True,\n                language_code=\"fa\",\n                num_speakers=2,\n                timestamps_granularity=\"word\",\n                keyterms=  [\"آوانته\" , \"کلاریس\"]\n            )\n            return result.dict()  \n\n    return await loop.run_in_executor(None, _sync_call)\n\n\n\n\n\"\"\"result = transcribe(\"sample2.wav\")\njson_result = json.dumps(result.dict(), indent = 4, ensure_ascii = False)\nprint(json_result)\"\"\"\n\n"
  },
  {
    "path": "app/utils/logger.py",
    "content": "import logging\nimport os \nimport json\nfrom datetime import datetime\n\n# Configure logging\nLOG_DIR = \"logs\"\nos.makedirs(LOG_DIR, exist_ok = True)\n\nclass JsonFormatter(logging.Formatter):\n    def format(self, record):\n        log_record = {\n            \"timestamp\" : datetime.utcnow().isoformat(),\n            \"level\": record.levelname,\n            \"message\" : record.getMessage(),\n            \"module\": record.module,\n            \"function\": record.funcName\n        }\n\n        #attaching extra fields if exist\n        if hasattr(record, \"recording_id\"):\n            log_record['recording_id'] = str(record.recording_id)\n\n        if hasattr(record, \"stage\"):\n            log_record['stage'] = record.stage\n\n        if hasattr(record, \"error\"):\n            log_record['error'] = record.error\n\n        return json.dumps(log_record)\n    \n\ndef setup_logger():\n    logger = logging.getLogger(\"pipeline\")\n    logger.setLevel(logging.INFO)\n\n    #console handler\n    console_handler = logging.StreamHandler()\n    console_handler.setLevel(logging.INFO)\n    console_handler.setFormatter(\n        logging.Formatter(\"[%(levelname)s] %(message)s\")\n    )\n\n    #file handler\n    file_handler = logging.FileHandler(f\"{LOG_DIR}/app.log\")\n    file_handler.setLevel(logging.INFO)\n    file_handler.setFormatter(\n        logging.Formatter(\n            \"%(asctime)s | %(levelname)s | %(message)s\"\n        )\n    )\n\n    #json handler\n    json_handler = logging.FileHandler(f\"{LOG_DIR}/app.json\")\n    json_handler.setLevel(logging.INFO)\n    json_handler.setFormatter(JsonFormatter())\n\n    logger.addHandler(console_handler)\n    logger.addHandler(file_handler)\n    logger.addHandler(json_handler)\n\n    return logger\n\nlogger = setup_logger()"
  },
  {
    "path": "app/utils/retry.py",
    "content": "import asyncio\nfrom app.utils.logger import logger\n\nasync def retry_async(\n        func,\n        *args,\n        retries=3,\n        base_delay =2,\n        factor = 2,\n        recording_id = None,\n        stage = None,\n):\n    \n    #generating retry with exponential backoff\n\n    for attempt in range(retries):\n        try:\n            return await func(*args)\n        \n        except Exception as e:\n\n            if attempt ==retries -1:\n                logger.error(\n                    \"Max retries reached\",\n                    extra={\n                        \"recording_id\": recording_id,\n                        \"stage\": stage,\n                        \"error\": str(e)\n                    }\n                )\n                raise\n        \n            delay = base_delay * (factor ** attempt)\n\n            logger.warning(\n            f\"Retrying in {delay}s (attempt {attempt+1})\",\n            extra = {\n                \"recording_id\": recording_id,\n                \"stage\": stage,\n                \"error\": str(e)\n                }\n            )\n\n            await asyncio.sleep(delay)"
  },
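  {
    "path": "examples/retry_demo.py",
    "content": "# Illustrative sketch (not part of the original pipeline): shows how\n# retry_async wraps a flaky coroutine. With base_delay=2 and factor=2 the\n# waits between attempts are 2s, then 4s.\nimport asyncio\n\nfrom app.utils.retry import retry_async\n\nattempts = {\"count\": 0}\n\n\nasync def flaky():\n    # fails twice, then succeeds on the third attempt\n    attempts[\"count\"] += 1\n    if attempts[\"count\"] < 3:\n        raise RuntimeError(f\"transient failure #{attempts['count']}\")\n    return \"ok\"\n\n\nasync def main():\n    result = await retry_async(flaky, retries=3, stage=\"demo\")\n    print(result)  # \"ok\" after two logged retries\n\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n"
  },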
  {
    "path": "app/utils/stage_manager.py",
    "content": "from app.database.mongo_db import recordings\n\n\ndef is_stage_done(record, stage):\n    return record.get(\"stages\", {}).get(stage) == \"done\"\n\n\ndef mark_stage_done(recording_id, stage):\n    recordings.update_one(\n        {\"_id\": recording_id},\n        {\n            \"$set\": {f\"stages.{stage}\": \"done\"}\n        }\n    )\n\n\ndef mark_stage_failed(recording_id, stage, error):\n    recordings.update_one(\n        {\"_id\": recording_id},\n        {\n            \"$set\": {\n                f\"stages.{stage}\": \"failed\",\n                \"error\": error\n            },\n            \"$inc\": {\"retry_count\": 1}\n        }\n    )"
  },
  {
    "path": "app/utils/state_validator.py",
    "content": "import os\nfrom datetime import datetime, timedelta\n\nfrom app.database.mongo_db import recordings\nfrom app.utils.logger import logger\nfrom app.database.operation import mark_failed\n\n\nPOST_PREPROCESSING_STAGES = {\n    \"transcription\",\n    \"postprocessing\",\n    \"rag_segmentation\",\n    \"done\"\n}\n\n# how long a job can stay \"processing\" before considered stuck\nSTUCK_TIMEOUT_MINUTES = 10\n\n\ndef validate_record(record):\n\n    recording_id = record[\"_id\"]\n\n    raw_path = record.get(\"raw_path\")\n    processed_path = record.get(\"processed_path\")\n\n    stage = record.get(\"current_stage\", \"ingestion\")\n    status = record.get(\"status\", \"pending\")\n\n    updated_at = record.get(\"updated_at\")\n\n    raw_exists = raw_path and os.path.exists(raw_path)\n    processed_exists = processed_path and os.path.exists(processed_path)\n\n\n\n    # ----------------------------------------------------\n    # CASE 1: RAW FILE EXISTS → ALWAYS RECOVERABLE\n    # ----------------------------------------------------\n\n    if raw_exists:\n\n        # auto-recover failed jobs\n        if status == \"failed\":\n\n            logger.warning(\n                \"Recovering failed job\",\n                extra={\"recording_id\": recording_id}\n            )\n\n            recordings.update_one(\n                {\"_id\": recording_id},\n                {\n                    \"$set\": {\n                        \"status\": \"pending\"\n                    }\n                }\n            )\n\n        return\n\n    # ----------------------------------------------------\n    # CASE 2: BEFORE PREPROCESSING → RAW SHOULD EXIST\n    # ----------------------------------------------------\n\n    if stage in [\"ingestion\", \"preprocessing\"]:\n\n        logger.warning(\n            \"Raw file missing before preprocessing finished\",\n            extra={\"recording_id\": recording_id}\n        )\n\n        return\n\n    # ----------------------------------------------------\n    # CASE 3: AFTER PREPROCESSING → PROCESSED FILE REQUIRED\n    # ----------------------------------------------------\n\n    if stage in POST_PREPROCESSING_STAGES:\n\n        if processed_exists:\n            return\n\n        logger.error(\n            \"Both raw and processed files missing\",\n            extra={\n                \"recording_id\": recording_id,\n                \"stage\": stage\n            }\n        )\n\n        mark_failed(recording_id, \"Audio files missing\")\n        return\n\n\ndef recover_stuck_jobs():\n\n    now = datetime.utcnow()\n\n    stuck_threshold = now - timedelta(minutes=STUCK_TIMEOUT_MINUTES)\n\n    stuck_jobs = recordings.find({\n        \"status\": \"processing\",\n        \"updated_at\": {\"$lt\": stuck_threshold}\n    })\n\n    for record in stuck_jobs:\n\n        recording_id = record[\"_id\"]\n\n        logger.warning(\n            \"Recovering stuck job\",\n            extra={\"recording_id\": recording_id}\n        )\n\n        recordings.update_one(\n            {\"_id\": recording_id},\n            {\n                \"$set\": {\n                    \"status\": \"pending\"\n                }\n            }\n        )\n\n\ndef run_state_validator():\n\n    logger.info(\"State validator started\")\n\n    # validate files\n    records = recordings.find({\n        \"status\": {\"$in\": [\"pending\", \"processing\", \"failed\"]}\n    })\n\n    for record in records:\n        validate_record(record)\n    \n\n\n    # recover stuck jobs\n    
recover_stuck_jobs()\n\n    logger.info(\"State validator finished\")\n"
  },
  {
    "path": "requirement.txt",
    "content": "librosa\nnoisereduce\nelevenlabs\nsoundfile\nasyncio\nconfurrent.futures\npymongo\ndatetime\nhazms\n"
  }
]