Repository: Arefe-Masnouie/audio_transcript
Branch: master
Commit: 49af87240431
Files: 33
Total size: 33.6 KB

Directory structure:
gitextract_lm6zn_63/
├── .gitignore
├── app/
│   ├── __init__.py
│   ├── audio/
│   │   └── preprocess.py
│   ├── config.py
│   ├── dashboard/
│   │   ├── __init__.py
│   │   └── dashboard.py
│   ├── database/
│   │   ├── __init__.py
│   │   ├── mongo_db.py
│   │   ├── operation.py
│   │   └── schemas.py
│   ├── ingestion/
│   │   ├── __init__.py
│   │   ├── deduplicator.py
│   │   ├── fetcher.py
│   │   ├── hashing.py
│   │   ├── issabelle_client.py
│   │   ├── load_audio.py
│   │   └── runner.py
│   ├── main.py
│   ├── maintanance/
│   │   └── cleanup.py
│   ├── pipeline/
│   │   ├── __init__.py
│   │   ├── orchestrator.py
│   │   └── worker.py
│   ├── process/
│   │   ├── __init__.py
│   │   ├── segmentation.py
│   │   └── utils.py
│   ├── stt/
│   │   ├── __init__.py
│   │   ├── rag_segments.py
│   │   └── transcribe.py
│   └── utils/
│       ├── logger.py
│       ├── retry.py
│       ├── stage_manager.py
│       └── state_validator.py
└── requirement.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
/audio_data.transcription.json
/db.ipynb
/elevenlabs.ipynb
/main.ipynb
/venv
/stt/__init__.py
/data
/logs
/.env
/app/nlp/pos_tagger.model

================================================
FILE: app/__init__.py
================================================

================================================
FILE: app/audio/preprocess.py
================================================
import os

import librosa
import noisereduce as nr
import soundfile as sf


def preprocess_audio(input_path, output_path):
    try:
        # load the audio at 16 kHz, downmixed to mono
        y, sr = librosa.load(input_path, sr=16000, mono=True)

        # reduce background noise
        y = nr.reduce_noise(y=y, sr=sr, prop_decrease=0.8)

        # trim silent stretches (currently disabled)
        # y, _ = librosa.effects.trim(y, top_db=20)

        # peak-normalize the signal
        y = librosa.util.normalize(y)

        # save the output
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        sf.write(output_path, y, sr)

        return output_path
    except Exception as e:
        raise RuntimeError(f"Preprocessing failed: {str(e)}") from e
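
A minimal usage sketch for preprocess_audio (the paths are hypothetical; assumes librosa, noisereduce, and soundfile are installed):

# sketch: clean one recording and print where the result landed
from app.audio.preprocess import preprocess_audio

out = preprocess_audio("data/raw/call_001.wav", "data/temp/call_001.wav")
print(f"Preprocessed audio written to {out}")
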
================================================
FILE: app/config.py
================================================
import os

from dotenv import load_dotenv

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
load_dotenv()

MONGO_URL = os.getenv("MONGO_URL", "mongodb://admin:secret@localhost:27017")
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")

# PATHS
RECORDINGS_PATH = os.path.join(BASE_DIR, "data", "raw")  # can later point at the server URL
TEMP_DIR = os.path.join(BASE_DIR, "data", "temp")

# used for error handling and API retry limits
MAX_RETRIES = 3

# SERVER CONFIG
ISSABEL_HOST = os.getenv("ISSABEL_HOST")
ISSABEL_PORT = 22
ISSABEL_USER = os.getenv("ISSABEL_USER")
ISSABEL_PASSWORD = os.getenv("ISSABEL_PASSWORD")
ISSABEL_REMOTE_PATH = os.getenv("ISSABEL_REMOTE_PATH")
LOCAL_DOWNLOAD_PATH = os.path.join(BASE_DIR, "data", "raw")

================================================
FILE: app/dashboard/__init__.py
================================================

================================================
FILE: app/dashboard/dashboard.py
================================================
import json
from collections import Counter

import pandas as pd
import streamlit as st
from pymongo import MongoClient

from app.config import MONGO_URL

client = MongoClient(MONGO_URL)
db = client.audio_data
recordings = db['recordings']
transcription = db['transcription']
nlp = db['nlp']

st.set_page_config(layout="wide")
st.title("🎙️ STT Pipeline Dashboard")

# =========================
# 1. PIPELINE OVERVIEW
# =========================
st.header("📊 Pipeline Overview")

records = list(recordings.find())

if records:
    df = pd.DataFrame(records)
    status_counts = df["status"].value_counts().to_dict()

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Files", len(df))
    col2.metric("Done", status_counts.get("done", 0))
    col3.metric("Failed", status_counts.get("failed", 0))
    col4.metric("Processing", status_counts.get("processing", 0))

    success_rate = (
        status_counts.get("done", 0) / len(df)
        if len(df) > 0 else 0
    )
    st.progress(success_rate)
    st.write(f"✅ Success Rate: {round(success_rate * 100, 2)}%")
else:
    st.warning("No data available")

# =========================
# 2. CURRENT PIPELINE STATE
# =========================
st.header("⚙️ Current Pipeline State")

state_counts = Counter([r.get("current_stage", "unknown") for r in records])
st.write(state_counts)

# =========================
# REAL-TIME LOG SYSTEM
# =========================
st.header("📡 Logs Explorer")

log_file = "logs/app.json"


def load_logs():
    logs = []
    try:
        with open(log_file, "r") as f:
            for line in f:
                logs.append(json.loads(line))
    except (OSError, json.JSONDecodeError):
        pass
    return logs


logs = load_logs()

if logs:
    df_logs = pd.DataFrame(logs)

    # FILTERS
    col1, col2, col3 = st.columns(3)
    level_filter = col1.selectbox("Level", ["ALL"] + sorted(df_logs["level"].unique().tolist()))
    stage_options = (
        sorted(df_logs["stage"].dropna().unique().tolist())
        if "stage" in df_logs.columns else []
    )
    stage_filter = col2.selectbox("Stage", ["ALL"] + stage_options)
    rec_filter = col3.text_input("Recording ID")

    filtered = df_logs.copy()
    if level_filter != "ALL":
        filtered = filtered[filtered["level"] == level_filter]
    if stage_filter != "ALL":
        filtered = filtered[filtered["stage"] == stage_filter]
    if rec_filter and "recording_id" in filtered.columns:
        filtered = filtered[filtered["recording_id"].astype(str).str.contains(rec_filter)]

    st.dataframe(filtered.sort_values("timestamp", ascending=False))
else:
    st.info("No logs yet")

# =========================
# 3. RECENT ACTIVITY
# =========================
st.header("🕒 Recent Activity")

recent = recordings.find().sort("updated_at", -1).limit(10)
for r in recent:
    st.write({
        "file": r.get("filename"),
        "status": r.get("status"),
        "stage": r.get("current_stage"),
        "retry": r.get("retry_count"),
    })

# =========================
# 4. ERROR MONITORING
# =========================
st.header("❌ Recent Errors")

errors = recordings.find({
    "status": {"$in": ["failed", "permanently_failed"]},
    "error": {"$ne": None}
}).sort("updated_at", -1).limit(10)

for e in errors:
    st.error({
        "file": e.get("filename"),
        "error": e.get("error"),
        "retry": e.get("retry_count")
    })

# =========================
# RETRY MONITORING
# =========================
st.header("📈 Retry Monitoring")

retry_data = [
    {
        "file": r.get("filename"),
        "retry": r.get("retry_count"),
        "status": r.get("status")
    }
    for r in recordings.find()
]
st.dataframe(retry_data)

# =========================
# 5. TRANSCRIPTION INSIGHTS
# =========================
st.header("📝 Transcription Insights")

total_transcripts = transcription.count_documents({})
st.write(f"Total transcriptions: {total_transcripts}")

# =========================
# 6. NLP INSIGHTS (optional)
# =========================
st.header("🧠 NLP Insights")

all_tokens = []
for doc in nlp.find():
    all_tokens.extend(doc.get("filtered_tokens", []))

if all_tokens:
    top_keywords = Counter(all_tokens).most_common(10)
    st.write("Top Keywords:", top_keywords)
else:
    st.info("No NLP data yet")

# =========================
# FILE EXPLORER
# =========================
st.header("📂 File Explorer")

records = list(recordings.find())
file_map = {
    f"{r.get('filename')} | {str(r.get('_id'))[:6]}": r
    for r in records
}

selected_key = st.selectbox("Select a file", list(file_map.keys()))

if selected_key:
    selected = file_map[selected_key]
    rec_id = selected["_id"]

    st.subheader("📄 Recording Metadata")
    st.json(selected)

    # =====================
    # TRANSCRIPTION VIEW
    # =====================
    trans_doc = transcription.find_one({"recording_id": rec_id})

    if trans_doc:
        st.subheader("📝 Transcription")
        st.text_area("Full Text", trans_doc.get("text", ""), height=150)

        st.subheader("🗣️ Diarized Text")
        st.text_area("Speakers", trans_doc.get("diarized_text", ""), height=150)

        # =====================
        # WORDS TABLE
        # =====================
        st.subheader("🔍 Word-level Data")
        words = trans_doc.get("words", [])
        if words:
            st.dataframe(words[:100])  # limit for performance
    else:
        st.warning("No transcription found")
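
Each line of logs/app.json is a single JSON object produced by JsonFormatter in app/utils/logger.py, which is what the Logs Explorer filters index into. An illustrative line (all values made up):

{"timestamp": "2025-01-01T12:00:00.000000", "level": "INFO", "message": "Preprocessing started", "module": "worker", "function": "process_audio", "recording_id": "65a1b2c3d4e5f6a7b8c9d0e1"}
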
================================================
FILE: app/database/__init__.py
================================================

================================================
FILE: app/database/mongo_db.py
================================================
from pymongo import MongoClient

from app.config import MONGO_URL

# connect to MongoDB
client = MongoClient(MONGO_URL)

# select the database and collections
db = client.audio_data
recordings = db['recordings']
transcription = db['transcription']
nlp = db['nlp']
rag_segments = db['rag_segments']
log = db['log']
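
is_duplicate (in app/ingestion/deduplicator.py below) runs a find_one on file_hash for every candidate file; a unique index keeps that lookup fast and guards against double inserts. A one-off sketch, assuming the collections above:

# sketch: back the hash-based deduplication with a unique index
from app.database.mongo_db import recordings

recordings.create_index("file_hash", unique=True)
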
}, "$inc": {"retry_count": 1} } ) ================================================ FILE: app/database/schemas.py ================================================ from datetime import datetime import os #recordings collection def build_recording_doc(path, file_hash): return{ "filename": os.path.basename(path), "raw_path": path, "processed_path": None, "file_hash": file_hash, "file_size": os.path.getsize(path), "last_modified": os.path.getmtime(path), #pipeline status "status" : "pending", "current_stage": "ingestion", "stages":{ "preprocessing": "pending", "transcription": "pending", "postprocessing": "pending", "rag_segmentation" : "pending" }, "retry_count": 0, "error": None, "created_at": datetime.utcnow(), "updated_at": datetime.utcnow() } #transcription segments def build_transcription_doc(recording_id, result): return { "recording_id": recording_id, "text": result.get("text"), "diarized_text": result.get("diarized_text"), "speaker_segments": result.get("segments"), "words": result.get("words"), "audio_duration_secs": result.get("audio_duration_secs"), "created_at": datetime.utcnow() } #rag_Segment collection def build_rag_segment(recording_id, segments): return{ "recording_id": recording_id, "segments": segments, "created_at": datetime.utcnow() } ================================================ FILE: app/ingestion/__init__.py ================================================ ================================================ FILE: app/ingestion/deduplicator.py ================================================ from app.database.mongo_db import recordings from app.utils.logger import logger def is_duplicate(file_hash): return recordings.find_one({"file_hash": file_hash}) is not None ================================================ FILE: app/ingestion/fetcher.py ================================================ import os from app.ingestion.issabelle_client import create_sftp_client from app.config import ISSABEL_REMOTE_PATH, LOCAL_DOWNLOAD_PATH def fetch_new_files(): sftp, transport = create_sftp_client() os.makedirs(LOCAL_DOWNLOAD_PATH, exist_ok=True) downloaded = [] try: for file in sftp.listdir(ISSABEL_REMOTE_PATH): if not file.endswith(".wav"): continue remote_path = f"{ISSABEL_REMOTE_PATH}/{file}" local_path = os.path.join(LOCAL_DOWNLOAD_PATH, file) if os.path.exists(local_path): continue sftp.get(remote_path, local_path) downloaded.append(local_path) finally: sftp.close() transport.close() return downloaded ================================================ FILE: app/ingestion/hashing.py ================================================ import hashlib from app.utils.logger import logger def compute_file_hash(file_path, chunk_size = 8192): sha256 = hashlib.sha256() with open(file_path, "rb") as f: while chunk := f.read(chunk_size): sha256.update(chunk) return sha256.hexdigest() ================================================ FILE: app/ingestion/issabelle_client.py ================================================ import paramiko import os from app.config import ( ISSABEL_HOST, ISSABEL_PORT, ISSABEL_USER, ISSABEL_PASSWORD ) def create_sftp_client(): transport = paramiko.Transport((ISSABEL_HOST, ISSABEL_PORT)) transport.connect(username= ISSABEL_USER, password = ISSABEL_PASSWORD) sftp = paramiko.SFTPClient.from_transport(transport) return sftp, transport ================================================ FILE: app/ingestion/load_audio.py ================================================ import os from app.config import RECORDINGS_PATH from app.utils.logger import logger SUPPORTED_FORMATS = 
(".wav", ".mp3", ".flac", ".m4a") def get_audio_files(): audio_files = [] base_path = os.path.abspath(RECORDINGS_PATH) for root, _, files in os.walk(base_path): for file in files: if file.lower().endswith(SUPPORTED_FORMATS): full_path = os.path.abspath(os.path.join(root, file)) audio_files.append(full_path) #sorting the files audio_files.sort() return audio_files """ #previous code for file in os.listdir(RECORDINGS_PATH): if file.endswith(".wav") or file.endswith(".mp3"): audio_files.append(os.path.join(RECORDINGS_PATH, file)) return audio_files """ ================================================ FILE: app/ingestion/runner.py ================================================ from app.database.mongo_db import recordings from app.database.operation import create_recordings from app.ingestion.hashing import compute_file_hash from app.ingestion.deduplicator import is_duplicate from app.ingestion.load_audio import get_audio_files from app.utils.logger import logger import os from datetime import datetime import hashlib def ingest_new_files(): logger.info("Starting ingestion") audio_files = get_audio_files() new_count =0 skipped_count = 0 for path in audio_files: try: file_hash = compute_file_hash(path) if is_duplicate(file_hash): skipped_count += 1 logger.warning(f"Duplicate file skipped: {path}") continue create_recordings(path, file_hash) new_count +=1 except Exception as e: logger.error(f"Ingestion failed for {path}", extra = {"error": str(e)}) logger.info(f"Ingestion done: {new_count} new, {skipped_count} skipped") ================================================ FILE: app/main.py ================================================ import asyncio from app.pipeline.orchestrator import run_pipeline from app.ingestion.runner import ingest_new_files from app.utils.state_validator import run_state_validator if __name__ == "__main__": #validate states before starting the pipeline run_state_validator() #detect and ingest new files ingest_new_files() #processing the files asyncio.run(run_pipeline()) ================================================ FILE: app/maintanance/cleanup.py ================================================ import os from app.database.mongo_db import recordings from app.utils.logger import logger SAFE_TO_DELETE_STATUS = {"done", "permanently_failed"} def cleanup_files(dry_run: bool = True): logger.info("Starting cleanup job") records = recordings.find({ "status": {"$in": list(SAFE_TO_DELETE_STATUS)} }) deleted = 0 skipped = 0 for rec in records: raw_path = rec.get("raw_path") processed_path = rec.get("processed_path") recording_id = rec["_id"] logger.info( "checking record", extra={ "recording_id": str(rec["_id"]), "status": rec.get("status"), "raw_path": rec.get("raw_path") } ) for path in [raw_path, processed_path]: if not path: continue if not os.path.exists(path): skipped +=1 continue if dry_run: logger.info( "DRY RUN - would delete file", extra ={ "recording_id": str(recording_id), "path": path } ) continue try: os.remove(path) deleted += 1 logger.info( "File deleted safely", extra = { "recording_id": str(recording_id), "path": path } ) except Exception as e: logger.error( "Failed to delete file", extra = { "recording_id": str(recording_id), "path": path, "error": str(e) } ) logger.info( "Cleanup finished", extra ={ "deleted":deleted, "skipped": skipped } ) #to run seperatly cleanup_files() ================================================ FILE: app/pipeline/__init__.py ================================================ ================================================ FILE: 
================================================
FILE: app/ingestion/issabelle_client.py
================================================
import paramiko

from app.config import (
    ISSABEL_HOST,
    ISSABEL_PORT,
    ISSABEL_USER,
    ISSABEL_PASSWORD
)


def create_sftp_client():
    transport = paramiko.Transport((ISSABEL_HOST, ISSABEL_PORT))
    transport.connect(username=ISSABEL_USER, password=ISSABEL_PASSWORD)
    sftp = paramiko.SFTPClient.from_transport(transport)
    return sftp, transport

================================================
FILE: app/ingestion/load_audio.py
================================================
import os

from app.config import RECORDINGS_PATH

SUPPORTED_FORMATS = (".wav", ".mp3", ".flac", ".m4a")


def get_audio_files():
    audio_files = []
    base_path = os.path.abspath(RECORDINGS_PATH)

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(SUPPORTED_FORMATS):
                full_path = os.path.abspath(os.path.join(root, file))
                audio_files.append(full_path)

    # sort for a deterministic processing order
    audio_files.sort()
    return audio_files

================================================
FILE: app/ingestion/runner.py
================================================
from app.database.operation import create_recordings
from app.ingestion.deduplicator import is_duplicate
from app.ingestion.hashing import compute_file_hash
from app.ingestion.load_audio import get_audio_files
from app.utils.logger import logger


def ingest_new_files():
    logger.info("Starting ingestion")
    audio_files = get_audio_files()
    new_count = 0
    skipped_count = 0

    for path in audio_files:
        try:
            file_hash = compute_file_hash(path)
            if is_duplicate(file_hash):
                skipped_count += 1
                logger.warning(f"Duplicate file skipped: {path}")
                continue
            create_recordings(path, file_hash)
            new_count += 1
        except Exception as e:
            logger.error(f"Ingestion failed for {path}", extra={"error": str(e)})

    logger.info(f"Ingestion done: {new_count} new, {skipped_count} skipped")

================================================
FILE: app/main.py
================================================
import asyncio

from app.ingestion.runner import ingest_new_files
from app.pipeline.orchestrator import run_pipeline
from app.utils.state_validator import run_state_validator

if __name__ == "__main__":
    # validate states before starting the pipeline
    run_state_validator()

    # detect and ingest new files
    ingest_new_files()

    # process the files
    asyncio.run(run_pipeline())

================================================
FILE: app/maintanance/cleanup.py
================================================
import os

from app.database.mongo_db import recordings
from app.utils.logger import logger

SAFE_TO_DELETE_STATUS = {"done", "permanently_failed"}


def cleanup_files(dry_run: bool = True):
    logger.info("Starting cleanup job")

    records = recordings.find({
        "status": {"$in": list(SAFE_TO_DELETE_STATUS)}
    })

    deleted = 0
    skipped = 0

    for rec in records:
        raw_path = rec.get("raw_path")
        processed_path = rec.get("processed_path")
        recording_id = rec["_id"]

        logger.info(
            "checking record",
            extra={
                "recording_id": str(recording_id),
                "status": rec.get("status"),
                "raw_path": raw_path
            }
        )

        for path in [raw_path, processed_path]:
            if not path:
                continue
            if not os.path.exists(path):
                skipped += 1
                continue

            if dry_run:
                logger.info(
                    "DRY RUN - would delete file",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path
                    }
                )
                continue

            try:
                os.remove(path)
                deleted += 1
                logger.info(
                    "File deleted safely",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path
                    }
                )
            except Exception as e:
                logger.error(
                    "Failed to delete file",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path,
                        "error": str(e)
                    }
                )

    logger.info(
        "Cleanup finished",
        extra={
            "deleted": deleted,
            "skipped": skipped
        }
    )


# run separately as a standalone job
if __name__ == "__main__":
    cleanup_files()

================================================
FILE: app/pipeline/__init__.py
================================================

================================================
FILE: app/pipeline/orchestrator.py
================================================
import asyncio

from app.config import MAX_RETRIES
from app.database.mongo_db import recordings
from app.pipeline.worker import process_audio
from app.utils.logger import logger


async def run_pipeline():
    logger.info("Pipeline started")

    # fetch the pending records from the database
    cursor = recordings.find({
        "status": {"$in": ["pending", "failed"]},
        "retry_count": {"$lt": MAX_RETRIES}
    })

    tasks = [process_audio(r) for r in cursor]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    for r in results:
        if isinstance(r, Exception):
            logger.error(f"Unhandled exception in pipeline: {r}")

    logger.info("Pipeline finished")


"""
Planned alternative: push jobs onto an RQ queue instead of running in-process.

from app.queue import queue
from app.pipeline.worker import process_audio_sync

def run_pipeline():
    logger.info("Pipeline started")
    cursor = recordings.find({
        "status": {"$in": ["pending", "failed"]},
        "retry_count": {"$lt": MAX_RETRIES}
    })
    for record in cursor:
        queue.enqueue(
            process_audio_sync,
            record,
            job_timeout=600  # 10 min
        )
    logger.info("Jobs enqueued")
"""
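
run_pipeline launches one task per pending record and relies on the semaphore and process pool inside the worker to throttle real work. If the backlog grows large, a pipeline-level cap is one option; a sketch (run_pipeline_batched and the batch size of 10 are assumptions, reusing the imports at the top of app/pipeline/orchestrator.py; records is a list of pending record dicts):

# sketch: process pending records in bounded batches instead of all at once
import asyncio

async def run_pipeline_batched(records, batch_size=10):
    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]
        results = await asyncio.gather(
            *(process_audio(r) for r in batch),
            return_exceptions=True
        )
        for r in results:
            if isinstance(r, Exception):
                logger.error(f"Unhandled exception in pipeline: {r}")
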
logger.info("Postprocessing started", extra = {"recording_id": recording_id}) diarized = print_transcript_by_speaker(result) words = words_to_dict(result) segments = segment_speakers(result) save_transcription(recording_id, { "text": result.get("text"), "diarized_text": diarized, "words": words, "segments": segments, "audio_duration_secs": result.get("audio_duration_secs") }) # ---------------- RAG ---------------- update_stage(recording_id, "rag_segmentation") logger.info( "RAG Segmentation started", extra = {"recording_id": recording_id} ) rag = build_rag_segments(result) save_rag_segments(recording_id, rag) # ---------------- DONE ---------------- mark_done(recording_id) success = True logger.info("Processing completed", extra={"recording_id": recording_id}) except Exception as e: logger.error("Processing failed", extra={ "recording_id": recording_id, "error": str(e) }) mark_failed(recording_id, str(e)) finally: if success: logger.info( "Cleaning up files(success)", extra = {"recording_id": recording_id} ) #delete the raw file if raw_path and os.path.exists(raw_path): try: os.remove(raw_path) except Exception as e: logger.warning( "Failed to delete raw file", extra = {"recording_id": recording_id, "error": str(e)} ) #deleting the TEMP if processed_path and os.path.exists(processed_path): try: os.remove(processed_path) except Exception as e: logger.warning( "Failed to delete temp file", extra = {"recording_id": recording_id, "error": str(e)} ) else: logger.info( "Skipping cleanup(failure)- files preserverd for retry", extra = {"recording_id": recording_id} ) #this part is to make the worker pipeline into a queue """def process_audio_sync(record): import asyncio return asyncio.run(process_audio(record)) import rq from redis import Redis redis_conn = Redis(host="localhost")""" ================================================ FILE: app/process/__init__.py ================================================ ================================================ FILE: app/process/segmentation.py ================================================ #segmentation def segment_speakers(result): words = result.get("words", []) segments = [] current = None for word in words: if word.get("type") != "word": continue if current is None: current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } continue if word.get("speaker_id") != current["speaker_id"]: current["text"] = " ".join(current["text"]) segments.append(current) current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } else: current["text"].append(word.get("text", "")) current["end"] = word.get("end") if current: current["text"] = " ".join(current["text"]) segments.append(current) return segments ================================================ FILE: app/process/utils.py ================================================ #words to dict def words_to_dict(result): return result.get("words", []) #speaker diarization function def print_transcript_by_speaker(result): words = result.get("words", []) current_speaker = None current_text = [] lines = [] for word in words: if word.get("type") != "word": continue if word.get("speaker_id") != current_speaker: if current_text: lines.append(f"{current_speaker}: {' '.join(current_text)}") current_speaker = word.get("speaker_id") current_text = [word.get("text", "")] else: current_text.append(word.get("text", "")) if current_text: lines.append(f"{current_speaker}: {' 
'.join(current_text)}") return "\n".join(lines) ================================================ FILE: app/stt/__init__.py ================================================ ================================================ FILE: app/stt/rag_segments.py ================================================ #rag segment funcion def build_rag_segments(result, max_duration = 15.0): words = result.get("words", []) segments = [] current = None for word in words: if word.get("type") != "word": continue if current is None: current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } continue duration = word.get("end", 0 ) - current['start'] #condition to split speaker_changed = word.get("speaker_id") != current['speaker_id'] too_long = duration > max_duration if speaker_changed or too_long: current['text'] = " ".join(current['text']) segments.append({ "speaker_id": current['speaker_id'], "start": current['start'], "end": current['end'], "text": " ".join(current['text']) }) current = { "speaker_id":word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } else: current['text'].append(word.get("text", '')) current['end'] = word.get("end") if current: segments.append({ "speaker_id": current['speaker_id'], "start": current['start'], "end": current['end'], "text": " ".join(current['text']) }) return segments ================================================ FILE: app/stt/transcribe.py ================================================ from elevenlabs import ElevenLabs from app.config import ELEVENLABS_API_KEY import asyncio client = ElevenLabs(api_key = ELEVENLABS_API_KEY) async def transcribe(audio_path): loop = asyncio.get_event_loop() def _sync_call(): with open(audio_path, "rb") as audio_file: result = client.speech_to_text.convert( file=audio_file, model_id="scribe_v2", diarize=True, language_code="fa", num_speakers=2, timestamps_granularity="word", keyterms= ["آوانته" , "کلاریس"] ) return result.dict() return await loop.run_in_executor(None, _sync_call) """result = transcribe("sample2.wav") json_result = json.dumps(result.dict(), indent = 4, ensure_ascii = False) print(json_result)""" ================================================ FILE: app/utils/logger.py ================================================ import logging import os import json from datetime import datetime # Configure logging LOG_DIR = "logs" os.makedirs(LOG_DIR, exist_ok = True) class JsonFormatter(logging.Formatter): def format(self, record): log_record = { "timestamp" : datetime.utcnow().isoformat(), "level": record.levelname, "message" : record.getMessage(), "module": record.module, "function": record.funcName } #attaching extra fields if exist if hasattr(record, "recording_id"): log_record['recording_id'] = str(record.recording_id) if hasattr(record, "stage"): log_record['stage'] = record.stage if hasattr(record, "error"): log_record['error'] = record.error return json.dumps(log_record) def setup_logger(): logger = logging.getLogger("pipeline") logger.setLevel(logging.INFO) #console handler console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter( logging.Formatter("[%(levelname)s] %(message)s") ) #file handler file_handler = logging.FileHandler(f"{LOG_DIR}/app.log") file_handler.setLevel(logging.INFO) file_handler.setFormatter( logging.Formatter( "%(asctime)s | %(levelname)s | %(message)s" ) ) #json handler json_handler = 
logging.FileHandler(f"{LOG_DIR}/app.json") json_handler.setLevel(logging.INFO) json_handler.setFormatter(JsonFormatter()) logger.addHandler(console_handler) logger.addHandler(file_handler) logger.addHandler(json_handler) return logger logger = setup_logger() ================================================ FILE: app/utils/retry.py ================================================ import asyncio from app.utils.logger import logger async def retry_async( func, *args, retries=3, base_delay =2, factor = 2, recording_id = None, stage = None, ): #generating retry with exponential backoff for attempt in range(retries): try: return await func(*args) except Exception as e: if attempt ==retries -1: logger.error( "Max retries reached", extra={ "recording_id": recording_id, "stage": stage, "error": str(e) } ) raise delay = base_delay * (factor ** attempt) logger.warning( f"Retrying in {delay}s (attempt {attempt+1})", extra = { "recording_id": recording_id, "stage": stage, "error": str(e) } ) await asyncio.sleep(delay) ================================================ FILE: app/utils/stage_manager.py ================================================ from app.database.mongo_db import recordings def is_stage_done(record, stage): return record.get("stages", {}).get(stage) == "done" def mark_stage_done(recording_id, stage): recordings.update_one( {"_id": recording_id}, { "$set": {f"stages.{stage}": "done"} } ) def mark_stage_failed(recording_id, stage, error): recordings.update_one( {"_id": recording_id}, { "$set": { f"stages.{stage}": "failed", "error": error }, "$inc": {"retry_count": 1} } ) ================================================ FILE: app/utils/state_validator.py ================================================ import os from datetime import datetime, timedelta from app.database.mongo_db import recordings from app.utils.logger import logger from app.database.operation import mark_failed POST_PREPROCESSING_STAGES = { "transcription", "postprocessing", "rag_segmentation", "done" } # how long a job can stay "processing" before considered stuck STUCK_TIMEOUT_MINUTES = 10 def validate_record(record): recording_id = record["_id"] raw_path = record.get("raw_path") processed_path = record.get("processed_path") stage = record.get("current_stage", "ingestion") status = record.get("status", "pending") updated_at = record.get("updated_at") raw_exists = raw_path and os.path.exists(raw_path) processed_exists = processed_path and os.path.exists(processed_path) # ---------------------------------------------------- # CASE 1: RAW FILE EXISTS → ALWAYS RECOVERABLE # ---------------------------------------------------- if raw_exists: # auto-recover failed jobs if status == "failed": logger.warning( "Recovering failed job", extra={"recording_id": recording_id} ) recordings.update_one( {"_id": recording_id}, { "$set": { "status": "pending" } } ) return # ---------------------------------------------------- # CASE 2: BEFORE PREPROCESSING → RAW SHOULD EXIST # ---------------------------------------------------- if stage in ["ingestion", "preprocessing"]: logger.warning( "Raw file missing before preprocessing finished", extra={"recording_id": recording_id} ) return # ---------------------------------------------------- # CASE 3: AFTER PREPROCESSING → PROCESSED FILE REQUIRED # ---------------------------------------------------- if stage in POST_PREPROCESSING_STAGES: if processed_exists: return logger.error( "Both raw and processed files missing", extra={ "recording_id": recording_id, "stage": stage } ) 
mark_failed(recording_id, "Audio files missing") return def recover_stuck_jobs(): now = datetime.utcnow() stuck_threshold = now - timedelta(minutes=STUCK_TIMEOUT_MINUTES) stuck_jobs = recordings.find({ "status": "processing", "updated_at": {"$lt": stuck_threshold} }) for record in stuck_jobs: recording_id = record["_id"] logger.warning( "Recovering stuck job", extra={"recording_id": recording_id} ) recordings.update_one( {"_id": recording_id}, { "$set": { "status": "pending" } } ) def run_state_validator(): logger.info("State validator started") # validate files records = recordings.find({ "status": {"$in": ["pending", "processing", "failed"]} }) for record in records: validate_record(record) # recover stuck jobs recover_stuck_jobs() logger.info("State validator finished") ================================================ FILE: requirement.txt ================================================ librosa noisereduce elevenlabs soundfile asyncio confurrent.futures pymongo datetime hazms