Repository: Arefe-Masnouie/audio_transcript Branch: master Commit: 49af87240431 Files: 33 Total size: 33.6 KB Directory structure: gitextract_lm6zn_63/ ├── .gitignore ├── app/ │ ├── __init__.py │ ├── audio/ │ │ └── preprocess.py │ ├── config.py │ ├── dashboard/ │ │ ├── __init__.py │ │ └── dashboard.py │ ├── database/ │ │ ├── __init__.py │ │ ├── mongo_db.py │ │ ├── operation.py │ │ └── schemas.py │ ├── ingestion/ │ │ ├── __init__.py │ │ ├── deduplicator.py │ │ ├── fetcher.py │ │ ├── hashing.py │ │ ├── issabelle_client.py │ │ ├── load_audio.py │ │ └── runner.py │ ├── main.py │ ├── maintanance/ │ │ └── cleanup.py │ ├── pipeline/ │ │ ├── __init__.py │ │ ├── orchestrator.py │ │ └── worker.py │ ├── process/ │ │ ├── __init__.py │ │ ├── segmentation.py │ │ └── utils.py │ ├── stt/ │ │ ├── __init__.py │ │ ├── rag_segments.py │ │ └── transcribe.py │ └── utils/ │ ├── logger.py │ ├── retry.py │ ├── stage_manager.py │ └── state_validator.py └── requirement.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ /audio_data.transcription.json /db.ipynb /elevenlabs.ipynb /main.ipynb /venv /stt/__init__.py /data /logs /.env /app/nlp/pos_tagger.model ================================================ FILE: app/__init__.py ================================================ ================================================ FILE: app/audio/preprocess.py ================================================ import librosa import soundfile as sf import noisereduce as nr import os def preprocess_audio(input_path, output_path): try: #loading the audio, 16khz and in mono format y, sr = librosa.load(input_path, sr=16000, mono = True) #reducing the noise y = nr.reduce_noise( y=y, sr=sr, prop_decrease=0.8 ) #removing the silent time in the voices #y, _ = librosa.effects.trim(y, top_db=20) #voice normalization y = librosa.util.normalize(y) #saving the outputs os.makedirs(os.path.dirname(output_path), exist_ok=True) sf.write(output_path, y, 16000) return output_path except Exception as e: raise RuntimeError(f"Preprocessing failed: {str(e)}") ================================================ FILE: app/config.py ================================================ import os from dotenv import load_dotenv from pathlib import Path BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) load_dotenv() MONGO_URL = "mongodb://admin:secret@localhost:27017" ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") #PATHS RECORDINGS_PATH = os.path.join(BASE_DIR, "data", "raw") #then we can change it to the server url TEMP_DIR = os.path.join(BASE_DIR, "data", "temp") #this one is for later for when i wanted to handle errors and api calls MAX_RETRIES = 3 #SERVER CONFIG ISSABEL_HOST = os.getenv("ISSABEL_HOST") ISSABEL_PORT = 22 ISSABEL_USER = os.getenv("ISSABEL_USER") ISSABEL_PASSWORD = os.getenv("ISSABEL_PASSWORD") ISSABEL_REMOTE_PATH = os.getenv("ISSABEL_REMOTE_PATH") LOCAL_DOWNLOAD_PATH = os.path.join(BASE_DIR, "data", "raw") ================================================ FILE: app/dashboard/__init__.py ================================================ ================================================ FILE: app/dashboard/dashboard.py ================================================ import streamlit as st from collections import Counter from pymongo import MongoClient import pandas as pd MONGO_URL = "mongodb://admin:secret@localhost:27017" client = MongoClient(MONGO_URL) db = client.audio_data recordings = db['recordings'] transcription = db['transcription'] nlp = db['nlp'] st.set_page_config(layout="wide") st.title("🎙️ STT Pipeline Dashboard") # ========================= # 1. PIPELINE OVERVIEW # ========================= st.header("📊 Pipeline Overview") records = list(recordings.find()) if records: df = pd.DataFrame(records) status_counts = df["status"].value_counts().to_dict() col1, col2, col3, col4 = st.columns(4) col1.metric("Total Files", len(df)) col2.metric("Done", status_counts.get("done", 0)) col3.metric("Failed", status_counts.get("failed", 0)) col4.metric("Processing", status_counts.get("processing", 0)) success_rate = ( status_counts.get("done", 0) / len(df) if len(df) > 0 else 0 ) st.progress(success_rate) st.write(f"✅ Success Rate: {round(success_rate * 100, 2)}%") else: st.warning("No data available") # ========================= # 2. CURRENT PIPELINE STATE # ========================= st.header("⚙️ Current Pipeline State") state_counts = Counter([r.get("current_stage", "unknown") for r in records]) st.write(state_counts) # ========================= # REAL TIME LOG SYSTEM #========================== import json import pandas as pd st.header("📡 Logs Explorer") log_file = "logs/app.json" def load_logs(): logs = [] try: with open(log_file, "r") as f: for line in f: logs.append(json.loads(line)) except: pass return logs logs = load_logs() if logs: df_logs = pd.DataFrame(logs) # FILTERS col1, col2, col3 = st.columns(3) level_filter = col1.selectbox("Level", ["ALL"] + sorted(df_logs["level"].unique().tolist())) stage_filter = col2.selectbox("Stage", ["ALL"] + sorted(df_logs.get("stage", []).dropna().unique().tolist())) rec_filter = col3.text_input("Recording ID") filtered = df_logs.copy() if level_filter != "ALL": filtered = filtered[filtered["level"] == level_filter] if stage_filter != "ALL": filtered = filtered[filtered["stage"] == stage_filter] if rec_filter: filtered = filtered[filtered["recording_id"].astype(str).str.contains(rec_filter)] st.dataframe(filtered.sort_values("timestamp", ascending=False)) else: st.info("No logs yet") # ========================= # 3. RECENT ACTIVITY # ========================= st.header("🕒 Recent Activity") recent = recordings.find().sort("updated_at", -1).limit(10) for r in recent: st.write({ "file": r.get("filename"), "status": r.get("status"), "stage": r.get("current_stage"), "retry": r.get("retry_count"), }) # ========================= # 4. ERROR MONITORING # ========================= st.header("❌ Recent Errors") errors = recordings.find({ "status": {"$in": ["failed", "permanently_failed"]}, "error": {"$ne": None} }).sort("updated_at", -1).limit(10) for e in errors: st.error({ "file": e.get("filename"), "error": e.get("error"), "retry": e.get("retry_count") }) #===================== #RETRY MONITORING #=================== st.header("📈 Retry Monitoring") retry_data = [ { "file": r.get("filename"), "retry": r.get("retry_count"), "status": r.get("status") } for r in recordings.find() ] st.dataframe(retry_data) # ========================= # 5. TRANSCRIPTION INSIGHTS # ========================= st.header("📝 Transcription Insights") total_transcripts = transcription.count_documents({}) st.write(f"Total transcriptions: {total_transcripts}") # ========================= # 6. NLP INSIGHTS (optional) # ========================= st.header("🧠 NLP Insights") all_tokens = [] for doc in nlp.find(): all_tokens.extend(doc.get("filtered_tokens", [])) if all_tokens: top_keywords = Counter(all_tokens).most_common(10) st.write("Top Keywords:", top_keywords) else: st.info("No NLP data yet") #========== #FILE EXPLORER #============== st.header("📂 File Explorer") records = list(recordings.find()) file_map = { f"{r.get('filename')} | {str(r.get('_id'))[:6]}": r for r in records } selected_key = st.selectbox("Select a file", list(file_map.keys())) if selected_key: selected = file_map[selected_key] rec_id = selected["_id"] st.subheader("📄 Recording Metadata") st.json(selected) # ===================== # TRANSCRIPTION VIEW # ===================== trans_doc = transcription.find_one({"recording_id": rec_id}) if trans_doc: st.subheader("📝 Transcription") st.text_area("Full Text", trans_doc.get("text", ""), height=150) st.subheader("🗣️ Diarized Text") st.text_area("Speakers", trans_doc.get("diarized_text", ""), height=150) # ===================== # WORDS TABLE # ===================== st.subheader("🔍 Word-level Data") words = trans_doc.get("words", []) if words: st.dataframe(words[:100]) # limit for performance else: st.warning("No transcription found") ================================================ FILE: app/database/__init__.py ================================================ ================================================ FILE: app/database/mongo_db.py ================================================ from pymongo import MongoClient from app.config import MONGO_URL from app.utils.logger import logger #connecting to mongodb client = MongoClient(MONGO_URL) #selecting the database db = client.audio_data recordings = db['recordings'] transcription = db['transcription'] nlp = db['nlp'] rag_segments = db['rag_segments'] log = db['log'] ================================================ FILE: app/database/operation.py ================================================ from datetime import datetime from app.utils.logger import logger from app.config import MAX_RETRIES from app.database.mongo_db import recordings, transcription, rag_segments from app.database.schemas import build_recording_doc, build_transcription_doc, build_rag_segment import os #function to save the recordings metadata first def create_recordings(file_path,file_hash): doc = build_recording_doc(file_path, file_hash) return recordings.insert_one(doc).inserted_id def save_transcription(recording_id, result): doc = build_transcription_doc(recording_id, result) transcription.insert_one(doc) def save_rag_segments(recording_id, segments): doc = build_rag_segment(recording_id, segments) rag_segments.insert_one(doc) #function to update the stage of the recordings def update_stage(recording_id, stage): recordings.update_one( {"_id": recording_id}, {"$set": {"current_stage": stage, "status": "processing"}} ) #function to marke the done audio def mark_done(recording_id): recordings.update_one( {"_id": recording_id}, {"$set": {"status": "done", "current_stage": "done", "updated_at": datetime.utcnow()}} ) #function to mark the failed transcripted audio def mark_failed(recording_id, error): rec = recordings.find_one({"_id": recording_id}) if rec["status"] == "done": return new_retry = rec.get("retry_count", 0) + 1 status = "failed" if new_retry >= MAX_RETRIES: status = "permanently_failed" recordings.update_one( {"_id": recording_id}, { "$set": { "status": status, "error": error, "current_stage": "failed", "updated_at": datetime.utcnow() }, "$inc": {"retry_count": 1} } ) ================================================ FILE: app/database/schemas.py ================================================ from datetime import datetime import os #recordings collection def build_recording_doc(path, file_hash): return{ "filename": os.path.basename(path), "raw_path": path, "processed_path": None, "file_hash": file_hash, "file_size": os.path.getsize(path), "last_modified": os.path.getmtime(path), #pipeline status "status" : "pending", "current_stage": "ingestion", "stages":{ "preprocessing": "pending", "transcription": "pending", "postprocessing": "pending", "rag_segmentation" : "pending" }, "retry_count": 0, "error": None, "created_at": datetime.utcnow(), "updated_at": datetime.utcnow() } #transcription segments def build_transcription_doc(recording_id, result): return { "recording_id": recording_id, "text": result.get("text"), "diarized_text": result.get("diarized_text"), "speaker_segments": result.get("segments"), "words": result.get("words"), "audio_duration_secs": result.get("audio_duration_secs"), "created_at": datetime.utcnow() } #rag_Segment collection def build_rag_segment(recording_id, segments): return{ "recording_id": recording_id, "segments": segments, "created_at": datetime.utcnow() } ================================================ FILE: app/ingestion/__init__.py ================================================ ================================================ FILE: app/ingestion/deduplicator.py ================================================ from app.database.mongo_db import recordings from app.utils.logger import logger def is_duplicate(file_hash): return recordings.find_one({"file_hash": file_hash}) is not None ================================================ FILE: app/ingestion/fetcher.py ================================================ import os from app.ingestion.issabelle_client import create_sftp_client from app.config import ISSABEL_REMOTE_PATH, LOCAL_DOWNLOAD_PATH def fetch_new_files(): sftp, transport = create_sftp_client() os.makedirs(LOCAL_DOWNLOAD_PATH, exist_ok=True) downloaded = [] try: for file in sftp.listdir(ISSABEL_REMOTE_PATH): if not file.endswith(".wav"): continue remote_path = f"{ISSABEL_REMOTE_PATH}/{file}" local_path = os.path.join(LOCAL_DOWNLOAD_PATH, file) if os.path.exists(local_path): continue sftp.get(remote_path, local_path) downloaded.append(local_path) finally: sftp.close() transport.close() return downloaded ================================================ FILE: app/ingestion/hashing.py ================================================ import hashlib from app.utils.logger import logger def compute_file_hash(file_path, chunk_size = 8192): sha256 = hashlib.sha256() with open(file_path, "rb") as f: while chunk := f.read(chunk_size): sha256.update(chunk) return sha256.hexdigest() ================================================ FILE: app/ingestion/issabelle_client.py ================================================ import paramiko import os from app.config import ( ISSABEL_HOST, ISSABEL_PORT, ISSABEL_USER, ISSABEL_PASSWORD ) def create_sftp_client(): transport = paramiko.Transport((ISSABEL_HOST, ISSABEL_PORT)) transport.connect(username= ISSABEL_USER, password = ISSABEL_PASSWORD) sftp = paramiko.SFTPClient.from_transport(transport) return sftp, transport ================================================ FILE: app/ingestion/load_audio.py ================================================ import os from app.config import RECORDINGS_PATH from app.utils.logger import logger SUPPORTED_FORMATS = (".wav", ".mp3", ".flac", ".m4a") def get_audio_files(): audio_files = [] base_path = os.path.abspath(RECORDINGS_PATH) for root, _, files in os.walk(base_path): for file in files: if file.lower().endswith(SUPPORTED_FORMATS): full_path = os.path.abspath(os.path.join(root, file)) audio_files.append(full_path) #sorting the files audio_files.sort() return audio_files """ #previous code for file in os.listdir(RECORDINGS_PATH): if file.endswith(".wav") or file.endswith(".mp3"): audio_files.append(os.path.join(RECORDINGS_PATH, file)) return audio_files """ ================================================ FILE: app/ingestion/runner.py ================================================ from app.database.mongo_db import recordings from app.database.operation import create_recordings from app.ingestion.hashing import compute_file_hash from app.ingestion.deduplicator import is_duplicate from app.ingestion.load_audio import get_audio_files from app.utils.logger import logger import os from datetime import datetime import hashlib def ingest_new_files(): logger.info("Starting ingestion") audio_files = get_audio_files() new_count =0 skipped_count = 0 for path in audio_files: try: file_hash = compute_file_hash(path) if is_duplicate(file_hash): skipped_count += 1 logger.warning(f"Duplicate file skipped: {path}") continue create_recordings(path, file_hash) new_count +=1 except Exception as e: logger.error(f"Ingestion failed for {path}", extra = {"error": str(e)}) logger.info(f"Ingestion done: {new_count} new, {skipped_count} skipped") ================================================ FILE: app/main.py ================================================ import asyncio from app.pipeline.orchestrator import run_pipeline from app.ingestion.runner import ingest_new_files from app.utils.state_validator import run_state_validator if __name__ == "__main__": #validate states before starting the pipeline run_state_validator() #detect and ingest new files ingest_new_files() #processing the files asyncio.run(run_pipeline()) ================================================ FILE: app/maintanance/cleanup.py ================================================ import os from app.database.mongo_db import recordings from app.utils.logger import logger SAFE_TO_DELETE_STATUS = {"done", "permanently_failed"} def cleanup_files(dry_run: bool = True): logger.info("Starting cleanup job") records = recordings.find({ "status": {"$in": list(SAFE_TO_DELETE_STATUS)} }) deleted = 0 skipped = 0 for rec in records: raw_path = rec.get("raw_path") processed_path = rec.get("processed_path") recording_id = rec["_id"] logger.info( "checking record", extra={ "recording_id": str(rec["_id"]), "status": rec.get("status"), "raw_path": rec.get("raw_path") } ) for path in [raw_path, processed_path]: if not path: continue if not os.path.exists(path): skipped +=1 continue if dry_run: logger.info( "DRY RUN - would delete file", extra ={ "recording_id": str(recording_id), "path": path } ) continue try: os.remove(path) deleted += 1 logger.info( "File deleted safely", extra = { "recording_id": str(recording_id), "path": path } ) except Exception as e: logger.error( "Failed to delete file", extra = { "recording_id": str(recording_id), "path": path, "error": str(e) } ) logger.info( "Cleanup finished", extra ={ "deleted":deleted, "skipped": skipped } ) #to run seperatly cleanup_files() ================================================ FILE: app/pipeline/__init__.py ================================================ ================================================ FILE: app/pipeline/orchestrator.py ================================================ import asyncio from app.database.mongo_db import recordings from app.pipeline.worker import process_audio from app.config import MAX_RETRIES from app.utils.logger import logger """ from app.queue import queue from app.pipeline.worker import process_audio_sync """ async def run_pipeline(): logger.info("Pipeline started") #fetching the pending records from the database cursor = recordings.find({ "status": {"$in": ["pending", "failed"]}, "retry_count": {"$lt": MAX_RETRIES} }) tasks = [process_audio(r) for r in cursor] results =await asyncio.gather(*tasks, return_exceptions=True) for r in results: if isinstance(r, Exception): logger.error(f"Unhandled exception in pipeline: {r}") logger.info("Pipeline finished") """ # app/pipeline/orchestrator.py from app.queue import queue from app.pipeline.worker import process_audio_sync def run_pipeline(): logger.info("Pipeline started") cursor = recordings.find({ "status": {"$in": ["pending", "failed"]}, "retry_count": {"$lt": MAX_RETRIES} }) for record in cursor: queue.enqueue( process_audio_sync, record, job_timeout=600 # 10 min ) logger.info("Jobs enqueued") """ ================================================ FILE: app/pipeline/worker.py ================================================ import asyncio import os from concurrent.futures import ProcessPoolExecutor from app.utils.logger import logger from app.utils.retry import retry_async from app.database.mongo_db import recordings from app.utils.stage_manager import ( is_stage_done, mark_stage_done, mark_stage_failed ) from app.config import TEMP_DIR from app.audio.preprocess import preprocess_audio from app.stt.transcribe import transcribe from app.process.segmentation import segment_speakers from app.process.utils import words_to_dict, print_transcript_by_speaker from app.database.operation import ( update_stage, mark_done, mark_failed, save_transcription, save_rag_segments ) from app.stt.rag_segments import build_rag_segments semaphore = asyncio.Semaphore(3) executor = ProcessPoolExecutor(max_workers=os.cpu_count() // 2 or 1) async def process_audio(record): recording_id = record["_id"] success = False processed_path = record.get("processed_path") logger.info("Worker started", extra={"recording_id": recording_id}) try: #to remove the temp file after the successfully done process processed_path = None raw_path = record.get("raw_path") if not raw_path or not os.path.exists(raw_path): raise FileNotFoundError(f"Raw file missing: {raw_path}") # ---------------- PREPROCESS ---------------- update_stage(recording_id, "preprocessing") logger.info("Preprocessing started", extra = {"recording_Id": recording_id}) temp_path = os.path.join(TEMP_DIR, f"{recording_id}.wav") async def run_preprocessing(): loop = asyncio.get_event_loop() return await loop.run_in_executor( executor, preprocess_audio, raw_path, temp_path ) processed_path = await retry_async( run_preprocessing, retries=2, recording_id=recording_id, stage="preprocessing" ) recordings.update_one( {"_id": recording_id}, {"$set": {"processed_path": processed_path}} ) # ---------------- TRANSCRIPTION ---------------- update_stage(recording_id, "transcription") logger.info( "Transcription", extra = {"recording_id": recording_id} ) async with semaphore: result = await retry_async( transcribe, processed_path, retries=3, recording_id=recording_id, stage="transcription" ) # ---------------- POST PROCESS ---------------- update_stage(recording_id, "postprocessing") logger.info("Postprocessing started", extra = {"recording_id": recording_id}) diarized = print_transcript_by_speaker(result) words = words_to_dict(result) segments = segment_speakers(result) save_transcription(recording_id, { "text": result.get("text"), "diarized_text": diarized, "words": words, "segments": segments, "audio_duration_secs": result.get("audio_duration_secs") }) # ---------------- RAG ---------------- update_stage(recording_id, "rag_segmentation") logger.info( "RAG Segmentation started", extra = {"recording_id": recording_id} ) rag = build_rag_segments(result) save_rag_segments(recording_id, rag) # ---------------- DONE ---------------- mark_done(recording_id) success = True logger.info("Processing completed", extra={"recording_id": recording_id}) except Exception as e: logger.error("Processing failed", extra={ "recording_id": recording_id, "error": str(e) }) mark_failed(recording_id, str(e)) finally: if success: logger.info( "Cleaning up files(success)", extra = {"recording_id": recording_id} ) #delete the raw file if raw_path and os.path.exists(raw_path): try: os.remove(raw_path) except Exception as e: logger.warning( "Failed to delete raw file", extra = {"recording_id": recording_id, "error": str(e)} ) #deleting the TEMP if processed_path and os.path.exists(processed_path): try: os.remove(processed_path) except Exception as e: logger.warning( "Failed to delete temp file", extra = {"recording_id": recording_id, "error": str(e)} ) else: logger.info( "Skipping cleanup(failure)- files preserverd for retry", extra = {"recording_id": recording_id} ) #this part is to make the worker pipeline into a queue """def process_audio_sync(record): import asyncio return asyncio.run(process_audio(record)) import rq from redis import Redis redis_conn = Redis(host="localhost")""" ================================================ FILE: app/process/__init__.py ================================================ ================================================ FILE: app/process/segmentation.py ================================================ #segmentation def segment_speakers(result): words = result.get("words", []) segments = [] current = None for word in words: if word.get("type") != "word": continue if current is None: current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } continue if word.get("speaker_id") != current["speaker_id"]: current["text"] = " ".join(current["text"]) segments.append(current) current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } else: current["text"].append(word.get("text", "")) current["end"] = word.get("end") if current: current["text"] = " ".join(current["text"]) segments.append(current) return segments ================================================ FILE: app/process/utils.py ================================================ #words to dict def words_to_dict(result): return result.get("words", []) #speaker diarization function def print_transcript_by_speaker(result): words = result.get("words", []) current_speaker = None current_text = [] lines = [] for word in words: if word.get("type") != "word": continue if word.get("speaker_id") != current_speaker: if current_text: lines.append(f"{current_speaker}: {' '.join(current_text)}") current_speaker = word.get("speaker_id") current_text = [word.get("text", "")] else: current_text.append(word.get("text", "")) if current_text: lines.append(f"{current_speaker}: {' '.join(current_text)}") return "\n".join(lines) ================================================ FILE: app/stt/__init__.py ================================================ ================================================ FILE: app/stt/rag_segments.py ================================================ #rag segment funcion def build_rag_segments(result, max_duration = 15.0): words = result.get("words", []) segments = [] current = None for word in words: if word.get("type") != "word": continue if current is None: current = { "speaker_id": word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } continue duration = word.get("end", 0 ) - current['start'] #condition to split speaker_changed = word.get("speaker_id") != current['speaker_id'] too_long = duration > max_duration if speaker_changed or too_long: current['text'] = " ".join(current['text']) segments.append({ "speaker_id": current['speaker_id'], "start": current['start'], "end": current['end'], "text": " ".join(current['text']) }) current = { "speaker_id":word.get("speaker_id"), "start": word.get("start"), "end": word.get("end"), "text": [word.get("text", "")] } else: current['text'].append(word.get("text", '')) current['end'] = word.get("end") if current: segments.append({ "speaker_id": current['speaker_id'], "start": current['start'], "end": current['end'], "text": " ".join(current['text']) }) return segments ================================================ FILE: app/stt/transcribe.py ================================================ from elevenlabs import ElevenLabs from app.config import ELEVENLABS_API_KEY import asyncio client = ElevenLabs(api_key = ELEVENLABS_API_KEY) async def transcribe(audio_path): loop = asyncio.get_event_loop() def _sync_call(): with open(audio_path, "rb") as audio_file: result = client.speech_to_text.convert( file=audio_file, model_id="scribe_v2", diarize=True, language_code="fa", num_speakers=2, timestamps_granularity="word", keyterms= ["آوانته" , "کلاریس"] ) return result.dict() return await loop.run_in_executor(None, _sync_call) """result = transcribe("sample2.wav") json_result = json.dumps(result.dict(), indent = 4, ensure_ascii = False) print(json_result)""" ================================================ FILE: app/utils/logger.py ================================================ import logging import os import json from datetime import datetime # Configure logging LOG_DIR = "logs" os.makedirs(LOG_DIR, exist_ok = True) class JsonFormatter(logging.Formatter): def format(self, record): log_record = { "timestamp" : datetime.utcnow().isoformat(), "level": record.levelname, "message" : record.getMessage(), "module": record.module, "function": record.funcName } #attaching extra fields if exist if hasattr(record, "recording_id"): log_record['recording_id'] = str(record.recording_id) if hasattr(record, "stage"): log_record['stage'] = record.stage if hasattr(record, "error"): log_record['error'] = record.error return json.dumps(log_record) def setup_logger(): logger = logging.getLogger("pipeline") logger.setLevel(logging.INFO) #console handler console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter( logging.Formatter("[%(levelname)s] %(message)s") ) #file handler file_handler = logging.FileHandler(f"{LOG_DIR}/app.log") file_handler.setLevel(logging.INFO) file_handler.setFormatter( logging.Formatter( "%(asctime)s | %(levelname)s | %(message)s" ) ) #json handler json_handler = logging.FileHandler(f"{LOG_DIR}/app.json") json_handler.setLevel(logging.INFO) json_handler.setFormatter(JsonFormatter()) logger.addHandler(console_handler) logger.addHandler(file_handler) logger.addHandler(json_handler) return logger logger = setup_logger() ================================================ FILE: app/utils/retry.py ================================================ import asyncio from app.utils.logger import logger async def retry_async( func, *args, retries=3, base_delay =2, factor = 2, recording_id = None, stage = None, ): #generating retry with exponential backoff for attempt in range(retries): try: return await func(*args) except Exception as e: if attempt ==retries -1: logger.error( "Max retries reached", extra={ "recording_id": recording_id, "stage": stage, "error": str(e) } ) raise delay = base_delay * (factor ** attempt) logger.warning( f"Retrying in {delay}s (attempt {attempt+1})", extra = { "recording_id": recording_id, "stage": stage, "error": str(e) } ) await asyncio.sleep(delay) ================================================ FILE: app/utils/stage_manager.py ================================================ from app.database.mongo_db import recordings def is_stage_done(record, stage): return record.get("stages", {}).get(stage) == "done" def mark_stage_done(recording_id, stage): recordings.update_one( {"_id": recording_id}, { "$set": {f"stages.{stage}": "done"} } ) def mark_stage_failed(recording_id, stage, error): recordings.update_one( {"_id": recording_id}, { "$set": { f"stages.{stage}": "failed", "error": error }, "$inc": {"retry_count": 1} } ) ================================================ FILE: app/utils/state_validator.py ================================================ import os from datetime import datetime, timedelta from app.database.mongo_db import recordings from app.utils.logger import logger from app.database.operation import mark_failed POST_PREPROCESSING_STAGES = { "transcription", "postprocessing", "rag_segmentation", "done" } # how long a job can stay "processing" before considered stuck STUCK_TIMEOUT_MINUTES = 10 def validate_record(record): recording_id = record["_id"] raw_path = record.get("raw_path") processed_path = record.get("processed_path") stage = record.get("current_stage", "ingestion") status = record.get("status", "pending") updated_at = record.get("updated_at") raw_exists = raw_path and os.path.exists(raw_path) processed_exists = processed_path and os.path.exists(processed_path) # ---------------------------------------------------- # CASE 1: RAW FILE EXISTS → ALWAYS RECOVERABLE # ---------------------------------------------------- if raw_exists: # auto-recover failed jobs if status == "failed": logger.warning( "Recovering failed job", extra={"recording_id": recording_id} ) recordings.update_one( {"_id": recording_id}, { "$set": { "status": "pending" } } ) return # ---------------------------------------------------- # CASE 2: BEFORE PREPROCESSING → RAW SHOULD EXIST # ---------------------------------------------------- if stage in ["ingestion", "preprocessing"]: logger.warning( "Raw file missing before preprocessing finished", extra={"recording_id": recording_id} ) return # ---------------------------------------------------- # CASE 3: AFTER PREPROCESSING → PROCESSED FILE REQUIRED # ---------------------------------------------------- if stage in POST_PREPROCESSING_STAGES: if processed_exists: return logger.error( "Both raw and processed files missing", extra={ "recording_id": recording_id, "stage": stage } ) mark_failed(recording_id, "Audio files missing") return def recover_stuck_jobs(): now = datetime.utcnow() stuck_threshold = now - timedelta(minutes=STUCK_TIMEOUT_MINUTES) stuck_jobs = recordings.find({ "status": "processing", "updated_at": {"$lt": stuck_threshold} }) for record in stuck_jobs: recording_id = record["_id"] logger.warning( "Recovering stuck job", extra={"recording_id": recording_id} ) recordings.update_one( {"_id": recording_id}, { "$set": { "status": "pending" } } ) def run_state_validator(): logger.info("State validator started") # validate files records = recordings.find({ "status": {"$in": ["pending", "processing", "failed"]} }) for record in records: validate_record(record) # recover stuck jobs recover_stuck_jobs() logger.info("State validator finished") ================================================ FILE: requirement.txt ================================================ librosa noisereduce elevenlabs soundfile asyncio confurrent.futures pymongo datetime hazms