Repository: Arefe-Masnouie/audio_transcript
Branch: master
Commit: 49af87240431
Files: 33
Total size: 33.6 KB

Directory structure:
gitextract_lm6zn_63/

├── .gitignore
├── app/
│   ├── __init__.py
│   ├── audio/
│   │   └── preprocess.py
│   ├── config.py
│   ├── dashboard/
│   │   ├── __init__.py
│   │   └── dashboard.py
│   ├── database/
│   │   ├── __init__.py
│   │   ├── mongo_db.py
│   │   ├── operation.py
│   │   └── schemas.py
│   ├── ingestion/
│   │   ├── __init__.py
│   │   ├── deduplicator.py
│   │   ├── fetcher.py
│   │   ├── hashing.py
│   │   ├── issabelle_client.py
│   │   ├── load_audio.py
│   │   └── runner.py
│   ├── main.py
│   ├── maintanance/
│   │   └── cleanup.py
│   ├── pipeline/
│   │   ├── __init__.py
│   │   ├── orchestrator.py
│   │   └── worker.py
│   ├── process/
│   │   ├── __init__.py
│   │   ├── segmentation.py
│   │   └── utils.py
│   ├── stt/
│   │   ├── __init__.py
│   │   ├── rag_segments.py
│   │   └── transcribe.py
│   └── utils/
│       ├── logger.py
│       ├── retry.py
│       ├── stage_manager.py
│       └── state_validator.py
└── requirement.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
/audio_data.transcription.json
/db.ipynb
/elevenlabs.ipynb
/main.ipynb
/venv
/stt/__init__.py
/data
/logs
/.env
/app/nlp/pos_tagger.model

================================================
FILE: app/__init__.py
================================================


================================================
FILE: app/audio/preprocess.py
================================================
import librosa
import soundfile as sf
import noisereduce as nr
import os


def preprocess_audio(input_path, output_path):

    try:
        # load the audio at 16 kHz, mono
        y, sr = librosa.load(input_path, sr=16000, mono=True)

        # reduce background noise
        y = nr.reduce_noise(
            y=y,
            sr=sr,
            prop_decrease=0.8
        )

        # trimming silence is disabled for now
        # y, _ = librosa.effects.trim(y, top_db=20)

        # normalize the volume
        y = librosa.util.normalize(y)

        # save the output
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        sf.write(output_path, y, 16000)

        return output_path

    except Exception as e:
        raise RuntimeError(f"Preprocessing failed: {e}") from e
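
# Usage sketch (paths are placeholders):
#
#   preprocess_audio("data/raw/call.wav", "data/temp/call_clean.wav")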




================================================
FILE: app/config.py
================================================
import os
from dotenv import load_dotenv

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

load_dotenv()

MONGO_URL = os.getenv("MONGO_URL", "mongodb://admin:secret@localhost:27017")

ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")

# PATHS
RECORDINGS_PATH = os.path.join(BASE_DIR, "data", "raw")  # can later point at the server URL

TEMP_DIR = os.path.join(BASE_DIR, "data", "temp")

# used by the error handling and API retry logic
MAX_RETRIES = 3

# SERVER CONFIG
ISSABEL_HOST = os.getenv("ISSABEL_HOST")
ISSABEL_PORT = 22
ISSABEL_USER = os.getenv("ISSABEL_USER")
ISSABEL_PASSWORD = os.getenv("ISSABEL_PASSWORD")

ISSABEL_REMOTE_PATH = os.getenv("ISSABEL_REMOTE_PATH")
LOCAL_DOWNLOAD_PATH = os.path.join(BASE_DIR, "data", "raw")

================================================
FILE: app/dashboard/__init__.py
================================================


================================================
FILE: app/dashboard/dashboard.py
================================================
import json

import streamlit as st
from collections import Counter
from pymongo import MongoClient
import pandas as pd

MONGO_URL = "mongodb://admin:secret@localhost:27017"

client = MongoClient(MONGO_URL)
db = client.audio_data

recordings = db['recordings']
transcription = db['transcription']
nlp = db['nlp']

st.set_page_config(layout="wide")
st.title("🎙️ STT Pipeline Dashboard")

# =========================
# 1. PIPELINE OVERVIEW
# =========================

st.header("📊 Pipeline Overview")

records = list(recordings.find())

if records:
    df = pd.DataFrame(records)

    status_counts = df["status"].value_counts().to_dict()

    col1, col2, col3, col4 = st.columns(4)

    col1.metric("Total Files", len(df))
    col2.metric("Done", status_counts.get("done", 0))
    col3.metric("Failed", status_counts.get("failed", 0))
    col4.metric("Processing", status_counts.get("processing", 0))

    success_rate = (
        status_counts.get("done", 0) / len(df)
        if len(df) > 0 else 0
    )

    st.progress(success_rate)
    st.write(f"✅ Success Rate: {round(success_rate * 100, 2)}%")

else:
    st.warning("No data available")

# =========================
# 2. CURRENT PIPELINE STATE
# =========================

st.header("⚙️ Current Pipeline State")

state_counts = Counter([r.get("current_stage", "unknown") for r in records])
st.write(state_counts)

# =========================
# REAL-TIME LOG SYSTEM
# =========================

st.header("📡 Logs Explorer")

log_file = "logs/app.json"

def load_logs():
    logs = []
    try:
        with open(log_file, "r") as f:
            for line in f:
                logs.append(json.loads(line))
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    return logs

logs = load_logs()

if logs:
    df_logs = pd.DataFrame(logs)

    # FILTERS
    col1, col2, col3 = st.columns(3)

    level_filter = col1.selectbox("Level", ["ALL"] + sorted(df_logs["level"].unique().tolist()))
    stage_filter = col2.selectbox("Stage", ["ALL"] + sorted(df_logs.get("stage", pd.Series(dtype=str)).dropna().unique().tolist()))
    rec_filter = col3.text_input("Recording ID")

    filtered = df_logs.copy()

    if level_filter != "ALL":
        filtered = filtered[filtered["level"] == level_filter]

    if stage_filter != "ALL":
        filtered = filtered[filtered["stage"] == stage_filter]

    if rec_filter and "recording_id" in filtered.columns:
        filtered = filtered[filtered["recording_id"].astype(str).str.contains(rec_filter)]

    st.dataframe(filtered.sort_values("timestamp", ascending=False))
else:
    st.info("No logs yet")

# =========================
# 3. RECENT ACTIVITY
# =========================

st.header("🕒 Recent Activity")

recent = recordings.find().sort("updated_at", -1).limit(10)

for r in recent:
    st.write({
        "file": r.get("filename"),
        "status": r.get("status"),
        "stage": r.get("current_stage"),
        "retry": r.get("retry_count"),
    })

# =========================
# 4. ERROR MONITORING
# =========================

st.header("❌ Recent Errors")

errors = recordings.find({
    "status": {"$in": ["failed", "permanently_failed"]},
    "error": {"$ne": None}
}).sort("updated_at", -1).limit(10)

for e in errors:
    st.error({
        "file": e.get("filename"),
        "error": e.get("error"),
        "retry": e.get("retry_count")
    })

# =========================
# RETRY MONITORING
# =========================

st.header("📈 Retry Monitoring")

retry_data = [
    {
        "file": r.get("filename"),
        "retry": r.get("retry_count"),
        "status": r.get("status")
    }
    for r in recordings.find()
]

st.dataframe(retry_data)

# =========================
# 5. TRANSCRIPTION INSIGHTS
# =========================

st.header("📝 Transcription Insights")

total_transcripts = transcription.count_documents({})
st.write(f"Total transcriptions: {total_transcripts}")

# =========================
# 6. NLP INSIGHTS (optional)
# =========================

st.header("🧠 NLP Insights")

all_tokens = []
for doc in nlp.find():
    all_tokens.extend(doc.get("filtered_tokens", []))

if all_tokens:
    top_keywords = Counter(all_tokens).most_common(10)
    st.write("Top Keywords:", top_keywords)
else:
    st.info("No NLP data yet")


# =========================
# FILE EXPLORER
# =========================
st.header("📂 File Explorer")

records = list(recordings.find())

file_map = {
    f"{r.get('filename')} | {str(r.get('_id'))[:6]}": r
    for r in records
}

selected_key = st.selectbox("Select a file", list(file_map.keys()))

if selected_key:
    selected = file_map[selected_key]
    rec_id = selected["_id"]

    st.subheader("📄 Recording Metadata")
    st.json(selected)

    # =====================
    # TRANSCRIPTION VIEW
    # =====================
    trans_doc = transcription.find_one({"recording_id": rec_id})

    if trans_doc:
        st.subheader("📝 Transcription")

        st.text_area("Full Text", trans_doc.get("text", ""), height=150)

        st.subheader("🗣️ Diarized Text")
        st.text_area("Speakers", trans_doc.get("diarized_text", ""), height=150)

        # =====================
        # WORDS TABLE
        # =====================
        st.subheader("🔍 Word-level Data")

        words = trans_doc.get("words", [])
        if words:
            st.dataframe(words[:100])  # limit for performance

    else:
        st.warning("No transcription found")

  

================================================
FILE: app/database/__init__.py
================================================


================================================
FILE: app/database/mongo_db.py
================================================
from pymongo import MongoClient
from app.config import MONGO_URL

#connecting to mongodb
client = MongoClient(MONGO_URL)

#selecting the database
db = client.audio_data

recordings = db['recordings']
transcription = db['transcription']
nlp = db['nlp']
rag_segments = db['rag_segments']
log = db['log']





================================================
FILE: app/database/operation.py
================================================
from datetime import datetime
from app.config import MAX_RETRIES
from app.database.mongo_db import recordings, transcription, rag_segments
from app.database.schemas import build_recording_doc, build_transcription_doc, build_rag_segment


# save the recording metadata first
def create_recordings(file_path, file_hash):
    doc = build_recording_doc(file_path, file_hash)
    return recordings.insert_one(doc).inserted_id


def save_transcription(recording_id, result):
    doc = build_transcription_doc(recording_id, result)
    transcription.insert_one(doc)


def save_rag_segments(recording_id, segments):
    doc = build_rag_segment(recording_id, segments)
    rag_segments.insert_one(doc)


# update the current stage of a recording
def update_stage(recording_id, stage):
    recordings.update_one(
        {"_id": recording_id},
        {"$set": {"current_stage": stage,
                  "status": "processing",
                  # refresh updated_at so stuck-job detection sees active work
                  "updated_at": datetime.utcnow()}}
    )

# mark the recording as done
def mark_done(recording_id):
    recordings.update_one(
        {"_id": recording_id},
        {"$set": {"status": "done",
                   "current_stage": "done",
                   "updated_at": datetime.utcnow()}}
    )

# mark a recording as failed and track its retry count
def mark_failed(recording_id, error):

    rec = recordings.find_one({"_id": recording_id})

    if rec is None or rec["status"] == "done":
        return

    new_retry = rec.get("retry_count", 0) + 1

    status = "failed"
    if new_retry >= MAX_RETRIES:
        status = "permanently_failed"

    recordings.update_one(
        {"_id": recording_id},
        {
            "$set": {
                "status": status,
                "error": error,
                "current_stage": "failed",
                "updated_at": datetime.utcnow()
            },
            "$inc": {"retry_count": 1}
        }
    )
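

# A minimal usage sketch of the lifecycle helpers above (the path is a
# placeholder):
#
#   from app.ingestion.hashing import compute_file_hash
#   rec_id = create_recordings("data/raw/call.wav",
#                              compute_file_hash("data/raw/call.wav"))
#   update_stage(rec_id, "preprocessing")
#   mark_done(rec_id)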



================================================
FILE: app/database/schemas.py
================================================
from datetime import datetime
import os

# recordings collection
def build_recording_doc(path, file_hash):
    return {
        "filename": os.path.basename(path),

        "raw_path": path,
        "processed_path": None,

        "file_hash": file_hash,
        "file_size": os.path.getsize(path),
        "last_modified": os.path.getmtime(path),

        # pipeline status
        "status": "pending",
        "current_stage": "ingestion",

        "stages": {
            "preprocessing": "pending",
            "transcription": "pending",
            "postprocessing": "pending",
            "rag_segmentation": "pending"
        },

        "retry_count": 0,
        "error": None,

        "created_at": datetime.utcnow(),
        "updated_at": datetime.utcnow()
    }

# transcription collection
def build_transcription_doc(recording_id, result):
    return {
        "recording_id": recording_id,

        "text": result.get("text"),
        "diarized_text": result.get("diarized_text"),
        "speaker_segments": result.get("segments"),
        "words": result.get("words"),

        "audio_duration_secs": result.get("audio_duration_secs"),

        "created_at": datetime.utcnow()
    }

# rag_segments collection
def build_rag_segment(recording_id, segments):

    return {
        "recording_id": recording_id,
        "segments": segments,
        "created_at": datetime.utcnow()
    }



================================================
FILE: app/ingestion/__init__.py
================================================


================================================
FILE: app/ingestion/deduplicator.py
================================================
from app.database.mongo_db import recordings


def is_duplicate(file_hash):
    return recordings.find_one({"file_hash": file_hash}) is not None

================================================
FILE: app/ingestion/fetcher.py
================================================
import os
from app.ingestion.issabelle_client import create_sftp_client
from app.config import ISSABEL_REMOTE_PATH, LOCAL_DOWNLOAD_PATH


def fetch_new_files():
    sftp, transport = create_sftp_client()

    os.makedirs(LOCAL_DOWNLOAD_PATH, exist_ok=True)

    downloaded = []

    try:
        for file in sftp.listdir(ISSABEL_REMOTE_PATH):

            if not file.endswith(".wav"):
                continue

            remote_path = f"{ISSABEL_REMOTE_PATH}/{file}"
            local_path = os.path.join(LOCAL_DOWNLOAD_PATH, file)

            if os.path.exists(local_path):
                continue

            sftp.get(remote_path, local_path)
            downloaded.append(local_path)

    finally:
        sftp.close()
        transport.close()

    return downloaded

================================================
FILE: app/ingestion/hashing.py
================================================
import hashlib


def compute_file_hash(file_path, chunk_size=8192):
    sha256 = hashlib.sha256()

    with open(file_path, "rb") as f:
        while chunk := f.read(chunk_size):
            sha256.update(chunk)

    return sha256.hexdigest()

================================================
FILE: app/ingestion/issabelle_client.py
================================================
import paramiko
from app.config import (
    ISSABEL_HOST,
    ISSABEL_PORT,
    ISSABEL_USER,
    ISSABEL_PASSWORD
)


def create_sftp_client():
    transport = paramiko.Transport((ISSABEL_HOST, ISSABEL_PORT))
    transport.connect(username=ISSABEL_USER, password=ISSABEL_PASSWORD)

    sftp = paramiko.SFTPClient.from_transport(transport)

    return sftp, transport

================================================
FILE: app/ingestion/load_audio.py
================================================
import os
from app.config import RECORDINGS_PATH

SUPPORTED_FORMATS = (".wav", ".mp3", ".flac", ".m4a")

def get_audio_files():
    audio_files = []

    base_path = os.path.abspath(RECORDINGS_PATH)

    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(SUPPORTED_FORMATS):
                full_path = os.path.abspath(os.path.join(root, file))
                audio_files.append(full_path)

    # sort for a deterministic processing order
    audio_files.sort()

    return audio_files

"""
#previous code 
    for file in os.listdir(RECORDINGS_PATH):
        if file.endswith(".wav") or file.endswith(".mp3"):
            audio_files.append(os.path.join(RECORDINGS_PATH, file))
    return audio_files
"""

================================================
FILE: app/ingestion/runner.py
================================================
from app.database.operation import create_recordings
from app.ingestion.hashing import compute_file_hash
from app.ingestion.deduplicator import is_duplicate
from app.ingestion.load_audio import get_audio_files
from app.utils.logger import logger


def ingest_new_files():

    logger.info("Starting ingestion")

    audio_files = get_audio_files()

    new_count = 0
    skipped_count = 0

    for path in audio_files:

        try:
            file_hash = compute_file_hash(path)

            if is_duplicate(file_hash):
                skipped_count += 1
                logger.warning(f"Duplicate file skipped: {path}")
                continue

            create_recordings(path, file_hash)
            new_count += 1

        except Exception as e:
            logger.error(f"Ingestion failed for {path}",
                         extra={"error": str(e)})

    logger.info(f"Ingestion done: {new_count} new, {skipped_count} skipped")


================================================
FILE: app/main.py
================================================
import asyncio
from app.pipeline.orchestrator import run_pipeline
from app.ingestion.runner import ingest_new_files
from app.utils.state_validator import run_state_validator



if __name__ == "__main__":

    #validate states before starting the pipeline
    run_state_validator()

    #detect and ingest new files
    ingest_new_files()
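
    # fetch_new_files() from app.ingestion.fetcher could be called here
    # first to pull recordings from the Issabel server (not wired up yet)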
    
    #processing the files
    asyncio.run(run_pipeline())




================================================
FILE: app/maintanance/cleanup.py
================================================
import os
from app.database.mongo_db import recordings
from app.utils.logger import logger

SAFE_TO_DELETE_STATUS = {"done", "permanently_failed"}

def cleanup_files(dry_run: bool = True):

    logger.info("Starting cleanup job")

    records = recordings.find({
        "status": {"$in": list(SAFE_TO_DELETE_STATUS)}
    })

    deleted = 0
    skipped = 0

    for rec in records:

        raw_path = rec.get("raw_path")
        processed_path = rec.get("processed_path")
        recording_id = rec["_id"]

        
        logger.info(
            "checking record",
            extra={
                "recording_id": str(rec["_id"]),
                "status": rec.get("status"),
                "raw_path": rec.get("raw_path")
            }
        )


        for path in [raw_path, processed_path]:

            if not path:
                continue

            if not os.path.exists(path):
                skipped += 1
                continue

            if dry_run:
                logger.info(
                    "DRY RUN - would delete file",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path
                    }
                )
                continue

            try:
                os.remove(path)
                deleted += 1

                logger.info(
                    "File deleted safely",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path
                    }
                )

            except Exception as e:
                logger.error(
                    "Failed to delete file",
                    extra={
                        "recording_id": str(recording_id),
                        "path": path,
                        "error": str(e)
                    }
                )

    logger.info(
        "Cleanup finished",
        extra={
            "deleted": deleted,
            "skipped": skipped
        }
    )

# to run separately (dry_run=True only logs; pass dry_run=False to delete)
if __name__ == "__main__":
    cleanup_files()

================================================
FILE: app/pipeline/__init__.py
================================================


================================================
FILE: app/pipeline/orchestrator.py
================================================
import asyncio
from app.database.mongo_db import recordings
from app.pipeline.worker import process_audio
from app.config import MAX_RETRIES
from app.utils.logger import logger

"""
from app.queue import queue
from app.pipeline.worker import process_audio_sync
"""


async def run_pipeline():
    logger.info("Pipeline started")

    # fetch the pending records from the database
    cursor = recordings.find({
        "status": {"$in": ["pending", "failed"]},
        "retry_count": {"$lt": MAX_RETRIES}
    })

    tasks = [process_audio(r) for r in cursor]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    for r in results:
        if isinstance(r, Exception):
            logger.error(f"Unhandled exception in pipeline: {r}")

    logger.info("Pipeline finished")

    """
    # app/pipeline/orchestrator.py

from app.queue import queue
from app.pipeline.worker import process_audio_sync

def run_pipeline():
    logger.info("Pipeline started")

    cursor = recordings.find({
        "status": {"$in": ["pending", "failed"]},
        "retry_count": {"$lt": MAX_RETRIES}
    })

    for record in cursor:
        queue.enqueue(
            process_audio_sync,
            record,
            job_timeout=600  # 10 min
        )

    logger.info("Jobs enqueued")
    """

================================================
FILE: app/pipeline/worker.py
================================================
import asyncio
import os
from concurrent.futures import ProcessPoolExecutor

from app.utils.logger import logger
from app.utils.retry import retry_async

from app.database.mongo_db import recordings

from app.config import TEMP_DIR
from app.audio.preprocess import preprocess_audio
from app.stt.transcribe import transcribe

from app.process.segmentation import segment_speakers
from app.process.utils import words_to_dict, print_transcript_by_speaker

from app.database.operation import (
    update_stage,
    mark_done,
    mark_failed,
    save_transcription,
    save_rag_segments
)

from app.stt.rag_segments import build_rag_segments

# limit concurrent ElevenLabs requests; CPU-bound preprocessing runs in a process pool
semaphore = asyncio.Semaphore(3)
executor = ProcessPoolExecutor(max_workers=max(1, (os.cpu_count() or 2) // 2))


async def process_audio(record):
    recording_id = record["_id"]
    success = False
    raw_path = record.get("raw_path")
    processed_path = None  # temp file path, removed after a successful run

    logger.info("Worker started", extra={"recording_id": recording_id})

    try:
        if not raw_path or not os.path.exists(raw_path):
            raise FileNotFoundError(f"Raw file missing: {raw_path}")

        # ---------------- PREPROCESS ----------------
        update_stage(recording_id, "preprocessing")

        logger.info("Preprocessing started",
                    extra={"recording_id": recording_id})

        temp_path = os.path.join(TEMP_DIR, f"{recording_id}.wav")

        async def run_preprocessing():
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                executor,
                preprocess_audio,
                raw_path,
                temp_path
            )

        processed_path = await retry_async(
            run_preprocessing,
            retries=2,
            recording_id=recording_id,
            stage="preprocessing"
        )

        recordings.update_one(
            {"_id": recording_id},
            {"$set": {"processed_path": processed_path}}
        )

        # ---------------- TRANSCRIPTION ----------------
        update_stage(recording_id, "transcription")
        
        logger.info(
            "Transcription started",
            extra={"recording_id": recording_id}
        )

        async with semaphore:
            result = await retry_async(
                transcribe,
                processed_path,
                retries=3,
                recording_id=recording_id,
                stage="transcription"
            )

        # ---------------- POST PROCESS ----------------
        update_stage(recording_id, "postprocessing")

        logger.info("Postprocessing started",
                    extra = {"recording_id": recording_id})

        diarized = print_transcript_by_speaker(result)
        words = words_to_dict(result)
        segments = segment_speakers(result)

        save_transcription(recording_id, {
            "text": result.get("text"),
            "diarized_text": diarized,
            "words": words,
            "segments": segments,
            "audio_duration_secs": result.get("audio_duration_secs")
        })

        # ---------------- RAG ----------------
        update_stage(recording_id, "rag_segmentation")

        logger.info(
            "RAG segmentation started",
            extra={"recording_id": recording_id}
        )

        rag = build_rag_segments(result)
        save_rag_segments(recording_id, rag)

        # ---------------- DONE ----------------
        mark_done(recording_id)
        success = True

        logger.info("Processing completed", extra={"recording_id": recording_id})

    except Exception as e:
        logger.error("Processing failed", extra={
            "recording_id": recording_id,
            "error": str(e)
        })
        mark_failed(recording_id, str(e))

    finally:
        if success:
            logger.info(
                "Cleaning up files (success)",
                extra={"recording_id": recording_id}
            )

            # delete the raw file
            if raw_path and os.path.exists(raw_path):
                try:
                    os.remove(raw_path)
                except Exception as e:
                    logger.warning(
                        "Failed to delete raw file",
                        extra={"recording_id": recording_id, "error": str(e)}
                    )

            # delete the temp processed file
            if processed_path and os.path.exists(processed_path):
                try:
                    os.remove(processed_path)
                except Exception as e:
                    logger.warning(
                        "Failed to delete temp file",
                        extra={"recording_id": recording_id, "error": str(e)}
                    )

        else:
            logger.info(
                "Skipping cleanup (failure) - files preserved for retry",
                extra={"recording_id": recording_id}
            )


# this part turns the worker into a queue-based variant (kept for later)
"""def process_audio_sync(record):
    import asyncio
    return asyncio.run(process_audio(record))

import rq
from redis import Redis

redis_conn = Redis(host="localhost")"""

================================================
FILE: app/process/__init__.py
================================================


================================================
FILE: app/process/segmentation.py
================================================
# speaker-turn segmentation: group consecutive words by speaker

def segment_speakers(result):
    words = result.get("words", [])

    segments = []
    current = None

    for word in words:
        if word.get("type") != "word":
            continue

        if current is None:
            current = {
                "speaker_id": word.get("speaker_id"),
                "start": word.get("start"),
                "end": word.get("end"),
                "text": [word.get("text", "")]
            }
            continue

        if word.get("speaker_id") != current["speaker_id"]:
            current["text"] = " ".join(current["text"])
            segments.append(current)

            current = {
                "speaker_id": word.get("speaker_id"),
                "start": word.get("start"),
                "end": word.get("end"),
                "text": [word.get("text", "")]
            }
        else:
            current["text"].append(word.get("text", ""))
            current["end"] = word.get("end")

    if current:
        current["text"] = " ".join(current["text"])
        segments.append(current)

    return segments
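
# A minimal sketch of the expected input/output, assuming ElevenLabs-style
# word objects (sample values below are illustrative):
#
#   result = {"words": [
#       {"type": "word", "speaker_id": "A", "start": 0.0, "end": 0.4, "text": "hello"},
#       {"type": "word", "speaker_id": "B", "start": 0.5, "end": 0.9, "text": "hi"},
#   ]}
#   segment_speakers(result)
#   # -> [{"speaker_id": "A", "start": 0.0, "end": 0.4, "text": "hello"},
#   #     {"speaker_id": "B", "start": 0.5, "end": 0.9, "text": "hi"}]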

================================================
FILE: app/process/utils.py
================================================
# extract the word-level entries from a transcription result
def words_to_dict(result):
    return result.get("words", [])


# build a diarized transcript string, one line per speaker turn
def print_transcript_by_speaker(result):
    words = result.get("words", [])

    current_speaker = None
    current_text = []
    lines = []

    for word in words:
        if word.get("type") != "word":
            continue

        if word.get("speaker_id") != current_speaker:
            if current_text:
                lines.append(f"{current_speaker}: {' '.join(current_text)}")

            current_speaker = word.get("speaker_id")
            current_text = [word.get("text", "")]
        else:
            current_text.append(word.get("text", ""))

    if current_text:
        lines.append(f"{current_speaker}: {' '.join(current_text)}")

    return "\n".join(lines)

================================================
FILE: app/stt/__init__.py
================================================


================================================
FILE: app/stt/rag_segments.py
================================================
# RAG segmentation: split on speaker change or when a segment grows too long
def build_rag_segments(result, max_duration=15.0):

    words = result.get("words", [])

    segments = []
    current = None

    for word in words:

        if word.get("type") != "word":
            continue

        if current is None:
            current = {
                "speaker_id": word.get("speaker_id"),
                "start": word.get("start"),
                "end": word.get("end"),
                "text": [word.get("text", "")]
            }
            continue

        duration = word.get("end", 0) - current["start"]

        # conditions to split
        speaker_changed = word.get("speaker_id") != current["speaker_id"]
        too_long = duration > max_duration

        if speaker_changed or too_long:
            segments.append({
                "speaker_id": current["speaker_id"],
                "start": current["start"],
                "end": current["end"],
                "text": " ".join(current["text"])
            })

            current = {
                "speaker_id": word.get("speaker_id"),
                "start": word.get("start"),
                "end": word.get("end"),
                "text": [word.get("text", "")]
            }

        else:
            current["text"].append(word.get("text", ""))
            current["end"] = word.get("end")

    if current:
        segments.append({
            "speaker_id": current["speaker_id"],
            "start": current["start"],
            "end": current["end"],
            "text": " ".join(current["text"])
        })

    return segments



================================================
FILE: app/stt/transcribe.py
================================================
from elevenlabs import ElevenLabs
from app.config import ELEVENLABS_API_KEY
import asyncio


client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

async def transcribe(audio_path):
    loop = asyncio.get_running_loop()

    def _sync_call():
        with open(audio_path, "rb") as audio_file:
            result = client.speech_to_text.convert(
                file=audio_file,
                model_id="scribe_v2",
                diarize=True,
                language_code="fa",
                num_speakers=2,
                timestamps_granularity="word",
                keyterms=["آوانته", "کلاریس"]
            )
            return result.dict()

    return await loop.run_in_executor(None, _sync_call)




"""result = transcribe("sample2.wav")
json_result = json.dumps(result.dict(), indent = 4, ensure_ascii = False)
print(json_result)"""



================================================
FILE: app/utils/logger.py
================================================
import logging
import os 
import json
from datetime import datetime

# Configure logging
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok = True)

class JsonFormatter(logging.Formatter):
    def format(self, record):
        log_record = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName
        }

        # attach extra fields if present
        if hasattr(record, "recording_id"):
            log_record['recording_id'] = str(record.recording_id)

        if hasattr(record, "stage"):
            log_record['stage'] = record.stage

        if hasattr(record, "error"):
            log_record['error'] = record.error

        return json.dumps(log_record)
    

def setup_logger():
    logger = logging.getLogger("pipeline")
    logger.setLevel(logging.INFO)

    #console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(
        logging.Formatter("[%(levelname)s] %(message)s")
    )

    #file handler
    file_handler = logging.FileHandler(f"{LOG_DIR}/app.log")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s | %(levelname)s | %(message)s"
        )
    )

    #json handler
    json_handler = logging.FileHandler(f"{LOG_DIR}/app.json")
    json_handler.setLevel(logging.INFO)
    json_handler.setFormatter(JsonFormatter())

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.addHandler(json_handler)

    return logger

logger = setup_logger()
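
# Usage sketch: fields passed via `extra` become attributes on the log
# record, which JsonFormatter picks up (names match the hasattr checks above):
#
#   logger.info("Transcription started",
#               extra={"recording_id": "abc123", "stage": "transcription"})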

================================================
FILE: app/utils/retry.py
================================================
import asyncio
from app.utils.logger import logger

async def retry_async(
        func,
        *args,
        retries=3,
        base_delay=2,
        factor=2,
        recording_id=None,
        stage=None,
):

    # retry with exponential backoff
    for attempt in range(retries):
        try:
            return await func(*args)

        except Exception as e:

            if attempt == retries - 1:
                logger.error(
                    "Max retries reached",
                    extra={
                        "recording_id": recording_id,
                        "stage": stage,
                        "error": str(e)
                    }
                )
                raise

            delay = base_delay * (factor ** attempt)

            logger.warning(
                f"Retrying in {delay}s (attempt {attempt+1})",
                extra={
                    "recording_id": recording_id,
                    "stage": stage,
                    "error": str(e)
                }
            )

            await asyncio.sleep(delay)
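
# Usage sketch (`flaky` is a hypothetical coroutine; with the defaults the
# waits are 2s then 4s, and the third failure re-raises):
#
#   async def flaky():
#       ...
#
#   result = await retry_async(flaky, retries=3, stage="transcription")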

================================================
FILE: app/utils/stage_manager.py
================================================
from app.database.mongo_db import recordings


def is_stage_done(record, stage):
    return record.get("stages", {}).get(stage) == "done"


def mark_stage_done(recording_id, stage):
    recordings.update_one(
        {"_id": recording_id},
        {
            "$set": {f"stages.{stage}": "done"}
        }
    )


def mark_stage_failed(recording_id, stage, error):
    recordings.update_one(
        {"_id": recording_id},
        {
            "$set": {
                f"stages.{stage}": "failed",
                "error": error
            },
            "$inc": {"retry_count": 1}
        }
    )

================================================
FILE: app/utils/state_validator.py
================================================
import os
from datetime import datetime, timedelta

from app.database.mongo_db import recordings
from app.utils.logger import logger
from app.database.operation import mark_failed


POST_PREPROCESSING_STAGES = {
    "transcription",
    "postprocessing",
    "rag_segmentation",
    "done"
}

# how long a job can stay "processing" before considered stuck
STUCK_TIMEOUT_MINUTES = 10


def validate_record(record):

    recording_id = record["_id"]

    raw_path = record.get("raw_path")
    processed_path = record.get("processed_path")

    stage = record.get("current_stage", "ingestion")
    status = record.get("status", "pending")

    updated_at = record.get("updated_at")

    raw_exists = raw_path and os.path.exists(raw_path)
    processed_exists = processed_path and os.path.exists(processed_path)



    # ----------------------------------------------------
    # CASE 1: RAW FILE EXISTS → ALWAYS RECOVERABLE
    # ----------------------------------------------------

    if raw_exists:

        # auto-recover failed jobs
        if status == "failed":

            logger.warning(
                "Recovering failed job",
                extra={"recording_id": recording_id}
            )

            recordings.update_one(
                {"_id": recording_id},
                {
                    "$set": {
                        "status": "pending"
                    }
                }
            )

        return

    # ----------------------------------------------------
    # CASE 2: BEFORE PREPROCESSING → RAW SHOULD EXIST
    # ----------------------------------------------------

    if stage in ["ingestion", "preprocessing"]:

        logger.warning(
            "Raw file missing before preprocessing finished",
            extra={"recording_id": recording_id}
        )

        return

    # ----------------------------------------------------
    # CASE 3: AFTER PREPROCESSING → PROCESSED FILE REQUIRED
    # ----------------------------------------------------

    if stage in POST_PREPROCESSING_STAGES:

        if processed_exists:
            return

        logger.error(
            "Both raw and processed files missing",
            extra={
                "recording_id": recording_id,
                "stage": stage
            }
        )

        mark_failed(recording_id, "Audio files missing")
        return


def recover_stuck_jobs():

    now = datetime.utcnow()

    stuck_threshold = now - timedelta(minutes=STUCK_TIMEOUT_MINUTES)

    stuck_jobs = recordings.find({
        "status": "processing",
        "updated_at": {"$lt": stuck_threshold}
    })

    for record in stuck_jobs:

        recording_id = record["_id"]

        logger.warning(
            "Recovering stuck job",
            extra={"recording_id": recording_id}
        )

        recordings.update_one(
            {"_id": recording_id},
            {
                "$set": {
                    "status": "pending"
                }
            }
        )


def run_state_validator():

    logger.info("State validator started")

    # validate files
    records = recordings.find({
        "status": {"$in": ["pending", "processing", "failed"]}
    })

    for record in records:
        validate_record(record)

    # recover stuck jobs
    recover_stuck_jobs()

    logger.info("State validator finished")


================================================
FILE: requirement.txt
================================================
librosa
noisereduce
elevenlabs
soundfile
pymongo
paramiko
python-dotenv
streamlit
pandas
hazm
SYMBOL INDEX (35 symbols across 21 files)

FILE: app/audio/preprocess.py
  function preprocess_audio (line 7) | def preprocess_audio(input_path, output_path):

FILE: app/dashboard/dashboard.py
  function load_logs (line 68) | def load_logs():

FILE: app/database/operation.py
  function create_recordings (line 10) | def create_recordings(file_path,file_hash):
  function save_transcription (line 15) | def save_transcription(recording_id, result):
  function save_rag_segments (line 20) | def save_rag_segments(recording_id, segments):
  function update_stage (line 26) | def update_stage(recording_id, stage):
  function mark_done (line 33) | def mark_done(recording_id):
  function mark_failed (line 42) | def mark_failed(recording_id, error):

FILE: app/database/schemas.py
  function build_recording_doc (line 5) | def build_recording_doc(path, file_hash):
  function build_transcription_doc (line 36) | def build_transcription_doc(recording_id, result):
  function build_rag_segment (line 51) | def build_rag_segment(recording_id, segments):

FILE: app/ingestion/deduplicator.py
  function is_duplicate (line 4) | def is_duplicate(file_hash):

FILE: app/ingestion/fetcher.py
  function fetch_new_files (line 6) | def fetch_new_files():

FILE: app/ingestion/hashing.py
  function compute_file_hash (line 4) | def compute_file_hash(file_path, chunk_size = 8192):

FILE: app/ingestion/issabelle_client.py
  function create_sftp_client (line 11) | def create_sftp_client():

FILE: app/ingestion/load_audio.py
  function get_audio_files (line 7) | def get_audio_files():

FILE: app/ingestion/runner.py
  function ingest_new_files (line 12) | def ingest_new_files():

FILE: app/maintanance/cleanup.py
  function cleanup_files (line 7) | def cleanup_files(dry_run: bool = True):

FILE: app/pipeline/orchestrator.py
  function run_pipeline (line 13) | async def run_pipeline():

FILE: app/pipeline/worker.py
  function process_audio (line 37) | async def process_audio(record):

FILE: app/process/segmentation.py
  function segment_speakers (line 3) | def segment_speakers(result):

FILE: app/process/utils.py
  function words_to_dict (line 3) | def words_to_dict(result):
  function print_transcript_by_speaker (line 8) | def print_transcript_by_speaker(result):

FILE: app/stt/rag_segments.py
  function build_rag_segments (line 2) | def build_rag_segments(result, max_duration = 15.0):

FILE: app/stt/transcribe.py
  function transcribe (line 10) | async def transcribe(audio_path):

FILE: app/utils/logger.py
  class JsonFormatter (line 10) | class JsonFormatter(logging.Formatter):
    method format (line 11) | def format(self, record):
  function setup_logger (line 33) | def setup_logger():

FILE: app/utils/retry.py
  function retry_async (line 4) | async def retry_async(

FILE: app/utils/stage_manager.py
  function is_stage_done (line 4) | def is_stage_done(record, stage):
  function mark_stage_done (line 8) | def mark_stage_done(recording_id, stage):
  function mark_stage_failed (line 17) | def mark_stage_failed(recording_id, stage, error):

FILE: app/utils/state_validator.py
  function validate_record (line 20) | def validate_record(record):
  function recover_stuck_jobs (line 96) | def recover_stuck_jobs():
  function run_state_validator (line 126) | def run_state_validator():
