kingabzpro committed
Commit 6f481b1 · verified · 1 Parent(s): 980c187

Update app.py

Files changed (1)
  1. app.py +80 -179
app.py CHANGED
@@ -1,9 +1,9 @@
-# app.py – Urdu Whisper (CT2) transcription demo with upload + record + optional LLM polishing via Groq
+# app.py – Urdu Whisper (CT2) transcription with unified audio input + collapsible settings
 
 import os
 import json
 from datetime import timedelta
-from typing import List, Tuple, Optional
+from typing import List, Optional
 
 import gradio as gr
 import torch
@@ -13,16 +13,15 @@ import faster_whisper
 # Config
 # ────────────────────────────────────────────────────────────────────────────────
 
-# (Optional) cache Hugging Face files in a persistent dir when running in Spaces
 os.environ.setdefault("HF_HOME", "/home/user/app/.cache")
 
 MODEL_ID_CT2 = "kingabzpro/whisper-large-v3-urdu-ct2"
-GROQ_MODEL = "openai/gpt-oss-120b"  # as requested
+GROQ_MODEL = "openai/gpt-oss-120b"
 DEFAULT_SYSTEM_PROMPT_UR = (
     "آپ ایک ماہر اردو زبان ایڈیٹر ہیں۔ دیے گئے متن کو بہتر اردو املا، "
     "رموزِ اوقاف، وقفوں اور قدرتی روانی کے ساتھ پیش کریں۔ "
     "بولنے والے کے انداز اور معنی کو برقرار رکھیں، مبالغہ نہ کریں، "
-    "انگریزی الفاظ اگر بول چال میں عمومی ہوں تو برقرار رہنے دیں۔"
+    "انگریزی کے عام بول چال کے الفاظ برقرار رہنے دیں۔"
 )
 
 # ────────────────────────────────────────────────────────────────────────────────
@@ -30,141 +29,70 @@ DEFAULT_SYSTEM_PROMPT_UR = (
 # ────────────────────────────────────────────────────────────────────────────────
 
 def format_timestamp(seconds: float, format_type: str = "srt") -> str:
-    """
-    Format seconds to SRT/VTT timestamp with millisecond precision.
-    """
-    if seconds is None:
-        seconds = 0.0
-    total_ms = int(round(seconds * 1000))
-    hours, rem_ms = divmod(total_ms, 3600_000)
+    total_ms = int(round((seconds or 0.0) * 1000))
+    hours, rem_ms = divmod(total_ms, 3_600_000)
     minutes, rem_ms = divmod(rem_ms, 60_000)
     sec, ms = divmod(rem_ms, 1000)
    sep = "," if format_type == "srt" else "."
     return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"
 
-
 def basic_urdu_cleanup(text: str) -> str:
-    """
-    Lightweight post-processing before/without LLM:
-    - normalize whitespace
-    - swap some Latin punctuation to Urdu-friendly alternatives where appropriate
-    - fix duplicated punctuation
-    """
     if not text:
         return text
-
-    # Whitespace normalize
     t = " ".join(text.split())
-
-    # Common punctuation mapping (keep English tech terms intact; minimal changes)
     replacements = {
-        " ,": ",",
-        " .": ".",
-        " ?": "?",
-        " !": "!",
-        " ،": "،",
-        " ۔": "۔",
-        ",": "،",  # prefer Arabic comma
-        ";": "؛",  # Arabic semicolon
-        ". . .": "…",
-        "...": "…",
+        " ,": ",", " .": ".", " ?": "?", " !": "!", " ،": "،", " ۔": "۔",
+        ",": "،", ";": "؛", ". . .": "…", "...": "…",
     }
     for a, b in replacements.items():
         t = t.replace(a, b)
-
-    # Remove spaces before Urdu punctuation
     t = t.replace(" ،", "،").replace(" ۔", "۔").replace(" ؛", "؛").replace(" ؟", "؟")
-    # Ensure a space after punctuation where natural
     for p in ["،", "؛", ",", ";"]:
         t = t.replace(p, p + " ")
-    t = " ".join(t.split())  # re-trim
-
-    return t.strip()
-
-
-# ────────────────────────────────────────────────────────────────────────────────
-# Groq LLM (OpenAI-compatible) client
-# ────────────────────────────────────────────────────────────────────────────────
+    return " ".join(t.split()).strip()
 
 def get_groq_client(api_key: Optional[str] = None):
-    """
-    Lazily import and initialize Groq client.
-    Priority: explicit api_key arg → env GROQ_API_KEY → None (disabled).
-    """
-    key = api_key or os.getenv("GROQ_API_KEY", "").strip()
+    key = (api_key or os.getenv("GROQ_API_KEY", "")).strip()
     if not key:
         return None
     try:
         from groq import Groq  # type: ignore
         return Groq(api_key=key)
     except Exception as e:
-        print(f"[WARN] Could not import/init Groq client: {e}")
+        print(f"[WARN] Groq client init failed: {e}")
         return None
 
-
-def enhance_text_with_llm(
-    text: str,
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> str:
-    """
-    Send full transcript to Groq for Urdu polishing.
-    Returns original text on error/fallback.
-    """
+def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
+                          system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
     client = get_groq_client(api_key)
     if not client:
         return basic_urdu_cleanup(text)
-
     try:
-        # OpenAI-style Chat Completions
         resp = client.chat.completions.create(
             model=GROQ_MODEL,
             temperature=float(temperature),
             messages=[
                 {"role": "system", "content": system_prompt},
-                {
-                    "role": "user",
-                    "content": (
-                        "براہِ کرم درج ذیل اردو متن کی زبان بہتر کریں، "
-                        "صرف بہتر متن واپس کریں، کوئی اضافی تبصرہ نہ کریں:\n\n"
-                        f"{text}"
-                    ),
-                },
+                {"role": "user", "content": "براہِ کرم اس متن کی اردو بہتر کریں اور صرف بہتر متن واپس کریں:\n\n" + text},
             ],
         )
-        improved = resp.choices[0].message.content.strip()
-        return improved or basic_urdu_cleanup(text)
+        return (resp.choices[0].message.content or "").strip() or basic_urdu_cleanup(text)
     except Exception as e:
-        print(f"[WARN] LLM full-text enhance failed, falling back. Error: {e}")
+        print(f"[WARN] LLM full-text enhance failed: {e}")
         return basic_urdu_cleanup(text)
 
-
-def enhance_lines_with_llm(
-    lines: List[str],
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> List[str]:
-    """
-    Batch-enhance multiple short lines (e.g., subtitle segments) while preserving order.
-    We enumerate lines and ask model to return same count with the same numbering.
-    Falls back to basic cleanup per line on failure.
-    """
+def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
                           system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
     if not lines:
         return lines
-
     client = get_groq_client(api_key)
     if not client:
         return [basic_urdu_cleanup(x) for x in lines]
 
-    # Build a numbered list
     numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
-
     user_msg = (
-        "درج ذیل جملوں کی اردو بہتر بنائیں۔ اسی ترتیب سے بالکل اتنی ہی سطور "
-        "واپس کریں، ہر سطر اسی نمبر کے ساتھ ہو۔ صرف بہتر جملے دیں، اضافی متن نہ دیں.\n\n"
-        f"{numbered}"
+        "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:"
+        "\n\n" + numbered
     )
     try:
         resp = client.chat.completions.create(
@@ -175,30 +103,21 @@ def enhance_lines_with_llm(
                 {"role": "user", "content": user_msg},
             ],
         )
-        raw = resp.choices[0].message.content.strip()
-        # Parse lines that start with "N. "
+        raw = (resp.choices[0].message.content or "").strip()
         improved_map = {}
         for line in raw.splitlines():
-            line = line.strip()
-            if not line:
+            s = line.strip()
+            if not s or "." not in s:
                 continue
-            if "." in line:
-                num_part, rest = line.split(".", 1)
-                num_part = num_part.strip()
-                if num_part.isdigit():
-                    idx = int(num_part) - 1
-                    improved_map[idx] = rest.strip()
-
-        # Align back; fallback per-line cleanup if missing
-        out = []
-        for i, orig in enumerate(lines):
-            out.append(improved_map.get(i, basic_urdu_cleanup(orig)))
-        return out
+            num, rest = s.split(".", 1)
+            num = num.strip()
+            if num.isdigit():
+                improved_map[int(num) - 1] = rest.strip()
+        return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
     except Exception as e:
-        print(f"[WARN] LLM line enhance failed, falling back. Error: {e}")
+        print(f"[WARN] LLM line enhance failed: {e}")
         return [basic_urdu_cleanup(x) for x in lines]
 
-
 # ────────────────────────────────────────────────────────────────────────────────
 # Whisper (CT2) Model
 # ────────────────────────────────────────────────────────────────────────────────
@@ -214,19 +133,16 @@ print("Loading model... this may take a minute the first time.")
 model = faster_whisper.WhisperModel(
     MODEL_ID_CT2,
     device="cuda" if torch.cuda.is_available() else "cpu",
-    # 'auto' picks fastest viable type; you can force 'float16' on GPU, 'int8' on CPU, etc.
     compute_type="auto",
 )
 print("✅ Model loaded successfully!")
 
-
 # ────────────────────────────────────────────────────────────────────────────────
 # Core Transcription
 # ────────────────────────────────────────────────────────────────────────────────
 
 def transcribe_audio(
-    uploaded_path: Optional[str],
-    recorded_path: Optional[str],
+    audio_path: Optional[str],
     output_format: str,
     beam_size: int,
     llm_enhance: bool,
@@ -234,12 +150,10 @@ def transcribe_audio(
     llm_temperature: float,
     llm_system_prompt: str,
 ):
-    # pick the recording if present, else the uploaded file
-    audio_path = recorded_path or uploaded_path
     if not audio_path:
         raise gr.Error("Please upload or record an audio clip.")
 
-    segments_gen, info = model.transcribe(
+    seg_iter, info = model.transcribe(
         audio_path,
         language="ur",
         beam_size=int(beam_size),
@@ -247,24 +161,22 @@
         vad_filter=False,
     )
 
-    segments = []
-    raw_lines = []
-    for seg in segments_gen:
+    segments, raw_lines = [], []
+    for seg in seg_iter:
         text = (seg.text or "").strip()
         segments.append({"start": seg.start, "end": seg.end, "text": text})
         raw_lines.append(text)
 
-    # Cleanup/enhance
+    # Enhance / clean
     if llm_enhance:
-        # For 'text' we do a single-shot polish; for subtitles we do line-preserving polish
         if output_format == "text":
-            improved_full = enhance_text_with_llm(
+            cleaned_blob = enhance_text_with_llm(
                 " ".join(raw_lines),
                 api_key=llm_api_key,
                 temperature=llm_temperature,
                 system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
             )
-            cleaned_lines = [improved_full]
+            cleaned_lines = [cleaned_blob]
         else:
             cleaned_lines = enhance_lines_with_llm(
                 raw_lines,
@@ -273,23 +185,23 @@
                 system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
             )
     else:
-        if output_format == "text":
-            cleaned_lines = [basic_urdu_cleanup(" ".join(raw_lines))]
-        else:
-            cleaned_lines = [basic_urdu_cleanup(x) for x in raw_lines]
+        cleaned_lines = (
+            [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
+            else [basic_urdu_cleanup(x) for x in raw_lines]
+        )
 
-    # Render outputs
+    # Render
     if output_format == "text":
         return cleaned_lines[0]
 
     if output_format == "srt":
         lines = []
         for i, s in enumerate(segments, 1):
-            improved = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
+            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
             lines += [
                 str(i),
                 f"{format_timestamp(s['start'], 'srt')} --> {format_timestamp(s['end'], 'srt')}",
-                improved,
+                txt,
                 "",
             ]
         return "\n".join(lines)
@@ -297,16 +209,15 @@
     if output_format == "vtt":
         lines = ["WEBVTT", ""]
         for i, s in enumerate(segments, 1):
-            improved = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
+            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
             lines += [
                 f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
-                improved,
+                txt,
                 "",
             ]
         return "\n".join(lines)
 
     if output_format == "json":
-        # If we enhanced line-by-line, rewrite segments with improved texts
         segs_out = []
         for i, s in enumerate(segments):
             txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
@@ -326,64 +237,54 @@
 
     raise gr.Error(f"Unsupported format: {output_format}")
 
-
 # ────────────────────────────────────────────────────────────────────────────────
-# UI
+# UI (collapsible settings + unified audio)
 # ────────────────────────────────────────────────────────────────────────────────
 
 with gr.Blocks(title="Urdu Whisper Transcription") as iface:
     gr.Markdown("## Urdu Whisper Transcription (CT2) + Optional LLM Polishing (Groq)")
 
-    with gr.Row():
-        with gr.Column():
-            upload = gr.Audio(
-                sources=["upload"],  # file upload only
-                type="filepath",
-                label="Upload Audio File",
-            )
-            record = gr.Audio(
-                sources=["microphone"],  # microphone only
-                type="filepath",
-                label="Record Audio",
-            )
+    # One component for both Upload + Microphone
+    audio = gr.Audio(
+        sources=["upload", "microphone"],
+        type="filepath",
+        label="Upload or Record Audio",
+        waveform_options={"show_controls": True},  # keeps recording controls visible
+    )
 
-            with gr.Row():
-                fmt = gr.Radio(
-                    choices=["text", "srt", "vtt", "json"],
-                    value="text",
-                    label="Output Format",
-                )
-                beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
-
-            gr.Markdown("### LLM Polishing (optional)")
-            llm_toggle = gr.Checkbox(value=False, label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
-            with gr.Row():
-                llm_temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="LLM Temperature")
-                # Prefer env var; allow override here (kept local to session)
-                llm_key = gr.Textbox(
-                    label="GROQ_API_KEY (optional if set in environment)",
-                    type="password",
-                    value=""
-                )
-            llm_sys = gr.Textbox(
-                label="LLM System Prompt (Urdu)",
-                value=DEFAULT_SYSTEM_PROMPT_UR,
-                lines=3
-            )
-
-            btn = gr.Button("Transcribe", variant="primary")
-
-        with gr.Column():
-            out = gr.Textbox(
-                label="Result",
-                lines=22,
-                max_lines=30,
-                show_copy_button=True,
-            )
+    with gr.Accordion("Transcription Settings", open=False):
+        with gr.Row():
+            fmt = gr.Radio(
+                choices=["text", "srt", "vtt", "json"],
+                value="text",
+                label="Output Format",
+            )
+            beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
+
+    with gr.Accordion("LLM Polishing (Optional)", open=False):
+        llm_toggle = gr.Checkbox(
+            value=False,
+            label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)"
+        )
+        with gr.Row():
+            llm_temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="LLM Temperature")
+            llm_key = gr.Textbox(
+                label="GROQ_API_KEY (optional if set in environment)",
+                type="password",
+                value=""
+            )
+        llm_sys = gr.Textbox(
+            label="LLM System Prompt (Urdu)",
+            value=DEFAULT_SYSTEM_PROMPT_UR,
+            lines=3
+        )
+
+    btn = gr.Button("Transcribe", variant="primary")
+    out = gr.Textbox(label="Result", lines=22, max_lines=30, show_copy_button=True)
 
     btn.click(
         fn=transcribe_audio,
-        inputs=[upload, record, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
+        inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
        outputs=out,
         api_name="predict",
    )
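
Note on the endpoint: the click handler still exposes api_name="predict", but this commit changes the input signature — the separate upload and record components collapse into the single audio input, so remote callers now pass one audio file followed by the six option values, in the order of the inputs list above. A minimal gradio_client sketch of the new call (the Space ID below is a placeholder, not confirmed by this commit):

# Hypothetical client call; argument order mirrors
# inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys].
from gradio_client import Client, handle_file

client = Client("kingabzpro/urdu-whisper")  # placeholder Space ID

result = client.predict(
    handle_file("clip_urdu.wav"),  # audio: uploaded file or recording (filepath)
    "srt",                         # output_format: "text" | "srt" | "vtt" | "json"
    5,                             # beam_size
    False,                         # llm_enhance: off -> basic_urdu_cleanup only
    "",                            # GROQ_API_KEY override (empty -> environment)
    0.2,                           # llm_temperature (unused when polishing is off)
    "",                            # llm_system_prompt (empty -> DEFAULT_SYSTEM_PROMPT_UR)
    api_name="/predict",
)
print(result)  # e.g. SRT cues built with format_timestamp()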