kingabzpro committed
Commit 6f481b1 · verified · 1 Parent(s): 980c187

Update app.py

Files changed (1)
  1. app.py +80 -179
app.py CHANGED
@@ -1,9 +1,9 @@
-# app.py – Urdu Whisper (CT2) transcription demo with upload + record + optional LLM polishing via Groq
+# app.py – Urdu Whisper (CT2) transcription with unified audio input + collapsible settings
 
 import os
 import json
 from datetime import timedelta
-from typing import List, Tuple, Optional
+from typing import List, Optional
 
 import gradio as gr
 import torch
@@ -13,16 +13,15 @@ import faster_whisper
 # Config
 # ────────────────────────────────────────────────────────────────────────────────
 
-# (Optional) cache Hugging Face files in a persistent dir when running in Spaces
 os.environ.setdefault("HF_HOME", "/home/user/app/.cache")
 
 MODEL_ID_CT2 = "kingabzpro/whisper-large-v3-urdu-ct2"
-GROQ_MODEL = "openai/gpt-oss-120b"  # as requested
+GROQ_MODEL = "openai/gpt-oss-120b"
 DEFAULT_SYSTEM_PROMPT_UR = (
     "آپ ایک ماہر اردو زبان ایڈیٹر ہیں۔ دیے گئے متن کو بہتر اردو املا، "
     "رموزِ اوقاف، وقفوں اور قدرتی روانی کے ساتھ پیش کریں۔ "
     "بولنے والے کے انداز اور معنی کو برقرار رکھیں، مبالغہ نہ کریں، "
-    "انگریزی الفاظ اگر بول چال میں عمومی ہوں تو برقرار رہنے دیں۔"
+    "انگریزی کے عام بول چال کے الفاظ برقرار رہنے دیں۔"
 )
 
 # ────────────────────────────────────────────────────────────────────────────────
@@ -30,141 +29,70 @@ DEFAULT_SYSTEM_PROMPT_UR = (
 # ────────────────────────────────────────────────────────────────────────────────
 
 def format_timestamp(seconds: float, format_type: str = "srt") -> str:
-    """
-    Format seconds to SRT/VTT timestamp with millisecond precision.
-    """
-    if seconds is None:
-        seconds = 0.0
-    total_ms = int(round(seconds * 1000))
-    hours, rem_ms = divmod(total_ms, 3600_000)
+    total_ms = int(round((seconds or 0.0) * 1000))
+    hours, rem_ms = divmod(total_ms, 3_600_000)
     minutes, rem_ms = divmod(rem_ms, 60_000)
     sec, ms = divmod(rem_ms, 1000)
    sep = "," if format_type == "srt" else "."
     return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"
 
-
 def basic_urdu_cleanup(text: str) -> str:
-    """
-    Lightweight post-processing before/without LLM:
-    - normalize whitespace
-    - swap some Latin punctuation to Urdu-friendly alternatives where appropriate
-    - fix duplicated punctuation
-    """
     if not text:
         return text
-
-    # Whitespace normalize
     t = " ".join(text.split())
-
-    # Common punctuation mapping (keep English tech terms intact; minimal changes)
     replacements = {
-        " ,": ",",
-        " .": ".",
-        " ?": "?",
-        " !": "!",
-        " ،": "،",
-        " ۔": "۔",
-        ",": "،",  # prefer Arabic comma
-        ";": "؛",  # Arabic semicolon
-        ". . .": "…",
-        "...": "…",
+        " ,": ",", " .": ".", " ?": "?", " !": "!", " ،": "،", " ۔": "۔",
+        ",": "،", ";": "؛", ". . .": "…", "...": "…",
     }
     for a, b in replacements.items():
         t = t.replace(a, b)
-
-    # Remove spaces before Urdu punctuation
     t = t.replace(" ،", "،").replace(" ۔", "۔").replace(" ؛", "؛").replace(" ؟", "؟")
-    # Ensure a space after punctuation where natural
     for p in ["،", "؛", ",", ";"]:
         t = t.replace(p, p + " ")
-    t = " ".join(t.split())  # re-trim
-
-    return t.strip()
-
-
-# ────────────────────────────────────────────────────────────────────────────────
-# Groq LLM (OpenAI-compatible) client
-# ────────────────────────────────────────────────────────────────────────────────
+    return " ".join(t.split()).strip()
 
 def get_groq_client(api_key: Optional[str] = None):
-    """
-    Lazily import and initialize Groq client.
-    Priority: explicit api_key arg → env GROQ_API_KEY → None (disabled).
-    """
-    key = api_key or os.getenv("GROQ_API_KEY", "").strip()
+    key = (api_key or os.getenv("GROQ_API_KEY", "")).strip()
     if not key:
         return None
     try:
         from groq import Groq  # type: ignore
         return Groq(api_key=key)
     except Exception as e:
-        print(f"[WARN] Could not import/init Groq client: {e}")
+        print(f"[WARN] Groq client init failed: {e}")
         return None
 
-
-def enhance_text_with_llm(
-    text: str,
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> str:
-    """
-    Send full transcript to Groq for Urdu polishing.
-    Returns original text on error/fallback.
-    """
+def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
+                          system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
     client = get_groq_client(api_key)
     if not client:
         return basic_urdu_cleanup(text)
-
     try:
-        # OpenAI-style Chat Completions
         resp = client.chat.completions.create(
             model=GROQ_MODEL,
             temperature=float(temperature),
             messages=[
                 {"role": "system", "content": system_prompt},
-                {
-                    "role": "user",
-                    "content": (
-                        "براہِ کرم درج ذیل اردو متن کی زبان بہتر کریں، "
-                        "صرف بہتر متن واپس کریں، کوئی اضافی تبصرہ نہ کریں:\n\n"
-                        f"{text}"
-                    ),
-                },
+                {"role": "user", "content": "براہِ کرم اس متن کی اردو بہتر کریں اور صرف بہتر متن واپس کریں:\n\n" + text},
             ],
         )
-        improved = resp.choices[0].message.content.strip()
-        return improved or basic_urdu_cleanup(text)
+        return (resp.choices[0].message.content or "").strip() or basic_urdu_cleanup(text)
     except Exception as e:
-        print(f"[WARN] LLM full-text enhance failed, falling back. Error: {e}")
+        print(f"[WARN] LLM full-text enhance failed: {e}")
         return basic_urdu_cleanup(text)
 
-
-def enhance_lines_with_llm(
-    lines: List[str],
-    api_key: Optional[str],
-    temperature: float = 0.2,
-    system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR,
-) -> List[str]:
-    """
-    Batch-enhance multiple short lines (e.g., subtitle segments) while preserving order.
-    We enumerate lines and ask model to return same count with the same numbering.
-    Falls back to basic cleanup per line on failure.
-    """
+def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
                           system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
     if not lines:
         return lines
-
     client = get_groq_client(api_key)
     if not client:
         return [basic_urdu_cleanup(x) for x in lines]
 
-    # Build a numbered list
     numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
-
     user_msg = (
-        "درج ذیل جملوں کی اردو بہتر بنائیں۔ اسی ترتیب سے بالکل اتنی ہی سطور "
-        "واپس کریں، ہر سطر اسی نمبر کے ساتھ ہو۔ صرف بہتر جملے دیں، اضافی متن نہ دیں.\n\n"
-        f"{numbered}"
+        "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:"
+        "\n\n" + numbered
     )
     try:
         resp = client.chat.completions.create(
@@ -175,30 +103,21 @@ def enhance_lines_with_llm(
                 {"role": "user", "content": user_msg},
             ],
         )
-        raw = resp.choices[0].message.content.strip()
-        # Parse lines that start with "N. "
+        raw = (resp.choices[0].message.content or "").strip()
         improved_map = {}
         for line in raw.splitlines():
-            line = line.strip()
-            if not line:
+            s = line.strip()
+            if not s or "." not in s:
                 continue
-            if "." in line:
-                num_part, rest = line.split(".", 1)
-                num_part = num_part.strip()
-                if num_part.isdigit():
-                    idx = int(num_part) - 1
-                    improved_map[idx] = rest.strip()
-
-        # Align back; fallback per-line cleanup if missing
-        out = []
-        for i, orig in enumerate(lines):
-            out.append(improved_map.get(i, basic_urdu_cleanup(orig)))
-        return out
+            num, rest = s.split(".", 1)
+            num = num.strip()
+            if num.isdigit():
+                improved_map[int(num) - 1] = rest.strip()
+        return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
     except Exception as e:
-        print(f"[WARN] LLM line enhance failed, falling back. Error: {e}")
+        print(f"[WARN] LLM line enhance failed: {e}")
         return [basic_urdu_cleanup(x) for x in lines]
 
-
 # ────────────────────────────────────────────────────────────────────────────────
 # Whisper (CT2) Model
 # ────────────────────────────────────────────────────────────────────────────────
@@ -214,19 +133,16 @@ print("Loading model... this may take a minute the first time.")
 model = faster_whisper.WhisperModel(
     MODEL_ID_CT2,
     device="cuda" if torch.cuda.is_available() else "cpu",
-    # 'auto' picks fastest viable type; you can force 'float16' on GPU, 'int8' on CPU, etc.
     compute_type="auto",
 )
 print("✅ Model loaded successfully!")
 
-
 # ────────────────────────────────────────────────────────────────────────────────
 # Core Transcription
 # ────────────────────────────────────────────────────────────────────────────────
 
 def transcribe_audio(
-    uploaded_path: Optional[str],
-    recorded_path: Optional[str],
+    audio_path: Optional[str],
     output_format: str,
     beam_size: int,
     llm_enhance: bool,
@@ -234,12 +150,10 @@ def transcribe_audio(
     llm_temperature: float,
     llm_system_prompt: str,
 ):
-    # pick the recording if present, else the uploaded file
-    audio_path = recorded_path or uploaded_path
     if not audio_path:
         raise gr.Error("Please upload or record an audio clip.")
 
-    segments_gen, info = model.transcribe(
+    seg_iter, info = model.transcribe(
         audio_path,
         language="ur",
         beam_size=int(beam_size),
@@ -247,24 +161,22 @@
         vad_filter=False,
     )
 
-    segments = []
-    raw_lines = []
-    for seg in segments_gen:
+    segments, raw_lines = [], []
+    for seg in seg_iter:
         text = (seg.text or "").strip()
         segments.append({"start": seg.start, "end": seg.end, "text": text})
         raw_lines.append(text)
 
-    # Cleanup/enhance
+    # Enhance / clean
     if llm_enhance:
-        # For 'text' we do a single-shot polish; for subtitles we do line-preserving polish
         if output_format == "text":
-            improved_full = enhance_text_with_llm(
+            cleaned_blob = enhance_text_with_llm(
                 " ".join(raw_lines),
                 api_key=llm_api_key,
                 temperature=llm_temperature,
                 system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
             )
-            cleaned_lines = [improved_full]
+            cleaned_lines = [cleaned_blob]
         else:
             cleaned_lines = enhance_lines_with_llm(
                 raw_lines,
@@ -273,23 +185,23 @@
                 system_prompt=llm_system_prompt or DEFAULT_SYSTEM_PROMPT_UR,
             )
     else:
-        if output_format == "text":
-            cleaned_lines = [basic_urdu_cleanup(" ".join(raw_lines))]
-        else:
-            cleaned_lines = [basic_urdu_cleanup(x) for x in raw_lines]
+        cleaned_lines = (
+            [basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
+            else [basic_urdu_cleanup(x) for x in raw_lines]
+        )
 
-    # Render outputs
+    # Render
     if output_format == "text":
         return cleaned_lines[0]
 
     if output_format == "srt":
         lines = []
         for i, s in enumerate(segments, 1):
-            improved = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
+            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
             lines += [
                 str(i),
                 f"{format_timestamp(s['start'], 'srt')} --> {format_timestamp(s['end'], 'srt')}",
-                improved,
+                txt,
                 "",
             ]
         return "\n".join(lines)
@@ -297,16 +209,15 @@
     if output_format == "vtt":
         lines = ["WEBVTT", ""]
         for i, s in enumerate(segments, 1):
-            improved = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
+            txt = cleaned_lines[i - 1] if len(cleaned_lines) == len(segments) else s["text"]
             lines += [
                 f"{format_timestamp(s['start'], 'vtt')} --> {format_timestamp(s['end'], 'vtt')}",
-                improved,
+                txt,
                 "",
             ]
         return "\n".join(lines)
 
     if output_format == "json":
-        # If we enhanced line-by-line, rewrite segments with improved texts
         segs_out = []
         for i, s in enumerate(segments):
             txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
@@ -326,64 +237,54 @@
 
     raise gr.Error(f"Unsupported format: {output_format}")
 
-
 # ────────────────────────────────────────────────────────────────────────────────
-# UI
+# UI (collapsible settings + unified audio)
 # ────────────────────────────────────────────────────────────────────────────────
 
 with gr.Blocks(title="Urdu Whisper Transcription") as iface:
     gr.Markdown("## Urdu Whisper Transcription (CT2) + Optional LLM Polishing (Groq)")
 
-    with gr.Row():
-        with gr.Column():
-            upload = gr.Audio(
-                sources=["upload"],  # file upload only
-                type="filepath",
-                label="Upload Audio File",
-            )
-            record = gr.Audio(
-                sources=["microphone"],  # microphone only
-                type="filepath",
-                label="Record Audio",
-            )
+    # One component for both Upload + Microphone
+    audio = gr.Audio(
+        sources=["upload", "microphone"],
+        type="filepath",
+        label="Upload or Record Audio",
+        waveform_options={"show_controls": True},  # keeps recording controls visible
+    )
 
-            with gr.Row():
-                fmt = gr.Radio(
-                    choices=["text", "srt", "vtt", "json"],
-                    value="text",
-                    label="Output Format",
-                )
-                beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
-
-            gr.Markdown("### LLM Polishing (optional)")
-            llm_toggle = gr.Checkbox(value=False, label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
-            with gr.Row():
-                llm_temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="LLM Temperature")
-                # Prefer env var; allow override here (kept local to session)
-                llm_key = gr.Textbox(
-                    label="GROQ_API_KEY (optional if set in environment)",
-                    type="password",
-                    value=""
-                )
-            llm_sys = gr.Textbox(
-                label="LLM System Prompt (Urdu)",
-                value=DEFAULT_SYSTEM_PROMPT_UR,
-                lines=3
-            )
-
-            btn = gr.Button("Transcribe", variant="primary")
-
-        with gr.Column():
-            out = gr.Textbox(
-                label="Result",
-                lines=22,
-                max_lines=30,
-                show_copy_button=True,
-            )
+    with gr.Accordion("Transcription Settings", open=False):
+        with gr.Row():
+            fmt = gr.Radio(
+                choices=["text", "srt", "vtt", "json"],
+                value="text",
+                label="Output Format",
+            )
+            beam = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Beam Size")
+
+    with gr.Accordion("LLM Polishing (Optional)", open=False):
+        llm_toggle = gr.Checkbox(
+            value=False,
+            label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)"
+        )
+        with gr.Row():
+            llm_temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="LLM Temperature")
+            llm_key = gr.Textbox(
+                label="GROQ_API_KEY (optional if set in environment)",
+                type="password",
+                value=""
+            )
+        llm_sys = gr.Textbox(
+            label="LLM System Prompt (Urdu)",
+            value=DEFAULT_SYSTEM_PROMPT_UR,
+            lines=3
+        )
+
+    btn = gr.Button("Transcribe", variant="primary")
+    out = gr.Textbox(label="Result", lines=22, max_lines=30, show_copy_button=True)
 
     btn.click(
         fn=transcribe_audio,
-        inputs=[upload, record, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
+        inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys],
        outputs=out,
         api_name="predict",
    )
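
Note on the endpoint: the click handler still exposes api_name="predict", but this commit changes the input signature — the separate upload and record components collapse into the single audio input, so remote callers now pass one audio file followed by the six option values, in the order of the inputs list above. A minimal gradio_client sketch of the new call (the Space ID below is a placeholder, not confirmed by this commit):

# Hypothetical client call; argument order mirrors
# inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys].
from gradio_client import Client, handle_file

client = Client("kingabzpro/urdu-whisper")  # placeholder Space ID

result = client.predict(
    handle_file("clip_urdu.wav"),  # audio: uploaded file or recording (filepath)
    "srt",                         # output_format: "text" | "srt" | "vtt" | "json"
    5,                             # beam_size
    False,                         # llm_enhance: off -> basic_urdu_cleanup only
    "",                            # GROQ_API_KEY override (empty -> environment)
    0.2,                           # llm_temperature (unused when polishing is off)
    "",                            # llm_system_prompt (empty -> DEFAULT_SYSTEM_PROMPT_UR)
    api_name="/predict",
)
print(result)  # e.g. SRT cues built with format_timestamp()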