import os

import gradio as gr
import spaces
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# load example audio files (sorted so the order is deterministic and matches
# the example labels defined further down)
examples = []
examples_dir = "examples"
if os.path.exists(examples_dir):
    for filename in sorted(os.listdir(examples_dir)):
        if filename.endswith((".wav", ".mp3", ".ogg")):
            examples.append([os.path.join(examples_dir, filename)])
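
# Illustrative, hypothetical filenames: the example_labels passed to
# gr.Examples below assume that the sorted files line up with the labels,
# e.g. examples/01_kuwait_theatre.wav, examples/02_saudi_radio_poetry.mp3, ...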

# load model and processor
MODEL_PATH = "badrex/JASRv1.1"
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForCTC.from_pretrained(MODEL_PATH)

# move model to device and switch to inference mode
# (the processor is stateless, so it stays on the CPU)
model = model.to(device)
model.eval()

@spaces.GPU
def process_audio(audio_path):
    """Transcribe an audio file and return the decoded text.

    Args:
        audio_path: Path to the audio file to be transcribed.

    Returns:
        String containing the transcribed text, or an error message if no
        audio file was provided.
    """
    if not audio_path:
        return "Please upload an audio file."
    # load audio as a (channels, samples) tensor
    audio_array, sample_rate = torchaudio.load(audio_path)

    # downmix to mono if the recording is stereo
    if audio_array.shape[0] > 1:
        audio_array = audio_array.mean(dim=0, keepdim=True)

    # resample to the 16 kHz rate the model expects
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio_array = resampler(audio_array)

    inputs = processor(audio_array.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits

    # greedy CTC decoding: take the most likely token at each frame;
    # batch_decode collapses repeated tokens and removes blanks
    predicted_ids = torch.argmax(logits, dim=-1)
    decoded_outputs = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=False,
    )
    return decoded_outputs[0].strip()
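
# A minimal sketch of calling the transcription function directly, outside
# the Gradio UI; the path is hypothetical, any local audio file works:
#
#     text = process_audio("examples/sample.wav")
#     print(text)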

# define the Gradio interface
with gr.Blocks(title="JASR Demo") as demo:
    gr.Markdown("# JASR v1.1 🎙️ Speech Recognition for Dialectal Arabic ☕")
    gr.Markdown(
        'Developed with <span style="color:red;">❤</span> by <a href="https://badrex.github.io/">Badr al-Absi</a>'
    )
    gr.Markdown(
        """### Ya Hala 👋🏼

This is a demo for **JASR**, pronounced *Jāsir* (جاسِر) — a Transformer-based automatic speech recognition (ASR) system for dialectal Arabic.
The current running instance is optimized for the regional dialects of *Jazirat al-Arab* (the Arabian Peninsula).
JASR is still under active development."""
    )
    gr.Markdown("Simply **upload an audio file** 📤 or **record yourself speaking** 🎙️⏺️ to try out the model!")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            submit_btn = gr.Button("Transcribe Audio", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Text Transcription", lines=10, text_align="right", show_copy_button=True)

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=output_text,
    )
    # gr.Examples does not accept examples=None, so only build the panel when
    # example files were actually found; the labels assume the 18 bundled
    # files in sorted order
    if examples:
        gr.Examples(
            examples=examples,
            inputs=[audio_input],
            example_labels=[
                "Kuwait Theatre",
                "Saudi Radio Poetry",
                "News Report (MSA)",
                "San3ani Arabic male",
                "San3ani Arabic female",
                "Khaleeji Theatre",
                "TEDx KSA",
                "Yousif Saif Football Commentary",
                "Khaleeji Theatre 2",
                "TV Drama",
                "KSA Theatre",
                "TV Drama 2",
                "Radio Jeddah (KSA)",
                "Omani Theatre",
                "Khaleeji Drama",
                "Radio News",
                "TEDx KSA 2",
                "Radio Jeddah (KSA) 2",
            ],
            examples_per_page=18,
        )

# launch the app
if __name__ == "__main__":
    demo.queue().launch()