File size: 4,310 Bytes
b53f321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import os
import sys
import tempfile
import warnings

import certifi
import whisper
from gtts import gTTS
from pydub import AudioSegment
from transformers import MarianMTModel, MarianTokenizer

# Ensure proper SSL certificates are used for downloading models
# (model downloads from remote hubs can fail on hosts with a stale CA bundle).
os.environ["SSL_CERT_FILE"] = certifi.where()

# Suppress any unnecessary warnings
# NOTE(review): this silences ALL warnings process-wide; it keeps stdout clean
# for the JSON output protocol below, but may also hide useful deprecations.
warnings.filterwarnings("ignore")

# Function to transcribe audio using OpenAI's Whisper model
def transcribe_audio(input_path):
    """Transcribe the audio file at *input_path* and return the spoken text."""
    # The 'tiny' Whisper model trades accuracy for fast load and inference.
    whisper_model = whisper.load_model("tiny")
    transcription = whisper_model.transcribe(input_path)
    return transcription["text"]

# Supported target languages mapped to their MarianMT translation models.
_MODEL_BY_LANGUAGE = {
    "hi": "Helsinki-NLP/opus-mt-en-hi",
    "es": "Helsinki-NLP/opus-mt-en-es",
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "bn": "shhossain/opus-mt-en-to-bn",
}

# Function to translate English text into the specified target language
def translate_text(text, target_language, output_path=None):
    """Translate English *text* into *target_language*.

    Parameters:
        text: English source text to translate.
        target_language: Language code — one of 'hi', 'es', 'fr', 'bn'.
        output_path: Optional WAV file path; when given, speech for the
            translated text is also synthesized there.

    Returns:
        The translated text.

    Raises:
        ValueError: If *target_language* is not supported.
    """
    # Dict lookup replaces the previous if/elif chain; unknown codes still
    # surface as the same ValueError callers expect.
    try:
        model_name = _MODEL_BY_LANGUAGE[target_language]
    except KeyError:
        raise ValueError(f"Unsupported target language: {target_language}") from None

    # Load the tokenizer and model
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Prepare input and generate translation
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Delegate speech synthesis to the shared helper instead of duplicating
    # the gTTS/pydub conversion (and its temp-file handling) inline.
    if output_path:
        generate_audio(translated, target_language, output_path)

    return translated  # Return the translated text

# Function to generate speech audio for any given text
def generate_audio(text, target_language, output_path):
    """Synthesize *text* as speech and write it to *output_path* as a WAV file.

    Parameters:
        text: The text to speak.
        target_language: gTTS language code (e.g. 'hi', 'es', 'fr', 'bn').
        output_path: Destination path for the WAV output.
    """
    tts = gTTS(text, lang=target_language)  # Generate TTS using gTTS
    # Use a unique temporary file instead of a hard-coded "temp.mp3" so that
    # concurrent invocations cannot clobber each other's intermediate audio.
    fd, temp_mp3 = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS writes by path; we don't need the open descriptor
    try:
        tts.save(temp_mp3)  # Save as temporary MP3
        sound = AudioSegment.from_mp3(temp_mp3)  # Load MP3
        sound.export(output_path, format="wav")  # Export to WAV format
    finally:
        # Always remove the intermediate MP3, even if conversion fails.
        os.remove(temp_mp3)

# Command-line interface entry point
if __name__ == "__main__":
    args = sys.argv  # Get command-line arguments
    # Debug output goes to stderr so stdout stays a clean JSON channel.
    print("Received args:", args, file=sys.stderr)

    if len(args) < 3:
        # Not enough arguments provided
        print(json.dumps({"error": "Insufficient arguments"}))
        sys.exit(1)

    mode = args[1]  # Determine mode (transcribe, translate-text, synthesize-audio)

    # Handle transcription. Usage: transcribe <input_file>
    if mode == "transcribe":
        input_file = args[2]
        try:
            transcript = transcribe_audio(input_file)
            print(json.dumps({"transcription": transcript}))  # Output JSON
        except Exception as e:
            print(json.dumps({"error": str(e)}))

    # Handle translation. Usage: translate-text <text> <target_language>
    elif mode == "translate-text":
        try:
            text = args[2]
            target_language = args[3]  # IndexError on missing arg is caught below
            translated = translate_text(text, target_language, None)
            print(json.dumps({"translation": translated}))  # Output JSON
        except Exception as e:
            print(json.dumps({"translation": "", "error": str(e)}))

    # Handle audio synthesis. Usage: synthesize-audio <text> <output_path> <lang>
    elif mode == "synthesize-audio":
        try:
            text = args[2]
            output_path = args[3]
            target_language = args[4]

            # Delegate to the shared helper rather than re-implementing the
            # gTTS -> MP3 -> WAV pipeline inline (previous duplication).
            generate_audio(text, target_language, output_path)

            print(json.dumps({"audioPath": output_path}))  # Output JSON
        except Exception as e:
            print(json.dumps({"error": str(e)}))

    # Handle unsupported mode
    else:
        print(json.dumps({"error": "Unsupported mode"}))