Spaces:

channelcorp
/

Ko-TTS-Arena

Sleeping

App Files Files Community

Ko-TTS-Arena Contributors committed 15 days ago

Commit

c82ad44

1 Parent(s): 50a02f7

fix: Use REST API v1beta1 for Gemini TTS, disable google-wavenet/neural2

Browse files

Files changed (3) hide show

models.py +3 -3
requirements.txt +1 -2
tts.py +29 -34

models.py CHANGED Viewed

@@ -596,13 +596,13 @@ def insert_initial_models():
             is_active=has_openai,
             model_url="https://platform.openai.com/docs/guides/text-to-speech",
         ),
-        # Google Cloud TTS - API 키 있을 때만 활성화
         Model(
             id="google-wavenet",
             name="Google Wavenet (ko-KR)",
             model_type=ModelType.TTS,
             is_open=False,
-            is_active=has_google,
             model_url="https://cloud.google.com/text-to-speech",
         ),
         Model(
@@ -610,7 +610,7 @@ def insert_initial_models():
             name="Google Neural2 (ko-KR)",
             model_type=ModelType.TTS,
             is_open=False,
-            is_active=has_google,
             model_url="https://cloud.google.com/text-to-speech",
         ),
         # CLOVA TTS (네이버 클라우드 - 한국어 특화) - API 키 있을 때만 활성화

             is_active=has_openai,
             model_url="https://platform.openai.com/docs/guides/text-to-speech",
         ),
+        # Google Cloud TTS - 비활성화 (Gemini TTS 사용)
         Model(
             id="google-wavenet",
             name="Google Wavenet (ko-KR)",
             model_type=ModelType.TTS,
             is_open=False,
+            is_active=False,  # Gemini TTS로 대체
             model_url="https://cloud.google.com/text-to-speech",
         ),
         Model(
             name="Google Neural2 (ko-KR)",
             model_type=ModelType.TTS,
             is_open=False,
+            is_active=False,  # Gemini TTS로 대체
             model_url="https://cloud.google.com/text-to-speech",
         ),
         # CLOVA TTS (네이버 클라우드 - 한국어 특화) - API 키 있을 때만 활성화

requirements.txt CHANGED Viewed

@@ -14,5 +14,4 @@ huggingface-hub
 scipy
 numpy
 pydub
-typecast-python
-google-cloud-texttospeech

 scipy
 numpy
 pydub
+typecast-python

tts.py CHANGED Viewed

@@ -55,12 +55,10 @@ HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-syn
 # Typecast TTS
 TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
-# Gemini TTS (Google Cloud) - API Key 방식
 GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
 if GEMINI_TTS_API_KEY:
-    # 클라이언트 라이브러리가 GOOGLE_API_KEY 환경변수를 읽음
-    os.environ["GOOGLE_API_KEY"] = GEMINI_TTS_API_KEY
-    print("[Gemini TTS] API Key loaded and set as GOOGLE_API_KEY")
 def resample_wav_to_16khz(input_path: str) -> str:
     """
@@ -458,46 +456,43 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
 def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
-    """Gemini TTS API 호출 (API Key 방식)"""
     if not GEMINI_TTS_API_KEY:
-        raise ValueError(
-            "GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다."
-        )
     try:
-        from google.api_core.client_options import ClientOptions
-        from google.cloud import texttospeech_v1beta1 as texttospeech
-        client = texttospeech.TextToSpeechClient(
-            client_options=ClientOptions(api_endpoint="texttospeech.googleapis.com")
-        )
-        voice_params = texttospeech.VoiceSelectionParams(
-            name=voice,
-            language_code="ko-kr",
-            model_name=model,
-        )
-        response = client.synthesize_speech(
-            input=texttospeech.SynthesisInput(
-                text=text,
-                prompt="친절하고 자연스러운 톤으로 말해주세요",
-            ),
-            voice=voice_params,
-            audio_config=texttospeech.AudioConfig(
-                audio_encoding=texttospeech.AudioEncoding.LINEAR16,
-                sample_rate_hertz=24000,
-            ),
-        )
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            f.write(response.audio_content)
             return f.name
-    except ImportError:
-        raise ValueError(
-            "google-cloud-texttospeech 패키지가 설치되지 않았습니다. requirements.txt를 확인하세요."
-        )
     except Exception as e:
         raise ValueError(f"Gemini TTS API 오류: {str(e)}")

 # Typecast TTS
 TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
+# Gemini TTS (Google Cloud) - REST API v1beta1 with API Key
 GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
 if GEMINI_TTS_API_KEY:
+    print("[Gemini TTS] API Key loaded")
 def resample_wav_to_16khz(input_path: str) -> str:
     """
 def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
+    """Gemini TTS API 호출 (REST API v1beta1 with API Key)"""
     if not GEMINI_TTS_API_KEY:
+        raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
     try:
+        url = f"https://texttospeech.googleapis.com/v1beta1/text:synthesize?key={GEMINI_TTS_API_KEY}"
+        payload = {
+            "input": {
+                "text": text,
+                "prompt": "친절하고 자연스러운 톤으로 말해주세요"
+            },
+            "voice": {
+                "languageCode": "ko-kr",
+                "name": voice,
+                "modelName": model
+            },
+            "audioConfig": {
+                "audioEncoding": "LINEAR16",
+                "sampleRateHertz": 24000
+            }
+        }
+        response = requests.post(url, json=payload, timeout=60)
+        response.raise_for_status()
+        audio_content = response.json().get("audioContent")
+        if not audio_content:
+            raise ValueError("Gemini TTS API가 오디오를 반환하지 않았습니다.")
+        audio_bytes = base64.b64decode(audio_content)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+            f.write(audio_bytes)
             return f.name
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f"Gemini TTS API 요청 오류: {str(e)}")
     except Exception as e:
         raise ValueError(f"Gemini TTS API 오류: {str(e)}")