Ko-TTS-Arena Contributors committed on
Commit
c82ad44
·
1 Parent(s): 50a02f7

fix: Use REST API v1beta1 for Gemini TTS, disable google-wavenet/neural2

Browse files
Files changed (3) hide show
  1. models.py +3 -3
  2. requirements.txt +1 -2
  3. tts.py +29 -34
models.py CHANGED
@@ -596,13 +596,13 @@ def insert_initial_models():
596
  is_active=has_openai,
597
  model_url="https://platform.openai.com/docs/guides/text-to-speech",
598
  ),
599
- # Google Cloud TTS - API 있을 때만 활성화
600
  Model(
601
  id="google-wavenet",
602
  name="Google Wavenet (ko-KR)",
603
  model_type=ModelType.TTS,
604
  is_open=False,
605
- is_active=has_google,
606
  model_url="https://cloud.google.com/text-to-speech",
607
  ),
608
  Model(
@@ -610,7 +610,7 @@ def insert_initial_models():
610
  name="Google Neural2 (ko-KR)",
611
  model_type=ModelType.TTS,
612
  is_open=False,
613
- is_active=has_google,
614
  model_url="https://cloud.google.com/text-to-speech",
615
  ),
616
  # CLOVA TTS (네이버 클라우드 - 한국어 특화) - API 키 있을 때만 활성화
 
596
  is_active=has_openai,
597
  model_url="https://platform.openai.com/docs/guides/text-to-speech",
598
  ),
599
+ # Google Cloud TTS - 비활성화 (Gemini TTS 사용)
600
  Model(
601
  id="google-wavenet",
602
  name="Google Wavenet (ko-KR)",
603
  model_type=ModelType.TTS,
604
  is_open=False,
605
+ is_active=False, # Gemini TTS로 대체
606
  model_url="https://cloud.google.com/text-to-speech",
607
  ),
608
  Model(
 
610
  name="Google Neural2 (ko-KR)",
611
  model_type=ModelType.TTS,
612
  is_open=False,
613
+ is_active=False, # Gemini TTS로 대체
614
  model_url="https://cloud.google.com/text-to-speech",
615
  ),
616
  # CLOVA TTS (네이버 클라우드 - 한국어 특화) - API 키 있을 때만 활성화
requirements.txt CHANGED
@@ -14,5 +14,4 @@ huggingface-hub
14
  scipy
15
  numpy
16
  pydub
17
- typecast-python
18
- google-cloud-texttospeech
 
14
  scipy
15
  numpy
16
  pydub
17
+ typecast-python
 
tts.py CHANGED
@@ -55,12 +55,10 @@ HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-syn
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
58
- # Gemini TTS (Google Cloud) - API Key 방식
59
  GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
60
  if GEMINI_TTS_API_KEY:
61
- # 클라이언트 라이브러리가 GOOGLE_API_KEY 환경변수를 읽음
62
- os.environ["GOOGLE_API_KEY"] = GEMINI_TTS_API_KEY
63
- print("[Gemini TTS] API Key loaded and set as GOOGLE_API_KEY")
64
 
65
  def resample_wav_to_16khz(input_path: str) -> str:
66
  """
@@ -458,46 +456,43 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
458
 
459
 
460
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
461
- """Gemini TTS API 호출 (API Key 방식)"""
462
  if not GEMINI_TTS_API_KEY:
463
- raise ValueError(
464
- "GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다."
465
- )
466
 
467
  try:
468
- from google.api_core.client_options import ClientOptions
469
- from google.cloud import texttospeech_v1beta1 as texttospeech
470
 
471
- client = texttospeech.TextToSpeechClient(
472
- client_options=ClientOptions(api_endpoint="texttospeech.googleapis.com")
473
- )
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
- voice_params = texttospeech.VoiceSelectionParams(
476
- name=voice,
477
- language_code="ko-kr",
478
- model_name=model,
479
- )
480
 
481
- response = client.synthesize_speech(
482
- input=texttospeech.SynthesisInput(
483
- text=text,
484
- prompt="친절하고 자연스러운 톤으로 말해주세요",
485
- ),
486
- voice=voice_params,
487
- audio_config=texttospeech.AudioConfig(
488
- audio_encoding=texttospeech.AudioEncoding.LINEAR16,
489
- sample_rate_hertz=24000,
490
- ),
491
- )
492
 
 
493
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
494
- f.write(response.audio_content)
495
  return f.name
496
 
497
- except ImportError:
498
- raise ValueError(
499
- "google-cloud-texttospeech 패키지가 설치되지 않았습니다. requirements.txt를 확인하세요."
500
- )
501
  except Exception as e:
502
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
503
 
 
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
58
+ # Gemini TTS (Google Cloud) - REST API v1beta1 with API Key
59
  GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
60
  if GEMINI_TTS_API_KEY:
61
+ print("[Gemini TTS] API Key loaded")
 
 
62
 
63
  def resample_wav_to_16khz(input_path: str) -> str:
64
  """
 
456
 
457
 
458
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
459
+ """Gemini TTS API 호출 (REST API v1beta1 with API Key)"""
460
  if not GEMINI_TTS_API_KEY:
461
+ raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
 
 
462
 
463
  try:
464
+ url = f"https://texttospeech.googleapis.com/v1beta1/text:synthesize?key={GEMINI_TTS_API_KEY}"
 
465
 
466
+ payload = {
467
+ "input": {
468
+ "text": text,
469
+ "prompt": "친절하고 자연스러운 톤으로 말해주세요"
470
+ },
471
+ "voice": {
472
+ "languageCode": "ko-kr",
473
+ "name": voice,
474
+ "modelName": model
475
+ },
476
+ "audioConfig": {
477
+ "audioEncoding": "LINEAR16",
478
+ "sampleRateHertz": 24000
479
+ }
480
+ }
481
 
482
+ response = requests.post(url, json=payload, timeout=60)
483
+ response.raise_for_status()
 
 
 
484
 
485
+ audio_content = response.json().get("audioContent")
486
+ if not audio_content:
487
+ raise ValueError("Gemini TTS API가 오디오를 반환하지 않았습니다.")
 
 
 
 
 
 
 
 
488
 
489
+ audio_bytes = base64.b64decode(audio_content)
490
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
491
+ f.write(audio_bytes)
492
  return f.name
493
 
494
+ except requests.exceptions.RequestException as e:
495
+ raise ValueError(f"Gemini TTS API 요청 오류: {str(e)}")
 
 
496
  except Exception as e:
497
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
498