Ko-TTS-Arena Contributors committed on
Commit
673d09e
·
1 Parent(s): b0bdfc9

feat: Enable Gemini TTS with service account JSON authentication

Browse files
Files changed (3) hide show
  1. models.py +10 -10
  2. requirements.txt +2 -1
  3. tts.py +41 -36
models.py CHANGED
@@ -566,7 +566,7 @@ def insert_initial_models():
566
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
567
  has_humelo = bool(os.getenv("HUMELO_API_KEY"))
568
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
569
- has_gemini_tts = bool(os.getenv("GEMINI_TTS_API_KEY"))
570
 
571
  tts_models = [
572
  # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -649,15 +649,15 @@ def insert_initial_models():
649
  is_active=has_typecast,
650
  model_url="https://typecast.ai/",
651
  ),
652
- # Gemini TTS (Google Cloud - 다국어 지원) - OAuth2 인증 필요, 현재 비활성화
653
- # Model(
654
- # id="gemini-tts-aoede",
655
- # name="Gemini TTS (Aoede)",
656
- # model_type=ModelType.TTS,
657
- # is_open=False,
658
- # is_active=has_gemini_tts,
659
- # model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
660
- # ),
661
  ]
662
 
663
  for model in tts_models:
 
566
  has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
567
  has_humelo = bool(os.getenv("HUMELO_API_KEY"))
568
  has_typecast = bool(os.getenv("TYPECAST_API_KEY"))
569
+ has_gemini_tts = bool(os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON"))
570
 
571
  tts_models = [
572
  # 채널톡 TTS (한국어 특화) - 항상 활성화
 
649
  is_active=has_typecast,
650
  model_url="https://typecast.ai/",
651
  ),
652
+ # Gemini TTS (Google Cloud - 다국어 지원) - 서비스 계정 JSON 필요
653
+ Model(
654
+ id="gemini-tts-aoede",
655
+ name="Gemini TTS (Aoede)",
656
+ model_type=ModelType.TTS,
657
+ is_open=False,
658
+ is_active=has_gemini_tts,
659
+ model_url="https://cloud.google.com/text-to-speech/docs/gemini-tts",
660
+ ),
661
  ]
662
 
663
  for model in tts_models:
requirements.txt CHANGED
@@ -14,4 +14,5 @@ huggingface-hub
14
  scipy
15
  numpy
16
  pydub
17
- typecast-python
 
 
14
  scipy
15
  numpy
16
  pydub
17
+ typecast-python
18
+ google-cloud-texttospeech
tts.py CHANGED
@@ -55,8 +55,20 @@ HUMELO_API_URL = "https://agitvxptajouhvoatxio.supabase.co/functions/v1/dive-syn
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
58
- # Gemini TTS (Google Cloud)
59
- GEMINI_TTS_API_KEY = os.getenv("GEMINI_TTS_API_KEY")
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def resample_wav_to_16khz(input_path: str) -> str:
62
  """
@@ -448,50 +460,43 @@ def predict_typecast_tts(text: str, voice_id: str = "tc_612ed01c7eb720fddd3ddedf
448
 
449
 
450
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
451
- """Gemini TTS API 호출 (REST API 방식)"""
452
- api_key = GEMINI_TTS_API_KEY
453
- if not api_key:
454
- raise ValueError("GEMINI_TTS_API_KEY 환경 변수가 설정되지 않았습니다.")
455
 
456
  try:
457
- # REST API 엔드포인트
458
- url = f"https://texttospeech.googleapis.com/v1beta1/text:synthesize?key={api_key}"
459
 
460
- payload = {
461
- "input": {
462
- "text": text,
463
- "prompt": "친절하고 자연스러운 톤으로 말해주세요"
464
- },
465
- "voice": {
466
- "languageCode": "ko-kr",
467
- "name": voice,
468
- "modelName": model
469
- },
470
- "audioConfig": {
471
- "audioEncoding": "LINEAR16",
472
- "sampleRateHertz": 24000
473
- }
474
- }
475
 
476
- response = requests.post(
477
- url,
478
- headers={"Content-Type": "application/json"},
479
- json=payload,
480
- timeout=60
481
  )
482
- response.raise_for_status()
483
 
484
- audio_content = response.json().get("audioContent")
485
- if not audio_content:
486
- raise ValueError("Gemini TTS API가 오디오를 반환하지 않았습니다.")
 
 
 
 
 
 
 
 
 
487
 
488
- audio_bytes = base64.b64decode(audio_content)
489
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
490
- f.write(audio_bytes)
491
  return f.name
492
 
493
- except requests.exceptions.RequestException as e:
494
- raise ValueError(f"Gemini TTS API 요청 오류: {str(e)}")
495
  except Exception as e:
496
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
497
 
 
55
  # Typecast TTS
56
  TYPECAST_API_KEY = os.getenv("TYPECAST_API_KEY")
57
 
58
# Gemini TTS (Google Cloud) - authenticated with a service-account JSON key.
# The key's JSON text is supplied through GOOGLE_APPLICATION_CREDENTIALS_JSON
# (None when unset).
GOOGLE_CREDENTIALS_JSON = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")

# Google's Application Default Credentials lookup reads a *file path* from
# GOOGLE_APPLICATION_CREDENTIALS, so the JSON text is written to disk and the
# path is exported for the client library to pick up.
if GOOGLE_CREDENTIALS_JSON:
    import json as _json

    _credentials_path = "/tmp/google_credentials.json"
    try:
        # Reject a malformed secret here instead of exporting a broken key file
        # (json.JSONDecodeError is a ValueError subclass).
        _json.loads(GOOGLE_CREDENTIALS_JSON)

        # The file holds a private key: create it owner-read/write only rather
        # than with the process umask (typically world-readable in /tmp).
        _fd = os.open(
            _credentials_path,
            os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
            0o600,
        )
        with os.fdopen(_fd, "w", encoding="utf-8") as _f:
            _f.write(GOOGLE_CREDENTIALS_JSON)
        # os.open() only applies the mode when it creates the file; tighten the
        # permissions of a file left over from an earlier run as well.
        os.chmod(_credentials_path, 0o600)

        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _credentials_path
        print("[Gemini TTS] Credentials loaded from environment variable")
    except (OSError, ValueError) as e:
        # Best effort: the app still starts; Gemini TTS calls will fail later
        # with an authentication error.
        print(f"[Gemini TTS] Error saving credentials: {e}")
72
 
73
  def resample_wav_to_16khz(input_path: str) -> str:
74
  """
 
460
 
461
 
462
  def predict_gemini_tts(text: str, voice: str = "Aoede", model: str = "gemini-2.5-flash-tts") -> str:
463
+ """Gemini TTS API 호출 (서비스 계정 인증)"""
464
+ if not GOOGLE_CREDENTIALS_JSON:
465
+ raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON 환경 변수가 설정되지 않았습니다.")
 
466
 
467
  try:
468
+ from google.api_core.client_options import ClientOptions
469
+ from google.cloud import texttospeech_v1beta1 as texttospeech
470
 
471
+ client = texttospeech.TextToSpeechClient(
472
+ client_options=ClientOptions(api_endpoint='texttospeech.googleapis.com')
473
+ )
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
+ voice_params = texttospeech.VoiceSelectionParams(
476
+ name=voice,
477
+ language_code='ko-kr',
478
+ model_name=model
 
479
  )
 
480
 
481
+ # Synthesize speech with natural prompt
482
+ response = client.synthesize_speech(
483
+ input=texttospeech.SynthesisInput(
484
+ text=text,
485
+ prompt='친절하고 자연스러운 톤으로 말해주세요'
486
+ ),
487
+ voice=voice_params,
488
+ audio_config=texttospeech.AudioConfig(
489
+ audio_encoding=texttospeech.AudioEncoding.LINEAR16,
490
+ sample_rate_hertz=24000
491
+ ),
492
+ )
493
 
 
494
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
495
+ f.write(response.audio_content)
496
  return f.name
497
 
498
+ except ImportError:
499
+ raise ValueError("google-cloud-texttospeech 패키지가 설치되지 않았습니다.")
500
  except Exception as e:
501
  raise ValueError(f"Gemini TTS API 오류: {str(e)}")
502