blackhole1218 committed on
Commit
d88128c
·
1 Parent(s): 9cb156b

feat: Add CLOVA TTS, update OpenAI to gpt-4o-mini-tts, re-enable Supertone

Browse files

- Add CLOVA Voice (nara) with CLOVA_CLIENT_ID and CLOVA_API_KEY env vars
- Update OpenAI TTS to gpt-4o-mini-tts with Korean-optimized instructions
- Re-enable Supertone Sona model
- Remove old OpenAI tts-1 and tts-1-hd models

Files changed (2) hide show
  1. models.py +22 -20
  2. tts.py +70 -20
models.py CHANGED
@@ -571,6 +571,7 @@ def insert_initial_models():
571
  has_elevenlabs = bool(os.getenv("ELEVENLABS_API_KEY"))
572
  has_google = bool(os.getenv("GOOGLE_API_KEY"))
573
  has_supertone = bool(os.getenv("SUPERTONE_API_KEY"))
 
574
 
575
  tts_models = [
576
  # 채널톡 TTS (한국어 특화) - 항상 활성화
@@ -591,18 +592,10 @@ def insert_initial_models():
591
  is_active=has_elevenlabs,
592
  model_url="https://elevenlabs.io/",
593
  ),
594
- # OpenAI TTS - API 키 있을 때만 활성화
595
  Model(
596
- id="openai-tts-1",
597
- name="OpenAI TTS-1",
598
- model_type=ModelType.TTS,
599
- is_open=False,
600
- is_active=has_openai,
601
- model_url="https://platform.openai.com/docs/guides/text-to-speech",
602
- ),
603
- Model(
604
- id="openai-tts-1-hd",
605
- name="OpenAI TTS-1-HD",
606
  model_type=ModelType.TTS,
607
  is_open=False,
608
  is_active=has_openai,
@@ -625,15 +618,24 @@ def insert_initial_models():
625
  is_active=has_google,
626
  model_url="https://cloud.google.com/text-to-speech",
627
  ),
628
- # Supertone TTS (한국어 특화) - 임시 비활성화 (크레딧 부족)
629
- # Model(
630
- # id="supertone-sona",
631
- # name="Supertone Sona",
632
- # model_type=ModelType.TTS,
633
- # is_open=False,
634
- # is_active=has_supertone,
635
- # model_url="https://supertone.ai/",
636
- # ),
 
 
 
 
 
 
 
 
 
637
  ]
638
 
639
  for model in tts_models:
 
571
  has_elevenlabs = bool(os.getenv("ELEVENLABS_API_KEY"))
572
  has_google = bool(os.getenv("GOOGLE_API_KEY"))
573
  has_supertone = bool(os.getenv("SUPERTONE_API_KEY"))
574
+ has_clova = bool(os.getenv("CLOVA_CLIENT_ID") and os.getenv("CLOVA_API_KEY"))
575
 
576
  tts_models = [
577
  # 채널톡 TTS (한국어 특화) - 항상 활성화
 
592
  is_active=has_elevenlabs,
593
  model_url="https://elevenlabs.io/",
594
  ),
595
+ # OpenAI TTS (gpt-4o-mini-tts) - API 키 있을 때만 활성화
596
  Model(
597
+ id="openai-gpt-4o-mini-tts",
598
+ name="OpenAI GPT-4o Mini TTS",
 
 
 
 
 
 
 
 
599
  model_type=ModelType.TTS,
600
  is_open=False,
601
  is_active=has_openai,
 
618
  is_active=has_google,
619
  model_url="https://cloud.google.com/text-to-speech",
620
  ),
621
+ # CLOVA TTS (네이버 클라우드 - 한국어 특화) - API 있을 때만 활성화
622
+ Model(
623
+ id="clova-nara",
624
+ name="CLOVA Voice (나라)",
625
+ model_type=ModelType.TTS,
626
+ is_open=False,
627
+ is_active=has_clova,
628
+ model_url="https://clova.ai/",
629
+ ),
630
+ # Supertone TTS (한국어 특화) - API 키 있을 때만 활성화
631
+ Model(
632
+ id="supertone-sona",
633
+ name="Supertone Sona",
634
+ model_type=ModelType.TTS,
635
+ is_open=False,
636
+ is_active=has_supertone,
637
+ model_url="https://supertone.ai/",
638
+ ),
639
  ]
640
 
641
  for model in tts_models:
tts.py CHANGED
@@ -4,6 +4,8 @@ import json
4
  import base64
5
  import tempfile
6
  import requests
 
 
7
  from dotenv import load_dotenv
8
 
9
  load_dotenv()
@@ -11,8 +13,10 @@ load_dotenv()
11
  # 한국어 지원 TTS 제공자 매핑
12
  # - 채널톡: 자체 API
13
  # - ElevenLabs: 직접 API
14
- # - OpenAI: API
15
  # - Google: API
 
 
16
 
17
  CHANNEL_TTS_URL = os.getenv(
18
  "CHANNEL_TTS_URL",
@@ -25,6 +29,10 @@ ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")
25
  SUPERTONE_API_KEY = os.getenv("SUPERTONE_API_KEY")
26
  SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01") # 기본 보이스
27
 
 
 
 
 
28
  model_mapping = {
29
  # 채널톡 TTS (한국어 특화)
30
  "channel-hana": {
@@ -36,16 +44,11 @@ model_mapping = {
36
  "provider": "elevenlabs",
37
  "model": "eleven_multilingual_v2",
38
  },
39
- # OpenAI TTS
40
- "openai-tts-1": {
41
- "provider": "openai",
42
- "model": "tts-1",
43
- "voice": "alloy",
44
- },
45
- "openai-tts-1-hd": {
46
  "provider": "openai",
47
- "model": "tts-1-hd",
48
- "voice": "alloy",
49
  },
50
  # Google Cloud TTS
51
  "google-wavenet": {
@@ -56,6 +59,11 @@ model_mapping = {
56
  "provider": "google",
57
  "voice": "ko-KR-Neural2-A",
58
  },
 
 
 
 
 
59
  # Supertone TTS (한국어 특화)
60
  "supertone-sona": {
61
  "provider": "supertone",
@@ -114,24 +122,37 @@ def predict_elevenlabs_tts(text: str, model: str = "eleven_multilingual_v2") ->
114
  return f.name
115
 
116
 
117
- def predict_openai_tts(text: str, model: str = "tts-1", voice: str = "alloy") -> str:
118
- """OpenAI TTS API 호출"""
119
  api_key = os.getenv("OPENAI_API_KEY")
120
  if not api_key:
121
  raise ValueError("OPENAI_API_KEY 환경 변수가 설정되지 않았습니다.")
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  response = requests.post(
124
  "https://api.openai.com/v1/audio/speech",
125
  headers={
126
  "Authorization": f"Bearer {api_key}",
127
  "Content-Type": "application/json",
128
  },
129
- json={
130
- "model": model,
131
- "input": text,
132
- "voice": voice,
133
- "response_format": "wav",
134
- },
135
  timeout=60,
136
  )
137
  response.raise_for_status()
@@ -141,6 +162,32 @@ def predict_openai_tts(text: str, model: str = "tts-1", voice: str = "alloy") ->
141
  return f.name
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def predict_supertone_tts(text: str, model: str = "sona_speech_1") -> str:
145
  """Supertone TTS API 호출"""
146
  api_key = SUPERTONE_API_KEY
@@ -234,8 +281,8 @@ def predict_tts(text: str, model: str) -> str:
234
  elif provider == "openai":
235
  return predict_openai_tts(
236
  text,
237
- config.get("model", "tts-1"),
238
- config.get("voice", "alloy"),
239
  )
240
 
241
  elif provider == "google":
@@ -247,6 +294,9 @@ def predict_tts(text: str, model: str) -> str:
247
  elif provider == "supertone":
248
  return predict_supertone_tts(text, config.get("model", "sona_speech_1"))
249
 
 
 
 
250
  else:
251
  raise ValueError(f"알 수 없는 provider: {provider}")
252
 
 
4
  import base64
5
  import tempfile
6
  import requests
7
+ import urllib.request
8
+ import urllib.parse
9
  from dotenv import load_dotenv
10
 
11
  load_dotenv()
 
13
  # 한국어 지원 TTS 제공자 매핑
14
  # - 채널톡: 자체 API
15
  # - ElevenLabs: 직접 API
16
+ # - OpenAI: API (gpt-4o-mini-tts)
17
  # - Google: API
18
+ # - CLOVA: 네이버 클라우드 API
19
+ # - Supertone: API
20
 
21
  CHANNEL_TTS_URL = os.getenv(
22
  "CHANNEL_TTS_URL",
 
29
  SUPERTONE_API_KEY = os.getenv("SUPERTONE_API_KEY")
30
  SUPERTONE_VOICE_ID = os.getenv("SUPERTONE_VOICE_ID", "91992bbd4758bdcf9c9b01") # 기본 보이스
31
 
32
+ # CLOVA TTS (네이버 클라우드)
33
+ CLOVA_CLIENT_ID = os.getenv("CLOVA_CLIENT_ID")
34
+ CLOVA_API_KEY = os.getenv("CLOVA_API_KEY")
35
+
36
  model_mapping = {
37
  # 채널톡 TTS (한국어 특화)
38
  "channel-hana": {
 
44
  "provider": "elevenlabs",
45
  "model": "eleven_multilingual_v2",
46
  },
47
+ # OpenAI TTS (gpt-4o-mini-tts)
48
+ "openai-gpt-4o-mini-tts": {
 
 
 
 
 
49
  "provider": "openai",
50
+ "model": "gpt-4o-mini-tts",
51
+ "voice": "coral",
52
  },
53
  # Google Cloud TTS
54
  "google-wavenet": {
 
59
  "provider": "google",
60
  "voice": "ko-KR-Neural2-A",
61
  },
62
+ # CLOVA TTS (네이버 클라우드 - 한국어 특화)
63
+ "clova-nara": {
64
+ "provider": "clova",
65
+ "speaker": "nara",
66
+ },
67
  # Supertone TTS (한국어 특화)
68
  "supertone-sona": {
69
  "provider": "supertone",
 
122
  return f.name
123
 
124
 
125
+ def predict_openai_tts(text: str, model: str = "gpt-4o-mini-tts", voice: str = "coral") -> str:
126
+ """OpenAI TTS API 호출 (gpt-4o-mini-tts 지원)"""
127
  api_key = os.getenv("OPENAI_API_KEY")
128
  if not api_key:
129
  raise ValueError("OPENAI_API_KEY 환경 변수가 설정되지 않았습니다.")
130
 
131
+ # gpt-4o-mini-tts용 instructions (한국어 TTS에 최적화)
132
+ instructions = """Voice: Natural and clear Korean voice, with appropriate intonation and rhythm.
133
+ Punctuation: Well-structured with natural pauses for clarity.
134
+ Delivery: Calm, professional, and easy to understand.
135
+ Phrasing: Clear pronunciation with proper Korean phonetics.
136
+ Tone: Friendly yet professional, suitable for various contexts."""
137
+
138
+ payload = {
139
+ "model": model,
140
+ "input": text,
141
+ "voice": voice,
142
+ "response_format": "wav",
143
+ }
144
+
145
+ # gpt-4o-mini-tts 모델은 instructions 지원
146
+ if model == "gpt-4o-mini-tts":
147
+ payload["instructions"] = instructions
148
+
149
  response = requests.post(
150
  "https://api.openai.com/v1/audio/speech",
151
  headers={
152
  "Authorization": f"Bearer {api_key}",
153
  "Content-Type": "application/json",
154
  },
155
+ json=payload,
 
 
 
 
 
156
  timeout=60,
157
  )
158
  response.raise_for_status()
 
162
  return f.name
163
 
164
 
165
+ def predict_clova_tts(text: str, speaker: str = "nara") -> str:
166
+ """네이버 클라우드 CLOVA TTS API 호출"""
167
+ client_id = CLOVA_CLIENT_ID
168
+ client_secret = CLOVA_API_KEY
169
+
170
+ if not client_id or not client_secret:
171
+ raise ValueError("CLOVA_CLIENT_ID 또는 CLOVA_API_KEY 환경 변수가 설정되지 않았습니다.")
172
+
173
+ enc_text = urllib.parse.quote(text)
174
+ data = f"speaker={speaker}&volume=0&speed=0&pitch=0&format=mp3&text={enc_text}"
175
+ url = "https://naveropenapi.apigw.ntruss.com/tts-premium/v1/tts"
176
+
177
+ request = urllib.request.Request(url)
178
+ request.add_header("X-NCP-APIGW-API-KEY-ID", client_id)
179
+ request.add_header("X-NCP-APIGW-API-KEY", client_secret)
180
+
181
+ response = urllib.request.urlopen(request, data=data.encode('utf-8'), timeout=60)
182
+
183
+ if response.getcode() != 200:
184
+ raise ValueError(f"CLOVA TTS API 오류: {response.getcode()}")
185
+
186
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
187
+ f.write(response.read())
188
+ return f.name
189
+
190
+
191
  def predict_supertone_tts(text: str, model: str = "sona_speech_1") -> str:
192
  """Supertone TTS API 호출"""
193
  api_key = SUPERTONE_API_KEY
 
281
  elif provider == "openai":
282
  return predict_openai_tts(
283
  text,
284
+ config.get("model", "gpt-4o-mini-tts"),
285
+ config.get("voice", "coral"),
286
  )
287
 
288
  elif provider == "google":
 
294
  elif provider == "supertone":
295
  return predict_supertone_tts(text, config.get("model", "sona_speech_1"))
296
 
297
+ elif provider == "clova":
298
+ return predict_clova_tts(text, config.get("speaker", "nara"))
299
+
300
  else:
301
  raise ValueError(f"알 수 없는 provider: {provider}")
302