Akatuki25 committed
Commit a2327ae · 1 Parent(s): 22aac38

Add Git LFS config

Files changed (6)
  1. .gitattributes +1 -0
  2. .gitignore +38 -0
  3. Dockerfile +36 -0
  4. app.py +393 -0
  5. requirements.txt +20 -0
  6. seed-vc +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
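(This added line is exactly what `git lfs track "*.wav"` appends to .gitattributes, so committed WAV files such as source_original.wav are stored via Git LFS rather than directly in the repository history.)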
.gitignore ADDED
@@ -0,0 +1,38 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ venv/
+ env/
+
+ # Checkpoints and cache
+ checkpoints/
+ *.ckpt
+ *.pth
+ *.pt
+
+ # Audio files (except reference)
+ output*.wav
+ chunk_*.wav
+ temp_*.wav
+
+ # Logs
+ *.log
+ server.log
+
+ # macOS
+ .DS_Store
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Test files
+ test_*.py
+ verify_*.py
+ measure_*.py
+ *.sh
Dockerfile ADDED
@@ -0,0 +1,36 @@
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+ # Environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV PYTHONUNBUFFERED=1
+ ENV HF_HUB_CACHE=/app/checkpoints/hf_cache
+
+ # Working directory
+ WORKDIR /app
+
+ # Install system packages
+ RUN apt-get update && apt-get install -y \
+     python3.10 \
+     python3-pip \
+     git \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install Python packages
+ COPY requirements.txt .
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Copy application files
+ COPY app.py .
+ COPY source_original.wav .
+ COPY seed-vc/ ./seed-vc/
+
+ # Create the checkpoints directory
+ RUN mkdir -p /app/checkpoints/hf_cache
+
+ # Expose port (HF Spaces uses 7860)
+ EXPOSE 7860
+
+ # Launch the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,393 @@
+ """
+ Seed-VC Streaming API Server
+ Implemented based on architecture.md and model_ref.md
+ """
+ import io
+ import os
+ import sys
+ import time
+ import uuid
+ from typing import Optional, Dict
+ from argparse import Namespace
+
+ import numpy as np
+ import soundfile as sf
+ import librosa
+ import torch
+ import torchaudio
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+ from fastapi.responses import Response
+ from pydantic import BaseModel
+
+ # Seed-VC
+ sys.path.insert(0, 'seed-vc')
+ os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+
+ # Disable MPS and force CPU
+ import torch
+ torch.backends.mps.is_available = lambda: False
+
+ from inference import load_models
+
+ # =============================================================================
+ # Configuration (architecture.md Section 5)
+ # =============================================================================
+ DEFAULT_SAMPLE_RATE = 16000
+ DEFAULT_CHUNK_LEN_MS = 1000
+ DEFAULT_OVERLAP_MS = 200
+ SESSION_EXPIRE_SEC = 600
+
+ # model_ref.md Section 3.1
+ DEFAULT_REF_PRESET = "default_01"
+ REF_PRESETS = {
+     "default_01": "source_original.wav",
+ }
+
+ # =============================================================================
+ # Global Variables
+ # =============================================================================
+ # Avoid MPS (compatibility issues with seed-vc)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Seed-VC models (the return values of load_models() in inference.py)
+ model = None
+ semantic_fn = None
+ f0_fn = None
+ vocoder_fn = None
+ campplus_model = None
+ to_mel = None
+ mel_fn_args = None
+ model_sr = 22050
+
+ # =============================================================================
+ # Session State (architecture.md Section 4.1)
+ # =============================================================================
+ class SessionState:
+     def __init__(self, sample_rate: int, tgt_speaker_id: Optional[str] = None):
+         self.sample_rate = sample_rate
+         self.tgt_speaker_id = tgt_speaker_id
+         self.last_output_tail: Optional[np.ndarray] = None
+         # model_ref.md Section 3: reference audio management
+         self.ref_audio_tensor = None  # reference audio (model_sr, float tensor)
+         self.ref_mel = None
+         self.ref_semantic = None
+         self.style_embed = None
+         self.last_access_ts = time.time()
+         self.chunk_len_ms = DEFAULT_CHUNK_LEN_MS
+         self.overlap_ms = DEFAULT_OVERLAP_MS
+
+ SESSIONS: Dict[str, SessionState] = {}
+
+ # =============================================================================
+ # FastAPI App
+ # =============================================================================
+ app = FastAPI(title="Seed-VC Streaming API", version="1.0.0")
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load models (architecture.md Section 4.3.1)"""
+     global model, semantic_fn, f0_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args, model_sr
+
+     print(f"Device: {device}")
+     print("Loading Seed-VC models...")
+
+     # Use load_models from inference.py as-is
+     args = Namespace(
+         f0_condition=False,  # model_ref.md: use the 22050 Hz family
+         checkpoint=None,
+         config=None,
+         fp16=False
+     )
+
+     model, semantic_fn, f0_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args = load_models(args)
+     model_sr = mel_fn_args['sampling_rate']
+
+     print(f"Models loaded! SR={model_sr}")
+
+ # =============================================================================
+ # Pydantic Models (architecture.md Section 3.2)
+ # =============================================================================
+ class SessionCreateRequest(BaseModel):
+     sample_rate: int = DEFAULT_SAMPLE_RATE
+     tgt_speaker_id: Optional[str] = None
+     ref_preset_id: Optional[str] = None
+     use_uploaded_ref: bool = False
+     chunk_len_ms: int = DEFAULT_CHUNK_LEN_MS
+     overlap_ms: int = DEFAULT_OVERLAP_MS
+
+ class SessionCreateResponse(BaseModel):
+     session_id: str
+     sample_rate: int
+     chunk_len_ms: int
+     overlap_ms: int
+
+ class SessionEndRequest(BaseModel):
+     session_id: str
+
+ # =============================================================================
+ # Utility Functions
+ # =============================================================================
+ def load_wav_to_numpy(file_bytes: bytes, target_sr: int) -> tuple[np.ndarray, int]:
+     """Convert WAV bytes to an int16 numpy array at target_sr"""
+     audio, sr = sf.read(io.BytesIO(file_bytes))
+     if len(audio.shape) > 1:
+         audio = audio.mean(axis=1)
+     if sr != target_sr:
+         audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
+         sr = target_sr
+     if audio.dtype in (np.float32, np.float64):
+         audio = (audio * 32767).astype(np.int16)
+     return audio, sr
+
+ def numpy_to_wav_bytes(audio: np.ndarray, sr: int) -> bytes:
+     """Convert a numpy array to WAV bytes"""
+     buffer = io.BytesIO()
+     sf.write(buffer, audio, sr, format="WAV", subtype="PCM_16")
+     buffer.seek(0)
+     return buffer.read()
+
+ def crossfade(prev_tail: Optional[np.ndarray], new_chunk: np.ndarray, fade_len: int) -> np.ndarray:
+     """Crossfade (architecture.md Section 4.2.1)"""
+     if prev_tail is None:
+         return new_chunk
+
+     fade_len = min(fade_len, len(prev_tail), len(new_chunk))
+     if fade_len <= 0:
+         return new_chunk
+
+     fade_in = np.linspace(0.0, 1.0, fade_len, endpoint=True)
+     fade_out = 1.0 - fade_in
+
+     mixed_head = (prev_tail[-fade_len:] * fade_out + new_chunk[:fade_len] * fade_in).astype(np.int16)
+     tail = new_chunk[fade_len:]
+     return np.concatenate([mixed_head, tail])
+
+ def prepare_reference_audio(audio_path: str, state: SessionState):
+     """
+     Prepare the reference audio (model_ref.md Section 3)
+     Same logic as main() in inference.py
+     """
+     # Load the reference audio
+     ref_audio, file_sr = librosa.load(audio_path, sr=model_sr)
+     ref_audio = ref_audio[:model_sr * 25]  # cap at 25 seconds
+
+     # Convert to tensor
+     ref_audio_tensor = torch.tensor(ref_audio).unsqueeze(0).float().to(device)
+     state.ref_audio_tensor = ref_audio_tensor
+
+     # mel spectrogram
+     state.ref_mel = to_mel(ref_audio_tensor)
+
+     # Whisper semantic features
+     ref_waves_16k = torchaudio.functional.resample(ref_audio_tensor, model_sr, 16000)
+     state.ref_semantic = semantic_fn(ref_waves_16k)
+
+     # CAMPPlus style embedding
+     feat = torchaudio.compliance.kaldi.fbank(
+         ref_waves_16k,
+         num_mel_bins=80,
+         dither=0,
+         sample_frequency=16000
+     )
+     feat = feat - feat.mean(dim=0, keepdim=True)
+     state.style_embed = campplus_model(feat.unsqueeze(0))
+
+     print(f"Reference prepared: mel={state.ref_mel.shape}, semantic={state.ref_semantic.shape}")
+
+ def seed_vc_infer(chunk_np: np.ndarray, chunk_sr: int, state: SessionState) -> np.ndarray:
+     """
+     Voice conversion with Seed-VC (architecture.md Section 4.3.2)
+     Uses the main() logic from inference.py
+     """
+     # int16 -> float32
+     if chunk_np.dtype == np.int16:
+         source_audio = chunk_np.astype(np.float32) / 32768.0
+     else:
+         source_audio = chunk_np.astype(np.float32)
+
+     # Resample to model_sr
+     if chunk_sr != model_sr:
+         source_audio = librosa.resample(source_audio, orig_sr=chunk_sr, target_sr=model_sr)
+
+     # Convert to tensor
+     source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
+
+     # Resample to 16 kHz and extract Whisper features
+     converted_waves_16k = torchaudio.functional.resample(source_audio, model_sr, 16000)
+     S_alt = semantic_fn(converted_waves_16k)
+
+     # mel spectrogram
+     mel = to_mel(source_audio.to(device).float())
+
+     # target lengths
+     target_lengths = torch.LongTensor([mel.size(2)]).to(device)
+     target2_lengths = torch.LongTensor([state.ref_mel.size(2)]).to(device)
+
+     # length regulator (inference.py line 354-360)
+     with torch.no_grad():
+         cond, _, _, _, _ = model.length_regulator(
+             S_alt, ylens=target_lengths, n_quantizers=3, f0=None
+         )
+         prompt_condition, _, _, _, _ = model.length_regulator(
+             state.ref_semantic, ylens=target2_lengths, n_quantizers=3, f0=None
+         )
+
+     # Concatenate the conditions
+     cat_condition = torch.cat([prompt_condition, cond], dim=1)
+
+     # CFM inference (inference.py line 373-376)
+     with torch.no_grad():
+         vc_target = model.cfm.inference(
+             cat_condition,
+             torch.LongTensor([cat_condition.size(1)]).to(device),
+             state.ref_mel,
+             state.style_embed,
+             None,
+             10,  # diffusion_steps
+             inference_cfg_rate=0.7
+         )
+         # Drop the prompt portion
+         vc_target = vc_target[:, :, state.ref_mel.size(-1):]
+
+     # Vocoder (inference.py line 378)
+     with torch.no_grad():
+         vc_wave = vocoder_fn(vc_target.float()).squeeze()
+         vc_wave = vc_wave[None, :]
+
+     # Convert to numpy
+     output_wave = vc_wave[0].cpu().numpy()
+
+     # Back to int16
+     output_int16 = (output_wave * 32767).clip(-32768, 32767).astype(np.int16)
+
+     return output_int16
+
+ # =============================================================================
+ # Endpoints (architecture.md Section 3.2)
+ # =============================================================================
+ @app.get("/health")
+ async def health_check():
+     """3.2.1 GET /health"""
+     return {"status": "ok"}
+
+ @app.post("/session", response_model=SessionCreateResponse)
+ async def create_session(body: SessionCreateRequest):
+     """
+     3.2.2 POST /session
+     model_ref.md Section 2.2(A)
+     """
+     session_id = str(uuid.uuid4())
+
+     state = SessionState(
+         sample_rate=body.sample_rate,
+         tgt_speaker_id=body.tgt_speaker_id
+     )
+     state.chunk_len_ms = body.chunk_len_ms
+     state.overlap_ms = body.overlap_ms
+
+     # Set the reference audio (model_ref.md Section 3.2)
+     if not body.use_uploaded_ref:
+         preset_id = body.ref_preset_id or DEFAULT_REF_PRESET
+         wav_path = REF_PRESETS.get(preset_id, REF_PRESETS[DEFAULT_REF_PRESET])
+         prepare_reference_audio(wav_path, state)
+
+     SESSIONS[session_id] = state
+
+     return SessionCreateResponse(
+         session_id=session_id,
+         sample_rate=body.sample_rate,
+         chunk_len_ms=body.chunk_len_ms,
+         overlap_ms=body.overlap_ms,
+     )
+
+ @app.post("/session/ref")
+ async def upload_ref_audio(
+     session_id: str = Form(...),
+     ref_audio: UploadFile = File(...)
+ ):
+     """
+     model_ref.md Section 2.2(B)
+     """
+     if session_id not in SESSIONS:
+         raise HTTPException(status_code=400, detail="Invalid session_id")
+
+     state = SESSIONS[session_id]
+
+     # Save to a temporary file
+     import tempfile
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+         content = await ref_audio.read()
+         tmp.write(content)
+         tmp_path = tmp.name
+
+     try:
+         prepare_reference_audio(tmp_path, state)
+     finally:
+         os.unlink(tmp_path)
+
+     state.last_access_ts = time.time()
+     return {"status": "ok"}
+
+ @app.post("/chunk")
+ async def process_chunk(
+     session_id: str = Form(...),
+     chunk_id: int = Form(...),
+     audio: UploadFile = File(...)
+ ):
+     """
+     3.2.3 POST /chunk
+     Server-side processing flow (architecture.md Section 3.2.3)
+     """
+     if session_id not in SESSIONS:
+         raise HTTPException(status_code=400, detail="Invalid session_id")
+
+     state = SESSIONS[session_id]
+
+     if chunk_id < 0:
+         raise HTTPException(status_code=400, detail="chunk_id must be non-negative")
+
+     # Step 2: load the audio
+     audio_bytes = await audio.read()
+     chunk_np, chunk_sr = load_wav_to_numpy(audio_bytes, target_sr=state.sample_rate)
+
+     # Step 3: sample rate check
+     if chunk_sr != state.sample_rate:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Sample rate mismatch: expected {state.sample_rate}, got {chunk_sr}"
+         )
+
+     # Step 4: convert with Seed-VC
+     converted = seed_vc_infer(chunk_np, chunk_sr, state)
+
+     # Step 5: crossfade
+     fade_len = int(model_sr * state.overlap_ms / 1000)
+     output = crossfade(state.last_output_tail, converted, fade_len)
+
+     # Step 6: update the tail
+     if len(output) >= fade_len:
+         state.last_output_tail = output[-fade_len:].copy()
+     else:
+         state.last_output_tail = output.copy()
+
+     state.last_access_ts = time.time()
+
+     # Step 7: encode as WAV
+     wav_bytes = numpy_to_wav_bytes(output, model_sr)
+
+     return Response(
+         content=wav_bytes,
+         media_type="audio/wav",
+         headers={"X-Chunk-Id": str(chunk_id)}
+     )
+
+ @app.post("/end")
+ async def end_session(body: SessionEndRequest):
+     """3.2.4 POST /end"""
+     SESSIONS.pop(body.session_id, None)
+     return {"status": "ended"}
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
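(Taken together, the endpoints above define a simple chunked-streaming protocol: POST /session allocates state, POST /chunk is called repeatedly with sequentially numbered WAV chunks, and POST /end releases the session. Each /chunk response is the converted audio at model_sr, crossfaded with the previous chunk's tail; with the default model_sr of 22050 and overlap_ms of 200, fade_len works out to 22050 * 200 / 1000 = 4410 samples. A minimal client sketch follows, assuming the server from the Dockerfile CMD on localhost:7860, hypothetical chunk_0.wav/chunk_1.wav input files, and the requests package, which is not in requirements.txt.)

import requests  # assumed installed; not part of this commit's requirements.txt

BASE = "http://localhost:7860"  # assumed host/port from the Dockerfile CMD

# 1) Allocate a session (JSON body matches SessionCreateRequest)
sess = requests.post(f"{BASE}/session", json={
    "sample_rate": 16000,
    "chunk_len_ms": 1000,
    "overlap_ms": 200,
}).json()
session_id = sess["session_id"]

# 2) Stream chunks in order (multipart form, matching the Form/File parameters)
for i, path in enumerate(["chunk_0.wav", "chunk_1.wav"]):  # hypothetical input files
    with open(path, "rb") as f:
        resp = requests.post(
            f"{BASE}/chunk",
            data={"session_id": session_id, "chunk_id": i},
            files={"audio": (path, f, "audio/wav")},
        )
    resp.raise_for_status()
    # Each response body is a PCM_16 WAV at model_sr
    with open(f"converted_{i}.wav", "wb") as out:
        out.write(resp.content)

# 3) Release the server-side session state
requests.post(f"{BASE}/end", json={"session_id": session_id})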
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ fastapi==0.109.0
+ uvicorn[standard]==0.27.0
+ python-multipart==0.0.6
+ soundfile==0.12.1
+
+ # Seed-VC dependencies (from seed-vc/requirements.txt)
+ torch==2.4.0
+ torchaudio==2.4.0
+ scipy==1.13.1
+ librosa==0.10.2
+ huggingface-hub>=0.28.1
+ munch==4.0.0
+ einops==0.8.0
+ descript-audio-codec==1.0.0
+ transformers==4.46.3
+ numpy==1.26.4
+ hydra-core==1.3.2
+ pyyaml
+ python-dotenv
+ accelerate
seed-vc ADDED
@@ -0,0 +1 @@
+ Subproject commit 51383efd921027683c89e5348211d93ff12ac2a8