import os, json, tempfile, subprocess, shutil, time, uuid from pathlib import Path from typing import Optional, Tuple, List import gradio as gr import spaces from huggingface_hub import snapshot_download # ========= Paths & Repo ========= ROOT = Path(__file__).parent.resolve() REPO_DIR = ROOT / "HunyuanVideo-Foley" WEIGHTS_DIR = ROOT / "weights" CACHE_DIR = ROOT / "cache" OUT_DIR = ROOT / "outputs" ASSETS = ROOT / "assets" ASSETS.mkdir(exist_ok=True) BILS_BRAND = os.environ.get("BILS_BRAND", "Bilsimaging · Foley Studio") PRIMARY_COLOR = os.environ.get("PRIMARY_COLOR", "#6B5BFF") # purple-ish MAX_SECS = int(os.environ.get("MAX_SECS", "22")) # ZeroGPU-friendly TARGET_H = int(os.environ.get("TARGET_H", "480")) # downscale target height SR = int(os.environ.get("TARGET_SR", "48000")) # target audio sample rate def sh(cmd: str): print(">>", cmd) subprocess.run(cmd, shell=True, check=True) def ffprobe_duration(path: str) -> float: try: out = subprocess.check_output([ "ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", path ]).decode().strip() return float(out) except Exception: return 0.0 def prepare_once(): """Clone repo + download weights on cold start.""" REPO_DIR.exists() or sh("git clone https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley.git") WEIGHTS_DIR.mkdir(parents=True, exist_ok=True) snapshot_download( repo_id="tencent/HunyuanVideo-Foley", local_dir=str(WEIGHTS_DIR), local_dir_use_symlinks=False, repo_type="model", ) os.environ["HIFI_FOLEY_MODEL_PATH"] = str(WEIGHTS_DIR) CACHE_DIR.mkdir(exist_ok=True) OUT_DIR.mkdir(exist_ok=True) prepare_once() # ========= Preprocessing ========= def preprocess_video(in_path: str) -> Tuple[str, float]: """ - Validates duration (<= MAX_SECS). If longer, auto-trims to MAX_SECS. - Downscales to TARGET_H height (keeping AR), H.264 baseline, AAC passthrough. - Returns path to processed mp4 and final duration. """ dur = ffprobe_duration(in_path) temp_dir = Path(tempfile.mkdtemp(prefix="pre_")) trimmed = temp_dir / "trim.mp4" processed = temp_dir / "proc.mp4" # If longer than budget, trim to MAX_SECS (from start). if dur == 0: raise RuntimeError("Unable to read the video duration.") trim_filter = [] if dur > MAX_SECS: trim_filter = ["-t", str(MAX_SECS)] # First, ensure we have a small, uniform container (mp4) sh(" ".join([ "ffmpeg", "-y", "-i", f"\"{in_path}\"", *trim_filter, "-an", # remove original audio (we're generating new foley) "-vcodec", "libx264", "-preset", "veryfast", "-crf", "23", "-movflags", "+faststart", f"\"{trimmed}\"" ])) # Downscale to TARGET_H keeping AR; re-encode efficiently # Use mod2 dimensions for compatibility vf = f"scale=-2:{TARGET_H}:flags=bicubic" sh(" ".join([ "ffmpeg", "-y", "-i", f"\"{trimmed}\"", "-vf", f"\"{vf}\"", "-an", "-vcodec", "libx264", "-profile:v", "baseline", "-level", "3.1", "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "24", "-movflags", "+faststart", f"\"{processed}\"" ])) final_dur = min(dur, float(MAX_SECS)) return str(processed), final_dur # ========= Inference (ZeroGPU) ========= @spaces.GPU(duration=240) # ~4 minutes per call window def run_model(video_path: str, prompt_text: str) -> str: """ Run Tencent's infer.py on ZeroGPU. Returns path to WAV. """ job_id = uuid.uuid4().hex[:8] work_out = OUT_DIR / f"job_{job_id}" work_out.mkdir(parents=True, exist_ok=True) cmd = [ "python", f"{REPO_DIR}/infer.py", "--model_path", str(WEIGHTS_DIR), "--config_path", f"{REPO_DIR}/configs/hunyuanvideo-foley-xxl.yaml", "--single_video", video_path, "--single_prompt", json.dumps(prompt_text or ""), "--output_dir", str(work_out), "--device", "cuda" ] sh(" ".join(cmd)) # Find produced wav wav = None for p in work_out.rglob("*.wav"): wav = p break if not wav: raise RuntimeError("No audio produced by the model.") # Normalize / resample to SR (safeguard) fixed = work_out / "foley_48k.wav" sh(" ".join([ "ffmpeg", "-y", "-i", f"\"{str(wav)}\"", "-ar", str(SR), "-ac", "2", f"\"{str(fixed)}\"" ])) return str(fixed) # ========= Post: optional mux back to the video ========= def mux_audio_with_video(video_path: str, audio_path: str) -> str: out_path = Path(tempfile.mkdtemp(prefix="mux_")) / "with_foley.mp4" # Copy video, add foley audio as AAC sh(" ".join([ "ffmpeg", "-y", "-i", f"\"{video_path}\"", "-i", f"\"{audio_path}\"", "-map", "0:v:0", "-map", "1:a:0", "-c:v", "copy", "-c:a", "aac", "-b:a", "192k", "-shortest", f"\"{out_path}\"" ])) return str(out_path) # ========= Gradio UI Logic ========= def single_generate(video: str, prompt: str, want_mux: bool, project_name: str) -> Tuple[Optional[str], Optional[str], str, list]: """ Returns: (wav_path, muxed_video_path_or_None, status_markdown, history_list) """ history = [] try: if not video: return None, None, "⚠️ Please upload a video.", history # Preprocess history.append(["Preprocess", "Downscaling / trimming…"]) pre_path, final_dur = preprocess_video(video) # Run model (ZeroGPU) history.append(["Inference", "Generating foley on GPU…"]) wav = run_model(pre_path, prompt or "") # Optional Mux muxed = None if want_mux: history.append(["Mux", "Combining foley with video…"]) muxed = mux_audio_with_video(pre_path, wav) history.append(["Done", f"OK · Duration ~{final_dur:.1f}s"]) return wav, muxed, f"✅ Finished (≈ {final_dur:.1f}s)", history except Exception as e: history.append(["Error", str(e)]) return None, None, f"❌ {type(e).__name__}: {e}", history def batch_lite_generate(files: List[str], prompt: str, want_mux: bool) -> Tuple[str, list]: """ Run a tiny queue sequentially; ZeroGPU handles each call in series. We enforce 3 items max to stay quota-friendly. """ log = [] if not files: return "⚠️ Please upload 1–3 videos.", log if len(files) > 3: files = files[:3] log.append(["Info", "Limiting to first 3 videos."]) outputs = [] for i, f in enumerate(files, 1): try: log.append([f"Preprocess {i}", Path(f).name]) pre, final_dur = preprocess_video(f) log.append([f"Run {i}", f"GPU infer ~{final_dur:.1f}s"]) wav = run_model(pre, prompt or "") muxed = mux_audio_with_video(pre, wav) if want_mux else None outputs.append((wav, muxed)) log.append([f"Done {i}", "OK"]) except Exception as e: log.append([f"Error {i}", str(e)]) # Write a small manifest to outputs manifest = OUT_DIR / f"batchlite_{uuid.uuid4().hex[:6]}.json" manifest.write_text(json.dumps( [{"wav": w, "video": v} for (w, v) in outputs], ensure_ascii=False, indent=2 )) return f"✅ Batch-lite finished · items: {len(outputs)}", log # ========= UI ========= THEME_CSS = f""" :root {{ --brand: {PRIMARY_COLOR}; }} .gradio-container {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Cairo, Noto Sans, Arial, "Apple Color Emoji", "Segoe UI Emoji"; }} #brandbar {{ background: linear-gradient(90deg, var(--brand), #222); color: white; padding: 12px 16px; border-radius: 12px; }} #brandbar strong {{ letter-spacing: .3px; }} footer, #footer {{}} """ with gr.Blocks( css=THEME_CSS, title="Foley Studio · ZeroGPU" ) as demo: with gr.Row(): gr.HTML(f'