JusperLee committed on
Commit
0cd6025
0 Parent(s):

clean repo without raw binaries

.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,33 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ .DS_Store
12
+ dist
13
+ dist-ssr
14
+ coverage
15
+ *.local
16
+
17
+ /cypress/videos/
18
+ /cypress/screenshots/
19
+
20
+ # Editor directories and files
21
+ .vscode/*
22
+ !.vscode/extensions.json
23
+ .idea
24
+ *.suo
25
+ *.ntvs*
26
+ *.njsproj
27
+ *.sln
28
+ *.sw?
29
+ yarn.lock
30
+
31
+ tmp/*
32
+ .gradio
33
+ *.pyc
Inference.py ADDED
@@ -0,0 +1,635 @@
1
+ import warnings
2
+ warnings.filterwarnings("ignore")
3
+ import os
4
+ import argparse
5
+ import face_alignment
6
+ import torch
7
+ import torchaudio
8
+ import numpy as np
9
+ import cv2
10
+ from PIL import Image, ImageDraw
11
+ from moviepy import *
12
+ from collections import deque
13
+ from skimage import transform as tf
14
+ import yaml
15
+
16
+ from look2hear.models import Dolphin
17
+ from look2hear.datas.transform import get_preprocessing_pipelines
18
+
19
+ from face_detection_utils import detect_faces
20
+
21
+ # -- Landmark interpolation:
22
+ def linear_interpolate(landmarks, start_idx, stop_idx):
23
+ start_landmarks = landmarks[start_idx]
24
+ stop_landmarks = landmarks[stop_idx]
25
+ delta = stop_landmarks - start_landmarks
26
+ for idx in range(1, stop_idx-start_idx):
27
+ landmarks[start_idx+idx] = start_landmarks + idx/float(stop_idx-start_idx) * delta
28
+ return landmarks
29
+
30
+ # -- Face Transformation
31
+ def warp_img(src, dst, img, std_size):
32
+ tform = tf.estimate_transform('similarity', src, dst) # find the transformation matrix
33
+ warped = tf.warp(img, inverse_map=tform.inverse, output_shape=std_size) # warp the frame image
34
+ warped = warped * 255 # note: output from warp is a double image (value range [0,1])
35
+ warped = warped.astype('uint8')
36
+ return warped, tform
37
+
38
+ def apply_transform(transform, img, std_size):
39
+ warped = tf.warp(img, inverse_map=transform.inverse, output_shape=std_size)
40
+ warped = warped * 255 # note: output from warp is a double image (value range [0,1])
41
+ warped = warped.astype('uint8')
42
+ return warped
43
+
44
+ # -- Crop
45
+ def cut_patch(img, landmarks, height, width, threshold=5):
46
+
47
+ center_x, center_y = np.mean(landmarks, axis=0)
48
+
49
+ if center_y - height < 0:
50
+ center_y = height
51
+ if center_y - height < 0 - threshold:
52
+ raise Exception('too much bias in height')
53
+ if center_x - width < 0:
54
+ center_x = width
55
+ if center_x - width < 0 - threshold:
56
+ raise Exception('too much bias in width')
57
+
58
+ if center_y + height > img.shape[0]:
59
+ center_y = img.shape[0] - height
60
+ if center_y + height > img.shape[0] + threshold:
61
+ raise Exception('too much bias in height')
62
+ if center_x + width > img.shape[1]:
63
+ center_x = img.shape[1] - width
64
+ if center_x + width > img.shape[1] + threshold:
65
+ raise Exception('too much bias in width')
66
+
67
+ cutted_img = np.copy(img[ int(round(center_y) - round(height)): int(round(center_y) + round(height)),
68
+ int(round(center_x) - round(width)): int(round(center_x) + round(width))])
69
+ return cutted_img
70
+
71
+ # -- RGB to GRAY
72
+ def convert_bgr2gray(data):
73
+ return np.stack([cv2.cvtColor(_, cv2.COLOR_BGR2GRAY) for _ in data], axis=0)
74
+
75
+
76
+ def save2npz(filename, data=None):
77
+ assert data is not None, "data is {}".format(data)
78
+ if not os.path.exists(os.path.dirname(filename)):
79
+ os.makedirs(os.path.dirname(filename))
80
+ np.savez_compressed(filename, data=data)
81
+
82
+ def read_video(filename):
83
+ """Read video frames using MoviePy for better compatibility"""
84
+ try:
85
+ video_clip = VideoFileClip(filename)
86
+ for frame in video_clip.iter_frames():
87
+ # Convert RGB to BGR to match cv2 format
88
+ frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
89
+ yield frame_bgr
90
+ video_clip.close()
91
+ except Exception as e:
92
+ print(f"Error reading video {filename}: {e}")
93
+ return
94
+
95
+ def face2head(boxes, scale=1.5):
96
+ new_boxes = []
97
+ for box in boxes:
98
+ width = box[2] - box[0]
99
+ height= box[3] - box[1]
100
+ width_center = (box[2] + box[0]) / 2
101
+ height_center = (box[3] + box[1]) / 2
102
+ square_width = int(max(width, height) * scale)
103
+ new_box = [width_center - square_width/2, height_center - square_width/2, width_center + square_width/2, height_center + square_width/2]
104
+ new_boxes.append(new_box)
105
+ return new_boxes
106
+
107
+ def bb_intersection_over_union(boxA, boxB):
108
+ # determine the (x, y)-coordinates of the intersection rectangle
109
+ xA = max(boxA[0], boxB[0])
110
+ yA = max(boxA[1], boxB[1])
111
+ xB = min(boxA[2], boxB[2])
112
+ yB = min(boxA[3], boxB[3])
113
+ # compute the area of intersection rectangle
114
+ interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
115
+ # compute the area of both the prediction and ground-truth
116
+ # rectangles
117
+ boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
118
+ boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
119
+ # compute the intersection over union by taking the intersection
120
+ # area and dividing it by the sum of prediction + ground-truth
121
+ # areas - the interesection area
122
+ iou = interArea / float(boxAArea + boxBArea - interArea)
123
+ # return the intersection over union value
124
+ return iou
125
+
126
+ def detectface(video_input_path, output_path, detect_every_N_frame, scalar_face_detection, number_of_speakers):
127
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
128
+ print('Running on device: {}'.format(device))
129
+ os.makedirs(os.path.join(output_path, 'faces'), exist_ok=True)
130
+ os.makedirs(os.path.join(output_path, 'landmark'), exist_ok=True)
131
+
132
+ landmarks_dic = {}
133
+ faces_dic = {}
134
+ boxes_dic = {}
135
+
136
+ for i in range(number_of_speakers):
137
+ landmarks_dic[i] = []
138
+ faces_dic[i] = []
139
+ boxes_dic[i] = []
140
+
141
+ video_clip = VideoFileClip(video_input_path)
142
+ print("Video statistics: ", video_clip.w, video_clip.h, (video_clip.w, video_clip.h), video_clip.fps)
143
+ frames = [Image.fromarray(frame) for frame in video_clip.iter_frames()]
144
+ print('Number of frames in video: ', len(frames))
145
+ video_clip.close()
146
+ fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False)
147
+
148
+ for i, frame in enumerate(frames):
149
+ print('\rTracking frame: {}'.format(i + 1), end='')
150
+
151
+ # Detect faces every N frames
152
+ if i % detect_every_N_frame == 0:
153
+ frame_array = np.array(frame)
154
+
155
+ detected_boxes, _ = detect_faces(
156
+ frame_array,
157
+ threshold=0.9,
158
+ allow_upscaling=False,
159
+ )
160
+
161
+ if detected_boxes is None or len(detected_boxes) == 0:
162
+ detected_boxes, _ = detect_faces(
163
+ frame_array,
164
+ threshold=0.7,
165
+ allow_upscaling=True,
166
+ )
167
+
168
+ if detected_boxes is not None and len(detected_boxes) > 0:
169
+ detected_boxes = detected_boxes[:number_of_speakers]
170
+ detected_boxes = face2head(detected_boxes, scalar_face_detection)
171
+ else:
172
+ detected_boxes = []
173
+
174
+ # Process the detection results
175
+ if i == 0:
176
+ # First frame - initialize tracking
177
+ if len(detected_boxes) < number_of_speakers:
178
+ raise ValueError(f"First frame must detect at least {number_of_speakers} faces, but only found {len(detected_boxes)}")
179
+
180
+ # Assign first detections to speakers in order
181
+ for j in range(number_of_speakers):
182
+ box = detected_boxes[j]
183
+ face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
184
+ preds = fa.get_landmarks(np.array(face))
185
+
186
+ if preds is None:
187
+ raise ValueError(f"Face landmarks not detected in initial frame for speaker {j}")
188
+
189
+ faces_dic[j].append(face)
190
+ landmarks_dic[j].append(preds)
191
+ boxes_dic[j].append(box)
192
+ else:
193
+ # For subsequent frames, match detected boxes to speakers
194
+ matched_speakers = set()
195
+ speaker_boxes = [None] * number_of_speakers
196
+
197
+ # Match each detected box to the most likely speaker
198
+ for box in detected_boxes:
199
+ iou_scores = []
200
+ for speaker_id in range(number_of_speakers):
201
+ if speaker_id in matched_speakers:
202
+ iou_scores.append(-1) # Already matched
203
+ else:
204
+ last_box = boxes_dic[speaker_id][-1]
205
+ iou_score = bb_intersection_over_union(box, last_box)
206
+ iou_scores.append(iou_score)
207
+
208
+ if max(iou_scores) > 0: # Valid match found
209
+ best_speaker = iou_scores.index(max(iou_scores))
210
+ speaker_boxes[best_speaker] = box
211
+ matched_speakers.add(best_speaker)
212
+
213
+ # Process each speaker
214
+ for speaker_id in range(number_of_speakers):
215
+ if speaker_boxes[speaker_id] is not None:
216
+ # Use detected box
217
+ box = speaker_boxes[speaker_id]
218
+ else:
219
+ # Use previous box for this speaker
220
+ box = boxes_dic[speaker_id][-1]
221
+
222
+ # Extract face and landmarks
223
+ face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
224
+ preds = fa.get_landmarks(np.array(face))
225
+
226
+ if preds is None:
227
+ # Use previous landmarks if detection fails
228
+ preds = landmarks_dic[speaker_id][-1]
229
+
230
+ faces_dic[speaker_id].append(face)
231
+ landmarks_dic[speaker_id].append(preds)
232
+ boxes_dic[speaker_id].append(box)
233
+
234
+ # Verify all speakers have same number of frames
235
+ frame_counts = [len(boxes_dic[s]) for s in range(number_of_speakers)]
236
+ print(f"\nFrame counts per speaker: {frame_counts}")
237
+ assert all(count == len(frames) for count in frame_counts), f"Inconsistent frame counts: {frame_counts}"
238
+
239
+ # Continue with saving videos and landmarks...
240
+ for s in range(number_of_speakers):
241
+ frames_tracked = []
242
+ for i, frame in enumerate(frames):
243
+ frame_draw = frame.copy()
244
+ draw = ImageDraw.Draw(frame_draw)
245
+ draw.rectangle(boxes_dic[s][i], outline=(255, 0, 0), width=6)
246
+ frames_tracked.append(frame_draw)
247
+
248
+ # Save tracked video
249
+ tracked_frames = [np.array(frame) for frame in frames_tracked]
250
+ if tracked_frames:
251
+ tracked_clip = ImageSequenceClip(tracked_frames, fps=25.0)
252
+ tracked_video_path = os.path.join(output_path, 'video_tracked' + str(s+1) + '.mp4')
253
+ tracked_clip.write_videofile(tracked_video_path, codec='libx264', audio=False, logger=None)
254
+ tracked_clip.close()
255
+
256
+ # Save landmarks
257
+ for i in range(number_of_speakers):
258
+ save2npz(os.path.join(output_path, 'landmark', 'speaker' + str(i+1)+'.npz'), data=landmarks_dic[i])
259
+
260
+ # Save face video
261
+ face_frames = [np.array(frame) for frame in faces_dic[i]]
262
+ if face_frames:
263
+ face_clip = ImageSequenceClip(face_frames, fps=25.0)
264
+ face_video_path = os.path.join(output_path, 'faces', 'speaker' + str(i+1) + '.mp4')
265
+ face_clip.write_videofile(face_video_path, codec='libx264', audio=False, logger=None)
266
+ face_clip.close()
267
+
268
+ # Output video path
269
+ parts = video_input_path.split('/')
270
+ video_name = parts[-1][:-4]
271
+ if not os.path.exists(os.path.join(output_path, 'filename_input')):
272
+ os.mkdir(os.path.join(output_path, 'filename_input'))
273
+ csvfile = open(os.path.join(output_path, 'filename_input', str(video_name) + '.csv'), 'w')
274
+ for i in range(number_of_speakers):
275
+ csvfile.write('speaker' + str(i+1)+ ',0\n')
276
+ csvfile.close()
277
+ return os.path.join(output_path, 'filename_input', str(video_name) + '.csv')
278
+
279
+
280
+ def crop_patch(mean_face_landmarks, video_pathname, landmarks, window_margin, start_idx, stop_idx, crop_height, crop_width, STD_SIZE=(256, 256)):
281
+
282
+ """Crop mouth patch
283
+ :param str video_pathname: pathname for the video
284
+ :param list landmarks: interpolated landmarks
285
+ """
286
+
287
+ stablePntsIDs = [33, 36, 39, 42, 45]
288
+
289
+ frame_idx = 0
290
+ frame_gen = read_video(video_pathname)
291
+ while True:
292
+ try:
293
+ frame = frame_gen.__next__() ## -- BGR
294
+ except StopIteration:
295
+ break
296
+ if frame_idx == 0:
297
+ q_frame, q_landmarks = deque(), deque()
298
+ sequence = []
299
+
300
+ q_landmarks.append(landmarks[frame_idx])
301
+ q_frame.append(frame)
302
+ if len(q_frame) == window_margin:
303
+ smoothed_landmarks = np.mean(q_landmarks, axis=0)
304
+ cur_landmarks = q_landmarks.popleft()
305
+ cur_frame = q_frame.popleft()
306
+ # -- affine transformation
307
+ trans_frame, trans = warp_img( smoothed_landmarks[stablePntsIDs, :],
308
+ mean_face_landmarks[stablePntsIDs, :],
309
+ cur_frame,
310
+ STD_SIZE)
311
+ trans_landmarks = trans(cur_landmarks)
312
+ # -- crop mouth patch
313
+ sequence.append( cut_patch( trans_frame,
314
+ trans_landmarks[start_idx:stop_idx],
315
+ crop_height//2,
316
+ crop_width//2,))
317
+ if frame_idx == len(landmarks)-1:
318
+ #deal with corner case with video too short
319
+ if len(landmarks) < window_margin:
320
+ smoothed_landmarks = np.mean(q_landmarks, axis=0)
321
+ cur_landmarks = q_landmarks.popleft()
322
+ cur_frame = q_frame.popleft()
323
+
324
+ # -- affine transformation
325
+ trans_frame, trans = warp_img(smoothed_landmarks[stablePntsIDs, :],
326
+ mean_face_landmarks[stablePntsIDs, :],
327
+ cur_frame,
328
+ STD_SIZE)
329
+ trans_landmarks = trans(cur_landmarks)
330
+ # -- crop mouth patch
331
+ sequence.append(cut_patch( trans_frame,
332
+ trans_landmarks[start_idx:stop_idx],
333
+ crop_height//2,
334
+ crop_width//2,))
335
+
336
+ while q_frame:
337
+ cur_frame = q_frame.popleft()
338
+ # -- transform frame
339
+ trans_frame = apply_transform( trans, cur_frame, STD_SIZE)
340
+ # -- transform landmarks
341
+ trans_landmarks = trans(q_landmarks.popleft())
342
+ # -- crop mouth patch
343
+ sequence.append( cut_patch( trans_frame,
344
+ trans_landmarks[start_idx:stop_idx],
345
+ crop_height//2,
346
+ crop_width//2,))
347
+ return np.array(sequence)
348
+ frame_idx += 1
349
+ return None
350
+
351
+ def landmarks_interpolate(landmarks):
352
+
353
+ """Interpolate landmarks
354
+ param list landmarks: landmarks detected in raw videos
355
+ """
356
+
357
+ valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
358
+ if not valid_frames_idx:
359
+ return None
360
+ for idx in range(1, len(valid_frames_idx)):
361
+ if valid_frames_idx[idx] - valid_frames_idx[idx-1] == 1:
362
+ continue
363
+ else:
364
+ landmarks = linear_interpolate(landmarks, valid_frames_idx[idx-1], valid_frames_idx[idx])
365
+ valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
366
+ # -- Corner case: keep frames at the beginning or at the end failed to be detected.
367
+ if valid_frames_idx:
368
+ landmarks[:valid_frames_idx[0]] = [landmarks[valid_frames_idx[0]]] * valid_frames_idx[0]
369
+ landmarks[valid_frames_idx[-1]:] = [landmarks[valid_frames_idx[-1]]] * (len(landmarks) - valid_frames_idx[-1])
370
+ valid_frames_idx = [idx for idx, _ in enumerate(landmarks) if _ is not None]
371
+ assert len(valid_frames_idx) == len(landmarks), "not every frame has landmark"
372
+ return landmarks
373
+
374
+ def crop_mouth(video_direc, landmark_direc, filename_path, save_direc, convert_gray=False, testset_only=False):
375
+ lines = open(filename_path).read().splitlines()
376
+ lines = list(filter(lambda x: 'test' in x, lines)) if testset_only else lines
377
+
378
+ for filename_idx, line in enumerate(lines):
379
+
380
+ filename, person_id = line.split(',')
381
+ print('idx: {} \tProcessing.\t{}'.format(filename_idx, filename))
382
+
383
+ video_pathname = os.path.join(video_direc, filename+'.mp4')
384
+ landmarks_pathname = os.path.join(landmark_direc, filename+'.npz')
385
+ dst_pathname = os.path.join( save_direc, filename+'.npz')
386
+
387
+ # if os.path.exists(dst_pathname):
388
+ # continue
389
+
390
+ multi_sub_landmarks = np.load(landmarks_pathname, allow_pickle=True)['data']
391
+ landmarks = [None] * len(multi_sub_landmarks)
392
+ for frame_idx in range(len(landmarks)):
393
+ try:
394
+ #landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]['facial_landmarks'] #original for LRW
395
+ landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)] #VOXCELEB2
396
+ except (IndexError, TypeError):
397
+ continue
398
+
399
+ # -- pre-process landmarks: interpolate frames not being detected.
400
+ preprocessed_landmarks = landmarks_interpolate(landmarks)
401
+ if not preprocessed_landmarks:
402
+ continue
403
+
404
+ # -- crop
405
+ mean_face_landmarks = np.load('assets/20words_mean_face.npy')
406
+ sequence = crop_patch(mean_face_landmarks, video_pathname, preprocessed_landmarks, 12, 48, 68, 96, 96)
407
+ assert sequence is not None, "cannot crop from {}.".format(filename)
408
+
409
+ # -- save
410
+ data = convert_bgr2gray(sequence) if convert_gray else sequence[...,::-1]
411
+ save2npz(dst_pathname, data=data)
412
+
413
+ def convert_video_fps(input_file, output_file, target_fps=25):
414
+ """Convert video to target FPS using moviepy"""
415
+ video = VideoFileClip(input_file)
416
+ video_fps = video.fps
417
+
418
+ if video_fps != target_fps:
419
+ video.write_videofile(
420
+ output_file,
421
+ fps=target_fps,
422
+ codec='libx264',
423
+ audio_codec='aac',
424
+ temp_audiofile='temp-audio.m4a',
425
+ remove_temp=True,
426
+ )
427
+ else:
428
+ # If already at target fps, just copy
429
+ import shutil
430
+ shutil.copy2(input_file, output_file)
431
+
432
+ video.close()
433
+ print(f'Video has been converted to {target_fps} fps and saved to {output_file}')
434
+
435
+ def extract_audio(video_file, audio_output_file, sample_rate=16000):
436
+ """Extract audio from video using moviepy"""
437
+ video = VideoFileClip(video_file)
438
+ audio = video.audio
439
+
440
+ # Save audio with specified sample rate
441
+ audio.write_audiofile(audio_output_file, fps=sample_rate, nbytes=2, codec='pcm_s16le')
442
+
443
+ video.close()
444
+ audio.close()
445
+
446
+ def merge_video_audio(video_file, audio_file, output_file):
447
+ """Merge video and audio using moviepy"""
448
+ video = VideoFileClip(video_file)
449
+ audio = AudioFileClip(audio_file)
450
+
451
+ # Attach audio (MoviePy v2 renamed set_audio to with_audio)
452
+ set_audio_fn = getattr(video, "set_audio", None)
453
+ if callable(set_audio_fn):
454
+ final_video = set_audio_fn(audio)
455
+ else:
456
+ with_audio_fn = getattr(video, "with_audio", None)
457
+ if not callable(with_audio_fn):
458
+ video.close()
459
+ audio.close()
460
+ raise AttributeError("VideoFileClip object lacks both set_audio and with_audio methods")
461
+ final_video = with_audio_fn(audio)
462
+
463
+ # Write the result
464
+ final_video.write_videofile(output_file, codec='libx264', audio_codec='aac', temp_audiofile='temp-audio.m4a', remove_temp=True)
465
+
466
+ # Clean up
467
+ video.close()
468
+ audio.close()
469
+ final_video.close()
470
+
471
+ def process_video(input_file, output_path, number_of_speakers=2,
472
+ detect_every_N_frame=8, scalar_face_detection=1.5,
473
+ config_path="checkpoints/vox2/conf.yml",
474
+ cuda_device=None):
475
+ """Main processing function for video speaker separation"""
476
+
477
+ # Set CUDA device if specified
478
+ if cuda_device is not None:
479
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
480
+
481
+ # Create output directory
482
+ os.makedirs(output_path, exist_ok=True)
483
+
484
+ # Convert video to 25fps
485
+ temp_25fps_file = os.path.join(output_path, 'temp_25fps.mp4')
486
+ convert_video_fps(input_file, temp_25fps_file, target_fps=25)
487
+
488
+ # Detect faces
489
+ filename_path = detectface(video_input_path=temp_25fps_file,
490
+ output_path=output_path,
491
+ detect_every_N_frame=detect_every_N_frame,
492
+ scalar_face_detection=scalar_face_detection,
493
+ number_of_speakers=number_of_speakers)
494
+
495
+ # Extract audio
496
+ audio_output = os.path.join(output_path, 'audio.wav')
497
+ extract_audio(temp_25fps_file, audio_output, sample_rate=16000)
498
+
499
+ # Crop mouth
500
+ crop_mouth(video_direc=os.path.join(output_path, "faces"),
501
+ landmark_direc=os.path.join(output_path, "landmark"),
502
+ filename_path=filename_path,
503
+ save_direc=os.path.join(output_path, "mouthroi"),
504
+ convert_gray=True,
505
+ testset_only=False)
506
+
507
+ # Load model
508
+ audiomodel = Dolphin.from_pretrained("JusperLee/Dolphin")
509
+
510
+ audiomodel.cuda()
511
+ audiomodel.eval()
512
+
513
+ # Process each speaker
514
+ with torch.no_grad():
515
+ for i in range(number_of_speakers):
516
+ mouth_roi = np.load(os.path.join(output_path, "mouthroi", f"speaker{i+1}.npz"))["data"]
517
+ mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)
518
+
519
+ mix, sr = torchaudio.load(audio_output)
520
+ mix = mix.cuda().mean(dim=0)
521
+
522
+ window_size = 4 * sr
523
+ hop_size = 4 * sr
524
+
525
+ all_estimates = []
526
+
527
+ # Sliding-window processing: 4 s audio chunks aligned with the 25 fps mouth-ROI frames
528
+ start_idx = 0
529
+ while start_idx < len(mix):
530
+ end_idx = min(start_idx + window_size, len(mix))
531
+ window_mix = mix[start_idx:end_idx]
532
+
533
+ start_frame = int(start_idx / sr * 25)
534
+ end_frame = int(end_idx / sr * 25)
535
+ end_frame = min(end_frame, len(mouth_roi))
536
+ window_mouth_roi = mouth_roi[start_frame:end_frame]
537
+
538
+ est_sources = audiomodel(window_mix[None],
539
+ torch.from_numpy(window_mouth_roi[None, None]).float().cuda())
540
+
541
+ all_estimates.append({
542
+ 'start': start_idx,
543
+ 'end': end_idx,
544
+ 'estimate': est_sources[0].cpu()
545
+ })
546
+
547
+ start_idx += hop_size
548
+
549
+ if start_idx >= len(mix):
550
+ break
551
+
552
+ output_length = len(mix)
553
+ merged_output = torch.zeros(1, output_length)
554
+ weights = torch.zeros(output_length)
555
+
556
+ for est in all_estimates:
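+ # Overlap-add merge: taper each chunk with a Hann window, accumulate into the
+ # full-length buffer, then normalize by the summed window weights.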
557
+ window_len = est['end'] - est['start']
558
+ hann_window = torch.hann_window(window_len)
559
+
560
+ merged_output[0, est['start']:est['end']] += est['estimate'][0, :window_len] * hann_window
561
+ weights[est['start']:est['end']] += hann_window
562
+
563
+ merged_output[:, weights > 0] /= weights[weights > 0]
564
+
565
+ torchaudio.save(os.path.join(output_path, f"speaker{i+1}_est.wav"), merged_output, sr)
566
+
567
+
568
+ # Merge video with separated audio for each speaker
569
+ output_files = []
570
+ for i in range(number_of_speakers):
571
+ video_input = os.path.join(output_path, f"video_tracked{i+1}.mp4")
572
+ audio_input = os.path.join(output_path, f"speaker{i+1}_est.wav")
573
+ video_output = os.path.join(output_path, f"s{i+1}.mp4")
574
+
575
+ merge_video_audio(video_input, audio_input, video_output)
576
+ output_files.append(video_output)
577
+
578
+ # Clean up temporary file
579
+ if os.path.exists(temp_25fps_file):
580
+ os.remove(temp_25fps_file)
581
+
582
+ return output_files
583
+
584
+ if __name__ == '__main__':
585
+ parser = argparse.ArgumentParser(description='Video Speaker Separation using Dolphin model')
586
+ parser.add_argument('--input', '-i', type=str, required=True,
587
+ help='Path to input video file')
588
+ parser.add_argument('--output', '-o', type=str, default=None,
589
+ help='Output directory path (default: creates directory based on input filename)')
590
+ parser.add_argument('--speakers', '-s', type=int, default=2,
591
+ help='Number of speakers to separate (default: 2)')
592
+ parser.add_argument('--detect-every-n', type=int, default=8,
593
+ help='Detect faces every N frames (default: 8)')
594
+ parser.add_argument('--face-scale', type=float, default=1.5,
595
+ help='Face detection bounding box scale factor (default: 1.5)')
596
+ parser.add_argument('--cuda-device', type=int, default=0,
597
+ help='CUDA device ID to use (default: 0, set to -1 for CPU)')
598
+ parser.add_argument('--config', type=str, default="checkpoints/vox2/conf.yml",
599
+ help='Path to model configuration file')
600
+
601
+ args = parser.parse_args()
602
+
603
+ # Verify that the input file exists
604
+ if not os.path.exists(args.input):
605
+ print(f"Error: Input file '{args.input}' does not exist")
606
+ exit(1)
607
+
608
+ # If no output path is specified, create an output directory based on the input filename
609
+ if args.output is None:
610
+ input_basename = os.path.splitext(os.path.basename(args.input))[0]
611
+ args.output = os.path.join(os.path.dirname(args.input), input_basename + "_output")
612
+
613
+ # Set the CUDA device
614
+ cuda_device = args.cuda_device if args.cuda_device >= 0 else None
615
+
616
+ print(f"Processing video: {args.input}")
617
+ print(f"Output directory: {args.output}")
618
+ print(f"Number of speakers: {args.speakers}")
619
+ print(f"CUDA device: {cuda_device if cuda_device is not None else 'CPU'}")
620
+
621
+ # Process the video
622
+ output_files = process_video(
623
+ input_file=args.input,
624
+ output_path=args.output,
625
+ number_of_speakers=args.speakers,
626
+ detect_every_N_frame=args.detect_every_n,
627
+ scalar_face_detection=args.face_scale,
628
+ config_path=args.config,
629
+ cuda_device=cuda_device
630
+ )
631
+
632
+ print("\nProcessing completed!")
633
+ print("Output files:")
634
+ for i, output_file in enumerate(output_files):
635
+ print(f" Speaker {i+1}: {output_file}")
Inference_with_status.py ADDED
@@ -0,0 +1,410 @@
1
+ import warnings
2
+ warnings.filterwarnings("ignore")
3
+ import os
4
+ import torch
5
+ import torchaudio
6
+ import numpy as np
7
+ from moviepy import *
8
+ from PIL import Image, ImageDraw
9
+ import face_alignment
10
+ import cv2
11
+
12
+ from look2hear.models import Dolphin
13
+ from look2hear.datas.transform import get_preprocessing_pipelines
14
+
15
+ from face_detection_utils import detect_faces
16
+
17
+ # Import functions from original Inference.py
18
+ from Inference import (
19
+ linear_interpolate, warp_img, apply_transform, cut_patch, convert_bgr2gray,
20
+ save2npz, read_video, face2head, bb_intersection_over_union,
21
+ landmarks_interpolate, crop_patch, convert_video_fps, extract_audio, merge_video_audio
22
+ )
23
+
24
+ def detectface_with_status(video_input_path, output_path, detect_every_N_frame, scalar_face_detection, number_of_speakers, status_callback=None):
25
+ """Face detection with status updates"""
26
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
27
+ if status_callback:
28
+ status_callback({'status': f'Running on device: {device}', 'progress': 0.0})
29
+
30
+ os.makedirs(os.path.join(output_path, 'faces'), exist_ok=True)
31
+ os.makedirs(os.path.join(output_path, 'landmark'), exist_ok=True)
32
+
33
+ landmarks_dic = {}
34
+ faces_dic = {}
35
+ boxes_dic = {}
36
+
37
+ for i in range(number_of_speakers):
38
+ landmarks_dic[i] = []
39
+ faces_dic[i] = []
40
+ boxes_dic[i] = []
41
+
42
+ video_clip = VideoFileClip(video_input_path)
43
+ if status_callback:
44
+ status_callback({'status': f"Video: {video_clip.w}x{video_clip.h}, {video_clip.fps}fps", 'progress': 0.05})
45
+
46
+ frames = [Image.fromarray(frame) for frame in video_clip.iter_frames()]
47
+ total_frames = len(frames)
48
+ if status_callback:
49
+ status_callback({'status': f'Processing {total_frames} frames', 'progress': 0.1})
50
+
51
+ video_clip.close()
52
+ fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False)
53
+
54
+ for i, frame in enumerate(frames):
55
+ if status_callback and i % 10 == 0:
56
+ status_callback({'status': f'Tracking frame: {i+1}/{total_frames}', 'progress': 0.1 + 0.3 * (i / total_frames)})
57
+
58
+ # Detect faces every N frames
59
+ if i % detect_every_N_frame == 0:
60
+ frame_array = np.array(frame)
61
+
62
+ detected_boxes, _ = detect_faces(
63
+ frame_array,
64
+ threshold=0.9,
65
+ allow_upscaling=False,
66
+ )
67
+
68
+ if detected_boxes is None or len(detected_boxes) == 0:
69
+ detected_boxes, _ = detect_faces(
70
+ frame_array,
71
+ threshold=0.7,
72
+ allow_upscaling=True,
73
+ )
74
+
75
+ if detected_boxes is not None and len(detected_boxes) > 0:
76
+ detected_boxes = np.asarray(detected_boxes, dtype=np.float32)
77
+ areas = (detected_boxes[:, 2] - detected_boxes[:, 0]) * (detected_boxes[:, 3] - detected_boxes[:, 1])
78
+ sort_idx = np.argsort(areas)[::-1]
79
+ detected_boxes = detected_boxes[sort_idx][:number_of_speakers]
80
+ detected_boxes = face2head(detected_boxes, scalar_face_detection)
81
+ detected_boxes = [box for box in detected_boxes]
82
+ else:
83
+ detected_boxes = []
84
+
85
+ # Process the detection results (same as original)
86
+ if i == 0:
87
+ # First frame - initialize tracking
88
+ if len(detected_boxes) < number_of_speakers:
89
+ raise ValueError(f"First frame must detect at least {number_of_speakers} faces, but only found {len(detected_boxes)}")
90
+
91
+ # Assign first detections to speakers in order
92
+ for j in range(number_of_speakers):
93
+ box = detected_boxes[j]
94
+ face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
95
+ preds = fa.get_landmarks(np.array(face))
96
+
97
+ if preds is None:
98
+ raise ValueError(f"Face landmarks not detected in initial frame for speaker {j}")
99
+
100
+ faces_dic[j].append(face)
101
+ landmarks_dic[j].append(preds)
102
+ boxes_dic[j].append(box)
103
+ else:
104
+ # For subsequent frames, match detected boxes to speakers
105
+ matched_speakers = set()
106
+ speaker_boxes = [None] * number_of_speakers
107
+
108
+ # Match each detected box to the most likely speaker
109
+ for box in detected_boxes:
110
+ iou_scores = []
111
+ for speaker_id in range(number_of_speakers):
112
+ if speaker_id in matched_speakers:
113
+ iou_scores.append(-1) # Already matched
114
+ else:
115
+ last_box = boxes_dic[speaker_id][-1]
116
+ iou_score = bb_intersection_over_union(box, last_box)
117
+ iou_scores.append(iou_score)
118
+
119
+ if max(iou_scores) > 0: # Valid match found
120
+ best_speaker = iou_scores.index(max(iou_scores))
121
+ speaker_boxes[best_speaker] = box
122
+ matched_speakers.add(best_speaker)
123
+
124
+ # Process each speaker
125
+ for speaker_id in range(number_of_speakers):
126
+ if speaker_boxes[speaker_id] is not None:
127
+ # Use detected box
128
+ box = speaker_boxes[speaker_id]
129
+ else:
130
+ # Use previous box for this speaker
131
+ box = boxes_dic[speaker_id][-1]
132
+
133
+ # Extract face and landmarks
134
+ face = frame.crop((box[0], box[1], box[2], box[3])).resize((224,224))
135
+ preds = fa.get_landmarks(np.array(face))
136
+
137
+ if preds is None:
138
+ # Use previous landmarks if detection fails
139
+ preds = landmarks_dic[speaker_id][-1]
140
+
141
+ faces_dic[speaker_id].append(face)
142
+ landmarks_dic[speaker_id].append(preds)
143
+ boxes_dic[speaker_id].append(box)
144
+
145
+ # Verify all speakers have same number of frames
146
+ frame_counts = [len(boxes_dic[s]) for s in range(number_of_speakers)]
147
+ if status_callback:
148
+ status_callback({'status': f"Frame counts per speaker: {frame_counts}", 'progress': 0.4})
149
+
150
+ assert all(count == len(frames) for count in frame_counts), f"Inconsistent frame counts: {frame_counts}"
151
+
152
+ # Continue with saving videos and landmarks...
153
+ for s in range(number_of_speakers):
154
+ if status_callback:
155
+ status_callback({'status': f'Saving tracked video for speaker {s+1}', 'progress': 0.4 + 0.1 * (s / number_of_speakers)})
156
+
157
+ frames_tracked = []
158
+ for i, frame in enumerate(frames):
159
+ frame_draw = frame.copy()
160
+ draw = ImageDraw.Draw(frame_draw)
161
+ draw.rectangle(boxes_dic[s][i], outline=(255, 0, 0), width=6)
162
+ frames_tracked.append(frame_draw)
163
+
164
+ # Save tracked video
165
+ tracked_frames = [np.array(frame) for frame in frames_tracked]
166
+ if tracked_frames:
167
+ tracked_clip = ImageSequenceClip(tracked_frames, fps=25.0)
168
+ tracked_video_path = os.path.join(output_path, 'video_tracked' + str(s+1) + '.mp4')
169
+ tracked_clip.write_videofile(tracked_video_path, codec='libx264', audio=False, logger=None)
170
+ tracked_clip.close()
171
+
172
+ # Save landmarks
173
+ for i in range(number_of_speakers):
174
+ # Create landmark directory if it doesn't exist
175
+ landmark_dir = os.path.join(output_path, 'landmark')
176
+ os.makedirs(landmark_dir, exist_ok=True)
177
+ save2npz(os.path.join(landmark_dir, 'speaker' + str(i+1)+'.npz'), data=landmarks_dic[i])
178
+
179
+ # Save face video
180
+ face_frames = [np.array(frame) for frame in faces_dic[i]]
181
+ if face_frames:
182
+ face_clip = ImageSequenceClip(face_frames, fps=25.0)
183
+ face_video_path = os.path.join(output_path, 'faces', 'speaker' + str(i+1) + '.mp4')
184
+ face_clip.write_videofile(face_video_path, codec='libx264', audio=False, logger=None)
185
+ face_clip.close()
186
+
187
+ # Output video path
188
+ parts = video_input_path.split('/')
189
+ video_name = parts[-1][:-4]
190
+ filename_dir = os.path.join(output_path, 'filename_input')
191
+ os.makedirs(filename_dir, exist_ok=True)
192
+ csvfile = open(os.path.join(filename_dir, str(video_name) + '.csv'), 'w')
193
+ for i in range(number_of_speakers):
194
+ csvfile.write('speaker' + str(i+1)+ ',0\n')
195
+ csvfile.close()
196
+ return os.path.join(filename_dir, str(video_name) + '.csv')
197
+
198
+
199
+ def crop_mouth_with_status(video_direc, landmark_direc, filename_path, save_direc, status_callback=None, convert_gray=False, testset_only=False):
200
+ """Crop mouth with status updates"""
201
+ lines = open(filename_path).read().splitlines()
202
+ lines = list(filter(lambda x: 'test' in x, lines)) if testset_only else lines
203
+
204
+ for filename_idx, line in enumerate(lines):
205
+ filename, person_id = line.split(',')
206
+
207
+ if status_callback:
208
+ status_callback({'status': f'Processing speaker{int(person_id)+1}', 'progress': 0.5 + 0.1 * filename_idx / len(lines)})
209
+
210
+ video_pathname = os.path.join(video_direc, filename+'.mp4')
211
+ landmarks_pathname = os.path.join(landmark_direc, filename+'.npz')
212
+
213
+ # Create mouthroi directory if it doesn't exist
214
+ os.makedirs(save_direc, exist_ok=True)
215
+ dst_pathname = os.path.join(save_direc, filename+'.npz')
216
+
217
+ multi_sub_landmarks = np.load(landmarks_pathname, allow_pickle=True)['data']
218
+ if len(multi_sub_landmarks) == 0:
219
+ print(f"No landmarks found for {filename}, skipping crop.")
220
+ continue
221
+
222
+ landmark_frame_count = len(multi_sub_landmarks)
223
+ cap = cv2.VideoCapture(video_pathname)
224
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
225
+ cap.release()
226
+
227
+ if frame_count > 0 and frame_count != landmark_frame_count:
228
+ print(
229
+ f"Frame count mismatch for {filename}: video has {frame_count} frames, "
230
+ f"landmarks have {landmark_frame_count} entries. Adjusting to match."
231
+ )
232
+ if frame_count < landmark_frame_count:
233
+ multi_sub_landmarks = multi_sub_landmarks[:frame_count]
234
+ else:
235
+ pad_count = frame_count - landmark_frame_count
236
+ pad = np.repeat(multi_sub_landmarks[-1:], pad_count, axis=0)
237
+ multi_sub_landmarks = np.concatenate((multi_sub_landmarks, pad), axis=0)
238
+
239
+ landmarks = [None] * len(multi_sub_landmarks)
240
+ for frame_idx in range(len(landmarks)):
241
+ try:
242
+ landmarks[frame_idx] = multi_sub_landmarks[frame_idx][int(person_id)]
243
+ except (IndexError, TypeError):
244
+ continue
245
+
246
+ # Pre-process landmarks: interpolate frames not being detected
247
+ preprocessed_landmarks = landmarks_interpolate(landmarks)
248
+ if not preprocessed_landmarks:
249
+ continue
250
+
251
+ # Crop
252
+ mean_face_landmarks = np.load('assets/20words_mean_face.npy')
253
+ sequence = crop_patch(mean_face_landmarks, video_pathname, preprocessed_landmarks, 12, 48, 68, 96, 96)
254
+ assert sequence is not None, "cannot crop from {}.".format(filename)
255
+
256
+ # Save
257
+ data = convert_bgr2gray(sequence) if convert_gray else sequence[...,::-1]
258
+ save2npz(dst_pathname, data=data)
259
+
260
+
261
+ def process_video_with_status(input_file, output_path, number_of_speakers=2,
262
+ detect_every_N_frame=8, scalar_face_detection=1.5,
263
+ config_path="checkpoints/vox2/conf.yml",
264
+ cuda_device=None, status_callback=None):
265
+ """Main processing function with status updates"""
266
+
267
+ # Set CUDA device if specified
268
+ if cuda_device is not None:
269
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
270
+
271
+ # Create output directory
272
+ os.makedirs(output_path, exist_ok=True)
273
+
274
+ # Convert video to 25fps
275
+ if status_callback:
276
+ status_callback({'status': 'Converting video to 25fps', 'progress': 0.0})
277
+
278
+ temp_25fps_file = os.path.join(output_path, 'temp_25fps.mp4')
279
+ convert_video_fps(input_file, temp_25fps_file, target_fps=25)
280
+
281
+ # Detect faces
282
+ if status_callback:
283
+ status_callback({'status': 'Detecting faces and tracking speakers', 'progress': 0.1})
284
+
285
+ filename_path = detectface_with_status(
286
+ video_input_path=temp_25fps_file,
287
+ output_path=output_path,
288
+ detect_every_N_frame=detect_every_N_frame,
289
+ scalar_face_detection=scalar_face_detection,
290
+ number_of_speakers=number_of_speakers,
291
+ status_callback=status_callback
292
+ )
293
+ torch.cuda.empty_cache()
294
+ # Extract audio
295
+ if status_callback:
296
+ status_callback({'status': 'Extracting audio from video', 'progress': 0.5})
297
+
298
+ audio_output = os.path.join(output_path, 'audio.wav')
299
+ extract_audio(temp_25fps_file, audio_output, sample_rate=16000)
300
+
301
+ # Crop mouth
302
+ if status_callback:
303
+ status_callback({'status': 'Cropping mouth regions', 'progress': 0.55})
304
+
305
+ crop_mouth_with_status(
306
+ video_direc=os.path.join(output_path, "faces"),
307
+ landmark_direc=os.path.join(output_path, "landmark"),
308
+ filename_path=filename_path,
309
+ save_direc=os.path.join(output_path, "mouthroi"),
310
+ convert_gray=True,
311
+ testset_only=False,
312
+ status_callback=status_callback
313
+ )
314
+
315
+ # Load model
316
+ if status_callback:
317
+ status_callback({'status': 'Loading Dolphin model', 'progress': 0.6})
318
+ torch.cuda.empty_cache()
319
+ audiomodel = Dolphin.from_pretrained("JusperLee/Dolphin")
320
+ # audiomodel.cuda()
321
+ audiomodel.eval()
322
+
323
+ # Process each speaker
324
+ with torch.no_grad():
325
+ for i in range(number_of_speakers):
326
+ if status_callback:
327
+ status_callback({'status': f'Processing audio for speaker {i+1}', 'progress': 0.65 + 0.25 * (i / number_of_speakers)})
328
+
329
+ mouth_roi_path = os.path.join(output_path, "mouthroi", f"speaker{i+1}.npz")
330
+ mouth_roi = np.load(mouth_roi_path)["data"]
331
+ mouth_roi = get_preprocessing_pipelines()["val"](mouth_roi)
332
+
333
+ mix, sr = torchaudio.load(audio_output)
334
+ mix = mix.mean(dim=0)
335
+
336
+ window_size = 4 * sr
337
+ hop_size = int(4 * sr)
338
+
339
+ all_estimates = []
340
+
341
+ # Sliding window processing
342
+ start_idx = 0
343
+ window_count = 0
344
+ while start_idx < len(mix):
345
+ end_idx = min(start_idx + window_size, len(mix))
346
+ window_mix = mix[start_idx:end_idx]
347
+
348
+ start_frame = int(start_idx / sr * 25)
349
+ end_frame = int(end_idx / sr * 25)
350
+ end_frame = min(end_frame, len(mouth_roi))
351
+ window_mouth_roi = mouth_roi[start_frame:end_frame]
352
+
353
+ est_sources = audiomodel(window_mix[None],
354
+ torch.from_numpy(window_mouth_roi[None, None]).float())
355
+
356
+ all_estimates.append({
357
+ 'start': start_idx,
358
+ 'end': end_idx,
359
+ 'estimate': est_sources[0].cpu()
360
+ })
361
+
362
+ window_count += 1
363
+ if status_callback:
364
+ progress = 0.65 + 0.25 * (i / number_of_speakers) + 0.25 / number_of_speakers * (window_count * hop_size / len(mix))
365
+ status_callback({'status': f'Processing audio window {window_count} for speaker {i+1}', 'progress': min(progress, 0.9)})
366
+
367
+ start_idx += hop_size
368
+
369
+ if start_idx >= len(mix):
370
+ break
371
+ torch.cuda.empty_cache()
372
+
373
+ output_length = len(mix)
374
+ merged_output = torch.zeros(1, output_length)
375
+ weights = torch.zeros(output_length)
376
+
377
+ for est in all_estimates:
378
+ window_len = est['end'] - est['start']
379
+ hann_window = torch.hann_window(window_len)
380
+
381
+ merged_output[0, est['start']:est['end']] += est['estimate'][0, :window_len] * hann_window
382
+ weights[est['start']:est['end']] += hann_window
383
+
384
+ merged_output[:, weights > 0] /= weights[weights > 0]
385
+
386
+ audio_save_path = os.path.join(output_path, f"speaker{i+1}_est.wav")
387
+ torchaudio.save(audio_save_path, merged_output, sr)
388
+
389
+ # Merge video with separated audio for each speaker
390
+ torch.cuda.empty_cache()
391
+ if status_callback:
392
+ status_callback({'status': 'Merging videos with separated audio', 'progress': 0.9})
393
+
394
+ output_files = []
395
+ for i in range(number_of_speakers):
396
+ video_input = os.path.join(output_path, f"video_tracked{i+1}.mp4")
397
+ audio_input = os.path.join(output_path, f"speaker{i+1}_est.wav")
398
+ video_output = os.path.join(output_path, f"s{i+1}.mp4")
399
+
400
+ merge_video_audio(video_input, audio_input, video_output)
401
+ output_files.append(video_output)
402
+
403
+ # Clean up temporary file
404
+ if os.path.exists(temp_25fps_file):
405
+ os.remove(temp_25fps_file)
406
+
407
+ if status_callback:
408
+ status_callback({'status': 'Processing completed!', 'progress': 1.0})
409
+
410
+ return output_files
README.md ADDED
@@ -0,0 +1,189 @@
1
+ <p align="center">
2
+ <img src="assets/icon.png" alt="Dolphin Logo" width="150"/>
3
+ </p>
4
+ <h3 align="center">Dolphin: Efficient Audio-Visual Speech Separation with Discrete Lip Semantics and Multi-Scale Global-Local Attention</h3>
5
+ <p align="center">
6
+ <strong>Kai Li*, Kejun Gao*, Xiaolin Hu </strong><br>
7
+ <strong>Tsinghua University</strong>
8
+ </p>
9
+
10
+ <p align="center">
11
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=JusperLee.Dolphin" alt="访客统计" />
12
+ <img src="https://img.shields.io/github/stars/JusperLee/Dolphin?style=social" alt="GitHub stars" />
13
+ <img alt="Static Badge" src="https://img.shields.io/badge/license-Apache%202.0-blue.svg" />
14
+ <a href="https://arxiv.org/abs/2509.23610" target="_blank" rel="noreferrer noopener">
15
+ <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-2509.23610-b31b1b.svg?logo=arxiv&logoColor=white" />
16
+ </a>
17
+ <a href="https://huggingface.co/JusperLee/Dolphin" target="_blank" rel="noreferrer noopener">
18
+ <img alt="Hugging Face Models" src="https://img.shields.io/badge/Hugging%20Face-Models-ff9d2c?logo=huggingface&logoColor=white" />
19
+ </a>
20
+ <a href="https://dolphin.cslikai.cn/" target="_blank" rel="noreferrer noopener">
21
+ <img alt="Gradio Live Demo" src="https://img.shields.io/badge/Gradio-Live%20Demo-00a67e?logo=gradio&logoColor=white" />
22
+ </a>
23
+ </p>
24
+
25
+ <p align="center">
26
+
27
+ > Dolphin is an efficient audio-visual speech separation framework that leverages discrete lip semantics and global–local attention to achieve state-of-the-art performance with significantly reduced computational complexity.
28
+
29
+ ## 🎯 Highlights
30
+
31
+ - **Balanced Quality & Efficiency**: Single-pass separator achieves state-of-the-art AVSS performance without iterative refinement.
32
+ - **DP-LipCoder**: Dual-path, vector-quantized video encoder produces discrete audio-aligned semantic tokens while staying lightweight.
33
+ - **Global–Local Attention**: TDANet-based separator augments each layer with coarse global self-attention and heat diffusion local attention.
34
+ - **Edge-Friendly Deployment**: Delivers >50% parameter reduction, >2.4× lower MACs, and >6× faster GPU inference versus IIANet.
35
+
36
+ ## 💥 News
37
+
38
+ - **[2025-09-28]** Code and pre-trained models are released! 📦
39
+
40
+ ## 📜 Abstract
41
+
42
+ Audio-visual speech separation (AVSS) methods leverage visual cues to extract target speech in noisy acoustic environments, but most existing systems remain computationally heavy. Dolphin tackles this tension by combining a lightweight, dual-path video encoder with a single-pass global–local collaborative separator. The video pathway, DP-LipCoder, maps lip movements into discrete semantic tokens that remain tightly aligned with audio through vector quantization and distillation from AV-HuBERT. The audio separator builds upon TDANet and injects global–local attention (GLA) blocks—coarse-grained self-attention for long-range context and heat diffusion attention for denoising fine details. Across three public AVSS benchmarks, Dolphin not only outperforms the state-of-the-art IIANet on separation metrics but also delivers over 50% fewer parameters, more than 2.4× lower MACs, and over 6× faster GPU inference, making it practical for edge deployment.
43
+
44
+ ## 🌍 Motivation
45
+
46
+ In real-world environments, target speech is often masked by background noise and interfering speakers. This phenomenon reflects the classic “cocktail party effect,” where listeners selectively attend to a single speaker within a noisy scene (Cherry, 1953). These challenges have spurred extensive research on speech separation.
47
+
48
+ Audio-only approaches tend to struggle in complex acoustic conditions, while the integration of synchronous visual cues offers greater robustness. Recent deep learning-based AVSS systems achieve strong performance, yet many rely on computationally intensive separators or heavy iterative refinement, limiting their practicality.
49
+
50
+ Beyond the separator itself, AVSS models frequently inherit high computational cost from their video encoders. Large-scale lip-reading backbones provide rich semantic alignment but bring prohibitive parameter counts. Compressing them often erodes lip semantics, whereas designing new lightweight encoders from scratch risks losing semantic fidelity and degrading separation quality. Building a video encoder that balances compactness with semantic alignment therefore remains a central challenge for AVSS.
51
+
52
+ ## 🧠 Method Overview
53
+
54
+ To address these limitations, Dolphin introduces a novel AVSS pipeline centered on two components:
55
+
56
+ - **DP-LipCoder**: A dual-path, vector-quantized video encoder that separates compressed visual structure from audio-aligned semantics. By combining vector quantization with knowledge distillation from AV-HuBERT, it converts continuous lip motion into discrete semantic tokens without sacrificing representational capacity.
57
+ - **Single-Pass GLA Separator**: A lightweight TDANet-based audio separator that removes the need for iterative refinement. Each layer hosts a global–local attention block: coarse-grained self-attention captures long-range dependencies at low resolution, while heat diffusion attention smooths features across channels to suppress noise and retain detail.
58
+
59
+ Together, these components strike a balance between separation quality and computational efficiency, enabling deployment in resource-constrained scenarios.
60
+
61
+ ## 🧪 Experimental Highlights
62
+
63
+ We evaluate Dolphin on LRS2, LRS3, and VoxCeleb2. Compared with the state-of-the-art IIANet, Dolphin achieves higher scores across all separation metrics while dramatically reducing resource consumption:
64
+
65
+ - **Parameters**: >50% reduction
66
+ - **Computation**: >2.4× decrease in MACs
67
+ - **Inference**: >6× speedup on GPU
68
+
69
+ These results demonstrate that Dolphin provides competitive AVSS quality on edge hardware without heavy iterative processing.
70
+
71
+ ## 🏗️ Architecture
72
+
73
+ ![Dolphin Architecture](assets/overall-pipeline.png)
74
+
75
+ > The overall architecture of Dolphin.
76
+
77
+ ### Video Encoder
78
+
79
+ ![Dolphin Architecture](assets/video-ae.png)
80
+
81
+ > The video encoder of Dolphin.
82
+
83
+ ### Dolphin Model Overview
84
+
85
+ ![Dolphin Architecture](assets/separator.png)
86
+
87
+ > The overall architecture of Dolphin's separator.
88
+
89
+ ### Key Components
90
+
91
+ ![Dolphin Architecture](assets/ga-msa.png)
92
+
93
+ 1. **Global Attention (GA) Block**
94
+ - Applies coarse-grained self-attention to capture long-range structure
95
+ - Operates at low spatial resolution for efficiency
96
+ - Enhances robustness to complex acoustic mixtures
97
+
98
+ 2. **Local Attention (LA) Block**
99
+ - Uses heat diffusion attention to smooth features across channels
100
+ - Suppresses background noise while preserving details
101
+ - Complements GA to balance global context and local fidelity
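+
+ As a rough illustration of the GA idea above (downsample, attend globally, upsample, add back), here is a minimal, hypothetical PyTorch sketch. It is not the released GA/LA code, the layer sizes are invented, and the heat-diffusion local attention is not reproduced here.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class CoarseGlobalAttention(nn.Module):
+     """Illustrative global-attention block: pool the feature sequence to a low
+     temporal resolution, run multi-head self-attention there, then upsample."""
+     def __init__(self, channels: int = 128, heads: int = 4, pool: int = 4):
+         super().__init__()
+         self.pool = pool
+         self.attn = nn.MultiheadAttention(channels, heads, batch_first=True)
+         self.norm = nn.LayerNorm(channels)
+
+     def forward(self, x):                        # x: (batch, channels, time)
+         coarse = F.avg_pool1d(x, self.pool)      # cheap long-range context
+         tokens = coarse.transpose(1, 2)          # (batch, time // pool, channels)
+         attended, _ = self.attn(tokens, tokens, tokens)
+         tokens = self.norm(tokens + attended)
+         fine = F.interpolate(tokens.transpose(1, 2), size=x.shape[-1], mode="nearest")
+         return x + fine                          # residual connection
+
+ block = CoarseGlobalAttention()
+ print(block(torch.randn(2, 128, 400)).shape)     # torch.Size([2, 128, 400])
+ ```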
102
+
103
+ ## 📊 Results
104
+
105
+ ### Performance Comparison
106
+
107
+ Performance metrics on three public AVSS benchmark datasets. Bold indicates best performance.
108
+
109
+ ![Results Table](assets/results.png)
110
+
111
+ ### Efficiency Analysis
112
+
113
+ ![Efficiency Comparison](assets/efficiency_comparison.png)
114
+
115
+ Dolphin achieves:
116
+ - ✅ **>50%** parameter reduction
117
+ - ✅ **2.4×** lower computational cost (MACs)
118
+ - ✅ **6×** faster GPU inference speed
119
+ - ✅ Superior separation quality across all metrics
120
+
121
+ ## 📦 Installation
122
+
123
+ ```bash
124
+ git clone https://github.com/JusperLee/Dolphin.git
125
+ cd Dolphin
126
+ pip install torch torchvision
127
+ pip install -r requirements.txt
128
+ ```
129
+
130
+ ### Requirements
131
+
132
+ - Python >= 3.10
133
+ - PyTorch >= 2.5.0
134
+ - CUDA >= 12.4
135
+ - Other dependencies in requirements.txt
136
+
137
+ ## 🚀 Quick Start
138
+
139
+ ### Inference with Pre-trained Model
140
+
141
+ ```bash
142
+ # Single audio-visual separation
143
+ python Inference.py \
144
+ --input /path/to/video.mp4 \
145
+ --output /path/to/output/directory \
146
+ --speakers 2 \
147
+ --detect-every-n 8 \
148
+ --face-scale 1.5 \
149
+ --cuda-device 0 \
150
+ --config checkpoints/vox2/conf.yml
151
+ ```
152
+
153
+ ## 📁 Model Zoo
154
+
155
+ | Model | Training Data | SI-SNRi | PESQ | Download |
156
+ |-------|--------------|---------|------|----------|
157
+ | Dolphin | VoxCeleb2 | 16.1 dB | 3.45 | [Link](https://huggingface.co/JusperLee/Dolphin) |
158
+
159
+ ## 📖 Citation
160
+
161
+ If you find Dolphin useful in your research, please cite:
162
+
163
+ ```bibtex
164
+ @misc{li2025efficientaudiovisualspeechseparation,
165
+ title={Efficient Audio-Visual Speech Separation with Discrete Lip Semantics and Multi-Scale Global-Local Attention},
166
+ author={Kai Li and Kejun Gao and Xiaolin Hu},
167
+ year={2025},
168
+ eprint={2509.23610},
169
+ archivePrefix={arXiv},
170
+ primaryClass={cs.SD},
171
+ url={https://arxiv.org/abs/2509.23610},
172
+ }
173
+ ```
174
+
175
+ ## 🤝 Acknowledgments
176
+
177
+ We thank the authors of [IIANet](https://github.com/JusperLee/IIANet) and [SepReformer](https://github.com/dmlguq456/SepReformer) for providing parts of the code used in this project.
178
+
179
+ ## 📧 Contact
180
+
181
+ For questions and feedback, please open an issue on GitHub or contact us at: [tsinghua.kaili@gmail.com](mailto:tsinghua.kaili@gmail.com)
182
+
183
+ ## 📄 License
184
+
185
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
186
+
187
+ <p align="center">
188
+ Made with stars ⭐️ for efficient audio-visual speech separation
189
+ </p>
app.py ADDED
@@ -0,0 +1,671 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Audio-visual Speech Separation Gradio App - Hugging Face Space Version
4
+ Automatically detects and separates all speakers in videos
5
+ """
6
+
7
+ import warnings
8
+ warnings.filterwarnings("ignore")
9
+ import os
10
+ import gradio as gr
11
+ import numpy as np
12
+ import shutil
13
+ import tempfile
14
+ import time
15
+ import sys
16
+ import threading
17
+ from PIL import Image, ImageDraw, ImageFont
18
+ from moviepy import *
19
+ import spaces
20
+
21
+ from face_detection_utils import detect_faces
22
+
23
+ # Use HF Space's temp directory
24
+ TEMP_DIR = os.environ.get('TMPDIR', '/tmp')
25
+
26
+ # Shared state for relaying GPU-side status back to the UI thread.
27
+ GPU_PROGRESS_STATE = {"progress": 0.0, "status": "Processing on GPU..."}
28
+ GPU_PROGRESS_LOCK = threading.Lock()
29
+
30
+ class LogCollector:
31
+ """Collect logs in a list"""
32
+ def __init__(self):
33
+ self.logs = []
34
+
35
+ def add(self, message):
36
+ if message and message.strip():
37
+ timestamp = time.strftime("%H:%M:%S")
38
+ self.logs.append(f"[{timestamp}] {message.strip()}")
39
+
40
+ def get_text(self, last_n=None):
41
+ if last_n:
42
+ return "\n".join(self.logs[-last_n:])
43
+ return "\n".join(self.logs)
44
+
45
+ # Global log collector for capturing print statements
46
+ GLOBAL_LOG = LogCollector()
47
+
48
+ class StdoutCapture:
49
+ """Capture stdout and add to log"""
50
+ def __init__(self, original):
51
+ self.original = original
52
+
53
+ def write(self, text):
54
+ self.original.write(text)
55
+ if text.strip():
56
+ GLOBAL_LOG.add(text.strip())
57
+
58
+ def flush(self):
59
+ self.original.flush()
60
+
61
+ def remove_duplicate_faces(boxes, probs, iou_threshold=0.5):
62
+ """Remove duplicate face detections using IoU (Intersection over Union)"""
63
+ if len(boxes) <= 1:
64
+ return boxes, probs
65
+
66
+ # Calculate IoU between all pairs of boxes
67
+ def calculate_iou(box1, box2):
68
+ x1 = max(box1[0], box2[0])
69
+ y1 = max(box1[1], box2[1])
70
+ x2 = min(box1[2], box2[2])
71
+ y2 = min(box1[3], box2[3])
72
+
73
+ intersection = max(0, x2 - x1) * max(0, y2 - y1)
74
+ area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
75
+ area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
76
+ union = area1 + area2 - intersection
77
+
78
+ return intersection / union if union > 0 else 0
79
+
80
+ # Keep track of which boxes to keep
81
+ keep = []
82
+ used = set()
83
+
84
+ # Sort by confidence (if available) or by area
85
+ if probs is not None:
86
+ sorted_indices = np.argsort(probs)[::-1]
87
+ else:
88
+ areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
89
+ sorted_indices = np.argsort(areas)[::-1]
90
+
91
+ for i in sorted_indices:
92
+ if i in used:
93
+ continue
94
+
95
+ keep.append(i)
96
+ used.add(i)
97
+
98
+ # Mark overlapping boxes as used
99
+ for j in range(len(boxes)):
100
+ if j != i and j not in used:
101
+ iou = calculate_iou(boxes[i], boxes[j])
102
+ if iou > iou_threshold:
103
+ used.add(j)
104
+
105
+ # Return filtered boxes and probs
106
+ keep = sorted(keep) # Maintain original order
107
+ filtered_boxes = boxes[keep]
108
+ filtered_probs = probs[keep] if probs is not None else None
109
+
110
+ return filtered_boxes, filtered_probs
111
+
112
+ def process_detected_faces(boxes, probs, frame_rgb, frame_pil):
113
+ """Process detected faces and return face images"""
114
+ face_images = []
115
+ full_frame_annotated = frame_rgb.copy()
116
+
117
+ if boxes is None or len(boxes) == 0:
118
+ return [], 0, full_frame_annotated, "No faces detected"
119
+
120
+ boxes = np.asarray(boxes, dtype=np.float32)
121
+
122
+ # Filter by confidence if available
123
+ if probs is not None:
124
+ # Keep faces with confidence > 0.9
125
+ confident_indices = probs > 0.9
126
+ boxes = boxes[confident_indices]
127
+ probs = probs[confident_indices]
128
+ print(f"After filtering by confidence: {len(boxes)} faces")
129
+
130
+ if len(boxes) == 0:
131
+ return [], 0, full_frame_annotated, "No faces passed the confidence filter"
132
+
133
+ # Remove duplicate detections
134
+ boxes, probs = remove_duplicate_faces(boxes, probs, iou_threshold=0.3)
135
+ print(f"After removing duplicates: {len(boxes)} faces")
136
+
137
+ if len(boxes) == 0:
138
+ return [], 0, full_frame_annotated, "No faces remained after duplicate removal"
139
+
140
+ # Sort boxes by area (larger faces first)
141
+ areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
142
+ sorted_indices = np.argsort(areas)[::-1]
143
+ boxes = boxes[sorted_indices]
144
+
145
+ # Annotate full frame
146
+ full_frame_pil = Image.fromarray(full_frame_annotated)
147
+ draw = ImageDraw.Draw(full_frame_pil)
148
+
149
+ # Try to use a better font
150
+ try:
151
+ font = ImageFont.load_default()
152
+ except:
153
+ font = None
154
+
155
+ # Extract face images and annotate
156
+ colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255)]
157
+
158
+ for i, box in enumerate(boxes):
159
+ color = colors[i % len(colors)]
160
+
161
+ # Draw bounding box
162
+ draw.rectangle(box.tolist(), outline=color, width=4)
163
+ label = f"Speaker {i+1}"
164
+
165
+ # Draw label
166
+ if font:
167
+ draw.text((box[0] + 5, box[1] - 20), label, fill=color, font=font)
168
+
169
+ # Extract face with margin
170
+ margin = 50
171
+ x1 = max(0, int(box[0] - margin))
172
+ y1 = max(0, int(box[1] - margin))
173
+ x2 = min(frame_rgb.shape[1], int(box[2] + margin))
174
+ y2 = min(frame_rgb.shape[0], int(box[3] + margin))
175
+
176
+ face_crop = frame_rgb[y1:y2, x1:x2]
177
+ # Resize maintaining aspect ratio
178
+ face_crop = Image.fromarray(face_crop)
179
+ face_crop.thumbnail((250, 250), Image.Resampling.LANCZOS)
180
+ face_crop = np.array(face_crop)
181
+
182
+ face_images.append(face_crop)
183
+
184
+ full_frame_annotated = np.array(full_frame_pil)
185
+ return face_images, len(boxes), full_frame_annotated, None
186
+
187
+ @spaces.GPU(duration=60, enable_queue=True)
188
+ def detect_faces_gpu(frame_pil):
189
+ """GPU-accelerated face detection"""
190
+ print("Detecting faces with RetinaFace")
191
+
192
+ frame_array = np.array(frame_pil)
193
+
194
+ boxes, probs = detect_faces(
195
+ frame_array,
196
+ threshold=0.9,
197
+ allow_upscaling=False,
198
+ )
199
+
200
+ if boxes is None or len(boxes) == 0:
201
+ print("No faces detected at high threshold, relaxing criteria...")
202
+ boxes, probs = detect_faces(
203
+ frame_array,
204
+ threshold=0.7,
205
+ allow_upscaling=True,
206
+ )
207
+
208
+ return boxes, probs
209
+
210
+ def detect_and_extract_all_faces(video_path):
211
+ """Detect all faces in the first frame and extract them"""
212
+ print("Starting face detection...")
213
+
214
+ # Check if video file exists
215
+ if not os.path.exists(video_path):
216
+ print(f"Error: Video file does not exist at path: {video_path}")
217
+ return [], 0, None, f"Video file not found: {video_path}"
218
+
219
+ print(f"Video path: {video_path}")
220
+ print(f"File size: {os.path.getsize(video_path) / 1024 / 1024:.2f} MB")
221
+
222
+ # Use moviepy to read video
223
+ print("Opening video with moviepy...")
224
+ try:
225
+ clip = VideoFileClip(video_path)
226
+
227
+ # Get video properties
228
+ fps = clip.fps
229
+ duration = clip.duration
230
+ total_frames = int(fps * duration)
231
+
232
+ print(f"Video info: FPS: {fps}, Duration: {duration}s, Total frames: {total_frames}")
233
+
234
+ # Get first frame
235
+ frame = clip.get_frame(0) # MoviePy returns RGB
236
+ frame_rgb = (frame * 255).astype(np.uint8) if frame.max() <= 1.0 else frame.astype(np.uint8)
237
+
238
+ print(f"Successfully read frame with moviepy: {frame_rgb.shape}")
239
+
240
+ # Close the clip to free resources
241
+ clip.close()
242
+
243
+ # Convert to PIL for downstream processing
244
+ frame_pil = Image.fromarray(frame_rgb)
245
+
246
+ # Detect faces using RetinaFace
247
+ print("Detecting faces with RetinaFace...")
248
+ boxes, probs = detect_faces(
249
+ frame_rgb,
250
+ threshold=0.9,
251
+ allow_upscaling=False,
252
+ )
253
+
254
+ if boxes is None or len(boxes) == 0:
255
+ print("No faces detected at high threshold, trying relaxed settings...")
256
+ boxes, probs = detect_faces(
257
+ frame_rgb,
258
+ threshold=0.7,
259
+ allow_upscaling=True,
260
+ )
261
+
262
+ if boxes is not None and len(boxes) > 0:
263
+ print(f"Detected {len(boxes)} faces")
264
+ return process_detected_faces(boxes, probs, frame_rgb, frame_pil)
265
+ else:
266
+ return [], 0, frame_rgb, "No faces detected in the first frame"
267
+
268
+ except Exception as e:
269
+ print(f"MoviePy failed: {e}")
270
+ import traceback
271
+ traceback.print_exc()
272
+ return [], 0, None, f"Failed to open video file. Error: {str(e)}"
273
+
274
+ @spaces.GPU(duration=300, enable_queue=True)
275
+ def process_video_gpu(video_file, temp_dir, num_speakers):
276
+ """GPU-accelerated video processing"""
277
+ try:
278
+ from Inference_with_status import process_video_with_status
279
+
280
+ # Define status callback inside GPU function
281
+ def gpu_status_callback(message):
282
+ status_text = message.get('status', 'Processing...')
283
+ print(f"GPU Processing: {status_text}")
284
+ progress_value = message.get('progress')
285
+ with GPU_PROGRESS_LOCK:
286
+ GPU_PROGRESS_STATE["status"] = status_text
287
+ if progress_value is not None:
288
+ try:
289
+ numeric_progress = float(progress_value)
290
+ GPU_PROGRESS_STATE["progress"] = min(max(numeric_progress, 0.0), 1.0)
291
+ except (TypeError, ValueError):
292
+ pass
293
+
294
+ output_files = process_video_with_status(
295
+ input_file=video_file,
296
+ output_path=temp_dir,
297
+ number_of_speakers=num_speakers,
298
+ detect_every_N_frame=8,
299
+ scalar_face_detection=1.5,
300
+ status_callback=gpu_status_callback
301
+ )
302
+ return output_files
303
+ except ImportError:
304
+ from Inference import process_video
305
+ print("Using standard process_video (status callbacks not available)")
306
+ output_files = process_video(
307
+ input_file=video_file,
308
+ output_path=temp_dir,
309
+ number_of_speakers=num_speakers,
310
+ detect_every_N_frame=8,
311
+ scalar_face_detection=1.5
312
+ )
313
+ return output_files
314
+
315
+ def process_video_auto(video_file, progress=gr.Progress()):
316
+ """Process video with automatic speaker detection and stream status updates"""
317
+ global GLOBAL_LOG
318
+ GLOBAL_LOG = LogCollector()
319
+
320
+ old_stdout = sys.stdout
321
+ sys.stdout = StdoutCapture(old_stdout)
322
+
323
+ status_value = "⏳ Ready to process..."
324
+ detected_info_output = gr.update(visible=False)
325
+ face_gallery_output = gr.update(visible=False)
326
+ output_video_output = gr.update(visible=False)
327
+ video_dict_value = None
328
+ annotated_frame_output = gr.update(visible=False)
329
+
330
+ def snapshot():
331
+ return (
332
+ status_value,
333
+ detected_info_output,
334
+ face_gallery_output,
335
+ output_video_output,
336
+ video_dict_value,
337
+ annotated_frame_output,
338
+ GLOBAL_LOG.get_text()
339
+ )
340
+
341
+ try:
342
+ if video_file is None:
343
+ status_value = "⚠️ Please upload a video file"
344
+ yield snapshot()
345
+ return
346
+
347
+ progress(0, desc="Starting processing...")
348
+ status_value = "🔄 Starting processing..."
349
+ GLOBAL_LOG.add("Starting video processing...")
350
+ yield snapshot()
351
+
352
+ temp_dir = None
353
+ try:
354
+ temp_dir = tempfile.mkdtemp(dir=TEMP_DIR)
355
+ print(f"Created temporary directory: {temp_dir}")
356
+
357
+ progress(0.1, desc="Detecting speakers in video...")
358
+ status_value = "🔍 Detecting speakers in video..."
359
+ print("Starting face detection in video...")
360
+ yield snapshot()
361
+
362
+ face_images, num_speakers, annotated_frame, error_msg = detect_and_extract_all_faces(video_file)
363
+ print(f"Face detection completed. Found {num_speakers} speakers.")
364
+
365
+ if error_msg:
366
+ print(f"Error: {error_msg}")
367
+ status_value = f"❌ {error_msg}"
368
+ if annotated_frame is not None:
369
+ annotated_frame_output = gr.update(value=annotated_frame, visible=True)
370
+ yield snapshot()
371
+ return
372
+
373
+ if num_speakers == 0:
374
+ print("No speakers detected in the video.")
375
+ status_value = "❌ No speakers detected in the video. Please ensure faces are visible in the first frame."
376
+ if annotated_frame is not None:
377
+ annotated_frame_output = gr.update(value=annotated_frame, visible=True)
378
+ yield snapshot()
379
+ return
380
+
381
+ face_gallery_images = [(img, f"Speaker {i+1}") for i, img in enumerate(face_images)]
382
+ detected_info = f"🎯 Detected {num_speakers} speaker{'s' if num_speakers > 1 else ''} in the video"
383
+ detected_info_output = gr.update(value=detected_info, visible=True)
384
+ face_gallery_output = gr.update(value=face_gallery_images, visible=True)
385
+ if annotated_frame is not None:
386
+ annotated_frame_output = gr.update(value=annotated_frame, visible=True)
387
+
388
+ progress(0.3, desc=f"Separating {num_speakers} speakers...")
389
+ status_value = f"🎬 Separating {num_speakers} speakers..."
390
+ print(f"Starting audio-visual separation for {num_speakers} speakers...")
391
+ yield snapshot()
392
+
393
+ try:
394
+ print("Starting GPU-accelerated video processing...")
395
+ with GPU_PROGRESS_LOCK:
396
+ GPU_PROGRESS_STATE["progress"] = 0.0
397
+ GPU_PROGRESS_STATE["status"] = "Processing on GPU..."
398
+
399
+ progress(0.4, desc="Processing on GPU...")
400
+ status_value = "Processing on GPU..."
401
+ yield snapshot()
402
+
403
+ gpu_result = {"output_files": None, "exception": None}
404
+
405
+ def run_gpu_processing():
406
+ try:
407
+ gpu_result["output_files"] = process_video_gpu(
408
+ video_file=video_file,
409
+ temp_dir=temp_dir,
410
+ num_speakers=num_speakers
411
+ )
412
+ except Exception as exc:
413
+ gpu_result["exception"] = exc
414
+
415
+ gpu_thread = threading.Thread(target=run_gpu_processing, daemon=True)
416
+ gpu_thread.start()
417
+
418
+ last_reported_progress = 0.4
419
+ last_status_message = "Processing on GPU..."
420
+
421
+ while gpu_thread.is_alive():
422
+ time.sleep(0.5)
423
+ with GPU_PROGRESS_LOCK:
424
+ gpu_status = GPU_PROGRESS_STATE.get("status", "Processing on GPU...")
425
+ gpu_progress_value = GPU_PROGRESS_STATE.get("progress", 0.0)
426
+
427
+ mapped_progress = 0.4 + 0.5 * gpu_progress_value
428
+ mapped_progress = min(mapped_progress, 0.89)
429
+
430
+ if (
431
+ mapped_progress > last_reported_progress + 0.01
432
+ or gpu_status != last_status_message
433
+ ):
434
+ progress(mapped_progress, desc=gpu_status)
435
+ last_reported_progress = mapped_progress
436
+ last_status_message = gpu_status
437
+ status_value = gpu_status
438
+ yield snapshot()
439
+
440
+ gpu_thread.join()
441
+
442
+ if gpu_result["exception"] is not None:
443
+ raise gpu_result["exception"]
444
+
445
+ output_files = gpu_result["output_files"]
446
+
447
+ progress(0.9, desc="Preparing results...")
448
+ status_value = "📦 Preparing results..."
449
+ print("Processing completed successfully!")
450
+ print(f"Generated {num_speakers} output videos")
451
+ yield snapshot()
452
+
453
+ video_dict_value = {i: output_files[i] for i in range(num_speakers)}
454
+ video_dict_value['temp_dir'] = temp_dir
455
+ output_video_output = gr.update(value=output_files[0], visible=True)
456
+
457
+ progress(1.0, desc="Complete!")
458
+ status_value = f"✅ Successfully separated {num_speakers} speakers! Click on any face below to view their video."
459
+ yield snapshot()
460
+
461
+ except Exception as e:
462
+ print(f"Processing failed: {str(e)}")
463
+ import traceback
464
+ traceback.print_exc()
465
+ status_value = f"❌ Processing failed: {str(e)}"
466
+ output_video_output = gr.update(visible=False)
467
+ video_dict_value = None
468
+ yield snapshot()
469
+ return
470
+
471
+ except Exception as e:
472
+ if temp_dir and os.path.exists(temp_dir):
473
+ try:
474
+ shutil.rmtree(temp_dir)
475
+ except Exception:
476
+ pass
477
+
478
+ print(f"Error: {str(e)}")
479
+ import traceback
480
+ traceback.print_exc()
481
+ status_value = f"❌ Error: {str(e)}"
482
+ detected_info_output = gr.update(visible=False)
483
+ face_gallery_output = gr.update(visible=False)
484
+ output_video_output = gr.update(visible=False)
485
+ annotated_frame_output = gr.update(visible=False)
486
+ video_dict_value = None
487
+ yield snapshot()
488
+ return
489
+ finally:
490
+ sys.stdout = old_stdout
491
+
492
+ def on_face_click(evt: gr.SelectData, video_dict):
493
+ """Handle face gallery click events"""
494
+ if video_dict is None or evt.index not in video_dict:
495
+ return None
496
+
497
+ return video_dict[evt.index]
498
+
499
+ # Create the Gradio interface
500
+ custom_css = """
501
+ .face-gallery {
502
+ border-radius: 10px;
503
+ overflow: hidden;
504
+ }
505
+ .face-gallery img {
506
+ border-radius: 8px;
507
+ transition: transform 0.2s ease-in-out;
508
+ }
509
+ .face-gallery img:hover {
510
+ transform: scale(1.05);
511
+ cursor: pointer;
512
+ box-shadow: 0 4px 8px rgba(0,0,0,0.3);
513
+ }
514
+ .detected-info {
515
+ background-color: #f0f0f0;
516
+ padding: 10px;
517
+ border-radius: 5px;
518
+ margin: 10px 0;
519
+ }
520
+ """
521
+
522
+ with gr.Blocks(
523
+ title="Video Speaker Auto-Separation",
524
+ theme=gr.themes.Soft(),
525
+ css=custom_css
526
+ ) as demo:
527
+ gr.Markdown(
528
+ """
529
+ # 🎥 Dolphin: Efficient Audio-Visual Speech Separation with Discrete Lip Semantics and Hierarchical Top-Down Attention
530
+ <p align="left">
531
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=JusperLee.Dolphin" alt="访客统计" /><img src="https://img.shields.io/github/stars/JusperLee/Dolphin?style=social" alt="GitHub stars" /><img alt="Static Badge" src="https://img.shields.io/badge/license-Apache%202.0-blue.svg" />
532
+ </p>
533
+
534
+ ### Automatically detect and separate ALL speakers in your video
535
+
536
+ Simply upload a video and the system will:
537
+ 1. 🔍 Automatically detect all speakers in the video
538
+ 2. 🎭 Show you each detected speaker's face
539
+ 3. 🎬 Generate individual videos for each speaker with their isolated audio
540
+ """
541
+ )
542
+
543
+ with gr.Row():
544
+ with gr.Column(scale=2):
545
+ video_input = gr.Video(
546
+ label="📹 Upload Your Video",
547
+ height=300,
548
+ interactive=True
549
+ )
550
+
551
+ # Add example video section
552
+ gr.Markdown("### 🎬 Try with Example Video")
553
+ gr.Examples(
554
+ examples=[["demo1/mix.mp4"]],
555
+ inputs=video_input,
556
+ label="Click to load example video",
557
+ cache_examples=False
558
+ )
559
+
560
+ process_btn = gr.Button(
561
+ "🚀 Auto-Detect and Process",
562
+ variant="primary",
563
+ size="lg"
564
+ )
565
+
566
+ status = gr.Textbox(
567
+ label="Status",
568
+ interactive=False,
569
+ value="⏳ Ready to process..."
570
+ )
571
+
572
+ processing_log = gr.Textbox(
573
+ label="📋 Processing Details",
574
+ lines=10,
575
+ max_lines=15,
576
+ interactive=False,
577
+ value=""
578
+ )
579
+
580
+ with gr.Column(scale=3):
581
+ annotated_frame = gr.Image(
582
+ label="📸 Detected Speakers in First Frame",
583
+ visible=False,
584
+ height=300
585
+ )
586
+
587
+ detected_info = gr.Markdown(
588
+ visible=False,
589
+ elem_classes="detected-info"
590
+ )
591
+
592
+ gr.Markdown("### 👇 Click on any face below to view that speaker's video")
593
+
594
+ face_gallery = gr.Gallery(
595
+ label="Detected Speaker Faces",
596
+ show_label=False,
597
+ columns=5,
598
+ rows=1,
599
+ height=200,
600
+ visible=False,
601
+ object_fit="contain",
602
+ elem_classes="face-gallery"
603
+ )
604
+
605
+ output_video = gr.Video(
606
+ label="🎬 Selected Speaker's Video",
607
+ height=300,
608
+ visible=False,
609
+ autoplay=True
610
+ )
611
+
612
+ # Hidden state
613
+ video_dict = gr.State()
614
+
615
+ gr.Markdown(
616
+ """
617
+ ---
618
+ ### 📖 How it works:
619
+
620
+ 1. **Upload** - Select any video file
621
+ 2. **Process** - Click the button to start automatic detection
622
+ 3. **Review** - See all detected speakers and their positions
623
+ 4. **Select** - Click on any face to watch that speaker's separated video
624
+
625
+ ### 💡 Tips for best results:
626
+
627
+ - ✅ Ensure all speakers' faces are visible in the first frame
628
+ - ✅ Use videos with good lighting and clear face views
629
+ - ✅ Works best with frontal or near-frontal face angles
630
+ - ⏱️ Processing time depends on video length and number of speakers
631
+
632
+ ### 🚀 Powered by:
633
+ - RetinaFace for face detection
634
+ - Dolphin model for audio-visual separation
635
+ - GPU acceleration when available
636
+ <footer style="display:none;">
637
+ <a href='https://clustrmaps.com/site/1c828' title='Visit tracker'>
638
+ <img src='//clustrmaps.com/map_v2.png?cl=080808&w=300&t=tt&d=XYmTC4S_SxuX7G06iJ16lU43VCNkCBFRLXMfEM5zvmo&co=ffffff&ct=808080'/>
639
+ </a>
640
+ </footer>
641
+ """
642
+ )
643
+
644
+ # Event handlers
645
+ outputs_list = [
646
+ status,
647
+ detected_info,
648
+ face_gallery,
649
+ output_video,
650
+ video_dict,
651
+ annotated_frame,
652
+ processing_log
653
+ ]
654
+
655
+ process_btn.click(
656
+ fn=process_video_auto,
657
+ inputs=[video_input],
658
+ outputs=outputs_list,
659
+ show_progress=True
660
+ )
661
+
662
+ face_gallery.select(
663
+ fn=on_face_click,
664
+ inputs=[video_dict],
665
+ outputs=output_video
666
+ )
667
+
668
+ # Launch the demo - HF Space will handle this automatically
669
+ if __name__ == "__main__":
670
+ import os
671
+ demo.launch(server_name="0.0.0.0", server_port=7860)
console_capture.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import io
3
+ from contextlib import contextmanager
4
+
5
+ class TeeOutput:
6
+ """Capture stdout/stderr while still printing to console"""
7
+ def __init__(self, stream, callback=None):
8
+ self.stream = stream
9
+ self.callback = callback
10
+ self.buffer = []
11
+
12
+ def write(self, data):
13
+ # Write to original stream
14
+ self.stream.write(data)
15
+ self.stream.flush()
16
+
17
+ # Capture the data
18
+ if data.strip(): # Only capture non-empty lines
19
+ self.buffer.append(data.rstrip())
20
+ if self.callback:
21
+ self.callback(data.rstrip())
22
+
23
+ def flush(self):
24
+ self.stream.flush()
25
+
26
+ def get_captured(self):
27
+ return '\n'.join(self.buffer)
28
+
29
+ @contextmanager
30
+ def capture_console(stdout_callback=None, stderr_callback=None):
31
+ """Context manager to capture console output"""
32
+ old_stdout = sys.stdout
33
+ old_stderr = sys.stderr
34
+
35
+ stdout_capture = TeeOutput(old_stdout, stdout_callback)
36
+ stderr_capture = TeeOutput(old_stderr, stderr_callback)
37
+
38
+ sys.stdout = stdout_capture
39
+ sys.stderr = stderr_capture
40
+
41
+ try:
42
+ yield stdout_capture, stderr_capture
43
+ finally:
44
+ sys.stdout = old_stdout
45
+ sys.stderr = old_stderr
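For reference, a minimal usage sketch of the `capture_console` helper above (the list-append callback is illustrative and not part of the commit):

```python
# Sketch only: tee stdout into a list while still printing to the console.
from console_capture import capture_console

lines = []
with capture_console(stdout_callback=lines.append) as (out_cap, err_cap):
    print("hello from inside the context")   # printed normally and captured

print(out_cap.get_captured())   # "hello from inside the context"
print(lines)                    # ['hello from inside the context']
```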
demo1/mix.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbf7577afd8b8ebc4a70d88e5d6a8216dd9ca07a1e26a83a0c677074510ec39c
3
+ size 3387273
face_detection_utils.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility helpers for RetinaFace-based face detection."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import numpy as np
7
+ import cv2
8
+
9
+ try:
10
+ from retinaface import RetinaFace # type: ignore
11
+ except ImportError as import_error: # pragma: no cover - handled at runtime
12
+ RetinaFace = None # type: ignore
13
+ _RETINAFACE_IMPORT_ERROR = import_error
14
+ else:
15
+ _RETINAFACE_IMPORT_ERROR = None
16
+
17
+ try:
18
+ from PIL import Image
19
+ except ImportError: # pragma: no cover
20
+ Image = None # type: ignore
21
+
22
+ import spaces
23
+
24
+ def _ensure_retinaface_available() -> None:
25
+ if RetinaFace is None: # pragma: no cover - runtime safeguard
26
+ raise ImportError(
27
+ "RetinaFace package is required but not installed. "
28
+ "Install it with `pip install retinaface`."
29
+ ) from _RETINAFACE_IMPORT_ERROR
30
+
31
+
32
+ def _to_rgb_array(image: np.ndarray, *, assume_bgr: bool = False) -> np.ndarray:
33
+ """Convert input to an RGB numpy array."""
34
+ if isinstance(image, np.ndarray):
35
+ array = image
36
+ elif Image is not None and isinstance(image, Image.Image):
37
+ array = np.array(image.convert("RGB"))
38
+ else:
39
+ raise TypeError("Expected an ndarray or PIL.Image.Image for face detection")
40
+
41
+ if array.ndim != 3 or array.shape[2] != 3:
42
+ raise ValueError("Face detection expects an image with shape (H, W, 3)")
43
+
44
+ if array.dtype != np.uint8:
45
+ array = array.astype(np.uint8)
46
+
47
+ if assume_bgr:
48
+ return cv2.cvtColor(array, cv2.COLOR_BGR2RGB)
49
+ return array
50
+
51
+ @spaces.GPU(duration=360)
52
+ def detect_faces(
53
+ image: np.ndarray,
54
+ *,
55
+ threshold: float = 0.9,
56
+ allow_upscaling: bool = False,
57
+ model: Optional[str] = None,
58
+ assume_bgr: bool = False,
59
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
60
+ """Run RetinaFace detection on an image.
61
+
62
+ Returns bounding boxes shaped (N, 4) and confidence scores shaped (N,).
63
+ If no face is detected, both values are ``None``.
64
+ """
65
+ _ensure_retinaface_available()
66
+
67
+ rgb_image = _to_rgb_array(image, assume_bgr=assume_bgr)
68
+ bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
69
+
70
+ detections = RetinaFace.detect_faces(
71
+ bgr_image,
72
+ threshold=threshold,
73
+ model=model,
74
+ allow_upscaling=allow_upscaling,
75
+ )
76
+
77
+ if not isinstance(detections, dict) or not detections:
78
+ return None, None
79
+
80
+ boxes, scores = [], []
81
+ for face_data in detections.values():
82
+ facial_area = face_data.get("facial_area")
83
+ if facial_area is None:
84
+ continue
85
+ boxes.append(facial_area)
86
+ scores.append(face_data.get("score", 0.0))
87
+
88
+ if not boxes:
89
+ return None, None
90
+
91
+ boxes_array = np.asarray(boxes, dtype=np.float32)
92
+ scores_array = np.asarray(scores, dtype=np.float32) if scores else None
93
+
94
+ return boxes_array, scores_array
95
+
96
+ @spaces.GPU(duration=360)
97
+ def extract_faces(
98
+ image: np.ndarray,
99
+ *,
100
+ align: bool = True,
101
+ threshold: float = 0.9,
102
+ allow_upscaling: bool = False,
103
+ model: Optional[str] = None,
104
+ assume_bgr: bool = False,
105
+ ) -> Optional[np.ndarray]:
106
+ """Extract faces using RetinaFace.extract_faces for convenience."""
107
+ _ensure_retinaface_available()
108
+
109
+ rgb_image = _to_rgb_array(image, assume_bgr=assume_bgr)
110
+ bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
111
+
112
+ faces = RetinaFace.extract_faces(
113
+ bgr_image,
114
+ align=align,
115
+ threshold=threshold,
116
+ model=model,
117
+ allow_upscaling=allow_upscaling,
118
+ )
119
+
120
+ if not faces:
121
+ return None
122
+ return np.asarray([np.asarray(face, dtype=np.uint8) for face in faces])
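A minimal standalone sketch of `detect_faces` on a single frame (the image path is illustrative; it assumes the `spaces.GPU` decorator falls back gracefully outside a Space):

```python
# Sketch only: run RetinaFace detection on one RGB frame.
import numpy as np
from PIL import Image
from face_detection_utils import detect_faces

frame = np.array(Image.open("first_frame.jpg").convert("RGB"))   # (H, W, 3) RGB
boxes, scores = detect_faces(frame, threshold=0.9, allow_upscaling=False)
if boxes is None:
    print("no faces found")
else:
    print(boxes.shape, scores.shape)   # (N, 4) boxes and (N,) confidence scores
```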
look2hear/datas/transform.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###
2
+ # Author: Kai Li
3
+ # Date: 2021-06-19 22:34:13
4
+ # LastEditors: Kai Li
5
+ # LastEditTime: 2021-08-30 20:01:43
6
+ ###
7
+
8
+ import cv2
9
+ import random
10
+ import numpy as np
11
+ import torchvision
12
+
13
+ __all__ = [
14
+ "Compose",
15
+ "Normalize",
16
+ "CenterCrop",
17
+ "RgbToGray",
18
+ "RandomCrop",
19
+ "HorizontalFlip",
20
+ ]
21
+
22
+
23
+ class Compose(object):
24
+ """Compose several preprocess together.
25
+ Args:
26
+ preprocess (list of ``Preprocess`` objects): list of preprocess to compose.
27
+ """
28
+
29
+ def __init__(self, preprocess):
30
+ self.preprocess = preprocess
31
+
32
+ def __call__(self, sample):
33
+ for t in self.preprocess:
34
+ sample = t(sample)
35
+ return sample
36
+
37
+ def __repr__(self):
38
+ format_string = self.__class__.__name__ + "("
39
+ for t in self.preprocess:
40
+ format_string += "\n"
41
+ format_string += " {0}".format(t)
42
+ format_string += "\n)"
43
+ return format_string
44
+
45
+
46
+ class RgbToGray(object):
47
+ """Convert image to grayscale.
48
+ Converts a numpy.ndarray (H x W x C) in the range
49
+ [0, 255] to a numpy.ndarray of shape (H x W x C) in the range [0.0, 1.0].
50
+ """
51
+
52
+ def __call__(self, frames):
53
+ """
54
+ Args:
55
+ img (numpy.ndarray): Image to be converted to gray.
56
+ Returns:
57
+ numpy.ndarray: grey image
58
+ """
59
+ frames = np.stack([cv2.cvtColor(_, cv2.COLOR_RGB2GRAY) for _ in frames], axis=0)
60
+ return frames
61
+
62
+ def __repr__(self):
63
+ return self.__class__.__name__ + "()"
64
+
65
+
66
+ class Normalize(object):
67
+ """Normalize a ndarray image with mean and standard deviation."""
68
+
69
+ def __init__(self, mean, std):
70
+ self.mean = mean
71
+ self.std = std
72
+
73
+ def __call__(self, frames):
74
+ """
75
+ Args:
76
+ tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
77
+ Returns:
78
+ Tensor: Normalized Tensor image.
79
+ """
80
+ frames = (frames - self.mean) / self.std
81
+ return frames
82
+
83
+ def __repr__(self):
84
+ return self.__class__.__name__ + "(mean={0}, std={1})".format(
85
+ self.mean, self.std
86
+ )
87
+
88
+
89
+ class CenterCrop(object):
90
+ """Crop the given image at the center"""
91
+
92
+ def __init__(self, size):
93
+ self.size = size
94
+
95
+ def __call__(self, frames):
96
+ """
97
+ Args:
98
+ img (numpy.ndarray): Images to be cropped.
99
+ Returns:
100
+ numpy.ndarray: Cropped image.
101
+ """
102
+ t, h, w = frames.shape
103
+ th, tw = self.size
104
+ delta_w = int(round((w - tw)) / 2.0)
105
+ delta_h = int(round((h - th)) / 2.0)
106
+ frames = frames[:, delta_h : delta_h + th, delta_w : delta_w + tw]
107
+ return frames
108
+
109
+
110
+ class RandomCrop(object):
111
+ """Crop the given image at the center"""
112
+
113
+ def __init__(self, size):
114
+ self.size = size
115
+
116
+ def __call__(self, frames):
117
+ """
118
+ Args:
119
+ img (numpy.ndarray): Images to be cropped.
120
+ Returns:
121
+ numpy.ndarray: Cropped image.
122
+ """
123
+ t, h, w = frames.shape
124
+ th, tw = self.size
125
+ delta_w = random.randint(0, w - tw)
126
+ delta_h = random.randint(0, h - th)
127
+ frames = frames[:, delta_h : delta_h + th, delta_w : delta_w + tw]
128
+ return frames
129
+
130
+ def __repr__(self):
131
+ return self.__class__.__name__ + "(size={0})".format(self.size)
132
+
133
+
134
+ class HorizontalFlip(object):
135
+ """Flip image horizontally."""
136
+
137
+ def __init__(self, flip_ratio):
138
+ self.flip_ratio = flip_ratio
139
+
140
+ def __call__(self, frames):
141
+ """
142
+ Args:
143
+ img (numpy.ndarray): Images to be flipped with a probability flip_ratio
144
+ Returns:
145
+ numpy.ndarray: Flipped images.
146
+ """
147
+ t, h, w = frames.shape
148
+ if random.random() < self.flip_ratio:
149
+ for index in range(t):
150
+ frames[index] = cv2.flip(frames[index], 1)
151
+ return frames
152
+
153
+
154
+ def get_preprocessing_pipelines():
155
+ # -- preprocess for the video stream
156
+ preprocessing = {}
157
+ # -- LRW config
158
+ crop_size = (88, 88)
159
+ (mean, std) = (0.421, 0.165)
160
+ preprocessing["train"] = Compose(
161
+ [
162
+ Normalize(0.0, 255.0),
163
+ RandomCrop(crop_size),
164
+ HorizontalFlip(0.5),
165
+ Normalize(mean, std),
166
+ ]
167
+ )
168
+ preprocessing["val"] = Compose(
169
+ [Normalize(0.0, 255.0), CenterCrop(crop_size), Normalize(mean, std)]
170
+ )
171
+ preprocessing["test"] = preprocessing["val"]
172
+ return preprocessing
173
+
174
+ def get_preprocessing_opt_pipelines():
175
+ preprocessing = {}
176
+ # -- LRW config
177
+ crop_size = (88, 88)
178
+ (mean, std) = (0.421, 0.165)
179
+ preprocessing["train"] = torchvision.transforms.Compose([
180
+ torchvision.transforms.Normalize(0.0, 255.0),
181
+ torchvision.transforms.RandomCrop(crop_size),
182
+ torchvision.transforms.RandomHorizontalFlip(0.5),
183
+ torchvision.transforms.Normalize(mean, std)
184
+ ])
185
+ preprocessing["val"] = torchvision.transforms.Compose([
186
+ torchvision.transforms.Normalize(0.0, 255.0),
187
+ torchvision.transforms.CenterCrop(crop_size),
188
+ torchvision.transforms.Normalize(mean, std)
189
+ ])
190
+ preprocessing["test"] = preprocessing["val"]
191
+ return preprocessing
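A small usage sketch for the pipelines above on dummy grayscale lip frames; the 96x96 input size is an assumption about the upstream crop, and the import path assumes the `look2hear` package is importable:

```python
# Sketch only: apply the validation pipeline to dummy (T, H, W) grayscale frames.
import numpy as np
from look2hear.datas.transform import get_preprocessing_pipelines

pipelines = get_preprocessing_pipelines()
frames = np.random.randint(0, 256, size=(50, 96, 96)).astype(np.float32)  # 50 frames, 96x96
out = pipelines["val"](frames)   # scale to [0, 1], center-crop to 88x88, normalize
print(out.shape)                 # (50, 88, 88)
```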
look2hear/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .dolphin import Dolphin
look2hear/models/dolphin.py ADDED
@@ -0,0 +1,1376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dolphin Model
3
+
4
+ This implementation is inspired by and borrows concepts from Sepformer.
5
+ The original Sepformer work is licensed under the Apache-2.0 License.
6
+
7
+ References:
8
+ - SepReformer: https://github.com/dmlguq456/SepReformer
9
+ - Apache-2.0 License: https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ """
12
+
13
+ from re import S
14
+ import torch
15
+ import numpy as np
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import math
19
+ from vector_quantize_pytorch import ResidualVQ
20
+ from .video_compoent import *
21
+ from huggingface_hub import PyTorchModelHubMixin
22
+
23
+ class LayerScale(torch.nn.Module):
24
+ def __init__(self, dims, input_size, Layer_scale_init=1.0e-5):
25
+ super().__init__()
26
+ if dims == 1:
27
+ self.layer_scale = torch.nn.Parameter(torch.ones(input_size)*Layer_scale_init, requires_grad=True)
28
+ elif dims == 2:
29
+ self.layer_scale = torch.nn.Parameter(torch.ones(1,input_size)*Layer_scale_init, requires_grad=True)
30
+ elif dims == 3:
31
+ self.layer_scale = torch.nn.Parameter(torch.ones(1,1,input_size)*Layer_scale_init, requires_grad=True)
32
+
33
+ def forward(self, x):
34
+ return x*self.layer_scale
35
+
36
+ class Masking(torch.nn.Module):
37
+ def __init__(self, input_dim):
38
+ super(Masking, self).__init__()
39
+ self.gate_act = torch.nn.ReLU()
40
+
41
+ def forward(self, x, skip):
42
+ return self.gate_act(x) * skip
43
+
44
+
45
+ class FFN(torch.nn.Module):
46
+ def __init__(self, in_channels, dropout_rate, Layer_scale_init=1.0e-5):
47
+ super().__init__()
48
+ expand_factor = 3
49
+ self.net1 = torch.nn.Sequential(
50
+ torch.nn.LayerNorm(in_channels),
51
+ torch.nn.Linear(in_channels, in_channels * expand_factor))
52
+ self.depthwise = torch.nn.Conv1d(in_channels * expand_factor, in_channels * expand_factor, 3, padding=1, groups=in_channels * expand_factor)
53
+ self.net2 = torch.nn.Sequential(
54
+ torch.nn.GLU(),
55
+ torch.nn.Dropout(dropout_rate),
56
+ torch.nn.Linear(in_channels * expand_factor // 2, in_channels),
57
+ torch.nn.Dropout(dropout_rate))
58
+ self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)
59
+
60
+ def forward(self, x):
61
+ y = self.net1(x)
62
+ y = y.permute(0, 2, 1).contiguous()
63
+ y = self.depthwise(y)
64
+ y = y.permute(0, 2, 1).contiguous()
65
+ y = self.net2(y)
66
+ return x + self.Layer_scale(y)
67
+
68
+
69
+ class MultiHeadAttention(torch.nn.Module):
70
+ """
71
+ Multi-Head Attention layer.
72
+ :param int n_head: the number of heads
73
+ :param int n_feat: the number of features
74
+ :param float dropout_rate: dropout rate
75
+ """
76
+ def __init__(self, n_head: int, in_channels: int, dropout_rate: float, Layer_scale_init=1.0e-5):
77
+ super().__init__()
78
+ assert in_channels % n_head == 0
79
+ self.d_k = in_channels // n_head # We assume d_v always equals d_k
80
+ self.h = n_head
81
+ self.layer_norm = torch.nn.LayerNorm(in_channels)
82
+ self.linear_q = torch.nn.Linear(in_channels, in_channels)
83
+ self.linear_k = torch.nn.Linear(in_channels, in_channels)
84
+ self.linear_v = torch.nn.Linear(in_channels, in_channels)
85
+ self.linear_out = torch.nn.Linear(in_channels, in_channels)
86
+ self.attn = None
87
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
88
+ self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)
89
+
90
+ def forward(self, x, pos_k, mask):
91
+ """
92
+ Compute 'Scaled Dot Product Attention'.
93
+ :param torch.Tensor mask: (batch, time1, time2)
94
+ :param torch.nn.Dropout dropout:
95
+ :return torch.Tensor: attention-weighted and transformed `value` (batch, time1, d_model)
96
+ weighted by the query dot key attention (batch, head, time1, time2)
97
+ """
98
+ n_batch = x.size(0)
99
+ x = self.layer_norm(x)
100
+ q = self.linear_q(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
101
+ k = self.linear_k(x).view(n_batch, -1, self.h, self.d_k) #(b, t, d)
102
+ v = self.linear_v(x).view(n_batch, -1, self.h, self.d_k)
103
+ q = q.transpose(1, 2)
104
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
105
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
106
+ A = torch.matmul(q, k.transpose(-2, -1))
107
+ reshape_q = q.contiguous().view(n_batch * self.h, -1, self.d_k).transpose(0,1)
108
+ if pos_k is not None:
109
+ B = torch.matmul(reshape_q, pos_k.transpose(-2, -1))
110
+ B = B.transpose(0, 1).view(n_batch, self.h, pos_k.size(0), pos_k.size(1))
111
+ scores = (A + B) / math.sqrt(self.d_k)
112
+ else:
113
+ scores = A / math.sqrt(self.d_k)
114
+ if mask is not None:
115
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2)
116
+ min_value = float(np.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
117
+ scores = scores.masked_fill(mask, min_value)
118
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2)
119
+ else:
120
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
121
+ p_attn = self.dropout(self.attn)
122
+ x = torch.matmul(p_attn, v) # (batch, head, time1, d_k)
123
+ x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) # (batch, time1, d_model)
124
+ return self.Layer_scale(self.dropout(self.linear_out(x))) # (batch, time1, d_model)
125
+
126
+ class DU_MHSA(torch.nn.Module):
127
+ def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
128
+ super().__init__()
129
+ self.block = torch.nn.ModuleDict({
130
+ 'self_attn': MultiHeadAttention(
131
+ n_head=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
132
+ 'linear': torch.nn.Sequential(
133
+ torch.nn.LayerNorm(normalized_shape=in_channels),
134
+ torch.nn.Linear(in_features=in_channels, out_features=in_channels),
135
+ torch.nn.Sigmoid())
136
+ })
137
+
138
+ def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
139
+ """
140
+ Compute encoded features.
141
+ :param torch.Tensor x: encoded source features (batch, max_time_in, size)
142
+ :param torch.Tensor mask: mask for x (batch, max_time_in)
143
+ :rtype: Tuple[torch.Tensor, torch.Tensor]
144
+ """
145
+ down_len = pos_k.shape[0]
146
+ x_down = torch.nn.functional.adaptive_avg_pool1d(input=x, output_size=down_len)
147
+ x = x.permute([0, 2, 1])
148
+ x_down = x_down.permute([0, 2, 1])
149
+ x_down = self.block['self_attn'](x_down, pos_k, None)
150
+ x_down = x_down.permute([0, 2, 1])
151
+ x_downup = torch.nn.functional.upsample(input=x_down, size=x.shape[1])
152
+ x_downup = x_downup.permute([0, 2, 1])
153
+ x = x + self.block['linear'](x) * x_downup
154
+
155
+ return x
156
+
157
+ class Heat1D(nn.Module):
158
+ """
159
+ 1D Heat Equation Adaptation:
160
+ du/dt - k d²u/dx² = 0;
161
+ du/dx_{x=0, x=a} = 0
162
+ =>
163
+ A_n = C(a, n==0) * sum_{0}^{a} { \phi(x) cos(n π / a x) dx }
164
+ core = cos(n π / a x) exp(- (n π / a)^2 k t)
165
+ u_{x, t} = sum_{0}^{\infinite} { core }
166
+
167
+ Assume a = T; x in [0, T]; n in [0, T]; with some slight changes
168
+ =>
169
+ (\phi(x) = linear(dwconv(input(x))))
170
+ A(n) = DCT1D(\phi(x))
171
+ u(x, t) = IDCT1D(A(n) * exp(- (n π / a)^2 kt))
172
+ """
173
+ def __init__(self, dim=96, hidden_dim=96, **kwargs):
174
+ super().__init__()
175
+ self.dwconv = nn.Conv1d(dim, hidden_dim, kernel_size=3, padding=1, groups=hidden_dim)
176
+ self.hidden_dim = hidden_dim
177
+ self.linear = nn.Conv1d(hidden_dim, 2 * hidden_dim, kernel_size=3, padding=1, groups=hidden_dim)
178
+ self.out_norm = nn.LayerNorm(hidden_dim)
179
+ self.out_linear = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1, groups=hidden_dim)
180
+ self.to_k = nn.Sequential(
181
+ nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1, groups=hidden_dim),
182
+ nn.GELU(),
183
+ )
184
+
185
+ self.k = nn.Parameter(torch.ones(hidden_dim))
186
+
187
+ @staticmethod
188
+ def get_cos_map(N=224, device=torch.device("cpu"), dtype=torch.float):
189
+ # cos((x + 0.5) / N * n * π) which is also the form of DCT and IDCT
190
+ # DCT: F(n) = sum( (sqrt(2/N) if n > 0 else sqrt(1/N)) * cos((x + 0.5) / N * n * π) * f(x) )
191
+ # IDCT: f(x) = sum( (sqrt(2/N) if n > 0 else sqrt(1/N)) * cos((x + 0.5) / N * n * π) * F(n) )
192
+ # returns: (Res_n, Res_x)
193
+ weight_x = (torch.linspace(0, N - 1, N, device=device, dtype=dtype).view(1, -1) + 0.5) / N
194
+ weight_n = torch.linspace(0, N - 1, N, device=device, dtype=dtype).view(-1, 1)
195
+ weight = torch.cos(weight_n * weight_x * torch.pi) * math.sqrt(2 / N)
196
+ weight[0, :] = weight[0, :] / math.sqrt(2)
197
+ return weight
198
+
199
+ @staticmethod
200
+ def get_decay_map(resolution=224, device=torch.device("cpu"), dtype=torch.float):
201
+ # exp(- (n π / T)^2) for 1D
202
+ # returns: (Res_t,)
203
+ res_t = resolution
204
+ weight_n = torch.linspace(0, torch.pi, res_t + 1, device=device, dtype=dtype)[:res_t]
205
+ weight = torch.pow(weight_n, 2)
206
+ weight = torch.exp(-weight)
207
+ return weight
208
+
209
+ def forward(self, x: torch.Tensor, freq_embed=None):
210
+ B, T, C = x.shape
211
+ x = x.transpose(1, 2) # [B, T, C] -> [B, C, T]
212
+ x = self.dwconv(x) # [B, hidden_dim, T]
213
+
214
+ x = self.linear(x) # [B, 2 * hidden_dim, T]
215
+ x, z = x.chunk(chunks=2, dim=1) # [B, hidden_dim, T], [B, hidden_dim, T]
216
+
217
+ if (T == getattr(self, "__RES__", 0)) and (getattr(self, "__WEIGHT_COSN__", None).device == x.device):
218
+ weight_cosn = getattr(self, "__WEIGHT_COSN__", None)
219
+ weight_exp = getattr(self, "__WEIGHT_EXP__", None)
220
+ assert weight_cosn is not None
221
+ assert weight_exp is not None
222
+ else:
223
+ weight_cosn = self.get_cos_map(T, device=x.device).detach_()
224
+ weight_exp = self.get_decay_map(T, device=x.device).detach_()
225
+ setattr(self, "__RES__", T)
226
+ setattr(self, "__WEIGHT_COSN__", weight_cosn)
227
+ setattr(self, "__WEIGHT_EXP__", weight_exp)
228
+
229
+ N = weight_cosn.shape[0] # N == T
230
+
231
+ x = x.transpose(1, 2).contiguous() # [B, T, hidden_dim]
232
+
233
+ x = F.conv1d(x.contiguous().view(B, T, -1), weight_cosn.contiguous().view(N, T, 1)) # [B, N, hidden_dim]
234
+
235
+ weight_exp = torch.pow(weight_exp[:, None], self.k)
236
+ x = torch.einsum("bnc,nc->bnc", x, weight_exp) # exp decay
237
+
238
+ x = F.conv1d(x.contiguous().view(B, N, -1), weight_cosn.t().contiguous().view(T, N, 1)) # [B, T, hidden_dim]
239
+
240
+ x = self.out_norm(x) # [B, T, hidden_dim]
241
+
242
+ z = z.transpose(1, 2).contiguous() # [B, T, hidden_dim]
243
+ x = x * nn.functional.silu(z) # [B, T, hidden_dim]
244
+
245
+ x = x.transpose(1, 2).contiguous() # [B, hidden_dim, T]
246
+ x = self.out_linear(x) # [B, hidden_dim, T]
247
+
248
+ x = x.transpose(1, 2).contiguous() # [B, T, hidden_dim]
249
+
250
+ return x
251
+
252
+ class CLA(torch.nn.Module):
253
+ def __init__(self, in_channels, kernel_size, dropout_rate, Layer_scale_init=1.0e-5):
254
+ super().__init__()
255
+ # self.layer_norm = torch.nn.LayerNorm(in_channels)
256
+ self.heat1d = Heat1D(in_channels, in_channels)
257
+ self.GN1 = torch.nn.GroupNorm(1, in_channels)
258
+ self.dw_conv_1d = torch.nn.Conv1d(in_channels, in_channels, kernel_size, padding='same', groups=in_channels)
259
+ self.GN2 = torch.nn.GroupNorm(1, in_channels)
260
+ self.linear3 = torch.nn.Sequential(
261
+ torch.nn.GELU(),
262
+ torch.nn.Conv1d(in_channels, in_channels, kernel_size=3, padding=1, groups=in_channels),
263
+ torch.nn.Dropout(dropout_rate))
264
+ self.Layer_scale = LayerScale(dims=3, input_size=in_channels, Layer_scale_init=Layer_scale_init)
265
+
266
+ def forward(self, x):
267
+ # y = self.layer_norm(x)
268
+ y = self.heat1d(x)
269
+ y = y.permute([0, 2, 1]) # B, F, T
270
+ y = self.GN1(y)
271
+ y = self.dw_conv_1d(y)
272
+ y = self.linear3(y)
273
+ y = self.GN2(y) # [B, in_channels, T]
274
+ y = y.permute(0, 2, 1) # B, T, in_channels
275
+ return x + self.Layer_scale(y)
276
+
277
+ class GlobalBlock(torch.nn.Module):
278
+ def __init__(self, in_channels: int, num_mha_heads: int, dropout_rate: float):
279
+ super().__init__()
280
+ self.block = torch.nn.ModuleDict({
281
+ 'DU_MHSA': DU_MHSA(
282
+ num_mha_heads=num_mha_heads, in_channels=in_channels, dropout_rate=dropout_rate),
283
+ 'FFN': FFN(in_channels=in_channels, dropout_rate=dropout_rate)
284
+ })
285
+
286
+ def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
287
+ """
288
+ Compute encoded features.
289
+ :param torch.Tensor x: encoded source features (batch, max_time_in, size)
290
+ :param torch.Tensor mask: mask for x (batch, max_time_in)
291
+ :rtype: Tuple[torch.Tensor, torch.Tensor]
292
+ """
293
+ x = self.block['DU_MHSA'](x, pos_k)
294
+ x = self.block['FFN'](x)
295
+ x = x.permute([0, 2, 1])
296
+
297
+ return x
298
+
299
+
300
+ class LocalBlock(torch.nn.Module):
301
+ def __init__(self, in_channels: int, kernel_size: int, dropout_rate: float):
302
+ super().__init__()
303
+ self.block = torch.nn.ModuleDict({
304
+ 'CLA': CLA(in_channels, kernel_size, dropout_rate),
305
+ 'FFN': FFN(in_channels, dropout_rate)
306
+ })
307
+
308
+ def forward(self, x: torch.Tensor):
309
+ x = self.block['CLA'](x)
310
+ x = self.block['FFN'](x)
311
+
312
+ return x
313
+
314
+ class AudioEncoder(torch.nn.Module):
315
+ def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, groups: int, bias: bool):
316
+ super().__init__()
317
+ self.conv1d = torch.nn.Conv1d(
318
+ in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, groups=groups, bias=bias)
319
+ self.gelu = torch.nn.GELU()
320
+
321
+ def forward(self, x: torch.Tensor):
322
+ x = torch.unsqueeze(x, dim=0) if len(x.shape) == 1 else torch.unsqueeze(x, dim=1) # [T] - >[1, T] OR [B, T] -> [B, 1, T]
323
+ x = self.conv1d(x)
324
+ x = self.gelu(x)
325
+ return x
326
+
327
+ class FeatureProjector(torch.nn.Module):
328
+ def __init__(self, num_channels: int, in_channels: int, out_channels: int, kernel_size: int, bias: bool):
329
+ super().__init__()
330
+ self.norm = torch.nn.GroupNorm(num_groups=1, num_channels=num_channels, eps=1e-8)
331
+ self.conv1d = torch.nn.Conv1d(
332
+ in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, bias=bias)
333
+
334
+ def forward(self, x: torch.Tensor):
335
+ x = self.norm(x)
336
+ x = self.conv1d(x)
337
+ return x
338
+
339
+ class HeatConvNorm(nn.Module):
340
+ """
341
+ This class defines a Heat1D-based convolution layer followed by normalization
342
+ """
343
+
344
+ def __init__(
345
+ self, nIn, nOut, kSize, stride=1, groups=1, bias=True, norm_type="gLN"
346
+ ):
347
+ """
348
+ :param nIn: number of input channels
349
+ :param nOut: number of output channels
350
+ :param kSize: kernel size
351
+ :param stride: stride rate for down-sampling. Default is 1
352
+ """
353
+ super().__init__()
354
+ padding = int((kSize - 1) / 2)
355
+ self.conv = Heat1D(
356
+ nIn, nOut, groups=groups
357
+ )
358
+ if norm_type == "gLN":
359
+ self.norm = nn.GroupNorm(1, nOut, eps=1e-8)
360
+ if norm_type == "BN":
361
+ self.norm = nn.BatchNorm1d(nOut)
362
+
363
+ def forward(self, input):
364
+ input = input.permute(0, 2, 1)
365
+ output = self.conv(input).permute(0, 2, 1)
366
+ return self.norm(output)
367
+
368
+ class ConvNorm(nn.Module):
369
+ """
370
+ This class defines the convolution layer with normalization (no activation)
371
+ """
372
+
373
+ def __init__(
374
+ self, nIn, nOut, kSize, stride=1, groups=1, bias=True, norm_type="gLN"
375
+ ):
376
+ """
377
+ :param nIn: number of input channels
378
+ :param nOut: number of output channels
379
+ :param kSize: kernel size
380
+ :param stride: stride rate for down-sampling. Default is 1
381
+ """
382
+ super().__init__()
383
+ padding = int((kSize - 1) / 2)
384
+ self.conv = nn.Conv1d(
385
+ nIn, nOut, kSize, stride=stride, padding=padding, bias=bias, groups=groups
386
+ )
387
+ if norm_type == "gLN":
388
+ self.norm = nn.GroupNorm(1, nOut, eps=1e-8)
389
+ if norm_type == "BN":
390
+ self.norm = nn.BatchNorm1d(nOut)
391
+
392
+ def forward(self, input):
393
+ output = self.conv(input)
394
+ return self.norm(output)
395
+
396
+ class AVFModule(nn.Module):
397
+ """
398
+ 1D Attention Fusion Cell: tensor_b guides the key & value of tensor_a:
399
+ Input:
400
+ tensor_a: [B, Ca, T]
401
+ tensor_b: [B, Cb, Tb]
402
+ Output:
403
+ [B, Ca, T]
404
+ """
405
+ def __init__(self,
406
+ in_chan_a: int,
407
+ in_chan_b: int,
408
+ kernel_size: int = 1):
409
+ super().__init__()
410
+ self.in_chan_a = in_chan_a
411
+ self.in_chan_b = in_chan_b
412
+ self.kernel_size = kernel_size
413
+ # audio key embedding (depthwise 1×1)
414
+ self.key_embed = ConvNormAct(
415
+ nIn=in_chan_a, nOut=in_chan_a, kSize=1,
416
+ groups=in_chan_a, norm_type="gLN"
417
+ )
418
+ # audio value embedding (depthwise 1×1)
419
+ self.value_embed = ConvNormAct(
420
+ nIn=in_chan_a, nOut=in_chan_a, kSize=1,
421
+ groups=in_chan_a, norm_type="gLN"
422
+ )
423
+
424
+ self.resize = ConvNormAct(
425
+ nIn=in_chan_b, nOut=in_chan_a, kSize=1,
426
+ norm_type="gLN"
427
+ )
428
+
429
+ self.attention_embed = ConvNormAct(
430
+ nIn=in_chan_b,
431
+ nOut=in_chan_a * kernel_size,
432
+ kSize=1,
433
+ groups=in_chan_b,
434
+ norm_type="gLN"
435
+ )
436
+
437
+ def forward(self, tensor_a: torch.Tensor, tensor_b: torch.Tensor):
438
+ """
439
+ tensor_a: [B, Ca, T]
440
+ tensor_b: [B, Cb, Tb]
441
+ """
442
+ B, Ca, T = tensor_a.shape
443
+ # 1) Use video to guide key_embed
444
+ b2a = self.resize(tensor_b) # [B, Ca, Tb]
445
+ b2a = F.interpolate(b2a, size=T, mode="nearest") # [B, Ca, T]
446
+ k1 = self.key_embed(tensor_a) * b2a # [B, Ca, T]
447
+ # 2) audio value
448
+ v = self.value_embed(tensor_a) # [B, Ca, T]
449
+ # 3) Calculate attention scores
450
+ att = self.attention_embed(tensor_b) # [B, Ca*kernel, Tb]
451
+ # reshape → [B, Ca, kernel, Tb]
452
+ att = att.view(B, Ca, self.kernel_size, -1)
453
+ att = att.mean(dim=2)
454
+ att = torch.softmax(att, dim=-1) # [B, Ca, Tb]
455
+ att = F.interpolate(att, size=T, mode="nearest") # [B, Ca, T]
456
+ # 4) k2 = attention * value
457
+ k2 = att * v
458
+
459
+ fused = k1 + k2 # [B, Ca, T]
460
+ return fused
461
+
462
+ class RelativePositionalEncoding(torch.nn.Module):
463
+ def __init__(self, in_channels: int, num_heads: int, maxlen: int, embed_v=False):
464
+ super().__init__()
465
+ self.in_channels = in_channels
466
+ self.num_heads = num_heads
467
+ self.embedding_dim = self.in_channels // self.num_heads
468
+ self.maxlen = maxlen
469
+ self.pe_k = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim)
470
+ self.pe_v = torch.nn.Embedding(num_embeddings=2*maxlen, embedding_dim=self.embedding_dim) if embed_v else None
471
+
472
+ def forward(self, pos_seq: torch.Tensor):
473
+ pos_seq.clamp_(-self.maxlen, self.maxlen - 1)
474
+ pos_seq += self.maxlen
475
+ pe_k_output = self.pe_k(pos_seq)
476
+ pe_v_output = self.pe_v(pos_seq) if self.pe_v is not None else None
477
+ return pe_k_output, pe_v_output
478
+
479
+ class DownConvLayer(torch.nn.Module):
480
+ def __init__(self, in_channels: int, samp_kernel_size: int):
481
+ """Construct an EncoderLayer object."""
482
+ super().__init__()
483
+ self.down_conv = torch.nn.Conv1d(
484
+ in_channels=in_channels, out_channels=in_channels, kernel_size=samp_kernel_size, stride=2, padding=(samp_kernel_size-1)//2, groups=in_channels)
485
+ self.GN = nn.GroupNorm(1, num_channels=in_channels)
486
+ self.gelu = torch.nn.GELU()
487
+
488
+ def forward(self, x: torch.Tensor):
489
+ x = x.permute([0, 2, 1])
490
+ x = self.down_conv(x)
491
+ x = self.GN(x)
492
+ x = self.gelu(x)
493
+ x = x.permute([0, 2, 1])
494
+ return x
495
+
496
+ class ConvNormAct(nn.Module):
497
+ """
498
+ This class defines the convolution layer with normalization and a PReLU
499
+ activation
500
+ """
501
+
502
+ def __init__(self, nIn, nOut, kSize, stride=1, groups=1, norm_type="gLN"):
503
+ """
504
+ :param nIn: number of input channels
505
+ :param nOut: number of output channels
506
+ :param kSize: kernel size
507
+ :param stride: stride rate for down-sampling. Default is 1
508
+ """
509
+ super().__init__()
510
+ padding = int((kSize - 1) / 2)
511
+ self.conv = nn.Conv1d(
512
+ nIn, nOut, kSize, stride=stride, padding=padding, bias=True, groups=groups
513
+ )
514
+ if norm_type == "gLN":
515
+ self.norm = nn.GroupNorm(1, nOut, eps=1e-8)
516
+ if norm_type == "BN":
517
+ self.norm = nn.BatchNorm1d(nOut)
518
+ self.act = nn.PReLU()
519
+
520
+ def forward(self, input):
521
+ output = self.conv(input)
522
+ output = self.norm(output)
523
+ return self.act(output)
524
+
525
+ class DilatedConvNorm(nn.Module):
526
+ """
527
+ This class defines the dilated convolution with normalized output.
528
+ """
529
+
530
+ def __init__(self, nIn, nOut, kSize, stride=1, d=1, groups=1, norm_type="gLN"):
531
+ """
532
+ :param nIn: number of input channels
533
+ :param nOut: number of output channels
534
+ :param kSize: kernel size
535
+ :param stride: optional stride rate for down-sampling
536
+ :param d: optional dilation rate
537
+ """
538
+ super().__init__()
539
+ self.conv = nn.Conv1d(
540
+ nIn,
541
+ nOut,
542
+ kSize,
543
+ stride=stride,
544
+ dilation=d,
545
+ padding=((kSize - 1) // 2) * d,
546
+ groups=groups,
547
+ )
548
+ # self.norm = nn.GroupNorm(1, nOut, eps=1e-08)
549
+ if norm_type == "gLN":
550
+ self.norm = nn.GroupNorm(1, nOut, eps=1e-8)
551
+ if norm_type == "BN":
552
+ self.norm = nn.BatchNorm1d(nOut)
553
+
554
+ def forward(self, input):
555
+ output = self.conv(input)
556
+ return self.norm(output)
557
+
558
+ class Mlp(nn.Module):
559
+ def __init__(self, in_features, hidden_size, drop=0.1, norm_type="gLN"):
560
+ super().__init__()
561
+ self.fc1 = ConvNorm(
562
+ in_features, hidden_size, 1, bias=False, norm_type=norm_type
563
+ )
564
+ self.dwconv = nn.Conv1d(
565
+ hidden_size, hidden_size, 5, 1, 2, bias=True, groups=hidden_size
566
+ )
567
+ self.act = nn.ReLU()
568
+ self.fc2 = ConvNorm(
569
+ hidden_size, in_features, 1, bias=False, norm_type=norm_type
570
+ )
571
+ self.drop = nn.Dropout(drop)
572
+
573
+ def forward(self, x):
574
+ x = self.fc1(x)
575
+ x = self.dwconv(x)
576
+ x = self.act(x)
577
+ x = self.drop(x)
578
+ x = self.fc2(x)
579
+ x = self.drop(x)
580
+ return x
581
+
582
+
583
+ class InjectionMultiSum(nn.Module):
584
+ def __init__(self, inp: int, oup: int, kernel: int = 1, norm_type="gLN") -> None:
585
+ super().__init__()
586
+ groups = 1
587
+ if inp == oup:
588
+ groups = inp
589
+ self.local_embedding = HeatConvNorm(
590
+ inp, oup, kernel, groups=groups, bias=False, norm_type=norm_type
591
+ )
592
+ self.global_embedding = HeatConvNorm(
593
+ inp, oup, kernel, groups=groups, bias=False, norm_type=norm_type
594
+ )
595
+ self.global_act = HeatConvNorm(
596
+ inp, oup, kernel, groups=groups, bias=False, norm_type=norm_type
597
+ )
598
+ self.act = nn.Sigmoid()
599
+
600
+ def forward(self, x_l, x_g):
601
+ """
602
+ x_g: global features
603
+ x_l: local features
604
+ """
605
+ B, N, T = x_l.shape
606
+ local_feat = self.local_embedding(x_l)
607
+
608
+ global_act = self.global_act(x_g)
609
+ sig_act = torch.nn.functional.interpolate(self.act(global_act), size=T, mode="nearest")
610
+ # sig_act = self.act(global_act)
611
+
612
+ global_feat = self.global_embedding(x_g)
613
+ global_feat = torch.nn.functional.interpolate(global_feat, size=T, mode="nearest")
614
+
615
+ out = local_feat * sig_act + global_feat
616
+ return out
617
+
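+ # Shape sketch for InjectionMultiSum (sizes are illustrative; relies on the conv-norm helpers defined earlier in this file):
+ # fuse = InjectionMultiSum(inp=512, oup=512) # depthwise 1x1 embeddings when inp == oup
+ # x_l = torch.randn(2, 512, 1000) # local features at full resolution
+ # x_g = torch.randn(2, 512, 125) # coarser global features
+ # y = fuse(x_l, x_g) # [2, 512, 1000]: local * sigmoid(global) + global, both matched to T=1000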
618
+ class UConvBlock(nn.Module):
619
+ """
620
+ This class defines the block which performs successive downsampling and
621
+ upsampling in order to be able to analyze the input features in multiple
622
+ resolutions.
623
+ """
624
+
625
+ def __init__(
626
+ self, out_channels=128, in_channels=512, upsampling_depth=4, norm_type="gLN"
627
+ ):
628
+ super().__init__()
629
+ self.proj_1x1 = ConvNormAct(out_channels, in_channels, 1, stride=1, groups=1, norm_type=norm_type)
630
+ self.depth = upsampling_depth
631
+ self.spp_dw = nn.ModuleList()
632
+ self.spp_dw.append(
633
+ DilatedConvNorm(
634
+ in_channels, in_channels, kSize=5, stride=1, groups=in_channels, d=1, norm_type=norm_type
635
+ )
636
+ )
637
+ for i in range(1, upsampling_depth):
638
+ self.spp_dw.append(
639
+ DilatedConvNorm(
640
+ in_channels,
641
+ in_channels,
642
+ kSize=5,
643
+ stride=2,
644
+ groups=in_channels,
645
+ d=1,
646
+ norm_type=norm_type
647
+ )
648
+ )
649
+
650
+ self.loc_glo_fus = nn.ModuleList([])
651
+ for i in range(upsampling_depth):
652
+ self.loc_glo_fus.append(InjectionMultiSum(in_channels, in_channels, norm_type=norm_type))
653
+
654
+ self.res_conv = nn.Conv1d(in_channels, out_channels, 1)
655
+
656
+ self.globalatt = Mlp(in_channels, in_channels, drop=0.1)
657
+
658
+ self.last_layer = nn.ModuleList([])
659
+ for i in range(self.depth - 1):
660
+ self.last_layer.append(InjectionMultiSum(in_channels, in_channels, 5, norm_type=norm_type))
661
+
662
+ def forward(self, x):
663
+ """
664
+ :param x: input feature map
665
+ :return: transformed feature map
666
+ """
667
+ residual = x.clone()
668
+ # Reduce --> project high-dimensional feature maps to low-dimensional space
669
+ output1 = self.proj_1x1(x)
670
+ output = [self.spp_dw[0](output1)]
671
+
672
+ # Do the downsampling process from the previous level
673
+ for k in range(1, self.depth):
674
+ out_k = self.spp_dw[k](output[-1])
675
+ output.append(out_k)
676
+
677
+ # global features
678
+ global_f = torch.zeros(
679
+ output[-1].shape, requires_grad=True, device=output1.device
680
+ )
681
+ for fea in output:
682
+ global_f = global_f + torch.nn.functional.adaptive_avg_pool1d(
683
+ fea, output_size=output[-1].shape[-1]
684
+ )
685
+ # global_f = global_f + fea
686
+ global_f = self.globalatt(global_f) # [B, N, T]
687
+
688
+ x_fused = []
689
+ # Gather them now in reverse order
690
+ for idx in range(self.depth):
691
+ local = output[idx]
692
+ x_fused.append(self.loc_glo_fus[idx](local, global_f))
693
+
694
+ expanded = None
695
+ for i in range(self.depth - 2, -1, -1):
696
+ if i == self.depth - 2:
697
+ expanded = self.last_layer[i](x_fused[i], x_fused[i - 1])
698
+ else:
699
+ expanded = self.last_layer[i](x_fused[i], expanded)
700
+ # import pdb; pdb.set_trace()
701
+ return self.res_conv(expanded) + residual
702
+
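+ # UConvBlock flow, with illustrative shapes (assumptions, not fixed by the code):
+ # x [B, 128, T] --proj_1x1--> [B, 512, T] --stride-2 depthwise convs--> pyramid [T, T/2, T/4, ...];
+ # every level is pooled to the coarsest length, summed, and refined by the Mlp "globalatt";
+ # InjectionMultiSum re-injects this global summary at each scale before the expansion path,
+ # and res_conv plus the residual connection brings the result back to [B, 128, T].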
703
+ class EncoderLayer(torch.nn.Module):
704
+ def __init__(self, global_blocks: dict, local_blocks: dict, down_conv_layer: dict, down_conv=True):
705
+ super().__init__()
706
+
707
+ self.g_block_1 = GlobalBlock(**global_blocks)
708
+ self.l_block_1 = LocalBlock(**local_blocks)
709
+
710
+ self.g_block_2 = GlobalBlock(**global_blocks)
711
+ self.l_block_2 = LocalBlock(**local_blocks)
712
+
713
+ self.downconv = DownConvLayer(**down_conv_layer) if down_conv else None
714
+
715
+ def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
716
+ '''
717
+ x: [B, N, T]
718
+ '''
719
+ x = self.g_block_1(x, pos_k)
720
+ x = x.permute(0, 2, 1).contiguous()
721
+ x = self.l_block_1(x)
722
+ x = x.permute(0, 2, 1).contiguous()
723
+
724
+ x = self.g_block_2(x, pos_k)
725
+ x = x.permute(0, 2, 1).contiguous()
726
+ x = self.l_block_2(x)
727
+ x = x.permute(0, 2, 1).contiguous()
728
+
729
+ skip = x
730
+ if self.downconv:
731
+ x = x.permute(0, 2, 1).contiguous()
732
+ x = self.downconv(x)
733
+ x = x.permute(0, 2, 1).contiguous()
734
+ # [BK, S, N]
735
+ return x, skip
736
+
737
+ class DecoderLayer(torch.nn.Module):
738
+ def __init__(self, global_blocks: dict, local_blocks: dict, spk_attention: dict):
739
+ super().__init__()
740
+
741
+ self.g_block_1 = GlobalBlock(**global_blocks)
742
+ self.l_block_1 = LocalBlock(**local_blocks)
743
+
744
+ self.g_block_2 = GlobalBlock(**global_blocks)
745
+ self.l_block_2 = LocalBlock(**local_blocks)
746
+
747
+ self.g_block_3 = GlobalBlock(**global_blocks)
748
+ self.l_block_3 = LocalBlock(**local_blocks)
749
+
750
+ def forward(self, x: torch.Tensor, pos_k: torch.Tensor):
751
+ '''
752
+ x: [B, N, T]
753
+ '''
754
+ # [BS, K, H]
755
+ x = self.g_block_1(x, pos_k)
756
+ x = x.permute(0, 2, 1).contiguous()
757
+ x = self.l_block_1(x)
758
+ x = x.permute(0, 2, 1).contiguous()
759
+
760
+ x = self.g_block_2(x, pos_k)
761
+ x = x.permute(0, 2, 1).contiguous()
762
+ x = self.l_block_2(x)
763
+ x = x.permute(0, 2, 1).contiguous()
764
+
765
+ x = self.g_block_3(x, pos_k)
766
+ x = x.permute(0, 2, 1).contiguous()
767
+ x = self.l_block_3(x)
768
+ x = x.permute(0, 2, 1).contiguous()
769
+
770
+ skip = x
771
+
772
+ return x, skip
773
+
774
+ class Separator(torch.nn.Module):
775
+ def __init__(self, num_stages: int, relative_positional_encoding: dict, enc_stage: dict, simple_fusion:dict, dec_stage: dict):
776
+ super().__init__()
777
+
778
+ self.num_stages = num_stages
779
+ self.pos_emb = RelativePositionalEncoding(**relative_positional_encoding)
780
+
781
+ # Temporal Contracting Part
782
+ self.enc_stages = torch.nn.ModuleList([])
783
+ for _ in range(self.num_stages):
784
+ self.enc_stages.append(EncoderLayer(**enc_stage, down_conv=True))
785
+
786
+ self.bottleneck_G = nn.ModuleList([
787
+ MultiHeadAttention(
788
+ n_head=enc_stage['global_blocks']['num_mha_heads'],
789
+ in_channels=enc_stage['global_blocks']['in_channels'],
790
+ dropout_rate=enc_stage['global_blocks']['dropout_rate']
791
+ ),
792
+ FFN(
793
+ in_channels=enc_stage['global_blocks']['in_channels'],
794
+ dropout_rate=enc_stage['global_blocks']['dropout_rate']
795
+ )
796
+ ])
797
+
798
+ # top-down fusion
799
+ self.loc_glo_fus = nn.ModuleList([])
800
+ for i in range(self.num_stages):
801
+ self.loc_glo_fus.append(InjectionMultiSum(simple_fusion['out_channels'], simple_fusion['out_channels']))
802
+
803
+ # Temporal Expanding Part
804
+ self.simple_fusion = torch.nn.ModuleList([])
805
+ self.dec_stages = torch.nn.ModuleList([])
806
+ for _ in range(self.num_stages):
807
+ self.simple_fusion.append(InjectionMultiSum(simple_fusion['out_channels'], simple_fusion['out_channels'], kernel=5))
808
+ self.dec_stages.append(DecoderLayer(**dec_stage))
809
+
810
+ def forward(self, input: torch.Tensor):
811
+ '''input: [B, N, L]'''
812
+ # feature projection
813
+ x, _ = self.pad_signal(input)
814
+ len_x = x.shape[-1]
815
+ # Temporal Contracting Part
816
+ min_len = len_x//2**(self.num_stages-1)
817
+ pos_seq = torch.arange(0, len_x//2**self.num_stages).long().to(x.device)
818
+ pos_seq = pos_seq[:, None] - pos_seq[None, :]
819
+ pos_k, _ = self.pos_emb(pos_seq)
820
+
821
+ skip = []
822
+ fusion_x = torch.zeros([x.shape[0], x.shape[1], min_len], requires_grad=True, device=x.device)
823
+ for idx in range(self.num_stages):
824
+ x, skip_ = self.enc_stages[idx](x, pos_k)
825
+ skip.append(skip_)
826
+ fusion_x = fusion_x + F.adaptive_avg_pool1d(x, min_len)
827
+
828
+ global_x = self.bottleneck_G[0](fusion_x.permute(0, 2, 1).contiguous(), None, None)
829
+ global_x = self.bottleneck_G[1](global_x).permute(0, 2, 1).contiguous()
830
+
831
+ # Global topdown attention
832
+ fusion_skip = []
833
+ for idx in range(self.num_stages):
834
+ fusion_skip.append(self.loc_glo_fus[idx](skip[idx], global_x))
835
+
836
+ each_stage_outputs = []
837
+ # Temporal Expanding Part
838
+ for idx in range(self.num_stages):
839
+ each_stage_outputs.append(x)
840
+ idx_en = self.num_stages - (idx + 1)
841
+ x = self.simple_fusion[idx](fusion_skip[idx_en], x)
842
+ x, _ = self.dec_stages[idx](x, pos_k)
843
+
844
+ last_stage_output = x
845
+ return last_stage_output, each_stage_outputs
846
+
847
+ def pad_signal(self, input: torch.Tensor):
848
+ # (B, T) or (B, 1, T)
849
+ if input.dim() == 1: input = input.unsqueeze(0)
850
+ elif input.dim() not in [2, 3]: raise RuntimeError("Input can only be 2 or 3 dimensional.")
851
+ elif input.dim() == 2: input = input.unsqueeze(1)
852
+ L = 2**self.num_stages
853
+ batch_size = input.size(0)
854
+ ndim = input.size(1)
855
+ nframe = input.size(2)
856
+ padded_len = (nframe//L + 1)*L
857
+ rest = 0 if nframe%L == 0 else padded_len - nframe
858
+ if rest > 0:
859
+ pad = torch.zeros(batch_size, ndim, rest, dtype=input.dtype, device=input.device)
860
+ input = torch.cat([input, pad], dim=-1)
861
+ return input, rest
862
+
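+ # Worked padding example (numbers are illustrative assumptions):
+ # with num_stages = 4, L = 2**4 = 16; an input of nframe = 1000 gives
+ # padded_len = (1000 // 16 + 1) * 16 = 1008 and rest = 8 zero samples appended,
+ # so every encoder stage can halve the time axis without a remainder.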
863
+
864
+ class OutputLayer(torch.nn.Module):
865
+ def __init__(self, in_channels: int, out_channels: int, masking: bool = False):
866
+ super().__init__()
867
+ # feature expansion back
868
+ self.masking = masking
869
+ self.spe_block = Masking(in_channels)
870
+ self.end_conv1x1 = torch.nn.Sequential(
871
+ torch.nn.Linear(out_channels, 4*out_channels),
872
+ torch.nn.GLU(),
873
+ torch.nn.Linear(2*out_channels, in_channels))
874
+
875
+ def forward(self, x: torch.Tensor, input: torch.Tensor):
876
+ x = x[...,:input.shape[-1]]
877
+ x = x.permute([0, 2, 1])
878
+ x = self.end_conv1x1(x)
879
+ x = x.permute([0, 2, 1])
880
+
881
+ if self.masking:
882
+ x = self.spe_block(x, input)
883
+
884
+ return x
885
+
886
+ class AudioDecoder(torch.nn.ConvTranspose1d):
887
+ def __init__(self, *args, **kwargs):
888
+ super().__init__(*args, **kwargs)
889
+
890
+ def forward(self, x):
891
+ # x: [B, N, L]
892
+ if x.dim() not in [2, 3]:
893
+ raise RuntimeError("{} accepts only 2/3D tensors as input".format(self.__class__.__name__))
894
+ x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))
895
+ x = torch.squeeze(x, dim=1) if torch.squeeze(x).dim() == 1 else torch.squeeze(x)
896
+ return x
897
+
898
+ class ReconstructionPath(nn.Module):
899
+ def __init__(
900
+ self,
901
+ layers = [
902
+ 'residual',
903
+ 'residual',
904
+ 'residual'
905
+ ],
906
+ image_size=88,
907
+ in_channel=1,
908
+ init_channel=16,
909
+ max_dim=128,
910
+ # conv-related settings
911
+ input_conv_kernel_size = [7, 7, 7],
912
+ output_conv_kernel_size = [3, 3, 3],
913
+ residual_conv_kernel_size=3,
914
+ pad_mode="constant",
915
+ # attention-related settings
916
+ attn_dim_head = 32,
917
+ attn_heads = 8,
918
+ attn_dropout = 0.,
919
+ flash_attn = True,
920
+ linear_attn_dim_head = 8,
921
+ linear_attn_heads = 16,
922
+ fuse_dim=32,
923
+ # quantizer-related settings
924
+ num_quantizers = 1,
925
+ codebook_size = 256,
926
+ codebook_dim=64,
927
+ commitment_cost=0.25,
928
+ ):
929
+ super().__init__()
930
+ input_conv_kernel_size=tuple(input_conv_kernel_size)
931
+
932
+ self.conv_in = nn.Conv3d(in_channel, init_channel, input_conv_kernel_size,padding='same')
933
+
934
+ layer_fmap_size=image_size
935
+ self.encoder_layers = nn.ModuleList([])
936
+ dim=init_channel
937
+ dim_out=dim
938
+ time_downsample_factor=1
939
+
940
+ for layer_type in layers:
941
+ if layer_type == 'residual':
942
+ encoder_layer = ResidualUnit(dim, residual_conv_kernel_size)
943
+
944
+ elif layer_type == 'consecutive_residual':
945
+ num_consecutive = 2
946
+ encoder_layer = Sequential(*[ResidualUnit(dim, residual_conv_kernel_size) for _ in range(num_consecutive)])
947
+
948
+ elif layer_type == 'compress_space':
949
+ dim_out = dim * 2
950
+ dim_out = min(dim_out, max_dim)
951
+
952
+ encoder_layer = SpatialDownsample2x(dim, dim_out)
953
+
954
+ assert layer_fmap_size > 1
955
+ layer_fmap_size //= 2
956
+
957
+ elif layer_type == 'compress_time':
958
+ dim_out = dim * 2
959
+ dim_out = min(dim_out, max_dim)
960
+
961
+ encoder_layer = TimeDownsample2x(dim, dim_out)
962
+
963
+ time_downsample_factor *= 2
964
+
965
+ elif layer_type == 'attend_space':
966
+ attn_kwargs = dict(
967
+ dim = dim,
968
+ dim_head = attn_dim_head,
969
+ heads = attn_heads,
970
+ dropout = attn_dropout,
971
+ flash = flash_attn
972
+ )
973
+
974
+ encoder_layer = Sequential(
975
+ Residual(SpaceAttention(**attn_kwargs)),
976
+ Residual(FeedForward(dim))
977
+ )
978
+
979
+ elif layer_type == 'linear_attend_space':
980
+ linear_attn_kwargs = dict(
981
+ dim = dim,
982
+ dim_head = linear_attn_dim_head,
983
+ heads = linear_attn_heads
984
+ )
985
+
986
+ encoder_layer = Sequential(
987
+ Residual(LinearSpaceAttention(**linear_attn_kwargs)),
988
+ Residual(FeedForward(dim))
989
+ )
990
+
991
+ else:
992
+ raise ValueError(f'unknown layer type {layer_type}')
993
+
994
+ self.encoder_layers.append(encoder_layer)
995
+
996
+ dim = dim_out
997
+
998
+ self.encoder_layers.append(Sequential(
999
+ Rearrange('b c ... -> b ... c'),
1000
+ nn.LayerNorm(dim),
1001
+ Rearrange('b ... c -> b c ...'),
1002
+ ))
1003
+
1004
+
1005
+ def forward(self, x, semantic_fea=None):
1006
+ x = self.conv_in(x)
1007
+ for layer in self.encoder_layers:
1008
+ x = layer(x)
1009
+ z_e = x
1010
+
1011
+ z_q=z_e
1012
+
1013
+ if semantic_fea is not None:
1014
+ B,C,T,H,W=z_q.shape
1015
+ z_q=z_q.contiguous().permute(0,2,1,3,4)
1016
+ z_q=z_q.contiguous().view(B,T,-1)
1017
+ z_q=z_q + semantic_fea
1018
+
1019
+ return z_q
1020
+
1021
+ class SemanticPath(nn.Module):
1022
+ def __init__(
1023
+ self,
1024
+ layers = [
1025
+ 'residual',
1026
+ 'residual',
1027
+ 'residual'
1028
+ ],
1029
+ image_size=88,
1030
+ in_channel=1,
1031
+ init_channel=4,
1032
+ max_dim=32,
1033
+ input_conv_kernel_size = [7, 7, 7],
1034
+ output_conv_kernel_size = [3, 3, 3],
1035
+ residual_conv_kernel_size=3,
1036
+ pad_mode="constant",
1037
+ attn_dim_head = 32,
1038
+ attn_heads = 8,
1039
+ attn_dropout = 0.,
1040
+ flash_attn = True,
1041
+ linear_attn_dim_head = 8,
1042
+ linear_attn_heads = 16,
1043
+ num_quantizers = 1,
1044
+ codebook_size = 256,
1045
+ codebook_dim= 64,
1046
+ commitment_cost=0.25,
1047
+ distill_dim=1024,
1048
+ config=None,
1049
+ pretrain=None
1050
+ ):
1051
+ super().__init__()
1052
+ input_conv_kernel_size=tuple(input_conv_kernel_size)
1053
+
1054
+ self.conv_in = nn.Conv3d(in_channel, init_channel, input_conv_kernel_size,padding='same')
1055
+
1056
+ layer_fmap_size=image_size
1057
+ self.encoder_layers = nn.ModuleList([])
1058
+ dim=init_channel
1059
+ dim_out=dim
1060
+ time_downsample_factor=1
1061
+
1062
+ for layer_type in layers:
1063
+ if layer_type == 'residual':
1064
+ encoder_layer = ResidualUnit(dim, residual_conv_kernel_size)
1065
+
1066
+ elif layer_type == 'consecutive_residual':
1067
+ num_consecutive = 2
1068
+ encoder_layer = Sequential(*[ResidualUnit(dim, residual_conv_kernel_size) for _ in range(num_consecutive)])
1069
+
1070
+ elif layer_type == 'compress_space':
1071
+ dim_out = dim * 2
1072
+ dim_out = min(dim_out, max_dim)
1073
+
1074
+ encoder_layer = SpatialDownsample2x(dim, dim_out)
1075
+
1076
+ assert layer_fmap_size > 1
1077
+ layer_fmap_size //= 2
1078
+
1079
+ elif layer_type == 'compress_time':
1080
+ dim_out = dim * 2
1081
+ dim_out = min(dim_out, max_dim)
1082
+
1083
+ encoder_layer = TimeDownsample2x(dim, dim_out)
1084
+
1085
+ time_downsample_factor *= 2
1086
+
1087
+ elif layer_type == 'attend_space':
1088
+ attn_kwargs = dict(
1089
+ dim = dim,
1090
+ dim_head = attn_dim_head,
1091
+ heads = attn_heads,
1092
+ dropout = attn_dropout,
1093
+ flash = flash_attn
1094
+ )
1095
+
1096
+ encoder_layer = Sequential(
1097
+ Residual(SpaceAttention(**attn_kwargs)),
1098
+ Residual(FeedForward(dim))
1099
+ )
1100
+
1101
+ elif layer_type == 'linear_attend_space':
1102
+ linear_attn_kwargs = dict(
1103
+ dim = dim,
1104
+ dim_head = linear_attn_dim_head,
1105
+ heads = linear_attn_heads
1106
+ )
1107
+
1108
+ encoder_layer = Sequential(
1109
+ Residual(LinearSpaceAttention(**linear_attn_kwargs)),
1110
+ Residual(FeedForward(dim))
1111
+ )
1112
+
1113
+ else:
1114
+ raise ValueError(f'unknown layer type {layer_type}')
1115
+
1116
+ self.encoder_layers.append(encoder_layer)
1117
+
1118
+ dim = dim_out
1119
+
1120
+ self.encoder_layers.append(Sequential(
1121
+ Rearrange('b c ... -> b ... c'),
1122
+ nn.LayerNorm(dim),
1123
+ Rearrange('b ... c -> b c ...'),
1124
+ ))
1125
+
1126
+ # layer_fmap_size = 3
1127
+ self.quantizer = ResidualVQ(
1128
+ dim = dim*layer_fmap_size*layer_fmap_size,
1129
+ num_quantizers = num_quantizers,
1130
+ codebook_size = codebook_size,
1131
+ codebook_dim = codebook_dim,
1132
+ quantize_dropout=False,
1133
+ stochastic_sample_codes = True,
1134
+ sample_codebook_temp = 0.1,
1135
+ kmeans_init = True,
1136
+ kmeans_iters = 10
1137
+ )
1138
+
1139
+ def forward(self, x):
1140
+ x = self.conv_in(x)
1141
+ for layer in self.encoder_layers:
1142
+ x = layer(x)
1143
+ b,c,t,h,w=x.shape
1144
+ x = x.contiguous().permute(0,2,1,3,4)
1145
+ z_e = x.contiguous().view(b,t,-1)
1146
+
1147
+ z_q,_,_=self.quantizer(z_e)
1148
+
1149
+ return z_q
1150
+
1151
+ class VideoEncoder(nn.Module):
1152
+ def __init__(
1153
+ self,
1154
+ layers,
1155
+ image_size=88,
1156
+ in_channel=1,
1157
+ init_channel=16,
1158
+ max_dim=128,
1159
+ input_conv_kernel_size = [7, 7, 7],
1160
+ output_conv_kernel_size = [3, 3, 3],
1161
+ residual_conv_kernel_size=3,
1162
+ pad_mode="constant",
1163
+ # attention-related settings
1164
+ attn_dim_head = 32,
1165
+ attn_heads = 8,
1166
+ attn_dropout = 0.,
1167
+ flash_attn = True,
1168
+ linear_attn_dim_head = 8,
1169
+ linear_attn_heads = 16,
1170
+ num_quantizers = 1,
1171
+ codebook_size = 256,
1172
+ codebook_dim=64,
1173
+ commitment_cost=0.25,
1174
+ distill_cost=1.0,
1175
+ ):
1176
+ super().__init__()
1177
+ self.semantic_model=SemanticPath(
1178
+ layers=layers,
1179
+ image_size=image_size,
1180
+ in_channel=in_channel,
1181
+ init_channel=init_channel,
1182
+ max_dim=max_dim,
1183
+ input_conv_kernel_size=input_conv_kernel_size,
1184
+ output_conv_kernel_size=output_conv_kernel_size,
1185
+ residual_conv_kernel_size=residual_conv_kernel_size,
1186
+ pad_mode=pad_mode,
1187
+ attn_dim_head = attn_dim_head,
1188
+ attn_heads = attn_heads,
1189
+ attn_dropout = attn_dropout,
1190
+ flash_attn = flash_attn,
1191
+ linear_attn_dim_head = linear_attn_dim_head,
1192
+ linear_attn_heads = linear_attn_heads,
1193
+ num_quantizers = num_quantizers,
1194
+ codebook_size = codebook_size,
1195
+ codebook_dim = codebook_dim,
1196
+ commitment_cost = commitment_cost,
1197
+ )
1198
+ self.recon_model=ReconstructionPath(
1199
+ layers=layers,
1200
+ image_size=image_size,
1201
+ in_channel=in_channel,
1202
+ init_channel=init_channel,
1203
+ max_dim=max_dim,
1204
+ input_conv_kernel_size=input_conv_kernel_size,
1205
+ output_conv_kernel_size=output_conv_kernel_size,
1206
+ residual_conv_kernel_size=residual_conv_kernel_size,
1207
+ pad_mode=pad_mode,
1208
+ attn_dim_head = attn_dim_head,
1209
+ attn_heads = attn_heads,
1210
+ attn_dropout = attn_dropout,
1211
+ flash_attn = flash_attn,
1212
+ linear_attn_dim_head = linear_attn_dim_head,
1213
+ linear_attn_heads = linear_attn_heads,
1214
+ num_quantizers = num_quantizers,
1215
+ codebook_size = codebook_size,
1216
+ codebook_dim = codebook_dim,
1217
+ commitment_cost = commitment_cost,
1218
+ )
1219
+
1220
+ def forward(self, x):
1221
+ semantic_fea = self.semantic_model(x)
1222
+ return self.recon_model(x,semantic_fea)
1223
+
1224
+ class Dolphin(nn.Module, PyTorchModelHubMixin):
1225
+ def __init__(self,
1226
+ num_stages: int,
1227
+ sample_rate: int,
1228
+ module_audio_enc: dict,
1229
+ module_feature_projector: dict,
1230
+ module_separator: dict,
1231
+ module_output_layer: dict,
1232
+ module_audio_dec: dict,
1233
+ video_encoder_params: dict,
1234
+ vpre_channels=512,
1235
+ vmid_channels=512,
1236
+ vin_channels=64,
1237
+ vout_channels=64,):
1238
+ super(Dolphin, self).__init__()
1239
+
1240
+ self.pre_v1 = ConvNormAct(vpre_channels, vin_channels, kSize=3, norm_type="BN")
1241
+
1242
+ self.num_stages = num_stages
1243
+ self.audio_encoder = AudioEncoder(**module_audio_enc)
1244
+ self.feature_projector = FeatureProjector(**module_feature_projector)
1245
+ self.separator = Separator(**module_separator)
1246
+ self.out_layer = OutputLayer(**module_output_layer)
1247
+ self.audio_decoder = AudioDecoder(**module_audio_dec)
1248
+
1249
+ self.video_blocks = UConvBlock(vin_channels, vout_channels, 3, norm_type="BN")
1250
+ self.modalfuse = AVFModule(module_feature_projector["out_channels"], vout_channels)
1251
+
1252
+ self.video_encoder = VideoEncoder(**video_encoder_params)
1253
+
1254
+ @classmethod
1255
+ def _from_pretrained(
1256
+ cls,
1257
+ *,
1258
+ model_id: str,
1259
+ revision: str,
1260
+ cache_dir: str,
1261
+ force_download: bool,
1262
+ proxies: dict,
1263
+ resume_download: bool,
1264
+ local_files_only: bool,
1265
+ token: str,
1266
+ map_location: str = "cpu",
1267
+ strict: bool = False,
1268
+ **model_kwargs,
1269
+ ):
1270
+ """Load model from HuggingFace Hub with proper configuration handling."""
1271
+ import json
1272
+ from huggingface_hub import hf_hub_download
1273
+
1274
+ # Download config file
1275
+ config_file = hf_hub_download(
1276
+ repo_id=model_id,
1277
+ filename="config.json",
1278
+ revision=revision,
1279
+ cache_dir=cache_dir,
1280
+ force_download=force_download,
1281
+ proxies=proxies,
1282
+ resume_download=resume_download,
1283
+ local_files_only=local_files_only,
1284
+ token=token,
1285
+ )
1286
+
1287
+ # Load configuration
1288
+ with open(config_file, "r") as f:
1289
+ config = json.load(f)
1290
+
1291
+ # Extract only the model parameters, excluding HF metadata
1292
+ hf_metadata_keys = {
1293
+ "model_type", "task", "framework", "license", "tags",
1294
+ "architectures", "auto_map"
1295
+ }
1296
+ model_config = {k: v for k, v in config.items() if k not in hf_metadata_keys}
1297
+
1298
+ # Create model instance with config
1299
+ model = cls(**model_config)
1300
+
1301
+ # Try to download different possible model file formats
1302
+ import torch
1303
+ model_files_to_try = [
1304
+ "model.safetensors",
1305
+ ]
1306
+
1307
+ state_dict = None
1308
+ for filename in model_files_to_try:
1309
+ try:
1310
+ model_file = hf_hub_download(
1311
+ repo_id=model_id,
1312
+ filename=filename,
1313
+ revision=revision,
1314
+ cache_dir=cache_dir,
1315
+ force_download=force_download,
1316
+ proxies=proxies,
1317
+ resume_download=resume_download,
1318
+ local_files_only=local_files_only,
1319
+ token=token,
1320
+ )
1321
+
1322
+ # Try to load the state dict
1323
+ if filename.endswith('.safetensors'):
1324
+ # Handle safetensors format
1325
+ try:
1326
+ from safetensors.torch import load_file
1327
+ state_dict = load_file(model_file, device=map_location)
1328
+ except ImportError:
1329
+ print("safetensors not available, skipping .safetensors files")
1330
+ continue
1331
+ else:
1332
+ # Handle PyTorch format
1333
+ checkpoint = torch.load(model_file, map_location=map_location, weights_only=False)
1334
+
1335
+ # Handle different checkpoint formats
1336
+ if isinstance(checkpoint, dict):
1337
+ if 'state_dict' in checkpoint:
1338
+ state_dict = checkpoint['state_dict']
1339
+ elif 'model_state_dict' in checkpoint:
1340
+ state_dict = checkpoint['model_state_dict']
1341
+ else:
1342
+ state_dict = checkpoint
1343
+ else:
1344
+ state_dict = checkpoint
1345
+
1346
+ # If we successfully loaded a state dict, break
1347
+ if state_dict is not None:
1348
+ break
1349
+
1350
+ except Exception as e:
1351
+ print(f"Failed to load {filename}: {e}")
1352
+ continue
1353
+
1354
+ if state_dict is None:
1355
+ raise RuntimeError(f"Could not load model weights from any of the tried files: {model_files_to_try}")
1356
+
1357
+ model.load_state_dict(state_dict, strict=strict)
1358
+
1359
+ return model
1360
+
1361
+ def forward(self, input, mouth):
1362
+ mouth = self.video_encoder(mouth).permute(0, 2, 1).contiguous()
1363
+ v=self.pre_v1(mouth)
1364
+ v=self.video_blocks(v)
1365
+
1366
+ encoder_output = self.audio_encoder(input)
1367
+ projected_feature = self.feature_projector(encoder_output)
1368
+
1369
+ projected_feature = self.modalfuse(projected_feature,v)
1370
+
1371
+ last_stage_output, each_stage_outputs = self.separator(projected_feature)
1372
+
1373
+ out_layer_output = self.out_layer(last_stage_output, encoder_output)
1374
+ audio=self.audio_decoder(out_layer_output)
1375
+
1376
+ return audio.unsqueeze(dim=1)
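+
+ # Minimal end-to-end sketch (the repo id and tensor shapes are placeholders/assumptions):
+ # model = Dolphin.from_pretrained("<hub-repo-id>").eval()
+ # audio = torch.randn(1, 16000) # [B, T_audio] mixture waveform
+ # mouth = torch.randn(1, 1, 25, 88, 88) # [B, C, T_video, H, W] lip-region frames
+ # with torch.no_grad():
+ #     est = model(audio, mouth) # [B, 1, T_audio] estimated target speech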
look2hear/models/video_compoent.py ADDED
@@ -0,0 +1,876 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ import torch.nn.functional as F
5
+ from torch import einsum,Tensor
6
+ from functools import partial
7
+ from taylor_series_linear_attention import TaylorSeriesLinearAttn
8
+
9
+ from beartype import beartype
10
+ from beartype.typing import Tuple, List
11
+
12
+ from einops import rearrange, repeat, reduce, pack, unpack
13
+ from einops.layers.torch import Rearrange
14
+
15
+ from typing import Union
16
+
17
+ from functools import partial
18
+ from typing import Optional, Tuple
19
+
20
+ import torch
21
+ from torch import nn, einsum, Tensor
22
+ import torch.nn.functional as F
23
+
24
+ from collections import namedtuple
25
+ from functools import wraps
26
+ from packaging import version
27
+
28
+ # constants
29
+
30
+ EfficientAttentionConfig = namedtuple('EfficientAttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
31
+
32
+ # helpers
33
+
34
+ def exists(val):
35
+ return val is not None
36
+
37
+ def default(val, d):
38
+ return val if exists(val) else d
39
+
40
+ def compact(arr):
41
+ return [*filter(exists, arr)]
42
+
43
+ def once(fn):
44
+ called = False
45
+ @wraps(fn)
46
+ def inner(x):
47
+ nonlocal called
48
+ if called:
49
+ return
50
+ called = True
51
+ return fn(x)
52
+ return inner
53
+
54
+ print_once = once(print)
55
+
56
+ # functions for creating causal mask
57
+ # need a special one for onnx cpu (no support for .triu)
58
+
59
+ def create_causal_mask(i, j, device):
60
+ return torch.ones((i, j), device = device, dtype = torch.bool).triu(j - i + 1)
61
+
62
+ def onnx_create_causal_mask(i, j, device):
63
+ r = torch.arange(i, device = device)
64
+ causal_mask = rearrange(r, 'i -> i 1') < rearrange(r, 'j -> 1 j')
65
+ causal_mask = F.pad(causal_mask, (j - i, 0), value = False)
66
+ return causal_mask
67
+
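+ # Worked example: create_causal_mask(3, 5, device) marks entries to block (True = masked),
+ # right-aligned so each of the 3 query tokens attends to the 5-token key prefix ending at its own position:
+ # [[F, F, F, T, T],
+ #  [F, F, F, F, T],
+ #  [F, F, F, F, F]]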
68
+ # main class
69
+
70
+ class Attend(nn.Module):
71
+ def __init__(
72
+ self,
73
+ *,
74
+ dropout = 0.,
75
+ causal = False,
76
+ heads = None,
77
+ scale = None,
78
+ flash = False,
79
+ onnxable = False,
80
+ sdp_kwargs: dict = dict(
81
+ enable_flash = True,
82
+ enable_math = True,
83
+ enable_mem_efficient = True
84
+ )
85
+ ):
86
+ super().__init__()
87
+ self.scale = scale
88
+
89
+ self.causal = causal
90
+ self.create_causal_mask = onnx_create_causal_mask if onnxable else create_causal_mask
91
+
92
+ self.dropout = dropout
93
+ self.attn_dropout = nn.Dropout(dropout)
94
+
95
+ # flash attention
96
+
97
+ self.flash = flash and torch.cuda.is_available()
98
+ assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
99
+
100
+ self.sdp_kwargs = sdp_kwargs
101
+
102
+ def flash_attn(
103
+ self,
104
+ q, k, v,
105
+ mask = None,
106
+ attn_bias = None
107
+ ):
108
+ batch, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device
109
+
110
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
111
+
112
+ # manage scale, since scale is not customizable in sdp, hack around it
113
+
114
+ if exists(self.scale):
115
+ q = q * self.scale / (q.shape[-1] ** -0.5)
116
+
117
+ # Check if mask exists and expand to compatible shape
118
+ # The mask is B L, so it would have to be expanded to B H N L
119
+
120
+ causal = self.causal
121
+
122
+ # in the case of kv caching with one token (q_len == 1), just turn off causal masking
123
+ # in speculative decoding, this may go up to 5-6, so right aligned causal mask will be needed there
124
+
125
+ if q_len == 1 and causal:
126
+ causal = False
127
+
128
+ # expand key padding mask
129
+
130
+ if exists(mask):
131
+ assert mask.ndim == 4
132
+ mask = mask.expand(batch, heads, q_len, k_len)
133
+
134
+ # handle kv cache - this should be bypassable in updated flash attention 2
135
+
136
+ if k_len > q_len and causal:
137
+ causal_mask = self.create_causal_mask(q_len, k_len, device = device)
138
+ if not exists(mask):
139
+ mask = ~causal_mask
140
+ else:
141
+ mask = mask & ~causal_mask
142
+ causal = False
143
+
144
+ # manually handle causal mask, if another mask was given
145
+
146
+ row_is_entirely_masked = None
147
+
148
+ if exists(mask) and causal:
149
+ causal_mask = self.create_causal_mask(q_len, k_len, device = device)
150
+ mask = mask & ~causal_mask
151
+
152
+ # protect against an entire row being masked out
153
+
154
+ row_is_entirely_masked = ~mask.any(dim = -1)
155
+ mask[..., 0] = mask[..., 0] | row_is_entirely_masked
156
+
157
+ causal = False
158
+
159
+ # handle alibi positional bias
160
+ # convert from bool to float
161
+
162
+ if exists(attn_bias):
163
+ attn_bias = rearrange(attn_bias, 'h i j -> 1 h i j').expand(batch, heads, -1, -1)
164
+
165
+ # if mask given, the mask would already contain the causal mask from above logic
166
+ # otherwise, if no mask given but still causal, mask out alibi positional bias to a large negative number
167
+
168
+ mask_value = -torch.finfo(q.dtype).max
169
+
170
+ if exists(mask):
171
+ attn_bias = attn_bias.masked_fill(~mask, mask_value // 2)
172
+ elif causal:
173
+ causal_mask = self.create_causal_mask(q_len, k_len, device = device)
174
+ attn_bias = attn_bias.masked_fill(causal_mask, mask_value // 2)
175
+ causal = False
176
+
177
+ # scaled_dot_product_attention handles attn_mask either as bool or additive bias
178
+ # make it an additive bias here
179
+
180
+ mask = attn_bias
181
+
182
+ # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
183
+
184
+ with torch.backends.cuda.sdp_kernel(**self.sdp_kwargs):
185
+ out = F.scaled_dot_product_attention(
186
+ q, k, v,
187
+ attn_mask = mask,
188
+ dropout_p = self.dropout if self.training else 0.,
189
+ is_causal = causal
190
+ )
191
+
192
+ # for a row that is entirely masked out, should zero out the output of that row token
193
+
194
+ if exists(row_is_entirely_masked):
195
+ out = out.masked_fill(row_is_entirely_masked[..., None], 0.)
196
+
197
+ return out
198
+
199
+ def forward(
200
+ self,
201
+ q, k, v,
202
+ mask = None,
203
+ attn_bias = None,
204
+ prev_attn = None
205
+ ):
206
+ """
207
+ einstein notation
208
+ b - batch
209
+ h - heads
210
+ n, i, j - sequence length (base sequence length, source, target)
211
+ d - feature dimension
212
+ """
213
+
214
+ n, heads, kv_heads, device = q.shape[-2], q.shape[1], k.shape[1], q.device
215
+
216
+ scale = default(self.scale, q.shape[-1] ** -0.5)
217
+
218
+ causal = self.causal
219
+
220
+ # handle kv cached decoding
221
+
222
+ if n == 1 and causal:
223
+ causal = False
224
+
225
+ # handle zero kv, as means for allowing network to attend to nothing
226
+
227
+ if self.flash:
228
+ assert not exists(prev_attn), 'residual attention not compatible with flash attention'
229
+ return self.flash_attn(q, k, v, mask = mask, attn_bias = attn_bias)
230
+
231
+ dots = einsum(f'b h i d, b h j d -> b h i j', q, k) * scale
232
+
233
+ if exists(prev_attn):
234
+ dots = dots + prev_attn
235
+
236
+ if exists(attn_bias):
237
+ dots = dots + attn_bias
238
+
239
+ i, j, dtype = *dots.shape[-2:], dots.dtype
240
+
241
+ mask_value = -torch.finfo(dots.dtype).max
242
+
243
+ if exists(mask):
244
+ dots = dots.masked_fill(~mask, mask_value)
245
+
246
+ if causal:
247
+ causal_mask = self.create_causal_mask(i, j, device = device)
248
+ dots = dots.masked_fill(causal_mask, mask_value)
249
+
250
+ attn = dots.softmax(dim = -1)
251
+
252
+ attn = self.attn_dropout(attn)
253
+
254
+ out = einsum(f'b h i j, b h j d -> b h i d', attn, v)
255
+
256
+ return out
257
+
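+ # Minimal usage sketch for Attend (shapes are illustrative assumptions):
+ # attend = Attend(causal=True, dropout=0.1, flash=False)
+ # q = k = v = torch.randn(2, 8, 128, 64) # [batch, heads, seq, dim_head]
+ # out = attend(q, k, v) # [2, 8, 128, 64]; flash=True requires PyTorch >= 2.0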
258
+ def exists(v):
259
+ return v is not None
260
+
261
+ def default(v, d):
262
+ return v if exists(v) else d
263
+
264
+ def safe_get_index(it, ind, default = None):
265
+ if ind < len(it):
266
+ return it[ind]
267
+ return default
268
+
269
+ def pair(t):
270
+ return t if isinstance(t, tuple) else (t, t)
271
+
272
+ def identity(t, *args, **kwargs):
273
+ return t
274
+
275
+ def divisible_by(num, den):
276
+ return (num % den) == 0
277
+
278
+ def pack_one(t, pattern):
279
+ return pack([t], pattern)
280
+
281
+ def unpack_one(t, ps, pattern):
282
+ return unpack(t, ps, pattern)[0]
283
+
284
+ def append_dims(t, ndims: int):
285
+ return t.reshape(*t.shape, *((1,) * ndims))
286
+
287
+ def is_odd(n):
288
+ return not divisible_by(n, 2)
289
+
290
+ def maybe_del_attr_(o, attr):
291
+ if hasattr(o, attr):
292
+ delattr(o, attr)
293
+
294
+ def cast_tuple(t, length = 1):
295
+ return t if isinstance(t, tuple) else ((t,) * length)
296
+ class ResBlock(nn.Module):
297
+ def __init__(self, in_channel, channel):
298
+ super().__init__()
299
+
300
+ self.conv = nn.Sequential(
301
+ nn.ReLU(),
302
+ nn.Conv2d(in_channel, channel, 3, padding=1),
303
+ nn.ReLU(inplace=True),
304
+ nn.Conv2d(channel, in_channel, 1),
305
+ )
306
+
307
+ def forward(self, input):
308
+ out = self.conv(input)
309
+ out += input
310
+
311
+ return out
312
+
313
+
314
+ class EncoderAE(nn.Module):
315
+ def __init__(self, in_channel, channel, n_res_block, n_res_channel, stride):
316
+ super().__init__()
317
+
318
+ if stride == 4:
319
+ blocks = [
320
+ nn.Conv2d(in_channel, channel // 2, 4, stride=2, padding=1),
321
+ nn.ReLU(inplace=True),
322
+ nn.Conv2d(channel // 2, channel, 4, stride=2, padding=1),
323
+ nn.ReLU(inplace=True),
324
+ nn.Conv2d(channel, channel, 3, padding=1),
325
+ ]
326
+
327
+ elif stride == 2:
328
+ blocks = [
329
+ nn.Conv2d(in_channel, channel // 2, 4, stride=2, padding=1),
330
+ nn.ReLU(inplace=True),
331
+ nn.Conv2d(channel // 2, channel, 3, padding=1),
332
+ ]
333
+
334
+ for i in range(n_res_block):
335
+ blocks.append(ResBlock(channel, n_res_channel))
336
+
337
+ blocks.append(nn.ReLU(inplace=True))
338
+
339
+ self.blocks = nn.Sequential(*blocks)
340
+
341
+ def forward(self, input):
342
+ return self.blocks(input)
343
+
344
+
345
+ class DecoderAE(nn.Module):
346
+ def __init__(
347
+ self, in_channel, out_channel, channel, n_res_block, n_res_channel, stride
348
+ ):
349
+ super().__init__()
350
+
351
+ blocks = [nn.Conv2d(in_channel, channel, 3, padding=1)]
352
+
353
+ for i in range(n_res_block):
354
+ blocks.append(ResBlock(channel, n_res_channel))
355
+
356
+ blocks.append(nn.ReLU(inplace=True))
357
+
358
+ if stride == 4:
359
+ blocks.extend(
360
+ [
361
+ nn.ConvTranspose2d(channel, channel // 2, 4, stride=2, padding=1),
362
+ nn.ReLU(inplace=True),
363
+ nn.ConvTranspose2d(
364
+ channel // 2, out_channel, 4, stride=2, padding=1
365
+ ),
366
+ ]
367
+ )
368
+
369
+ elif stride == 2:
370
+ blocks.append(
371
+ nn.ConvTranspose2d(channel, out_channel, 4, stride=2, padding=1)
372
+ )
373
+
374
+ self.blocks = nn.Sequential(*blocks)
375
+
376
+ def forward(self, input):
377
+ return self.blocks(input)
378
+
379
+ class CausalConv3d(nn.Module):
380
+ # Causal 3D convolution; in practice it differs little from a plain 3D convolution.
381
+ @beartype
382
+ def __init__(
383
+ self,
384
+ chan_in,
385
+ chan_out,
386
+ kernel_size: Union[int, Tuple[int, int, int]],
387
+ pad_mode = 'constant',
388
+ **kwargs
389
+ ):
390
+ super().__init__()
391
+ kernel_size = cast_tuple(kernel_size, 3)
392
+
393
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
394
+
395
+ # the height_pad / width_pad values here and below are chosen so the output H/W size stays unchanged
396
+ assert is_odd(height_kernel_size) and is_odd(width_kernel_size)
397
+
398
+ dilation = kwargs.pop('dilation', 1)
399
+ stride = kwargs.pop('stride', 1)
400
+
401
+ self.pad_mode = pad_mode
402
+ time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
403
+ height_pad = height_kernel_size // 2
404
+ width_pad = width_kernel_size // 2
405
+
406
+ self.time_pad = time_pad
407
+ self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
408
+
409
+ stride = (stride, 1, 1)
410
+ dilation = (dilation, 1, 1)
411
+ self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride = stride, dilation = dilation, **kwargs)
412
+
413
+ def forward(self, x):
414
+ pad_mode = self.pad_mode if self.time_pad < x.shape[2] else 'constant'
415
+
416
+ x = F.pad(x, self.time_causal_padding, mode = pad_mode)
417
+ return self.conv(x)
418
+
419
+ class SqueezeExcite(nn.Module):
420
+ # global context network - attention-esque squeeze-excite variant (https://arxiv.org/abs/2012.13375)
421
+ # a lightweight channel-wise attention
422
+ def __init__(
423
+ self,
424
+ dim,
425
+ *,
426
+ dim_out = None,
427
+ dim_hidden_min = 16,
428
+ init_bias = -10
429
+ ):
430
+ super().__init__()
431
+ dim_out = default(dim_out, dim)
432
+
433
+ self.to_k = nn.Conv2d(dim, 1, 1)
434
+ dim_hidden = max(dim_hidden_min, dim_out // 2)
435
+
436
+ self.net = nn.Sequential(
437
+ nn.Conv2d(dim, dim_hidden, 1),
438
+ nn.LeakyReLU(0.1),
439
+ nn.Conv2d(dim_hidden, dim_out, 1),
440
+ nn.Sigmoid()
441
+ )
442
+
443
+ nn.init.zeros_(self.net[-2].weight)
444
+ nn.init.constant_(self.net[-2].bias, init_bias)
445
+
446
+ def forward(self, x):
447
+ orig_input, batch = x, x.shape[0]
448
+ is_video = x.ndim == 5
449
+
450
+ if is_video:
451
+ x = rearrange(x, 'b c f h w -> (b f) c h w')
452
+
453
+ # build a context map over the spatial (H, W) positions with a 1x1 conv
454
+ context = self.to_k(x)
455
+
456
+ context = rearrange(context, 'b c h w -> b c (h w)').softmax(dim = -1)
457
+ spatial_flattened_input = rearrange(x, 'b c h w -> b c (h w)')
458
+
459
+ out = einsum('b i n, b c n -> b c i', context, spatial_flattened_input)
460
+ out = rearrange(out, '... -> ... 1')
461
+ gates = self.net(out)
462
+
463
+ if is_video:
464
+ gates = rearrange(gates, '(b f) c h w -> b c f h w', b = batch)
465
+
466
+ return gates * orig_input
467
+
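+ # Shape trace (illustrative): a video input [B, C, F, H, W] is folded to [(B*F), C, H, W];
+ # to_k + softmax gives one attention weight per spatial position, the weighted sum yields a
+ # [(B*F), C, 1, 1] context vector, and the small conv net turns it into per-channel sigmoid
+ # gates that rescale the original input.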
468
+ class Residual(nn.Module):
469
+ @beartype
470
+ def __init__(self, fn: nn.Module):
471
+ super().__init__()
472
+ self.fn = fn
473
+
474
+ def forward(self, x, **kwargs):
475
+ return self.fn(x, **kwargs) + x
476
+
477
+ def ResidualUnit(
478
+ dim,
479
+ kernel_size: Union[int, Tuple[int, int, int]],
480
+ pad_mode: str = 'constant'
481
+ ):
482
+ net = nn.Sequential(
483
+ # causal 3D convolution (replaced by the plain 'same'-padded Conv3d below)
484
+ # CausalConv3d(dim, dim, kernel_size, pad_mode = pad_mode),
485
+ nn.Conv3d(dim, dim, kernel_size,padding='same'),
486
+ nn.ELU(),
487
+ nn.Conv3d(dim, dim, 1),
488
+ nn.ELU(),
489
+ # a channel-wise conv + softmax global-context attention (SqueezeExcite)
490
+ SqueezeExcite(dim)
491
+ )
492
+
493
+ return Residual(net)
494
+
495
+ # strided conv downsamples
496
+
497
+ class SpatialDownsample2x(nn.Module):
498
+ def __init__(
499
+ self,
500
+ dim,
501
+ dim_out = None,
502
+ kernel_size = 3,
503
+ antialias = False
504
+ ):
505
+ super().__init__()
506
+ dim_out = default(dim_out, dim)
507
+ self.conv = nn.Conv2d(dim, dim_out, kernel_size, stride = 2, padding = kernel_size // 2)
508
+
509
+ def forward(self, x):
510
+
511
+ x = rearrange(x, 'b c t h w -> b t c h w')
512
+ x, ps = pack_one(x, '* c h w')
513
+
514
+ out = self.conv(x)
515
+
516
+ out = unpack_one(out, ps, '* c h w')
517
+ out = rearrange(out, 'b t c h w -> b c t h w')
518
+ return out
519
+
520
+ class TimeDownsample2x(nn.Module):
521
+ def __init__(
522
+ self,
523
+ dim,
524
+ dim_out = None,
525
+ kernel_size = 3,
526
+ antialias = False
527
+ ):
528
+ super().__init__()
529
+ dim_out = default(dim_out, dim)
530
+ self.time_causal_padding = (kernel_size - 1, 0)
531
+ self.conv = nn.Conv1d(dim, dim_out, kernel_size, stride = 2)
532
+
533
+ def forward(self, x):
534
+ x = rearrange(x, 'b c t h w -> b h w c t')
535
+ x, ps = pack_one(x, '* c t')
536
+
537
+ x = F.pad(x, self.time_causal_padding)
538
+ out = self.conv(x)
539
+
540
+ out = unpack_one(out, ps, '* c t')
541
+ out = rearrange(out, 'b h w c t -> b c t h w')
542
+ return out
543
+
544
+ # depth to space upsamples
545
+
546
+ class SpatialUpsample2x(nn.Module):
547
+ def __init__(
548
+ self,
549
+ dim,
550
+ dim_out = None
551
+ ):
552
+ super().__init__()
553
+ dim_out = default(dim_out, dim)
554
+ conv = nn.Conv2d(dim, dim_out * 4, 1)
555
+
556
+ self.net = nn.Sequential(
557
+ conv,
558
+ nn.SiLU(),
559
+ Rearrange('b (c p1 p2) h w -> b c (h p1) (w p2)', p1 = 2, p2 = 2)
560
+ )
561
+
562
+ self.init_conv_(conv)
563
+
564
+ def init_conv_(self, conv):
565
+ o, i, h, w = conv.weight.shape
566
+ conv_weight = torch.empty(o // 4, i, h, w)
567
+ nn.init.kaiming_uniform_(conv_weight)
568
+ conv_weight = repeat(conv_weight, 'o ... -> (o 4) ...')
569
+
570
+ conv.weight.data.copy_(conv_weight)
571
+ nn.init.zeros_(conv.bias.data)
572
+
573
+ def forward(self, x):
574
+ x = rearrange(x, 'b c t h w -> b t c h w')
575
+ x, ps = pack_one(x, '* c h w')
576
+
577
+ out = self.net(x)
578
+
579
+ out = unpack_one(out, ps, '* c h w')
580
+ out = rearrange(out, 'b t c h w -> b c t h w')
581
+ return out
582
+
583
+ class TimeUpsample2x(nn.Module):
584
+ def __init__(
585
+ self,
586
+ dim,
587
+ dim_out = None
588
+ ):
589
+ super().__init__()
590
+ dim_out = default(dim_out, dim)
591
+ conv = nn.Conv1d(dim, dim_out * 2, 1)
592
+
593
+ self.net = nn.Sequential(
594
+ conv,
595
+ nn.SiLU(),
596
+ Rearrange('b (c p) t -> b c (t p)', p = 2)
597
+ )
598
+
599
+ self.init_conv_(conv)
600
+
601
+ def init_conv_(self, conv):
602
+ o, i, t = conv.weight.shape
603
+ conv_weight = torch.empty(o // 2, i, t)
604
+ nn.init.kaiming_uniform_(conv_weight)
605
+ conv_weight = repeat(conv_weight, 'o ... -> (o 2) ...')
606
+
607
+ conv.weight.data.copy_(conv_weight)
608
+ nn.init.zeros_(conv.bias.data)
609
+
610
+ def forward(self, x):
611
+ x = rearrange(x, 'b c t h w -> b h w c t')
612
+ x, ps = pack_one(x, '* c t')
613
+
614
+ out = self.net(x)
615
+
616
+ out = unpack_one(out, ps, '* c t')
617
+ out = rearrange(out, 'b h w c t -> b c t h w')
618
+ return out
619
+
620
+ class RMSNorm(nn.Module):
621
+ def __init__(
622
+ self,
623
+ dim,
624
+ channel_first = False,
625
+ images = False,
626
+ bias = False
627
+ ):
628
+ super().__init__()
629
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
630
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
631
+
632
+ self.channel_first = channel_first
633
+ self.scale = dim ** 0.5
634
+ self.gamma = nn.Parameter(torch.ones(shape))
635
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
636
+
637
+ def forward(self, x):
638
+ return F.normalize(x, dim = (1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
639
+
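+ # Equivalently: RMSNorm(x) = x / ||x||_2 * sqrt(dim) * gamma + bias = x / RMS(x) * gamma + bias,
+ # since ||x||_2 = sqrt(dim) * RMS(x) along the normalized dimension.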
640
+ class AdaptiveRMSNorm(nn.Module):
641
+ def __init__(
642
+ self,
643
+ dim,
644
+ *,
645
+ dim_cond,
646
+ channel_first = False,
647
+ images = False,
648
+ bias = False
649
+ ):
650
+ super().__init__()
651
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
652
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
653
+
654
+ self.dim_cond = dim_cond
655
+ self.channel_first = channel_first
656
+ self.scale = dim ** 0.5
657
+
658
+ self.to_gamma = nn.Linear(dim_cond, dim)
659
+ self.to_bias = nn.Linear(dim_cond, dim) if bias else None
660
+
661
+ nn.init.zeros_(self.to_gamma.weight)
662
+ nn.init.ones_(self.to_gamma.bias)
663
+
664
+ if bias:
665
+ nn.init.zeros_(self.to_bias.weight)
666
+ nn.init.zeros_(self.to_bias.bias)
667
+
668
+ @beartype
669
+ def forward(self, x: Tensor, *, cond: Tensor):
670
+ batch = x.shape[0]
671
+ assert cond.shape == (batch, self.dim_cond)
672
+
673
+ gamma = self.to_gamma(cond)
674
+
675
+ bias = 0.
676
+ if exists(self.to_bias):
677
+ bias = self.to_bias(cond)
678
+
679
+ if self.channel_first:
680
+ gamma = append_dims(gamma, x.ndim - 2)
681
+
682
+ if exists(self.to_bias):
683
+ bias = append_dims(bias, x.ndim - 2)
684
+
685
+ return F.normalize(x, dim = (1 if self.channel_first else -1)) * self.scale * gamma + bias
686
+
687
+ class Attention(nn.Module):
688
+ @beartype
689
+ def __init__(
690
+ self,
691
+ *,
692
+ dim,
693
+ dim_cond: Union[int,None] = None,
694
+ causal = False,
695
+ dim_head = 32,
696
+ heads = 8,
697
+ flash = False,
698
+ dropout = 0.,
699
+ num_memory_kv = 4
700
+ ):
701
+ super().__init__()
702
+ dim_inner = dim_head * heads
703
+
704
+ self.need_cond = exists(dim_cond)
705
+
706
+ if self.need_cond:
707
+ self.norm = AdaptiveRMSNorm(dim, dim_cond = dim_cond)
708
+ else:
709
+ self.norm = RMSNorm(dim)
710
+
711
+ self.to_qkv = nn.Sequential(
712
+ nn.Linear(dim, dim_inner * 3, bias = False),
713
+ Rearrange('b n (qkv h d) -> qkv b h n d', qkv = 3, h = heads)
714
+ )
715
+
716
+ assert num_memory_kv > 0
717
+ self.mem_kv = nn.Parameter(torch.randn(2, heads, num_memory_kv, dim_head))
718
+
719
+ self.attend = Attend(
720
+ causal = causal,
721
+ dropout = dropout,
722
+ flash = flash
723
+ )
724
+
725
+ self.to_out = nn.Sequential(
726
+ Rearrange('b h n d -> b n (h d)'),
727
+ nn.Linear(dim_inner, dim, bias = False)
728
+ )
729
+
730
+ @beartype
731
+ def forward(
732
+ self,
733
+ x,
734
+ mask: Union[Tensor,None] = None,
735
+ cond: Union[Tensor,None] = None
736
+ ):
737
+ maybe_cond_kwargs = dict(cond = cond) if self.need_cond else dict()
738
+
739
+ x = self.norm(x, **maybe_cond_kwargs)
740
+
741
+ q, k, v = self.to_qkv(x)
742
+
743
+ mk, mv = map(lambda t: repeat(t, 'h n d -> b h n d', b = q.shape[0]), self.mem_kv)
744
+ k = torch.cat((mk, k), dim = -2)
745
+ v = torch.cat((mv, v), dim = -2)
746
+
747
+ out = self.attend(q, k, v, mask = mask)
748
+ return self.to_out(out)
749
+
750
+ class LinearAttention(nn.Module):
751
+ """
752
+ using the specific linear attention proposed in https://arxiv.org/abs/2106.09681
753
+ """
754
+
755
+ @beartype
756
+ def __init__(
757
+ self,
758
+ *,
759
+ dim,
760
+ dim_cond: Union[int,None] = None,
761
+ dim_head = 8,
762
+ heads = 8,
763
+ dropout = 0.
764
+ ):
765
+ super().__init__()
766
+ dim_inner = dim_head * heads
767
+
768
+ self.need_cond = exists(dim_cond)
769
+
770
+ if self.need_cond:
771
+ self.norm = AdaptiveRMSNorm(dim, dim_cond = dim_cond)
772
+ else:
773
+ self.norm = RMSNorm(dim)
774
+
775
+ self.attn = TaylorSeriesLinearAttn(
776
+ dim = dim,
777
+ dim_head = dim_head,
778
+ heads = heads
779
+ )
780
+
781
+ def forward(
782
+ self,
783
+ x,
784
+ cond: Union[Tensor,None] = None
785
+ ):
786
+ maybe_cond_kwargs = dict(cond = cond) if self.need_cond else dict()
787
+
788
+ x = self.norm(x, **maybe_cond_kwargs)
789
+
790
+ return self.attn(x)
791
+
792
+ class LinearSpaceAttention(LinearAttention):
793
+ def forward(self, x, *args, **kwargs):
794
+ x = rearrange(x, 'b c ... h w -> b ... h w c')
795
+ x, batch_ps = pack_one(x, '* h w c')
796
+ x, seq_ps = pack_one(x, 'b * c')
797
+
798
+ x = super().forward(x, *args, **kwargs)
799
+
800
+ x = unpack_one(x, seq_ps, 'b * c')
801
+ x = unpack_one(x, batch_ps, '* h w c')
802
+ return rearrange(x, 'b ... h w c -> b c ... h w')
803
+
804
+ class SpaceAttention(Attention):
805
+ def forward(self, x, *args, **kwargs):
806
+ x = rearrange(x, 'b c t h w -> b t h w c')
807
+ x, batch_ps = pack_one(x, '* h w c')
808
+ x, seq_ps = pack_one(x, 'b * c')
809
+
810
+ x = super().forward(x, *args, **kwargs)
811
+
812
+ x = unpack_one(x, seq_ps, 'b * c')
813
+ x = unpack_one(x, batch_ps, '* h w c')
814
+ return rearrange(x, 'b t h w c -> b c t h w')
815
+
816
+ class TimeAttention(Attention):
817
+ def forward(self, x, *args, **kwargs):
818
+ x = rearrange(x, 'b c t h w -> b h w t c')
819
+ x, batch_ps = pack_one(x, '* t c')
820
+
821
+ x = super().forward(x, *args, **kwargs)
822
+
823
+ x = unpack_one(x, batch_ps, '* t c')
824
+ return rearrange(x, 'b h w t c -> b c t h w')
825
+
826
+ class GEGLU(nn.Module):
827
+ def forward(self, x):
828
+ x, gate = x.chunk(2, dim = 1)
829
+ return F.gelu(gate) * x
830
+
831
+ class FeedForward(nn.Module):
832
+ @beartype
833
+ def __init__(
834
+ self,
835
+ dim,
836
+ *,
837
+ dim_cond: Union[int,None] = None,
838
+ mult = 4,
839
+ images = False
840
+ ):
841
+ super().__init__()
842
+ conv_klass = nn.Conv2d if images else nn.Conv3d
843
+
844
+ rmsnorm_klass = RMSNorm if not exists(dim_cond) else partial(AdaptiveRMSNorm, dim_cond = dim_cond)
845
+
846
+ maybe_adaptive_norm_klass = partial(rmsnorm_klass, channel_first = True, images = images)
847
+
848
+ dim_inner = int(dim * mult * 2 / 3)
849
+
850
+ self.norm = maybe_adaptive_norm_klass(dim)
851
+
852
+ self.net = Sequential(
853
+ conv_klass(dim, dim_inner * 2, 1),
854
+ GEGLU(),
855
+ conv_klass(dim_inner, dim, 1)
856
+ )
857
+
858
+ @beartype
859
+ def forward(
860
+ self,
861
+ x: Tensor,
862
+ *,
863
+ cond: Union[Tensor,None] = None
864
+ ):
865
+ maybe_cond_kwargs = dict(cond = cond) if exists(cond) else dict()
866
+
867
+ x = self.norm(x, **maybe_cond_kwargs)
868
+ return self.net(x)
869
+
870
+ def Sequential(*modules):
871
+ modules = [*filter(exists, modules)]
872
+
873
+ if len(modules) == 0:
874
+ return nn.Identity()
875
+
876
+ return nn.Sequential(*modules)
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ torch
2
+ torchaudio
3
+ torchvision
4
+ moviepy
5
+ face_alignment
6
+ beartype
7
+ taylor_series_linear_attention
8
+ huggingface_hub
9
+ einops
10
+ vector_quantize_pytorch
11
+ spaces
12
+ tf-keras
13
+ retina-face
14
+ safetensors