Spaces:

jhj0517
/

Whisper-WebUI

Running

jhj0517 committed on Jul 15, 2024

Commit

670baea

1 Parent(s): 16a0393

Move vad feature into abstract class

Files changed (2) hide show

modules/whisper/faster_whisper_inference.py CHANGED Viewed

@@ -71,20 +71,6 @@ class FasterWhisperInference(WhisperBase):
         if not params.hotwords:
             params.hotwords = None
-        vad_options = None
-        if params.vad_filter:
-            # Explicit value set for float('inf') from gr.Number()
-            if params.max_speech_duration_s >= 9999:
-                params.max_speech_duration_s = float('inf')
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
         segments, info = self.model.transcribe(
@@ -115,8 +101,6 @@ class FasterWhisperInference(WhisperBase):
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
-            vad_filter=params.vad_filter,
-            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")

         if not params.hotwords:
             params.hotwords = None
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
         segments, info = self.model.transcribe(
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
             prompt_reset_on_temperature=params.prompt_reset_on_temperature,
         )
         progress(0, desc="Loading audio..")

modules/whisper/whisper_base.py CHANGED Viewed

@@ -91,12 +91,38 @@ class WhisperBase(ABC):
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
         result, elapsed_time = self.transcribe(
             audio,
             progress,
             *astuple(params)
         )
         if params.is_diarize:
             result, elapsed_time_diarization = self.diarizer.run(
                 audio=audio,

             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
+        speech_chunks = None
+        if params.vad_filter:
+            # Explicit value set for float('inf') from gr.Number()
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+            audio, speech_chunks = self.vad.run(
+                audio=audio,
+                vad_parameters=vad_options,
+                progress=progress
+            )
         result, elapsed_time = self.transcribe(
             audio,
             progress,
             *astuple(params)
         )
+        if params.vad_filter:
+            result = self.vad.restore_speech_timestamps(
+                segments=result,
+                speech_chunks=speech_chunks,
+            )
         if params.is_diarize:
             result, elapsed_time_diarization = self.diarizer.run(
                 audio=audio,