Spaces:
Running
Running
jhj0517
committed on
Commit
·
670baea
1
Parent(s):
16a0393
Move VAD feature into abstract class
Browse files
modules/whisper/faster_whisper_inference.py
CHANGED
|
@@ -71,20 +71,6 @@ class FasterWhisperInference(WhisperBase):
|
|
| 71 |
if not params.hotwords:
|
| 72 |
params.hotwords = None
|
| 73 |
|
| 74 |
-
vad_options = None
|
| 75 |
-
if params.vad_filter:
|
| 76 |
-
# Explicit value set for float('inf') from gr.Number()
|
| 77 |
-
if params.max_speech_duration_s >= 9999:
|
| 78 |
-
params.max_speech_duration_s = float('inf')
|
| 79 |
-
|
| 80 |
-
vad_options = VadOptions(
|
| 81 |
-
threshold=params.threshold,
|
| 82 |
-
min_speech_duration_ms=params.min_speech_duration_ms,
|
| 83 |
-
max_speech_duration_s=params.max_speech_duration_s,
|
| 84 |
-
min_silence_duration_ms=params.min_silence_duration_ms,
|
| 85 |
-
speech_pad_ms=params.speech_pad_ms
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
|
| 89 |
|
| 90 |
segments, info = self.model.transcribe(
|
|
@@ -115,8 +101,6 @@ class FasterWhisperInference(WhisperBase):
|
|
| 115 |
language_detection_threshold=params.language_detection_threshold,
|
| 116 |
language_detection_segments=params.language_detection_segments,
|
| 117 |
prompt_reset_on_temperature=params.prompt_reset_on_temperature,
|
| 118 |
-
vad_filter=params.vad_filter,
|
| 119 |
-
vad_parameters=vad_options
|
| 120 |
)
|
| 121 |
progress(0, desc="Loading audio..")
|
| 122 |
|
|
|
|
| 71 |
if not params.hotwords:
|
| 72 |
params.hotwords = None
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
|
| 75 |
|
| 76 |
segments, info = self.model.transcribe(
|
|
|
|
| 101 |
language_detection_threshold=params.language_detection_threshold,
|
| 102 |
language_detection_segments=params.language_detection_segments,
|
| 103 |
prompt_reset_on_temperature=params.prompt_reset_on_temperature,
|
|
|
|
|
|
|
| 104 |
)
|
| 105 |
progress(0, desc="Loading audio..")
|
| 106 |
|
modules/whisper/whisper_base.py
CHANGED
|
@@ -91,12 +91,38 @@ class WhisperBase(ABC):
|
|
| 91 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
| 92 |
params.lang = language_code_dict[params.lang]
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
result, elapsed_time = self.transcribe(
|
| 95 |
audio,
|
| 96 |
progress,
|
| 97 |
*astuple(params)
|
| 98 |
)
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
if params.is_diarize:
|
| 101 |
result, elapsed_time_diarization = self.diarizer.run(
|
| 102 |
audio=audio,
|
|
|
|
| 91 |
language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
|
| 92 |
params.lang = language_code_dict[params.lang]
|
| 93 |
|
| 94 |
+
speech_chunks = None
|
| 95 |
+
if params.vad_filter:
|
| 96 |
+
# Explicit value set for float('inf') from gr.Number()
|
| 97 |
+
if params.max_speech_duration_s >= 9999:
|
| 98 |
+
params.max_speech_duration_s = float('inf')
|
| 99 |
+
|
| 100 |
+
vad_options = VadOptions(
|
| 101 |
+
threshold=params.threshold,
|
| 102 |
+
min_speech_duration_ms=params.min_speech_duration_ms,
|
| 103 |
+
max_speech_duration_s=params.max_speech_duration_s,
|
| 104 |
+
min_silence_duration_ms=params.min_silence_duration_ms,
|
| 105 |
+
speech_pad_ms=params.speech_pad_ms
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
audio, speech_chunks = self.vad.run(
|
| 109 |
+
audio=audio,
|
| 110 |
+
vad_parameters=vad_options,
|
| 111 |
+
progress=progress
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
result, elapsed_time = self.transcribe(
|
| 115 |
audio,
|
| 116 |
progress,
|
| 117 |
*astuple(params)
|
| 118 |
)
|
| 119 |
|
| 120 |
+
if params.vad_filter:
|
| 121 |
+
result = self.vad.restore_speech_timestamps(
|
| 122 |
+
segments=result,
|
| 123 |
+
speech_chunks=speech_chunks,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
if params.is_diarize:
|
| 127 |
result, elapsed_time_diarization = self.diarizer.run(
|
| 128 |
audio=audio,
|