Transcribe non-speech areas too in "silero-vad"

The old behaviour is now available as "silero-vad-skip-gaps". Transcribing the gaps may introduce more noise into the transcript, but some of it will be correct as well.
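To make the difference between the two modes concrete, here is a small illustrative sketch. The timestamps are invented; the actual gap handling is done by the new `include_gaps` method in vad.py, shown further down.

```python
# Hypothetical Silero VAD output for a 60-second file (values made up for illustration)
speech_segments = [
    {'start': 10.0, 'end': 20.0},
    {'start': 40.0, 'end': 50.0},
]

# "silero-vad-skip-gaps" (the previous behaviour): only the detected speech is sent to Whisper.
#   -> 10-20s, 40-50s
#
# "silero-vad" (this change): gaps of at least min_gap_length seconds are also sent to Whisper,
# marked with 'gap': True so they can later be handled with different parameters.
#   -> 0-10s (gap), 10-20s, 20-40s (gap), 40-50s, 50-60s (gap)
```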
app.py CHANGED

```diff
@@ -72,10 +72,17 @@ class UI:
 
         # The results
         if (vad == 'silero-vad'):
-            # Use Silero VAD
+            # Use Silero VAD and include gaps
             if (self.vad_model is None):
-                self.vad_model = VadSileroTranscription()
+                self.vad_model = VadSileroTranscription(transcribe_non_speech = True)
             result = self.vad_model.transcribe(source, whisperCallable)
+        elif (vad == 'silero-vad-skip-gaps'):
+            # Use Silero VAD
+            if (self.vad_model is None):
+                self.vad_model = VadSileroTranscription(transcribe_non_speech = True)
+
+            skip_gaps = VadSileroTranscription(transcribe_non_speech = False, copy=self.vad_model)
+            result = skip_gaps.transcribe(source, whisperCallable)
         elif (vad == 'periodic-vad'):
             # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
             # it may create a break in the middle of a sentence, causing some artifacts.
@@ -184,7 +191,7 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Audio(source="upload", type="filepath", label="Upload Audio"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
-        gr.Dropdown(choices=["none", "silero-vad", "periodic-vad"], label="VAD"),
+        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
```
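Note that the "silero-vad-skip-gaps" branch reuses the cached instance through the new `copy` parameter, so the Silero model is only loaded from torch.hub once. A condensed view of what the branch above does, using the same names as in the diff:

```python
# First construction loads the Silero VAD model via torch.hub.load (slow, done once and cached).
self.vad_model = VadSileroTranscription(transcribe_non_speech = True)

# The skip-gaps wrapper shares the already-loaded model and speech-timestamp function,
# but only transcribes the detected speech segments.
skip_gaps = VadSileroTranscription(transcribe_non_speech = False, copy=self.vad_model)
result = skip_gaps.transcribe(source, whisperCallable)
```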
vad.py CHANGED

```diff
@@ -9,19 +9,24 @@ import torch
 import ffmpeg
 import numpy as np
 
-
+from utils import format_timestamp
+
+# Defaults for Silero
 SPEECH_TRESHOLD = 0.3
 MAX_SILENT_PERIOD = 10 # seconds
 SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
-SEGMENT_PADDING_RIGHT =
+SEGMENT_PADDING_RIGHT = 3 # End detected segments late
 
+# Whether to attempt to transcribe non-speech
+TRANSCRIBE_NON_SPEECH = False
 
 class AbstractTranscription(ABC):
-    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None):
+    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
         self.max_silent_period = max_silent_period
+        self.transcribe_non_speech = transcribe_non_speech
 
     def get_audio_segment(self, str, start_time: str = None, duration: str = None):
         return load_audio(str, self.sampling_rate, start_time, duration)
@@ -68,6 +73,13 @@ class AbstractTranscription(ABC):
         print("Timestamps:")
         pprint(merged)
 
+        if self.transcribe_non_speech:
+            max_audio_duration = float(ffmpeg.probe(audio)["format"]["duration"])
+            merged = self.include_gaps(merged, min_gap_length=5, total_duration=max_audio_duration)
+
+            print("Transcribing non-speech:")
+            pprint(merged)
+
         result = {
             'text': "",
             'segments': [],
@@ -78,12 +90,19 @@ class AbstractTranscription(ABC):
         # For each time segment, run whisper
         for segment in merged:
             segment_start = segment['start']
-            segment_duration = segment['end'] - segment_start
+            segment_end = segment['end']
+            segment_gap = segment.get('gap', False)
+
+            segment_duration = segment_end - segment_start
 
             segment_audio = self.get_audio_segment(audio, start_time = str(segment_start) + "s", duration = str(segment_duration) + "s")
 
-            print("Running whisper
-            segment_result = whisperCallable(segment_audio)
+            print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ", segment_duration, "gap: ", segment_gap)
+            if segment_gap:
+                # TODO: Use different parameters for these segments, as they are less likely to contain speech
+                segment_result = whisperCallable(segment_audio)
+            else:
+                segment_result = whisperCallable(segment_audio)
             adjusted_segments = self.adjust_whisper_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
 
             # Append to output
@@ -98,6 +117,32 @@ class AbstractTranscription(ABC):
 
         return result
 
+    def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
+        result = []
+        last_end_time = 0
+
+        for segment in segments:
+            segment_start = float(segment['start'])
+            segment_end = float(segment['end'])
+
+            if (last_end_time != segment_start):
+                delta = segment_start - last_end_time
+
+                if (min_gap_length is None or delta >= min_gap_length):
+                    result.append( { 'start': last_end_time, 'end': segment_start, 'gap': True } )
+
+            last_end_time = segment_end
+            result.append(segment)
+
+        # Also include total duration if specified
+        if (total_duration is not None and last_end_time < total_duration):
+            delta = total_duration - segment_start
+
+            if (min_gap_length is None or delta >= min_gap_length):
+                result.append( { 'start': last_end_time, 'end': total_duration, 'gap': True } )
+
+        return result
+
     def adjust_whisper_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
         result = []
 
@@ -178,11 +223,15 @@ class AbstractTranscription(ABC):
         return result
 
 class VadSileroTranscription(AbstractTranscription):
-    def __init__(self):
-        super().__init__(SEGMENT_PADDING_LEFT, SEGMENT_PADDING_RIGHT, MAX_SILENT_PERIOD)
-
-        self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
-        (self.get_speech_timestamps, _, _, _, _) = utils
+    def __init__(self, transcribe_non_speech: bool = False, copy = None):
+        super().__init__(SEGMENT_PADDING_LEFT, SEGMENT_PADDING_RIGHT, MAX_SILENT_PERIOD, transcribe_non_speech)
+
+        if copy:
+            self.model = copy.model
+            self.get_speech_timestamps = copy.get_speech_timestamps
+        else:
+            self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+            (self.get_speech_timestamps, _, _, _, _) = utils
 
     def get_transcribe_timestamps(self, audio: str):
         wav = self.get_audio_segment(audio)
```
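As a sanity check on the new gap handling, here is a small standalone sketch of what `include_gaps` produces. The function below is a reimplementation for illustration only (the helper name `include_gaps_sketch` and the timestamps are invented, and the trailing gap is measured here from the end of the last speech segment):

```python
from pprint import pprint

def include_gaps_sketch(segments, min_gap_length, total_duration):
    # Mirrors AbstractTranscription.include_gaps from the diff above: insert 'gap' segments
    # between (and around) the detected speech segments, skipping gaps shorter than
    # min_gap_length seconds.
    result = []
    last_end_time = 0

    for segment in segments:
        segment_start = float(segment['start'])
        segment_end = float(segment['end'])

        if last_end_time != segment_start:
            delta = segment_start - last_end_time
            if min_gap_length is None or delta >= min_gap_length:
                result.append({'start': last_end_time, 'end': segment_start, 'gap': True})

        last_end_time = segment_end
        result.append(segment)

    # Trailing gap up to the end of the audio, if long enough
    if total_duration is not None and last_end_time < total_duration:
        delta = total_duration - last_end_time
        if min_gap_length is None or delta >= min_gap_length:
            result.append({'start': last_end_time, 'end': total_duration, 'gap': True})

    return result

# Hypothetical speech timestamps for a 60-second file
speech = [{'start': 10.0, 'end': 20.0}, {'start': 40.0, 'end': 50.0}]
pprint(include_gaps_sketch(speech, min_gap_length=5, total_duration=60.0))
# [{'end': 10.0, 'gap': True, 'start': 0},
#  {'end': 20.0, 'start': 10.0},
#  {'end': 40.0, 'gap': True, 'start': 20.0},
#  {'end': 50.0, 'start': 40.0},
#  {'end': 60.0, 'gap': True, 'start': 50.0}]
```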