Update custom_interface_app.py

custom_interface_app.py (+16 -19) CHANGED
@@ -130,9 +130,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -142,15 +139,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
@@ -171,14 +169,14 @@ class ASR(Pretrained):
         outputs = self.encode_batch_w2v2(device, batch, rel_length)
         yield outputs
 
+
+
 
     def classify_file_whisper_mkd(self, path, device):
         # Load the audio file
+        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -188,22 +186,23 @@ class ASR(Pretrained):
         max_duration = 20 * sr # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
 
             # Fake a batch for the segment
             batch = segment_tensor.unsqueeze(0).to(device)
-        rel_length = torch.tensor([1.0]).to(device)
+            rel_length = torch.tensor([1.0]).to(device)
 
             # Pass the segment through the ASR model
             segment_output = self.encode_batch_whisper(device, batch, rel_length)
@@ -228,9 +227,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -240,15 +236,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
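For reference, the segmentation fix is the same in all three updated classify_file_* methods, and can be read as a small standalone helper. The sketch below is a minimal reconstruction under the diff's assumptions (16 kHz mono audio as returned by librosa.load(path, sr=16000), a 20-second cap, sub-second tails skipped); the helper name split_waveform is illustrative and not part of the file.

import numpy as np

def split_waveform(waveform, sr=16000, max_seconds=20, min_seconds=1):
    # Chunk a 1-D waveform into consecutive segments of at most
    # max_seconds; a trailing segment shorter than min_seconds is
    # dropped, mirroring the `continue` in the updated loop.
    max_duration = max_seconds * sr  # segment length in samples
    num_segments = int(np.ceil(len(waveform) / max_duration))
    segments = []
    start = 0
    for _ in range(num_segments):
        end = min(start + max_duration, len(waveform))
        segment_part = waveform[start:end]
        if len(segment_part) / sr >= min_seconds:
            segments.append(segment_part)
        start = end
    return segments

Each returned segment is then run through the model as a single-item batch, as the updated methods do: torch.tensor(segment).unsqueeze(0).to(device) with rel_length = torch.tensor([1.0]), since each batch holds exactly one full-length segment.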