Porjaz committed
Commit 01b727f
1 Parent(s): bbad3d5

Update custom_interface_app.py

Files changed (1)
  1. custom_interface_app.py +39 -94
custom_interface_app.py CHANGED
@@ -128,7 +128,6 @@ class ASR(Pretrained):
 
     def classify_file_w2v2(self, path, device):
         # Load the audio file
-        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)
 
         # increase the volume if needed
@@ -138,42 +137,22 @@ class ASR(Pretrained):
         audio_length = len(waveform) / sr
 
         if audio_length >= 20:
-            print(f"Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
-
+            # split audio every 20 seconds
             segments = []
-            current_segment = []
-            current_length = 0
             max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
-
-
-            for interval in non_silent_intervals:
-                start, end = interval
+            num_segments = int(np.ceil(len(waveform) / max_duration))
+            start = 0
+            end = 0
+            for i in range(num_segments):
+                start = start + end
+                end = start + max_duration * sr
                 segment_part = waveform[start:end]
+                segment_len = len(segment_part) / sr
+                if segment_len < 1:
+                    continue
+                segments.append(segment_part)
 
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
-
-            # Process each segment
-            outputs = []
-            for i, segment in enumerate(segments):
-                print(f"Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
-
-                # import soundfile as sf
-                # sf.write(f"outputs/segment_{i}.wav", segment, sr)
-
+            for segment in segments:
                 segment_tensor = torch.tensor(segment).to(device)
 
                 # Fake a batch for the segment
@@ -195,7 +174,6 @@ class ASR(Pretrained):
 
     def classify_file_whisper_mkd(self, path, device):
         # Load the audio file
-        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)
 
         # increase the volume if needed
@@ -205,42 +183,22 @@ class ASR(Pretrained):
         audio_length = len(waveform) / sr
 
         if audio_length >= 20:
-            print(f"Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
-
+            # split audio every 20 seconds
             segments = []
-            current_segment = []
-            current_length = 0
             max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
-
-
-            for interval in non_silent_intervals:
-                start, end = interval
+            num_segments = int(np.ceil(len(waveform) / max_duration))
+            start = 0
+            end = 0
+            for i in range(num_segments):
+                start = start + end
+                end = start + max_duration * sr
                 segment_part = waveform[start:end]
+                segment_len = len(segment_part) / sr
+                if segment_len < 1:
+                    continue
+                segments.append(segment_part)
 
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
-
-            # Process each segment
-            outputs = []
-            for i, segment in enumerate(segments):
-                print(f"Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
-
-                # import soundfile as sf
-                # sf.write(f"outputs/segment_{i}.wav", segment, sr)
-
+            for segment in segments:
                 segment_tensor = torch.tensor(segment).to(device)
 
                 # Fake a batch for the segment
@@ -270,42 +228,29 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)
 
+        # increase the volume if needed
+        # waveform = self.increase_volume(waveform)
+
         # Get audio length in seconds
        audio_length = len(waveform) / sr
 
         if audio_length >= 20:
-            print(f"MMS Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
-
+            # split audio every 20 seconds
             segments = []
-            current_segment = []
-            current_length = 0
             max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
-
-
-            for interval in non_silent_intervals:
-                start, end = interval
+            num_segments = int(np.ceil(len(waveform) / max_duration))
+            start = 0
+            end = 0
+            for i in range(num_segments):
+                start = start + end
+                end = start + max_duration * sr
                 segment_part = waveform[start:end]
+                segment_len = len(segment_part) / sr
+                if segment_len < 1:
+                    continue
+                segments.append(segment_part)
 
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
-
-            # Process each segment
-            outputs = []
-            for i, segment in enumerate(segments):
-                print(f"MMS Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
-
+            for segment in segments:
                 segment_tensor = torch.tensor(segment).to(device)
 
                 # Pass the segment through the ASR model
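Note on the new chunking loop: as committed, start = start + end compounds the previous end offset, and end = start + max_duration * sr multiplies max_duration (already 20 * sr samples) by sr a second time, so the first slice spans the whole file and every later slice is empty. A minimal sketch of the fixed-window split that the "# split audio every 20 seconds" comment describes, keeping the same 20-second window and 1-second minimum; the helper name and example file name are illustrative, not lines from the file:

    import numpy as np
    import librosa

    def split_every_20s(waveform, sr, window_s=20, min_s=1.0):
        # Hypothetical helper sketching the intended fixed-window split;
        # not part of the committed file.
        max_duration = window_s * sr                    # window length in samples
        num_segments = int(np.ceil(len(waveform) / max_duration))
        segments = []
        for i in range(num_segments):
            start = i * max_duration                    # advance one window per step
            end = start + max_duration                  # already in samples; no extra * sr
            segment_part = waveform[start:end]
            if len(segment_part) / sr < min_s:          # drop sub-second tails
                continue
            segments.append(segment_part)
        return segments

    waveform, sr = librosa.load("example.wav", sr=16000)  # any 16 kHz input
    segments = split_every_20s(waveform, sr)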
 
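The per-segment decoding that follows each hunk ("Fake a batch for the segment" / "Pass the segment through the ASR model") sits outside the changed lines, so the model call itself does not appear above. A short sketch of the usual SpeechBrain-style pattern; encode_batch and the relative-length convention are assumptions, not lines from this file:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    for segment in segments:                               # segments from the split above
        segment_tensor = torch.tensor(segment).to(device)  # 1-D float waveform
        batch = segment_tensor.unsqueeze(0)                # fake a batch: shape (1, num_samples)
        rel_length = torch.tensor([1.0], device=device)    # full relative length
        # outputs = self.encode_batch(batch, rel_length)   # assumed entry point; the actual
        #                                                  # invocation is outside this diff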