Porjaz committed
Commit 433d102
Parent: 01b727f

Update custom_interface_app.py

Files changed (1): custom_interface_app.py (+16 -19)
custom_interface_app.py CHANGED
@@ -130,9 +130,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -142,15 +139,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
@@ -171,14 +169,14 @@ class ASR(Pretrained):
             outputs = self.encode_batch_w2v2(device, batch, rel_length)
             yield outputs

+
+

     def classify_file_whisper_mkd(self, path, device):
         # Load the audio file
+        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -188,22 +186,23 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)

             # Fake a batch for the segment
             batch = segment_tensor.unsqueeze(0).to(device)
-            rel_length = torch.tensor([1.0]).to(device)  # Adjust if necessary
+            rel_length = torch.tensor([1.0]).to(device)

             # Pass the segment through the ASR model
             segment_output = self.encode_batch_whisper(device, batch, rel_length)
@@ -228,9 +227,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -240,15 +236,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
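
The change is the same in all three classify_file_* methods. The old loop computed end = start + max_duration * sr even though max_duration is already expressed in samples (20 * sr), and advanced the cursor with start = start + end, so the first slice spanned the entire file and every subsequent slice landed past the end of the waveform. The committed version clamps end to len(waveform) and moves start = end after each appended chunk. Below is a minimal standalone sketch of that corrected chunking logic; the helper name split_into_segments and the synthetic waveform are illustrative assumptions, not part of the app, which loads real audio via librosa.load(path, sr=16000).

import numpy as np

def split_into_segments(waveform, sr=16000, max_seconds=20):
    # Same logic as the committed loop, lifted out of the ASR class:
    # chunks of at most max_seconds, with sub-second tails dropped.
    max_duration = max_seconds * sr  # maximum segment length in samples
    num_segments = int(np.ceil(len(waveform) / max_duration))
    segments = []
    start = 0
    for _ in range(num_segments):
        end = min(start + max_duration, len(waveform))  # clamp at the end, as in the fix
        segment_part = waveform[start:end]
        if len(segment_part) / sr < 1:
            continue  # skip sub-second tails, matching the committed code
        segments.append(segment_part)
        start = end  # advance the cursor; the step the old loop got wrong
    return segments

# 45 s of silence at 16 kHz -> two 20 s chunks plus one 5 s tail
print([len(c) / 16000 for c in split_into_segments(np.zeros(45 * 16000, dtype=np.float32))])
# [20.0, 20.0, 5.0]

One quirk is carried over faithfully: when a sub-second tail is skipped, start is not advanced past it, but since such a tail can only occur on the final iteration this has no effect on the output.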