Update custom_interface_app.py
Browse files- custom_interface_app.py +12 -12
custom_interface_app.py
CHANGED
@@ -126,17 +126,17 @@ class ASR(Pretrained):
|
|
126 |
return waveform
|
127 |
|
128 |
|
129 |
-
def classify_file_w2v2(self,
|
130 |
# Load the audio file
|
131 |
-
waveform, sr = librosa.load(path, sr=16000)
|
132 |
|
133 |
# Get audio length in seconds
|
134 |
-
audio_length = len(waveform) /
|
135 |
|
136 |
if audio_length >= 20:
|
137 |
# split audio every 20 seconds
|
138 |
segments = []
|
139 |
-
max_duration = 20 *
|
140 |
num_segments = int(np.ceil(len(waveform) / max_duration))
|
141 |
start = 0
|
142 |
for i in range(num_segments):
|
@@ -144,7 +144,7 @@ class ASR(Pretrained):
|
|
144 |
if end > len(waveform):
|
145 |
end = len(waveform)
|
146 |
segment_part = waveform[start:end]
|
147 |
-
segment_len = len(segment_part) /
|
148 |
if segment_len < 1:
|
149 |
continue
|
150 |
segments.append(segment_part)
|
@@ -216,23 +216,23 @@ class ASR(Pretrained):
|
|
216 |
|
217 |
|
218 |
|
219 |
-
def classify_file_whisper(self,
|
220 |
-
waveform, sr = librosa.load(path, sr=16000)
|
221 |
transcription = pipe(waveform, generate_kwargs={"language": "macedonian"})["text"]
|
222 |
return transcription
|
223 |
|
224 |
|
225 |
-
def classify_file_mms(self,
|
226 |
# Load the audio file
|
227 |
-
waveform, sr = librosa.load(path, sr=16000)
|
228 |
|
229 |
# Get audio length in seconds
|
230 |
-
audio_length = len(waveform) /
|
231 |
|
232 |
if audio_length >= 20:
|
233 |
# split audio every 20 seconds
|
234 |
segments = []
|
235 |
-
max_duration = 20 *
|
236 |
num_segments = int(np.ceil(len(waveform) / max_duration))
|
237 |
start = 0
|
238 |
for i in range(num_segments):
|
@@ -240,7 +240,7 @@ class ASR(Pretrained):
|
|
240 |
if end > len(waveform):
|
241 |
end = len(waveform)
|
242 |
segment_part = waveform[start:end]
|
243 |
-
segment_len = len(segment_part) /
|
244 |
if segment_len < 1:
|
245 |
continue
|
246 |
segments.append(segment_part)
|
|
|
126 |
return waveform
|
127 |
|
128 |
|
129 |
+
def classify_file_w2v2(self, waveform, device):
|
130 |
# Load the audio file
|
131 |
+
# waveform, sr = librosa.load(path, sr=16000)
|
132 |
|
133 |
# Get audio length in seconds
|
134 |
+
audio_length = len(waveform) / 16000
|
135 |
|
136 |
if audio_length >= 20:
|
137 |
# split audio every 20 seconds
|
138 |
segments = []
|
139 |
+
max_duration = 20 * 16000 # Maximum segment duration in samples (20 seconds)
|
140 |
num_segments = int(np.ceil(len(waveform) / max_duration))
|
141 |
start = 0
|
142 |
for i in range(num_segments):
|
|
|
144 |
if end > len(waveform):
|
145 |
end = len(waveform)
|
146 |
segment_part = waveform[start:end]
|
147 |
+
segment_len = len(segment_part) / 16000
|
148 |
if segment_len < 1:
|
149 |
continue
|
150 |
segments.append(segment_part)
|
|
|
216 |
|
217 |
|
218 |
|
219 |
+
def classify_file_whisper(self, waveform, pipe, device):
    """Transcribe a pre-loaded waveform with a Whisper ASR pipeline.

    Arguments
    ---------
    waveform : array-like
        Audio samples; assumed to already be mono 16 kHz (loading and
        resampling are done by the caller -- TODO confirm).
    pipe : callable
        Hugging Face ASR pipeline; invoked with the waveform plus
        generation kwargs and expected to return a dict holding the
        transcription under the "text" key.
    device : object
        Unused here; kept for signature parity with the sibling
        classify_file_* methods.

    Returns
    -------
    str
        The transcribed text, forced to Macedonian decoding.
    """
    # waveform, sr = librosa.load(path, sr=16000)
    result = pipe(waveform, generate_kwargs={"language": "macedonian"})
    return result["text"]
|
223 |
|
224 |
|
225 |
+
def classify_file_mms(self, waveform, processor, model, device):
|
226 |
# Load the audio file
|
227 |
+
# waveform, sr = librosa.load(path, sr=16000)
|
228 |
|
229 |
# Get audio length in seconds
|
230 |
+
audio_length = len(waveform) / 16000
|
231 |
|
232 |
if audio_length >= 20:
|
233 |
# split audio every 20 seconds
|
234 |
segments = []
|
235 |
+
max_duration = 20 * 16000 # Maximum segment duration in samples (20 seconds)
|
236 |
num_segments = int(np.ceil(len(waveform) / max_duration))
|
237 |
start = 0
|
238 |
for i in range(num_segments):
|
|
|
240 |
if end > len(waveform):
|
241 |
end = len(waveform)
|
242 |
segment_part = waveform[start:end]
|
243 |
+
segment_len = len(segment_part) / 16000
|
244 |
if segment_len < 1:
|
245 |
continue
|
246 |
segments.append(segment_part)
|