Macedonian-ASR
/

buki-wav2vec2-2.0

Automatic Speech Recognition

speechbrain

Macedonian

Model card Files Files and versions Community

Porjaz committed on Jan 10

Commit

d22911d

verified ·

1 Parent(s): a353d95

Update custom_interface_app.py

Browse files

Files changed (1) hide show

custom_interface_app.py +51 -52

custom_interface_app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import torch
 from speechbrain.inference.interfaces import Pretrained
 import librosa
 import numpy as np
 class ASR(Pretrained):
@@ -85,69 +86,66 @@ class ASR(Pretrained):
                     seq.append(token)
             output = []
         return seq
-    def increase_volume(self, waveform, threshold_db=-25):
-        # Measure loudness using RMS
-        loudness_vector = librosa.feature.rms(y=waveform)
-        average_loudness = np.mean(loudness_vector)
-        average_loudness_db = librosa.amplitude_to_db(average_loudness)
-        print(f"Average Loudness: {average_loudness_db} dB")
-        # Check if loudness is below threshold and apply gain if needed
-        if average_loudness_db < threshold_db:
-            # Calculate gain needed
-            gain_db = threshold_db - average_loudness_db
-            gain = librosa.db_to_amplitude(gain_db)  # Convert dB to amplitude factor
-            # Apply gain to the audio signal
-            waveform = waveform * gain
-            loudness_vector = librosa.feature.rms(y=waveform)
-            average_loudness = np.mean(loudness_vector)
-            average_loudness_db = librosa.amplitude_to_db(average_loudness)
-            print(f"Average Loudness: {average_loudness_db} dB")
-        return waveform
-    def classify_file_w2v2(self, waveform, device):
         # Get audio length in seconds
         sr = 16000
         audio_length = len(waveform) / sr
-        if audio_length >= 30:
             print(f"Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
             segments = []
-            current_segment = []
-            current_length = 0
-            max_duration = 30 * sr  # Maximum segment duration in samples (20 seconds)
-            for interval in non_silent_intervals:
-                start, end = interval
-                segment_part = waveform[start:end]
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
             # Process each segment
             outputs = []
             for i, segment in enumerate(segments):
                 print(f"Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
                 # import soundfile as sf
@@ -164,12 +162,13 @@ class ASR(Pretrained):
                 # outputs.append(result)
                 yield result
         else:
-            waveform = torch.tensor(waveform).to(device)
             waveform = waveform.to(device)
-            # Fake a batch:
-            batch = waveform.unsqueeze(0)
             rel_length = torch.tensor([1.0]).to(device)
-            outputs = " ".join(self.encode_batch_w2v2(device, batch, rel_length)[0])
             yield outputs

 from speechbrain.inference.interfaces import Pretrained
 import librosa
 import numpy as np
+import torchaudio
 class ASR(Pretrained):
                     seq.append(token)
             output = []
         return seq
+    def classify_file_w2v2(self, file, vad_model, device):
         # Get audio length in seconds
         sr = 16000
+        max_segment_length = 30
+        # waveform, sr = librosa.load(file, sr=sr)
+        waveform, file_sr = torchaudio.load(file)
+        # resample if not 16kHz
+        if file_sr != sr:
+            waveform = torchaudio.transforms.Resample(file_sr, sr)(waveform)
+        waveform = waveform.squeeze()
         audio_length = len(waveform) / sr
+        print(f"Audio length: {audio_length:.2f} seconds")
+        if audio_length >= max_segment_length:
             print(f"Audio is too long ({audio_length:.2f} seconds), splitting into segments")
+            # save waveform temporarily
+            torchaudio.save("temp.wav", waveform.unsqueeze(0), sr)
+            # get boundaries based on VAD
+            boundaries = vad_model.get_speech_segments("temp.wav",
+                                                large_chunk_size=30,
+                                                small_chunk_size=10,
+                                                apply_energy_VAD=True,
+                                                double_check=True)
+            # remove temp file
+            os.remove("temp.wav")
+            # Merge the segments to max max_segment_length
             segments = []
+            current_start = boundaries[0][0].item()
+            current_end = boundaries[0][1].item()
+            for i in range(1, len(boundaries)):
+                next_start = boundaries[i][0].item()
+                next_end = boundaries[i][1].item()
+                # Check if the current segment can merge with the next segment
+                if (current_end - current_start) + (next_end - next_start) <= max_segment_length:
+                    # Extend the current segment
+                    current_end = next_end
+                else:
+                    # Add the current segment to the result and start a new one
+                    segments.append([current_start, current_end])
+                    current_start = next_start
+                    current_end = next_end
+            # Add the last segment
+            segments.append([current_start, current_end])
             # Process each segment
             outputs = []
             for i, segment in enumerate(segments):
+                start, end = segment
+                start = int(start * sr)
+                end = int(end * sr)
+                segment = waveform[start:end]
                 print(f"Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
                 # import soundfile as sf
                 # outputs.append(result)
                 yield result
         else:
+            waveform, file_sr = torchaudio.load(file)
+            # resample if not 16kHz
+            if file_sr != sr:
+                waveform = torchaudio.transforms.Resample(file_sr, sr)(waveform)
             waveform = waveform.to(device)
             rel_length = torch.tensor([1.0]).to(device)
+            outputs = " ".join(self.encode_batch_w2v2(device, waveform, rel_length)[0])
             yield outputs