Update chunkedTranscriber.py
Browse files- chunkedTranscriber.py +16 -9
chunkedTranscriber.py
CHANGED
@@ -6,15 +6,18 @@ import torch
|
|
6 |
import spaces
|
7 |
import torchaudio
|
8 |
import numpy as np
|
9 |
-
|
10 |
-
|
|
|
11 |
from dotenv import load_dotenv
|
12 |
load_dotenv()
|
|
|
|
|
13 |
from difflib import SequenceMatcher
|
14 |
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor, Wav2Vec2ForCTC, AutoProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
|
15 |
-
from difflib import SequenceMatcher
|
16 |
-
import logging
|
17 |
|
|
|
|
|
18 |
logging.basicConfig(
|
19 |
level=logging.INFO,
|
20 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
@@ -38,7 +41,7 @@ class ChunkedTranscriber:
|
|
38 |
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
|
39 |
return pipeline
|
40 |
|
41 |
-
@spaces.GPU(duration=
|
42 |
def diarize_audio(self, audio_path):
|
43 |
"""
|
44 |
Perform speaker diarization on the input audio.
|
@@ -53,7 +56,7 @@ class ChunkedTranscriber:
|
|
53 |
return processor, model
|
54 |
|
55 |
|
56 |
-
@spaces.GPU(duration=
|
57 |
def language_identification(self, model, processor, chunk, device="cuda"):
|
58 |
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
|
59 |
model.to(device)
|
@@ -77,7 +80,7 @@ class ChunkedTranscriber:
|
|
77 |
return model, processor
|
78 |
|
79 |
|
80 |
-
@spaces.GPU(duration=
|
81 |
def mms_transcription(self, model, processor, chunk, device="cuda"):
|
82 |
|
83 |
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
|
@@ -102,7 +105,7 @@ class ChunkedTranscriber:
|
|
102 |
return model, tokenizer
|
103 |
|
104 |
|
105 |
-
@spaces.GPU(duration=
|
106 |
def text2text_translation(self, translation_model, translation_tokenizer, transcript, device="cuda"):
|
107 |
# model, tokenizer = load_translation_model()
|
108 |
|
@@ -336,6 +339,10 @@ class ChunkedTranscriber:
|
|
336 |
return translation
|
337 |
|
338 |
|
|
|
|
|
|
|
|
|
339 |
|
340 |
def transcribe_audio(self, audio_path, translate=False):
|
341 |
"""
|
@@ -353,7 +360,7 @@ class ChunkedTranscriber:
|
|
353 |
'end_time': turn.end,
|
354 |
'speaker': speaker
|
355 |
})
|
356 |
-
|
357 |
audio = self.load_audio(audio_path)
|
358 |
chunks = self.preprocess_audio(audio)
|
359 |
|
|
|
6 |
import spaces
|
7 |
import torchaudio
|
8 |
import numpy as np
|
9 |
+
|
10 |
+
|
11 |
+
from df.enhance import enhance, init_df
|
12 |
from dotenv import load_dotenv
|
13 |
load_dotenv()
|
14 |
+
from scipy.signal import resample
|
15 |
+
from pyannote.audio import Pipeline
|
16 |
from difflib import SequenceMatcher
|
17 |
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor, Wav2Vec2ForCTC, AutoProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
18 |
|
19 |
+
|
20 |
+
import logging
|
21 |
logging.basicConfig(
|
22 |
level=logging.INFO,
|
23 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
|
|
41 |
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
|
42 |
return pipeline
|
43 |
|
44 |
+
@spaces.GPU(duration=180)
|
45 |
def diarize_audio(self, audio_path):
|
46 |
"""
|
47 |
Perform speaker diarization on the input audio.
|
|
|
56 |
return processor, model
|
57 |
|
58 |
|
59 |
+
@spaces.GPU(duration=180)
|
60 |
def language_identification(self, model, processor, chunk, device="cuda"):
|
61 |
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
|
62 |
model.to(device)
|
|
|
80 |
return model, processor
|
81 |
|
82 |
|
83 |
+
@spaces.GPU(duration=180)
|
84 |
def mms_transcription(self, model, processor, chunk, device="cuda"):
|
85 |
|
86 |
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
|
|
|
105 |
return model, tokenizer
|
106 |
|
107 |
|
108 |
+
@spaces.GPU(duration=180)
|
109 |
def text2text_translation(self, translation_model, translation_tokenizer, transcript, device="cuda"):
|
110 |
# model, tokenizer = load_translation_model()
|
111 |
|
|
|
339 |
return translation
|
340 |
|
341 |
|
342 |
+
def audio_denoising(self, noisy_audio):
    """
    Suppress background noise in an audio signal with DeepFilterNet.

    Args:
        noisy_audio: Audio to enhance; passed unchanged to ``enhance``.
            NOTE(review): DeepFilterNet presumably expects a torch tensor
            sampled at the model's own rate -- confirm against the caller.

    Returns:
        The enhanced audio exactly as returned by ``enhance``.
    """
    # init_df() yields additional values after (model, df_state) in the
    # published DeepFilterNet API; star-unpack so the extras are discarded
    # instead of raising "too many values to unpack".
    model, df_state, *_ = init_df()
    return enhance(model, df_state, noisy_audio)
|
346 |
|
347 |
def transcribe_audio(self, audio_path, translate=False):
|
348 |
"""
|
|
|
360 |
'end_time': turn.end,
|
361 |
'speaker': speaker
|
362 |
})
|
363 |
+
|
364 |
audio = self.load_audio(audio_path)
|
365 |
chunks = self.preprocess_audio(audio)
|
366 |
|