Kr08 committed on
Commit
73c774a
·
verified ·
1 Parent(s): 54feb59

Update chunkedTranscriber.py

Browse files
Files changed (1) hide show
  1. chunkedTranscriber.py +16 -9
chunkedTranscriber.py CHANGED
@@ -6,15 +6,18 @@ import torch
6
  import spaces
7
  import torchaudio
8
  import numpy as np
9
- from scipy.signal import resample
10
- from pyannote.audio import Pipeline
 
11
  from dotenv import load_dotenv
12
  load_dotenv()
 
 
13
  from difflib import SequenceMatcher
14
  from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor, Wav2Vec2ForCTC, AutoProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
15
- from difflib import SequenceMatcher
16
- import logging
17
 
 
 
18
  logging.basicConfig(
19
  level=logging.INFO,
20
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
@@ -38,7 +41,7 @@ class ChunkedTranscriber:
38
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
39
  return pipeline
40
 
41
- @spaces.GPU(duration=60)
42
  def diarize_audio(self, audio_path):
43
  """
44
  Perform speaker diarization on the input audio.
@@ -53,7 +56,7 @@ class ChunkedTranscriber:
53
  return processor, model
54
 
55
 
56
- @spaces.GPU(duration=60)
57
  def language_identification(self, model, processor, chunk, device="cuda"):
58
  inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
59
  model.to(device)
@@ -77,7 +80,7 @@ class ChunkedTranscriber:
77
  return model, processor
78
 
79
 
80
- @spaces.GPU(duration=60)
81
  def mms_transcription(self, model, processor, chunk, device="cuda"):
82
 
83
  inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
@@ -102,7 +105,7 @@ class ChunkedTranscriber:
102
  return model, tokenizer
103
 
104
 
105
- @spaces.GPU(duration=60)
106
  def text2text_translation(self, translation_model, translation_tokenizer, transcript, device="cuda"):
107
  # model, tokenizer = load_translation_model()
108
 
@@ -336,6 +339,10 @@ class ChunkedTranscriber:
336
  return translation
337
 
338
 
 
 
 
 
339
 
340
  def transcribe_audio(self, audio_path, translate=False):
341
  """
@@ -353,7 +360,7 @@ class ChunkedTranscriber:
353
  'end_time': turn.end,
354
  'speaker': speaker
355
  })
356
-
357
  audio = self.load_audio(audio_path)
358
  chunks = self.preprocess_audio(audio)
359
 
 
6
  import spaces
7
  import torchaudio
8
  import numpy as np
9
+
10
+
11
+ from df.enhance import enhance, init_df
12
  from dotenv import load_dotenv
13
  load_dotenv()
14
+ from scipy.signal import resample
15
+ from pyannote.audio import Pipeline
16
  from difflib import SequenceMatcher
17
  from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor, Wav2Vec2ForCTC, AutoProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
 
 
18
 
19
+
20
+ import logging
21
  logging.basicConfig(
22
  level=logging.INFO,
23
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 
41
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
42
  return pipeline
43
 
44
+ @spaces.GPU(duration=180)
45
  def diarize_audio(self, audio_path):
46
  """
47
  Perform speaker diarization on the input audio.
 
56
  return processor, model
57
 
58
 
59
+ @spaces.GPU(duration=180)
60
  def language_identification(self, model, processor, chunk, device="cuda"):
61
  inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
62
  model.to(device)
 
80
  return model, processor
81
 
82
 
83
+ @spaces.GPU(duration=180)
84
  def mms_transcription(self, model, processor, chunk, device="cuda"):
85
 
86
  inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
 
105
  return model, tokenizer
106
 
107
 
108
+ @spaces.GPU(duration=180)
109
  def text2text_translation(self, translation_model, translation_tokenizer, transcript, device="cuda"):
110
  # model, tokenizer = load_translation_model()
111
 
 
339
  return translation
340
 
341
 
342
def audio_denoising(self, noisy_audio):
    """
    Denoise an audio signal with the default DeepFilterNet model.

    Args:
        noisy_audio: Audio to enhance, forwarded unchanged to
            ``df.enhance.enhance``.
            NOTE(review): presumably a tensor sampled at ``df_state.sr()``
            -- confirm against the caller and the DeepFilterNet docs.

    Returns:
        Whatever ``enhance`` returns for the given input (the enhanced audio).
    """
    # init_df() returns more than two values in released DeepFilterNet
    # versions (model, state, plus extra metadata); star-unpacking keeps
    # this working regardless of how many trailing items are returned.
    model, df_state, *_ = init_df()
    # NOTE(review): the model is loaded on every call; consider caching it
    # on the instance if this is invoked per chunk.
    enhanced_audio = enhance(model, df_state, noisy_audio)
    return enhanced_audio
346
 
347
  def transcribe_audio(self, audio_path, translate=False):
348
  """
 
360
  'end_time': turn.end,
361
  'speaker': speaker
362
  })
363
+
364
  audio = self.load_audio(audio_path)
365
  chunks = self.preprocess_audio(audio)
366