Spaces:
DrishtiSharma
committed
Commit 5da8d71 · 1 Parent(s): ca5824e
Update app.py
app.py
CHANGED
@@ -1,68 +1,69 @@
-# -*- coding: utf-8 -*-
-"""
-
-Automatically generated by Colaboratory.
-
-Original file is located at
-    https://colab.research.google.com/drive/1Lv3LjRH9bHwMhKsWvFcELMzKqmXd9UIb
-"""
-
-!pip install -q transformers
-!pip install -q gradio
-
-import torch
-import librosa
-import soundfile as sf
-import gradio as gr
-from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
-
-input_file = "/content/drive/MyDrive/AAAAUDIO/My Audio.wav"
-
-tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
-
-def load_data(input_file):
-    """Resample so that the speech input is sampled at 16 kHz."""
-    # read the file
-    speech, sample_rate = sf.read(input_file)
-
-    # make it 1-D
-    if len(speech.shape) > 1:
-        speech = speech[:, 0] + speech[:, 1]
-
-    # resample to 16 kHz, since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 kHz
-    if sample_rate != 16000:
-        speech = librosa.resample(speech, sample_rate, 16000)
-    return speech
-
-speech = load_data(input_file)
-
-# Tokenize
-input_values = tokenizer(speech, return_tensors="pt").input_values
-
-# Run the model
-logits = model(input_values).logits
-
-# Take argmax
-predicted_ids = torch.argmax(logits, dim=-1)
-
-# Get the words from the predicted word ids
-transcription = tokenizer.decode(predicted_ids[0])
-
-gr.Interface(
-    ...
-    description = "asdfghnjmk",
-    examples = [["/content/drive/MyDrive/AAAAUDIO/My Audio.wav"]]).launch()
+# import all the necessary packages
+import torch
+import transformers
+import gradio as gr
+from torchaudio.sox_effects import apply_effects_file
+from termcolor import colored
+from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForAudioFrameClassification
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# define the sox effects to apply to the input audio file
+EFFECTS = [
+    ["remix", "-"],  # merge all the channels
+    ["channels", "1"],  # channels --> mono
+    ["rate", "16000"],  # resample to 16000 Hz
+    ["gain", "-1.0"],  # attenuate by 1 dB
+    ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],  # trim silence
+    # ["pad", "0", "1.5"],  # add 1.5 seconds of silence at the end
+    ["trim", "0", "10"],  # keep only the first 10 seconds
+]
+
+THRESHOLD = 0.85  # depends on the dataset
+
+model_name = "microsoft/unispeech-sat-base-sd"
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
+model = UniSpeechSatForAudioFrameClassification.from_pretrained(model_name).to(device)
+
+def fn(path):
+    # apply the sox effects to the input audio file
+    wav, _ = apply_effects_file(path, EFFECTS)
+
+    # extract features (squeeze out the channel dimension first)
+    input_values = feature_extractor(wav.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
+
+    with torch.no_grad():
+        logits = model(input_values).logits
+    probabilities = torch.sigmoid(logits[0])
+
+    # labels is a one-hot array of shape (num_frames, num_speakers)
+    labels = (probabilities > 0.5).long()
+    return labels
+
+inputs = [
+    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker #1"),
+]
+
+output = gr.outputs.HTML(label="")
+
+gr.Interface(
+    fn=fn,
+    inputs=inputs,
+    outputs=output,
+    title="Speaker diarization using UniSpeech-SAT and X-Vectors").launch(enable_queue=True)
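One wrinkle in the committed code: fn returns the raw (num_frames, num_speakers) label tensor, while the interface declares a gr.outputs.HTML output. A minimal sketch of a post-processing helper that would bridge the two — render_labels and the ~20 ms-per-frame figure are assumptions (wav2vec2-style convolutional front ends stride 320 samples at 16 kHz), not anything defined in this commit:

import torch

FRAME_SECONDS = 0.02  # assumption: each model output frame covers roughly 20 ms of audio

def render_labels(labels: torch.Tensor) -> str:
    # hypothetical helper: summarize per-speaker activity as a small HTML table
    rows = []
    for speaker in range(labels.shape[1]):
        # count the active frames for this speaker and convert to seconds
        active_seconds = labels[:, speaker].sum().item() * FRAME_SECONDS
        rows.append(
            f"<tr><td>Speaker #{speaker + 1}</td>"
            f"<td>{active_seconds:.2f} s active</td></tr>"
        )
    return "<table>" + "".join(rows) + "</table>"

With a helper like this, fn could end with return render_labels(labels) instead of returning the tensor, so the HTML component receives a string it can actually display.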