Spaces:

Bagus
/

speaker-verification-demo

Runtime error

App Files Files Community

Bagus committed on Nov 26, 2024

Commit

8acf810

verified ·

1 Parent(s): a6e9c70

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -55

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
 import torchaudio
-# from torchaudio.sox_effects import apply_effects_file
 from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -34,15 +34,6 @@ OUTPUT_FAIL = (
 """
 )
-EFFECTS = [
-    ["remix", "-"],
-    ["channels", "1"],
-    ["rate", "16000"],
-    ["gain", "-1.0"],
-    ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
-    ["trim", "0", "10"],
-]
 THRESHOLD = 0.80
 model_name = "microsoft/wavlm-base-plus-sv"
@@ -51,15 +42,19 @@ model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
 cosine_sim = torch.nn.CosineSimilarity(dim=-1)
 def similarity_fn(path1, path2):
     if not (path1 and path2):
         return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
-    # wav1, _ = apply_effects_file(path1, EFFECTS)
-    # wav2, _ = apply_effects_file(path2, EFFECTS)
-    wav1, _ = torchaudio.load(path1)
-    wav2, _ = torchaudio.load(path2)
-    print(wav1.shape, wav2.shape)
     input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
     input2 = feature_extractor(wav2.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
@@ -79,43 +74,38 @@ def similarity_fn(path1, path2):
     return output
-inputs = [
-    gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #1"),
-    gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #2"),
-]
-output = gr.HTML(label="")
-description = (
-    "This demo will compare two speech samples and determine if they are from the same speaker. "
-    "Try it with your own voice!"
-)
-article = (
-    "<p style='text-align: center'>"
-    "<a href='https://huggingface.co/microsoft/wavlm-base-plus-sv' target='_blank'>🎙️ Learn more about WavLM</a> | "
-    "<a href='https://arxiv.org/abs/2110.13900' target='_blank'>📚 WavLM paper</a> | "
-    "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>📚 X-Vector paper</a>"
-    "</p>"
-)
-examples = [
-    ["samples/denzel_washington.mp3", "samples/denzel_washington.mp3"],
-    ["samples/heath_ledger_2.mp3", "samples/heath_ledger_3.mp3"],
-    ["samples/heath_ledger_3.mp3", "samples/denzel_washington.mp3"],
-    ["samples/denzel_washington.mp3", "samples/heath_ledger_2.mp3"],
-]
-interface = gr.Interface(
-    fn=similarity_fn,
-    inputs=inputs,
-    outputs=output,
-    title="Voice Authentication with WavLM + X-Vectors",
-    description=description,
-    article=article,
-    # layout="horizontal",
-    theme="huggingface",
-    allow_flagging=False,
-    live=False,
-    examples=examples,
-)
-interface.launch()

 import gradio as gr
 import torch
 import torchaudio
+from torchaudio.transforms import Resample
 from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 """
 )
 THRESHOLD = 0.80
 model_name = "microsoft/wavlm-base-plus-sv"
 cosine_sim = torch.nn.CosineSimilarity(dim=-1)
+def preprocess_audio(file_path, target_sr=16000):
+    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.
+
+    Args:
+        file_path: Path of the audio file to read with ``torchaudio.load``.
+        target_sr: Sampling rate (Hz) the returned waveform should have.
+
+    Returns:
+        A tensor with a single leading channel dimension, so callers can
+        drop it with ``squeeze(0)`` to obtain one 1-D waveform.
+    """
+    wav, sr = torchaudio.load(file_path)
+    # Multi-channel input (e.g. stereo) is averaged down to one channel;
+    # keepdim=True preserves the leading channel axis the caller squeezes.
+    if wav.dim() > 1 and wav.size(0) > 1:
+        wav = wav.mean(dim=0, keepdim=True)
+    if sr != target_sr:
+        wav = Resample(orig_freq=sr, new_freq=target_sr)(wav)
+    return wav
 def similarity_fn(path1, path2):
     if not (path1 and path2):
         return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
+    wav1 = preprocess_audio(path1)
+    wav2 = preprocess_audio(path2)
     input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
     input2 = feature_extractor(wav2.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
     return output
+# Build the demo page: title, description, two audio inputs, a result area,
+# a trigger button, example pairs, and footer links.
+with gr.Blocks() as demo:
+    gr.Markdown("# Voice Authentication with WavLM + X-Vectors")
+    gr.Markdown(
+        "This demo compares two speech samples to determine if they are from the same speaker. "
+        "Try it with your own voice!"
+    )
+    # Both inputs accept microphone recordings or uploads and hand
+    # similarity_fn a file path (type="filepath").
+    with gr.Row():
+        input1 = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #1")
+        input2 = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #2")
+    output = gr.HTML(label="Result")
+    btn = gr.Button("Compare Speakers")
+    # Clicking the button runs similarity_fn on the two inputs and renders
+    # its return value in the HTML output component.
+    btn.click(similarity_fn, inputs=[input1, input2], outputs=output)
+    # Example pairs fill the two audio inputs; no fn is attached here, so the
+    # comparison itself is still triggered through the button.
+    gr.Examples(
+        examples=[
+            ["samples/denzel_washington.mp3", "samples/denzel_washington.mp3"],
+            ["samples/heath_ledger_2.mp3", "samples/heath_ledger_3.mp3"],
+            ["samples/heath_ledger_3.mp3", "samples/denzel_washington.mp3"],
+            ["samples/denzel_washington.mp3", "samples/heath_ledger_2.mp3"],
+        ],
+        inputs=[input1, input2],
+    )
+    # Footer with links to the model card and the related papers.
+    gr.Markdown(
+        "<p style='text-align: center'>"
+        "<a href='https://huggingface.co/microsoft/wavlm-base-plus-sv' target='_blank'>🎙️ Learn more about WavLM</a> | "
+        "<a href='https://arxiv.org/abs/2110.13900' target='_blank'>📚 WavLM paper</a> | "
+        "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>📚 X-Vector paper</a>"
+        "</p>"
+    )
+# Start the app.
+demo.launch()