Bagus committed on
Commit
8acf810
·
verified ·
1 Parent(s): a6e9c70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -55
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import torch
3
  import torchaudio
4
- # from torchaudio.sox_effects import apply_effects_file
5
  from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
6
 
7
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -34,15 +34,6 @@ OUTPUT_FAIL = (
34
  """
35
  )
36
 
37
- EFFECTS = [
38
- ["remix", "-"],
39
- ["channels", "1"],
40
- ["rate", "16000"],
41
- ["gain", "-1.0"],
42
- ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
43
- ["trim", "0", "10"],
44
- ]
45
-
46
  THRESHOLD = 0.80
47
 
48
  model_name = "microsoft/wavlm-base-plus-sv"
@@ -51,15 +42,19 @@ model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
51
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
52
 
53
 
 
 
 
 
 
 
 
54
  def similarity_fn(path1, path2):
55
  if not (path1 and path2):
56
  return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
57
 
58
- # wav1, _ = apply_effects_file(path1, EFFECTS)
59
- # wav2, _ = apply_effects_file(path2, EFFECTS)
60
- wav1, _ = torchaudio.load(path1)
61
- wav2, _ = torchaudio.load(path2)
62
- print(wav1.shape, wav2.shape)
63
 
64
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
65
  input2 = feature_extractor(wav2.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
@@ -79,43 +74,38 @@ def similarity_fn(path1, path2):
79
  return output
80
 
81
 
82
- inputs = [
83
- gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #1"),
84
- gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #2"),
85
- ]
86
- output = gr.HTML(label="")
87
-
88
-
89
- description = (
90
- "This demo will compare two speech samples and determine if they are from the same speaker. "
91
- "Try it with your own voice!"
92
- )
93
- article = (
94
- "<p style='text-align: center'>"
95
- "<a href='https://huggingface.co/microsoft/wavlm-base-plus-sv' target='_blank'>🎙️ Learn more about WavLM</a> | "
96
- "<a href='https://arxiv.org/abs/2110.13900' target='_blank'>📚 WavLM paper</a> | "
97
- "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>📚 X-Vector paper</a>"
98
- "</p>"
99
- )
100
- examples = [
101
- ["samples/denzel_washington.mp3", "samples/denzel_washington.mp3"],
102
- ["samples/heath_ledger_2.mp3", "samples/heath_ledger_3.mp3"],
103
- ["samples/heath_ledger_3.mp3", "samples/denzel_washington.mp3"],
104
- ["samples/denzel_washington.mp3", "samples/heath_ledger_2.mp3"],
105
- ]
106
-
107
- interface = gr.Interface(
108
- fn=similarity_fn,
109
- inputs=inputs,
110
- outputs=output,
111
- title="Voice Authentication with WavLM + X-Vectors",
112
- description=description,
113
- article=article,
114
- # layout="horizontal",
115
- theme="huggingface",
116
- allow_flagging=False,
117
- live=False,
118
- examples=examples,
119
- )
120
-
121
- interface.launch()
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
4
+ from torchaudio.transforms import Resample
5
  from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
6
 
7
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
34
  """
35
  )
36
 
 
 
 
 
 
 
 
 
 
37
  THRESHOLD = 0.80
38
 
39
  model_name = "microsoft/wavlm-base-plus-sv"
 
42
  cosine_sim = torch.nn.CosineSimilarity(dim=-1)
43
 
44
 
45
def preprocess_audio(file_path, target_sr=16000):
    """Load an audio file as a single-channel waveform at ``target_sr`` Hz.

    Args:
        file_path: Path of the audio file; passed straight to
            ``torchaudio.load``.
        target_sr: Sampling rate (Hz) the returned waveform should have.
            Defaults to 16000, the rate the caller passes to the feature
            extractor.

    Returns:
        The waveform tensor with one channel in the leading dimension
        (``torchaudio.load`` returns channels first by default), resampled
        to ``target_sr`` when the file's native rate differs.
    """
    wav, sr = torchaudio.load(file_path)

    # Down-mix multi-channel recordings to mono. The caller does
    # ``wav.squeeze(0)``, which only yields a 1-D signal when there is exactly
    # one channel; stereo uploads would otherwise reach the feature extractor
    # as a 2-D tensor. keepdim=True preserves the (1, T) layout for mono too.
    if wav.dim() > 1 and wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)

    if sr != target_sr:
        wav = Resample(orig_freq=sr, new_freq=target_sr)(wav)
    return wav
50
+
51
+
52
  def similarity_fn(path1, path2):
53
  if not (path1 and path2):
54
  return '<b style="color:red">ERROR: Please record audio for *both* speakers!</b>'
55
 
56
+ wav1 = preprocess_audio(path1)
57
+ wav2 = preprocess_audio(path2)
 
 
 
58
 
59
  input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
60
  input2 = feature_extractor(wav2.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)
 
74
  return output
75
 
76
 
77
# Gradio UI: two audio inputs side by side, an HTML result panel, a button
# that runs the comparison, example pairs, and footer reference links.
with gr.Blocks() as demo:
    gr.Markdown("# Voice Authentication with WavLM + X-Vectors")
    gr.Markdown(
        "This demo compares two speech samples to determine if they are from the same speaker. "
        "Try it with your own voice!"
    )

    with gr.Row():
        # type="filepath" makes each component pass a file path to similarity_fn,
        # whose parameters (path1, path2) are loaded from disk.
        input1 = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #1")
        input2 = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Speaker #2")

    output = gr.HTML(label="Result")

    btn = gr.Button("Compare Speakers")
    btn.click(similarity_fn, inputs=[input1, input2], outputs=output)

    # Only `inputs` is wired here (no fn/outputs), so the examples are bound
    # to the two audio components and not to the comparison itself.
    gr.Examples(
        examples=[
            ["samples/denzel_washington.mp3", "samples/denzel_washington.mp3"],
            ["samples/heath_ledger_2.mp3", "samples/heath_ledger_3.mp3"],
            ["samples/heath_ledger_3.mp3", "samples/denzel_washington.mp3"],
            ["samples/denzel_washington.mp3", "samples/heath_ledger_2.mp3"],
        ],
        inputs=[input1, input2],
    )

    # Footer links; the emoji labels were mis-encoded and are restored here.
    gr.Markdown(
        "<p style='text-align: center'>"
        "<a href='https://huggingface.co/microsoft/wavlm-base-plus-sv' target='_blank'>🎙️ Learn more about WavLM</a> | "
        "<a href='https://arxiv.org/abs/2110.13900' target='_blank'>📚 WavLM paper</a> | "
        "<a href='https://www.danielpovey.com/files/2018_icassp_xvectors.pdf' target='_blank'>📚 X-Vector paper</a>"
        "</p>"
    )

demo.launch()