Andreyalth's picture
corregir archivo
cb194f1
import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import json
# Minimum similarity score a stored speaker must reach to be reported as a
# match by find_matches.
THRESHOLD = 0.60

# Pretrained speaker-verification checkpoint to load.
model_name = "nvidia/speakerverification_en_titanet_large"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained model once at import time and move it to the device.
model = EncDecSpeakerLabelModel.from_pretrained(model_name)
model = model.to(device)
def create_voice_print(audio):
    """Build a unit-length speaker embedding ("voice print") from an audio file.

    Args:
        audio: Path of the audio file to embed. Any falsy value (``None``,
            empty string) is treated as "no audio provided".

    Returns:
        When ``audio`` is falsy, a JSON string ``{"error": ...}``.
        Otherwise the embedding divided by its Euclidean norm, converted to
        a plain Python list of floats.
    """
    if not audio:
        return json.dumps({"error": "no se proporciono un audio"})

    embedding = model.get_embedding(audio).squeeze()
    unit_embedding = embedding / torch.linalg.norm(embedding)

    # Return a plain list rather than the tensor itself: the result is shown
    # in a gr.JSON output, and a torch.Tensor is not JSON-serialisable. The
    # list form is also what find_matches expects to parse from its text box.
    return unit_embedding.tolist()
def compare_voice_print(X, Y):
    """Score how similar two voice prints are, on a 0-to-1 scale.

    The score is the cosine similarity of ``X`` and ``Y`` mapped linearly
    from [-1, 1] to [0, 1]: 1.0 means same direction, 0.5 orthogonal,
    0.0 opposite.

    Args:
        X: First voice print, a tensor accepted by ``torch.dot``.
        Y: Second voice print, same length, dtype and device as ``X``.

    Returns:
        The similarity score as a Python float in [0, 1]. If either vector
        has zero norm the cosine is undefined and 0.5 is returned.
    """
    norm_product = (torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5

    # A zero-norm vector would give 0/0 = NaN; report the neutral score
    # (cosine treated as 0) instead.
    if float(norm_product) == 0.0:
        return 0.5

    cosine = torch.dot(X, Y) / norm_product
    # Floating-point round-off can push the ratio marginally outside
    # [-1, 1]; clamp so the final score stays within [0, 1].
    cosine = torch.clamp(cosine, min=-1.0, max=1.0)

    similarity_score = (cosine + 1) / 2
    return similarity_score.item()
# TODO: work out how to run the voice print
def find_matches(file, voice_print):
    """Find the stored speakers whose voice prints best match a query print.

    Args:
        file: Path to a JSON file. Its root is expected to be an object
            whose ``"data"`` key holds a list of speaker entries; each entry
            has a ``"voice_print"`` key containing a JSON-encoded list of
            numbers.
        voice_print: The query voice print as JSON text (a list of numbers).

    Returns:
        On success, a list of at most three dicts
        ``{"speaker": <entry>, "similarity_score": <float>}`` for entries
        scoring at least ``THRESHOLD``, sorted best first. On bad input, a
        JSON string ``{"error": ...}``.
    """
    if not file:
        return json.dumps({"error": "No se proporcionó un archivo JSON"})

    try:
        # Context manager so the file handle is closed even if parsing fails.
        with open(file, encoding="utf-8") as json_file:
            json_content = json.load(json_file)
    except json.JSONDecodeError:
        return json.dumps({"error": "El archivo JSON no es válido"})

    # A root that is not an object (e.g. a bare list) has no "data" key to
    # read; report it like any other invalid file instead of crashing.
    if not isinstance(json_content, dict):
        return json.dumps({"error": "El archivo JSON no es válido"})
    data = json_content.get("data", [])

    # The query comes from a free-text box, so it may be empty or malformed.
    try:
        query_values = json.loads(voice_print)
    except json.JSONDecodeError:
        return json.dumps({"error": "La huella de voz no es un JSON válido"})

    # Force a floating dtype on both sides: torch.dot requires matching
    # dtypes, and an all-integer JSON list would otherwise become int64.
    query = torch.tensor(query_values, dtype=torch.float32)

    matches = []
    for speaker in data:
        speaker_voice_print = torch.tensor(
            json.loads(speaker["voice_print"]), dtype=torch.float32
        )
        similarity_score = compare_voice_print(query, speaker_voice_print)
        # Log every score to stdout.
        print(similarity_score)
        if similarity_score >= THRESHOLD:
            matches.append({"speaker": speaker, "similarity_score": similarity_score})

    matches.sort(key=lambda match: match["similarity_score"], reverse=True)
    return matches[:3]
# Tab "app": takes an audio file path and shows the result of
# create_voice_print as JSON.
voice_print_maker = gr.Interface(
    inputs=[gr.Audio(type="filepath")],
    outputs=gr.JSON(),
    fn=create_voice_print,
)

# Tab "loader": takes a JSON file plus a text area and shows the result of
# find_matches as JSON.
voice_prints_loader = gr.Interface(
    inputs=[
        gr.File(label="Upload JSON file", type="filepath"),
        gr.TextArea(),
    ],
    outputs=gr.JSON(),
    fn=find_matches,
)

# Combine both interfaces into one tabbed app and start serving it.
demo = gr.TabbedInterface(
    interface_list=[voice_print_maker, voice_prints_loader],
    tab_names=["app", "loader"],
)
demo.launch()