import joblib
import librosa
import numpy as np
import torch
import gradio as gr
from transformers import AutoFeatureExtractor, WavLMModel

class HuggingFaceFeatureExtractor:
    """Wraps a Hugging Face feature extractor + model pair and returns hidden states."""

    def __init__(self, model_class, name):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(name)
        self.model = model_class.from_pretrained(name)
        self.model.eval()
        self.model.to(self.device)

    def __call__(self, audio, sr):
        inputs = self.feature_extractor(
            audio,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Shape: (batch, time, hidden_size); the caller mean-pools over the time axis.
        return outputs.last_hidden_state
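
# The ensemble below draws embeddings from four WavLM checkpoints: the public
# microsoft/wavlm-base model and three DavidCombei checkpoints that, judging by
# their names, are variants of the same base model. Each entry is a lambda so a
# checkpoint is only downloaded/loaded when it is actually requested.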
FEATURE_EXTRACTORS = {
    "wavlm-base": lambda: HuggingFaceFeatureExtractor(WavLMModel, "microsoft/wavlm-base"),
    "wavLM-V1": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-DeepFake_UTCN"),
    "wavLM-V2": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-UTCN"),
    "wavLM-V3": lambda: HuggingFaceFeatureExtractor(WavLMModel, "DavidCombei/wavLM-base-UTCN_114k"),
}
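
# model1..model4 are assumed to be scikit-learn-style classifiers (they expose
# predict_proba), one per feature extractor above; final_model fuses their four
# probability outputs into the final real-vs-fake score.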
model1 = joblib.load('model1.joblib')
model2 = joblib.load('model2.joblib')
model3 = joblib.load('model3.joblib')
model4 = joblib.load('model4.joblib')
final_model = joblib.load('final_model.joblib')
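
# Illustrative sketch (comments only, not executed): how one extractor produces
# the embedding a classifier consumes, assuming `audio` is a 16 kHz mono waveform
# array. The 768-dim hidden size is that of WavLM-base, and the mean-pooling
# mirrors what process_audio() does below.
#
#   extractor = FEATURE_EXTRACTORS["wavlm-base"]()
#   hidden = extractor(audio, 16000)                      # (1, time, 768)
#   embedding = torch.mean(hidden, dim=1).cpu().numpy()   # (1, 768)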

def process_audio(file_audio):
    # Load the file resampled to 16 kHz, the rate the WavLM checkpoints expect.
    audio, sr = librosa.load(file_audio, sr=16000)
    # Keep only the first channel if the audio is multi-channel.
    if len(audio.shape) > 1:
        audio = audio[0]

    # Pair each feature extractor with its corresponding classifier.
    # Note: the extractors are re-instantiated on every call; caching them at
    # module level would avoid reloading the checkpoints for each request.
    pairs = [
        ("wavlm-base", model1),
        ("wavLM-V1", model2),
        ("wavLM-V2", model3),
        ("wavLM-V3", model4),
    ]

    probs = []
    for extractor_name, model in pairs:
        extractor = FEATURE_EXTRACTORS[extractor_name]()
        hidden_states = extractor(audio, sr)
        # Mean-pool over time to get one embedding vector per clip.
        embedding = torch.mean(hidden_states, dim=1).cpu().numpy().reshape(1, -1)
        probs.append(model.predict_proba(embedding)[:, 1].reshape(-1, 1))

    # Fuse the four per-model probabilities with the final fusion model.
    combined_probs = np.hstack(probs)
    final_prob = float(final_model.predict_proba(combined_probs)[:, 1][0])

    if final_prob < 0.5:
        return f"Fake with a confidence of: {(1 - final_prob) * 100:.2f}%"
    else:
        return f"Real with a confidence of: {final_prob * 100:.2f}%"

interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Deepfake Detection",
    description="Upload an audio file to detect whether it is fake or real.",
)

interface.launch(share=True)