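"""Gradio demo: threat detection from Bengali voice calls.

Loads a pretrained M11 checkpoint (a PyTorch Lightning module, judging by
load_from_checkpoint) and classifies an uploaded clip as Threat, Normal,
or Sarcastic. Written against the legacy Gradio 2.x interface API
(gr.inputs / gr.outputs).
"""
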
import torch
import torchaudio
import gradio as gr

from model import M11

# the pipeline normalises every clip to a fixed length at the training
# sample rate: 400,000 samples at 8 kHz, i.e. 50 seconds
TARGET_SAMPLE_RATE = 8_000
NUM_SAMPLES = 400_000


def _cut_if_necessary(signal):
    # truncate signals longer than the fixed input length
    if signal.shape[1] > NUM_SAMPLES:
        signal = signal[:, :NUM_SAMPLES]

    return signal

def _right_pad_if_necessary(signal):
    signal_length = signal.shape[1]
    if signal_length < NUM_SAMPLES:
        num_missing_samples = NUM_SAMPLES - signal_length
        # (left, right) padding for the last dimension: no zeros on the
        # left, num_missing_samples zeros on the right
        last_dim_padding = (0, num_missing_samples)
        signal = torch.nn.functional.pad(signal, last_dim_padding)

    return signal

def preprocess(signal, sr, device):
    # add a channel dimension to 1d samples
    if len(signal.shape) == 1:
        signal = signal.unsqueeze(0)

    # resample the audio signal to the training sample rate
    if sr != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(sr, TARGET_SAMPLE_RATE).to(device)
        signal = resampler(signal)

    # turn stereo signals into mono by averaging across channels
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)

    signal = _cut_if_necessary(signal)        # truncate longer signals
    signal = _right_pad_if_necessary(signal)  # zero-pad shorter signals

    return signal
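# a quick sanity check of the shape contract (hypothetical input): a
# 1-second 16 kHz stereo clip comes out as a (1, 400_000) mono tensor:
#   preprocess(torch.randn(2, 16_000), 16_000, "cpu").shape == (1, 400_000)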


def pipeline(audio_file):
    # Gradio passes a tempfile object for type="file" audio inputs
    audio, sample_rate = torchaudio.load(audio_file.name)

    processed_audio = preprocess(audio.to(DEVICE), sample_rate, DEVICE)

    with torch.no_grad():
        # the model outputs log_softmax scores; exponentiate to get probabilities
        pred = torch.exp(classifier(processed_audio.unsqueeze(0)).squeeze())

    return {labels[i]: float(pred[i]) for i in range(len(labels))}


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "./model.ckpt"

labels = ["Threat", "Normal", "Sarcastic"]

classifier = M11.load_from_checkpoint(MODEL_PATH).to(DEVICE)
classifier.eval()

inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Label(num_top_classes=3)
title = "Threat Detection From Bengali Voice Calls"
description = "Gradio demo for audio classification: upload your own clip, or click the example below to load it. Read more at the link below."
article = "<p style='text-align: center'><a href='https://github.com/khalidsaifullaah' target='_blank'>Github Repo</a></p>"
examples = [
    ['sample_audio.wav']
]
gr.Interface(
    pipeline,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
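
# To try the demo locally, run this script with Python (model.py, model.ckpt,
# and sample_audio.wav are assumed to sit alongside it) and open the local
# URL that Gradio prints on launch.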