File size: 1,660 Bytes
08b1403
 
 
 
daff1a1
08b1403
 
 
 
 
 
2055db9
d10e3f6
e5c80b6
2055db9
544ba80
5c42cb4
08b1403
 
daff1a1
e5c80b6
08b1403
 
 
 
 
 
 
 
 
 
 
 
 
 
3d435c7
 
 
 
 
 
08b1403
 
3d435c7
 
 
d10e3f6
3d435c7
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
import torch
from wenet.cli.model import load_model
from huggingface_hub import hf_hub_download
#import spaces

# Module-level setup: fetch the Reverb ASR model artifacts from the HF Hub
# and load them once at import time (downloads hit the network on first run,
# then come from the local hf_hub cache).
REPO_ID = "Revai/reverb-asr"
# [0] = TorchScript model archive, [1] = tokenizer units file — order matters
# for the load_model call below.
files = ['reverb_asr_v1.jit.zip', 'tk.units.txt']
downloaded_files = [hf_hub_download(repo_id=REPO_ID, filename=f) for f in files]
# Global model instance shared by all requests handled by this process.
model = load_model(downloaded_files[0], downloaded_files[1])
    


def process_cat_embs(style):
    """Parse a comma-separated string of floats into a CPU tensor.

    Args:
        style: string like "0.3,0.7" — category-embedding weights.

    Returns:
        1-D ``torch.FloatTensor`` of the parsed values, on CPU.
    """
    weights = [float(part) for part in style.split(',')]
    return torch.tensor(weights).to(torch.device("cpu"))


#@spaces.GPU
def transcribe_audio(audio, style=0):
    """Transcribe an audio file with a casual-to-formal style blend.

    Args:
        audio: path to the audio file (falsy input returns an error string).
        style: value in [0, 1]; weight pair passed to the model is
            (style, 1 - style) — 0 leans casual, 1 leans formal.

    Returns:
        The transcription text with word-piece markers replaced by spaces,
        or a human-readable error string on bad input / empty output.
    """
    if not audio:
        return "Input Error! Please enter one audio!"

    # Build the two-way style weighting expected by the model.
    embeddings = process_cat_embs(f'{style},{1-style}')
    output = model.transcribe(audio, cat_embs=embeddings)

    if not output or 'text' not in output:
        return "ERROR! No text output! Please try again!"

    # '▁' is the SentencePiece word-boundary marker; render it as a space.
    return output['text'].replace('▁', ' ')



# --- Gradio UI wiring (module level; builds and launches the demo) ---

# Accepts an uploaded or recorded clip; delivered to transcribe_audio as a filepath.
audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
# Slider value becomes the `style` argument of transcribe_audio (0 casual .. 1 formal).
style_slider = gr.Slider(0, 1, value=0, step=0.1, label="Transcription Style",
                             info="Adjust the transcription style: 0 (casual) to 1 (formal).")
# Displays the transcription text (or an error message from transcribe_audio).
output_textbox = gr.Textbox(label="Transcription Output")
    
description = "This tool transcribes audio using a customizable transcription style ranging from casual to formal. Upload or record an audio file to begin."

# Interface maps [audio_input, style_slider] positionally onto
# transcribe_audio(audio, style).
iface = gr.Interface(
        fn=transcribe_audio,
        inputs=[audio_input, style_slider],
        outputs=output_textbox,
        title="Audio Transcription",
        description=description,
        theme="default"
    )
    

# Start the local web server; blocks until the process is stopped.
iface.launch()