# -*- coding: utf-8 -*-
"""Assignment-2-IT164_ajchri5

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1RtE7mmtyUWwiuowgyQq4eCuH-ep_D1QQ
"""

# mount Google Drive (the CSV history file is saved there)
from google.colab import drive
drive.mount('/content/drive')

# Commented out IPython magic to ensure Python compatibility.
# # Hugging Face access token (read from Colab secrets)
# %%capture
# from google.colab import userdata
# hftoken=userdata.get('hftoken')

# Commented out IPython magic to ensure Python compatibility.
# # package installs (superseded by the install cell below)
# %%capture
# !pip install gradio
# !pip install huggingface_hub

# packages required for colab
!pip install gradio
!pip install transformers
!pip install torchaudio
!pip install fasttext

# fastText for language detection
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

# imports required for colab
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import torchaudio
import warnings
import fasttext
import pandas as pd
import csv
import os

# hide PySoundFile fallback warnings emitted by torchaudio
warnings.filterwarnings("ignore", category=UserWarning, message="PySoundFile failed.*")

# load model 1: speech-to-text transcription (Whisper)
whisper_model_name = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
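
# Optional (not used below): Whisper can be pinned to a language/task instead
# of auto-detecting. Sketch only; 'input_features' stands in for real inputs:
# forced_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
# whisper_model.generate(input_features, forced_decoder_ids=forced_ids)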

# load model 2: translation (Romance languages such as fr/es/it/pt -> English)
translation_model = pipeline("translation", model="Helsinki-NLP/opus-mt-ROMANCE-en")

# load model 3: language identification (pre-trained fastText lid.176 model)
lang_model = fasttext.load_model('lid.176.bin')

# app usage history
history_data = []

# append one result row to a CSV file on Google Drive
def saveData(text, language, translated_text, confidence_score):
    # Google Drive path for the history file
    file_path = '/content/drive/MyDrive/IT164/a2prompt.csv'

    # remember whether the file already exists, so headers are written only once
    file_exists = os.path.isfile(file_path)

    # open csv file to append data
    with open(file_path, 'a', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        if not file_exists:
            # write header if file is created
            w.writerow(['Text', 'Language', 'Translation', 'Confidence Score'])
        # write new data row
        w.writerow([text, language, translated_text, confidence_score])
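
# Illustration (hypothetical values): each call appends one row, e.g.
# saveData("Bonjour tout le monde", "fr", "Hello everyone", 0.98)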

# load audio input and transcribe
def transcribe_audio(audio_file):
    # load audio file with torchaudio (the sample rate is read from the file itself)
    waveform, sr = torchaudio.load(audio_file, normalize=True)

    # downmix stereo to mono; Whisper expects a single channel
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Whisper expects 16 kHz input, so resample anything else (e.g. 48 kHz mic recordings)
    if sr != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = transform(waveform)
        sr = 16000

    # convert the waveform to Whisper input features
    inputs = processor(waveform.squeeze(0).numpy(), return_tensors="pt", sampling_rate=sr)

    # generate the transcription; generate() manages its own key/value cache,
    # so no past_key_values handling is needed here
    generated_ids = whisper_model.generate(inputs["input_features"])

    return processor.decode(generated_ids[0], skip_special_tokens=True)
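
# Quick sanity check outside the UI (hypothetical path; any file torchaudio can read):
# print(transcribe_audio('/content/sample.wav'))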

# detect language using fastText
def detect_language(text):
    # fastText's predict() rejects embedded newlines, so collapse them first
    result = lang_model.predict(text.replace('\n', ' '))
    language = result[0][0].replace('__label__', '')  # e.g. '__label__fr' -> 'fr'
    score = result[1][0]  # confidence score in [0, 1]
    return language, score
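
# Illustrative call (values are examples, not measured output):
# detect_language("Bonjour tout le monde")  # -> ('fr', 0.99)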

# translate text (to English)
def translate_text_to_english(text):
    # opus-mt-ROMANCE-en translates from Romance languages to English only,
    # so no src_lang/tgt_lang arguments are needed
    translation = translation_model(text)
    return translation[0]['translation_text']
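
# Illustrative call (exact wording may vary with the model version):
# translate_text_to_english("Bonjour tout le monde")  # -> 'Hello everyone'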

# track history: append the result to the in-memory list and persist it to CSV
def save_to_history(text, language, translation, confidence_score):
    history_data.append([text, language, translation, confidence_score])
    # save csv
    saveData(text, language, translation, confidence_score)

# process audio: transcribe, detect language, and translate
def process_audio(audio_file):
    transcription = transcribe_audio(audio_file)  # torchaudio reads the file's native rate
    language, score = detect_language(transcription)  # detect language of the transcription
    translated_text = translate_text_to_english(transcription)  # translate to English
    save_to_history(transcription, language, translated_text, score)  # save results
    return transcription, language, score, translated_text

# update visibility of the history table in gradio
def update_vis(radio_value):
    df = pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"])
    return gr.DataFrame(df, visible=(radio_value == 'show'))

# gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Record your voice", type="filepath")  # audio input
            transcription_output = gr.Textbox(label="Transcription")  # transcription output
            language_output = gr.Textbox(label="Detected Language")  # detected language output
            score_output = gr.Textbox(label="Confidence Score")  # confidence score output
            translated_output = gr.Textbox(label="Translated Text to English")  # translated text output
            process_button = gr.Button("Process Audio")  # button to process the audio

        with gr.Column():
            history = gr.Radio(['show', 'hide'], label="App usage history")  # toggles the history table
            dataframe = gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=False)

        # button click (process audio and display output)
        process_button.click(fn=process_audio, inputs=[audio_input], outputs=[transcription_output, language_output, score_output, translated_output])
        history.change(fn=update_vis, inputs=history, outputs=dataframe)

demo.launch(debug=True)