# -*- coding: utf-8 -*-
"""Assignment-2-IT164_ajchri5
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1RtE7mmtyUWwiuowgyQq4eCuH-ep_D1QQ
"""
# mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# Commented out IPython magic to ensure Python compatibility.
# # Hugging Face access token (read from Colab secrets)
# %%capture
# from google.colab import userdata
# hftoken=userdata.get('hftoken')
# Commented out IPython magic to ensure Python compatibility.
# # pip installs
# %%capture
# !pip install gradio
# !pip install huggingface_hub
# packages required for Colab
!pip install gradio
!pip install transformers
!pip install torchaudio
!pip install fasttext
# fastText for language detection
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
# imports required for Colab
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import torchaudio
import warnings
import fasttext
import pandas as pd
import csv
import os
# hide PySoundFile fallback warnings raised by torchaudio
warnings.filterwarnings("ignore", category=UserWarning, message="PySoundFile failed.*")
# model 1: Whisper for speech transcription
whisper_model_name = "openai/whisper-large"
processor = WhisperProcessor.from_pretrained(whisper_model_name)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
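# optional (illustrative, not used below): Whisper detects the spoken language
# on its own, but the language/task can be pinned if detection proves unreliable:
# forced_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
# whisper_model.generate(input_features, forced_decoder_ids=forced_ids)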
# model 2: MarianMT translation (Romance languages to English)
translation_model = pipeline("translation", model="Helsinki-NLP/opus-mt-ROMANCE-en")
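# quick sanity check (illustrative; the exact output wording may vary):
# translation_model("Bonjour tout le monde")[0]['translation_text']  # ~ 'Hello everyone'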
# model 3: fastText language identification (pre-trained lid.176, downloaded above)
lang_model = fasttext.load_model('lid.176.bin') # pre-trained model
# app usage history
history_data = []
# save results to a CSV file on Google Drive
def saveData(text, language, translated_text, confidence_score):
    # Google Drive path for the results file
    file_path = '/content/drive/MyDrive/IT164/a2prompt.csv'
    # check whether the file already exists; if not, write headers first
    file_exists = os.path.isfile(file_path)
    # open the CSV file in append mode
    with open(file_path, 'a', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        if not file_exists:
            # write the header row when the file is first created
            w.writerow(['Text', 'Language', 'Translation', 'Confidence Score'])
        # append the new data row
        w.writerow([text, language, translated_text, confidence_score])
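# illustrative use (assumes the Drive folder IT164 exists):
# saveData("bonjour tout le monde", "fr", "hello everyone", 0.98)
# the first call creates a2prompt.csv with a header row; later calls append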
# load an audio file and transcribe it with Whisper
def transcribe_audio(audio_file):
    # load the audio file with torchaudio
    waveform, sr = torchaudio.load(audio_file, normalize=True)
    # Whisper expects 16 kHz input, so resample anything else (e.g. a 48 kHz Mac recording)
    if sr != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resample(waveform)
        sr = 16000
    # downmix to mono; squeeze alone would leave a stereo recording 2-D
    waveform = waveform.mean(dim=0)
    # WhisperProcessor turns the raw audio into log-mel input features
    inputs = processor(waveform.numpy(), return_tensors="pt", sampling_rate=sr)
    # generate the transcription; recent transformers versions manage the decoder
    # cache (EncoderDecoderCache) internally, so no past_key_values plumbing is needed
    generated_ids = whisper_model.generate(inputs["input_features"])
    return processor.decode(generated_ids[0], skip_special_tokens=True)
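# illustrative call (the path is hypothetical):
# print(transcribe_audio('/content/sample.wav'))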
# detect language using fastText
def detect_language(text):
    # fastText rejects embedded newlines, so collapse the text to one line first
    result = lang_model.predict(text.replace('\n', ' '))
    language = result[0][0].replace('__label__', '')  # e.g. '__label__fr' -> 'fr'
    score = result[1][0]  # confidence score
    return language, score
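# illustrative output (the exact score varies with the input):
# detect_language("bonjour tout le monde")  # -> ('fr', 0.99...)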
# translate text to English
def translate_text_to_english(text):
    # opus-mt-ROMANCE-en always targets English and infers the (Romance) source
    # language itself; the pipeline ignores src_lang/tgt_lang for Marian models
    translation = translation_model(text)
    return translation[0]['translation_text']
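# optional guard (a sketch; the language set below is an assumption inferred from
# the model name, since opus-mt-ROMANCE-en only covers Romance source languages)
ROMANCE_LANGS = {'fr', 'es', 'it', 'pt', 'ro', 'ca', 'gl'}
def safe_translate(text, lang):
    # fall back to the original text when fastText reports a non-Romance language
    return translate_text_to_english(text) if lang in ROMANCE_LANGS else text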
# track history (append results to the in-memory list and persist to CSV)
def save_to_history(text, language, translation, confidence_score):
    history_data.append([text, language, translation, confidence_score])
    # save csv
    saveData(text, language, translation, confidence_score)
# full pipeline: transcribe, detect the language, translate, and log the result
def process_audio(audio_file):
    transcription = transcribe_audio(audio_file)  # speech -> text (resampled to 16 kHz inside)
    language, score = detect_language(transcription)  # detect language of the transcription
    translated_text = translate_text_to_english(transcription)  # translate to English
    save_to_history(transcription, language, translated_text, score)  # save results
    return transcription, language, score, translated_text
# toggle visibility of the history table in the Gradio UI
def update_vis(radio_value):
    table = pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"])
    return gr.DataFrame(table, visible=(radio_value == 'show'))
# gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Record your voice", type="filepath")  # audio input
            transcription_output = gr.Textbox(label="Transcription")  # transcription output
            language_output = gr.Textbox(label="Detected Language")  # detected language output
            score_output = gr.Textbox(label="Confidence Score")  # confidence score output
            translated_output = gr.Textbox(label="Translated Text to English")  # translated text output
            process_button = gr.Button("Process Audio")  # button to run the pipeline
        with gr.Column():
            history = gr.Radio(['show', 'hide'], label="App usage history")  # "show" or "hide" (history)
            dataframe = gr.DataFrame(pd.DataFrame(history_data, columns=["Text", "Language", "Translation", "Confidence Score"]), visible=False)
    # wire the button: process audio and display the outputs
    process_button.click(fn=process_audio, inputs=[audio_input], outputs=[transcription_output, language_output, score_output, translated_output])
    history.change(fn=update_vis, inputs=history, outputs=dataframe)
demo.launch(debug=True)
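# note: in Colab, demo.launch(share=True) also prints a public link in case the
# inline preview fails to load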