# Spaces: Runtime error
import locale
import os

# Make every later call to locale.getpreferredencoding() report UTF-8.
# The replacement keeps the stdlib signature (optional ``do_setlocale``) so
# callers such as ``locale.getpreferredencoding(False)`` do not raise
# TypeError, which the previous zero-argument lambda did.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

import dl_translate as dlt
import openai
from deep_translator import GoogleTranslator

from config import OPENAI_API_KEY, OPENAI_API_URL
from languages import CODE2LANG, r2l_languages

# Credentials used by the openai.ChatCompletion call in Translation.
openai.api_key = OPENAI_API_KEY
class Translation:
    '''
    Translates a transcript and its subtitle segments into a target language.

    The transcript is punctuated with an OpenAI chat model and then translated
    sentence by sentence with the injected model; subtitle segments are
    translated with GoogleTranslator.

    Parameters:
        model: object exposing ``translate(text, source=..., target=...)``;
            stored as ``self.nllb``.
        transcript_dict: mapping with a ``'text'`` string and a ``'segments'``
            list whose items have ``'start'``, ``'end'`` and ``'text'`` keys.
        source_lang: language code of the transcript (a key of CODE2LANG).
        target_lang: language code to translate into (a key of CODE2LANG).
        output_path: path joined onto the current working directory.
    '''

    # Label the system prompt asks the chat model to put before its answer.
    _RESPONSE_LABEL = 'Corrected Text:'

    # System prompt for the punctuation pass; wording kept exactly as the
    # original prompt, one source line per fragment.
    _SYSTEM_PROMPT = (
        '\n'
        'You are a helpful NLP assistant.\n'
        'Your task is to identify language of the provided text,\n'
        'correct any spelling discrepancies in the transcribed text\n'
        'as well as add punctuation in the multilingual text if they are missing.\n'
        'Only add necessary punctuation such as periods, commas, and capitalization,\n'
        'and use only the context provided.\n'
        'You response should be as follows:\n'
        'Corrected Text:\n'
        'Here goes the corrected text with punctuation.\n'
    )

    def __init__(self, model, transcript_dict, source_lang, target_lang, output_path):
        self.transcript_dict = transcript_dict
        self.output_path = os.path.join(os.getcwd(), output_path)

        # Languages
        self.source_lang = source_lang  # Whisper Detected Language
        self.target_lang = target_lang

        # Transcript
        self.transcript = transcript_dict['text'].strip()
        self.subtitles = self.__get_subtitles()

        # Translation Model
        self.nllb = model

    def __get_subtitles(self):
        '''
        Returns the subtitles from transcript dictionary as a list of
        ``{'start', 'end', 'text'}`` dicts with the text stripped.
        '''
        return [
            {
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
            }
            for segment in self.transcript_dict['segments']
        ]

    @staticmethod
    def _strip_response_label(text):
        '''
        Removes the "Corrected Text:" label from a chat model reply.
        '''
        label = Translation._RESPONSE_LABEL
        cleaned = text.replace(label + '\n', '').strip()
        # The model sometimes answers on the same line as the label.
        if cleaned.startswith(label):
            cleaned = cleaned[len(label):].lstrip()
        return cleaned

    @staticmethod
    def _split_sentences(text, splitter):
        '''
        Splits text on splitter and returns the non-empty, stripped sentences.
        '''
        # Dropping empty pieces avoids translating the fragment after the
        # final terminator and emitting a dangling terminator for it.
        pieces = (piece.strip() for piece in text.split(splitter))
        return [piece for piece in pieces if piece]

    def __correct_punctuation_gpt(self):
        '''
        Corrects the Punctuation from GPT

        Sends the transcript to the gpt-3.5-turbo chat model and returns the
        reply with the response label removed.
        '''
        user_prompt = f'\nHere is the text:\n{self.transcript}\n'
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ]
        )
        return self._strip_response_label(response.choices[0].message.content)

    def get_translated_transcript(self):
        '''
        Translates the transcript into required language

        Returns the translated sentences joined by a space, each followed by
        a sentence terminator; returns '' when the transcript is empty.
        '''
        # Nothing to punctuate or translate; skip the remote calls.
        if not self.transcript:
            return ''

        # Correcting Punctuation using GPT
        transcript = self.__correct_punctuation_gpt()

        # Sentence terminators. The source one is used to split, the target
        # one to re-join, so the output carries the target language's mark.
        # NOTE(review): assumes every language in r2l_languages ends
        # sentences with '۔' - confirm against the languages module.
        source_stop = '۔' if self.source_lang in r2l_languages else '.'
        target_stop = '۔' if self.target_lang in r2l_languages else '.'

        source_name = CODE2LANG[self.source_lang]
        target_name = CODE2LANG[self.target_lang]

        # Getting Translation using NLLB
        translated_sentences = []
        for sentence in self._split_sentences(transcript, source_stop):
            translated_sentence = self.nllb.translate(
                sentence, source=source_name, target=target_name
            )
            translated_sentences.append(translated_sentence + target_stop)

        return ' '.join(translated_sentences).strip()

    def get_translated_subtitles(self):
        '''
        Translates the subtitles into required language

        Returns a new list of segment dicts; ``self.subtitles`` and its
        dicts are left untouched.
        '''
        if not self.subtitles:
            return []

        # Creating Instance for Google Translator
        gt = GoogleTranslator(source='auto', target=self.target_lang)

        translated = []
        for segment in self.subtitles:
            text = segment['text']
            # Build a fresh dict per segment: a shallow list copy would share
            # the dicts and overwrite the original subtitle text.
            # Empty text is passed through rather than sent to the translator.
            translated.append(
                dict(segment, text=gt.translate(text=text) if text else text)
            )
        return translated