# whispertube_backend / translation.py
# uzi007's picture
# Added GPU Models
# a689179
import os
import openai
import locale
# Replace locale.getpreferredencoding so every later caller gets "UTF-8".
# NOTE(review): this is done before the dl_translate import below; presumably
# a workaround for hosts whose locale is not UTF-8 -- confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
import dl_translate as dlt
from deep_translator import GoogleTranslator
from languages import CODE2LANG, r2l_languages
# NOTE(review): OPENAI_API_URL is imported but not used in the visible code.
from config import OPENAI_API_KEY, OPENAI_API_URL
# Module-level side effect: configures the API key used by the openai client.
openai.api_key = OPENAI_API_KEY
class Translation:
    '''
    Translates a transcript dictionary (full text plus timed segments).

    The full transcript is punctuation-corrected through the OpenAI chat
    API, split into sentences and translated sentence by sentence with the
    model passed to the constructor. Subtitle segments are translated
    individually with GoogleTranslator.
    '''

    # Label the system prompt asks the model to put before the corrected text.
    _RESPONSE_LABEL = 'Corrected Text:'

    # System prompt sent verbatim to the chat model.
    _SYSTEM_PROMPT = (
        "\n"
        "You are a helpful NLP assistant.\n"
        "Your task is to identify language of the provided text,\n"
        "correct any spelling discrepancies in the transcribed text\n"
        "as well as add punctuation in the multilingual text if they are missing.\n"
        "Only add necessary punctuation such as periods, commas, and capitalization,\n"
        "and use only the context provided.\n"
        "You response should be as follows:\n"
        "Corrected Text:\n"
        "Here goes the corrected text with punctuation.\n"
    )

    def __init__(self, model, transcript_dict, source_lang, target_lang, output_path):
        '''
        Stores the translation inputs and extracts the subtitle segments.

        model: translation model; it must expose
            translate(text, source=..., target=...) as called in
            get_translated_transcript.
        transcript_dict: dictionary with a 'text' string and a 'segments'
            list whose items carry 'start', 'end' and 'text'.
        source_lang: language code of the transcript (Whisper detected).
        target_lang: language code to translate into.
        output_path: path joined onto the current working directory.
        '''
        self.transcript_dict = transcript_dict
        self.output_path = os.path.join(os.getcwd(), output_path)

        # Languages
        self.source_lang = source_lang  # Whisper Detected Language
        self.target_lang = target_lang

        # Transcript
        self.transcript = transcript_dict['text'].strip()
        self.subtitles = self.__get_subtitles()

        # Translation Model
        self.nllb = model

    def __get_subtitles(self):
        '''
        Returns the subtitles from transcript dictionary

        Each subtitle is a new dict holding only 'start', 'end' and the
        whitespace-stripped 'text' of the corresponding segment.
        '''
        return [
            {
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
            }
            for segment in self.transcript_dict['segments']
        ]

    @classmethod
    def _strip_response_label(cls, content):
        '''
        Removes the leading 'Corrected Text:' label from a model reply

        Surrounding whitespace is dropped first so the label is found even
        when the reply starts with a blank line or uses a space instead of a
        newline after the colon.
        '''
        text = content.strip()
        if text.startswith(cls._RESPONSE_LABEL):
            text = text[len(cls._RESPONSE_LABEL):].strip()
        return text

    @staticmethod
    def _split_sentences(text, splitter):
        '''
        Splits text on the splitter and returns the non-empty sentences

        Fragments are stripped; empty ones (e.g. the piece after the final
        terminator) are discarded so they are never sent to the model.
        '''
        fragments = (fragment.strip() for fragment in text.split(splitter))
        return [fragment for fragment in fragments if fragment]

    @staticmethod
    def _translate_segments(segments, translate):
        '''
        Returns translated copies of the segments

        segments: iterable of dicts with a 'text' key.
        translate: callable taking a string and returning its translation.

        A new dict is built for every segment so the input dicts are left
        untouched. Empty texts are kept as they are and not passed to the
        translate callable.
        '''
        translated = []
        for segment in segments:
            text = segment['text']
            translated.append({**segment, 'text': translate(text) if text else text})
        return translated

    def __correct_punctuation_gpt(self):
        '''
        Corrects the Punctuation from GPT

        Sends the stored transcript to the chat model and returns the reply
        with the 'Corrected Text:' label removed.
        '''
        user_prompt = f"\nHere is the text:\n{self.transcript}\n"
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        return self._strip_response_label(response.choices[0].message.content)

    def get_translated_transcript(self):
        '''
        Translates the transcript into required language

        Returns the translated sentences, each followed by the sentence
        terminator and separated by single spaces. An empty transcript
        returns '' without calling GPT or the translation model.
        '''
        if not self.transcript:
            return ''

        # Correcting Punctuation using GPT
        transcript = self.__correct_punctuation_gpt()

        # Splitting Text into Sentences
        # NOTE(review): the terminator is chosen from the source language and
        # re-appended to the translated sentences; confirm this is intended
        # when source and target scripts differ.
        splitter = '۔' if self.source_lang in r2l_languages else '.'

        # Language names are the same for every sentence, so look them up once.
        source = CODE2LANG[self.source_lang]
        target = CODE2LANG[self.target_lang]

        # Getting Translation using NLLB
        translated_sentences = [
            self.nllb.translate(sentence, source=source, target=target) + splitter
            for sentence in self._split_sentences(transcript, splitter)
        ]
        return ' '.join(translated_sentences)

    def get_translated_subtitles(self):
        '''
        Translates the subtitles into required language

        Returns a new list of segment dicts with translated 'text';
        self.subtitles keeps the original text, so the method can be called
        repeatedly.
        '''
        # Creating Instance for Google Translator
        gt = GoogleTranslator(source='auto', target=self.target_lang)
        return self._translate_segments(
            self.subtitles,
            lambda text: gt.translate(text=text),
        )