import gradio as gr import torch import time import os from src.transcription_utils import transcribe, language_options, model_options, ModelManager class TranscriptionApp: def __init__(self): """ Initializes an instance with a ModelManager for managing AI models, sets default device and model based on CUDA availability, and prepares a Gradio app and outputs dictionary for UI interactions and storing results. """ self.model_manager = ModelManager() self.default_device = "cuda" if torch.cuda.is_available() else "cpu" self.default_model = "Large-v3" if torch.cuda.is_available() else "Small" self.app = gr.Blocks() self.outputs = {} self.last_transcription_time = 0 # Crear carpeta Temp si no existe if not os.path.exists('Temp'): os.makedirs('Temp') def start_transcription(self, file, device, language, model): """Start transcription process.""" start_time = time.time() try: results = transcribe(file, device, language, model, self.model_manager) except ValueError as e: return str(e), 0 end_time = time.time() self.last_transcription_time = round(end_time - start_time, 1) if results: json_output, txt_path, vtt_path, srt_path = results self.outputs = { 'TXT': txt_path, 'SRT': srt_path, 'JSON': json_output, 'VTT': vtt_path } return self.update_output_text('TXT'), self.last_transcription_time return "No transcription available.", self.last_transcription_time def update_output_text(self, format_choice): """Update the text area based on the format choice.""" if format_choice and self.outputs.get(format_choice): file_path = self.outputs[format_choice] try: with open(file_path, 'r', encoding='utf-8') as file: return file.read() except FileNotFoundError: return "File not found." return "No file available or format not selected." # User interface for the transcription kit using Gradio def setup_ui(self): with self.app: gr.Markdown("# Whisperx Kit") gr.Markdown("❤️ Try to make a nice open source transcription toolkit.") gr.Markdown("❤️ Base on https://github.com/rgcodeai/Kit-Whisperx. Thanks for Ricardo G.") with gr.Row(): with gr.Column(): gr.Markdown("### Supported Formats: Audio (mp3, wav) and Video (mp4, avi, mov, flv)") file_input = gr.File(label="Upload your multimedia file", type="file") device_dropdown = gr.Dropdown(label="Select device", choices=["cuda", "cpu"], value=self.default_device) model_dropdown = gr.Dropdown(label="Select model", choices=list(model_options.keys()), value=self.default_model) language_dropdown = gr.Dropdown(label="Select language", choices=list(language_options.keys()), value="Identify") transcribe_button = gr.Button("Start Transcription") with gr.Column(): transcription_time_display = gr.Textbox(label="Last Transcription Time (seconds)", interactive=False, lines=1) format_choice = gr.Radio(['TXT', 'SRT', 'VTT', 'JSON'], label="Select format to view:", value='TXT') output_text = gr.Textbox(label="File Content", interactive=False, lines=10) download_button = gr.Button("Download Transcription") format_choice.change(fn=self.update_output_text, inputs=format_choice, outputs=output_text) download_button.click(fn=lambda x: self.outputs.get(x), inputs=format_choice, outputs=gr.File()) transcribe_button.click(fn=self.start_transcription, inputs=[file_input, device_dropdown, language_dropdown, model_dropdown], outputs=[output_text, transcription_time_display]) def launch(self): """Launch the transcription application.""" self.setup_ui() self.app.launch() if __name__ == '__main__': app = TranscriptionApp() app.launch()