rgcodeai committed on
Commit
b8f59b3
1 Parent(s): aeec9bd

Upload 9 files

README.md CHANGED
@@ -1,12 +1,42 @@
- ---
- title: Kit Whisperx Web Ui
- emoji: 🚀
- colorFrom: pink
- colorTo: yellow
- sdk: gradio
- sdk_version: 4.31.5
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # **WhisperX Local Installation Kit**
+
+ ## **Description**
+
+ This project enables the local installation and use of WhisperX, an advanced audio transcription system based on OpenAI's Whisper but optimized to run on local hardware with or without a GPU. This project is made possible by [Whisperx](https://github.com/m-bain/whisperX) and [Faster Whisper](https://github.com/SYSTRAN/faster-whisper). This document gives a general overview of the installation and links to the website with the [complete installation and usage](https://mistercontenidos.com/en/how-to-install-whisperx-locally) procedure for this project.
+
+ ## **Requirements**
+
+ - Miniconda
+ - CUDA (only for NVIDIA GPU users)
+
+ ## **File Description**
+
+ - **`environment-cuda.yml`**: Configuration file for automatic installation on systems with an NVIDIA GPU.
+ - **`environment-cpu.yml`**: Configuration file for automatic installation on systems without an NVIDIA GPU.
+ - **`app.py`**: Script that runs the WhisperX user interface on Gradio.
+ - **`transcription_utils.py`**: Transcription logic: model loading and caching, transcription, alignment, and export to TXT, SRT, VTT, and JSON.
+
+ ## **Installation**
+
+ 1. **Miniconda**: [Miniconda Installation](https://docs.anaconda.com/free/miniconda/)
+ 2. **CUDA**: [CUDA Installation](https://developer.nvidia.com/cuda-toolkit-archive) (only for NVIDIA GPU users)
+ 3. **GitHub Repository**: Download and set up the repository. See details on our website.
+
+ For a detailed step-by-step installation process, please visit our website: [View complete installation process](https://mistercontenidos.com/en/how-to-install-whisperx-locally)
+
+ ## **Usage**
+
+ To use WhisperX after completing the installation:
+
+ 1. Activate the corresponding Conda environment.
+ 2. Run **`python app.py`** to start the Gradio user interface (or call the transcription logic directly, as in the sketch below).
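+
+ If you prefer to skip the UI, the transcription logic can also be called from Python. A minimal sketch, assuming it is run from the repository root with the environment active; `Temp/sample.mp3` is a hypothetical file path you supply:
+
+ ```python
+ from src.transcription_utils import transcribe, ModelManager
+
+ manager = ModelManager()  # loads a default model based on CUDA availability
+ # Signature: transcribe(file, device, language, model_choice, model_manager)
+ results = transcribe("Temp/sample.mp3", "cpu", "English", "Medium", manager)
+ if results:
+     json_path, txt_path, vtt_path, srt_path = results
+     with open(txt_path, encoding="utf-8") as f:
+         print(f.read())
+ ```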
+
+ ## **Authors**
+
+ - [MISTER CONTENTS](https://mistercontenidos.com/)
+ - [Ricardo Gonzalez](https://www.linkedin.com/in/pedrocuervomkt/)
+
+ ## **Languages**
+
+ - [Spanish](docs/README_ES.md)
+ - [Portuguese](docs/README_PT.md)
app.py ADDED
@@ -0,0 +1,92 @@
+ import gradio as gr
+ import torch
+ import time
+ import os
+ from src.transcription_utils import transcribe, language_options, model_options, ModelManager
+
+ class TranscriptionApp:
+     def __init__(self):
+         """
+         Initializes an instance with a ModelManager for managing AI models,
+         sets the default device and model based on CUDA availability,
+         and prepares a Gradio app and an outputs dictionary for UI interactions and storing results.
+         """
+         self.model_manager = ModelManager()
+         self.default_device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.default_model = "Large-v2" if torch.cuda.is_available() else "Medium"
+         self.app = gr.Blocks()
+         self.outputs = {}
+         self.last_transcription_time = 0
+
+         # Create the Temp folder if it does not exist
+         if not os.path.exists('Temp'):
+             os.makedirs('Temp')
+
+     def start_transcription(self, file, device, language, model):
+         """Start the transcription process and record how long it took."""
+         start_time = time.time()
+
+         try:
+             results = transcribe(file, device, language, model, self.model_manager)
+         except ValueError as e:
+             return str(e), 0
+
+         end_time = time.time()
+         self.last_transcription_time = round(end_time - start_time, 1)
+
+         if results:
+             json_output, txt_path, vtt_path, srt_path = results
+             self.outputs = {
+                 'TXT': txt_path,
+                 'SRT': srt_path,
+                 'JSON': json_output,
+                 'VTT': vtt_path
+             }
+             return self.update_output_text('TXT'), self.last_transcription_time
+         return "No transcription available.", self.last_transcription_time
+
+     def update_output_text(self, format_choice):
+         """Update the text area based on the chosen format."""
+         if format_choice and self.outputs.get(format_choice):
+             file_path = self.outputs[format_choice]
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as file:
+                     return file.read()
+             except FileNotFoundError:
+                 return "File not found."
+         return "No file available or format not selected."
+
+     # User interface for the transcription kit, built with Gradio
+     def setup_ui(self):
+         with self.app:
+             gr.Markdown("# Kit Transcriptor Whisperx")
+             gr.Markdown("❤️ Follow us on [YouTube](https://www.youtube.com/channel/UC_YzjCh-CSSCSGANvt5wBNQ?sub_confirmation=1), [GitHub](https://github.com/rgcodeai) 🌐 More on [Mister Contenidos](https://mistercontenidos.com)")
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("### Supported Formats: Audio (mp3, wav) and Video (mp4, avi, mov, flv)")
+                     file_input = gr.File(label="Upload your multimedia file", type="filepath")
+                     device_dropdown = gr.Dropdown(label="Select device", choices=["cuda", "cpu"], value=self.default_device)
+                     model_dropdown = gr.Dropdown(label="Select model", choices=list(model_options.keys()), value=self.default_model)
+                     language_dropdown = gr.Dropdown(label="Select language", choices=list(language_options.keys()), value="Identify")
+                     transcribe_button = gr.Button("Start Transcription")
+
+                 with gr.Column():
+                     transcription_time_display = gr.Textbox(label="Last Transcription Time (seconds)", interactive=False, lines=1)
+                     format_choice = gr.Radio(['TXT', 'SRT', 'VTT', 'JSON'], label="Select format to view:", value='TXT')
+                     output_text = gr.Textbox(label="File Content", interactive=False, lines=10)
+                     download_button = gr.Button("Download Transcription")
+                     format_choice.change(fn=self.update_output_text, inputs=format_choice, outputs=output_text, queue=True)
+                     download_button.click(fn=lambda x: self.outputs.get(x), inputs=format_choice, outputs=gr.File())
+
+             transcribe_button.click(fn=self.start_transcription, inputs=[file_input, device_dropdown, language_dropdown, model_dropdown], outputs=[output_text, transcription_time_display])
+
+     def launch(self):
+         """Launch the transcription application."""
+         self.setup_ui()
+         self.app.launch()
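+
+ # Note (assumption about Gradio defaults): launch() serves the UI locally,
+ # typically at http://127.0.0.1:7860; passing share=True to self.app.launch()
+ # would additionally create a temporary public link.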
+
+ if __name__ == '__main__':
+     app = TranscriptionApp()
+     app.launch()
docs/README_ES.md ADDED
@@ -0,0 +1,35 @@
+ # WhisperX Local Installation Kit
+
+ ## Description
+ This project enables the local installation and use of WhisperX, an advanced audio transcriber based on OpenAI Whisper but optimized to run on local hardware with or without a GPU. This project is made possible by [Whisperx](https://github.com/m-bain/whisperX) and [Faster Whisper](https://github.com/SYSTRAN/faster-whisper). This document gives a general overview of the installation and a reference to the website with the [complete installation and usage procedure](https://mistercontenidos.com/como-instalar-whisperx-en-local) for this project.
+ ## Requirements
+ - Miniconda
+ - CUDA (only for NVIDIA GPU users)
+
+ ## File Description
+ - `environment-cuda.yml`: Configuration file for automatic installation on systems with an NVIDIA GPU.
+ - `environment-cpu.yml`: Configuration file for automatic installation on systems without an NVIDIA GPU.
+ - `app.py`: Script that runs the WhisperX user interface on Gradio.
+ - `transcription_utils.py`: Transcription logic.
+
+ ## Installation
+
+ 1. **Miniconda**: [Miniconda Installation](https://docs.anaconda.com/free/miniconda/)
+ 2. **CUDA**: [CUDA Installation](https://developer.nvidia.com/cuda-toolkit-archive) (only for NVIDIA GPU users)
+ 3. **GitHub Repository**: Download and set up the repository. See details on our website.
+
+ For a detailed step-by-step installation process, please visit our website: [View complete installation process](https://mistercontenidos.com/como-instalar-whisperx-en-local)
+
+ ## Usage
+ To use WhisperX after completing the installation:
+ 1. Activate the corresponding Conda environment.
+ 2. Run `python app.py` to start the Gradio user interface.
+
+ ## Authors
+ - [MISTER CONTENIDOS](https://mistercontenidos.com/)
+ - [Ricardo Gonzalez](https://www.linkedin.com/in/pedrocuervomkt/)
+
+ ## Languages
+
+ - [English](../README.md)
+ - [Portuguese](README_PT.md)
docs/README_PT.md ADDED
@@ -0,0 +1,42 @@
+ # **WhisperX Local Installation Kit**
+
+ ## **Description**
+
+ This project enables the local installation and use of WhisperX, an advanced audio transcription system based on OpenAI Whisper but optimized to run on local hardware with or without a GPU. This project is made possible by [Whisperx](https://github.com/m-bain/whisperX) and [Faster Whisper](https://github.com/SYSTRAN/faster-whisper). This document gives a general overview of the installation and links to the website with the [complete installation and usage procedure for this project](https://mistercontenidos.com/pt/como-instalar-o-whisperx-localmente).
+
+ ## **Requirements**
+
+ - Miniconda
+ - CUDA (only for NVIDIA GPU users)
+
+ ## **File Description**
+
+ - **`environment-cuda.yml`**: Configuration file for automatic installation on systems with an NVIDIA GPU.
+ - **`environment-cpu.yml`**: Configuration file for automatic installation on systems without an NVIDIA GPU.
+ - **`app.py`**: Script that runs the WhisperX user interface on Gradio.
+ - **`transcription_utils.py`**: Transcription logic.
+
+ ## **Installation**
+
+ 1. **Miniconda**: [Miniconda Installation](https://docs.anaconda.com/free/miniconda/)
+ 2. **CUDA**: [CUDA Installation](https://developer.nvidia.com/cuda-toolkit-archive) (only for NVIDIA GPU users)
+ 3. **GitHub Repository**: Download and set up the repository. See details on our website.
+
+ For a detailed step-by-step installation process, please visit our website: [View complete installation process](https://mistercontenidos.com/pt/como-instalar-o-whisperx-localmente)
+
+ ## **Usage**
+
+ To use WhisperX after completing the installation:
+
+ 1. Activate the corresponding Conda environment.
+ 2. Run **`python app.py`** to start the Gradio user interface.
+
+ ## **Authors**
+
+ - [MISTER CONTEÚDOS](https://mistercontenidos.com/)
+ - [Ricardo Gonzalez](https://www.linkedin.com/in/pedrocuervomkt/)
+
+ ## **Languages**
+
+ - [English](../README.md)
+ - [Spanish](README_ES.md)
environment-cpu.yml ADDED
@@ -0,0 +1,14 @@
+ name: whisperx-web-ui
+ channels:
+   - defaults
+   - conda-forge
+   - pytorch
+ dependencies:
+   - python=3.10
+   - pytorch=2.0.0
+   - torchaudio=2.0.0
+   - conda-forge::gradio
+   - conda-forge::ffmpeg
+   - pip
+   - pip:
+       - whisperx
environment-cuda.yml ADDED
@@ -0,0 +1,16 @@
+ name: whisperx-web-ui
+ channels:
+   - defaults
+   - conda-forge
+   - pytorch
+   - nvidia
+ dependencies:
+   - python=3.10
+   - pytorch=2.0.0
+   - torchaudio=2.0.0
+   - pytorch-cuda=11.8
+   - conda-forge::gradio
+   - conda-forge::ffmpeg
+   - pip
+   - pip:
+       - whisperx
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==2.0.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+ torchaudio==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html
+ gradio
+ ffmpeg
+ whisperx
src/__init__.py ADDED
File without changes
src/transcription_utils.py ADDED
@@ -0,0 +1,176 @@
+ import whisperx
+ import json
+ import os
+ import torch
+ import mimetypes
+ import shutil
+
+ # Language options (display name -> Whisper language code; None triggers auto-detection)
+ language_options = {
+     "Identify": None,
+     "English": "en", "Spanish": "es", "Chinese": "zh", "Hindi": "hi", "Arabic": "ar",
+     "Portuguese": "pt", "Bengali": "bn", "Russian": "ru", "Japanese": "ja", "Punjabi": "pa",
+     "German": "de", "Javanese": "jv", "Wu Chinese": "zh", "Malay": "ms", "Telugu": "te",
+     "Vietnamese": "vi", "Korean": "ko", "French": "fr", "Marathi": "mr", "Turkish": "tr"
+ }
+
+ # Available models for transcription (display name -> whisperx model name)
+ model_options = {
+     "Large-v2": "large-v2",
+     "Medium": "medium",
+     "Small": "small",
+     "Base": "base"
+ }
+
+ # Caches the currently loaded model; on construction it loads a default model
+ # chosen from the system's capabilities (CUDA availability).
+ class ModelManager:
+     def __init__(self):
+         self.current_model = None
+         self.current_model_name = None
+         self.current_device = None
+         if torch.cuda.is_available():
+             default_device = "cuda"
+             default_model = "Large-v2"
+         else:
+             default_device = "cpu"
+             default_model = "Medium"
+         self.load_model(default_model, default_device)
+
+     def load_model(self, model_choice, device):
+         # Reload only when the requested model or device differs from the cached one
+         if self.current_model is None or model_choice != self.current_model_name or device != self.current_device:
+             print(f"Attempting to load model: {model_choice} on device: {device}")
+             compute_type = "float32" if device == "cpu" else "float16"
+             self.current_model = whisperx.load_model(model_options[model_choice], device, compute_type=compute_type)
+             self.current_model_name = model_choice
+             self.current_device = device
+         else:
+             print(f"Using already loaded model: {self.current_model_name} on device: {self.current_device}")
+         return self.current_model
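+
+ # Example (hypothetical sketch): repeated calls with the same arguments reuse
+ # the cached model rather than reloading it from disk:
+ #   mm = ModelManager()
+ #   m1 = mm.load_model("Medium", "cpu")
+ #   m2 = mm.load_model("Medium", "cpu")  # hits the cache; m1 is m2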
+
+ # Validates that the given path points to a multimedia file (audio or video)
+ # by checking MIME types and, as a fallback, specific file extensions.
+ def validate_multimedia_file(file_path):
+     file_path = os.path.normpath(file_path)
+     mime_type, _ = mimetypes.guess_type(file_path)
+     if mime_type and (mime_type.startswith('audio') or mime_type.startswith('video')):
+         return file_path
+     else:
+         if file_path.lower().endswith(('.mp3', '.mp4', '.wav', '.avi', '.mov', '.flv')):
+             return file_path
+         else:
+             raise ValueError("The uploaded file is not a multimedia file. Please upload an appropriate audio or video file.")
+
+ # Transcribes a multimedia file
+ def transcribe(file_obj, device, language, model_choice, model_manager):
+     """
+     Transcribes a multimedia file using the chosen model, handling file operations,
+     language identification, and transcription alignment, and writes the transcription in multiple formats.
+     """
+     # gr.File(type="filepath") passes a plain path string; older Gradio versions
+     # passed a tempfile wrapper with a .name attribute, so accept both.
+     source_path = file_obj if isinstance(file_obj, str) else file_obj.name
+     _, ext = os.path.splitext(source_path)
+     temp_dir = os.path.join(os.getcwd(), 'Temp')
+
+     if not os.path.exists(temp_dir):
+         os.makedirs(temp_dir)
+     new_file_path = os.path.join(temp_dir, f'resource{ext}')
+
+     shutil.copy(source_path, new_file_path)
+
+     model = model_manager.load_model(model_choice, device)
+
+     validated_file_path = validate_multimedia_file(new_file_path)
+     audio = whisperx.load_audio(validated_file_path)
+
+     if language == "Identify":
+         result = model.transcribe(audio)
+         language_code = result["language"]
+     else:
+         language_code = language_options[language]
+         result = model.transcribe(audio, language=language_code)
+
+     model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device)
+     try:
+         aligned_segments = []
+         for segment in result["segments"]:
+             aligned_segment = whisperx.align([segment], model_a, metadata, audio, device, return_char_alignments=False)
+             aligned_segments.extend(aligned_segment["segments"])
+     except Exception as e:
+         print(f"Error during alignment: {e}")
+         return None
+
+     segments_output = {"segments": aligned_segments}
+     json_output = json.dumps(segments_output, ensure_ascii=False, indent=4)
+     json_file_path = download_json_interface(json_output, temp_dir)
+     txt_path = save_as_text(aligned_segments, temp_dir)
+     vtt_path = save_as_vtt(aligned_segments, temp_dir)
+     srt_path = save_as_srt(aligned_segments, temp_dir)
+     return json_file_path, txt_path, vtt_path, srt_path
+
+ # Saves the transcription text of the audio segments to a file in the given temporary directory and returns the file path.
+ def save_as_text(segments, temp_dir):
+     txt_file_path = os.path.join(temp_dir, 'transcription_output.txt')
+     with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
+         for segment in segments:
+             txt_file.write(f"{segment['text'].strip()}\n")
+     return txt_file_path
+
+
+ def save_as_vtt(segments, temp_dir):
+     """
+     Saves the transcription as a .vtt file (Web Video Text Tracks format),
+     which includes timestamps for each segment, in the given temporary directory, and returns the file path.
+     """
+     vtt_file_path = os.path.join(temp_dir, 'transcription_output.vtt')
+     with open(vtt_file_path, 'w', encoding='utf-8') as vtt_file:
+         vtt_file.write("WEBVTT\n\n")
+         for i, segment in enumerate(segments):
+             start = segment['start']
+             end = segment['end']
+             vtt_file.write(f"{i}\n")
+             vtt_file.write(f"{format_time(start)} --> {format_time(end)}\n")
+             vtt_file.write(f"{segment['text'].strip()}\n\n")
+     return vtt_file_path
+
+ def download_json_interface(json_data, temp_dir):
+     """
+     Reads the JSON-formatted transcription data, strips whitespace from each segment's text,
+     re-saves it as a neatly formatted JSON file in the given temporary directory, and returns the file path.
+     """
+     json_file_path = os.path.join(temp_dir, 'transcription_output.json')
+     with open(json_file_path, 'w', encoding='utf-8') as json_file:
+         json_data = json.loads(json_data)
+         for segment in json_data['segments']:
+             segment['text'] = segment['text'].strip()
+         json_data = json.dumps(json_data, ensure_ascii=False, indent=4)
+         json_file.write(json_data)
+     return json_file_path
+
+
+ def save_as_srt(segments, temp_dir):
+     """
+     Saves the transcription as an .srt file (SubRip Subtitle format),
+     with numbered entries, start and end times, and the corresponding text for each segment,
+     in the given temporary directory, and returns the file path.
+     """
+     srt_file_path = os.path.join(temp_dir, 'transcription_output.srt')
+     with open(srt_file_path, 'w', encoding='utf-8') as srt_file:
+         for i, segment in enumerate(segments):
+             start = segment['start']
+             end = segment['end']
+             srt_file.write(f"{i+1}\n")
+             srt_file.write(f"{format_time_srt(start)} --> {format_time_srt(end)}\n")
+             srt_file.write(f"{segment['text'].strip()}\n\n")
+     return srt_file_path
+
+ # Converts a time value in seconds to an "HH:MM:SS.mmm" string, the timestamp format used in VTT files.
+ def format_time(time_in_seconds):
+     hours = int(time_in_seconds // 3600)
+     minutes = int((time_in_seconds % 3600) // 60)
+     seconds = time_in_seconds % 60
+     return f"{hours:02}:{minutes:02}:{seconds:06.3f}"
+
+ # Converts a time value in seconds to an "HH:MM:SS,mmm" string, the timestamp format used in SRT files.
+ def format_time_srt(time_in_seconds):
+     hours = int(time_in_seconds // 3600)
+     minutes = int((time_in_seconds % 3600) // 60)
+     seconds = int(time_in_seconds % 60)
+     milliseconds = int((time_in_seconds - int(time_in_seconds)) * 1000)
+     return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
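+
+ # Quick sanity check (hypothetical input): 3661.5 seconds formats as
+ #   format_time(3661.5)     -> "01:01:01.500"  (VTT, dot before milliseconds)
+ #   format_time_srt(3661.5) -> "01:01:01,500"  (SRT, comma before milliseconds)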