# Source: datasetTTS / app.py (author: Woziii, commit 90c7625 "Update app.py", verified, 8.02 kB)
# Version: Corrected After Test 3 (V2.3.0 - Structured, Commented, and ZIP Generation Restored)
# Description: Cette version structure le script selon l'ordre des étapes du processus.
# La génération du fichier ZIP a été réintégrée après avoir été omise dans la version précédente.
# Chaque section est commentée pour assurer une meilleure lisibilité et une logique claire.
import os
import shutil
import zipfile
import torch
import numpy as np
from pathlib import Path
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline
# -------------------------------------------------
# 1. Configuration et Initialisation
# -------------------------------------------------
MODEL_NAME = "openai/whisper-large-v3"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the Whisper speech-recognition pipeline once, at import time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    model_kwargs=dict(low_cpu_mem_usage=True),
)

# Scratch directory that receives the exported audio excerpts.
TEMP_DIR = "./temp_audio"
Path(TEMP_DIR).mkdir(parents=True, exist_ok=True)
def init_metadata_state():
    """Return a new, empty list used as the initial metadata state."""
    return list()
# -------------------------------------------------
# 2. Transcription de l'audio avec Whisper
# -------------------------------------------------
def transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper and derive a time span per word.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            means that no file was provided.

    Returns:
        A 5-tuple ``(raw_transcription, table_rows, audio_path,
        word_timestamps, transcription_with_timestamps)``:

        * ``raw_transcription``: the recognised words joined by single
          spaces, or a message when nothing could be transcribed.
        * ``table_rows``: always an empty list.
        * ``audio_path``: the input path, or ``None`` on failure.
        * ``word_timestamps``: list of ``(word, (start, end))`` tuples.
        * ``transcription_with_timestamps``: the words rendered as
          ``word[start-end]`` with two decimals, or ``""`` on failure.
    """
    if not audio_path:
        print("[LOG] Aucun fichier audio fourni.")
        return "Aucun fichier audio fourni", [], None, [], ""

    print(f"[LOG] Début de la transcription de {audio_path}...")
    result = pipe(audio_path, return_timestamps="word")
    chunks = result.get("chunks", [])

    # Strip the whitespace surrounding each word so that the words can later
    # be compared with the output of ``str.split()`` and so that joining them
    # does not produce double spaces. Words without a start time cannot be
    # placed on the timeline and are left out of the timestamp list.
    spoken_words = []
    timed_words = []
    for chunk in chunks:
        text = (chunk.get("text") or "").strip()
        if not text:
            continue
        spoken_words.append(text)
        timestamp = chunk.get("timestamp") or (None, None)
        if timestamp[0] is not None:
            timed_words.append((text, timestamp[0]))

    if not timed_words:
        print("[LOG ERROR] Erreur : Aucun timestamp détecté.")
        return "Erreur : Aucun timestamp détecté.", [], None, [], ""

    raw_transcription = " ".join(spoken_words)

    word_timestamps = []
    for index, (text, start) in enumerate(timed_words):
        if index + 1 < len(timed_words):
            # End = start of the next word minus 10 ms.
            end = timed_words[index + 1][1] - 0.01
        else:
            # Last word: no successor, assume a duration of half a second.
            end = start + 0.5
        # Two words sharing a start time must not yield a negative span.
        end = max(end, start)
        word_timestamps.append((text, (start, end)))

    transcription_with_timestamps = " ".join(
        f"{word}[{start:.2f}-{end:.2f}]" for word, (start, end) in word_timestamps
    )
    print(f"[LOG] Transcription brute : {raw_transcription}")
    print(f"[LOG DETAIL] Timestamps associés : {word_timestamps}")
    return raw_transcription, [], audio_path, word_timestamps, transcription_with_timestamps
# -------------------------------------------------
# 3. Prétraitement des segments : Associer les timestamps aux phrases sélectionnées
# -------------------------------------------------
def _normalize_token(token):
    """Return *token* case-folded, without surrounding whitespace or punctuation."""
    return str(token).strip().strip(".,;:!?\"'()[]{}«»…").casefold()


def _find_word_run(tokens, needle, start_at):
    """Return the first index >= *start_at* where *needle* occurs contiguously in *tokens*, or -1."""
    last_start = len(tokens) - len(needle)
    for pos in range(start_at, last_start + 1):
        if tokens[pos:pos + len(needle)] == needle:
            return pos
    return -1


def preprocess_segments(table_data, word_timestamps):
    """Attach start/end times and an identifier to each text segment of the table.

    Each segment is located in the transcript as a *contiguous* run of words
    (ignoring case and surrounding punctuation). The search first resumes
    after the previously matched segment, so a phrase that occurs several
    times is resolved in reading order, then falls back to the beginning of
    the transcript.

    Args:
        table_data: Rows whose first cell holds the segment text. A list of
            lists is expected; objects exposing ``values`` / ``tolist``
            (such as a pandas DataFrame or a numpy array) are converted.
        word_timestamps: List of ``(word, (start, end))`` tuples, or ``None``.

    Returns:
        A list of ``[text, start_time, end_time, segment_id]`` rows, one per
        non-empty input row. ``start_time`` and ``end_time`` are ``None``
        when the segment could not be found in the transcript. The segment
        id is ``seg_NN`` where ``NN`` is the 1-based index of the input row.
    """
    print("[LOG] Début du prétraitement des segments...")

    # Accept DataFrame-like and array-like inputs in addition to plain lists.
    values = getattr(table_data, "values", None)
    if values is not None and not callable(values):
        table_data = values
    if hasattr(table_data, "tolist"):
        table_data = table_data.tolist()

    # Timeline of comparable tokens; words that are pure punctuation are dropped.
    timeline = []
    for word, span in word_timestamps or []:
        token = _normalize_token(word)
        if token:
            timeline.append((token, span[0], span[1]))
    tokens = [entry[0] for entry in timeline]

    formatted_data = []
    cursor = 0  # Position right after the last matched segment.
    for i, row in enumerate(table_data or []):
        cell = row[0] if row else None
        text = cell.strip() if isinstance(cell, str) else ""
        if not text:
            print(f"[LOG WARNING] Ignoré : ligne vide à l'index {i}.")
            continue

        segment_id = f"seg_{i+1:02d}"
        start_time, end_time = None, None
        needle = [token for token in map(_normalize_token, text.split()) if token]
        if needle:
            pos = _find_word_run(tokens, needle, cursor)
            if pos < 0 and cursor:
                # Segments may be listed out of order: retry from the start.
                pos = _find_word_run(tokens, needle, 0)
            if pos >= 0:
                last = pos + len(needle) - 1
                start_time = timeline[pos][1]
                end_time = timeline[last][2]
                cursor = last + 1

        if start_time is None:
            print(f"[LOG WARNING] Aucun timestamp trouvé pour : {text}")
        formatted_data.append([text, start_time, end_time, segment_id])
        print(f"[LOG] Segment ajouté : {text} | Début: {start_time}, Fin: {end_time}, ID: {segment_id}")
    return formatted_data
# -------------------------------------------------
# 4. Validation et découpage des extraits audio
# -------------------------------------------------
def _segment_bounds_ms(row):
    """Return ``(start_ms, end_ms)`` for a ``[text, start, end, id]`` row, or ``None`` (after logging) when unusable."""
    if not row or len(row) < 4:
        print(f"[LOG ERROR] Ligne incomplète ignorée : {row}")
        return None
    text, start_time, end_time = row[0], row[1], row[2]
    if start_time in (None, "") or end_time in (None, ""):
        print(f"[LOG ERROR] Timestamp manquant pour : {text}")
        return None
    try:
        start_ms, end_ms = int(float(start_time) * 1000), int(float(end_time) * 1000)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric cell, NaN or infinity: the row cannot be cut.
        print(f"[LOG ERROR] Timestamp invalide pour : {text} | {start_time} - {end_time}")
        return None
    if start_ms < 0 or end_ms <= start_ms:
        print(f"[LOG ERROR] Problème de découpage : {text} | {start_time}s - {end_time}s")
        return None
    return start_ms, end_ms


def validate_segments(audio_path, table_data, metadata_state, word_timestamps):
    """Cut the audio file into one WAV excerpt per valid table row.

    The temporary directory is emptied first, so it only ever contains the
    excerpts of the latest validation.

    Args:
        audio_path: Path of the source audio file.
        table_data: Rows of ``[text, start_time, end_time, segment_id]``.
            A list of lists is expected; objects exposing ``values`` /
            ``tolist`` (such as a pandas DataFrame) are converted. Rows that
            are incomplete or whose times are missing, non-numeric, negative
            or not increasing are logged and skipped.
        metadata_state: Current metadata; returned unchanged when the
            validation cannot run.
        word_timestamps: Word timestamps of the transcription; only checked
            for presence.

    Returns:
        ``(segment_paths, updated_metadata)``: the paths of the exported WAV
        files, and one dict per exported segment with the keys
        ``audio_file``, ``text``, ``start_time``, ``end_time`` and ``id``.
        Returns ``([], metadata_state)`` when the audio path or the word
        timestamps are missing.
    """
    print("[LOG] Début de la validation des segments...")
    if not audio_path or not word_timestamps:
        print("[LOG ERROR] Erreur : Aucun timestamp valide trouvé !")
        return [], metadata_state

    # Accept DataFrame-like and array-like inputs in addition to plain lists.
    values = getattr(table_data, "values", None)
    if values is not None and not callable(values):
        table_data = values
    if hasattr(table_data, "tolist"):
        table_data = table_data.tolist()

    # Start from an empty directory so stale excerpts never leak into the ZIP.
    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    original_audio = AudioSegment.from_file(audio_path)
    audio_stem = Path(audio_path).stem
    segment_paths = []
    updated_metadata = []
    for row in table_data or []:
        bounds = _segment_bounds_ms(row)
        if bounds is None:
            continue
        start_ms, end_ms = bounds
        text, start_time, end_time, segment_id = row[:4]

        # The id comes from an editable table: keep only the final path
        # component so the excerpt is always written inside TEMP_DIR.
        segment_filename = Path(f"{audio_stem}_{segment_id}.wav").name
        segment_path = os.path.join(TEMP_DIR, segment_filename)
        original_audio[start_ms:end_ms].export(segment_path, format="wav")

        segment_paths.append(segment_path)
        updated_metadata.append({
            "audio_file": segment_filename,
            "text": text,
            "start_time": start_time,
            "end_time": end_time,
            "id": segment_id,
        })
        print(f"[LOG] Extrait généré : {segment_filename}")
    return segment_paths, updated_metadata
# -------------------------------------------------
# 5. Génération du fichier ZIP
# -------------------------------------------------
def generate_zip(metadata_state):
    """Bundle the exported excerpts and a ``metadata.csv`` index into a ZIP archive.

    ``metadata.csv`` is pipe-delimited with the header
    ``audio_file|text|speaker_name|API``. Only segments whose audio file is
    present in the temporary directory are listed and archived, so the index
    never references a file that is missing from the archive.

    Args:
        metadata_state: List of segment dicts providing at least the keys
            ``audio_file`` and ``text``.

    Returns:
        The path of the generated ZIP file, or ``None`` when there is no
        segment to package.
    """
    if not metadata_state:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    available = []
    for seg in metadata_state:
        if os.path.exists(os.path.join(TEMP_DIR, seg["audio_file"])):
            available.append(seg)
        else:
            print(f"[LOG WARNING] Fichier audio introuvable, ignoré : {seg['audio_file']}")
    if not available:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    # A pipe or a line break inside the text would shift or split the row of
    # the pipe-delimited file, so both are replaced by a space.
    unsafe_chars = str.maketrans({"|": " ", "\n": " ", "\r": " "})

    metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
    with open(metadata_csv_path, "w", encoding="utf-8") as f:
        f.write("audio_file|text|speaker_name|API\n")
        for seg in available:
            text = str(seg["text"]).translate(unsafe_chars)
            f.write(f"{seg['audio_file']}|{text}|projectname|/API_PHONETIC/\n")

    # Mode "w" replaces any archive left over from a previous run.
    zip_path = os.path.join(TEMP_DIR, "dataset.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(metadata_csv_path, "metadata.csv")
        for seg in available:
            zf.write(os.path.join(TEMP_DIR, seg["audio_file"]), seg["audio_file"])

    print("[LOG] Fichier ZIP généré avec succès.")
    return zip_path
# -------------------------------------------------
# 6. Interface utilisateur Gradio
# -------------------------------------------------
# Gradio user interface: upload an audio file, transcribe it, list the text
# segments to extract, cut the audio and download the resulting dataset.
with gr.Blocks() as demo:
    gr.Markdown("# Application de Découpe Audio")

    # Per-session state shared between the callbacks.
    metadata_state = gr.State(init_metadata_state())
    extracted_segments = gr.State([])

    audio_input = gr.Audio(type="filepath", label="Fichier audio")
    raw_transcription = gr.Textbox(label="Transcription", interactive=False)
    transcription_timestamps = gr.Textbox(label="Transcription avec Timestamps", interactive=False)
    # type="array" makes Gradio pass the table to the callbacks as a list of
    # rows, which is what preprocess_segments and validate_segments iterate
    # over; the default ("pandas") would hand them a DataFrame instead.
    # NOTE(review): preprocess_segments writes four-column rows back into this
    # one-column table; confirm the Gradio version in use displays them.
    table = gr.Dataframe(
        headers=["Texte"],
        datatype=["str"],
        row_count=(1, "dynamic"),
        col_count=1,
        type="array",
    )
    generate_timestamps_button = gr.Button("Générer les timestamps")
    validate_button = gr.Button("Valider")
    generate_button = gr.Button("Générer ZIP")
    zip_file = gr.File(label="Télécharger le ZIP")
    word_timestamps = gr.State()

    # Step 1: a new audio file triggers the transcription.
    # NOTE(review): audio_input is both the trigger and an output of this
    # event; confirm this does not re-trigger the transcription.
    audio_input.change(
        transcribe_audio,
        inputs=audio_input,
        outputs=[raw_transcription, table, audio_input, word_timestamps, transcription_timestamps],
    )
    # Step 2: attach timestamps and ids to the segments typed in the table.
    generate_timestamps_button.click(
        preprocess_segments,
        inputs=[table, word_timestamps],
        outputs=table,
    )
    # Step 3: cut the audio according to the table.
    validate_button.click(
        validate_segments,
        inputs=[audio_input, table, metadata_state, word_timestamps],
        outputs=[extracted_segments, metadata_state],
    )
    # Step 4: package the excerpts and their metadata.
    generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)

demo.queue().launch()