Spaces:

Woziii
/

datasetTTS

Sleeping

App Files Files Community

datasetTTS / app.py

Woziii

Update app.py

90c7625 verified 30 days ago

raw

history blame

8.02 kB

	# Version: Corrected After Test 3 (V2.3.0 - Structured, Commented, and ZIP Generation Restored)
	# Description: Cette version structure le script selon l'ordre des étapes du processus.
	# La génération du fichier ZIP a été réintégrée après avoir été omise dans la version précédente.
	# Chaque section est commentée pour assurer une meilleure lisibilité et une logique claire.

	import os
	import shutil
	import zipfile
	import torch
	import numpy as np
	from pathlib import Path
	import gradio as gr
	from pydub import AudioSegment
	from transformers import pipeline

	# -------------------------------------------------
	# 1. Configuration et Initialisation
	# -------------------------------------------------
	# Hugging Face checkpoint used for speech recognition.
	MODEL_NAME = "openai/whisper-large-v3"

	# Use the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Build the Whisper speech-recognition pipeline once, at import time;
	# the transcription callback reuses this single instance.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=MODEL_NAME,
	    device=device,
	    model_kwargs=dict(low_cpu_mem_usage=True),
	)

	# Scratch directory that receives the extracted audio clips.
	TEMP_DIR = "./temp_audio"
	os.makedirs(TEMP_DIR, exist_ok=True)

	def init_metadata_state():
	    """Return a new, empty list used as the initial segment metadata."""
	    fresh_metadata = list()
	    return fresh_metadata

	# -------------------------------------------------
	# 2. Transcription de l'audio avec Whisper
	# -------------------------------------------------
	def transcribe_audio(audio_path):
	    """Transcribe an audio file with Whisper and derive a time range per word.

	    Args:
	        audio_path: Path of the audio file to transcribe; falsy when no
	            file is loaded.

	    Returns:
	        A 5-tuple, in the order of the Gradio outputs wired to this callback:
	        the raw transcription (or a status message), an empty list of table
	        rows, the audio path (``None`` on failure), the list of
	        ``(word, (start, end))`` tuples, and the transcription annotated as
	        ``word[start-end]``.
	    """
	    if not audio_path:
	        print("[LOG] Aucun fichier audio fourni.")
	        return "Aucun fichier audio fourni", [], None, [], ""

	    print(f"[LOG] Début de la transcription de {audio_path}...")
	    result = pipe(audio_path, return_timestamps="word")
	    words = result.get("chunks", [])

	    if not words:
	        print("[LOG ERROR] Erreur : Aucun timestamp détecté.")
	        return "Erreur : Aucun timestamp détecté.", [], None, [], ""

	    raw_transcription = " ".join(w["text"] for w in words)
	    word_timestamps = []

	    for i, w in enumerate(words):
	        stamp = w["timestamp"]
	        start = stamp[0]
	        # End time reported by the model for this word; may be absent or None.
	        own_end = stamp[1] if len(stamp) > 1 else None
	        has_own_end = own_end is not None and own_end > start

	        if i + 1 < len(words):
	            # A word ends 10 ms before the next word starts.
	            end = words[i + 1]["timestamp"][0] - 0.01
	            # When both words share a start this would end before it began;
	            # use the model's own end time instead if it is usable.
	            if end <= start and has_own_end:
	                end = own_end
	        elif has_own_end:
	            # Last word: trust the model's end time rather than a fixed guess.
	            end = own_end
	        else:
	            # Last word without a usable end time: assume half a second.
	            end = start + 0.5

	        word_timestamps.append((w["text"], (start, end)))

	    transcription_with_timestamps = " ".join(
	        f"{word}[{span[0]:.2f}-{span[1]:.2f}]" for word, span in word_timestamps
	    )

	    print(f"[LOG] Transcription brute : {raw_transcription}")
	    print(f"[LOG DETAIL] Timestamps associés : {word_timestamps}")
	    return raw_transcription, [], audio_path, word_timestamps, transcription_with_timestamps

	# -------------------------------------------------
	# 3. Prétraitement des segments : Associer les timestamps aux phrases sélectionnées
	# -------------------------------------------------
	def _find_word_run(segment_words, spoken_words):
	    """Return (first, last) indices of the first contiguous run of segment_words in spoken_words, or None."""
	    run_length = len(segment_words)
	    if run_length == 0:
	        return None
	    for first in range(len(spoken_words) - run_length + 1):
	        if spoken_words[first:first + run_length] == segment_words:
	            return first, first + run_length - 1
	    return None


	def preprocess_segments(table_data, word_timestamps):
	    """Attach start/end times and an id to each phrase entered in the table.

	    Each phrase is first looked up as a contiguous run of words in the
	    transcription. If no such run exists (for example the phrase was edited),
	    the span falls back to the first and last transcribed word that appears
	    anywhere in the phrase.

	    Args:
	        table_data: Table rows whose first cell is the phrase text. A list of
	            rows, or an object exposing ``.values.tolist()`` / ``.tolist()``
	            (DataFrame- or array-like), or ``None``.
	        word_timestamps: List of ``(word, (start, end))`` tuples, or ``None``.

	    Returns:
	        A list of ``[text, start_time, end_time, segment_id]`` rows. The times
	        are ``None`` when the phrase could not be located. ``segment_id`` is
	        ``seg_NN`` where NN is the 1-based position of the row in the table.
	    """
	    print("[LOG] Début du prétraitement des segments...")

	    # Normalise the table to a list of rows: iterating a DataFrame directly
	    # would yield its column labels instead of its rows.
	    frame_values = getattr(table_data, "values", None)
	    if table_data is None:
	        rows = []
	    elif frame_values is not None and hasattr(frame_values, "tolist"):
	        rows = frame_values.tolist()
	    elif hasattr(table_data, "tolist"):
	        rows = table_data.tolist()
	    else:
	        rows = list(table_data)

	    word_timestamps = word_timestamps or []
	    # Strip surrounding whitespace so transcribed words compare equal to the
	    # whitespace-split words of the phrase.
	    spoken_words = [str(word).strip() for word, _ in word_timestamps]

	    formatted_data = []
	    for i, row in enumerate(rows):
	        cell = row[0] if isinstance(row, (list, tuple)) and row else None
	        if not isinstance(cell, str) or not cell.strip():
	            print(f"[LOG WARNING] Ignoré : ligne vide à l'index {i}.")
	            continue

	        text = cell.strip()
	        segment_id = f"seg_{i+1:02d}"
	        start_time, end_time = None, None
	        segment_words = text.split()

	        span = _find_word_run(segment_words, spoken_words)
	        if span is None:
	            # Loose fallback: first and last transcribed word found in the phrase.
	            wanted = set(segment_words)
	            hits = [j for j, word in enumerate(spoken_words) if word in wanted]
	            if hits:
	                span = (hits[0], hits[-1])

	        if span is not None:
	            start_time = word_timestamps[span[0]][1][0]
	            end_time = word_timestamps[span[1]][1][1]

	        formatted_data.append([text, start_time, end_time, segment_id])
	        print(f"[LOG] Segment ajouté : {text} | Début: {start_time}, Fin: {end_time}, ID: {segment_id}")

	    return formatted_data

	# -------------------------------------------------
	# 4. Validation et découpage des extraits audio
	# -------------------------------------------------
	def validate_segments(audio_path, table_data, metadata_state, word_timestamps):
	    """Cut the source audio into one WAV file per valid table row.

	    TEMP_DIR is emptied and recreated before the clips are written, so
	    earlier clips and archives are discarded.

	    Args:
	        audio_path: Path of the source audio file.
	        table_data: Rows of ``[text, start_time, end_time, segment_id]`` with
	            times in seconds. A list of rows, or an object exposing
	            ``.values.tolist()`` / ``.tolist()`` (DataFrame- or array-like).
	        metadata_state: Current metadata; returned unchanged when there is
	            nothing to validate.
	        word_timestamps: Word timestamps from the transcription; only checked
	            for presence.

	    Returns:
	        ``(segment_paths, metadata)`` where ``segment_paths`` lists the written
	        WAV files and ``metadata`` holds one dict per clip with the keys
	        ``audio_file``, ``text``, ``start_time``, ``end_time`` and ``id``.
	        Rows that are malformed or have unusable timestamps are logged and
	        skipped.
	    """
	    print("[LOG] Début de la validation des segments...")
	    if not audio_path or not word_timestamps:
	        print("[LOG ERROR] Erreur : Aucun timestamp valide trouvé !")
	        return [], metadata_state

	    # Normalise the table to a list of rows: iterating a DataFrame directly
	    # would yield its column labels instead of its rows.
	    frame_values = getattr(table_data, "values", None)
	    if table_data is None:
	        rows = []
	    elif frame_values is not None and hasattr(frame_values, "tolist"):
	        rows = frame_values.tolist()
	    elif hasattr(table_data, "tolist"):
	        rows = table_data.tolist()
	    else:
	        rows = list(table_data)

	    if os.path.exists(TEMP_DIR):
	        shutil.rmtree(TEMP_DIR)
	    os.makedirs(TEMP_DIR, exist_ok=True)

	    original_audio = AudioSegment.from_file(audio_path)
	    audio_stem = Path(audio_path).stem
	    segment_paths = []
	    updated_metadata = []

	    for row in rows:
	        # Rows that were never run through the timestamp step have fewer
	        # than four fields and cannot be cut.
	        if not isinstance(row, (list, tuple)) or len(row) < 4:
	            print(f"[LOG ERROR] Ligne ignorée (4 colonnes attendues) : {row}")
	            continue
	        text, start_time, end_time, segment_id = row[:4]

	        if start_time is None or end_time is None:
	            print(f"[LOG ERROR] Timestamp manquant pour : {text}")
	            continue

	        try:
	            start_ms = int(float(start_time) * 1000)
	            end_ms = int(float(end_time) * 1000)
	        except (TypeError, ValueError, OverflowError):
	            # Empty strings, NaN, infinities or free text edited into the table.
	            print(f"[LOG ERROR] Timestamp invalide pour : {text}")
	            continue

	        if start_ms < 0 or end_ms <= start_ms:
	            print(f"[LOG ERROR] Problème de découpage : {text} | {start_time}s - {end_time}s")
	            continue

	        # The id is user-editable; keep only the final path component so the
	        # clip is always written directly inside TEMP_DIR.
	        segment_filename = Path(f"{audio_stem}_{segment_id}.wav").name
	        segment_path = os.path.join(TEMP_DIR, segment_filename)

	        extract = original_audio[start_ms:end_ms]
	        extract.export(segment_path, format="wav")

	        segment_paths.append(segment_path)
	        updated_metadata.append({
	            "audio_file": segment_filename,
	            "text": text,
	            "start_time": start_time,
	            "end_time": end_time,
	            "id": segment_id,
	        })
	        print(f"[LOG] Extrait généré : {segment_filename}")

	    return segment_paths, updated_metadata

	# -------------------------------------------------
	# 5. Génération du fichier ZIP
	# -------------------------------------------------
	def generate_zip(metadata_state):
	    """Bundle the extracted clips and a pipe-delimited metadata.csv into a ZIP.

	    Args:
	        metadata_state: List of dicts with at least the keys ``audio_file``
	            and ``text``, one per extracted clip.

	    Returns:
	        The path of ``dataset.zip`` inside TEMP_DIR, or ``None`` when
	        ``metadata_state`` is empty.
	    """
	    if not metadata_state:
	        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
	        return None

	    os.makedirs(TEMP_DIR, exist_ok=True)
	    zip_path = os.path.join(TEMP_DIR, "dataset.zip")
	    if os.path.exists(zip_path):
	        os.remove(zip_path)

	    metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
	    with open(metadata_csv_path, "w", encoding="utf-8") as f:
	        # Plain "|" delimiter: the previous "\|" was an invalid escape that
	        # wrote a literal backslash before every separator.
	        f.write("audio_file|text|speaker_name|API\n")
	        for seg in metadata_state:
	            # Keep each segment on one line; embedded line breaks would
	            # split the row.
	            text = " ".join(str(seg["text"]).splitlines())
	            f.write(f"{seg['audio_file']}|{text}|projectname|/API_PHONETIC/\n")

	    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
	        zf.write(metadata_csv_path, "metadata.csv")
	        for seg in metadata_state:
	            file_path = os.path.join(TEMP_DIR, seg["audio_file"])
	            if os.path.exists(file_path):
	                zf.write(file_path, seg["audio_file"])
	            else:
	                print(f"[LOG WARNING] Fichier audio introuvable, ignoré : {file_path}")

	    print("[LOG] Fichier ZIP généré avec succès.")
	    return zip_path

	# -------------------------------------------------
	# 6. Interface utilisateur Gradio
	# -------------------------------------------------
	with gr.Blocks() as demo:
	    gr.Markdown("# Application de Découpe Audio")

	    # State holders passed between the callbacks below.
	    metadata_state = gr.State(init_metadata_state())
	    extracted_segments = gr.State([])

	    audio_input = gr.Audio(type="filepath", label="Fichier audio")
	    raw_transcription = gr.Textbox(label="Transcription", interactive=False)
	    transcription_timestamps = gr.Textbox(label="Transcription avec Timestamps", interactive=False)
	    # type="array" hands the callbacks a list of rows. With the default
	    # "pandas" type they would receive a DataFrame, and iterating that
	    # yields column labels rather than rows.
	    table = gr.Dataframe(
	        headers=["Texte"],
	        datatype=["str"],
	        row_count=(1, "dynamic"),
	        col_count=1,
	        type="array",
	    )
	    generate_timestamps_button = gr.Button("Générer les timestamps")
	    validate_button = gr.Button("Valider")
	    generate_button = gr.Button("Générer ZIP")
	    zip_file = gr.File(label="Télécharger le ZIP")
	    word_timestamps = gr.State()

	    # Loading an audio file triggers the transcription.
	    # NOTE(review): audio_input is also listed as an output of its own change
	    # event; confirm this does not re-trigger the transcription.
	    audio_input.change(
	        transcribe_audio,
	        inputs=audio_input,
	        outputs=[raw_transcription, table, audio_input, word_timestamps, transcription_timestamps],
	    )
	    # Fill the table with start/end times and ids for the entered phrases.
	    generate_timestamps_button.click(
	        preprocess_segments,
	        inputs=[table, word_timestamps],
	        outputs=table,
	    )
	    # Cut the audio into clips and store their metadata.
	    validate_button.click(
	        validate_segments,
	        inputs=[audio_input, table, metadata_state, word_timestamps],
	        outputs=[extracted_segments, metadata_state],
	    )
	    # Package the clips and metadata for download.
	    generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)

	demo.queue().launch()