Spaces:

Woziii
/

datasetTTS

Sleeping

File size: 8,020 Bytes

90c7625
 
 
 
a8ecd5f
a74c3ba
 
 
1a099fd
a8ecd5f
 
a74c3ba
ae7bb8d
a74c3ba
 
a8ecd5f
075f0bd
a8ecd5f
1a099fd
a8ecd5f
7ef7a21
075f0bd
a74c3ba
ec37ecb
a74c3ba
 
1a099fd
a74c3ba
 
075f0bd
a74c3ba
 
 
 
 
 
075f0bd
 
 
a74c3ba
 
a8ecd5f
 
 
 
 
 
 
 
 
 
eb72b93
a8ecd5f
fac847d
 
 
 
 
 
 
ec37ecb
a8ecd5f
3d786ab
a8ecd5f
 
 
ec37ecb
075f0bd
 
 
a8ecd5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fac847d
a8ecd5f
 
 
 
 
 
075f0bd
 
 
a8ecd5f
 
 
 
ec37ecb
 
 
 
 
a8ecd5f
1a099fd
 
 
 
a8ecd5f
 
 
 
 
 
1a099fd
a8ecd5f
1a099fd
ec37ecb
1a099fd
 
3d786ab
1a099fd
 
3d786ab
1a099fd
 
 
 
 
 
 
 
a8ecd5f
1a099fd
 
a74c3ba
90c7625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
075f0bd
 
a8ecd5f
eb72b93
a8ecd5f
9023c36
a8ecd5f
 
 
 
 
 
 
 
 
ec37ecb
90c7625
 
 
 
 
 
573c72c
3bbe62c

# Version: Corrected After Test 3 (V2.3.0 - Structured, Commented, and ZIP Generation Restored)
# Description: Cette version structure le script selon l'ordre des étapes du processus.
# La génération du fichier ZIP a été réintégrée après avoir été omise dans la version précédente.
# Chaque section est commentée pour assurer une meilleure lisibilité et une logique claire.

import os
import shutil
import zipfile
import torch
import numpy as np
from pathlib import Path
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline

# -------------------------------------------------
# 1. Configuration et Initialisation
# -------------------------------------------------
# Hugging Face model identifier handed to the ASR pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Use the GPU when torch reports CUDA as available, otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the Whisper speech-recognition pipeline once, at import time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    model_kwargs={"low_cpu_mem_usage": True},
)

# Working directory for the exported audio extracts (and the generated ZIP);
# created at import time if it does not exist yet.
TEMP_DIR = "./temp_audio"
os.makedirs(TEMP_DIR, exist_ok=True)

def init_metadata_state():
    """Return a new, empty list used as the initial segment-metadata state."""
    fresh_state = list()
    return fresh_state

# -------------------------------------------------
# 2. Transcription de l'audio avec Whisper
# -------------------------------------------------
def _clean_word_chunks(chunks):
    """Return ``(text, start, own_end)`` triples for chunks with usable text.

    Word texts are stripped (the ASR pipeline typically prefixes each word
    with a space), blank chunks are dropped, and a missing start time falls
    back to the previous word's start so later arithmetic never sees None.
    """
    cleaned = []
    previous_start = 0.0
    for chunk in chunks:
        text = (chunk.get("text") or "").strip()
        if not text:
            continue
        stamps = chunk.get("timestamp") or (None, None)
        start = stamps[0] if stamps[0] is not None else previous_start
        own_end = stamps[1] if len(stamps) > 1 else None
        cleaned.append((text, start, own_end))
        previous_start = start
    return cleaned


def transcribe_audio(audio_path):
    """Transcribe an audio file and derive per-word time spans.

    Args:
        audio_path: Path of the audio file to transcribe; falsy means
            "nothing uploaded".

    Returns:
        A 5-tuple ``(raw_transcription, table_rows, audio_path,
        word_timestamps, transcription_with_timestamps)`` where
        ``table_rows`` is always an empty list, ``word_timestamps`` is a list
        of ``(word, (start, end))`` tuples in seconds, and the last element
        is the transcription annotated as ``word[start-end]``.
        On failure the first element carries the error message and the
        remaining elements are ``[], None, [], ""``.
    """
    if not audio_path:
        print("[LOG] Aucun fichier audio fourni.")
        return "Aucun fichier audio fourni", [], None, [], ""

    print(f"[LOG] Début de la transcription de {audio_path}...")
    result = pipe(audio_path, return_timestamps="word")
    words = _clean_word_chunks(result.get("chunks", []))

    if not words:
        print("[LOG ERROR] Erreur : Aucun timestamp détecté.")
        return "Erreur : Aucun timestamp détecté.", [], None, [], ""

    raw_transcription = " ".join(text for text, _, _ in words)
    word_timestamps = []

    for index, (text, start, own_end) in enumerate(words):
        if index + 1 < len(words):
            # A word lasts until 10 ms before the next one starts, but never
            # ends before its own start (words starting back to back).
            end = max(words[index + 1][1] - 0.01, start)
        elif own_end is not None and own_end > start:
            # Last word: trust the end time reported by the model.
            end = own_end
        else:
            # Last word without a usable end time: assume half a second.
            end = start + 0.5
        word_timestamps.append((text, (start, end)))

    transcription_with_timestamps = " ".join(
        f"{word}[{span[0]:.2f}-{span[1]:.2f}]" for word, span in word_timestamps
    )

    print(f"[LOG] Transcription brute : {raw_transcription}")
    print(f"[LOG DETAIL] Timestamps associés : {word_timestamps}")
    return raw_transcription, [], audio_path, word_timestamps, transcription_with_timestamps

# -------------------------------------------------
# 3. Prétraitement des segments : Associer les timestamps aux phrases sélectionnées
# -------------------------------------------------
def _table_rows(table_data):
    """Normalise the table payload into a list of row lists.

    Accepts a list of rows or any object exposing ``.values.tolist()``
    (for example a pandas DataFrame); ``None`` yields an empty list.
    """
    if table_data is None:
        return []
    values = getattr(table_data, "values", None)
    if values is not None and hasattr(values, "tolist"):
        return values.tolist()
    return [list(row) if isinstance(row, (list, tuple)) else [row] for row in table_data]


def _cell_text(cell):
    """Return a table cell as stripped text, treating None/NaN as empty."""
    if cell is None or cell != cell:  # NaN is the only value unequal to itself
        return ""
    return str(cell).strip()


def _match_key(token):
    """Return the form of a word used for comparison (trimmed, case-folded)."""
    return str(token).strip().casefold()


def _find_word_run(tokens, needle, start_at):
    """Return the index of the first contiguous run of *needle* in *tokens*
    at or after *start_at*, or -1 when there is none."""
    width = len(needle)
    for pos in range(start_at, len(tokens) - width + 1):
        if tokens[pos:pos + width] == needle:
            return pos
    return -1


def preprocess_segments(table_data, word_timestamps):
    """Attach start/end times to each sentence listed in the table.

    Each non-empty row's text is located in the transcription as a
    *contiguous* run of words (comparison ignores case and surrounding
    whitespace). Rows are searched in order after the previous match so that
    repeated phrases map to successive occurrences; if nothing is found
    there, the search restarts from the beginning.

    Args:
        table_data: Rows whose first cell is the segment text (a list of
            rows or a DataFrame-like object).
        word_timestamps: List of ``(word, (start, end))`` entries; ``None``
            is treated as empty.

    Returns:
        A list of ``[text, start_time, end_time, segment_id]`` rows.
        ``start_time``/``end_time`` are ``None`` when the text could not be
        located. ``segment_id`` is ``seg_NN`` based on the row's position in
        the table (blank rows are skipped but still counted).
    """
    print("[LOG] Début du prétraitement des segments...")
    # Keep only entries with a non-blank word so a blank cannot break a run.
    timed_words = [entry for entry in (word_timestamps or []) if _match_key(entry[0])]
    tokens = [_match_key(entry[0]) for entry in timed_words]
    cursor = 0
    formatted_data = []

    for i, row in enumerate(_table_rows(table_data)):
        text = _cell_text(row[0]) if row else ""
        if not text:
            print(f"[LOG WARNING] Ignoré : ligne vide à l'index {i}.")
            continue

        segment_id = f"seg_{i+1:02d}"
        needle = [_match_key(token) for token in text.split()]

        pos = _find_word_run(tokens, needle, cursor)
        if pos < 0 and cursor:
            # Rows may be listed out of chronological order: retry from the top.
            pos = _find_word_run(tokens, needle, 0)

        start_time, end_time = None, None
        if pos >= 0:
            last = pos + len(needle) - 1
            start_time = timed_words[pos][1][0]
            end_time = timed_words[last][1][1]
            cursor = last + 1

        formatted_data.append([text, start_time, end_time, segment_id])
        print(f"[LOG] Segment ajouté : {text} | Début: {start_time}, Fin: {end_time}, ID: {segment_id}")

    return formatted_data

# -------------------------------------------------
# 4. Validation et découpage des extraits audio
# -------------------------------------------------
def _validation_rows(table_data):
    """Return the table payload as a list of row lists.

    Accepts a list of rows or any object exposing ``.values.tolist()``
    (for example a pandas DataFrame); ``None`` yields an empty list.
    """
    if table_data is None:
        return []
    values = getattr(table_data, "values", None)
    if values is not None and hasattr(values, "tolist"):
        return values.tolist()
    return [list(row) if isinstance(row, (list, tuple)) else [row] for row in table_data]


def _seconds_to_ms(value):
    """Convert a timestamp cell (seconds) to whole milliseconds.

    Returns ``None`` when the cell is missing, blank, non-numeric or not a
    finite number, so callers can skip the row instead of crashing.
    """
    import math

    try:
        seconds = float(value)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(seconds):
        return None
    return int(seconds * 1000)


def validate_segments(audio_path, table_data, metadata_state, word_timestamps):
    """Cut the source audio into one WAV file per valid table row.

    The temporary directory is wiped and recreated first, so every call
    starts from an empty output folder.

    Args:
        audio_path: Path of the source audio file.
        table_data: Rows of ``[text, start_time, end_time, segment_id]``
            (a list of rows or a DataFrame-like object); times in seconds.
        metadata_state: Current metadata; returned unchanged when the call
            cannot proceed.
        word_timestamps: Word timings from the transcription step; only
            checked for presence here.

    Returns:
        ``(segment_paths, metadata)`` where ``segment_paths`` lists the
        exported WAV paths and ``metadata`` is a list of dicts with keys
        ``audio_file``, ``text``, ``start_time``, ``end_time`` and ``id``.
        Rows that are incomplete, lack usable timestamps or describe an
        empty/negative span are logged and skipped.
    """
    print("[LOG] Début de la validation des segments...")
    if not audio_path or not word_timestamps:
        print("[LOG ERROR] Erreur : Aucun timestamp valide trouvé !")
        return [], metadata_state

    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    original_audio = AudioSegment.from_file(audio_path)
    audio_stem = Path(audio_path).stem
    segment_paths = []
    updated_metadata = []

    for row in _validation_rows(table_data):
        if len(row) < 4:
            # Timestamps have not been generated for this row yet.
            print(f"[LOG ERROR] Ligne incomplète ignorée : {row}")
            continue
        text, start_time, end_time, segment_id = row[:4]

        start_ms = _seconds_to_ms(start_time)
        end_ms = _seconds_to_ms(end_time)
        if start_ms is None or end_ms is None:
            print(f"[LOG ERROR] Timestamp manquant pour : {text}")
            continue

        if start_ms < 0 or end_ms <= start_ms:
            print(f"[LOG ERROR] Problème de découpage : {text} | {start_time}s - {end_time}s")
            continue

        segment_filename = f"{audio_stem}_{segment_id}.wav"
        segment_path = os.path.join(TEMP_DIR, segment_filename)

        # pydub slices AudioSegment objects by milliseconds.
        extract = original_audio[start_ms:end_ms]
        extract.export(segment_path, format="wav")

        segment_paths.append(segment_path)
        updated_metadata.append({
            "audio_file": segment_filename,
            "text": text,
            "start_time": start_time,
            "end_time": end_time,
            "id": segment_id,
        })
        print(f"[LOG] Extrait généré : {segment_filename}")

    return segment_paths, updated_metadata

# -------------------------------------------------
# 5. Génération du fichier ZIP
# -------------------------------------------------
def _csv_field(value):
    """Return *value* as text safe for one field of the pipe-delimited CSV.

    Line breaks and ``|`` would split the record, so they become spaces.
    """
    return str(value).translate(str.maketrans({"\r": " ", "\n": " ", "|": " "}))


def generate_zip(metadata_state):
    """Bundle the exported extracts and a ``metadata.csv`` into a ZIP file.

    Only segments whose audio file actually exists in the temporary
    directory are listed in the CSV and added to the archive, so the two
    always agree.

    Args:
        metadata_state: List of segment dicts with at least the keys
            ``audio_file`` and ``text``.

    Returns:
        Path of the generated ``dataset.zip``, or ``None`` when there is no
        metadata or none of the referenced audio files exist.
    """
    if not metadata_state:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    # The directory may have been removed since the extracts were generated.
    os.makedirs(TEMP_DIR, exist_ok=True)

    zip_path = os.path.join(TEMP_DIR, "dataset.zip")
    if os.path.exists(zip_path):
        os.remove(zip_path)

    available = []
    for seg in metadata_state:
        file_path = os.path.join(TEMP_DIR, seg["audio_file"])
        if os.path.isfile(file_path):
            available.append((seg, file_path))
        else:
            print(f"[LOG WARNING] Fichier audio introuvable, segment ignoré : {seg['audio_file']}")

    if not available:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
    with open(metadata_csv_path, "w", encoding="utf-8", newline="\n") as f:
        f.write("audio_file|text|speaker_name|API\n")
        f.writelines(
            f"{seg['audio_file']}|{_csv_field(seg['text'])}|projectname|/API_PHONETIC/\n"
            for seg, _ in available
        )

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(metadata_csv_path, "metadata.csv")
        for seg, file_path in available:
            zf.write(file_path, seg["audio_file"])

    print("[LOG] Fichier ZIP généré avec succès.")
    return zip_path

# -------------------------------------------------
# 6. Interface utilisateur Gradio
# -------------------------------------------------
# Build the Gradio interface: components first, then the event wiring.
with gr.Blocks() as demo:
    gr.Markdown("# Application de Découpe Audio")
    # State holding the metadata list written by validate_segments and read
    # by generate_zip; starts as the empty list from init_metadata_state().
    metadata_state = gr.State(init_metadata_state())
    # State receiving the list of extract paths returned by validate_segments.
    extracted_segments = gr.State([])

    audio_input = gr.Audio(type="filepath", label="Fichier audio")
    raw_transcription = gr.Textbox(label="Transcription", interactive=False)
    transcription_timestamps = gr.Textbox(label="Transcription avec Timestamps", interactive=False)
    # NOTE(review): the table is declared with a single "Texte" column, yet
    # preprocess_segments writes four-value rows into it and validate_segments
    # reads four values per row — confirm the component accepts the extra columns.
    # NOTE(review): no `type=` is passed, so the handlers receive whatever
    # container Gradio's default Dataframe type produces — confirm they get row lists.
    table = gr.Dataframe(headers=["Texte"], datatype=["str"], row_count=(1, "dynamic"), col_count=1)
    generate_timestamps_button = gr.Button("Générer les timestamps")
    validate_button = gr.Button("Valider")
    generate_button = gr.Button("Générer ZIP")
    zip_file = gr.File(label="Télécharger le ZIP")
    # State carrying the word timings from transcribe_audio to the later steps.
    word_timestamps = gr.State()

    # Step 1: a new audio file triggers transcription.
    # NOTE(review): audio_input is both the trigger and one of the outputs of
    # this change event — confirm this does not re-trigger the handler.
    audio_input.change(transcribe_audio, inputs=audio_input, outputs=[raw_transcription, table, audio_input, word_timestamps, transcription_timestamps])
    # Step 2: fill the table rows with start/end times and segment ids.
    generate_timestamps_button.click(preprocess_segments, inputs=[table, word_timestamps], outputs=table)
    # Step 3: cut the audio into extracts and store their metadata.
    validate_button.click(validate_segments, inputs=[audio_input, table, metadata_state, word_timestamps], outputs=[extracted_segments, metadata_state])
    # Step 4: package the extracts and metadata.csv for download.
    generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)

# Enable request queuing and start the app when the module is executed.
demo.queue().launch()