# NOTE: the Hugging Face Spaces file-viewer header that preceded this script (status lines, file size, commit hashes, line-number gutter) was page residue, not part of the program.
# Version: Corrected After Test 3 (V2.3.0 - Structured, Commented, and ZIP Generation Restored)
# Description: Cette version structure le script selon l'ordre des étapes du processus.
# La génération du fichier ZIP a été réintégrée après avoir été omise dans la version précédente.
# Chaque section est commentée pour assurer une meilleure lisibilité et une logique claire.
import os
import shutil
import zipfile
import torch
import numpy as np
from pathlib import Path
import gradio as gr
from pydub import AudioSegment
from transformers import pipeline
# -------------------------------------------------
# 1. Configuration et Initialisation
# -------------------------------------------------
# Checkpoint identifier handed to the transformers ASR pipeline.
MODEL_NAME = "openai/whisper-large-v3"

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The speech-recognition pipeline is built once at import time and reused by
# every transcription request.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    model_kwargs=dict(low_cpu_mem_usage=True),
)

# Working directory that receives the exported audio extracts.
TEMP_DIR = "./temp_audio"
Path(TEMP_DIR).mkdir(parents=True, exist_ok=True)
def init_metadata_state():
    """Return a new, empty list used as the initial segment-metadata state."""
    empty_metadata = []
    return empty_metadata
# -------------------------------------------------
# 2. Transcription de l'audio avec Whisper
# -------------------------------------------------
def transcribe_audio(audio_path):
    """Transcribe an audio file and compute a time span for every word.

    Args:
        audio_path: Path of the audio file to transcribe; a falsy value
            means that no file was provided.

    Returns:
        A 5-tuple, in this order:
        - the raw transcription (or a message when nothing could be done),
        - an empty list (used by the caller to reset the segment table),
        - the audio path (``None`` when transcription failed),
        - a list of ``(word, (start, end))`` tuples, times in seconds,
        - the transcription annotated as ``word[start-end]``.
    """
    if not audio_path:
        print("[LOG] Aucun fichier audio fourni.")
        return "Aucun fichier audio fourni", [], None, [], ""

    print(f"[LOG] Début de la transcription de {audio_path}...")
    result = pipe(audio_path, return_timestamps="word")
    words = result.get("chunks", [])

    # Keep only chunks that carry text. Chunk texts may come with surrounding
    # whitespace; it is stripped so the stored words compare equal to
    # whitespace-split tokens and the joined text has no doubled spaces.
    cleaned = []
    for chunk in words:
        text = str(chunk.get("text", "")).strip()
        if not text:
            continue
        start, own_end = chunk.get("timestamp") or (None, None)
        cleaned.append((text, start, own_end))

    if not cleaned:
        print("[LOG ERROR] Erreur : Aucun timestamp détecté.")
        return "Erreur : Aucun timestamp détecté.", [], None, [], ""

    raw_transcription = " ".join(text for text, _, _ in cleaned)

    word_timestamps = []
    previous_end = 0.0
    for index, (text, start, own_end) in enumerate(cleaned):
        if start is None:
            # No start reported for this word: continue from the previous one.
            start = previous_end
        next_start = cleaned[index + 1][1] if index + 1 < len(cleaned) else None
        if next_start is not None:
            # End = start of the next word minus 10 ms, never before `start`.
            end = max(start, next_start - 0.01)
        else:
            # Last word (or next start unknown): keep at least 0.5 s, and
            # extend to the reported end when the word is longer than that.
            end = start + 0.5
            if own_end is not None and own_end > end:
                end = own_end
        word_timestamps.append((text, (start, end)))
        previous_end = end

    transcription_with_timestamps = " ".join(
        f"{word}[{span[0]:.2f}-{span[1]:.2f}]" for word, span in word_timestamps
    )
    print(f"[LOG] Transcription brute : {raw_transcription}")
    print(f"[LOG DETAIL] Timestamps associés : {word_timestamps}")
    return raw_transcription, [], audio_path, word_timestamps, transcription_with_timestamps
# -------------------------------------------------
# 3. Prétraitement des segments : Associer les timestamps aux phrases sélectionnées
# -------------------------------------------------
# Characters ignored at both ends of a word when comparing segment text with
# transcribed words.
_EDGE_PUNCTUATION = " \t\r\n.,;:!?…\"'’“”«»()[]{}"


def _normalize_token(token):
    """Return *token* case-folded, without surrounding whitespace/punctuation."""
    return str(token).strip(_EDGE_PUNCTUATION).casefold()


def _find_phrase(tokens, phrase, begin):
    """Return the first index >= *begin* where *phrase* occurs in *tokens*, or -1."""
    width = len(phrase)
    for pos in range(begin, len(tokens) - width + 1):
        if tokens[pos:pos + width] == phrase:
            return pos
    return -1


def preprocess_segments(table_data, word_timestamps):
    """Attach start/end timestamps to each phrase entered in the table.

    Each phrase is located as a contiguous run of words in the transcript.
    The search resumes after the previous match so that a repeated phrase maps
    to its next occurrence, and falls back to a search from the beginning.

    Args:
        table_data: Rows whose first cell is the segment text. A list of
            lists, or an object exposing ``to_numpy()`` (such as a pandas
            DataFrame), which is converted to a list of rows.
        word_timestamps: List of ``(word, (start, end))`` tuples, or ``None``.

    Returns:
        A list of ``[text, start_time, end_time, segment_id]`` rows. The
        times are ``None`` when the phrase was not found in the transcript.
        Rows with an empty or non-string first cell are skipped.
    """
    print("[LOG] Début du prétraitement des segments...")

    # Iterating a DataFrame yields column labels, not rows; convert it first.
    if hasattr(table_data, "to_numpy"):
        rows = table_data.to_numpy().tolist()
    else:
        rows = list(table_data or [])

    # Transcript words that still have content after normalization.
    timeline = []
    for word, span in (word_timestamps or []):
        token = _normalize_token(word)
        if token:
            timeline.append((token, span[0], span[1]))
    tokens = [entry[0] for entry in timeline]

    formatted_data = []
    cursor = 0  # index in `tokens` just past the previous match
    for i, row in enumerate(rows):
        cell = row[0] if row else None
        if not isinstance(cell, str) or not cell.strip():
            print(f"[LOG WARNING] Ignoré : ligne vide à l'index {i}.")
            continue
        text = cell.strip()
        segment_id = f"seg_{i+1:02d}"
        start_time, end_time = None, None

        phrase = [tok for tok in map(_normalize_token, text.split()) if tok]
        if phrase:
            pos = _find_phrase(tokens, phrase, cursor)
            if pos < 0:
                # Segments entered out of order: retry from the beginning.
                pos = _find_phrase(tokens, phrase, 0)
            if pos >= 0:
                last = pos + len(phrase) - 1
                start_time = timeline[pos][1]
                end_time = timeline[last][2]
                cursor = last + 1

        formatted_data.append([text, start_time, end_time, segment_id])
        print(f"[LOG] Segment ajouté : {text} | Début: {start_time}, Fin: {end_time}, ID: {segment_id}")
    return formatted_data
# -------------------------------------------------
# 4. Validation et découpage des extraits audio
# -------------------------------------------------
def validate_segments(audio_path, table_data, metadata_state, word_timestamps):
    """Cut the source audio into one WAV file per valid table row.

    Args:
        audio_path: Path of the source audio file.
        table_data: Rows of ``[text, start_time, end_time, segment_id]``
            (times in seconds). A list of lists, or an object exposing
            ``to_numpy()`` (such as a pandas DataFrame).
        metadata_state: Current metadata; returned unchanged when there is
            no audio path or no word timestamps.
        word_timestamps: Word timestamps from the transcription; only
            checked for being non-empty.

    Returns:
        ``(segment_paths, metadata)``: the paths of the exported WAV files
        and one metadata dict per exported segment. When no row is usable,
        ``([], [])`` is returned before the audio is decoded and before the
        temporary directory is touched.
    """
    print("[LOG] Début de la validation des segments...")
    if not audio_path or not word_timestamps:
        print("[LOG ERROR] Erreur : Aucun timestamp valide trouvé !")
        return [], metadata_state

    # Iterating a DataFrame yields column labels, not rows; convert it first.
    if hasattr(table_data, "to_numpy"):
        rows = table_data.to_numpy().tolist()
    else:
        rows = list(table_data or [])

    # First pass: keep only rows that describe a usable time range, so that a
    # malformed row is skipped instead of aborting the whole validation.
    cuts = []
    for row in rows:
        if row is None or len(row) < 4:
            print(f"[LOG ERROR] Ligne incomplète ignorée : {row}")
            continue
        text, start_time, end_time, segment_id = row[:4]
        if start_time is None or end_time is None:
            print(f"[LOG ERROR] Timestamp manquant pour : {text}")
            continue
        try:
            start_ms = int(float(start_time) * 1000)
            end_ms = int(float(end_time) * 1000)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, NaN or infinite cell content.
            print(f"[LOG ERROR] Timestamp invalide pour : {text} | {start_time} - {end_time}")
            continue
        if start_ms < 0 or end_ms <= start_ms:
            print(f"[LOG ERROR] Problème de découpage : {text} | {start_time}s - {end_time}s")
            continue
        cuts.append((text, start_time, end_time, segment_id, start_ms, end_ms))

    if not cuts:
        print("[LOG ERROR] Aucun segment exploitable à découper.")
        return [], []

    # Start from an empty working directory so that extracts from a previous
    # run cannot be mixed with the new ones.
    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    original_audio = AudioSegment.from_file(audio_path)
    stem = Path(audio_path).stem
    segment_paths = []
    updated_metadata = []
    for text, start_time, end_time, segment_id, start_ms, end_ms in cuts:
        segment_filename = f"{stem}_{segment_id}.wav"
        segment_path = os.path.join(TEMP_DIR, segment_filename)
        # The audio object is sliced with millisecond offsets.
        original_audio[start_ms:end_ms].export(segment_path, format="wav")
        segment_paths.append(segment_path)
        updated_metadata.append({
            "audio_file": segment_filename,
            "text": text,
            "start_time": start_time,
            "end_time": end_time,
            "id": segment_id,
        })
        print(f"[LOG] Extrait généré : {segment_filename}")
    return segment_paths, updated_metadata
# -------------------------------------------------
# 5. Génération du fichier ZIP
# -------------------------------------------------
def generate_zip(metadata_state):
    """Bundle the exported extracts and a ``metadata.csv`` into a ZIP archive.

    Only segments whose audio file exists in the temporary directory are
    written, so the metadata never references a file missing from the archive.

    Args:
        metadata_state: List of dicts with at least the keys ``audio_file``
            and ``text``.

    Returns:
        The path of the generated ZIP file, or ``None`` when there is no
        metadata or none of the referenced audio files exists.
    """
    if not metadata_state:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    os.makedirs(TEMP_DIR, exist_ok=True)

    available = []
    for seg in metadata_state:
        file_path = os.path.join(TEMP_DIR, seg["audio_file"])
        if os.path.exists(file_path):
            available.append((seg, file_path))
        else:
            print(f"[LOG WARNING] Fichier audio introuvable, segment ignoré : {seg['audio_file']}")
    if not available:
        print("[LOG ERROR] Aucun segment valide trouvé pour la génération du ZIP.")
        return None

    # The metadata file has one "|"-separated record per line, so a "|" or a
    # line break inside the text would corrupt it; replace them with spaces.
    unsafe_chars = str.maketrans({"|": " ", "\r": " ", "\n": " "})
    lines = ["audio_file|text|speaker_name|API\n"]
    for seg, _ in available:
        safe_text = str(seg["text"]).translate(unsafe_chars)
        lines.append(f"{seg['audio_file']}|{safe_text}|projectname|/API_PHONETIC/\n")

    metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
    with open(metadata_csv_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    # Mode "w" truncates an archive left over from a previous run.
    zip_path = os.path.join(TEMP_DIR, "dataset.zip")
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(metadata_csv_path, "metadata.csv")
        for seg, file_path in available:
            zf.write(file_path, seg["audio_file"])

    print("[LOG] Fichier ZIP généré avec succès.")
    return zip_path
# -------------------------------------------------
# 6. Interface utilisateur Gradio
# -------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Application de Découpe Audio")

    # State shared between the callbacks.
    metadata_state = gr.State(init_metadata_state())
    extracted_segments = gr.State([])
    word_timestamps = gr.State()

    # Visible components, in display order.
    audio_input = gr.Audio(type="filepath", label="Fichier audio")
    raw_transcription = gr.Textbox(label="Transcription", interactive=False)
    transcription_timestamps = gr.Textbox(label="Transcription avec Timestamps", interactive=False)
    # type="array" makes the callbacks receive a list of rows. With the
    # default ("pandas") they would receive a DataFrame, and iterating a
    # DataFrame yields column labels instead of rows.
    table = gr.Dataframe(
        headers=["Texte"],
        datatype=["str"],
        row_count=(1, "dynamic"),
        col_count=1,
        type="array",
    )
    generate_timestamps_button = gr.Button("Générer les timestamps")
    validate_button = gr.Button("Valider")
    generate_button = gr.Button("Générer ZIP")
    zip_file = gr.File(label="Télécharger le ZIP")

    # Step 1: a new audio file triggers the transcription.
    # NOTE(review): audio_input is also an output of its own change event;
    # this may fire the handler a second time when the value is written
    # back — confirm against the Gradio version in use.
    audio_input.change(
        transcribe_audio,
        inputs=audio_input,
        outputs=[raw_transcription, table, audio_input, word_timestamps, transcription_timestamps],
    )
    # Step 2: attach timestamps to the phrases typed in the table.
    generate_timestamps_button.click(
        preprocess_segments,
        inputs=[table, word_timestamps],
        outputs=table,
    )
    # Step 3: cut the audio according to the table.
    validate_button.click(
        validate_segments,
        inputs=[audio_input, table, metadata_state, word_timestamps],
        outputs=[extracted_segments, metadata_state],
    )
    # Step 4: package the extracts and their metadata.
    generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)

demo.queue().launch()
|