Spaces:
Update app.py
app.py
CHANGED
@@ -7,9 +7,7 @@ import gradio as gr
 from pydub import AudioSegment
 from transformers import pipeline
 
-# -------------------------------------------------
 # Configuration
-# -------------------------------------------------
 MODEL_NAME = "openai/whisper-large-v3"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -28,137 +26,181 @@ def init_metadata_state():
 
 def transcribe_audio(audio_path):
     if not audio_path:
-        return "Aucun fichier audio fourni", [], None
+        return "Aucun fichier audio fourni", [], None
 
-    formatted_data = []
+    # Explicitly request word-level timestamps
+    result = pipe(audio_path, return_timestamps="word")
+
+    # Extract the full text
+    text = result["text"]
+
+    # Build the initial segments from the timestamps
+    chunks = result["chunks"]
+    table_data = []
+
+    # Group the words into sentences (split on punctuation)
+    current_segment = []
+    current_start = None
+
+    for chunk in chunks:
+        if not current_start:
+            current_start = chunk["timestamp"][0]
+
+        current_segment.append(chunk["text"])
+
+        # If we hit end-of-sentence punctuation or the segment is long enough
+        if any(punct in chunk["text"] for punct in ".!?") or len(" ".join(current_segment)) > 100:
+            segment_text = " ".join(current_segment).strip()
+            if segment_text:
+                table_data.append([
+                    segment_text,
+                    round(current_start, 2),
+                    round(chunk["timestamp"][1], 2),
+                    f"seg_{len(table_data)+1:02d}"
+                ])
+            current_segment = []
+            current_start = None
+
+    # Add the last segment if there is one
+    if current_segment:
+        last_chunk = chunks[-1]
+        segment_text = " ".join(current_segment).strip()
+        if segment_text:
+            table_data.append([
+                segment_text,
+                round(current_start, 2),
+                round(last_chunk["timestamp"][1], 2),
+                f"seg_{len(table_data)+1:02d}"
+            ])
+
+    # Make sure the interface gets at least 5 rows
+    while len(table_data) < 5:
+        table_data.append(["", None, None, ""])
+
+    return text, table_data, audio_path
 
+def validate_segments(audio_path, table_data, metadata_state):
+    if not audio_path:
         return [], metadata_state
 
     if os.path.exists(TEMP_DIR):
         shutil.rmtree(TEMP_DIR)
     os.makedirs(TEMP_DIR, exist_ok=True)
 
+    try:
+        original_audio = AudioSegment.from_file(audio_path)
+        segment_paths = []
+        updated_metadata = []
+
+        for i, row in enumerate(table_data):
+            if not row or len(row) < 4:
+                continue
+
+            text, start_time, end_time, segment_id = row
+
+            if not text or start_time is None or end_time is None:
+                continue
+
+            if not segment_id:
+                segment_id = f"seg_{i+1:02d}"
+
+            start_ms = int(float(start_time) * 1000)
+            end_ms = int(float(end_time) * 1000)
+
+            if start_ms < 0 or end_ms <= start_ms or end_ms > len(original_audio):
+                print(f"Invalid timestamps for segment {segment_id}: {start_time}-{end_time}")
+                continue
+
+            segment_filename = f"{Path(audio_path).stem}_{segment_id}.wav"
+            segment_path = os.path.join(TEMP_DIR, segment_filename)
+
+            try:
+                segment_audio = original_audio[start_ms:end_ms]
+                segment_audio.export(segment_path, format="wav")
+
+                segment_paths.append(segment_path)
+                updated_metadata.append({
+                    "audio_file": segment_filename,
+                    "text": text,
+                    "start_time": start_time,
+                    "end_time": end_time,
+                    "id": segment_id,
+                })
+            except Exception as e:
+                print(f"Error processing segment {segment_id}: {str(e)}")
+                continue
+
+        return segment_paths, updated_metadata
+
+    except Exception as e:
+        print(f"Error in validate_segments: {str(e)}")
+        return [], metadata_state
 
 def generate_zip(metadata_state):
     if not metadata_state:
         return None
 
+    try:
+        zip_path = os.path.join(TEMP_DIR, "dataset.zip")
+        if os.path.exists(zip_path):
+            os.remove(zip_path)
+
+        metadata_csv_path = os.path.join(TEMP_DIR, "metadata.csv")
+        with open(metadata_csv_path, "w", encoding="utf-8") as f:
+            f.write("audio_file|text|speaker_name|API\n")
+            for seg in metadata_state:
+                f.write(f"{seg['audio_file']}|{seg['text']}|projectname|/API_PHONETIC/\n")
+
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+            zf.write(metadata_csv_path, "metadata.csv")
+            for seg in metadata_state:
+                file_path = os.path.join(TEMP_DIR, seg["audio_file"])
+                if os.path.exists(file_path):
+                    zf.write(file_path, seg["audio_file"])
+
+        return zip_path
+    except Exception as e:
+        print(f"Error generating ZIP: {str(e)}")
+        return None
 
-# -------------------------------------------------
 # Gradio interface
-# -------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Application de Découpe Audio")
     metadata_state = gr.State(init_metadata_state())
-    extracted_segments = gr.State([])
 
+    with gr.Row():
+        audio_input = gr.Audio(type="filepath", label="Fichier audio")
+        raw_transcription = gr.Textbox(label="Transcription", interactive=False)
+
+    table = gr.Dataframe(
+        headers=["Texte", "Début (s)", "Fin (s)", "ID"],
+        datatype=["str", "number", "number", "str"],
+        row_count=5,
+        col_count=4,
+        interactive=True
+    )
+
+    with gr.Row():
+        validate_button = gr.Button("Valider")
+        generate_button = gr.Button("Générer ZIP")
+
+    status_message = gr.Textbox(label="Status", interactive=False)
     zip_file = gr.File(label="Télécharger le ZIP")
 
+    def process_validation(audio_path, table_data, metadata_state):
+        if not audio_path:
+            return [], [], "Aucun fichier audio fourni"
+
+        segments, metadata = validate_segments(audio_path, table_data, metadata_state)
+        status = f"{len(segments)} segments générés" if segments else "Aucun segment généré"
+        return table_data, metadata, status
 
+    audio_input.change(transcribe_audio, inputs=audio_input, outputs=[raw_transcription, table, audio_input])
     validate_button.click(
-        inputs=[table, word_timestamps, audio_input, metadata_state],
-        outputs=[extracted_segments, metadata_state],
+        process_validation,
+        inputs=[audio_input, table, metadata_state],
+        outputs=[table, metadata_state, status_message]
     )
-
-    @gr.render(inputs=extracted_segments)
-    def show_audio_excerpts(segments):
-        if not segments:
-            gr.Markdown("Aucun extrait généré.")
-        else:
-            for i, seg in enumerate(segments):
-                gr.Audio(label=f"Extrait {i+1}", value=seg)
-
     generate_button.click(generate_zip, inputs=metadata_state, outputs=zip_file)
 
-demo.queue().launch()
+demo.queue().launch()
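
Notes on this change:

The new transcribe_audio body calls a module-level `pipe` that is defined outside both hunks. A minimal sketch of how it is presumably constructed from the MODEL_NAME and device set in the configuration block above (the task string and arguments are assumptions, not confirmed by this diff):

    # Sketch (assumption): the ASR pipeline that `pipe` presumably refers to.
    import torch
    from transformers import pipeline

    MODEL_NAME = "openai/whisper-large-v3"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    pipe = pipeline(
        "automatic-speech-recognition",  # standard transformers task name
        model=MODEL_NAME,
        device=device,
    )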
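
With return_timestamps="word", the transformers ASR pipeline returns one chunk per word, each with a (start, end) tuple in seconds; this is the shape the grouping loop iterates over (values below are illustrative):

    # Illustrative result shape for pipe(audio_path, return_timestamps="word").
    result = {
        "text": " Bonjour tout le monde.",
        "chunks": [
            {"text": " Bonjour", "timestamp": (0.0, 0.48)},
            {"text": " tout", "timestamp": (0.48, 0.7)},
            {"text": " le", "timestamp": (0.7, 0.82)},
            {"text": " monde.", "timestamp": (0.82, 1.3)},
        ],
    }

One caveat in the grouping loop: a first word can legitimately start at 0.0, which is falsy, so `if not current_start:` keeps re-capturing the start until a nonzero timestamp appears, shifting the first segment's start later than it should be; `if current_start is None:` would be the stricter test.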
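
validate_segments relies on pydub's convention that AudioSegment objects are sliced in milliseconds, which is why the start/end times are multiplied by 1000 and why len(original_audio) works as an upper bound. A self-contained sketch (file names are hypothetical):

    # pydub slices AudioSegment by milliseconds; len() is the duration in ms.
    from pydub import AudioSegment

    audio = AudioSegment.from_file("input.wav")  # hypothetical input file
    clip = audio[1500:3200]                      # 1.5 s to 3.2 s
    clip.export("clip.wav", format="wav")
    print(len(audio))                            # duration in milliseconds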
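
Given the write format in generate_zip, the metadata.csv packed into dataset.zip is pipe-delimited with one row per validated segment, e.g. (sample values):

    audio_file|text|speaker_name|API
    interview_seg_01.wav|Bonjour tout le monde.|projectname|/API_PHONETIC/

Since the text field is written unescaped, a `|` inside a transcription would break the column layout.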