deepsync's picture
Update app.py
344aab3 verified
raw
history blame
2.57 kB
import gradio as gr
import os
from uuid import uuid4
from pydub.silence import detect_nonsilent
from pydub import AudioSegment
def _merge_close_labels(labels, max_gap, max_segment_length):
    """Greedily fuse consecutive labels that are separated by short gaps.

    Args:
        labels: Sequence of ``(start, end, name)`` entries, in seconds,
            ordered by time.
        max_gap: Largest gap (seconds) between two neighbours that may still
            be merged.
        max_segment_length: A merge is only done while the resulting segment
            stays strictly shorter than this many seconds.

    Returns:
        A list of ``[start, end, name]`` lists. Merged entries carry a name
        that chains the original names with the gap size between them.
        An empty input yields an empty list.
    """
    merged = []
    for start, end, name in labels:
        if merged:
            current = merged[-1]
            # current[1] is always the end of the previous input label, so
            # this is the silence between the previous label and this one.
            gap = start - current[1]
            combined_length = (current[1] - current[0]) + gap + (end - start)
            if gap <= max_gap and combined_length < max_segment_length:
                current[1] = end
                current[2] = f'{current[2]} |{round(gap, 2)}s| {name}'
                continue
        merged.append([start, end, name])
    return merged


def get_labels(audio_fp, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length):
    """Detect non-silent regions of an audio file and write them as label files.

    The audio is down-mixed to mono, non-silent spans are located with pydub
    (silence threshold ``-40``), spans not longer than
    ``min_speech_duration_ms`` are dropped, and the remainder is written as
    tab-separated ``start<TAB>end<TAB>Sound N`` lines (times in seconds) to a
    freshly named ``<uuid>.txt`` file in the current working directory.

    When ``auto_merge`` is truthy, neighbouring labels whose gap is at most
    ``uppper_merge_threshold`` seconds are additionally merged (as long as the
    merged segment stays below ``max_segment_length`` seconds) and written to
    a second file.

    Args:
        audio_fp: Path of the audio file to analyse.
        min_speech_duration_ms: Minimum span length in milliseconds; a span
            must be strictly longer than this to be kept.
        min_silence_duration_ms: Minimum silence length in milliseconds,
            forwarded to pydub as ``min_silence_len``.
        auto_merge: Whether to also produce the merged label file.
        uppper_merge_threshold: Maximum gap in seconds for merging; anything
            accepted by ``float()`` (only parsed when ``auto_merge`` is set).
        max_segment_length: Upper bound in seconds for a merged segment.

    Returns:
        ``(labels_path, merged_path)``; ``merged_path`` is ``None`` when
        ``auto_merge`` is falsy.
    """
    audio = AudioSegment.from_file(audio_fp).set_channels(1)
    # The UI number field may hand over a float; pydub works in whole
    # milliseconds, so coerce before passing the value on.
    nonsilent_ms = detect_nonsilent(
        audio,
        min_silence_len=int(min_silence_duration_ms),
        silence_thresh=-40,
    )

    # Keep only spans strictly longer than the minimum and convert ms -> s.
    labels = [
        (float(begin / 1000), float(finish / 1000), f"Sound {index}")
        for index, (begin, finish) in enumerate(
            (span for span in nonsilent_ms if span[1] - span[0] > min_speech_duration_ms),
            start=1,
        )
    ]

    fn = str(uuid4()) + ".txt"
    with open(fn, "w", encoding="utf-8") as f:
        f.write("\n".join(f"{start}\t{end}\t{name}" for start, end, name in labels))

    if not auto_merge:
        return fn, None

    # An empty label list (fully silent audio) used to crash on labels[0];
    # the helper simply returns an empty list and an empty file is written.
    new_labels = _merge_close_labels(
        labels, float(uppper_merge_threshold), max_segment_length
    )

    filename_path = f"{fn}_translate_label.txt"
    with open(filename_path, "w", encoding="utf-8") as f:
        f.write("\n".join(f"{x[0]}\t{x[1]}\t{x[2]}" for x in new_labels))
    return fn, filename_path
# Gradio UI: the inputs are passed positionally to get_labels in this order,
# and the two File outputs receive the paths it returns.
interface = gr.Interface(
    get_labels,
    [
        gr.Audio(type="filepath", label="Audio file"),
        gr.Number(label="min_speech_duration_ms", value=80, info="default (80)"),
        # The hint previously said "default (100)" although the field is
        # pre-filled with 40; keep the hint in sync with the actual value.
        gr.Number(label="min_silence_duration_ms", value=40, info="default (40)"),
        gr.Checkbox(label="Auto merge", value=True),
        # Entered as text; get_labels converts it with float().
        gr.Textbox(label="Gap max threshold value (seconds)", value=0.350),
        gr.Number(label="Approx Max Segment Length", value=7),
    ],
    [
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File"),
    ],
)
# Script entry point: enable request queuing, then start the web server.
if __name__ == "__main__":
    queued_interface = interface.queue()
    queued_interface.launch()