tonic
Laion WhisperSpeech Demo
33d9042
raw
history blame
No virus
2.48 kB
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/1B. Voice activity detection.ipynb.
# %% auto 0
__all__ = []
# %% ../nbs/1B. Voice activity detection.ipynb 3
import os
import torch
import torchaudio
from pathlib import Path
from fastprogress import progress_bar
from fastcore.script import call_parse
import whisperx
import random
import numpy as np
import webdataset as wds
# %% ../nbs/1B. Voice activity detection.ipynb 5
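# Some of the original file names contain extra dots. webdataset splits the
# sample key from the extension at the first dot of the file name, so we
# replace every dot except the last one with an underscore.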
def fix_dots_in_names(name):
    """Replace every dot in `name` except the last one with an underscore.

    The last dot is kept as the extension separator, so ``"a.b.flac"``
    becomes ``"a_b.flac"``. A name that contains no dot at all has nothing
    to patch and is returned unchanged (it used to raise ``ValueError``).
    """
    stem, dot, ext = name.rpartition('.')
    if not dot:
        # no extension separator found: leave the name as it is
        return name
    return stem.replace('.', '_') + dot + ext
def load_dataset(url, decode=True, rename_files=None):
    """Open the shard(s) at `url` as a `wds.WebDataset`.

    `rename_files` is forwarded to `wds.WebDataset` unchanged. When `decode`
    is truthy the dataset is returned with the `wds.torch_audio` decoder
    applied; otherwise the undecoded dataset is returned.
    """
    dataset = wds.WebDataset(url, rename_files=rename_files)
    return dataset.decode(wds.torch_audio) if decode else dataset
# %% ../nbs/1B. Voice activity detection.ipynb 7
def extract_segments(vad_result, max_duration):
    """Binarize a VAD result and return its segments as `(start, end)` pairs.

    `max_duration` is handed to `whisperx.vad.Binarize` (presumably the
    maximum segment length in seconds -- confirm against whisperx).
    """
    binarizer = whisperx.vad.Binarize(max_duration=max_duration)
    timeline = binarizer(vad_result).get_timeline()
    return [(seg.start, seg.end) for seg in timeline]
def segment_audio(vad_model, audio, sr=16000, max_duration=30):
    """Run `vad_model` on `audio` and return the detected `(start, end)` pairs.

    The model is called with a dict holding `audio` under ``"waveform"`` and
    `sr` under ``"sample_rate"``; its result is then split into segments by
    `extract_segments`.

    `max_duration` is forwarded to `extract_segments` (presumably seconds --
    confirm against whisperx). The default of 30 is the value that used to be
    hard-coded here, so existing callers get the same result.
    """
    vad_result = vad_model({"waveform": audio, "sample_rate": sr})
    return extract_segments(vad_result, max_duration)
# %% ../nbs/1B. Voice activity detection.ipynb 13
def flac_to_vad_name(input):
    """Derive the default VAD output file name from an input shard path.

    Only the last ``/``-separated component of `input` is used, so the result
    is a bare file name. If `input` contains ``-flac-`` every ``flac`` in that
    file name is replaced with ``vad``; otherwise every ``raw`` is replaced.
    ``.gz`` is appended in both cases.

    A path without any ``/`` is treated as a bare file name (it used to raise
    ``IndexError``).
    """
    # [-1] instead of [1]: also works when there is no "/" to split on
    basename = input.rsplit("/", 1)[-1]
    marker = 'flac' if '-flac-' in input else 'raw'
    return basename.replace(marker, 'vad') + ".gz"
@call_parse
def process_shard(
    input:str,           # input shard URL/path
    output:str=None,     # output shard URL/path
    fix_dots:bool=False, # fix dots in LibriLight filenames
):
    "Run voice activity detection on every audio sample of a shard and write the segments to a new shard."
    if output is None:
        output = flac_to_vad_name(input)

    renamer = fix_dots_in_names if fix_dots else None
    samples = torch.utils.data.DataLoader(
        load_dataset(input, rename_files=renamer),
        num_workers=2,
        batch_size=None,
    )
    vad_model = whisperx.vad.load_vad_model('cuda')

    # Write under a temporary name first; it is renamed to `output` only
    # after the writer has been closed without an error.
    tmp_path = output + ".tmp"
    with wds.TarWriter(tmp_path) as writer:
        for sample in progress_bar(samples, total='noinfer'):
            # prefer the 'flac' entry and fall back to 'wav'
            wav_fallback = sample.get('wav', (None, None))
            audio, sr = sample.get('flac', wav_fallback)
            key = sample['__key__']
            if audio is None:
                print(f"warning: '{key}' does not contain an audio file")
                continue
            segments = segment_audio(vad_model, audio, sr=sr)
            writer.write({
                "__key__": key,
                "vad.npy": np.array(segments, dtype=np.float16),
            })
    os.rename(tmp_path, output)