WhisperSpeech / whisperspeech /extract_acoustic.py
tonic
Laion WhisperSpeech Demo
33d9042
raw
history blame
No virus
1.98 kB
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/1. Acoustic token extraction.ipynb.
# %% auto 0
__all__ = ['load', 'load_model', 'extract_Atoks', 'extract_acoustic']
# %% ../nbs/1. Acoustic token extraction.ipynb 2
import torch
import torchaudio
import gc
from pathlib import Path
from fastcore.script import *
from fastprogress import progress_bar, master_bar
# %% ../nbs/1. Acoustic token extraction.ipynb 5
def load(fname, newsr=24000):
    """Read the audio file `fname`, resample it to `newsr` and return it on the GPU.

    The file is decoded with `torchaudio.load`, converted from its native
    sample rate to `newsr`, moved to CUDA and finally given one extra
    leading dimension.
    """
    waveform, orig_sr = torchaudio.load(fname)
    resampler = torchaudio.transforms.Resample(orig_sr, newsr)
    resampled = resampler(waveform)  # resampling happens before the transfer to the GPU
    return resampled.cuda().unsqueeze(0)
# %% ../nbs/1. Acoustic token extraction.ipynb 6
def load_model():
    "Build the pretrained 24 kHz EnCodec model, set its target bandwidth to 1.5 and return it on the GPU in eval mode"
    # Imported inside the function, so `encodec` is only imported when this is called.
    from encodec.model import EncodecModel

    codec = EncodecModel.encodec_model_24khz()
    codec.set_target_bandwidth(1.5)
    codec.cuda()
    codec.eval()
    return codec
# %% ../nbs/1. Acoustic token extraction.ipynb 7
def extract_Atoks(model, audio):
    """Encode `audio` into EnCodec tokens with `model` (see `load_model`).

    `audio` is either a tensor or a file path (`str` or `Path`); a path is
    first read with `load`. The audio is cut into chunks along its last
    dimension, every chunk is encoded separately with gradients disabled,
    and the per-chunk results are joined again along the last dimension.
    """
    if isinstance(audio, (Path, str)):
        audio = load(audio)

    chunk_len = 320 * 20000  # maximum number of samples passed to one `encode` call
    encoded = []
    with torch.no_grad():
        for chunk in torch.split(audio, chunk_len, dim=-1):
            # Only the first element of the first item returned by `encode` is kept.
            encoded.append(model.encode(chunk)[0][0])
        frames = torch.cat(encoded, dim=-1)
    return frames
# %% ../nbs/1. Acoustic token extraction.ipynb 8
@call_parse
def extract_acoustic(
    srcdir:Path, # source dir, should contain *.flac files
    outdir:Path, # output dir, will get the *.encodec files
):
    "Convert audio files to .encodec files with tensors of tokens"
    # NOTE(review): the docstring and the parameter comments above are kept verbatim;
    # presumably `call_parse` uses them for the command-line help — confirm before rewording.
    model = load_model()
    outdir.mkdir(exist_ok=True, parents=True)

    # Materialised as a list before being handed to the progress bar.
    flac_files = list(srcdir.rglob('*.flac'))
    for flac_path in progress_bar(flac_files):
        # Only the file name is kept, so every output lands directly in `outdir`
        # regardless of how deep below `srcdir` the source file was found.
        encodec_name = flac_path.with_suffix('.encodec').name
        tokens = extract_Atoks(model, flac_path)
        torch.save(tokens, outdir/encodec_name)
        # Drop the reference and run a collection before processing the next file.
        del tokens
        gc.collect()