# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/7. Pipeline.ipynb.

# %% auto 0
__all__ = ['Pipeline']

# %% ../nbs/7. Pipeline.ipynb 1
import torch

from whisperspeech.t2s_up_wds_mlang_enclm import TSARTransformer
from whisperspeech.s2a_delar_mup_wds_mlang import SADelARTransformer
from whisperspeech.a2wav import Vocoder

import traceback
from pathlib import Path

# %% ../nbs/7. Pipeline.ipynb 2
class Pipeline:
    """End-to-end text-to-speech pipeline: text -> semantic tokens (T2S) -> acoustic tokens (S2A) -> waveform (vocoder)."""

    # Default speaker embedding (192-dim, matching the ECAPA-TDNN encoder used in extract_spk_emb),
    # used whenever no reference speaker is given.
    default_speaker = torch.tensor(
        [-0.2929, -0.4503,  0.4155, -0.1417,  0.0473, -0.1624, -0.2322,  0.7071,  0.4800,  0.5496,
          0.0410,  0.6236,  0.4729,  0.0587,  0.2194, -0.0466, -0.3036,  0.0497,  0.5028, -0.1703,
          0.5039, -0.6464,  0.3857, -0.7350, -0.1605,  0.4808,  0.5397, -0.4851,  0.1774, -0.8712,
          0.5789,  0.1785, -0.1417,  0.3039,  0.4232, -0.0186,  0.2685,  0.6153, -0.3103, -0.5706,
         -0.4494,  0.3394, -0.6184, -0.3617,  1.1041, -0.1178, -0.1885,  0.1997,  0.5571, -0.2906,
         -0.0477, -0.4048, -0.1062,  1.4779,  0.1639, -0.3712, -0.1776, -0.0568, -0.6162,  0.0110,
         -0.0207, -0.1319, -0.3854,  0.7248,  0.0343,  0.5724,  0.0670,  0.0486, -0.3813,  0.1738,
          0.3017,  1.0502,  0.1550,  0.5708,  0.0366,  0.5093,  0.0294, -0.7091, -0.8220, -0.1583,
         -0.2343,  0.1366,  0.7372, -0.0631,  0.1505,  0.4600, -0.1252, -0.5245,  0.7523, -0.0386,
         -0.2587,  1.0066, -0.2037,  0.1617, -0.3800,  0.2790,  0.0184, -0.5111, -0.7291,  0.1627,
          0.2367, -0.0192,  0.4822, -0.4458,  0.1457, -0.5884,  0.1909,  0.2563, -0.2035, -0.0377,
          0.7771,  0.2139,  0.3801,  0.6047, -0.6043, -0.2563, -0.0726,  0.3856,  0.3217,  0.0823,
         -0.1302,  0.3287,  0.5693,  0.2453,  0.8231,  0.0072,  1.0327,  0.6065, -0.0620, -0.5572,
          0.5220,  0.2485,  0.1520,  0.0222, -0.2179, -0.7392, -0.3855,  0.1822,  0.1042,  0.7133,
          0.3583,  0.0606, -0.0424, -0.9189, -0.4882, -0.5480, -0.5719, -0.1660, -0.3439, -0.5814,
         -0.2542,  0.0197,  0.4942,  0.0915, -0.0420, -0.0035,  0.5578,  0.1051, -0.0891,  0.2348,
          0.6876, -0.6685,  0.8215, -0.3692, -0.3150, -0.0462, -0.6806, -0.2661, -0.0308, -0.0050,
          0.6756, -0.1647,  1.0734,  0.0049,  0.4969,  0.0259, -0.8949,  0.0731,  0.0886,  0.3442,
         -0.1433, -0.6804,  0.2204,  0.1859,  0.2702,  0.1699, -0.1443, -0.9614,  0.3261,  0.1718,
          0.3545, -0.0686]
    )

    def __init__(self, t2s_ref=None, s2a_ref=None, optimize=True, torch_compile=False):
        try:
            args = dict()
            if t2s_ref:
                args["ref"] = t2s_ref
            self.t2s = TSARTransformer.load_model(**args).cuda()
            if optimize: self.t2s.optimize(torch_compile=torch_compile)
        except Exception:
            print("Failed to load the T2S model:")
            print(traceback.format_exc())
        try:
            args = dict()  # start fresh so a t2s_ref does not leak into the S2A load
            if s2a_ref:
                args["ref"] = s2a_ref
            self.s2a = SADelARTransformer.load_model(**args).cuda()
            if optimize: self.s2a.optimize(torch_compile=torch_compile)
        except Exception:
            print("Failed to load the S2A model:")
            print(traceback.format_exc())
        self.vocoder = Vocoder()
        self.encoder = None  # speaker-embedding model, loaded lazily in extract_spk_emb

    def extract_spk_emb(self, fname):
        """Extracts a speaker embedding from the first 30 seconds of the given audio file."""
        import torchaudio
        if self.encoder is None:
            from speechbrain.pretrained import EncoderClassifier
            self.encoder = EncoderClassifier.from_hparams(
                "speechbrain/spkrec-ecapa-voxceleb",
                # expand '~' explicitly so the cache does not end up in a literal '~' directory
                savedir=str(Path("~/.cache/speechbrain/").expanduser()),
                run_opts={"device": "cuda"},
            )
        samples, sr = torchaudio.load(fname)
        samples = self.encoder.audio_normalizer(samples[0, :30 * sr], sr)
        spk_emb = self.encoder.encode_batch(samples)
        return spk_emb[0, 0]

    def generate_atoks(self, text, speaker=None, lang='en', cps=15, step_callback=None):
        if speaker is None: speaker = self.default_speaker
        elif isinstance(speaker, (str, Path)): speaker = self.extract_spk_emb(speaker)
        text = text.replace("\n", " ")
        stoks = self.t2s.generate(text, cps=cps, lang=lang, step=step_callback)
        atoks = self.s2a.generate(stoks, speaker.unsqueeze(0), step=step_callback)
        return atoks

    def generate(self, text, speaker=None, lang='en', cps=15, step_callback=None):
        return self.vocoder.decode(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=step_callback))

    def generate_to_file(self, fname, text, speaker=None, lang='en', cps=15, step_callback=None):
        self.vocoder.decode_to_file(fname, self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=step_callback))

    def generate_to_notebook(self, text, speaker=None, lang='en', cps=15, step_callback=None):
        self.vocoder.decode_to_notebook(self.generate_atoks(text, speaker, lang=lang, cps=cps, step_callback=step_callback))