"""Gradio demo for the SpecVQGAN neural audio codec.

At start-up the script clones the SpecVQGAN repository, installs the pinned
Python dependencies, loads the pretrained codec and vocoder, and then serves a
web UI. For every uploaded audio file the UI returns two WAV files: the
vocoded original spectrogram and the vocoded codec reconstruction.
"""
import os
import subprocess
import sys
from pathlib import Path

REPO_URL = "https://github.com/v-iashin/SpecVQGAN"
REPO_DIR = "./SpecVQGAN"
REQUIREMENTS = [
    "pytorch-lightning==1.2.10",
    "omegaconf==2.0.6",
    "streamlit==0.80",
    "matplotlib==3.4.1",
    "albumentations==0.5.2",
    "SoundFile",
    "torch",
    "torchvision",
    "librosa",
]
OUTPUT_ROOT = "./tmp/neural_audio_codec"


def _run(command):
    """Run *command* (an argument list, no shell) and warn if it fails.

    Failures are reported but not raised, so start-up stays best-effort: a
    later import will fail loudly if something essential is really missing.
    """
    try:
        return_code = subprocess.run(command).returncode
    except OSError as err:
        print(f"Warning: could not run {command[0]!r}: {err}", file=sys.stderr)
        return
    if return_code != 0:
        print(
            f"Warning: {' '.join(command)!r} exited with code {return_code}",
            file=sys.stderr,
        )


# The bootstrap has to run before the third-party imports below can succeed.
if not Path(REPO_DIR).exists():
    # Cloning into an existing directory would only fail, so skip it.
    _run(["git", "clone", REPO_URL])
# `sys.executable -m pip` installs into the interpreter running this script.
_run([sys.executable, "-m", "pip", "install", *REQUIREMENTS])

import soundfile  # noqa: E402
import torch  # noqa: E402
import gradio as gr  # noqa: E402

# Make the modules of the cloned repository importable.
sys.path.append(REPO_DIR)
from feature_extraction.demo_utils import (  # noqa: E402
    calculate_codebook_bitrate,
    extract_melspectrogram,
    get_audio_file_bitrate,
    get_duration,
    load_neural_audio_codec,
)
from sample_visualization import tensor_to_plt  # noqa: E402,F401
from torch.utils.data.dataloader import default_collate  # noqa: E402

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = '2021-05-19T22-16-54_vggsound_codebook'
log_dir = './logs'
# loading the models might take a few minutes
config, model, vocoder = load_neural_audio_codec(model_name, log_dir, device)


def inference(audio):
    """Encode an uploaded audio file with the codec and reconstruct it.

    Args:
        audio: The file object Gradio supplies for an ``Audio(type="file")``
            input. Only its ``name`` attribute (the path on disk) is read.

    Returns:
        A 2-tuple of string paths to the WAV files written by this call:
        the vocoded original spectrogram and the vocoded reconstruction.
    """
    # Select an Audio
    input_wav = audio.name

    # Spectrogram Extraction
    model_sr = config.data.params.sample_rate
    duration = get_duration(input_wav)
    spec = extract_melspectrogram(input_wav, sr=model_sr, duration=duration)
    print(f'Audio Duration: {duration} seconds')
    print('Original Spectrogram Shape:', spec.shape)

    # Prepare Input
    batch = default_collate([{'input': spec}])
    batch['image'] = batch['input'].to(device)
    x = model.get_input(batch, 'image')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        quant_z, _, info = model.encode(x)
        xrec = model.decode(quant_z)

    print('Compressed representation (it is all you need to recover the audio):')
    rows, cols = quant_z.shape[-2:]
    print(info[2].reshape(rows, cols))

    # Calculate Bitrate
    bitrate = calculate_codebook_bitrate(duration, quant_z, model.quantize.n_e)
    orig_bitrate = get_audio_file_bitrate(input_wav)
    print(f'Original file bitrate: {orig_bitrate}')
    print(f'Codebook bitrate: {bitrate:.2f} kbps')

    # Drop the batch dimension added by default_collate.
    x = x.squeeze(0)
    xrec = xrec.squeeze(0)
    # specs are in [-1, 1], making them in [0, 1]
    wav_x = vocoder((x + 1) / 2).squeeze().detach().cpu().numpy()
    wav_xrec = vocoder((xrec + 1) / 2).squeeze().detach().cpu().numpy()

    # Creating a temp folder which will hold the results
    tmp_dir = os.path.join(OUTPUT_ROOT, Path(input_wav).parent.stem)
    os.makedirs(tmp_dir, exist_ok=True)

    # Save paths
    x_save_path = Path(tmp_dir) / 'vocoded_orig_spec.wav'
    xrec_save_path = Path(tmp_dir) / f'specvqgan_{bitrate:.2f}kbps.wav'

    # Save
    soundfile.write(x_save_path, wav_x, model_sr, subtype='PCM_16')
    soundfile.write(xrec_save_path, wav_xrec, model_sr, subtype='PCM_16')

    # Return the paths that were actually written (inside tmp_dir); returning
    # paths directly under OUTPUT_ROOT would point at files that do not exist.
    return str(x_save_path), str(xrec_save_path)


title = "SpecVQGAN Neural Audio Codec"
description = (
    "Demo for the SpecVQGAN neural audio codec. To use it, simply upload "
    "your audio file. Read more at the link below."
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/v-iashin/SpecVQGAN'>SpecVQGAN | Github Repo</a>"
    "</p>"
)

gr.Interface(
    inference,
    gr.inputs.Audio(type="file", label="Input Audio"),
    [
        gr.outputs.Audio(type="file", label="Original audio"),
        gr.outputs.Audio(type="file", label="Reconstructed audio"),
    ],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
).launch(debug=True)