import json
from tqdm import tqdm
from copy import deepcopy

import soundfile as sf
import numpy as np
import gradio as gr
import torch

import random
# Pin every random number generator this script touches (NumPy, PyTorch and
# the stdlib) to the same fixed seed so repeated runs start from the same state.
np.random.seed(0)
torch.manual_seed(0)
random.seed(0)

from util import print_size, sampling
from network import CleanUNet
import torchaudio
import torchaudio.transforms as T

# Sample rate (Hz) that input audio is resampled to and that the denoised
# output is written with.
# NOTE(review): confirm 22050 Hz matches the rate the checkpoint was trained on.
SAMPLE_RATE = 22050

def load_simple(filename):
    """
    Load an audio file and resample it to SAMPLE_RATE.

    Parameters:
    filename (str):     path of the audio file, passed straight to torchaudio.load

    Returns:
    the waveform returned by torchaudio.load after resampling from the file's
    own sample rate to SAMPLE_RATE; its dtype is preserved
    """
    waveform, source_rate = torchaudio.load(filename)
    # Build the resampler per call because the source rate depends on the file.
    to_target_rate = T.Resample(
        orig_freq=source_rate,
        new_freq=SAMPLE_RATE,
        dtype=waveform.dtype,
    )
    return to_target_rate(waveform)

CONFIG = "configs/DNS-large-full.json"
CHECKPOINT = "./exp/DNS-large-high/checkpoint/pretrained.pkl"

# Parse the JSON config once at import time. The individual sections are kept
# as module-level names because denoise() reads them directly; no `global`
# statement is needed for that at module scope.
with open(CONFIG, encoding="utf-8") as f:
    config = json.load(f)

gen_config = config["gen_config"]
network_config = config["network_config"]       # keyword arguments for CleanUNet(...)
train_config = config["train_config"]           # train config (provides "exp_path")
trainset_config = config["trainset_config"]     # trainset configuration

# Networks already restored in this process, keyed by checkpoint path, so the
# model is built and its weights are read from disk only once per checkpoint.
_MODEL_CACHE = {}


def _load_model(ckpt_path):
    """Return a CleanUNet in eval mode restored from ckpt_path (cached per path)."""
    net = _MODEL_CACHE.get(ckpt_path)
    if net is None:
        # predefine model
        net = CleanUNet(**network_config)
        print_size(net)

        # load checkpoint
        # NOTE: torch.load unpickles the file; only use trusted checkpoints.
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        net.load_state_dict(checkpoint['model_state_dict'])
        net.eval()

        _MODEL_CACHE[ckpt_path] = net
    return net


def denoise(filename, ckpt_path = CHECKPOINT, out = "out.wav"):
    """
    Denoise one audio file and write the enhanced audio to disk.

    Parameters:
    filename (str):     path of the noisy audio file; it is loaded and resampled
                        to SAMPLE_RATE by load_simple
    ckpt_path (str):    checkpoint to restore the network from; it must contain
                        a 'model_state_dict' entry. The restored network is
                        cached, so later calls with the same path reuse it
    out (str):          path the denoised audio is written to. The default is a
                        fixed file name, so callers that may overlap should pass
                        distinct paths

    Returns:
    str: the value of `out`

    Only the first entry along the leading dimension of the network output is
    written (presumably the first channel for multi-channel input -- confirm
    against `sampling` / CleanUNet).
    """

    # log the experiment path from the training config
    exp_path = train_config["exp_path"]
    print('exp_path:', exp_path)

    net = _load_model(ckpt_path)

    # inference
    noisy_audio = load_simple(filename)

    with torch.no_grad():
        # Only request CUDA autocast when CUDA exists; asking for it on a
        # CPU-only host just emits a warning and disables itself.
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            generated_audio = sampling(net, noisy_audio)
        generated_audio = generated_audio[0].squeeze().cpu().numpy()

    # Writing the file needs neither the no_grad nor the autocast context.
    sf.write(out, np.ravel(generated_audio), SAMPLE_RATE)

    return out

# Gradio front end: one audio file in, one denoised audio file out. Both
# components exchange file paths, which is what denoise() takes and returns.
title = "Speech Denoising in the Waveform Domain with Self-Attention from Nvidia"

audio = gr.inputs.Audio(type = 'filepath', label = "Audio to denoise")
outputs = gr.outputs.Audio(type = 'filepath', label = "Denoised audio")
inputs = [audio]

# The interface is built and launched as soon as this module is executed.
gr.Interface(
    fn=denoise,
    inputs=inputs,
    outputs=outputs,
    title=title,
    enable_queue=True,
).launch()