Spaces:

innnky
/

soft-vits-singingvc

Runtime error

App Files Files Community

rcell committed on Sep 7, 2022

Commit

422853a

1 Parent(s): 2fba440

debug

Browse files

Files changed (5) hide show

app.py +17 -13
configs/vctk_base.json +4 -4
data_utils.py +106 -79
filelists/train_sing_mul.txt +0 -0
filelists/val_sing_mul.txt +4 -0

app.py CHANGED Viewed

@@ -38,17 +38,19 @@ def get_text(text, hps):
 hps = utils.get_hparams_from_file("configs/ljs_base.json")
-net_g = SynthesizerTrn(
     len(symbols),
-    hps.data.filter_length // 2 + 1,
-    hps.train.segment_size // hps.data.hop_length,
-    **hps.model)
 import numpy as np
 hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
-_ = utils.load_checkpoint("G_88000.pth", net_g, None)
 def vc_fn(input_audio,vc_transform):
     if input_audio is None:
@@ -64,21 +66,23 @@ def vc_fn(input_audio,vc_transform):
     if sampling_rate != 16000:
         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
-    audio22050 = librosa.resample(audio, orig_sr=sampling_rate, target_sr=22050)
     f0 = convert_wav_22050_to_f0(audio22050)
     source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
     print(source.shape)
     with torch.inference_mode():
         units = hubert.units(source)
-        f0 = resize2d(f0, len(units[:, 0])) * vc_transform
-        units[:, 0] = f0 / 10
-    stn_tst = torch.FloatTensor(units.squeeze(0))
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
-        audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.1, noise_scale_w=0.1, length_scale=1)[0][
             0, 0].data.float().numpy()
     return "Success", (hps.data.sampling_rate, audio)
@@ -90,7 +94,7 @@ with app:
     with gr.Tabs():
         with gr.TabItem("Basic"):
             vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
-            vc_transform = gr.Number(label="transform")
             vc_submit = gr.Button("Convert", variant="primary")
             vc_output1 = gr.Textbox(label="Output Message")
             vc_output2 = gr.Audio(label="Output Audio")

 hps = utils.get_hparams_from_file("configs/ljs_base.json")
+hps_ms = utils.get_hparams_from_file("configs/vctk_base.json")
# Build the multi-speaker synthesizer. Every dimension is taken from the
# multi-speaker config (hps_ms) so the network matches the checkpoint that is
# loaded into it.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    # Use the hop length from the same config as the segment size; the
    # previous code divided by hps.data.hop_length, which belongs to a
    # different config file.
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)
 import numpy as np
 hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
+_ = utils.load_checkpoint("G_312000.pth", net_g_ms, None)
 def vc_fn(input_audio,vc_transform):
     if input_audio is None:
     if sampling_rate != 16000:
         audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+    audio22050 = librosa.resample(audio, orig_sr=16000, target_sr=22050)
     f0 = convert_wav_22050_to_f0(audio22050)
     source = torch.FloatTensor(audio).unsqueeze(0).unsqueeze(0)
     print(source.shape)
     with torch.inference_mode():
         units = hubert.units(source)
+        soft = units.squeeze(0).numpy()
+        print(sampling_rate)
+        f0 = resize2d(f0, len(soft[:, 0])) * vc_transform
+        soft[:, 0] = f0 / 10
+    sid = torch.LongTensor([0])
+    stn_tst = torch.FloatTensor(soft)
     with torch.no_grad():
         x_tst = stn_tst.unsqueeze(0)
         x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        audio = net_g_ms.infer(x_tst, x_tst_lengths,sid=sid, noise_scale=0.1, noise_scale_w=0.1, length_scale=1)[0][
             0, 0].data.float().numpy()
     return "Success", (hps.data.sampling_rate, audio)
     with gr.Tabs():
         with gr.TabItem("Basic"):
             vc_input3 = gr.Audio(label="Input Audio (30s limitation)")
+            vc_transform = gr.Number(label="transform",value=1.0)
             vc_submit = gr.Button("Convert", variant="primary")
             vc_output1 = gr.Textbox(label="Output Message")
             vc_output2 = gr.Audio(label="Output Audio")

configs/vctk_base.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "train": {
     "log_interval": 100,
-    "eval_interval": 3000,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 2e-4,
@@ -17,8 +17,8 @@
     "c_kl": 1.0
   },
   "data": {
-    "training_files":"filelists/train_mul.txt",
-    "validation_files":"filelists/val_mul.txt",
     "text_cleaners":["english_cleaners2"],
     "max_wav_value": 32768.0,
     "sampling_rate": 22050,
@@ -29,7 +29,7 @@
     "mel_fmin": 0.0,
     "mel_fmax": null,
     "add_blank": true,
-    "n_speakers": 7,
     "cleaned_text": true
   },
   "model": {

 {
   "train": {
     "log_interval": 100,
+    "eval_interval": 2000,
     "seed": 1234,
     "epochs": 10000,
     "learning_rate": 2e-4,
     "c_kl": 1.0
   },
   "data": {
+    "training_files":"filelists/train_sing_mul.txt",
+    "validation_files":"filelists/val_sing_mul.txt",
     "text_cleaners":["english_cleaners2"],
     "max_wav_value": 32768.0,
     "sampling_rate": 22050,
     "mel_fmin": 0.0,
     "mel_fmax": null,
     "add_blank": true,
+    "n_speakers": 2,
     "cleaned_text": true
   },
   "model": {

data_utils.py CHANGED Viewed

@@ -5,27 +5,35 @@ import numpy as np
 import torch
 import torch.utils.data
 import numpy as np
-import commons
 from mel_processing import spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 from text import text_to_sequence, cleaned_text_to_sequence
 class TextAudioLoader(torch.utils.data.Dataset):
     """
         1) loads audio, text pairs
         2) normalizes text and converts them to sequences of integers
         3) computes spectrograms from audio files.
     """
     def __init__(self, audiopaths_and_text, hparams):
         self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
-        self.text_cleaners  = hparams.text_cleaners
-        self.max_wav_value  = hparams.max_wav_value
-        self.sampling_rate  = hparams.sampling_rate
-        self.filter_length  = hparams.filter_length
-        self.hop_length     = hparams.hop_length
-        self.win_length     = hparams.win_length
-        self.sampling_rate  = hparams.sampling_rate
         self.cleaned_text = getattr(hparams, "cleaned_text", False)
@@ -37,7 +45,6 @@ class TextAudioLoader(torch.utils.data.Dataset):
         random.shuffle(self.audiopaths_and_text)
         self._filter()
     def _filter(self):
         """
         Filter text & store spec lengths
@@ -74,8 +81,8 @@ class TextAudioLoader(torch.utils.data.Dataset):
             spec = torch.load(spec_filename)
         else:
             spec = spectrogram_torch(audio_norm, self.filter_length,
-                self.sampling_rate, self.hop_length, self.win_length,
-                center=False)
             spec = torch.squeeze(spec, 0)
             torch.save(spec, spec_filename)
         return spec, audio_norm
@@ -88,8 +95,14 @@ class TextAudioLoader(torch.utils.data.Dataset):
         # if self.add_blank:
         #     text_norm = commons.intersperse(text_norm, 0)
         # text_norm = torch.LongTensor(text_norm)
         soft = np.load(text)
         text_norm = torch.FloatTensor(soft)
         return text_norm
@@ -103,6 +116,7 @@ class TextAudioLoader(torch.utils.data.Dataset):
 class TextAudioCollate():
     """ Zero-pads model inputs and targets
     """
     def __init__(self, return_ids=False):
         self.return_ids = return_ids
@@ -135,7 +149,7 @@ class TextAudioCollate():
             row = batch[ids_sorted_decreasing[i]]
             text = row[0]
-            text_padded[i, :text.size(0),:] = text
             text_lengths[i] = text.size(0)
             spec = row[1]
@@ -152,21 +166,24 @@ class TextAudioCollate():
 """Multi speaker version"""
 class TextAudioSpeakerLoader(torch.utils.data.Dataset):
     """
         1) loads audio, speaker_id, text pairs
         2) normalizes text and converts them to sequences of integers
         3) computes spectrograms from audio files.
     """
     def __init__(self, audiopaths_sid_text, hparams):
         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
-        self.filter_length  = hparams.filter_length
-        self.hop_length     = hparams.hop_length
-        self.win_length     = hparams.win_length
-        self.sampling_rate  = hparams.sampling_rate
         self.cleaned_text = getattr(hparams, "cleaned_text", False)
@@ -215,15 +232,23 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
             spec = torch.load(spec_filename)
         else:
             spec = spectrogram_torch(audio_norm, self.filter_length,
-                self.sampling_rate, self.hop_length, self.win_length,
-                center=False)
             spec = torch.squeeze(spec, 0)
             torch.save(spec, spec_filename)
         return spec, audio_norm
     def get_text(self, text):
         soft = np.load(text)
         text_norm = torch.FloatTensor(soft)
         return text_norm
@@ -241,6 +266,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
 class TextAudioSpeakerCollate():
     """ Zero-pads model inputs and targets
     """
     def __init__(self, return_ids=False):
         self.return_ids = return_ids
@@ -297,20 +323,21 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
     Maintain similar input lengths in a batch.
     Length groups are specified by boundaries.
     Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
     It removes samples which are not included in the boundaries.
     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
     """
     def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
         self.lengths = dataset.lengths
         self.batch_size = batch_size
         self.boundaries = boundaries
         self.buckets, self.num_samples_per_bucket = self._create_buckets()
         self.total_size = sum(self.num_samples_per_bucket)
         self.num_samples = self.total_size // self.num_replicas
     def _create_buckets(self):
         buckets = [[] for _ in range(len(self.boundaries) - 1)]
         for i in range(len(self.lengths)):
@@ -318,12 +345,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
             idx_bucket = self._bisect(length)
             if idx_bucket != -1:
                 buckets[idx_bucket].append(i)
         for i in range(len(buckets) - 1, 0, -1):
             if len(buckets[i]) == 0:
                 buckets.pop(i)
-                self.boundaries.pop(i+1)
         num_samples_per_bucket = []
         for i in range(len(buckets)):
             len_bucket = len(buckets[i])
@@ -331,61 +358,61 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
             rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
             num_samples_per_bucket.append(len_bucket + rem)
         return buckets, num_samples_per_bucket
     def __iter__(self):
-      # deterministically shuffle based on epoch
-      g = torch.Generator()
-      g.manual_seed(self.epoch)
-      indices = []
-      if self.shuffle:
-          for bucket in self.buckets:
-              indices.append(torch.randperm(len(bucket), generator=g).tolist())
-      else:
-          for bucket in self.buckets:
-              indices.append(list(range(len(bucket))))
-      batches = []
-      for i in range(len(self.buckets)):
-          bucket = self.buckets[i]
-          len_bucket = len(bucket)
-          ids_bucket = indices[i]
-          num_samples_bucket = self.num_samples_per_bucket[i]
-          # add extra samples to make it evenly divisible
-          rem = num_samples_bucket - len_bucket
-          ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
-          # subsample
-          ids_bucket = ids_bucket[self.rank::self.num_replicas]
-          # batching
-          for j in range(len(ids_bucket) // self.batch_size):
-              batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
-              batches.append(batch)
-      if self.shuffle:
-          batch_ids = torch.randperm(len(batches), generator=g).tolist()
-          batches = [batches[i] for i in batch_ids]
-      self.batches = batches
-      assert len(self.batches) * self.batch_size == self.num_samples
-      return iter(self.batches)
     def _bisect(self, x, lo=0, hi=None):
-      if hi is None:
-          hi = len(self.boundaries) - 1
-      if hi > lo:
-          mid = (hi + lo) // 2
-          if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
-              return mid
-          elif x <= self.boundaries[mid]:
-              return self._bisect(x, lo, mid)
-          else:
-              return self._bisect(x, mid + 1, hi)
-      else:
-          return -1
     def __len__(self):
         return self.num_samples // self.batch_size

 import torch
 import torch.utils.data
 import numpy as np
+import commons
 from mel_processing import spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 from text import text_to_sequence, cleaned_text_to_sequence
def dropout1d(myarray, ratio=0.5):
    """Zero out a random subset of the elements of ``myarray`` in place.

    ``int(myarray.size * ratio)`` distinct element positions are drawn without
    replacement from NumPy's global random state and set to 0.

    Args:
        myarray: NumPy array to modify. It is mutated in place. Positions are
            chosen over the flattened element order, so arrays with more than
            one dimension are handled as well.
        ratio: Fraction of the elements to zero out.

    Returns:
        The same ``myarray`` object, after modification.
    """
    n_drop = int(myarray.size * ratio)
    indices = np.random.choice(myarray.size, replace=False, size=n_drop)
    # ``.flat`` addresses individual elements, so the flat positions drawn
    # above stay valid for multi-dimensional input instead of being read as
    # indices along the first axis.
    myarray.flat[indices] = 0
    return myarray
 class TextAudioLoader(torch.utils.data.Dataset):
     """
         1) loads audio, text pairs
         2) normalizes text and converts them to sequences of integers
         3) computes spectrograms from audio files.
     """
     def __init__(self, audiopaths_and_text, hparams):
         self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+        self.text_cleaners = hparams.text_cleaners
+        self.max_wav_value = hparams.max_wav_value
+        self.sampling_rate = hparams.sampling_rate
+        self.filter_length = hparams.filter_length
+        self.hop_length = hparams.hop_length
+        self.win_length = hparams.win_length
+        self.sampling_rate = hparams.sampling_rate
         self.cleaned_text = getattr(hparams, "cleaned_text", False)
         random.shuffle(self.audiopaths_and_text)
         self._filter()
     def _filter(self):
         """
         Filter text & store spec lengths
             spec = torch.load(spec_filename)
         else:
             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                     self.sampling_rate, self.hop_length, self.win_length,
+                                     center=False)
             spec = torch.squeeze(spec, 0)
             torch.save(spec, spec_filename)
         return spec, audio_norm
         # if self.add_blank:
         #     text_norm = commons.intersperse(text_norm, 0)
         # text_norm = torch.LongTensor(text_norm)
         soft = np.load(text)
+        # # Add F0 information
+        # head, rear = text.split(".")
+        # f0 = np.load(head+".f0."+rear)
+        # soft[:,0] = f0/10
         text_norm = torch.FloatTensor(soft)
         return text_norm
 class TextAudioCollate():
     """ Zero-pads model inputs and targets
     """
     def __init__(self, return_ids=False):
         self.return_ids = return_ids
             row = batch[ids_sorted_decreasing[i]]
             text = row[0]
+            text_padded[i, :text.size(0), :] = text
             text_lengths[i] = text.size(0)
             spec = row[1]
 """Multi speaker version"""
 class TextAudioSpeakerLoader(torch.utils.data.Dataset):
     """
         1) loads audio, speaker_id, text pairs
         2) normalizes text and converts them to sequences of integers
         3) computes spectrograms from audio files.
     """
     def __init__(self, audiopaths_sid_text, hparams):
         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
+        self.filter_length = hparams.filter_length
+        self.hop_length = hparams.hop_length
+        self.win_length = hparams.win_length
+        self.sampling_rate = hparams.sampling_rate
         self.cleaned_text = getattr(hparams, "cleaned_text", False)
             spec = torch.load(spec_filename)
         else:
             spec = spectrogram_torch(audio_norm, self.filter_length,
+                                     self.sampling_rate, self.hop_length, self.win_length,
+                                     center=False)
             spec = torch.squeeze(spec, 0)
             torch.save(spec, spec_filename)
         return spec, audio_norm
     def get_text(self, text):
         """Load the unit features for one utterance and write its F0 into them.

         Args:
             text: Path to a ``.npy`` file holding the feature array. The F0
                 track is read from the sibling file obtained by inserting
                 ``.f0`` before the extension (``foo.npy`` -> ``foo.f0.npy``).

         Returns:
             A ``torch.FloatTensor`` built from the feature array, with column
             0 replaced by ``f0 / 10``.

         Raises:
             ValueError: If ``text`` contains no ``.`` to split on.
         """
         soft = np.load(text)
         # Split on the last dot only, so dots elsewhere in the path (a leading
         # "./" or a directory such as "v1.0") do not break the unpacking.
         head, rear = text.rsplit(".", 1)
         f0 = np.load(head + ".f0." + rear)
         # In roughly 30% of the calls, zero out 60% of the F0 values
         # (presumably a training-time augmentation -- confirm with the author).
         if random.random() < 0.3:
             f0 = dropout1d(f0, 0.6)
         # Column 0 of the features carries the scaled F0 instead of its
         # original content.
         soft[:, 0] = f0 / 10
         text_norm = torch.FloatTensor(soft)
         return text_norm
 class TextAudioSpeakerCollate():
     """ Zero-pads model inputs and targets
     """
     def __init__(self, return_ids=False):
         self.return_ids = return_ids
     Maintain similar input lengths in a batch.
     Length groups are specified by boundaries.
     Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
     It removes samples which are not included in the boundaries.
     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
     """
     def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
         """Set up length-bucketed sampling over ``dataset``.

         Args:
             dataset: Dataset to sample from; it must expose a ``lengths``
                 attribute, which is read here.
             batch_size: Number of samples per batch.
             boundaries: List of length boundaries that delimit the buckets.
             num_replicas: Forwarded to the parent sampler.
             rank: Forwarded to the parent sampler.
             shuffle: Forwarded to the parent sampler.
         """
         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
         self.lengths = dataset.lengths
         self.batch_size = batch_size
         # NOTE(review): the list is stored by reference and _create_buckets()
         # may pop entries from it, so the caller's list can be modified.
         self.boundaries = boundaries
         # Must run after lengths/batch_size/boundaries are set, since
         # _create_buckets() reads them from self.
         self.buckets, self.num_samples_per_bucket = self._create_buckets()
         self.total_size = sum(self.num_samples_per_bucket)
         # Per-replica share of the padded total.
         self.num_samples = self.total_size // self.num_replicas
     def _create_buckets(self):
         buckets = [[] for _ in range(len(self.boundaries) - 1)]
         for i in range(len(self.lengths)):
             idx_bucket = self._bisect(length)
             if idx_bucket != -1:
                 buckets[idx_bucket].append(i)
         for i in range(len(buckets) - 1, 0, -1):
             if len(buckets[i]) == 0:
                 buckets.pop(i)
+                self.boundaries.pop(i + 1)
         num_samples_per_bucket = []
         for i in range(len(buckets)):
             len_bucket = len(buckets[i])
             rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
             num_samples_per_bucket.append(len_bucket + rem)
         return buckets, num_samples_per_bucket
     def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        indices = []
+        if self.shuffle:
+            for bucket in self.buckets:
+                indices.append(torch.randperm(len(bucket), generator=g).tolist())
+        else:
+            for bucket in self.buckets:
+                indices.append(list(range(len(bucket))))
+        batches = []
+        for i in range(len(self.buckets)):
+            bucket = self.buckets[i]
+            len_bucket = len(bucket)
+            ids_bucket = indices[i]
+            num_samples_bucket = self.num_samples_per_bucket[i]
+            # add extra samples to make it evenly divisible
+            rem = num_samples_bucket - len_bucket
+            ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
+            # subsample
+            ids_bucket = ids_bucket[self.rank::self.num_replicas]
+            # batching
+            for j in range(len(ids_bucket) // self.batch_size):
+                batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
+                batches.append(batch)
+        if self.shuffle:
+            batch_ids = torch.randperm(len(batches), generator=g).tolist()
+            batches = [batches[i] for i in batch_ids]
+        self.batches = batches
+        assert len(self.batches) * self.batch_size == self.num_samples
+        return iter(self.batches)
     def _bisect(self, x, lo=0, hi=None):
+        if hi is None:
+            hi = len(self.boundaries) - 1
+        if hi > lo:
+            mid = (hi + lo) // 2
+            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
+                return mid
+            elif x <= self.boundaries[mid]:
+                return self._bisect(x, lo, mid)
+            else:
+                return self._bisect(x, mid + 1, hi)
+        else:
+            return -1
     def __len__(self):
         return self.num_samples // self.batch_size

filelists/train_sing_mul.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

filelists/val_sing_mul.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+/content/cpop/wavs/dev/2001000003.wav|1|/content/cpop/soft/dev/2001000003.npy
+/content/cpop/wavs/dev/2002000055.wav|1|/content/cpop/soft/dev/2002000055.npy
+/content/cpop/wavs/dev/2001000002.wav|1|/content/cpop/soft/dev/2001000002.npy
+/content/cpop/wavs/dev/2001000001.wav|1|/content/cpop/soft/dev/2001000001.npy