laiwu committed on
Commit
c35279b
·
1 Parent(s): cf817d2

Update files for Liyuu model.

Browse files
Files changed (3) hide show
  1. app.py +5 -5
  2. configs/config.json +92 -63
  3. logs/44k/kmeans_10000.pt +0 -3
app.py CHANGED
@@ -14,7 +14,7 @@ logging.getLogger('markdown_it').setLevel(logging.WARNING)
14
  logging.getLogger('urllib3').setLevel(logging.WARNING)
15
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
16
 
17
- model = Svc("logs/44k/G_114400.pth", "configs/config.json", cluster_model_path="logs/44k/kmeans_10000.pt")
18
 
19
 
20
 
@@ -40,7 +40,7 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, noise_scale):
40
  auto_predict_f0=auto_f0,
41
  noice_scale=noise_scale
42
  )
43
- return "Success", (44100, out_audio.numpy())
44
 
45
 
46
  app = gr.Blocks()
@@ -48,12 +48,12 @@ with app:
48
  with gr.Tabs():
49
  with gr.TabItem("Basic"):
50
  gr.Markdown(value="""
51
- sovits4.0 在线demo
52
 
53
- 此demo为预训练底模在线demo,使用数据:云灏 即霜 辉宇·星AI 派蒙 绫地宁宁
54
  """)
55
  spks = list(model.spk2id.keys())
56
- sid = gr.Dropdown(label="音色", choices=["nen", "yunhao","paimon", "huiyu","jishuang"], value="yunhao")
57
  vc_input3 = gr.Audio(label="上传音频(长度小于45秒)")
58
  vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
59
  cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
 
14
  logging.getLogger('urllib3').setLevel(logging.WARNING)
15
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
16
 
17
+ model = Svc("logs/44k/G_97600.pth", "configs/config.json", cluster_model_path="logs/44k/kmeans_10000.pt")
18
 
19
 
20
 
 
40
  auto_predict_f0=auto_f0,
41
  noice_scale=noise_scale
42
  )
43
+ return "Success", (44100, out_audio.cpu().numpy())
44
 
45
 
46
  app = gr.Blocks()
 
48
  with gr.Tabs():
49
  with gr.TabItem("Basic"):
50
  gr.Markdown(value="""
51
+ Liyuu sovits4.0 在线demo
52
 
53
+ 使用中文直播录音训练,目前未包含聚类模型(咕咕咕)
54
  """)
55
  spks = list(model.spk2id.keys())
56
+ sid = gr.Dropdown(label="音色", choices=["liyuu"], value="liyuu")
57
  vc_input3 = gr.Audio(label="上传音频(长度小于45秒)")
58
  vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
59
  cluster_ratio = gr.Number(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)", value=0)
configs/config.json CHANGED
@@ -1,64 +1,93 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "eval_interval": 800,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0001,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 6,
14
- "fp16_run": false,
15
- "lr_decay": 0.999875,
16
- "segment_size": 10240,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0,
21
- "use_sr": true,
22
- "max_speclen": 512,
23
- "port": "8001"
24
- },
25
- "data": {
26
- "training_files": "filelists/train.txt",
27
- "validation_files": "filelists/val.txt",
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 44100,
30
- "filter_length": 2048,
31
- "hop_length": 512,
32
- "win_length": 2048,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": 22050
36
- },
37
- "model": {
38
- "inter_channels": 192,
39
- "hidden_channels": 192,
40
- "filter_channels": 768,
41
- "n_heads": 2,
42
- "n_layers": 6,
43
- "kernel_size": 3,
44
- "p_dropout": 0.1,
45
- "resblock": "1",
46
- "resblock_kernel_sizes": [3,7,11],
47
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
48
- "upsample_rates": [ 8, 8, 2, 2, 2],
49
- "upsample_initial_channel": 512,
50
- "upsample_kernel_sizes": [16,16, 4, 4, 4],
51
- "n_layers_q": 3,
52
- "use_spectral_norm": false,
53
- "gin_channels": 256,
54
- "ssl_dim": 256,
55
- "n_speakers": 200
56
- },
57
- "spk": {
58
- "jishuang": 0,
59
- "huiyu": 1,
60
- "nen": 2,
61
- "paimon": 3,
62
- "yunhao": 4
63
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  }
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 0
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/train.txt",
28
+ "validation_files": "filelists/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 200
89
+ },
90
+ "spk": {
91
+ "liyuu": 0
92
+ }
93
  }
logs/44k/kmeans_10000.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1d6ee9c62e091257a4e68d3d2b5c593a215fc503bab7271cae772fac35345b2
3
- size 77120889