Spaces: Runtime error
Ahsen Khaliq committed
Commit • a32e829
1 Parent(s): 0e4103d
Update demo_cli.py

demo_cli.py CHANGED (+37 -37)
@@ -82,45 +82,45 @@ if __name__ == '__main__':
 
 
     ## Run a test
-    print("Testing your configuration with small inputs.")
-    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # sampling rate, which may differ.
-    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # to an audio of 1 second.
-    print(" Testing the encoder...")
-    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
+    # print("Testing your configuration with small inputs.")
+    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
+    # # sampling rate, which may differ.
+    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
+    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
+    # # The sampling rate is the number of values (samples) recorded per second, it is set to
+    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
+    # # to an audio of 1 second.
+    # print(" Testing the encoder...")
+    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
 
-    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # possible.
-    embed = np.random.rand(speaker_embedding_size)
-    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # embeddings it will be).
-    embed /= np.linalg.norm(embed)
-    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # illustrate that
-    embeds = [embed, np.zeros(speaker_embedding_size)]
-    texts = ["test 1", "test 2"]
-    print(" Testing the synthesizer... (loading the model will output a lot of text)")
-    mels = synthesizer.synthesize_spectrograms(texts, embeds)
+    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
+    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
+    # # possible.
+    # embed = np.random.rand(speaker_embedding_size)
+    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
+    # # embeddings it will be).
+    # embed /= np.linalg.norm(embed)
+    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
+    # # illustrate that
+    # embeds = [embed, np.zeros(speaker_embedding_size)]
+    # texts = ["test 1", "test 2"]
+    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
+    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
 
-    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # can concatenate the mel spectrograms to a single one.
-    mel = np.concatenate(mels, axis=1)
-    # The vocoder can take a callback function to display the generation. More on that later. For
-    # now we'll simply hide it like this:
-    no_action = lambda *args: None
-    print(" Testing the vocoder...")
-    # For the sake of making this test short, we'll pass a short target length. The target length
-    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # that has a detrimental effect on the quality of the audio. The default parameters are
-    # recommended in general.
-    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
+    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
+    # # can concatenate the mel spectrograms to a single one.
+    # mel = np.concatenate(mels, axis=1)
+    # # The vocoder can take a callback function to display the generation. More on that later. For
+    # # now we'll simply hide it like this:
+    # no_action = lambda *args: None
+    # print(" Testing the vocoder...")
+    # # For the sake of making this test short, we'll pass a short target length. The target length
+    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
+    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
+    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
+    # # that has a detrimental effect on the quality of the audio. The default parameters are
+    # # recommended in general.
+    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
 
     print("All test passed! You can now synthesize speech.\n\n")
 
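For anyone who still wants the sanity check this commit disables (for example when running demo_cli.py locally rather than in the Space), the commented-out block lifts cleanly into a helper. A minimal sketch, assuming the encoder, synthesizer, and vocoder objects and the speaker_embedding_size value that the rest of demo_cli.py sets up (not shown in this diff); run_config_test is a hypothetical name, and only numpy is imported here:

import numpy as np

def run_config_test(encoder, synthesizer, vocoder, speaker_embedding_size):
    # One second of silence: an array of <sampling_rate> zero samples.
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Dummy speaker embedding; a real one would come from embed_utterance.
    # Embeddings are L2-normalized, hence the division by the norm.
    embed = np.random.rand(speaker_embedding_size)
    embed /= np.linalg.norm(embed)

    # The synthesizer batches several (text, embedding) pairs in one call.
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # Concatenate the mel spectrograms along the time axis (axis=1) so the
    # vocoder generates a single waveform.
    mel = np.concatenate(mels, axis=1)

    # Deliberately tiny target/overlap values keep the test fast at the cost
    # of audio quality; the defaults are recommended for real synthesis.
    no_action = lambda *args: None
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

Calling run_config_test(...) once the three models are loaded reproduces the old behavior without keeping the block inline in demo_cli.py.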
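The target/overlap comment in the removed block is plain sample arithmetic: at 16000 Hz, a target of 8000 samples is 8000 / 16000 = 0.5 seconds per chunk. A small self-contained illustration of that idea (the boundary logic and the overlap value below are hypothetical, not the vocoder's actual internal slicing):

def chunk_bounds(n_samples, target, overlap):
    # Split n_samples into windows of `target` samples whose start positions
    # advance by target - overlap, so neighbouring windows share `overlap`
    # samples.
    step = target - overlap
    return [(start, min(start + target, n_samples))
            for start in range(0, n_samples, step)]

# One second of 16 kHz audio with 0.5 s chunks and an illustrative overlap:
print(chunk_bounds(16000, 8000, 800))
# [(0, 8000), (7200, 15200), (14400, 16000)]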