Russell1123213123 commited on
Commit
eefa761
1 Parent(s): 31a24b8

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +4 -0
  3. .gitignore +13 -0
  4. .idea/.gitignore +3 -0
  5. .idea/OpenVoice.iml +12 -0
  6. .idea/inspectionProfiles/Project_Default.xml +16 -0
  7. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  8. .idea/misc.xml +7 -0
  9. .idea/modules.xml +8 -0
  10. .idea/vcs.xml +6 -0
  11. .idea/workspace.xml +81 -0
  12. .ipynb_checkpoints/demo_part1-checkpoint.ipynb +399 -0
  13. .ipynb_checkpoints/demo_part3-checkpoint.ipynb +143 -0
  14. LICENSE +7 -0
  15. MyShell_OpenVoice.egg-info/PKG-INFO +105 -0
  16. MyShell_OpenVoice.egg-info/SOURCES.txt +25 -0
  17. MyShell_OpenVoice.egg-info/dependency_links.txt +1 -0
  18. MyShell_OpenVoice.egg-info/not-zip-safe +1 -0
  19. MyShell_OpenVoice.egg-info/requires.txt +14 -0
  20. MyShell_OpenVoice.egg-info/top_level.txt +1 -0
  21. README.md +76 -8
  22. app.py +71 -0
  23. checkpoints/base_speakers/.DS_Store +0 -0
  24. checkpoints/base_speakers/EN/checkpoint.pth +3 -0
  25. checkpoints/base_speakers/EN/config.json +145 -0
  26. checkpoints/base_speakers/EN/en_default_se.pth +3 -0
  27. checkpoints/base_speakers/EN/en_style_se.pth +3 -0
  28. checkpoints/base_speakers/ZH/checkpoint.pth +3 -0
  29. checkpoints/base_speakers/ZH/config.json +137 -0
  30. checkpoints/base_speakers/ZH/zh_default_se.pth +3 -0
  31. checkpoints/converter/checkpoint.pth +3 -0
  32. checkpoints/converter/config.json +57 -0
  33. checkpoints_v2/.DS_Store +0 -0
  34. checkpoints_v2/converter/checkpoint.pth +3 -0
  35. checkpoints_v2/converter/config.json +57 -0
  36. demo_part1.ipynb +401 -0
  37. demo_part2.ipynb +195 -0
  38. demo_part3.ipynb +256 -0
  39. docs/QA.md +39 -0
  40. docs/USAGE.md +83 -0
  41. openvoice/__init__.py +0 -0
  42. openvoice/__pycache__/__init__.cpython-39.pyc +0 -0
  43. openvoice/__pycache__/api.cpython-39.pyc +0 -0
  44. openvoice/__pycache__/attentions.cpython-39.pyc +0 -0
  45. openvoice/__pycache__/commons.cpython-39.pyc +0 -0
  46. openvoice/__pycache__/mel_processing.cpython-39.pyc +0 -0
  47. openvoice/__pycache__/models.cpython-39.pyc +0 -0
  48. openvoice/__pycache__/modules.cpython-39.pyc +0 -0
  49. openvoice/__pycache__/se_extractor.cpython-39.pyc +0 -0
  50. openvoice/__pycache__/transforms.cpython-39.pyc +0 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ outputs/output_chinese.wav filter=lfs diff=lfs merge=lfs -text
37
+ outputs/tmp.wav filter=lfs diff=lfs merge=lfs -text
38
+ processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav filter=lfs diff=lfs merge=lfs -text
39
+ processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ .ipynb_checkpoints/
3
+ processed
4
+ outputs
5
+ outputs_v2
6
+ checkpoints
7
+ checkpoints_v2
8
+ trash
9
+ examples*
10
+ .env
11
+ build
12
+ *.egg-info/
13
+ *.zip
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/OpenVoice.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
5
+ <option name="ignoredErrors">
6
+ <list>
7
+ <option value="N806" />
8
+ <option value="N802" />
9
+ <option value="N801" />
10
+ <option value="N813" />
11
+ <option value="N803" />
12
+ </list>
13
+ </option>
14
+ </inspection_tool>
15
+ </profile>
16
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (openvoice)" project-jdk-type="Python SDK" />
4
+ <component name="PyCharmProfessionalAdvertiser">
5
+ <option name="shown" value="true" />
6
+ </component>
7
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/OpenVoice.iml" filepath="$PROJECT_DIR$/.idea/OpenVoice.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="BranchesTreeState">
4
+ <expand>
5
+ <path>
6
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
7
+ <item name="LOCAL_ROOT" type="e8cecc67:BranchNodeDescriptor" />
8
+ </path>
9
+ <path>
10
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
11
+ <item name="REMOTE_ROOT" type="e8cecc67:BranchNodeDescriptor" />
12
+ </path>
13
+ <path>
14
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
15
+ <item name="REMOTE_ROOT" type="e8cecc67:BranchNodeDescriptor" />
16
+ <item name="GROUP_NODE:origin" type="e8cecc67:BranchNodeDescriptor" />
17
+ </path>
18
+ </expand>
19
+ <select />
20
+ </component>
21
+ <component name="ChangeListManager">
22
+ <list default="true" id="7380c043-d972-4774-8844-edb18bb79433" name="Default Changelist" comment="">
23
+ <change afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
24
+ <change beforePath="$PROJECT_DIR$/demo_part1.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/demo_part1.ipynb" afterDir="false" />
25
+ <change beforePath="$PROJECT_DIR$/demo_part3.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/demo_part3.ipynb" afterDir="false" />
26
+ </list>
27
+ <option name="SHOW_DIALOG" value="false" />
28
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
29
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
30
+ <option name="LAST_RESOLUTION" value="IGNORE" />
31
+ </component>
32
+ <component name="Git.Settings">
33
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
34
+ </component>
35
+ <component name="ProjectId" id="2idcySXwJyzz5B7FOKKvwBhhetW" />
36
+ <component name="ProjectViewState">
37
+ <option name="hideEmptyMiddlePackages" value="true" />
38
+ <option name="showLibraryContents" value="true" />
39
+ <option name="showMembers" value="true" />
40
+ </component>
41
+ <component name="PropertiesComponent">
42
+ <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
43
+ </component>
44
+ <component name="RecentsManager">
45
+ <key name="MoveFile.RECENT_KEYS">
46
+ <recent name="$PROJECT_DIR$" />
47
+ </key>
48
+ </component>
49
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
50
+ <component name="TaskManager">
51
+ <task active="true" id="Default" summary="Default task">
52
+ <changelist id="7380c043-d972-4774-8844-edb18bb79433" name="Default Changelist" comment="" />
53
+ <created>1719824154394</created>
54
+ <option name="number" value="Default" />
55
+ <option name="presentableId" value="Default" />
56
+ <updated>1719824154394</updated>
57
+ </task>
58
+ <servers />
59
+ </component>
60
+ <component name="Vcs.Log.Tabs.Properties">
61
+ <option name="TAB_STATES">
62
+ <map>
63
+ <entry key="MAIN">
64
+ <value>
65
+ <State />
66
+ </value>
67
+ </entry>
68
+ </map>
69
+ </option>
70
+ </component>
71
+ <component name="WindowStateProjectService">
72
+ <state x="1186" y="315" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1719910419213">
73
+ <screen x="0" y="25" width="1920" height="986" />
74
+ </state>
75
+ <state x="1186" y="315" key="#com.intellij.fileTypes.FileTypeChooser/0.25.1920.986/[email protected]" timestamp="1719910419213" />
76
+ <state x="1076" y="263" width="670" height="676" key="search.everywhere.popup" timestamp="1719914571586">
77
+ <screen x="0" y="25" width="1920" height="986" />
78
+ </state>
79
+ <state x="1076" y="263" width="670" height="676" key="search.everywhere.popup/0.25.1920.986/[email protected]" timestamp="1719914571586" />
80
+ </component>
81
+ </project>
.ipynb_checkpoints/demo_part1-checkpoint.ipynb ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Voice Style Control Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 19,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import torch\n",
20
+ "from openvoice import se_extractor\n",
21
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "15116b59",
27
+ "metadata": {},
28
+ "source": [
29
+ "### Initialization"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 20,
35
+ "id": "aacad912",
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "name": "stdout",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
43
+ "missing/unexpected keys: [] []\n",
44
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
45
+ "missing/unexpected keys: [] []\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
51
+ "ckpt_converter = 'checkpoints/converter'\n",
52
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
53
+ "output_dir = 'outputs'\n",
54
+ "\n",
55
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
56
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
57
+ "\n",
58
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
59
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
60
+ "\n",
61
+ "os.makedirs(output_dir, exist_ok=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "id": "7f67740c",
67
+ "metadata": {},
68
+ "source": [
69
+ "### Obtain Tone Color Embedding"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "markdown",
74
+ "id": "f8add279",
75
+ "metadata": {},
76
+ "source": [
77
+ "The `source_se` is the tone color embedding of the base speaker. \n",
78
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
79
+ "the readers feel free to extract `source_se` by themselves."
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 21,
85
+ "id": "63ff6273",
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "4f71fcc3",
95
+ "metadata": {},
96
+ "source": [
97
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 22,
103
+ "id": "55105eae",
104
+ "metadata": {},
105
+ "outputs": [
106
+ {
107
+ "name": "stdout",
108
+ "output_type": "stream",
109
+ "text": [
110
+ "OpenVoice version: v1\n",
111
+ "[(0.0, 19.278375)]\n",
112
+ "after vad: dur = 19.27798185941043\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
118
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "id": "a40284aa",
124
+ "metadata": {},
125
+ "source": [
126
+ "### Inference"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 23,
132
+ "id": "73dc1259",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ " > Text splitted to sentences.\n",
140
+ "This audio is generated by OpenVoice.\n",
141
+ " > ===========================\n",
142
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
143
+ " length:45\n",
144
+ " length:45\n"
145
+ ]
146
+ }
147
+ ],
148
+ "source": [
149
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
150
+ "\n",
151
+ "# Run the base speaker tts\n",
152
+ "text = \"This audio is generated by OpenVoice.\"\n",
153
+ "src_path = f'{output_dir}/tmp.wav'\n",
154
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
155
+ "\n",
156
+ "# Run the tone color converter\n",
157
+ "encode_message = \"@MyShell\"\n",
158
+ "tone_color_converter.convert(\n",
159
+ " audio_src_path=src_path, \n",
160
+ " src_se=source_se, \n",
161
+ " tgt_se=target_se, \n",
162
+ " output_path=save_path,\n",
163
+ " message=encode_message)"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "markdown",
168
+ "id": "6e3ea28a",
169
+ "metadata": {},
170
+ "source": [
171
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 24,
177
+ "id": "fd022d38",
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "name": "stdout",
182
+ "output_type": "stream",
183
+ "text": [
184
+ " > Text splitted to sentences.\n",
185
+ "This audio is generated by OpenVoice.\n",
186
+ " > ===========================\n",
187
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
188
+ " length:45\n",
189
+ " length:45\n"
190
+ ]
191
+ }
192
+ ],
193
+ "source": [
194
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
195
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
196
+ "\n",
197
+ "# Run the base speaker tts\n",
198
+ "text = \"This audio is generated by OpenVoice.\"\n",
199
+ "src_path = f'{output_dir}/tmp.wav'\n",
200
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
201
+ "\n",
202
+ "# Run the tone color converter\n",
203
+ "encode_message = \"@MyShell\"\n",
204
+ "tone_color_converter.convert(\n",
205
+ " audio_src_path=src_path, \n",
206
+ " src_se=source_se, \n",
207
+ " tgt_se=target_se, \n",
208
+ " output_path=save_path,\n",
209
+ " message=encode_message)"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "markdown",
214
+ "id": "5fcfc70b",
215
+ "metadata": {},
216
+ "source": [
217
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 28,
223
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
231
+ "missing/unexpected keys: [] []\n"
232
+ ]
233
+ }
234
+ ],
235
+ "source": [
236
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
237
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
238
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
239
+ "\n",
240
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
241
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 30,
247
+ "id": "a71d1387",
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "name": "stdout",
252
+ "output_type": "stream",
253
+ "text": [
254
+ " > Text splitted to sentences.\n",
255
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
256
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
257
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
258
+ "上半身整体为灰色, 下半身为灰白色,\n",
259
+ "臀部和腋羽是十分显眼的白色.\n",
260
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
261
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
262
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
263
+ " > ===========================\n",
264
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
265
+ " length:199\n",
266
+ " length:197\n",
267
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
268
+ " length:100\n",
269
+ " length:100\n",
270
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
271
+ " length:83\n",
272
+ " length:83\n",
273
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
274
+ " length:80\n",
275
+ " length:80\n",
276
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
277
+ " length:63\n",
278
+ " length:63\n",
279
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
280
+ " length:85\n",
281
+ " length:83\n",
282
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
283
+ " length:79\n",
284
+ " length:79\n",
285
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
286
+ " length:111\n",
287
+ " length:109\n"
288
+ ]
289
+ },
290
+ {
291
+ "ename": "TypeError",
292
+ "evalue": "unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'",
293
+ "output_type": "error",
294
+ "traceback": [
295
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
296
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
297
+ "Cell \u001b[0;32mIn[30], line 20\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# 记录结束时间\u001b[39;00m\n\u001b[1;32m 19\u001b[0m end_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime\n\u001b[0;32m---> 20\u001b[0m execution_time \u001b[38;5;241m=\u001b[39m \u001b[43mend_time\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mstart_time\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m代码执行时间: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexecution_time\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 秒\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
298
+ "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'"
299
+ ]
300
+ }
301
+ ],
302
+ "source": [
303
+ "import time\n",
304
+ "\n",
305
+ "# 记录开始时间\n",
306
+ "start_time = time.time()\n",
307
+ "# Run the base speaker tts\n",
308
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
309
+ "src_path = f'{output_dir}/tmp.wav'\n",
310
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
311
+ "\n",
312
+ "# Run the tone color converter\n",
313
+ "encode_message = \"@MyShell\"\n",
314
+ "tone_color_converter.convert(\n",
315
+ " audio_src_path=src_path, \n",
316
+ " src_se=source_se, \n",
317
+ " tgt_se=target_se, \n",
318
+ " output_path=save_path,\n",
319
+ " message=encode_message)\n",
320
+ "# 记录结束时间\n",
321
+ "end_time = time.time\n",
322
+ "execution_time = end_time - start_time\n",
323
+ "print(f\"代码执行时间: {execution_time} 秒\")"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "id": "8e513094",
329
+ "metadata": {},
330
+ "source": [
331
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": []
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": []
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": []
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": []
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
370
+ "metadata": {},
371
+ "outputs": [],
372
+ "source": []
373
+ }
374
+ ],
375
+ "metadata": {
376
+ "interpreter": {
377
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
378
+ },
379
+ "kernelspec": {
380
+ "display_name": "Python 3 (ipykernel)",
381
+ "language": "python",
382
+ "name": "python3"
383
+ },
384
+ "language_info": {
385
+ "codemirror_mode": {
386
+ "name": "ipython",
387
+ "version": 3
388
+ },
389
+ "file_extension": ".py",
390
+ "mimetype": "text/x-python",
391
+ "name": "python",
392
+ "nbconvert_exporter": "python",
393
+ "pygments_lexer": "ipython3",
394
+ "version": "3.9.19"
395
+ }
396
+ },
397
+ "nbformat": 4,
398
+ "nbformat_minor": 5
399
+ }
.ipynb_checkpoints/demo_part3-checkpoint.ipynb ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import os\n",
17
+ "import torch\n",
18
+ "from openvoice import se_extractor\n",
19
+ "from openvoice.api import ToneColorConverter"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Initialization\n",
27
+ "\n",
28
+ "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrate better robustness in some cases."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
38
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
39
+ "output_dir = 'outputs_v2'\n",
40
+ "\n",
41
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
42
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
43
+ "\n",
44
+ "os.makedirs(output_dir, exist_ok=True)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Obtain Tone Color Embedding\n",
52
+ "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from `checkpoints_v2/ses` folder."
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "\n",
62
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
63
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {},
69
+ "source": [
70
+ "#### Use MeloTTS as Base Speakers\n",
71
+ "\n",
72
+ "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers. "
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "from melo.api import TTS\n",
82
+ "\n",
83
+ "texts = {\n",
84
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
85
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
86
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
87
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
88
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
89
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
90
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
91
+ "}\n",
92
+ "\n",
93
+ "\n",
94
+ "src_path = f'{output_dir}/tmp.wav'\n",
95
+ "\n",
96
+ "# Speed is adjustable\n",
97
+ "speed = 1.0\n",
98
+ "\n",
99
+ "for language, text in texts.items():\n",
100
+ " model = TTS(language=language, device=device)\n",
101
+ " speaker_ids = model.hps.data.spk2id\n",
102
+ " \n",
103
+ " for speaker_key in speaker_ids.keys():\n",
104
+ " speaker_id = speaker_ids[speaker_key]\n",
105
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
106
+ " \n",
107
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
108
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
109
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
110
+ "\n",
111
+ " # Run the tone color converter\n",
112
+ " encode_message = \"@MyShell\"\n",
113
+ " tone_color_converter.convert(\n",
114
+ " audio_src_path=src_path, \n",
115
+ " src_se=source_se, \n",
116
+ " tgt_se=target_se, \n",
117
+ " output_path=save_path,\n",
118
+ " message=encode_message)"
119
+ ]
120
+ }
121
+ ],
122
+ "metadata": {
123
+ "kernelspec": {
124
+ "display_name": "melo",
125
+ "language": "python",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "codemirror_mode": {
130
+ "name": "ipython",
131
+ "version": 3
132
+ },
133
+ "file_extension": ".py",
134
+ "mimetype": "text/x-python",
135
+ "name": "python",
136
+ "nbconvert_exporter": "python",
137
+ "pygments_lexer": "ipython3",
138
+ "version": "3.9.18"
139
+ }
140
+ },
141
+ "nbformat": 4,
142
+ "nbformat_minor": 2
143
+ }
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Copyright 2024 MyShell.ai
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
MyShell_OpenVoice.egg-info/PKG-INFO ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: MyShell-OpenVoice
3
+ Version: 0.0.0
4
+ Summary: Instant voice cloning by MyShell.
5
+ Home-page: https://github.com/myshell-ai/OpenVoice
6
+ Author: MyShell
7
+ Author-email: [email protected]
8
+ License: MIT License
9
+ Project-URL: Documentation, https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md
10
+ Project-URL: Changes, https://github.com/myshell-ai/OpenVoice/releases
11
+ Project-URL: Code, https://github.com/myshell-ai/OpenVoice
12
+ Project-URL: Issue tracker, https://github.com/myshell-ai/OpenVoice/issues
13
+ Keywords: text-to-speech,tts,voice-clone,zero-shot-tts
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: librosa==0.9.1
18
+ Requires-Dist: faster-whisper==0.9.0
19
+ Requires-Dist: pydub==0.25.1
20
+ Requires-Dist: wavmark==0.0.3
21
+ Requires-Dist: numpy==1.22.0
22
+ Requires-Dist: eng_to_ipa==0.0.2
23
+ Requires-Dist: inflect==7.0.0
24
+ Requires-Dist: unidecode==1.3.7
25
+ Requires-Dist: whisper-timestamped==1.14.2
26
+ Requires-Dist: pypinyin==0.50.0
27
+ Requires-Dist: cn2an==0.5.22
28
+ Requires-Dist: jieba==0.42.1
29
+ Requires-Dist: gradio==3.48.0
30
+ Requires-Dist: langid==1.1.6
31
+
32
+ <div align="center">
33
+ <div>&nbsp;</div>
34
+ <img src="resources/openvoicelogo.jpg" width="400"/>
35
+
36
+ [Paper](https://arxiv.org/abs/2312.01479) |
37
+ [Website](https://research.myshell.ai/open-voice)
38
+
39
+ </div>
40
+
41
+ ## Introduction
42
+
43
+ ### OpenVoice V1
44
+
45
+ As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
46
+
47
+ **1. Accurate Tone Color Cloning.**
48
+ OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
49
+
50
+ **2. Flexible Voice Style Control.**
51
+ OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
52
+
53
+ **3. Zero-shot Cross-lingual Voice Cloning.**
54
+ Neither of the language of the generated speech nor the language of the reference speech needs to be presented in the massive-speaker multi-lingual training dataset.
55
+
56
+ ### OpenVoice V2
57
+
58
+ In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
59
+
60
+ **1. Better Audio Quality.**
61
+ OpenVoice V2 adopts a different training strategy that delivers better audio quality.
62
+
63
+ **2. Native Multi-lingual Support.**
64
+ English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
65
+
66
+ **3. Free Commercial Use.**
67
+ Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
68
+
69
+ [Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
70
+
71
+ OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. Until Nov 2023, the voice cloning model has been used tens of millions of times by users worldwide, and witnessed the explosive user growth on the platform.
72
+
73
+ ## Main Contributors
74
+
75
+ - [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
76
+ - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
77
+ - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
78
+ - [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
79
+
80
+ ## How to Use
81
+ Please see [usage](docs/USAGE.md) for detailed instructions.
82
+
83
+ ## Common Issues
84
+
85
+ Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
86
+
87
+ ## Join Our Community
88
+
89
+ Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
90
+
91
+ ## Citation
92
+ ```
93
+ @article{qin2023openvoice,
94
+ title={OpenVoice: Versatile Instant Voice Cloning},
95
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
96
+ journal={arXiv preprint arXiv:2312.01479},
97
+ year={2023}
98
+ }
99
+ ```
100
+
101
+ ## License
102
+ OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
103
+
104
+ ## Acknowledgements
105
+ This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
MyShell_OpenVoice.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ MyShell_OpenVoice.egg-info/PKG-INFO
5
+ MyShell_OpenVoice.egg-info/SOURCES.txt
6
+ MyShell_OpenVoice.egg-info/dependency_links.txt
7
+ MyShell_OpenVoice.egg-info/not-zip-safe
8
+ MyShell_OpenVoice.egg-info/requires.txt
9
+ MyShell_OpenVoice.egg-info/top_level.txt
10
+ openvoice/__init__.py
11
+ openvoice/api.py
12
+ openvoice/attentions.py
13
+ openvoice/commons.py
14
+ openvoice/mel_processing.py
15
+ openvoice/models.py
16
+ openvoice/modules.py
17
+ openvoice/openvoice_app.py
18
+ openvoice/se_extractor.py
19
+ openvoice/transforms.py
20
+ openvoice/utils.py
21
+ openvoice/text/__init__.py
22
+ openvoice/text/cleaners.py
23
+ openvoice/text/english.py
24
+ openvoice/text/mandarin.py
25
+ openvoice/text/symbols.py
MyShell_OpenVoice.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
MyShell_OpenVoice.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
MyShell_OpenVoice.egg-info/requires.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ librosa==0.9.1
2
+ faster-whisper==0.9.0
3
+ pydub==0.25.1
4
+ wavmark==0.0.3
5
+ numpy==1.22.0
6
+ eng_to_ipa==0.0.2
7
+ inflect==7.0.0
8
+ unidecode==1.3.7
9
+ whisper-timestamped==1.14.2
10
+ pypinyin==0.50.0
11
+ cn2an==0.5.22
12
+ jieba==0.42.1
13
+ gradio==3.48.0
14
+ langid==1.1.6
MyShell_OpenVoice.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openvoice
README.md CHANGED
@@ -1,12 +1,80 @@
1
  ---
2
- title: TestOpenVoice
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.37.2
8
  app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: testOpenVoice
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.48.0
6
  ---
7
+ <div align="center">
8
+ <div>&nbsp;</div>
9
+ <img src="resources/openvoicelogo.jpg" width="400"/>
10
+
11
+ [Paper](https://arxiv.org/abs/2312.01479) |
12
+ [Website](https://research.myshell.ai/open-voice)
13
+
14
+ </div>
15
+
16
+ ## Introduction
17
+
18
+ ### OpenVoice V1
19
+
20
+ As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
21
+
22
+ **1. Accurate Tone Color Cloning.**
23
+ OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
24
+
25
+ **2. Flexible Voice Style Control.**
26
+ OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
27
+
28
+ **3. Zero-shot Cross-lingual Voice Cloning.**
29
+ Neither of the language of the generated speech nor the language of the reference speech needs to be presented in the massive-speaker multi-lingual training dataset.
30
+
31
+ ### OpenVoice V2
32
+
33
+ In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
34
+
35
+ **1. Better Audio Quality.**
36
+ OpenVoice V2 adopts a different training strategy that delivers better audio quality.
37
+
38
+ **2. Native Multi-lingual Support.**
39
+ English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
40
+
41
+ **3. Free Commercial Use.**
42
+ Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
43
+
44
+ [Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
45
+
46
+ OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. Until Nov 2023, the voice cloning model has been used tens of millions of times by users worldwide, and witnessed the explosive user growth on the platform.
47
+
48
+ ## Main Contributors
49
+
50
+ - [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
51
+ - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
52
+ - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
53
+ - [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
54
+
55
+ ## How to Use
56
+ Please see [usage](docs/USAGE.md) for detailed instructions.
57
+
58
+ ## Common Issues
59
+
60
+ Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
61
+
62
+ ## Join Our Community
63
+
64
+ Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
65
+
66
+ ## Citation
67
+ ```
68
+ @article{qin2023openvoice,
69
+ title={OpenVoice: Versatile Instant Voice Cloning},
70
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
71
+ journal={arXiv preprint arXiv:2312.01479},
72
+ year={2023}
73
+ }
74
+ ```
75
+
76
+ ## License
77
+ OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
78
 
79
+ ## Acknowledgements
80
+ This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### https://huggingface.co/docs
2
+ # https://huggingface.co/spaces/gradio/asr
3
+ import os
4
+ import gradio as gr
5
+
6
+ import os
7
+ import torch
8
+ from openvoice import se_extractor
9
+ from openvoice.api import BaseSpeakerTTS, ToneColorConverter
10
+
11
+ ckpt_base = 'checkpoints/base_speakers/EN'
12
+ ckpt_converter = 'checkpoints/converter'
13
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
14
+ output_dir = 'outputs'
15
+
16
+ base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
17
+ base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
18
+
19
+ tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
20
+ tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
21
+ source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)
22
+ os.makedirs(output_dir, exist_ok=True)
23
+ reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone
24
+ target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)
25
+ save_path = f'{output_dir}/output_en_default.wav'
26
+
27
+ # Run the base speaker tts
28
+ text = "This audio is generated by OpenVoice."
29
+ src_path = f'{output_dir}/tmp.wav'
30
+ base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)
31
+
32
+ # Run the tone color converter
33
+ encode_message = "@MyShell"
34
+ tone_color_converter.convert(
35
+ audio_src_path=src_path,
36
+ src_se=source_se,
37
+ tgt_se=target_se,
38
+ output_path=save_path,
39
+ message=encode_message)
40
+
41
+ ckpt_base = 'checkpoints/base_speakers/ZH'
42
+ base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
43
+ base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
44
+
45
+ source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)
46
+ save_path = f'{output_dir}/output_chinese.wav'
47
+
48
+
49
+ def audio_io(input_text: str):
50
+ text = input_text
51
+ src_path = f'{output_dir}/tmp.wav'
52
+ base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)
53
+
54
+ # Run the tone color converter
55
+ encode_message = "@MyShell"
56
+ tone_color_converter.convert(
57
+ audio_src_path=src_path,
58
+ src_se=source_se,
59
+ tgt_se=target_se,
60
+ output_path=save_path,
61
+ message=encode_message)
62
+ return src_path
63
+
64
+
65
+ demo = gr.Interface(
66
+ fn=audio_io,
67
+ inputs=["text"],
68
+ outputs=["audio"],
69
+ )
70
+
71
+ demo.launch()
checkpoints/base_speakers/.DS_Store ADDED
Binary file (6.15 kB). View file
 
checkpoints/base_speakers/EN/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
3
+ size 160467309
checkpoints/base_speakers/EN/config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 1,
136
+ "whispering": 2,
137
+ "shouting": 3,
138
+ "excited": 4,
139
+ "cheerful": 5,
140
+ "terrified": 6,
141
+ "angry": 7,
142
+ "sad": 8,
143
+ "friendly": 9
144
+ }
145
+ }
checkpoints/base_speakers/EN/en_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
3
+ size 1789
checkpoints/base_speakers/EN/en_style_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
3
+ size 1783
checkpoints/base_speakers/ZH/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
3
+ size 160467309
checkpoints/base_speakers/ZH/config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 0
136
+ }
137
+ }
checkpoints/base_speakers/ZH/zh_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
3
+ size 1789
checkpoints/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
3
+ size 131327338
checkpoints/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "sampling_rate": 22050,
4
+ "filter_length": 1024,
5
+ "hop_length": 256,
6
+ "win_length": 1024,
7
+ "n_speakers": 0
8
+ },
9
+ "model": {
10
+ "inter_channels": 192,
11
+ "hidden_channels": 192,
12
+ "filter_channels": 768,
13
+ "n_heads": 2,
14
+ "n_layers": 6,
15
+ "kernel_size": 3,
16
+ "p_dropout": 0.1,
17
+ "resblock": "1",
18
+ "resblock_kernel_sizes": [
19
+ 3,
20
+ 7,
21
+ 11
22
+ ],
23
+ "resblock_dilation_sizes": [
24
+ [
25
+ 1,
26
+ 3,
27
+ 5
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5
33
+ ],
34
+ [
35
+ 1,
36
+ 3,
37
+ 5
38
+ ]
39
+ ],
40
+ "upsample_rates": [
41
+ 8,
42
+ 8,
43
+ 2,
44
+ 2
45
+ ],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [
48
+ 16,
49
+ 16,
50
+ 4,
51
+ 4
52
+ ],
53
+ "n_layers_q": 3,
54
+ "use_spectral_norm": false,
55
+ "gin_channels": 256
56
+ }
57
+ }
checkpoints_v2/.DS_Store ADDED
Binary file (6.15 kB). View file
 
checkpoints_v2/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
3
+ size 131320490
checkpoints_v2/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_version_": "v2",
3
+ "data": {
4
+ "sampling_rate": 22050,
5
+ "filter_length": 1024,
6
+ "hop_length": 256,
7
+ "win_length": 1024,
8
+ "n_speakers": 0
9
+ },
10
+ "model": {
11
+ "zero_g": true,
12
+ "inter_channels": 192,
13
+ "hidden_channels": 192,
14
+ "filter_channels": 768,
15
+ "n_heads": 2,
16
+ "n_layers": 6,
17
+ "kernel_size": 3,
18
+ "p_dropout": 0.1,
19
+ "resblock": "1",
20
+ "resblock_kernel_sizes": [
21
+ 3,
22
+ 7,
23
+ 11
24
+ ],
25
+ "resblock_dilation_sizes": [
26
+ [
27
+ 1,
28
+ 3,
29
+ 5
30
+ ],
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ]
41
+ ],
42
+ "upsample_rates": [
43
+ 8,
44
+ 8,
45
+ 2,
46
+ 2
47
+ ],
48
+ "upsample_initial_channel": 512,
49
+ "upsample_kernel_sizes": [
50
+ 16,
51
+ 16,
52
+ 4,
53
+ 4
54
+ ],
55
+ "gin_channels": 256
56
+ }
57
+ }
demo_part1.ipynb ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Voice Style Control Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 17,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "CPU times: user 15 µs, sys: 1e+03 ns, total: 16 µs\n",
22
+ "Wall time: 18.8 µs\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "%%time\n",
28
+ "import os\n",
29
+ "import torch\n",
30
+ "from openvoice import se_extractor\n",
31
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "15116b59",
37
+ "metadata": {},
38
+ "source": [
39
+ "### Initialization"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 18,
45
+ "id": "aacad912",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
53
+ "missing/unexpected keys: [] []\n",
54
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
55
+ "missing/unexpected keys: [] []\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
61
+ "ckpt_converter = 'checkpoints/converter'\n",
62
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
63
+ "output_dir = 'outputs'\n",
64
+ "\n",
65
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
66
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
67
+ "\n",
68
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
69
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
70
+ "\n",
71
+ "os.makedirs(output_dir, exist_ok=True)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "markdown",
76
+ "id": "7f67740c",
77
+ "metadata": {},
78
+ "source": [
79
+ "### Obtain Tone Color Embedding"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "id": "f8add279",
85
+ "metadata": {},
86
+ "source": [
87
+ "The `source_se` is the tone color embedding of the base speaker. \n",
88
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
89
+ "the readers feel free to extract `source_se` by themselves."
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 19,
95
+ "id": "63ff6273",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "id": "4f71fcc3",
105
+ "metadata": {},
106
+ "source": [
107
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 20,
113
+ "id": "55105eae",
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "OpenVoice version: v1\n",
121
+ "[(0.0, 19.278375)]\n",
122
+ "after vad: dur = 19.27798185941043\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
128
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "id": "a40284aa",
134
+ "metadata": {},
135
+ "source": [
136
+ "### Inference"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 21,
142
+ "id": "73dc1259",
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "name": "stdout",
147
+ "output_type": "stream",
148
+ "text": [
149
+ " > Text splitted to sentences.\n",
150
+ "This audio is generated by OpenVoice.\n",
151
+ " > ===========================\n",
152
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
153
+ " length:45\n",
154
+ " length:45\n"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
160
+ "\n",
161
+ "# Run the base speaker tts\n",
162
+ "text = \"This audio is generated by OpenVoice.\"\n",
163
+ "src_path = f'{output_dir}/tmp.wav'\n",
164
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
165
+ "\n",
166
+ "# Run the tone color converter\n",
167
+ "encode_message = \"@MyShell\"\n",
168
+ "tone_color_converter.convert(\n",
169
+ " audio_src_path=src_path, \n",
170
+ " src_se=source_se, \n",
171
+ " tgt_se=target_se, \n",
172
+ " output_path=save_path,\n",
173
+ " message=encode_message)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "6e3ea28a",
179
+ "metadata": {},
180
+ "source": [
181
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 22,
187
+ "id": "fd022d38",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ " > Text splitted to sentences.\n",
195
+ "This audio is generated by OpenVoice.\n",
196
+ " > ===========================\n",
197
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
198
+ " length:45\n",
199
+ " length:45\n"
200
+ ]
201
+ }
202
+ ],
203
+ "source": [
204
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
205
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
206
+ "\n",
207
+ "# Run the base speaker tts\n",
208
+ "text = \"This audio is generated by OpenVoice.\"\n",
209
+ "src_path = f'{output_dir}/tmp.wav'\n",
210
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
211
+ "\n",
212
+ "# Run the tone color converter\n",
213
+ "encode_message = \"@MyShell\"\n",
214
+ "tone_color_converter.convert(\n",
215
+ " audio_src_path=src_path, \n",
216
+ " src_se=source_se, \n",
217
+ " tgt_se=target_se, \n",
218
+ " output_path=save_path,\n",
219
+ " message=encode_message)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "markdown",
224
+ "id": "5fcfc70b",
225
+ "metadata": {},
226
+ "source": [
227
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 23,
233
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
234
+ "metadata": {},
235
+ "outputs": [
236
+ {
237
+ "name": "stderr",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
241
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
249
+ "missing/unexpected keys: [] []\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
255
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
256
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
257
+ "\n",
258
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
259
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 24,
265
+ "id": "a71d1387",
266
+ "metadata": {},
267
+ "outputs": [
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ " > Text splitted to sentences.\n",
273
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
274
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
275
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
276
+ "上半身整体为灰色, 下半身为灰白色,\n",
277
+ "臀部和腋羽是十分显眼的白色.\n",
278
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
279
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
280
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
281
+ " > ===========================\n",
282
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
283
+ " length:199\n",
284
+ " length:197\n",
285
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
286
+ " length:100\n",
287
+ " length:100\n",
288
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
289
+ " length:83\n",
290
+ " length:83\n",
291
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
292
+ " length:80\n",
293
+ " length:80\n",
294
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
295
+ " length:63\n",
296
+ " length:63\n",
297
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
298
+ " length:85\n",
299
+ " length:83\n",
300
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
301
+ " length:79\n",
302
+ " length:79\n",
303
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
304
+ " length:111\n",
305
+ " length:109\n",
306
+ "CPU times: user 2min 41s, sys: 7.56 s, total: 2min 49s\n",
307
+ "Wall time: 29.7 s\n"
308
+ ]
309
+ }
310
+ ],
311
+ "source": [
312
+ "%%time\n",
313
+ "# Run the base speaker tts\n",
314
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
315
+ "src_path = f'{output_dir}/tmp.wav'\n",
316
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
317
+ "\n",
318
+ "# Run the tone color converter\n",
319
+ "encode_message = \"@MyShell\"\n",
320
+ "tone_color_converter.convert(\n",
321
+ " audio_src_path=src_path, \n",
322
+ " src_se=source_se, \n",
323
+ " tgt_se=target_se, \n",
324
+ " output_path=save_path,\n",
325
+ " message=encode_message)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "markdown",
330
+ "id": "8e513094",
331
+ "metadata": {},
332
+ "source": [
333
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": []
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": []
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": []
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": []
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": null,
371
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": []
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "interpreter": {
379
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
380
+ },
381
+ "kernelspec": {
382
+ "display_name": "Python 3 (ipykernel)",
383
+ "language": "python",
384
+ "name": "python3"
385
+ },
386
+ "language_info": {
387
+ "codemirror_mode": {
388
+ "name": "ipython",
389
+ "version": 3
390
+ },
391
+ "file_extension": ".py",
392
+ "mimetype": "text/x-python",
393
+ "name": "python",
394
+ "nbconvert_exporter": "python",
395
+ "pygments_lexer": "ipython3",
396
+ "version": "3.9.19"
397
+ }
398
+ },
399
+ "nbformat": 4,
400
+ "nbformat_minor": 5
401
+ }
demo_part2.ipynb ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Cross-Lingual Voice Clone Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import torch\n",
20
+ "from openvoice import se_extractor\n",
21
+ "from openvoice.api import ToneColorConverter"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "15116b59",
27
+ "metadata": {},
28
+ "source": [
29
+ "### Initialization"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "aacad912",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "ckpt_converter = 'checkpoints/converter'\n",
40
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
41
+ "output_dir = 'outputs'\n",
42
+ "\n",
43
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
44
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
45
+ "\n",
46
+ "os.makedirs(output_dir, exist_ok=True)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "id": "3db80fcf",
52
+ "metadata": {},
53
+ "source": [
54
+ "In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`. We have also provided a Chinese base speaker model (see `demo_part1.ipynb`)."
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "3b245ca3",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from openai import OpenAI\n",
65
+ "from dotenv import load_dotenv\n",
66
+ "\n",
67
+ "# Please create a file named .env and place your\n",
68
+ "# OpenAI key as OPENAI_API_KEY=xxx\n",
69
+ "load_dotenv() \n",
70
+ "\n",
71
+ "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))\n",
72
+ "\n",
73
+ "response = client.audio.speech.create(\n",
74
+ " model=\"tts-1\",\n",
75
+ " voice=\"nova\",\n",
76
+ " input=\"This audio will be used to extract the base speaker tone color embedding. \" + \\\n",
77
+ " \"Typically a very short audio should be sufficient, but increasing the audio \" + \\\n",
78
+ " \"length will also improve the output audio quality.\"\n",
79
+ ")\n",
80
+ "\n",
81
+ "response.stream_to_file(f\"{output_dir}/openai_source_output.mp3\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "id": "7f67740c",
87
+ "metadata": {},
88
+ "source": [
89
+ "### Obtain Tone Color Embedding"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "f8add279",
95
+ "metadata": {},
96
+ "source": [
97
+ "The `source_se` is the tone color embedding of the base speaker. \n",
98
+ "It is an average for multiple sentences with multiple emotions\n",
99
+ "of the base speaker. We directly provide the result here but\n",
100
+ "the readers feel free to extract `source_se` by themselves."
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "63ff6273",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "base_speaker = f\"{output_dir}/openai_source_output.mp3\"\n",
111
+ "source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)\n",
112
+ "\n",
113
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
114
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "markdown",
119
+ "id": "a40284aa",
120
+ "metadata": {},
121
+ "source": [
122
+ "### Inference"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "73dc1259",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "# Run the base speaker tts\n",
133
+ "text = [\n",
134
+ " \"MyShell is a decentralized and comprehensive platform for discovering, creating, and staking AI-native apps.\",\n",
135
+ " \"MyShell es una plataforma descentralizada y completa para descubrir, crear y apostar por aplicaciones nativas de IA.\",\n",
136
+ " \"MyShell est une plateforme décentralisée et complète pour découvrir, créer et miser sur des applications natives d'IA.\",\n",
137
+ " \"MyShell ist eine dezentralisierte und umfassende Plattform zum Entdecken, Erstellen und Staken von KI-nativen Apps.\",\n",
138
+ " \"MyShell è una piattaforma decentralizzata e completa per scoprire, creare e scommettere su app native di intelligenza artificiale.\",\n",
139
+ " \"MyShellは、AIネイティブアプリの発見、作成、およびステーキングのための分散型かつ包括的なプラットフォームです。\",\n",
140
+ " \"MyShell — это децентрализованная и всеобъемлющая платформа для обнаружения, создания и стейкинга AI-ориентированных приложений.\",\n",
141
+ " \"MyShell هي منصة لامركزية وشاملة لاكتشاف وإنشاء ورهان تطبيقات الذكاء الاصطناعي الأصلية.\",\n",
142
+ " \"MyShell是一个去中心化且全面的平台,用于发现、创建和投资AI原生应用程序。\",\n",
143
+ " \"MyShell एक विकेंद्रीकृत और व्यापक मंच है, जो AI-मूल ऐप्स की खोज, सृजन और स्टेकिंग के लिए है।\",\n",
144
+ " \"MyShell é uma plataforma descentralizada e abrangente para descobrir, criar e apostar em aplicativos nativos de IA.\"\n",
145
+ "]\n",
146
+ "src_path = f'{output_dir}/tmp.wav'\n",
147
+ "\n",
148
+ "for i, t in enumerate(text):\n",
149
+ "\n",
150
+ " response = client.audio.speech.create(\n",
151
+ " model=\"tts-1\",\n",
152
+ " voice=\"nova\",\n",
153
+ " input=t,\n",
154
+ " )\n",
155
+ "\n",
156
+ " response.stream_to_file(src_path)\n",
157
+ "\n",
158
+ " save_path = f'{output_dir}/output_crosslingual_{i}.wav'\n",
159
+ "\n",
160
+ " # Run the tone color converter\n",
161
+ " encode_message = \"@MyShell\"\n",
162
+ " tone_color_converter.convert(\n",
163
+ " audio_src_path=src_path, \n",
164
+ " src_se=source_se, \n",
165
+ " tgt_se=target_se, \n",
166
+ " output_path=save_path,\n",
167
+ " message=encode_message)"
168
+ ]
169
+ }
170
+ ],
171
+ "metadata": {
172
+ "interpreter": {
173
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
174
+ },
175
+ "kernelspec": {
176
+ "display_name": "Python 3 (ipykernel)",
177
+ "language": "python",
178
+ "name": "python3"
179
+ },
180
+ "language_info": {
181
+ "codemirror_mode": {
182
+ "name": "ipython",
183
+ "version": 3
184
+ },
185
+ "file_extension": ".py",
186
+ "mimetype": "text/x-python",
187
+ "name": "python",
188
+ "nbconvert_exporter": "python",
189
+ "pygments_lexer": "ipython3",
190
+ "version": "3.9.18"
191
+ }
192
+ },
193
+ "nbformat": 4,
194
+ "nbformat_minor": 5
195
+ }
demo_part3.ipynb ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 9,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import os\n",
17
+ "import torch\n",
18
+ "from openvoice import se_extractor\n",
19
+ "from openvoice.api import ToneColorConverter"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Initialization\n",
27
+ "\n",
28
+ "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrate better robustness in some cases."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 10,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stderr",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
41
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
42
+ ]
43
+ },
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Loaded checkpoint 'checkpoints_v2/converter/checkpoint.pth'\n",
49
+ "missing/unexpected keys: [] []\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
55
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
56
+ "output_dir = 'outputs_v2'\n",
57
+ "\n",
58
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
59
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
60
+ "\n",
61
+ "os.makedirs(output_dir, exist_ok=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Obtain Tone Color Embedding\n",
69
+ "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from `checkpoints_v2/ses` folder."
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 11,
75
+ "metadata": {},
76
+ "outputs": [
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "OpenVoice version: v2\n"
82
+ ]
83
+ },
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "dc253b8bc6d34915bec3fa5b526b0348",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "Downloading vocabulary.txt: 0%| | 0.00/460k [00:00<?, ?B/s]"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ },
98
+ {
99
+ "data": {
100
+ "application/vnd.jupyter.widget-view+json": {
101
+ "model_id": "7c82ae46811248e9abafdf3b901c19a1",
102
+ "version_major": 2,
103
+ "version_minor": 0
104
+ },
105
+ "text/plain": [
106
+ "Downloading tokenizer.json: 0%| | 0.00/2.20M [00:00<?, ?B/s]"
107
+ ]
108
+ },
109
+ "metadata": {},
110
+ "output_type": "display_data"
111
+ },
112
+ {
113
+ "data": {
114
+ "application/vnd.jupyter.widget-view+json": {
115
+ "model_id": "392369f8bd914110a4c7cffe457bda51",
116
+ "version_major": 2,
117
+ "version_minor": 0
118
+ },
119
+ "text/plain": [
120
+ "Downloading model.bin: 0%| | 0.00/1.53G [00:00<?, ?B/s]"
121
+ ]
122
+ },
123
+ "metadata": {},
124
+ "output_type": "display_data"
125
+ },
126
+ {
127
+ "data": {
128
+ "application/vnd.jupyter.widget-view+json": {
129
+ "model_id": "80894d63cbcf4d71a11b654eab6a1320",
130
+ "version_major": 2,
131
+ "version_minor": 0
132
+ },
133
+ "text/plain": [
134
+ "Downloading config.json: 0%| | 0.00/2.26k [00:00<?, ?B/s]"
135
+ ]
136
+ },
137
+ "metadata": {},
138
+ "output_type": "display_data"
139
+ },
140
+ {
141
+ "ename": "KeyboardInterrupt",
142
+ "evalue": "",
143
+ "output_type": "error",
144
+ "traceback": [
145
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
146
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
147
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
148
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/notebook.py:250\u001b[0m, in \u001b[0;36mtqdm_notebook.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 249\u001b[0m it \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__iter__\u001b[39m()\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m it:\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# return super(tqdm...) will not catch exception\u001b[39;00m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
149
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/std.py:1169\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1168\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdisable:\n\u001b[0;32m-> 1169\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m 1170\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
150
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:609\u001b[0m, in \u001b[0;36mExecutor.map.<locals>.result_iterator\u001b[0;34m()\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 609\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
151
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:441\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 441\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
152
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:312\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 312\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 313\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
153
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
154
+ "\nDuring handling of the above exception, another exception occurred:\n",
155
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
156
+ "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m reference_speaker \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresources/example_reference.mp3\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# This is the voice you want to clone\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m target_se, audio_name \u001b[38;5;241m=\u001b[39m \u001b[43mse_extractor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_se\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreference_speaker\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtone_color_converter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
157
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:146\u001b[0m, in \u001b[0;36mget_se\u001b[0;34m(audio_path, vc_model, target_dir, vad)\u001b[0m\n\u001b[1;32m 144\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m split_audio_vad(audio_path, target_dir\u001b[38;5;241m=\u001b[39mtarget_dir, audio_name\u001b[38;5;241m=\u001b[39maudio_name)\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 146\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m \u001b[43msplit_audio_whisper\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtarget_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m audio_segs \u001b[38;5;241m=\u001b[39m glob(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwavs_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/*.wav\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(audio_segs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
158
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:22\u001b[0m, in \u001b[0;36msplit_audio_whisper\u001b[0;34m(audio_path, audio_name, target_dir)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mglobal\u001b[39;00m model\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 22\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mWhisperModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcuda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompute_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfloat16\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m audio \u001b[38;5;241m=\u001b[39m AudioSegment\u001b[38;5;241m.\u001b[39mfrom_file(audio_path)\n\u001b[1;32m 24\u001b[0m max_len \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(audio)\n",
159
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/transcribe.py:122\u001b[0m, in \u001b[0;36mWhisperModel.__init__\u001b[0;34m(self, model_size_or_path, device, device_index, compute_type, cpu_threads, num_workers, download_root, local_files_only)\u001b[0m\n\u001b[1;32m 120\u001b[0m model_path \u001b[38;5;241m=\u001b[39m model_size_or_path\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 122\u001b[0m model_path \u001b[38;5;241m=\u001b[39m \u001b[43mdownload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_size_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_root\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m ctranslate2\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mWhisper(\n\u001b[1;32m 129\u001b[0m model_path,\n\u001b[1;32m 130\u001b[0m device\u001b[38;5;241m=\u001b[39mdevice,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m inter_threads\u001b[38;5;241m=\u001b[39mnum_workers,\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 137\u001b[0m tokenizer_file \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
160
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/utils.py:98\u001b[0m, in \u001b[0;36mdownload_model\u001b[0;34m(size_or_id, output_dir, local_files_only, cache_dir)\u001b[0m\n\u001b[1;32m 95\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcache_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m cache_dir\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 98\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhuggingface_hub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msnapshot_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 100\u001b[0m huggingface_hub\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mHfHubHTTPError,\n\u001b[1;32m 101\u001b[0m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mConnectionError,\n\u001b[1;32m 102\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[1;32m 103\u001b[0m logger \u001b[38;5;241m=\u001b[39m get_logger()\n",
161
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:118\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 116\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
162
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/_snapshot_download.py:239\u001b[0m, in \u001b[0;36msnapshot_download\u001b[0;34m(repo_id, repo_type, revision, endpoint, cache_dir, local_dir, local_dir_use_symlinks, library_name, library_version, user_agent, proxies, etag_timeout, resume_download, force_download, token, local_files_only, allow_patterns, ignore_patterns, max_workers, tqdm_class)\u001b[0m\n\u001b[1;32m 237\u001b[0m _inner_hf_hub_download(file)\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 239\u001b[0m \u001b[43mthread_map\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m \u001b[49m\u001b[43m_inner_hf_hub_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFetching \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m files\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_workers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# User can use its own tqdm class or the default one from `huggingface_hub.utils`\u001b[39;49;00m\n\u001b[1;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtqdm_class\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mhf_tqdm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
246\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m local_dir \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mrealpath(local_dir))\n",
163
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:69\u001b[0m, in \u001b[0;36mthread_map\u001b[0;34m(fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;124;03mEquivalent of `list(map(fn, *iterables))`\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mdriven by `concurrent.futures.ThreadPoolExecutor`.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m [default: max(32, cpu_count() + 4)].\u001b[39;00m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconcurrent\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfutures\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ThreadPoolExecutor\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_executor_map\u001b[49m\u001b[43m(\u001b[49m\u001b[43mThreadPoolExecutor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtqdm_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
164
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ensure_lock(tqdm_class, lock_name\u001b[38;5;241m=\u001b[39mlock_name) \u001b[38;5;28;01mas\u001b[39;00m lk:\n\u001b[1;32m 48\u001b[0m \u001b[38;5;66;03m# share lock in case workers are already using `tqdm`\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(tqdm_class(ex\u001b[38;5;241m.\u001b[39mmap(fn, \u001b[38;5;241m*\u001b[39miterables, chunksize\u001b[38;5;241m=\u001b[39mchunksize), \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n",
165
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:637\u001b[0m, in \u001b[0;36mExecutor.__exit__\u001b[0;34m(self, exc_type, exc_val, exc_tb)\u001b[0m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, exc_type, exc_val, exc_tb):\n\u001b[0;32m--> 637\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshutdown\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
166
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/thread.py:235\u001b[0m, in \u001b[0;36mThreadPoolExecutor.shutdown\u001b[0;34m(self, wait, cancel_futures)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wait:\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_threads:\n\u001b[0;32m--> 235\u001b[0m \u001b[43mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
167
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1060\u001b[0m, in \u001b[0;36mThread.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcannot join current thread\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1059\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1060\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_tstate_lock\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1061\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1062\u001b[0m \u001b[38;5;66;03m# the behavior of a negative timeout isn't documented, but\u001b[39;00m\n\u001b[1;32m 1063\u001b[0m \u001b[38;5;66;03m# historically .join(timeout=x) for x<0 has acted as if timeout=0\u001b[39;00m\n\u001b[1;32m 1064\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_for_tstate_lock(timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mmax\u001b[39m(timeout, \u001b[38;5;241m0\u001b[39m))\n",
168
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1080\u001b[0m, in \u001b[0;36mThread._wait_for_tstate_lock\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 1077\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1080\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mlock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblock\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1081\u001b[0m lock\u001b[38;5;241m.\u001b[39mrelease()\n\u001b[1;32m 1082\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop()\n",
169
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
170
+ ]
171
+ }
172
+ ],
173
+ "source": [
174
+ "\n",
175
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
176
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "markdown",
181
+ "metadata": {},
182
+ "source": [
183
+ "#### Use MeloTTS as Base Speakers\n",
184
+ "\n",
185
+ "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers. "
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": null,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "from melo.api import TTS\n",
195
+ "\n",
196
+ "texts = {\n",
197
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
198
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
199
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
200
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
201
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
202
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
203
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
204
+ "}\n",
205
+ "\n",
206
+ "\n",
207
+ "src_path = f'{output_dir}/tmp.wav'\n",
208
+ "\n",
209
+ "# Speed is adjustable\n",
210
+ "speed = 1.0\n",
211
+ "\n",
212
+ "for language, text in texts.items():\n",
213
+ " model = TTS(language=language, device=device)\n",
214
+ " speaker_ids = model.hps.data.spk2id\n",
215
+ " \n",
216
+ " for speaker_key in speaker_ids.keys():\n",
217
+ " speaker_id = speaker_ids[speaker_key]\n",
218
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
219
+ " \n",
220
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
221
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
222
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
223
+ "\n",
224
+ " # Run the tone color converter\n",
225
+ " encode_message = \"@MyShell\"\n",
226
+ " tone_color_converter.convert(\n",
227
+ " audio_src_path=src_path, \n",
228
+ " src_se=source_se, \n",
229
+ " tgt_se=target_se, \n",
230
+ " output_path=save_path,\n",
231
+ " message=encode_message)"
232
+ ]
233
+ }
234
+ ],
235
+ "metadata": {
236
+ "kernelspec": {
237
+ "display_name": "Python 3 (ipykernel)",
238
+ "language": "python",
239
+ "name": "python3"
240
+ },
241
+ "language_info": {
242
+ "codemirror_mode": {
243
+ "name": "ipython",
244
+ "version": 3
245
+ },
246
+ "file_extension": ".py",
247
+ "mimetype": "text/x-python",
248
+ "name": "python",
249
+ "nbconvert_exporter": "python",
250
+ "pygments_lexer": "ipython3",
251
+ "version": "3.9.19"
252
+ }
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 4
256
+ }
docs/QA.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Common Questions and Answers
2
+
3
+ ## General Comments
4
+
5
+ **OpenVoice is a Technology, not a Product**
6
+
7
+ Although it works on a majority of voices if used correctly, please do not expect it to work perfectly on every case, as it takes a lot of engineering effort to translate a technology to a stable product. The targeted users of this technology are developers and researchers, not end users. End users expect a perfect product. However, we are confident in saying that OpenVoice is the state-of-the-art among the source-available voice cloning technologies.
8
+
9
+ The contribution of OpenVoice is a versatile instant voice cloning technical approach, not a ready-to-use perfect voice cloning product. However, we firmly believe that by releasing OpenVoice, we can accelerate the open research community's progress on instant voice cloning, and someday in the future the free voice cloning methods will be as good as commercial ones.
10
+
11
+ ## Issues with Voice Quality
12
+
13
+ **Accent and Emotion of the Generated Voice is not Similar to the Reference Voice**
14
+
15
+ First of all, OpenVoice only clones the tone color of the reference speaker. It does NOT clone the accent or emotion. The accent and emotion are controlled by the base speaker TTS model, not cloned by the tone color converter (please refer to our [paper](https://arxiv.org/pdf/2312.01479.pdf) for technical details). If the user wants to change the accent or emotion of the output, they need to have a base speaker model with that accent. OpenVoice provides sufficient flexibility for users to integrate their own base speaker model into the framework by simply replacing the current base speaker we provided.
16
+
17
+ **Bad Audio Quality of the Generated Speech**
18
+
19
+ Please check the following:
20
+ - Is your reference audio clean enough without any background noise? You can find some high-quality reference speech [here](https://aiartes.com/voiceai)
21
+ - Is your audio too short?
22
+ - Does your audio contain speech from more than one person?
23
+ - Does the reference audio contain long blank sections?
24
+ - Did you name the reference audio the same name you used before but forgot to delete the `processed` folder?
25
+
26
+ ## Issues with Languages
27
+
28
+ **Support of Other Languages**
29
+
30
+ For multi-lingual and cross-lingual usage, please refer to [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb). OpenVoice supports any language as long as you have a base speaker in that language. The OpenVoice team already did the most difficult part (tone color converter training) for you. Base speaker TTS model is relatively easy to train, and multiple existing open-source repositories support it. If you don't want to train by yourself, simply use the OpenAI TTS model as the base speaker.
31
+
32
+ ## Issues with Installation
33
+ **Error Related to Silero**
34
+
35
+ When calling `get_vad_segments` from `se_extractor.py`, there should be a message like this:
36
+ ```
37
+ Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/user/.cache/torch/hub/master.zip
38
+ ```
39
+ The download would fail if your machine can not access github. Please download the zip from "https://github.com/snakers4/silero-vad/zipball/master" manually and unzip it to `/home/user/.cache/torch/hub/snakers4_silero-vad_master`. You can also see [this issue](https://github.com/myshell-ai/OpenVoice/issues/57) for solutions for other versions of silero.
docs/USAGE.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Usage
2
+
3
+ ## Table of Contents
4
+
5
+ - [Quick Use](#quick-use): directly use OpenVoice without installation.
6
+ - [Linux Install](#linux-install): for researchers and developers only.
7
+ - [V1](#openvoice-v1)
8
+ - [V2](#openvoice-v2)
9
+ - [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community
10
+
11
+ ## Quick Use
12
+
13
+ The input speech audio of OpenVoice can be in **Any Language**. OpenVoice can clone the voice in that speech audio, and use the voice to speak in multiple languages. For quick use, we recommend you to try the already deployed services:
14
+
15
+ - [British English](https://app.myshell.ai/widget/vYjqae)
16
+ - [American English](https://app.myshell.ai/widget/nEFFJf)
17
+ - [Indian English](https://app.myshell.ai/widget/V3iYze)
18
+ - [Australian English](https://app.myshell.ai/widget/fM7JVf)
19
+ - [Spanish](https://app.myshell.ai/widget/NNFFVz)
20
+ - [French](https://app.myshell.ai/widget/z2uyUz)
21
+ - [Chinese](https://app.myshell.ai/widget/fU7nUz)
22
+ - [Japanese](https://app.myshell.ai/widget/IfIB3u)
23
+ - [Korean](https://app.myshell.ai/widget/q6ZjIn)
24
+
25
+ ## Minimal Demo
26
+
27
+ For users who want to quickly try OpenVoice and do not require high quality or stability, click any of the following links:
28
+
29
+ <div align="center">
30
+ <a href="https://app.myshell.ai/bot/z6Bvua/1702636181"><img src="../resources/myshell-hd.png" height="28"></a>
31
+ &nbsp;&nbsp;&nbsp;&nbsp;
32
+ <a href="https://huggingface.co/spaces/myshell-ai/OpenVoice"><img src="../resources/huggingface.png" height="32"></a>
33
+ </div>
34
+
35
+ ## Linux Install
36
+
37
+ This section is only for developers and researchers who are familiar with Linux, Python and PyTorch. Clone this repo, and run:
38
+
39
+ ```
40
+ conda create -n openvoice python=3.9
41
+ conda activate openvoice
42
+ git clone [email protected]:myshell-ai/OpenVoice.git
43
+ cd OpenVoice
44
+ pip install -e .
45
+ ```
46
+
47
+ Whether you are using V1 or V2, the installation above is the same.
48
+
49
+ ### OpenVoice V1
50
+
51
+ Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
52
+
53
+ **1. Flexible Voice Style Control.**
54
+ Please see [`demo_part1.ipynb`](../demo_part1.ipynb) for an example usage of how OpenVoice enables flexible style control over the cloned voice.
55
+
56
+ **2. Cross-Lingual Voice Cloning.**
57
+ Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example for languages seen or unseen in the MSML training set.
58
+
59
+ **3. Gradio Demo.** We provide a minimalist local gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the gradio demo. Launch a local gradio demo with `python -m openvoice_app --share`.
60
+
61
+ ### OpenVoice V2
62
+
63
+ Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
64
+
65
+ Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
66
+ ```
67
+ pip install git+https://github.com/myshell-ai/MeloTTS.git
68
+ python -m unidic download
69
+ ```
70
+
71
+ **Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.
72
+
73
+
74
+ ## Install on Other Platforms
75
+
76
+ This section provides the unofficial installation guides by open-source contributors in the community:
77
+
78
+ - Windows
79
+ - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
80
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
81
+ - Docker
82
+ - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
83
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
openvoice/__init__.py ADDED
File without changes
openvoice/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (173 Bytes). View file
 
openvoice/__pycache__/api.cpython-39.pyc ADDED
Binary file (7.31 kB). View file
 
openvoice/__pycache__/attentions.cpython-39.pyc ADDED
Binary file (11.1 kB). View file
 
openvoice/__pycache__/commons.cpython-39.pyc ADDED
Binary file (5.79 kB). View file
 
openvoice/__pycache__/mel_processing.cpython-39.pyc ADDED
Binary file (4.19 kB). View file
 
openvoice/__pycache__/models.cpython-39.pyc ADDED
Binary file (12.7 kB). View file
 
openvoice/__pycache__/modules.cpython-39.pyc ADDED
Binary file (13.1 kB). View file
 
openvoice/__pycache__/se_extractor.cpython-39.pyc ADDED
Binary file (4.14 kB). View file
 
openvoice/__pycache__/transforms.cpython-39.pyc ADDED
Binary file (3.94 kB). View file