Russell1123213123 commited on
Commit
eefa761
1 Parent(s): 31a24b8

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +4 -0
  3. .gitignore +13 -0
  4. .idea/.gitignore +3 -0
  5. .idea/OpenVoice.iml +12 -0
  6. .idea/inspectionProfiles/Project_Default.xml +16 -0
  7. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  8. .idea/misc.xml +7 -0
  9. .idea/modules.xml +8 -0
  10. .idea/vcs.xml +6 -0
  11. .idea/workspace.xml +81 -0
  12. .ipynb_checkpoints/demo_part1-checkpoint.ipynb +399 -0
  13. .ipynb_checkpoints/demo_part3-checkpoint.ipynb +143 -0
  14. LICENSE +7 -0
  15. MyShell_OpenVoice.egg-info/PKG-INFO +105 -0
  16. MyShell_OpenVoice.egg-info/SOURCES.txt +25 -0
  17. MyShell_OpenVoice.egg-info/dependency_links.txt +1 -0
  18. MyShell_OpenVoice.egg-info/not-zip-safe +1 -0
  19. MyShell_OpenVoice.egg-info/requires.txt +14 -0
  20. MyShell_OpenVoice.egg-info/top_level.txt +1 -0
  21. README.md +76 -8
  22. app.py +71 -0
  23. checkpoints/base_speakers/.DS_Store +0 -0
  24. checkpoints/base_speakers/EN/checkpoint.pth +3 -0
  25. checkpoints/base_speakers/EN/config.json +145 -0
  26. checkpoints/base_speakers/EN/en_default_se.pth +3 -0
  27. checkpoints/base_speakers/EN/en_style_se.pth +3 -0
  28. checkpoints/base_speakers/ZH/checkpoint.pth +3 -0
  29. checkpoints/base_speakers/ZH/config.json +137 -0
  30. checkpoints/base_speakers/ZH/zh_default_se.pth +3 -0
  31. checkpoints/converter/checkpoint.pth +3 -0
  32. checkpoints/converter/config.json +57 -0
  33. checkpoints_v2/.DS_Store +0 -0
  34. checkpoints_v2/converter/checkpoint.pth +3 -0
  35. checkpoints_v2/converter/config.json +57 -0
  36. demo_part1.ipynb +401 -0
  37. demo_part2.ipynb +195 -0
  38. demo_part3.ipynb +256 -0
  39. docs/QA.md +39 -0
  40. docs/USAGE.md +83 -0
  41. openvoice/__init__.py +0 -0
  42. openvoice/__pycache__/__init__.cpython-39.pyc +0 -0
  43. openvoice/__pycache__/api.cpython-39.pyc +0 -0
  44. openvoice/__pycache__/attentions.cpython-39.pyc +0 -0
  45. openvoice/__pycache__/commons.cpython-39.pyc +0 -0
  46. openvoice/__pycache__/mel_processing.cpython-39.pyc +0 -0
  47. openvoice/__pycache__/models.cpython-39.pyc +0 -0
  48. openvoice/__pycache__/modules.cpython-39.pyc +0 -0
  49. openvoice/__pycache__/se_extractor.cpython-39.pyc +0 -0
  50. openvoice/__pycache__/transforms.cpython-39.pyc +0 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ outputs/output_chinese.wav filter=lfs diff=lfs merge=lfs -text
37
+ outputs/tmp.wav filter=lfs diff=lfs merge=lfs -text
38
+ processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav filter=lfs diff=lfs merge=lfs -text
39
+ processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ .ipynb_checkpoints/
3
+ processed
4
+ outputs
5
+ outputs_v2
6
+ checkpoints
7
+ checkpoints_v2
8
+ trash
9
+ examples*
10
+ .env
11
+ build
12
+ *.egg-info/
13
+ *.zip
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/OpenVoice.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
5
+ <option name="ignoredErrors">
6
+ <list>
7
+ <option value="N806" />
8
+ <option value="N802" />
9
+ <option value="N801" />
10
+ <option value="N813" />
11
+ <option value="N803" />
12
+ </list>
13
+ </option>
14
+ </inspection_tool>
15
+ </profile>
16
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (openvoice)" project-jdk-type="Python SDK" />
4
+ <component name="PyCharmProfessionalAdvertiser">
5
+ <option name="shown" value="true" />
6
+ </component>
7
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/OpenVoice.iml" filepath="$PROJECT_DIR$/.idea/OpenVoice.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="BranchesTreeState">
4
+ <expand>
5
+ <path>
6
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
7
+ <item name="LOCAL_ROOT" type="e8cecc67:BranchNodeDescriptor" />
8
+ </path>
9
+ <path>
10
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
11
+ <item name="REMOTE_ROOT" type="e8cecc67:BranchNodeDescriptor" />
12
+ </path>
13
+ <path>
14
+ <item name="ROOT" type="e8cecc67:BranchNodeDescriptor" />
15
+ <item name="REMOTE_ROOT" type="e8cecc67:BranchNodeDescriptor" />
16
+ <item name="GROUP_NODE:origin" type="e8cecc67:BranchNodeDescriptor" />
17
+ </path>
18
+ </expand>
19
+ <select />
20
+ </component>
21
+ <component name="ChangeListManager">
22
+ <list default="true" id="7380c043-d972-4774-8844-edb18bb79433" name="Default Changelist" comment="">
23
+ <change afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
24
+ <change beforePath="$PROJECT_DIR$/demo_part1.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/demo_part1.ipynb" afterDir="false" />
25
+ <change beforePath="$PROJECT_DIR$/demo_part3.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/demo_part3.ipynb" afterDir="false" />
26
+ </list>
27
+ <option name="SHOW_DIALOG" value="false" />
28
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
29
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
30
+ <option name="LAST_RESOLUTION" value="IGNORE" />
31
+ </component>
32
+ <component name="Git.Settings">
33
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
34
+ </component>
35
+ <component name="ProjectId" id="2idcySXwJyzz5B7FOKKvwBhhetW" />
36
+ <component name="ProjectViewState">
37
+ <option name="hideEmptyMiddlePackages" value="true" />
38
+ <option name="showLibraryContents" value="true" />
39
+ <option name="showMembers" value="true" />
40
+ </component>
41
+ <component name="PropertiesComponent">
42
+ <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
43
+ </component>
44
+ <component name="RecentsManager">
45
+ <key name="MoveFile.RECENT_KEYS">
46
+ <recent name="$PROJECT_DIR$" />
47
+ </key>
48
+ </component>
49
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
50
+ <component name="TaskManager">
51
+ <task active="true" id="Default" summary="Default task">
52
+ <changelist id="7380c043-d972-4774-8844-edb18bb79433" name="Default Changelist" comment="" />
53
+ <created>1719824154394</created>
54
+ <option name="number" value="Default" />
55
+ <option name="presentableId" value="Default" />
56
+ <updated>1719824154394</updated>
57
+ </task>
58
+ <servers />
59
+ </component>
60
+ <component name="Vcs.Log.Tabs.Properties">
61
+ <option name="TAB_STATES">
62
+ <map>
63
+ <entry key="MAIN">
64
+ <value>
65
+ <State />
66
+ </value>
67
+ </entry>
68
+ </map>
69
+ </option>
70
+ </component>
71
+ <component name="WindowStateProjectService">
72
+ <state x="1186" y="315" key="#com.intellij.fileTypes.FileTypeChooser" timestamp="1719910419213">
73
+ <screen x="0" y="25" width="1920" height="986" />
74
+ </state>
75
+ <state x="1186" y="315" key="#com.intellij.fileTypes.FileTypeChooser/0.25.1920.986/[email protected]" timestamp="1719910419213" />
76
+ <state x="1076" y="263" width="670" height="676" key="search.everywhere.popup" timestamp="1719914571586">
77
+ <screen x="0" y="25" width="1920" height="986" />
78
+ </state>
79
+ <state x="1076" y="263" width="670" height="676" key="search.everywhere.popup/0.25.1920.986/[email protected]" timestamp="1719914571586" />
80
+ </component>
81
+ </project>
.ipynb_checkpoints/demo_part1-checkpoint.ipynb ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Voice Style Control Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 19,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import torch\n",
20
+ "from openvoice import se_extractor\n",
21
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "15116b59",
27
+ "metadata": {},
28
+ "source": [
29
+ "### Initialization"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 20,
35
+ "id": "aacad912",
36
+ "metadata": {},
37
+ "outputs": [
38
+ {
39
+ "name": "stdout",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
43
+ "missing/unexpected keys: [] []\n",
44
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
45
+ "missing/unexpected keys: [] []\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
51
+ "ckpt_converter = 'checkpoints/converter'\n",
52
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
53
+ "output_dir = 'outputs'\n",
54
+ "\n",
55
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
56
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
57
+ "\n",
58
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
59
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
60
+ "\n",
61
+ "os.makedirs(output_dir, exist_ok=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "id": "7f67740c",
67
+ "metadata": {},
68
+ "source": [
69
+ "### Obtain Tone Color Embedding"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "markdown",
74
+ "id": "f8add279",
75
+ "metadata": {},
76
+ "source": [
77
+ "The `source_se` is the tone color embedding of the base speaker. \n",
78
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
79
+ "the readers feel free to extract `source_se` by themselves."
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 21,
85
+ "id": "63ff6273",
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "4f71fcc3",
95
+ "metadata": {},
96
+ "source": [
97
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 22,
103
+ "id": "55105eae",
104
+ "metadata": {},
105
+ "outputs": [
106
+ {
107
+ "name": "stdout",
108
+ "output_type": "stream",
109
+ "text": [
110
+ "OpenVoice version: v1\n",
111
+ "[(0.0, 19.278375)]\n",
112
+ "after vad: dur = 19.27798185941043\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
118
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "id": "a40284aa",
124
+ "metadata": {},
125
+ "source": [
126
+ "### Inference"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 23,
132
+ "id": "73dc1259",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ " > Text splitted to sentences.\n",
140
+ "This audio is generated by OpenVoice.\n",
141
+ " > ===========================\n",
142
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
143
+ " length:45\n",
144
+ " length:45\n"
145
+ ]
146
+ }
147
+ ],
148
+ "source": [
149
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
150
+ "\n",
151
+ "# Run the base speaker tts\n",
152
+ "text = \"This audio is generated by OpenVoice.\"\n",
153
+ "src_path = f'{output_dir}/tmp.wav'\n",
154
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
155
+ "\n",
156
+ "# Run the tone color converter\n",
157
+ "encode_message = \"@MyShell\"\n",
158
+ "tone_color_converter.convert(\n",
159
+ " audio_src_path=src_path, \n",
160
+ " src_se=source_se, \n",
161
+ " tgt_se=target_se, \n",
162
+ " output_path=save_path,\n",
163
+ " message=encode_message)"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "markdown",
168
+ "id": "6e3ea28a",
169
+ "metadata": {},
170
+ "source": [
171
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 24,
177
+ "id": "fd022d38",
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "name": "stdout",
182
+ "output_type": "stream",
183
+ "text": [
184
+ " > Text splitted to sentences.\n",
185
+ "This audio is generated by OpenVoice.\n",
186
+ " > ===========================\n",
187
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
188
+ " length:45\n",
189
+ " length:45\n"
190
+ ]
191
+ }
192
+ ],
193
+ "source": [
194
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
195
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
196
+ "\n",
197
+ "# Run the base speaker tts\n",
198
+ "text = \"This audio is generated by OpenVoice.\"\n",
199
+ "src_path = f'{output_dir}/tmp.wav'\n",
200
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
201
+ "\n",
202
+ "# Run the tone color converter\n",
203
+ "encode_message = \"@MyShell\"\n",
204
+ "tone_color_converter.convert(\n",
205
+ " audio_src_path=src_path, \n",
206
+ " src_se=source_se, \n",
207
+ " tgt_se=target_se, \n",
208
+ " output_path=save_path,\n",
209
+ " message=encode_message)"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "markdown",
214
+ "id": "5fcfc70b",
215
+ "metadata": {},
216
+ "source": [
217
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 28,
223
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "name": "stdout",
228
+ "output_type": "stream",
229
+ "text": [
230
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
231
+ "missing/unexpected keys: [] []\n"
232
+ ]
233
+ }
234
+ ],
235
+ "source": [
236
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
237
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
238
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
239
+ "\n",
240
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
241
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 30,
247
+ "id": "a71d1387",
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "name": "stdout",
252
+ "output_type": "stream",
253
+ "text": [
254
+ " > Text splitted to sentences.\n",
255
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
256
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
257
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
258
+ "上半身整体为灰色, 下半身为灰白色,\n",
259
+ "臀部和腋羽是十分显眼的白色.\n",
260
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
261
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
262
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
263
+ " > ===========================\n",
264
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
265
+ " length:199\n",
266
+ " length:197\n",
267
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
268
+ " length:100\n",
269
+ " length:100\n",
270
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
271
+ " length:83\n",
272
+ " length:83\n",
273
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
274
+ " length:80\n",
275
+ " length:80\n",
276
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
277
+ " length:63\n",
278
+ " length:63\n",
279
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
280
+ " length:85\n",
281
+ " length:83\n",
282
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
283
+ " length:79\n",
284
+ " length:79\n",
285
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
286
+ " length:111\n",
287
+ " length:109\n"
288
+ ]
289
+ },
290
+ {
291
+ "ename": "TypeError",
292
+ "evalue": "unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'",
293
+ "output_type": "error",
294
+ "traceback": [
295
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
296
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
297
+ "Cell \u001b[0;32mIn[30], line 20\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# 记录结束时间\u001b[39;00m\n\u001b[1;32m 19\u001b[0m end_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime\n\u001b[0;32m---> 20\u001b[0m execution_time \u001b[38;5;241m=\u001b[39m \u001b[43mend_time\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mstart_time\u001b[49m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m代码执行时间: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexecution_time\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 秒\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
298
+ "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for -: 'builtin_function_or_method' and 'float'"
299
+ ]
300
+ }
301
+ ],
302
+ "source": [
303
+ "import time\n",
304
+ "\n",
305
+ "# 记录开始时间\n",
306
+ "start_time = time.time()\n",
307
+ "# Run the base speaker tts\n",
308
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
309
+ "src_path = f'{output_dir}/tmp.wav'\n",
310
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
311
+ "\n",
312
+ "# Run the tone color converter\n",
313
+ "encode_message = \"@MyShell\"\n",
314
+ "tone_color_converter.convert(\n",
315
+ " audio_src_path=src_path, \n",
316
+ " src_se=source_se, \n",
317
+ " tgt_se=target_se, \n",
318
+ " output_path=save_path,\n",
319
+ " message=encode_message)\n",
320
+ "# 记录结束时间\n",
321
+ "end_time = time.time\n",
322
+ "execution_time = end_time - start_time\n",
323
+ "print(f\"代码执行时间: {execution_time} 秒\")"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "markdown",
328
+ "id": "8e513094",
329
+ "metadata": {},
330
+ "source": [
331
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": []
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": []
349
+ },
350
+ {
351
+ "cell_type": "code",
352
+ "execution_count": null,
353
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": []
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": []
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
370
+ "metadata": {},
371
+ "outputs": [],
372
+ "source": []
373
+ }
374
+ ],
375
+ "metadata": {
376
+ "interpreter": {
377
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
378
+ },
379
+ "kernelspec": {
380
+ "display_name": "Python 3 (ipykernel)",
381
+ "language": "python",
382
+ "name": "python3"
383
+ },
384
+ "language_info": {
385
+ "codemirror_mode": {
386
+ "name": "ipython",
387
+ "version": 3
388
+ },
389
+ "file_extension": ".py",
390
+ "mimetype": "text/x-python",
391
+ "name": "python",
392
+ "nbconvert_exporter": "python",
393
+ "pygments_lexer": "ipython3",
394
+ "version": "3.9.19"
395
+ }
396
+ },
397
+ "nbformat": 4,
398
+ "nbformat_minor": 5
399
+ }
.ipynb_checkpoints/demo_part3-checkpoint.ipynb ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": null,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import os\n",
17
+ "import torch\n",
18
+ "from openvoice import se_extractor\n",
19
+ "from openvoice.api import ToneColorConverter"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Initialization\n",
27
+ "\n",
28
+ "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrate better robustness in some cases."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
38
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
39
+ "output_dir = 'outputs_v2'\n",
40
+ "\n",
41
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
42
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
43
+ "\n",
44
+ "os.makedirs(output_dir, exist_ok=True)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "metadata": {},
50
+ "source": [
51
+ "### Obtain Tone Color Embedding\n",
52
+ "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from `checkpoints_v2/ses` folder."
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "\n",
62
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
63
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {},
69
+ "source": [
70
+ "#### Use MeloTTS as Base Speakers\n",
71
+ "\n",
72
+ "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers. "
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": null,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "from melo.api import TTS\n",
82
+ "\n",
83
+ "texts = {\n",
84
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
85
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
86
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
87
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
88
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
89
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
90
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
91
+ "}\n",
92
+ "\n",
93
+ "\n",
94
+ "src_path = f'{output_dir}/tmp.wav'\n",
95
+ "\n",
96
+ "# Speed is adjustable\n",
97
+ "speed = 1.0\n",
98
+ "\n",
99
+ "for language, text in texts.items():\n",
100
+ " model = TTS(language=language, device=device)\n",
101
+ " speaker_ids = model.hps.data.spk2id\n",
102
+ " \n",
103
+ " for speaker_key in speaker_ids.keys():\n",
104
+ " speaker_id = speaker_ids[speaker_key]\n",
105
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
106
+ " \n",
107
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
108
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
109
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
110
+ "\n",
111
+ " # Run the tone color converter\n",
112
+ " encode_message = \"@MyShell\"\n",
113
+ " tone_color_converter.convert(\n",
114
+ " audio_src_path=src_path, \n",
115
+ " src_se=source_se, \n",
116
+ " tgt_se=target_se, \n",
117
+ " output_path=save_path,\n",
118
+ " message=encode_message)"
119
+ ]
120
+ }
121
+ ],
122
+ "metadata": {
123
+ "kernelspec": {
124
+ "display_name": "melo",
125
+ "language": "python",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "codemirror_mode": {
130
+ "name": "ipython",
131
+ "version": 3
132
+ },
133
+ "file_extension": ".py",
134
+ "mimetype": "text/x-python",
135
+ "name": "python",
136
+ "nbconvert_exporter": "python",
137
+ "pygments_lexer": "ipython3",
138
+ "version": "3.9.18"
139
+ }
140
+ },
141
+ "nbformat": 4,
142
+ "nbformat_minor": 2
143
+ }
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Copyright 2024 MyShell.ai
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
MyShell_OpenVoice.egg-info/PKG-INFO ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: MyShell-OpenVoice
3
+ Version: 0.0.0
4
+ Summary: Instant voice cloning by MyShell.
5
+ Home-page: https://github.com/myshell-ai/OpenVoice
6
+ Author: MyShell
7
+ Author-email: [email protected]
8
+ License: MIT License
9
+ Project-URL: Documentation, https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md
10
+ Project-URL: Changes, https://github.com/myshell-ai/OpenVoice/releases
11
+ Project-URL: Code, https://github.com/myshell-ai/OpenVoice
12
+ Project-URL: Issue tracker, https://github.com/myshell-ai/OpenVoice/issues
13
+ Keywords: text-to-speech,tts,voice-clone,zero-shot-tts
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Requires-Dist: librosa==0.9.1
18
+ Requires-Dist: faster-whisper==0.9.0
19
+ Requires-Dist: pydub==0.25.1
20
+ Requires-Dist: wavmark==0.0.3
21
+ Requires-Dist: numpy==1.22.0
22
+ Requires-Dist: eng_to_ipa==0.0.2
23
+ Requires-Dist: inflect==7.0.0
24
+ Requires-Dist: unidecode==1.3.7
25
+ Requires-Dist: whisper-timestamped==1.14.2
26
+ Requires-Dist: pypinyin==0.50.0
27
+ Requires-Dist: cn2an==0.5.22
28
+ Requires-Dist: jieba==0.42.1
29
+ Requires-Dist: gradio==3.48.0
30
+ Requires-Dist: langid==1.1.6
31
+
32
+ <div align="center">
33
+ <div>&nbsp;</div>
34
+ <img src="resources/openvoicelogo.jpg" width="400"/>
35
+
36
+ [Paper](https://arxiv.org/abs/2312.01479) |
37
+ [Website](https://research.myshell.ai/open-voice)
38
+
39
+ </div>
40
+
41
+ ## Introduction
42
+
43
+ ### OpenVoice V1
44
+
45
+ As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
46
+
47
+ **1. Accurate Tone Color Cloning.**
48
+ OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
49
+
50
+ **2. Flexible Voice Style Control.**
51
+ OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
52
+
53
+ **3. Zero-shot Cross-lingual Voice Cloning.**
54
+ Neither of the language of the generated speech nor the language of the reference speech needs to be presented in the massive-speaker multi-lingual training dataset.
55
+
56
+ ### OpenVoice V2
57
+
58
+ In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
59
+
60
+ **1. Better Audio Quality.**
61
+ OpenVoice V2 adopts a different training strategy that delivers better audio quality.
62
+
63
+ **2. Native Multi-lingual Support.**
64
+ English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
65
+
66
+ **3. Free Commercial Use.**
67
+ Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
68
+
69
+ [Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
70
+
71
+ OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. Until Nov 2023, the voice cloning model has been used tens of millions of times by users worldwide, and witnessed the explosive user growth on the platform.
72
+
73
+ ## Main Contributors
74
+
75
+ - [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
76
+ - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
77
+ - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
78
+ - [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
79
+
80
+ ## How to Use
81
+ Please see [usage](docs/USAGE.md) for detailed instructions.
82
+
83
+ ## Common Issues
84
+
85
+ Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
86
+
87
+ ## Join Our Community
88
+
89
+ Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
90
+
91
+ ## Citation
92
+ ```
93
+ @article{qin2023openvoice,
94
+ title={OpenVoice: Versatile Instant Voice Cloning},
95
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
96
+ journal={arXiv preprint arXiv:2312.01479},
97
+ year={2023}
98
+ }
99
+ ```
100
+
101
+ ## License
102
+ OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
103
+
104
+ ## Acknowledgements
105
+ This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
MyShell_OpenVoice.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ MyShell_OpenVoice.egg-info/PKG-INFO
5
+ MyShell_OpenVoice.egg-info/SOURCES.txt
6
+ MyShell_OpenVoice.egg-info/dependency_links.txt
7
+ MyShell_OpenVoice.egg-info/not-zip-safe
8
+ MyShell_OpenVoice.egg-info/requires.txt
9
+ MyShell_OpenVoice.egg-info/top_level.txt
10
+ openvoice/__init__.py
11
+ openvoice/api.py
12
+ openvoice/attentions.py
13
+ openvoice/commons.py
14
+ openvoice/mel_processing.py
15
+ openvoice/models.py
16
+ openvoice/modules.py
17
+ openvoice/openvoice_app.py
18
+ openvoice/se_extractor.py
19
+ openvoice/transforms.py
20
+ openvoice/utils.py
21
+ openvoice/text/__init__.py
22
+ openvoice/text/cleaners.py
23
+ openvoice/text/english.py
24
+ openvoice/text/mandarin.py
25
+ openvoice/text/symbols.py
MyShell_OpenVoice.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
MyShell_OpenVoice.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
MyShell_OpenVoice.egg-info/requires.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ librosa==0.9.1
2
+ faster-whisper==0.9.0
3
+ pydub==0.25.1
4
+ wavmark==0.0.3
5
+ numpy==1.22.0
6
+ eng_to_ipa==0.0.2
7
+ inflect==7.0.0
8
+ unidecode==1.3.7
9
+ whisper-timestamped==1.14.2
10
+ pypinyin==0.50.0
11
+ cn2an==0.5.22
12
+ jieba==0.42.1
13
+ gradio==3.48.0
14
+ langid==1.1.6
MyShell_OpenVoice.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openvoice
README.md CHANGED
@@ -1,12 +1,80 @@
1
  ---
2
- title: TestOpenVoice
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.37.2
8
  app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: testOpenVoice
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.48.0
6
  ---
7
+ <div align="center">
8
+ <div>&nbsp;</div>
9
+ <img src="resources/openvoicelogo.jpg" width="400"/>
10
+
11
+ [Paper](https://arxiv.org/abs/2312.01479) |
12
+ [Website](https://research.myshell.ai/open-voice)
13
+
14
+ </div>
15
+
16
+ ## Introduction
17
+
18
+ ### OpenVoice V1
19
+
20
+ As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
21
+
22
+ **1. Accurate Tone Color Cloning.**
23
+ OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
24
+
25
+ **2. Flexible Voice Style Control.**
26
+ OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
27
+
28
+ **3. Zero-shot Cross-lingual Voice Cloning.**
29
+ Neither of the language of the generated speech nor the language of the reference speech needs to be presented in the massive-speaker multi-lingual training dataset.
30
+
31
+ ### OpenVoice V2
32
+
33
+ In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
34
+
35
+ **1. Better Audio Quality.**
36
+ OpenVoice V2 adopts a different training strategy that delivers better audio quality.
37
+
38
+ **2. Native Multi-lingual Support.**
39
+ English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
40
+
41
+ **3. Free Commercial Use.**
42
+ Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use.
43
+
44
+ [Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
45
+
46
+ OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. Until Nov 2023, the voice cloning model has been used tens of millions of times by users worldwide, and witnessed the explosive user growth on the platform.
47
+
48
+ ## Main Contributors
49
+
50
+ - [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
51
+ - [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
52
+ - [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
53
+ - [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
54
+
55
+ ## How to Use
56
+ Please see [usage](docs/USAGE.md) for detailed instructions.
57
+
58
+ ## Common Issues
59
+
60
+ Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
61
+
62
+ ## Join Our Community
63
+
64
+ Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
65
+
66
+ ## Citation
67
+ ```
68
+ @article{qin2023openvoice,
69
+ title={OpenVoice: Versatile Instant Voice Cloning},
70
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
71
+ journal={arXiv preprint arXiv:2312.01479},
72
+ year={2023}
73
+ }
74
+ ```
75
+
76
+ ## License
77
+ OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
78
 
79
+ ## Acknowledgements
80
+ This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### https://huggingface.co/docs
2
+ # https://huggingface.co/spaces/gradio/asr
3
+ import os
4
+ import gradio as gr
5
+
6
+ import os
7
+ import torch
8
+ from openvoice import se_extractor
9
+ from openvoice.api import BaseSpeakerTTS, ToneColorConverter
10
+
11
+ ckpt_base = 'checkpoints/base_speakers/EN'
12
+ ckpt_converter = 'checkpoints/converter'
13
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
14
+ output_dir = 'outputs'
15
+
16
+ base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
17
+ base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
18
+
19
+ tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
20
+ tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
21
+ source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)
22
+ os.makedirs(output_dir, exist_ok=True)
23
+ reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone
24
+ target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)
25
+ save_path = f'{output_dir}/output_en_default.wav'
26
+
27
+ # Run the base speaker tts
28
+ text = "This audio is generated by OpenVoice."
29
+ src_path = f'{output_dir}/tmp.wav'
30
+ base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)
31
+
32
+ # Run the tone color converter
33
+ encode_message = "@MyShell"
34
+ tone_color_converter.convert(
35
+ audio_src_path=src_path,
36
+ src_se=source_se,
37
+ tgt_se=target_se,
38
+ output_path=save_path,
39
+ message=encode_message)
40
+
41
+ ckpt_base = 'checkpoints/base_speakers/ZH'
42
+ base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
43
+ base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
44
+
45
+ source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)
46
+ save_path = f'{output_dir}/output_chinese.wav'
47
+
48
+
49
+ def audio_io(input_text: str):
50
+ text = input_text
51
+ src_path = f'{output_dir}/tmp.wav'
52
+ base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)
53
+
54
+ # Run the tone color converter
55
+ encode_message = "@MyShell"
56
+ tone_color_converter.convert(
57
+ audio_src_path=src_path,
58
+ src_se=source_se,
59
+ tgt_se=target_se,
60
+ output_path=save_path,
61
+ message=encode_message)
62
+ return src_path
63
+
64
+
65
+ demo = gr.Interface(
66
+ fn=audio_io,
67
+ inputs=["text"],
68
+ outputs=["audio"],
69
+ )
70
+
71
+ demo.launch()
checkpoints/base_speakers/.DS_Store ADDED
Binary file (6.15 kB). View file
 
checkpoints/base_speakers/EN/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
3
+ size 160467309
checkpoints/base_speakers/EN/config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 1,
136
+ "whispering": 2,
137
+ "shouting": 3,
138
+ "excited": 4,
139
+ "cheerful": 5,
140
+ "terrified": 6,
141
+ "angry": 7,
142
+ "sad": 8,
143
+ "friendly": 9
144
+ }
145
+ }
checkpoints/base_speakers/EN/en_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
3
+ size 1789
checkpoints/base_speakers/EN/en_style_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
3
+ size 1783
checkpoints/base_speakers/ZH/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
3
+ size 160467309
checkpoints/base_speakers/ZH/config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 0
136
+ }
137
+ }
checkpoints/base_speakers/ZH/zh_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
3
+ size 1789
checkpoints/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
3
+ size 131327338
checkpoints/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "sampling_rate": 22050,
4
+ "filter_length": 1024,
5
+ "hop_length": 256,
6
+ "win_length": 1024,
7
+ "n_speakers": 0
8
+ },
9
+ "model": {
10
+ "inter_channels": 192,
11
+ "hidden_channels": 192,
12
+ "filter_channels": 768,
13
+ "n_heads": 2,
14
+ "n_layers": 6,
15
+ "kernel_size": 3,
16
+ "p_dropout": 0.1,
17
+ "resblock": "1",
18
+ "resblock_kernel_sizes": [
19
+ 3,
20
+ 7,
21
+ 11
22
+ ],
23
+ "resblock_dilation_sizes": [
24
+ [
25
+ 1,
26
+ 3,
27
+ 5
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5
33
+ ],
34
+ [
35
+ 1,
36
+ 3,
37
+ 5
38
+ ]
39
+ ],
40
+ "upsample_rates": [
41
+ 8,
42
+ 8,
43
+ 2,
44
+ 2
45
+ ],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [
48
+ 16,
49
+ 16,
50
+ 4,
51
+ 4
52
+ ],
53
+ "n_layers_q": 3,
54
+ "use_spectral_norm": false,
55
+ "gin_channels": 256
56
+ }
57
+ }
checkpoints_v2/.DS_Store ADDED
Binary file (6.15 kB). View file
 
checkpoints_v2/converter/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
3
+ size 131320490
checkpoints_v2/converter/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_version_": "v2",
3
+ "data": {
4
+ "sampling_rate": 22050,
5
+ "filter_length": 1024,
6
+ "hop_length": 256,
7
+ "win_length": 1024,
8
+ "n_speakers": 0
9
+ },
10
+ "model": {
11
+ "zero_g": true,
12
+ "inter_channels": 192,
13
+ "hidden_channels": 192,
14
+ "filter_channels": 768,
15
+ "n_heads": 2,
16
+ "n_layers": 6,
17
+ "kernel_size": 3,
18
+ "p_dropout": 0.1,
19
+ "resblock": "1",
20
+ "resblock_kernel_sizes": [
21
+ 3,
22
+ 7,
23
+ 11
24
+ ],
25
+ "resblock_dilation_sizes": [
26
+ [
27
+ 1,
28
+ 3,
29
+ 5
30
+ ],
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ]
41
+ ],
42
+ "upsample_rates": [
43
+ 8,
44
+ 8,
45
+ 2,
46
+ 2
47
+ ],
48
+ "upsample_initial_channel": 512,
49
+ "upsample_kernel_sizes": [
50
+ 16,
51
+ 16,
52
+ 4,
53
+ 4
54
+ ],
55
+ "gin_channels": 256
56
+ }
57
+ }
demo_part1.ipynb ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Voice Style Control Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 17,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stdout",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "CPU times: user 15 µs, sys: 1e+03 ns, total: 16 µs\n",
22
+ "Wall time: 18.8 µs\n"
23
+ ]
24
+ }
25
+ ],
26
+ "source": [
27
+ "%%time\n",
28
+ "import os\n",
29
+ "import torch\n",
30
+ "from openvoice import se_extractor\n",
31
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "15116b59",
37
+ "metadata": {},
38
+ "source": [
39
+ "### Initialization"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 18,
45
+ "id": "aacad912",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
53
+ "missing/unexpected keys: [] []\n",
54
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
55
+ "missing/unexpected keys: [] []\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
61
+ "ckpt_converter = 'checkpoints/converter'\n",
62
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
63
+ "output_dir = 'outputs'\n",
64
+ "\n",
65
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
66
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
67
+ "\n",
68
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
69
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
70
+ "\n",
71
+ "os.makedirs(output_dir, exist_ok=True)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "markdown",
76
+ "id": "7f67740c",
77
+ "metadata": {},
78
+ "source": [
79
+ "### Obtain Tone Color Embedding"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "id": "f8add279",
85
+ "metadata": {},
86
+ "source": [
87
+ "The `source_se` is the tone color embedding of the base speaker. \n",
88
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
89
+ "the readers feel free to extract `source_se` by themselves."
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": 19,
95
+ "id": "63ff6273",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "id": "4f71fcc3",
105
+ "metadata": {},
106
+ "source": [
107
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 20,
113
+ "id": "55105eae",
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "OpenVoice version: v1\n",
121
+ "[(0.0, 19.278375)]\n",
122
+ "after vad: dur = 19.27798185941043\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
128
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "id": "a40284aa",
134
+ "metadata": {},
135
+ "source": [
136
+ "### Inference"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 21,
142
+ "id": "73dc1259",
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "name": "stdout",
147
+ "output_type": "stream",
148
+ "text": [
149
+ " > Text splitted to sentences.\n",
150
+ "This audio is generated by OpenVoice.\n",
151
+ " > ===========================\n",
152
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
153
+ " length:45\n",
154
+ " length:45\n"
155
+ ]
156
+ }
157
+ ],
158
+ "source": [
159
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
160
+ "\n",
161
+ "# Run the base speaker tts\n",
162
+ "text = \"This audio is generated by OpenVoice.\"\n",
163
+ "src_path = f'{output_dir}/tmp.wav'\n",
164
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
165
+ "\n",
166
+ "# Run the tone color converter\n",
167
+ "encode_message = \"@MyShell\"\n",
168
+ "tone_color_converter.convert(\n",
169
+ " audio_src_path=src_path, \n",
170
+ " src_se=source_se, \n",
171
+ " tgt_se=target_se, \n",
172
+ " output_path=save_path,\n",
173
+ " message=encode_message)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "6e3ea28a",
179
+ "metadata": {},
180
+ "source": [
181
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 22,
187
+ "id": "fd022d38",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ " > Text splitted to sentences.\n",
195
+ "This audio is generated by OpenVoice.\n",
196
+ " > ===========================\n",
197
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
198
+ " length:45\n",
199
+ " length:45\n"
200
+ ]
201
+ }
202
+ ],
203
+ "source": [
204
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
205
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
206
+ "\n",
207
+ "# Run the base speaker tts\n",
208
+ "text = \"This audio is generated by OpenVoice.\"\n",
209
+ "src_path = f'{output_dir}/tmp.wav'\n",
210
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
211
+ "\n",
212
+ "# Run the tone color converter\n",
213
+ "encode_message = \"@MyShell\"\n",
214
+ "tone_color_converter.convert(\n",
215
+ " audio_src_path=src_path, \n",
216
+ " src_se=source_se, \n",
217
+ " tgt_se=target_se, \n",
218
+ " output_path=save_path,\n",
219
+ " message=encode_message)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "markdown",
224
+ "id": "5fcfc70b",
225
+ "metadata": {},
226
+ "source": [
227
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 23,
233
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
234
+ "metadata": {},
235
+ "outputs": [
236
+ {
237
+ "name": "stderr",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
241
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
249
+ "missing/unexpected keys: [] []\n"
250
+ ]
251
+ }
252
+ ],
253
+ "source": [
254
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
255
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
256
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
257
+ "\n",
258
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
259
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 24,
265
+ "id": "a71d1387",
266
+ "metadata": {},
267
+ "outputs": [
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ " > Text splitted to sentences.\n",
273
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
274
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
275
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
276
+ "上半身整体为灰色, 下半身为灰白色,\n",
277
+ "臀部和腋羽是十分显眼的白色.\n",
278
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
279
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
280
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
281
+ " > ===========================\n",
282
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
283
+ " length:199\n",
284
+ " length:197\n",
285
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
286
+ " length:100\n",
287
+ " length:100\n",
288
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
289
+ " length:83\n",
290
+ " length:83\n",
291
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
292
+ " length:80\n",
293
+ " length:80\n",
294
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
295
+ " length:63\n",
296
+ " length:63\n",
297
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
298
+ " length:85\n",
299
+ " length:83\n",
300
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
301
+ " length:79\n",
302
+ " length:79\n",
303
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
304
+ " length:111\n",
305
+ " length:109\n",
306
+ "CPU times: user 2min 41s, sys: 7.56 s, total: 2min 49s\n",
307
+ "Wall time: 29.7 s\n"
308
+ ]
309
+ }
310
+ ],
311
+ "source": [
312
+ "%%time\n",
313
+ "# Run the base speaker tts\n",
314
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
315
+ "src_path = f'{output_dir}/tmp.wav'\n",
316
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
317
+ "\n",
318
+ "# Run the tone color converter\n",
319
+ "encode_message = \"@MyShell\"\n",
320
+ "tone_color_converter.convert(\n",
321
+ " audio_src_path=src_path, \n",
322
+ " src_se=source_se, \n",
323
+ " tgt_se=target_se, \n",
324
+ " output_path=save_path,\n",
325
+ " message=encode_message)"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "markdown",
330
+ "id": "8e513094",
331
+ "metadata": {},
332
+ "source": [
333
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": []
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": null,
347
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": []
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": []
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": []
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": null,
371
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": []
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "interpreter": {
379
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
380
+ },
381
+ "kernelspec": {
382
+ "display_name": "Python 3 (ipykernel)",
383
+ "language": "python",
384
+ "name": "python3"
385
+ },
386
+ "language_info": {
387
+ "codemirror_mode": {
388
+ "name": "ipython",
389
+ "version": 3
390
+ },
391
+ "file_extension": ".py",
392
+ "mimetype": "text/x-python",
393
+ "name": "python",
394
+ "nbconvert_exporter": "python",
395
+ "pygments_lexer": "ipython3",
396
+ "version": "3.9.19"
397
+ }
398
+ },
399
+ "nbformat": 4,
400
+ "nbformat_minor": 5
401
+ }
demo_part2.ipynb ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "b6ee1ede",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Cross-Lingual Voice Clone Demo"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "b7f043ee",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "import torch\n",
20
+ "from openvoice import se_extractor\n",
21
+ "from openvoice.api import ToneColorConverter"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "15116b59",
27
+ "metadata": {},
28
+ "source": [
29
+ "### Initialization"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "aacad912",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "ckpt_converter = 'checkpoints/converter'\n",
40
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
41
+ "output_dir = 'outputs'\n",
42
+ "\n",
43
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
44
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
45
+ "\n",
46
+ "os.makedirs(output_dir, exist_ok=True)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "markdown",
51
+ "id": "3db80fcf",
52
+ "metadata": {},
53
+ "source": [
54
+ "In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. The users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place OpenAI key as `OPENAI_API_KEY=xxx`. We have also provided a Chinese base speaker model (see `demo_part1.ipynb`)."
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "3b245ca3",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "from openai import OpenAI\n",
65
+ "from dotenv import load_dotenv\n",
66
+ "\n",
67
+ "# Please create a file named .env and place your\n",
68
+ "# OpenAI key as OPENAI_API_KEY=xxx\n",
69
+ "load_dotenv() \n",
70
+ "\n",
71
+ "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))\n",
72
+ "\n",
73
+ "response = client.audio.speech.create(\n",
74
+ " model=\"tts-1\",\n",
75
+ " voice=\"nova\",\n",
76
+ " input=\"This audio will be used to extract the base speaker tone color embedding. \" + \\\n",
77
+ " \"Typically a very short audio should be sufficient, but increasing the audio \" + \\\n",
78
+ " \"length will also improve the output audio quality.\"\n",
79
+ ")\n",
80
+ "\n",
81
+ "response.stream_to_file(f\"{output_dir}/openai_source_output.mp3\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "id": "7f67740c",
87
+ "metadata": {},
88
+ "source": [
89
+ "### Obtain Tone Color Embedding"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "f8add279",
95
+ "metadata": {},
96
+ "source": [
97
+ "The `source_se` is the tone color embedding of the base speaker. \n",
98
+ "It is an average for multiple sentences with multiple emotions\n",
99
+ "of the base speaker. We directly provide the result here but\n",
100
+ "the readers feel free to extract `source_se` by themselves."
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "63ff6273",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "base_speaker = f\"{output_dir}/openai_source_output.mp3\"\n",
111
+ "source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)\n",
112
+ "\n",
113
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
114
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "markdown",
119
+ "id": "a40284aa",
120
+ "metadata": {},
121
+ "source": [
122
+ "### Inference"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "73dc1259",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "# Run the base speaker tts\n",
133
+ "text = [\n",
134
+ " \"MyShell is a decentralized and comprehensive platform for discovering, creating, and staking AI-native apps.\",\n",
135
+ " \"MyShell es una plataforma descentralizada y completa para descubrir, crear y apostar por aplicaciones nativas de IA.\",\n",
136
+ " \"MyShell est une plateforme décentralisée et complète pour découvrir, créer et miser sur des applications natives d'IA.\",\n",
137
+ " \"MyShell ist eine dezentralisierte und umfassende Plattform zum Entdecken, Erstellen und Staken von KI-nativen Apps.\",\n",
138
+ " \"MyShell è una piattaforma decentralizzata e completa per scoprire, creare e scommettere su app native di intelligenza artificiale.\",\n",
139
+ " \"MyShellは、AIネイティブアプリの発見、作成、およびステーキングのための分散型かつ包括的なプラットフォームです。\",\n",
140
+ " \"MyShell — это децентрализованная и всеобъемлющая платформа для обнаружения, создания и стейкинга AI-ориентированных приложений.\",\n",
141
+ " \"MyShell هي منصة لامركزية وشاملة لاكتشاف وإنشاء ورهان تطبيقات الذكاء الاصطناعي الأصلية.\",\n",
142
+ " \"MyShell是一个去中心化且全面的平台,用于发现、创建和投资AI原生应用程序。\",\n",
143
+ " \"MyShell एक विकेंद्रीकृत और व्यापक मंच है, जो AI-मूल ऐप्स की खोज, सृजन और स्टेकिंग के लिए है।\",\n",
144
+ " \"MyShell é uma plataforma descentralizada e abrangente para descobrir, criar e apostar em aplicativos nativos de IA.\"\n",
145
+ "]\n",
146
+ "src_path = f'{output_dir}/tmp.wav'\n",
147
+ "\n",
148
+ "for i, t in enumerate(text):\n",
149
+ "\n",
150
+ " response = client.audio.speech.create(\n",
151
+ " model=\"tts-1\",\n",
152
+ " voice=\"nova\",\n",
153
+ " input=t,\n",
154
+ " )\n",
155
+ "\n",
156
+ " response.stream_to_file(src_path)\n",
157
+ "\n",
158
+ " save_path = f'{output_dir}/output_crosslingual_{i}.wav'\n",
159
+ "\n",
160
+ " # Run the tone color converter\n",
161
+ " encode_message = \"@MyShell\"\n",
162
+ " tone_color_converter.convert(\n",
163
+ " audio_src_path=src_path, \n",
164
+ " src_se=source_se, \n",
165
+ " tgt_se=target_se, \n",
166
+ " output_path=save_path,\n",
167
+ " message=encode_message)"
168
+ ]
169
+ }
170
+ ],
171
+ "metadata": {
172
+ "interpreter": {
173
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
174
+ },
175
+ "kernelspec": {
176
+ "display_name": "Python 3 (ipykernel)",
177
+ "language": "python",
178
+ "name": "python3"
179
+ },
180
+ "language_info": {
181
+ "codemirror_mode": {
182
+ "name": "ipython",
183
+ "version": 3
184
+ },
185
+ "file_extension": ".py",
186
+ "mimetype": "text/x-python",
187
+ "name": "python",
188
+ "nbconvert_exporter": "python",
189
+ "pygments_lexer": "ipython3",
190
+ "version": "3.9.18"
191
+ }
192
+ },
193
+ "nbformat": 4,
194
+ "nbformat_minor": 5
195
+ }
demo_part3.ipynb ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 9,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import os\n",
17
+ "import torch\n",
18
+ "from openvoice import se_extractor\n",
19
+ "from openvoice.api import ToneColorConverter"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "metadata": {},
25
+ "source": [
26
+ "### Initialization\n",
27
+ "\n",
28
+ "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrate better robustness in some cases."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 10,
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stderr",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
41
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
42
+ ]
43
+ },
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "Loaded checkpoint 'checkpoints_v2/converter/checkpoint.pth'\n",
49
+ "missing/unexpected keys: [] []\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
55
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
56
+ "output_dir = 'outputs_v2'\n",
57
+ "\n",
58
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
59
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
60
+ "\n",
61
+ "os.makedirs(output_dir, exist_ok=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Obtain Tone Color Embedding\n",
69
+ "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from `checkpoints_v2/ses` folder."
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 11,
75
+ "metadata": {},
76
+ "outputs": [
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "OpenVoice version: v2\n"
82
+ ]
83
+ },
84
+ {
85
+ "data": {
86
+ "application/vnd.jupyter.widget-view+json": {
87
+ "model_id": "dc253b8bc6d34915bec3fa5b526b0348",
88
+ "version_major": 2,
89
+ "version_minor": 0
90
+ },
91
+ "text/plain": [
92
+ "Downloading vocabulary.txt: 0%| | 0.00/460k [00:00<?, ?B/s]"
93
+ ]
94
+ },
95
+ "metadata": {},
96
+ "output_type": "display_data"
97
+ },
98
+ {
99
+ "data": {
100
+ "application/vnd.jupyter.widget-view+json": {
101
+ "model_id": "7c82ae46811248e9abafdf3b901c19a1",
102
+ "version_major": 2,
103
+ "version_minor": 0
104
+ },
105
+ "text/plain": [
106
+ "Downloading tokenizer.json: 0%| | 0.00/2.20M [00:00<?, ?B/s]"
107
+ ]
108
+ },
109
+ "metadata": {},
110
+ "output_type": "display_data"
111
+ },
112
+ {
113
+ "data": {
114
+ "application/vnd.jupyter.widget-view+json": {
115
+ "model_id": "392369f8bd914110a4c7cffe457bda51",
116
+ "version_major": 2,
117
+ "version_minor": 0
118
+ },
119
+ "text/plain": [
120
+ "Downloading model.bin: 0%| | 0.00/1.53G [00:00<?, ?B/s]"
121
+ ]
122
+ },
123
+ "metadata": {},
124
+ "output_type": "display_data"
125
+ },
126
+ {
127
+ "data": {
128
+ "application/vnd.jupyter.widget-view+json": {
129
+ "model_id": "80894d63cbcf4d71a11b654eab6a1320",
130
+ "version_major": 2,
131
+ "version_minor": 0
132
+ },
133
+ "text/plain": [
134
+ "Downloading config.json: 0%| | 0.00/2.26k [00:00<?, ?B/s]"
135
+ ]
136
+ },
137
+ "metadata": {},
138
+ "output_type": "display_data"
139
+ },
140
+ {
141
+ "ename": "KeyboardInterrupt",
142
+ "evalue": "",
143
+ "output_type": "error",
144
+ "traceback": [
145
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
146
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
147
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
148
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/notebook.py:250\u001b[0m, in \u001b[0;36mtqdm_notebook.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 249\u001b[0m it \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__iter__\u001b[39m()\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m it:\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# return super(tqdm...) will not catch exception\u001b[39;00m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
149
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/std.py:1169\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1168\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdisable:\n\u001b[0;32m-> 1169\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m 1170\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
150
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:609\u001b[0m, in \u001b[0;36mExecutor.map.<locals>.result_iterator\u001b[0;34m()\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 609\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
151
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:441\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 441\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
152
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:312\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 312\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 313\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
153
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
154
+ "\nDuring handling of the above exception, another exception occurred:\n",
155
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
156
+ "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m reference_speaker \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresources/example_reference.mp3\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# This is the voice you want to clone\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m target_se, audio_name \u001b[38;5;241m=\u001b[39m \u001b[43mse_extractor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_se\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreference_speaker\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtone_color_converter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
157
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:146\u001b[0m, in \u001b[0;36mget_se\u001b[0;34m(audio_path, vc_model, target_dir, vad)\u001b[0m\n\u001b[1;32m 144\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m split_audio_vad(audio_path, target_dir\u001b[38;5;241m=\u001b[39mtarget_dir, audio_name\u001b[38;5;241m=\u001b[39maudio_name)\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 146\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m \u001b[43msplit_audio_whisper\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtarget_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m audio_segs \u001b[38;5;241m=\u001b[39m glob(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwavs_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/*.wav\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(audio_segs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
158
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:22\u001b[0m, in \u001b[0;36msplit_audio_whisper\u001b[0;34m(audio_path, audio_name, target_dir)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mglobal\u001b[39;00m model\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 22\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mWhisperModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcuda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompute_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfloat16\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m audio \u001b[38;5;241m=\u001b[39m AudioSegment\u001b[38;5;241m.\u001b[39mfrom_file(audio_path)\n\u001b[1;32m 24\u001b[0m max_len \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(audio)\n",
159
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/transcribe.py:122\u001b[0m, in \u001b[0;36mWhisperModel.__init__\u001b[0;34m(self, model_size_or_path, device, device_index, compute_type, cpu_threads, num_workers, download_root, local_files_only)\u001b[0m\n\u001b[1;32m 120\u001b[0m model_path \u001b[38;5;241m=\u001b[39m model_size_or_path\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 122\u001b[0m model_path \u001b[38;5;241m=\u001b[39m \u001b[43mdownload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_size_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_root\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m ctranslate2\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mWhisper(\n\u001b[1;32m 129\u001b[0m model_path,\n\u001b[1;32m 130\u001b[0m device\u001b[38;5;241m=\u001b[39mdevice,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m inter_threads\u001b[38;5;241m=\u001b[39mnum_workers,\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 137\u001b[0m tokenizer_file \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
160
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/utils.py:98\u001b[0m, in \u001b[0;36mdownload_model\u001b[0;34m(size_or_id, output_dir, local_files_only, cache_dir)\u001b[0m\n\u001b[1;32m 95\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcache_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m cache_dir\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 98\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhuggingface_hub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msnapshot_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 100\u001b[0m huggingface_hub\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mHfHubHTTPError,\n\u001b[1;32m 101\u001b[0m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mConnectionError,\n\u001b[1;32m 102\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[1;32m 103\u001b[0m logger \u001b[38;5;241m=\u001b[39m get_logger()\n",
161
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:118\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 116\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
162
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/_snapshot_download.py:239\u001b[0m, in \u001b[0;36msnapshot_download\u001b[0;34m(repo_id, repo_type, revision, endpoint, cache_dir, local_dir, local_dir_use_symlinks, library_name, library_version, user_agent, proxies, etag_timeout, resume_download, force_download, token, local_files_only, allow_patterns, ignore_patterns, max_workers, tqdm_class)\u001b[0m\n\u001b[1;32m 237\u001b[0m _inner_hf_hub_download(file)\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 239\u001b[0m \u001b[43mthread_map\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m \u001b[49m\u001b[43m_inner_hf_hub_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFetching \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m files\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_workers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# User can use its own tqdm class or the default one from `huggingface_hub.utils`\u001b[39;49;00m\n\u001b[1;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtqdm_class\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mhf_tqdm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
246\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m local_dir \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mrealpath(local_dir))\n",
163
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:69\u001b[0m, in \u001b[0;36mthread_map\u001b[0;34m(fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;124;03mEquivalent of `list(map(fn, *iterables))`\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mdriven by `concurrent.futures.ThreadPoolExecutor`.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m [default: max(32, cpu_count() + 4)].\u001b[39;00m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconcurrent\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfutures\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ThreadPoolExecutor\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_executor_map\u001b[49m\u001b[43m(\u001b[49m\u001b[43mThreadPoolExecutor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtqdm_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
164
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ensure_lock(tqdm_class, lock_name\u001b[38;5;241m=\u001b[39mlock_name) \u001b[38;5;28;01mas\u001b[39;00m lk:\n\u001b[1;32m 48\u001b[0m \u001b[38;5;66;03m# share lock in case workers are already using `tqdm`\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(tqdm_class(ex\u001b[38;5;241m.\u001b[39mmap(fn, \u001b[38;5;241m*\u001b[39miterables, chunksize\u001b[38;5;241m=\u001b[39mchunksize), \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n",
165
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:637\u001b[0m, in \u001b[0;36mExecutor.__exit__\u001b[0;34m(self, exc_type, exc_val, exc_tb)\u001b[0m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, exc_type, exc_val, exc_tb):\n\u001b[0;32m--> 637\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshutdown\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
166
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/thread.py:235\u001b[0m, in \u001b[0;36mThreadPoolExecutor.shutdown\u001b[0;34m(self, wait, cancel_futures)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wait:\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_threads:\n\u001b[0;32m--> 235\u001b[0m \u001b[43mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
167
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1060\u001b[0m, in \u001b[0;36mThread.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcannot join current thread\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1059\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1060\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_tstate_lock\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1061\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1062\u001b[0m \u001b[38;5;66;03m# the behavior of a negative timeout isn't documented, but\u001b[39;00m\n\u001b[1;32m 1063\u001b[0m \u001b[38;5;66;03m# historically .join(timeout=x) for x<0 has acted as if timeout=0\u001b[39;00m\n\u001b[1;32m 1064\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_for_tstate_lock(timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mmax\u001b[39m(timeout, \u001b[38;5;241m0\u001b[39m))\n",
168
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1080\u001b[0m, in \u001b[0;36mThread._wait_for_tstate_lock\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 1077\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1080\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mlock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblock\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1081\u001b[0m lock\u001b[38;5;241m.\u001b[39mrelease()\n\u001b[1;32m 1082\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop()\n",
169
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
170
+ ]
171
+ }
172
+ ],
173
+ "source": [
174
+ "\n",
175
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
176
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "markdown",
181
+ "metadata": {},
182
+ "source": [
183
+ "#### Use MeloTTS as Base Speakers\n",
184
+ "\n",
185
+ "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers. "
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": null,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "from melo.api import TTS\n",
195
+ "\n",
196
+ "texts = {\n",
197
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
198
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
199
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
200
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
201
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
202
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
203
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
204
+ "}\n",
205
+ "\n",
206
+ "\n",
207
+ "src_path = f'{output_dir}/tmp.wav'\n",
208
+ "\n",
209
+ "# Speed is adjustable\n",
210
+ "speed = 1.0\n",
211
+ "\n",
212
+ "for language, text in texts.items():\n",
213
+ " model = TTS(language=language, device=device)\n",
214
+ " speaker_ids = model.hps.data.spk2id\n",
215
+ " \n",
216
+ " for speaker_key in speaker_ids.keys():\n",
217
+ " speaker_id = speaker_ids[speaker_key]\n",
218
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
219
+ " \n",
220
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
221
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
222
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
223
+ "\n",
224
+ " # Run the tone color converter\n",
225
+ " encode_message = \"@MyShell\"\n",
226
+ " tone_color_converter.convert(\n",
227
+ " audio_src_path=src_path, \n",
228
+ " src_se=source_se, \n",
229
+ " tgt_se=target_se, \n",
230
+ " output_path=save_path,\n",
231
+ " message=encode_message)"
232
+ ]
233
+ }
234
+ ],
235
+ "metadata": {
236
+ "kernelspec": {
237
+ "display_name": "Python 3 (ipykernel)",
238
+ "language": "python",
239
+ "name": "python3"
240
+ },
241
+ "language_info": {
242
+ "codemirror_mode": {
243
+ "name": "ipython",
244
+ "version": 3
245
+ },
246
+ "file_extension": ".py",
247
+ "mimetype": "text/x-python",
248
+ "name": "python",
249
+ "nbconvert_exporter": "python",
250
+ "pygments_lexer": "ipython3",
251
+ "version": "3.9.19"
252
+ }
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 4
256
+ }
docs/QA.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Common Questions and Answers
2
+
3
+ ## General Comments
4
+
5
+ **OpenVoice is a Technology, not a Product**
6
+
7
+ Although it works on a majority of voices if used correctly, please do not expect it to work perfectly on every case, as it takes a lot of engineering effort to translate a technology to a stable product. The targeted users of this technology are developers and researchers, not end users. End users expect a perfect product. However, we are confident in saying that OpenVoice is the state-of-the-art among the source-available voice cloning technologies.
8
+
9
+ The contribution of OpenVoice is a versatile instant voice cloning technical approach, not a ready-to-use perfect voice cloning product. However, we firmly believe that by releasing OpenVoice, we can accelerate the open research community's progress on instant voice cloning, and someday in the future the free voice cloning methods will be as good as commercial ones.
10
+
11
+ ## Issues with Voice Quality
12
+
13
+ **Accent and Emotion of the Generated Voice is not Similar to the Reference Voice**
14
+
15
+ First of all, OpenVoice only clones the tone color of the reference speaker. It does NOT clone the accent or emotion. The accent and emotion are controlled by the base speaker TTS model, not cloned by the tone color converter (please refer to our [paper](https://arxiv.org/pdf/2312.01479.pdf) for technical details). If the user wants to change the accent or emotion of the output, they need to have a base speaker model with that accent. OpenVoice provides sufficient flexibility for users to integrate their own base speaker model into the framework by simply replacing the current base speaker we provided.
16
+
17
+ **Bad Audio Quality of the Generated Speech**
18
+
19
+ Please check the following:
20
+ - Is your reference audio clean enough without any background noise? You can find some high-quality reference speech [here](https://aiartes.com/voiceai)
21
+ - Is your audio too short?
22
+ - Does your audio contain speech from more than one person?
23
+ - Does the reference audio contain long blank sections?
24
+ - Did you name the reference audio the same name you used before but forgot to delete the `processed` folder?
25
+
26
+ ## Issues with Languages
27
+
28
+ **Support of Other Languages**
29
+
30
+ For multi-lingual and cross-lingual usage, please refer to [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb). OpenVoice supports any language as long as you have a base speaker in that language. The OpenVoice team already did the most difficult part (tone color converter training) for you. Base speaker TTS model is relatively easy to train, and multiple existing open-source repositories support it. If you don't want to train by yourself, simply use the OpenAI TTS model as the base speaker.
31
+
32
+ ## Issues with Installation
33
+ **Error Related to Silero**
34
+
35
+ When calling `get_vad_segments` from `se_extractor.py`, there should be a message like this:
36
+ ```
37
+ Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/user/.cache/torch/hub/master.zip
38
+ ```
39
+ The download would fail if your machine can not access github. Please download the zip from "https://github.com/snakers4/silero-vad/zipball/master" manually and unzip it to `/home/user/.cache/torch/hub/snakers4_silero-vad_master`. You can also see [this issue](https://github.com/myshell-ai/OpenVoice/issues/57) for solutions for other versions of silero.
docs/USAGE.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Usage
2
+
3
+ ## Table of Contents
4
+
5
+ - [Quick Use](#quick-use): directly use OpenVoice without installation.
6
+ - [Linux Install](#linux-install): for researchers and developers only.
7
+ - [V1](#openvoice-v1)
8
+ - [V2](#openvoice-v2)
9
+ - [Install on Other Platforms](#install-on-other-platforms): unofficial installation guide contributed by the community
10
+
11
+ ## Quick Use
12
+
13
+ The input speech audio of OpenVoice can be in **Any Language**. OpenVoice can clone the voice in that speech audio, and use the voice to speak in multiple languages. For quick use, we recommend you to try the already deployed services:
14
+
15
+ - [British English](https://app.myshell.ai/widget/vYjqae)
16
+ - [American English](https://app.myshell.ai/widget/nEFFJf)
17
+ - [Indian English](https://app.myshell.ai/widget/V3iYze)
18
+ - [Australian English](https://app.myshell.ai/widget/fM7JVf)
19
+ - [Spanish](https://app.myshell.ai/widget/NNFFVz)
20
+ - [French](https://app.myshell.ai/widget/z2uyUz)
21
+ - [Chinese](https://app.myshell.ai/widget/fU7nUz)
22
+ - [Japanese](https://app.myshell.ai/widget/IfIB3u)
23
+ - [Korean](https://app.myshell.ai/widget/q6ZjIn)
24
+
25
+ ## Minimal Demo
26
+
27
+ For users who want to quickly try OpenVoice and do not require high quality or stability, click any of the following links:
28
+
29
+ <div align="center">
30
+ <a href="https://app.myshell.ai/bot/z6Bvua/1702636181"><img src="../resources/myshell-hd.png" height="28"></a>
31
+ &nbsp;&nbsp;&nbsp;&nbsp;
32
+ <a href="https://huggingface.co/spaces/myshell-ai/OpenVoice"><img src="../resources/huggingface.png" height="32"></a>
33
+ </div>
34
+
35
+ ## Linux Install
36
+
37
+ This section is only for developers and researchers who are familiar with Linux, Python and PyTorch. Clone this repo, and run:
38
+
39
+ ```
40
+ conda create -n openvoice python=3.9
41
+ conda activate openvoice
42
+ git clone [email protected]:myshell-ai/OpenVoice.git
43
+ cd OpenVoice
44
+ pip install -e .
45
+ ```
46
+
47
+ Whether you are using V1 or V2, the installation above is the same.
48
+
49
+ ### OpenVoice V1
50
+
51
+ Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
52
+
53
+ **1. Flexible Voice Style Control.**
54
+ Please see [`demo_part1.ipynb`](../demo_part1.ipynb) for an example usage of how OpenVoice enables flexible style control over the cloned voice.
55
+
56
+ **2. Cross-Lingual Voice Cloning.**
57
+ Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example for languages seen or unseen in the MSML training set.
58
+
59
+ **3. Gradio Demo.** We provide a minimalist local gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the gradio demo. Launch a local gradio demo with `python -m openvoice_app --share`.
60
+
61
+ ### OpenVoice V2
62
+
63
+ Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
64
+
65
+ Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
66
+ ```
67
+ pip install git+https://github.com/myshell-ai/MeloTTS.git
68
+ python -m unidic download
69
+ ```
70
+
71
+ **Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. Now it natively supports English, Spanish, French, Chinese, Japanese and Korean.
72
+
73
+
74
+ ## Install on Other Platforms
75
+
76
+ This section provides the unofficial installation guides by open-source contributors in the community:
77
+
78
+ - Windows
79
+ - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
80
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
81
+ - Docker
82
+ - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
83
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
openvoice/__init__.py ADDED
File without changes
openvoice/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (173 Bytes). View file
 
openvoice/__pycache__/api.cpython-39.pyc ADDED
Binary file (7.31 kB). View file
 
openvoice/__pycache__/attentions.cpython-39.pyc ADDED
Binary file (11.1 kB). View file
 
openvoice/__pycache__/commons.cpython-39.pyc ADDED
Binary file (5.79 kB). View file
 
openvoice/__pycache__/mel_processing.cpython-39.pyc ADDED
Binary file (4.19 kB). View file
 
openvoice/__pycache__/models.cpython-39.pyc ADDED
Binary file (12.7 kB). View file
 
openvoice/__pycache__/modules.cpython-39.pyc ADDED
Binary file (13.1 kB). View file
 
openvoice/__pycache__/se_extractor.cpython-39.pyc ADDED
Binary file (4.14 kB). View file
 
openvoice/__pycache__/transforms.cpython-39.pyc ADDED
Binary file (3.94 kB). View file