diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..0e98056ead8b2b4ddd22a5b0719be234dbd12c0a
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ff406183521658183e61ba1e99f697ee0d8affd8 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+outputs/output_chinese.wav filter=lfs diff=lfs merge=lfs -text
+outputs/tmp.wav filter=lfs diff=lfs merge=lfs -text
+processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav filter=lfs diff=lfs merge=lfs -text
+processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4fb44eb8c1bcf553c0bfcad9d15267c8b1bcc9ec
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,13 @@
+__pycache__/
+.ipynb_checkpoints/
+processed
+outputs
+outputs_v2
+checkpoints
+checkpoints_v2
+trash
+examples*
+.env
+build
+*.egg-info/
+*.zip
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/OpenVoice.iml b/.idea/OpenVoice.iml
new file mode 100644
index 0000000000000000000000000000000000000000..8b8c395472a5a6b3598af42086e590417ace9933
--- /dev/null
+++ b/.idea/OpenVoice.iml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000000000000000000000000000000000000..29a01e7d1b48b51c55b92bfc3319b49326363eca
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c2cc92c34a730bd5df3b45c1ddf92150b6030ce1
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000000000000000000000000000000000000..969da6b97c730a4cf2c6da3047f1b92b88920e1f
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000000000000000000000000000000000000..83fc24f2489cd6ef57d36557a83b23e554df113e
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,81 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1719824154394
+
+
+ 1719824154394
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.ipynb_checkpoints/demo_part1-checkpoint.ipynb b/.ipynb_checkpoints/demo_part1-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..263f7ae84529507d67189762cd02f9d68cbaac54
--- /dev/null
+++ b/.ipynb_checkpoints/demo_part1-checkpoint.ipynb
@@ -0,0 +1,399 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b6ee1ede",
+ "metadata": {},
+ "source": [
+ "## Voice Style Control Demo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b7f043ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from openvoice import se_extractor\n",
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15116b59",
+ "metadata": {},
+ "source": [
+ "### Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "aacad912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n",
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
+ "ckpt_converter = 'checkpoints/converter'\n",
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "output_dir = 'outputs'\n",
+ "\n",
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
+ "\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f67740c",
+ "metadata": {},
+ "source": [
+ "### Obtain Tone Color Embedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8add279",
+ "metadata": {},
+ "source": [
+ "The `source_se` is the tone color embedding of the base speaker. \n",
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
+ "the readers feel free to extract `source_se` by themselves."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "63ff6273",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f71fcc3",
+ "metadata": {},
+ "source": [
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "55105eae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "OpenVoice version: v1\n",
+ "[(0.0, 19.278375)]\n",
+ "after vad: dur = 19.27798185941043\n"
+ ]
+ }
+ ],
+ "source": [
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a40284aa",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "73dc1259",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "This audio is generated by OpenVoice.\n",
+ " > ===========================\n",
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+ " length:45\n",
+ " length:45\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
+ "\n",
+ "# Run the base speaker tts\n",
+ "text = \"This audio is generated by OpenVoice.\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e3ea28a",
+ "metadata": {},
+ "source": [
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "fd022d38",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "This audio is generated by OpenVoice.\n",
+ " > ===========================\n",
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+ " length:45\n",
+ " length:45\n"
+ ]
+ }
+ ],
+ "source": [
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
+ "\n",
+ "# Run the base speaker tts\n",
+ "text = \"This audio is generated by OpenVoice.\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fcfc70b",
+ "metadata": {},
+ "source": [
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
+ "\n",
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "a71d1387",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
+ "上半身整体为灰色, 下半身为灰白色,\n",
+ "臀部和腋羽是十分显眼的白色.\n",
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
+ " > ===========================\n",
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
+ " length:199\n",
+ " length:197\n",
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
+ " length:100\n",
+ " length:100\n",
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
+ " length:83\n",
+ " length:83\n",
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
+ " length:80\n",
+ " length:80\n",
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
+ " length:63\n",
+ " length:63\n",
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
+ " length:85\n",
+ " length:83\n",
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
+ " length:79\n",
+ " length:79\n",
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
+ " length:111\n",
+ " length:109\n"
+ ]
+    }
+ ],
+ "source": [
+ "import time\n",
+ "\n",
+ "# 记录开始时间\n",
+ "start_time = time.time()\n",
+ "# Run the base speaker tts\n",
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)\n",
+ "# 记录结束时间\n",
+ "end_time = time.time\n",
+ "execution_time = end_time - start_time\n",
+ "print(f\"代码执行时间: {execution_time} 秒\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e513094",
+ "metadata": {},
+ "source": [
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/.ipynb_checkpoints/demo_part3-checkpoint.ipynb b/.ipynb_checkpoints/demo_part3-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..40a6f2fcffeb4656b5dddd3b29ef45675130558f
--- /dev/null
+++ b/.ipynb_checkpoints/demo_part3-checkpoint.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from openvoice import se_extractor\n",
+ "from openvoice.api import ToneColorConverter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Initialization\n",
+ "\n",
+ "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrate better robustness in some cases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "output_dir = 'outputs_v2'\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
+ "\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Obtain Tone Color Embedding\n",
+ "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be directly loaded from `checkpoints_v2/ses` folder."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Use MeloTTS as Base Speakers\n",
+ "\n",
+ "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, Korean. In the following example, we will use the models in MeloTTS as the base speakers. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from melo.api import TTS\n",
+ "\n",
+ "texts = {\n",
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
+ "}\n",
+ "\n",
+ "\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "\n",
+ "# Speed is adjustable\n",
+ "speed = 1.0\n",
+ "\n",
+ "for language, text in texts.items():\n",
+ " model = TTS(language=language, device=device)\n",
+ " speaker_ids = model.hps.data.spk2id\n",
+ " \n",
+ " for speaker_key in speaker_ids.keys():\n",
+ " speaker_id = speaker_ids[speaker_key]\n",
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
+ " \n",
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
+ "\n",
+ " # Run the tone color converter\n",
+ " encode_message = \"@MyShell\"\n",
+ " tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "melo",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2d29d020b993b9c21e30aa6fc6c97737f34594b8
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,7 @@
+Copyright 2024 MyShell.ai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/MyShell_OpenVoice.egg-info/PKG-INFO b/MyShell_OpenVoice.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..4cd345d233b298b501c85679a20179729cd6d0a7
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/PKG-INFO
@@ -0,0 +1,105 @@
+Metadata-Version: 2.1
+Name: MyShell-OpenVoice
+Version: 0.0.0
+Summary: Instant voice cloning by MyShell.
+Home-page: https://github.com/myshell-ai/OpenVoice
+Author: MyShell
+Author-email: ethan@myshell.ai
+License: MIT License
+Project-URL: Documentation, https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md
+Project-URL: Changes, https://github.com/myshell-ai/OpenVoice/releases
+Project-URL: Code, https://github.com/myshell-ai/OpenVoice
+Project-URL: Issue tracker, https://github.com/myshell-ai/OpenVoice/issues
+Keywords: text-to-speech,tts,voice-clone,zero-shot-tts
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: librosa==0.9.1
+Requires-Dist: faster-whisper==0.9.0
+Requires-Dist: pydub==0.25.1
+Requires-Dist: wavmark==0.0.3
+Requires-Dist: numpy==1.22.0
+Requires-Dist: eng_to_ipa==0.0.2
+Requires-Dist: inflect==7.0.0
+Requires-Dist: unidecode==1.3.7
+Requires-Dist: whisper-timestamped==1.14.2
+Requires-Dist: pypinyin==0.50.0
+Requires-Dist: cn2an==0.5.22
+Requires-Dist: jieba==0.42.1
+Requires-Dist: gradio==3.48.0
+Requires-Dist: langid==1.1.6
+
+
+
+
+
+[Paper](https://arxiv.org/abs/2312.01479) |
+[Website](https://research.myshell.ai/open-voice)
+
+
+
+## Introduction
+
+### OpenVoice V1
+
+As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
+
+**1. Accurate Tone Color Cloning.**
+OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
+
+**2. Flexible Voice Style Control.**
+OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
+
+**3. Zero-shot Cross-lingual Voice Cloning.**
+Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
+
+### OpenVoice V2
+
+In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
+
+**1. Better Audio Quality.**
+OpenVoice V2 adopts a different training strategy that delivers better audio quality.
+
+**2. Native Multi-lingual Support.**
+English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
+
+**3. Free Commercial Use.**
+Starting from April 2024, both V2 and V1 are released under the MIT License, free for commercial use.
+
+[Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
+
+OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. As of Nov 2023, the voice cloning model had been used tens of millions of times by users worldwide and witnessed explosive user growth on the platform.
+
+## Main Contributors
+
+- [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
+- [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
+- [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
+- [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
+
+## How to Use
+Please see [usage](docs/USAGE.md) for detailed instructions.
+
+## Common Issues
+
+Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
+
+## Join Our Community
+
+Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
+
+## Citation
+```
+@article{qin2023openvoice,
+ title={OpenVoice: Versatile Instant Voice Cloning},
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
+ journal={arXiv preprint arXiv:2312.01479},
+ year={2023}
+}
+```
+
+## License
+OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
+
+## Acknowledgements
+This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
diff --git a/MyShell_OpenVoice.egg-info/SOURCES.txt b/MyShell_OpenVoice.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..10c94f5f77cd910c01ba89751a6685e744f55e88
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/SOURCES.txt
@@ -0,0 +1,25 @@
+LICENSE
+README.md
+setup.py
+MyShell_OpenVoice.egg-info/PKG-INFO
+MyShell_OpenVoice.egg-info/SOURCES.txt
+MyShell_OpenVoice.egg-info/dependency_links.txt
+MyShell_OpenVoice.egg-info/not-zip-safe
+MyShell_OpenVoice.egg-info/requires.txt
+MyShell_OpenVoice.egg-info/top_level.txt
+openvoice/__init__.py
+openvoice/api.py
+openvoice/attentions.py
+openvoice/commons.py
+openvoice/mel_processing.py
+openvoice/models.py
+openvoice/modules.py
+openvoice/openvoice_app.py
+openvoice/se_extractor.py
+openvoice/transforms.py
+openvoice/utils.py
+openvoice/text/__init__.py
+openvoice/text/cleaners.py
+openvoice/text/english.py
+openvoice/text/mandarin.py
+openvoice/text/symbols.py
\ No newline at end of file
diff --git a/MyShell_OpenVoice.egg-info/dependency_links.txt b/MyShell_OpenVoice.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/MyShell_OpenVoice.egg-info/not-zip-safe b/MyShell_OpenVoice.egg-info/not-zip-safe
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/MyShell_OpenVoice.egg-info/requires.txt b/MyShell_OpenVoice.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..98373f32bbd12af2b0f83c8bf0f36a3fe900c304
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/requires.txt
@@ -0,0 +1,14 @@
+librosa==0.9.1
+faster-whisper==0.9.0
+pydub==0.25.1
+wavmark==0.0.3
+numpy==1.22.0
+eng_to_ipa==0.0.2
+inflect==7.0.0
+unidecode==1.3.7
+whisper-timestamped==1.14.2
+pypinyin==0.50.0
+cn2an==0.5.22
+jieba==0.42.1
+gradio==3.48.0
+langid==1.1.6
diff --git a/MyShell_OpenVoice.egg-info/top_level.txt b/MyShell_OpenVoice.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f8172ef8a5cf587e3dc7cbf237190784c7472aab
--- /dev/null
+++ b/MyShell_OpenVoice.egg-info/top_level.txt
@@ -0,0 +1 @@
+openvoice
diff --git a/README.md b/README.md
index c33f63b830add3cc94d5ebdedb75e3627923da4c..01302988043c828d217c2634443ac88cdce35be0 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,80 @@
---
-title: TestOpenVoice
-emoji: 💻
-colorFrom: indigo
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.37.2
+title: testOpenVoice
app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 3.48.0
---
+
+
+
+
+[Paper](https://arxiv.org/abs/2312.01479) |
+[Website](https://research.myshell.ai/open-voice)
+
+
+
+## Introduction
+
+### OpenVoice V1
+
+As we detailed in our [paper](https://arxiv.org/abs/2312.01479) and [website](https://research.myshell.ai/open-voice), the advantages of OpenVoice are three-fold:
+
+**1. Accurate Tone Color Cloning.**
+OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
+
+**2. Flexible Voice Style Control.**
+OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
+
+**3. Zero-shot Cross-lingual Voice Cloning.**
+Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
+
+### OpenVoice V2
+
+In April 2024, we released OpenVoice V2, which includes all features in V1 and has:
+
+**1. Better Audio Quality.**
+OpenVoice V2 adopts a different training strategy that delivers better audio quality.
+
+**2. Native Multi-lingual Support.**
+English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2.
+
+**3. Free Commercial Use.**
+Starting from April 2024, both V2 and V1 are released under the MIT License, free for commercial use.
+
+[Video](https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f)
+
+OpenVoice has been powering the instant voice cloning capability of [myshell.ai](https://app.myshell.ai/explore) since May 2023. As of Nov 2023, the voice cloning model had been used tens of millions of times by users worldwide and witnessed explosive user growth on the platform.
+
+## Main Contributors
+
+- [Zengyi Qin](https://www.qinzy.tech) at MIT and MyShell
+- [Wenliang Zhao](https://wl-zhao.github.io) at Tsinghua University
+- [Xumin Yu](https://yuxumin.github.io) at Tsinghua University
+- [Ethan Sun](https://twitter.com/ethan_myshell) at MyShell
+
+## How to Use
+Please see [usage](docs/USAGE.md) for detailed instructions.
+
+## Common Issues
+
+Please see [QA](docs/QA.md) for common questions and answers. We will regularly update the question and answer list.
+
+## Join Our Community
+
+Join our [Discord community](https://discord.gg/myshell) and select the `Developer` role upon joining to gain exclusive access to our developer-only channel! Don't miss out on valuable discussions and collaboration opportunities.
+
+## Citation
+```
+@article{qin2023openvoice,
+ title={OpenVoice: Versatile Instant Voice Cloning},
+ author={Qin, Zengyi and Zhao, Wenliang and Yu, Xumin and Sun, Xin},
+ journal={arXiv preprint arXiv:2312.01479},
+ year={2023}
+}
+```
+
+## License
+OpenVoice V1 and V2 are MIT Licensed. Free for both commercial and research use.
-Check out the configuration reference at https://huggingface.co./docs/hub/spaces-config-reference
+## Acknowledgements
+This implementation is based on several excellent projects, [TTS](https://github.com/coqui-ai/TTS), [VITS](https://github.com/jaywalnut310/vits), and [VITS2](https://github.com/daniilrobnikov/vits2). Thanks for their awesome work!
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..86941a6a84b2d5ee82ba03ece091555f6c3edd69
--- /dev/null
+++ b/app.py
@@ -0,0 +1,71 @@
+#### https://huggingface.co./docs
+# https://huggingface.co./spaces/gradio/asr
+import os
+
+import gradio as gr
+import torch
+
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
+
+ckpt_base = 'checkpoints/base_speakers/EN'
+ckpt_converter = 'checkpoints/converter'
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+output_dir = 'outputs'
+
+base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
+base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
+
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)
+os.makedirs(output_dir, exist_ok=True)
+reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone
+target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)
+save_path = f'{output_dir}/output_en_default.wav'
+
+# Run the base speaker tts
+text = "This audio is generated by OpenVoice."
+src_path = f'{output_dir}/tmp.wav'
+base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)
+
+# Run the tone color converter
+encode_message = "@MyShell"
+tone_color_converter.convert(
+ audio_src_path=src_path,
+ src_se=source_se,
+ tgt_se=target_se,
+ output_path=save_path,
+ message=encode_message)
+
+ckpt_base = 'checkpoints/base_speakers/ZH'
+base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)
+base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')
+
+source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)
+save_path = f'{output_dir}/output_chinese.wav'
+
+
+def audio_io(input_text: str):
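+    """Synthesize input_text with the Chinese base speaker, convert the tone color to the reference speaker, and return the path of the converted audio."""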
+ text = input_text
+ src_path = f'{output_dir}/tmp.wav'
+ base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)
+
+ # Run the tone color converter
+ encode_message = "@MyShell"
+ tone_color_converter.convert(
+ audio_src_path=src_path,
+ src_se=source_se,
+ tgt_se=target_se,
+ output_path=save_path,
+ message=encode_message)
+    return save_path  # return the tone-color-converted audio, not the intermediate base-TTS output
+
+
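+# Minimal Gradio UI: a text box as input, the cloned Chinese speech as audio output.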
+demo = gr.Interface(
+ fn=audio_io,
+ inputs=["text"],
+ outputs=["audio"],
+)
+
+demo.launch()
diff --git a/checkpoints/base_speakers/.DS_Store b/checkpoints/base_speakers/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..1150f2044ec08b34bfa39a6e6ef3b5ef22f85b5d
Binary files /dev/null and b/checkpoints/base_speakers/.DS_Store differ
diff --git a/checkpoints/base_speakers/EN/checkpoint.pth b/checkpoints/base_speakers/EN/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fb7c26af57011437a02ebb1c4fe8ed307cc30f21
--- /dev/null
+++ b/checkpoints/base_speakers/EN/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
+size 160467309
diff --git a/checkpoints/base_speakers/EN/config.json b/checkpoints/base_speakers/EN/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7309ad10eae3c160ea0ef44261372c4f3364587
--- /dev/null
+++ b/checkpoints/base_speakers/EN/config.json
@@ -0,0 +1,145 @@
+{
+ "data": {
+ "text_cleaners": [
+ "cjke_cleaners2"
+ ],
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "add_blank": true,
+ "cleaned_text": true,
+ "n_speakers": 10
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "n_layers_trans_flow": 3,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ },
+ "symbols": [
+ "_",
+ ",",
+ ".",
+ "!",
+ "?",
+ "-",
+ "~",
+ "\u2026",
+ "N",
+ "Q",
+ "a",
+ "b",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ "\u0251",
+ "\u00e6",
+ "\u0283",
+ "\u0291",
+ "\u00e7",
+ "\u026f",
+ "\u026a",
+ "\u0254",
+ "\u025b",
+ "\u0279",
+ "\u00f0",
+ "\u0259",
+ "\u026b",
+ "\u0265",
+ "\u0278",
+ "\u028a",
+ "\u027e",
+ "\u0292",
+ "\u03b8",
+ "\u03b2",
+ "\u014b",
+ "\u0266",
+ "\u207c",
+ "\u02b0",
+ "`",
+ "^",
+ "#",
+ "*",
+ "=",
+ "\u02c8",
+ "\u02cc",
+ "\u2192",
+ "\u2193",
+ "\u2191",
+ " "
+ ],
+ "speakers": {
+ "default": 1,
+ "whispering": 2,
+ "shouting": 3,
+ "excited": 4,
+ "cheerful": 5,
+ "terrified": 6,
+ "angry": 7,
+ "sad": 8,
+ "friendly": 9
+ }
+}
\ No newline at end of file
diff --git a/checkpoints/base_speakers/EN/en_default_se.pth b/checkpoints/base_speakers/EN/en_default_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..319d7eb4bee7b785a47f4e6191c2132dec12abcf
--- /dev/null
+++ b/checkpoints/base_speakers/EN/en_default_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
+size 1789
diff --git a/checkpoints/base_speakers/EN/en_style_se.pth b/checkpoints/base_speakers/EN/en_style_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c2fd50abf058f6ab65879395b62fb7e3c0289b47
--- /dev/null
+++ b/checkpoints/base_speakers/EN/en_style_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
+size 1783
diff --git a/checkpoints/base_speakers/ZH/checkpoint.pth b/checkpoints/base_speakers/ZH/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fcadb5c222e9ea92fc9ada4920249fc65cad1692
--- /dev/null
+++ b/checkpoints/base_speakers/ZH/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
+size 160467309
diff --git a/checkpoints/base_speakers/ZH/config.json b/checkpoints/base_speakers/ZH/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..130256092fb8ad00f938149bf8aa1a62aae30023
--- /dev/null
+++ b/checkpoints/base_speakers/ZH/config.json
@@ -0,0 +1,137 @@
+{
+ "data": {
+ "text_cleaners": [
+ "cjke_cleaners2"
+ ],
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "add_blank": true,
+ "cleaned_text": true,
+ "n_speakers": 10
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "n_layers_trans_flow": 3,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ },
+ "symbols": [
+ "_",
+ ",",
+ ".",
+ "!",
+ "?",
+ "-",
+ "~",
+ "\u2026",
+ "N",
+ "Q",
+ "a",
+ "b",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ "\u0251",
+ "\u00e6",
+ "\u0283",
+ "\u0291",
+ "\u00e7",
+ "\u026f",
+ "\u026a",
+ "\u0254",
+ "\u025b",
+ "\u0279",
+ "\u00f0",
+ "\u0259",
+ "\u026b",
+ "\u0265",
+ "\u0278",
+ "\u028a",
+ "\u027e",
+ "\u0292",
+ "\u03b8",
+ "\u03b2",
+ "\u014b",
+ "\u0266",
+ "\u207c",
+ "\u02b0",
+ "`",
+ "^",
+ "#",
+ "*",
+ "=",
+ "\u02c8",
+ "\u02cc",
+ "\u2192",
+ "\u2193",
+ "\u2191",
+ " "
+ ],
+ "speakers": {
+ "default": 0
+ }
+}
\ No newline at end of file
diff --git a/checkpoints/base_speakers/ZH/zh_default_se.pth b/checkpoints/base_speakers/ZH/zh_default_se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..471841ae84a31aae1c8e25c1ef4548b3e87a32bb
--- /dev/null
+++ b/checkpoints/base_speakers/ZH/zh_default_se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
+size 1789
diff --git a/checkpoints/converter/checkpoint.pth b/checkpoints/converter/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c38ff17666bae2bae4236f85bfe2284f4885b31a
--- /dev/null
+++ b/checkpoints/converter/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
+size 131327338
diff --git a/checkpoints/converter/config.json b/checkpoints/converter/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a163d4254b637e9fd489712db40c15aeacda169e
--- /dev/null
+++ b/checkpoints/converter/config.json
@@ -0,0 +1,57 @@
+{
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "n_layers_q": 3,
+ "use_spectral_norm": false,
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/checkpoints_v2/.DS_Store b/checkpoints_v2/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..f5cae345c16be71e4c7759b300f9fe5b9d0d351b
Binary files /dev/null and b/checkpoints_v2/.DS_Store differ
diff --git a/checkpoints_v2/converter/checkpoint.pth b/checkpoints_v2/converter/checkpoint.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fa2f9421735901fd3db22a904f07b5a591faad7d
--- /dev/null
+++ b/checkpoints_v2/converter/checkpoint.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9652c27e92b6b2a91632590ac9962ef7ae2b712e5c5b7f4c34ec55ee2b37ab9e
+size 131320490
diff --git a/checkpoints_v2/converter/config.json b/checkpoints_v2/converter/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e33566b0d976167bd5f15801ef7005d59143e2f
--- /dev/null
+++ b/checkpoints_v2/converter/config.json
@@ -0,0 +1,57 @@
+{
+ "_version_": "v2",
+ "data": {
+ "sampling_rate": 22050,
+ "filter_length": 1024,
+ "hop_length": 256,
+ "win_length": 1024,
+ "n_speakers": 0
+ },
+ "model": {
+ "zero_g": true,
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "upsample_rates": [
+ 8,
+ 8,
+ 2,
+ 2
+ ],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [
+ 16,
+ 16,
+ 4,
+ 4
+ ],
+ "gin_channels": 256
+ }
+}
\ No newline at end of file
diff --git a/demo_part1.ipynb b/demo_part1.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..fe37cfd8236afc96ceb782c0b71e7c56d8baad4a
--- /dev/null
+++ b/demo_part1.ipynb
@@ -0,0 +1,401 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b6ee1ede",
+ "metadata": {},
+ "source": [
+ "## Voice Style Control Demo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "b7f043ee",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 15 µs, sys: 1e+03 ns, total: 16 µs\n",
+ "Wall time: 18.8 µs\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "import os\n",
+ "import torch\n",
+ "from openvoice import se_extractor\n",
+ "from openvoice.api import BaseSpeakerTTS, ToneColorConverter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15116b59",
+ "metadata": {},
+ "source": [
+ "### Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "aacad912",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n",
+ "Loaded checkpoint 'checkpoints/converter/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "ckpt_base = 'checkpoints/base_speakers/EN'\n",
+ "ckpt_converter = 'checkpoints/converter'\n",
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "output_dir = 'outputs'\n",
+ "\n",
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
+ "\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f67740c",
+ "metadata": {},
+ "source": [
+ "### Obtain Tone Color Embedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8add279",
+ "metadata": {},
+ "source": [
+ "The `source_se` is the tone color embedding of the base speaker. \n",
+ "It is an average of multiple sentences generated by the base speaker. We directly provide the result here but\n",
+ "the readers feel free to extract `source_se` by themselves."
+ ]
+ },
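+  {
+   "cell_type": "markdown",
+   "id": "7c1e5a10",
+   "metadata": {},
+   "source": [
+    "*(Optional)* A minimal sketch of how one could extract `source_se` manually, assuming the same `se_extractor.get_se` call used elsewhere in this notebook: synthesize a few sentences with the base speaker, extract an embedding from each clip, and average the tensors. The sentences and filenames are illustrative only; the rest of the demo keeps using the provided `en_default_se.pth` loaded in the next cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c1e5a11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional, hypothetical sketch: the sentences and filenames below are examples only.\n",
+    "sentences = [\n",
+    "    'The quick brown fox jumps over the lazy dog.',\n",
+    "    'OpenVoice clones the tone color of a reference speaker.',\n",
+    "]\n",
+    "ses = []\n",
+    "for i, sentence in enumerate(sentences):\n",
+    "    wav_path = f'{output_dir}/base_sentence_{i}.wav'\n",
+    "    base_speaker_tts.tts(sentence, wav_path, speaker='default', language='English', speed=1.0)\n",
+    "    se, _ = se_extractor.get_se(wav_path, tone_color_converter, target_dir='processed', vad=True)\n",
+    "    ses.append(se)\n",
+    "my_source_se = torch.stack(ses).mean(dim=0)  # average tone color embedding of the base speaker"
+   ]
+  },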
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "63ff6273",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "source_se = torch.load(f'{ckpt_base}/en_default_se.pth').to(device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f71fcc3",
+ "metadata": {},
+ "source": [
+ "The `reference_speaker.mp3` below points to the short audio clip of the reference whose voice we want to clone. We provide an example here. If you use your own reference speakers, please **make sure each speaker has a unique filename.** The `se_extractor` will save the `targeted_se` using the filename of the audio and **will not automatically overwrite.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "55105eae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "OpenVoice version: v1\n",
+ "[(0.0, 19.278375)]\n",
+ "after vad: dur = 19.27798185941043\n"
+ ]
+ }
+ ],
+ "source": [
+ "reference_speaker = './resources/demo_speaker0.mp3' # This is the voice you want to clone\n",
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir='processed', vad=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a40284aa",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "73dc1259",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "This audio is generated by OpenVoice.\n",
+ " > ===========================\n",
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+ " length:45\n",
+ " length:45\n"
+ ]
+ }
+ ],
+ "source": [
+ "save_path = f'{output_dir}/output_en_default.wav'\n",
+ "\n",
+ "# Run the base speaker tts\n",
+ "text = \"This audio is generated by OpenVoice.\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='English', speed=1.0)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e3ea28a",
+ "metadata": {},
+ "source": [
+ "**Try with different styles and speed.** The style can be controlled by the `speaker` parameter in the `base_speaker_tts.tts` method. Available choices: friendly, cheerful, excited, sad, angry, terrified, shouting, whispering. Note that the tone color embedding need to be updated. The speed can be controlled by the `speed` parameter. Let's try whispering with speed 0.9."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "fd022d38",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "This audio is generated by OpenVoice.\n",
+ " > ===========================\n",
+ "ðɪs ˈɑdiˌoʊ ɪz ˈdʒɛnəɹˌeɪtɪd baɪ ˈoʊpən vɔɪs.\n",
+ " length:45\n",
+ " length:45\n"
+ ]
+ }
+ ],
+ "source": [
+ "source_se = torch.load(f'{ckpt_base}/en_style_se.pth').to(device)\n",
+ "save_path = f'{output_dir}/output_whispering.wav'\n",
+ "\n",
+ "# Run the base speaker tts\n",
+ "text = \"This audio is generated by OpenVoice.\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='whispering', language='English', speed=0.9)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fcfc70b",
+ "metadata": {},
+ "source": [
+ "**Try with different languages.** OpenVoice can achieve multi-lingual voice cloning by simply replace the base speaker. We provide an example with a Chinese base speaker here and we encourage the readers to try `demo_part2.ipynb` for a detailed demo."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "deff30a4-d430-4b4d-9772-b936f5b564c4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint 'checkpoints/base_speakers/ZH/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "ckpt_base = 'checkpoints/base_speakers/ZH'\n",
+ "base_speaker_tts = BaseSpeakerTTS(f'{ckpt_base}/config.json', device=device)\n",
+ "base_speaker_tts.load_ckpt(f'{ckpt_base}/checkpoint.pth')\n",
+ "\n",
+ "source_se = torch.load(f'{ckpt_base}/zh_default_se.pth').to(device)\n",
+ "save_path = f'{output_dir}/output_chinese.wav'\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "a71d1387",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " > Text splitted to sentences.\n",
+ "毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,\n",
+ "属于毛里求斯岛上两种特有种绣眼鸟之一,\n",
+ "另一种是更为稀少的毛里求斯绣眼鸟.\n",
+ "上半身整体为灰色, 下半身为灰白色,\n",
+ "臀部和腋羽是十分显眼的白色.\n",
+ "这种鸟栖息于次生林、森林和花园中[1].\n",
+ "它与留尼汪灰绣眼鸟亲缘关系很近,\n",
+ "曾经被认为是同种, 统称为马斯克林绣眼鸟[2]\n",
+ " > ===========================\n",
+ "mɑʊ↑t⁼ɑʊ↓↑ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑( ʃɥɛ↑miŋ↑,ts⁼eɪ↓oʊ→ɛ↑sɹ↓tʰi↓i↓a↓oʊ→pʰi→ɛ↑sɹ↓ ɛ↑mu↓eɪ→joʊ→a↓aɪ↓tʰi↓aɪ↓eɪ→ən→joʊ→ɛ↑sɹ↓) s`ɹ`↓ i→ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑kʰə→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ s`u↓↑ t⁼ə niɑʊ↓↑leɪ↓,\n",
+ " length:199\n",
+ " length:197\n",
+ "s`u↓↑ɥ↑ mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ t⁼ɑʊ↓↑s`ɑŋ↓ liɑŋ↓↑ts`⁼ʊŋ↓↑ tʰə↓joʊ↓↑ts`⁼ʊŋ↓↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ ts`⁼ɹ`→i→,\n",
+ " length:100\n",
+ " length:100\n",
+ "liŋ↓ i→ts`⁼ʊŋ↓↑ s`ɹ`↓ k⁼əŋ↓weɪ↑ ʃi→s`ɑʊ↓↑ t⁼ə mɑʊ↑li↓↑tʃʰjoʊ↑sɹ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑.\n",
+ " length:83\n",
+ " length:83\n",
+ "s`ɑŋ↓p⁼an↓s`ən→ ts`⁼əŋ↓↑tʰi↓↑ weɪ↓ xweɪ→sə↓, ʃja↓p⁼an↓s`ən→ weɪ↓ xweɪ→p⁼aɪ↑sə↓,\n",
+ " length:80\n",
+ " length:80\n",
+ "tʰwən↑p⁼u↓ xə↑ iɛ↓ɥ↓↑ s`ɹ`↓ s`ɹ`↑fən→ ʃjɛn↓↑jɛn↓↑ t⁼ə p⁼aɪ↑sə↓.\n",
+ " length:63\n",
+ " length:63\n",
+ "ts`⁼ə↓ts`⁼ʊŋ↓↑ niɑʊ↓↑ tʃʰi→ʃi→ ɥ↑ tsʰɹ↓s`əŋ→lin↑, sən→lin↑ xə↑ xwa→ɥæn↑ ts`⁼ʊŋ→[ i→].\n",
+ " length:85\n",
+ " length:83\n",
+ "tʰa→ ɥ↓↑ ljoʊ↑ni↑uɑŋ→ xweɪ→ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑ tʃʰin→ɥæn↑ k⁼wan→ʃi↓ xən↓↑tʃ⁼in↓,\n",
+ " length:79\n",
+ " length:79\n",
+ "tsʰəŋ↑tʃ⁼iŋ→ p⁼eɪ↓ ɹ`ən↓weɪ↑ s`ɹ`↓ tʰʊŋ↑ts`⁼ʊŋ↓↑, tʰʊŋ↓↑ts`ʰəŋ→ weɪ↓ ma↓↑sɹ→kʰə↓lin↑ ʃjoʊ↓ jɛn↓↑niɑʊ↓↑[ əɹ`↓].\n",
+ " length:111\n",
+ " length:109\n",
+ "CPU times: user 2min 41s, sys: 7.56 s, total: 2min 49s\n",
+ "Wall time: 29.7 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Run the base speaker tts\n",
+ "text = \"毛岛灰绣眼鸟(学名:Zosterops mauritianus)是一种绣眼鸟科绣眼鸟属的鸟类,属于毛里求斯岛上两种特有种绣眼鸟之一,另一种是更为稀少的毛里求斯绣眼鸟。上半身整体为灰色,下半身为灰白色,臀部和腋羽是十分显眼的白色。这种鸟栖息于次生林、森林和花园中[1]。它与留尼汪灰绣眼鸟亲缘关系很近,曾经被认为是同种,统称为马斯克林绣眼鸟[2]\"\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "base_speaker_tts.tts(text, src_path, speaker='default', language='Chinese', speed=1.0)\n",
+ "\n",
+ "# Run the tone color converter\n",
+ "encode_message = \"@MyShell\"\n",
+ "tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e513094",
+ "metadata": {},
+ "source": [
+ "**Tech for good.** For people who will deploy OpenVoice for public usage: We offer you the option to add watermark to avoid potential misuse. Please see the ToneColorConverter class. **MyShell reserves the ability to detect whether an audio is generated by OpenVoice**, no matter whether the watermark is added or not."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9628ffa1-1d60-4d1b-a9ed-619add064ebd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "377f4b72-dfca-4c58-8a5c-fea056538cc2",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31bf81ab-bac9-4996-8f47-8651052d713a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32a84a29-9515-4aaa-b4ad-3a530e8259f0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abd802ad-93ac-4db2-9ee5-0ad78b54e09e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demo_part2.ipynb b/demo_part2.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..cc61023622d71bf414d9f3ae6c958fbe777806b8
--- /dev/null
+++ b/demo_part2.ipynb
@@ -0,0 +1,195 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b6ee1ede",
+ "metadata": {},
+ "source": [
+ "## Cross-Lingual Voice Clone Demo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7f043ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from openvoice import se_extractor\n",
+ "from openvoice.api import ToneColorConverter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "15116b59",
+ "metadata": {},
+ "source": [
+ "### Initialization"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aacad912",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ckpt_converter = 'checkpoints/converter'\n",
+ "device=\"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "output_dir = 'outputs'\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
+ "\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3db80fcf",
+ "metadata": {},
+ "source": [
+    "In this demo, we will use OpenAI TTS as the base speaker to produce multi-lingual speech audio. Users can flexibly change the base speaker according to their own needs. Please create a file named `.env` and place your OpenAI key in it as `OPENAI_API_KEY=xxx`. We have also provided a Chinese base speaker model (see `demo_part1.ipynb`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b245ca3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from openai import OpenAI\n",
+ "from dotenv import load_dotenv\n",
+ "\n",
+ "# Please create a file named .env and place your\n",
+ "# OpenAI key as OPENAI_API_KEY=xxx\n",
+ "load_dotenv() \n",
+ "\n",
+ "client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\"))\n",
+ "\n",
+ "response = client.audio.speech.create(\n",
+ " model=\"tts-1\",\n",
+ " voice=\"nova\",\n",
+ " input=\"This audio will be used to extract the base speaker tone color embedding. \" + \\\n",
+ " \"Typically a very short audio should be sufficient, but increasing the audio \" + \\\n",
+ " \"length will also improve the output audio quality.\"\n",
+ ")\n",
+ "\n",
+ "response.stream_to_file(f\"{output_dir}/openai_source_output.mp3\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f67740c",
+ "metadata": {},
+ "source": [
+ "### Obtain Tone Color Embedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f8add279",
+ "metadata": {},
+ "source": [
+    "The `source_se` is the tone color embedding of the base speaker.\n",
+    "It is averaged over multiple sentences with multiple emotions\n",
+    "of the base speaker. We provide the result directly here, but\n",
+    "readers are free to extract `source_se` by themselves."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63ff6273",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "base_speaker = f\"{output_dir}/openai_source_output.mp3\"\n",
+ "source_se, audio_name = se_extractor.get_se(base_speaker, tone_color_converter, vad=True)\n",
+ "\n",
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a40284aa",
+ "metadata": {},
+ "source": [
+ "### Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "73dc1259",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run the base speaker tts\n",
+ "text = [\n",
+ " \"MyShell is a decentralized and comprehensive platform for discovering, creating, and staking AI-native apps.\",\n",
+ " \"MyShell es una plataforma descentralizada y completa para descubrir, crear y apostar por aplicaciones nativas de IA.\",\n",
+ " \"MyShell est une plateforme décentralisée et complète pour découvrir, créer et miser sur des applications natives d'IA.\",\n",
+ " \"MyShell ist eine dezentralisierte und umfassende Plattform zum Entdecken, Erstellen und Staken von KI-nativen Apps.\",\n",
+ " \"MyShell è una piattaforma decentralizzata e completa per scoprire, creare e scommettere su app native di intelligenza artificiale.\",\n",
+ " \"MyShellは、AIネイティブアプリの発見、作成、およびステーキングのための分散型かつ包括的なプラットフォームです。\",\n",
+ " \"MyShell — это децентрализованная и всеобъемлющая платформа для обнаружения, создания и стейкинга AI-ориентированных приложений.\",\n",
+ " \"MyShell هي منصة لامركزية وشاملة لاكتشاف وإنشاء ورهان تطبيقات الذكاء الاصطناعي الأصلية.\",\n",
+ " \"MyShell是一个去中心化且全面的平台,用于发现、创建和投资AI原生应用程序。\",\n",
+ " \"MyShell एक विकेंद्रीकृत और व्यापक मंच है, जो AI-मूल ऐप्स की खोज, सृजन और स्टेकिंग के लिए है।\",\n",
+ " \"MyShell é uma plataforma descentralizada e abrangente para descobrir, criar e apostar em aplicativos nativos de IA.\"\n",
+ "]\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "\n",
+ "for i, t in enumerate(text):\n",
+ "\n",
+ " response = client.audio.speech.create(\n",
+ " model=\"tts-1\",\n",
+ " voice=\"nova\",\n",
+ " input=t,\n",
+ " )\n",
+ "\n",
+ " response.stream_to_file(src_path)\n",
+ "\n",
+ " save_path = f'{output_dir}/output_crosslingual_{i}.wav'\n",
+ "\n",
+ " # Run the tone color converter\n",
+ " encode_message = \"@MyShell\"\n",
+ " tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "9d70c38e1c0b038dbdffdaa4f8bfa1f6767c43760905c87a9fbe7800d18c6c35"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demo_part3.ipynb b/demo_part3.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..959039c0cdac1733bf385764b266f370eb4b43ca
--- /dev/null
+++ b/demo_part3.ipynb
@@ -0,0 +1,256 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-Accent and Multi-Lingual Voice Clone Demo with MeloTTS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import torch\n",
+ "from openvoice import se_extractor\n",
+ "from openvoice.api import ToneColorConverter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Initialization\n",
+ "\n",
+    "In this example, we will use the checkpoints from OpenVoiceV2. OpenVoiceV2 is trained with more aggressive augmentations and thus demonstrates better robustness in some cases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/russell/miniconda3/envs/openvoice/lib/python3.9/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
+ " warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loaded checkpoint 'checkpoints_v2/converter/checkpoint.pth'\n",
+ "missing/unexpected keys: [] []\n"
+ ]
+ }
+ ],
+ "source": [
+ "ckpt_converter = 'checkpoints_v2/converter'\n",
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "output_dir = 'outputs_v2'\n",
+ "\n",
+ "tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)\n",
+ "tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')\n",
+ "\n",
+ "os.makedirs(output_dir, exist_ok=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Obtain Tone Color Embedding\n",
+    "We only extract the tone color embedding for the target speaker. The source tone color embeddings can be loaded directly from the `checkpoints_v2/ses` folder."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "OpenVoice version: v2\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "dc253b8bc6d34915bec3fa5b526b0348",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading vocabulary.txt: 0%| | 0.00/460k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7c82ae46811248e9abafdf3b901c19a1",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading tokenizer.json: 0%| | 0.00/2.20M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "392369f8bd914110a4c7cffe457bda51",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading model.bin: 0%| | 0.00/1.53G [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "80894d63cbcf4d71a11b654eab6a1320",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading config.json: 0%| | 0.00/2.26k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/notebook.py:250\u001b[0m, in \u001b[0;36mtqdm_notebook.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 249\u001b[0m it \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__iter__\u001b[39m()\n\u001b[0;32m--> 250\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m it:\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# return super(tqdm...) will not catch exception\u001b[39;00m\n\u001b[1;32m 252\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/std.py:1169\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1168\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdisable:\n\u001b[0;32m-> 1169\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m 1170\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m obj\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:609\u001b[0m, in \u001b[0;36mExecutor.map..result_iterator\u001b[0;34m()\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 609\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:441\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 441\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:312\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 312\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 313\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m reference_speaker \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mresources/example_reference.mp3\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;66;03m# This is the voice you want to clone\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m target_se, audio_name \u001b[38;5;241m=\u001b[39m \u001b[43mse_extractor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_se\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreference_speaker\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtone_color_converter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:146\u001b[0m, in \u001b[0;36mget_se\u001b[0;34m(audio_path, vc_model, target_dir, vad)\u001b[0m\n\u001b[1;32m 144\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m split_audio_vad(audio_path, target_dir\u001b[38;5;241m=\u001b[39mtarget_dir, audio_name\u001b[38;5;241m=\u001b[39maudio_name)\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 146\u001b[0m wavs_folder \u001b[38;5;241m=\u001b[39m \u001b[43msplit_audio_whisper\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtarget_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maudio_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m audio_segs \u001b[38;5;241m=\u001b[39m glob(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mwavs_folder\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/*.wav\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(audio_segs) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
+ "File \u001b[0;32m~/Desktop/seamless_communication_test/OpenVoice/openvoice/se_extractor.py:22\u001b[0m, in \u001b[0;36msplit_audio_whisper\u001b[0;34m(audio_path, audio_name, target_dir)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mglobal\u001b[39;00m model\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 22\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mWhisperModel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcuda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcompute_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfloat16\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m audio \u001b[38;5;241m=\u001b[39m AudioSegment\u001b[38;5;241m.\u001b[39mfrom_file(audio_path)\n\u001b[1;32m 24\u001b[0m max_len \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(audio)\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/transcribe.py:122\u001b[0m, in \u001b[0;36mWhisperModel.__init__\u001b[0;34m(self, model_size_or_path, device, device_index, compute_type, cpu_threads, num_workers, download_root, local_files_only)\u001b[0m\n\u001b[1;32m 120\u001b[0m model_path \u001b[38;5;241m=\u001b[39m model_size_or_path\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 122\u001b[0m model_path \u001b[38;5;241m=\u001b[39m \u001b[43mdownload_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 123\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_size_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_root\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel \u001b[38;5;241m=\u001b[39m ctranslate2\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mWhisper(\n\u001b[1;32m 129\u001b[0m model_path,\n\u001b[1;32m 130\u001b[0m device\u001b[38;5;241m=\u001b[39mdevice,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 134\u001b[0m inter_threads\u001b[38;5;241m=\u001b[39mnum_workers,\n\u001b[1;32m 135\u001b[0m )\n\u001b[1;32m 137\u001b[0m tokenizer_file \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(model_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer.json\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/faster_whisper/utils.py:98\u001b[0m, in \u001b[0;36mdownload_model\u001b[0;34m(size_or_id, output_dir, local_files_only, cache_dir)\u001b[0m\n\u001b[1;32m 95\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcache_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m cache_dir\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 98\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhuggingface_hub\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msnapshot_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[1;32m 100\u001b[0m huggingface_hub\u001b[38;5;241m.\u001b[39mutils\u001b[38;5;241m.\u001b[39mHfHubHTTPError,\n\u001b[1;32m 101\u001b[0m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mConnectionError,\n\u001b[1;32m 102\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[1;32m 103\u001b[0m logger \u001b[38;5;241m=\u001b[39m get_logger()\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:118\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 116\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/huggingface_hub/_snapshot_download.py:239\u001b[0m, in \u001b[0;36msnapshot_download\u001b[0;34m(repo_id, repo_type, revision, endpoint, cache_dir, local_dir, local_dir_use_symlinks, library_name, library_version, user_agent, proxies, etag_timeout, resume_download, force_download, token, local_files_only, allow_patterns, ignore_patterns, max_workers, tqdm_class)\u001b[0m\n\u001b[1;32m 237\u001b[0m _inner_hf_hub_download(file)\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 239\u001b[0m \u001b[43mthread_map\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m \u001b[49m\u001b[43m_inner_hf_hub_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mFetching \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfiltered_repo_files\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m files\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_workers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# User can use its own tqdm class or the default one from `huggingface_hub.utils`\u001b[39;49;00m\n\u001b[1;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtqdm_class\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mhf_tqdm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 246\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m local_dir \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mrealpath(local_dir))\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:69\u001b[0m, in \u001b[0;36mthread_map\u001b[0;34m(fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;124;03mEquivalent of `list(map(fn, *iterables))`\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;124;03mdriven by `concurrent.futures.ThreadPoolExecutor`.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;124;03m [default: max(32, cpu_count() + 4)].\u001b[39;00m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconcurrent\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfutures\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ThreadPoolExecutor\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_executor_map\u001b[49m\u001b[43m(\u001b[49m\u001b[43mThreadPoolExecutor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtqdm_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ensure_lock(tqdm_class, lock_name\u001b[38;5;241m=\u001b[39mlock_name) \u001b[38;5;28;01mas\u001b[39;00m lk:\n\u001b[1;32m 48\u001b[0m \u001b[38;5;66;03m# share lock in case workers are already using `tqdm`\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m 50\u001b[0m initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(tqdm_class(ex\u001b[38;5;241m.\u001b[39mmap(fn, \u001b[38;5;241m*\u001b[39miterables, chunksize\u001b[38;5;241m=\u001b[39mchunksize), \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/_base.py:637\u001b[0m, in \u001b[0;36mExecutor.__exit__\u001b[0;34m(self, exc_type, exc_val, exc_tb)\u001b[0m\n\u001b[1;32m 636\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, exc_type, exc_val, exc_tb):\n\u001b[0;32m--> 637\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshutdown\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwait\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 638\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/concurrent/futures/thread.py:235\u001b[0m, in \u001b[0;36mThreadPoolExecutor.shutdown\u001b[0;34m(self, wait, cancel_futures)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m wait:\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_threads:\n\u001b[0;32m--> 235\u001b[0m \u001b[43mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1060\u001b[0m, in \u001b[0;36mThread.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcannot join current thread\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1059\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1060\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait_for_tstate_lock\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1061\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1062\u001b[0m \u001b[38;5;66;03m# the behavior of a negative timeout isn't documented, but\u001b[39;00m\n\u001b[1;32m 1063\u001b[0m \u001b[38;5;66;03m# historically .join(timeout=x) for x<0 has acted as if timeout=0\u001b[39;00m\n\u001b[1;32m 1064\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_wait_for_tstate_lock(timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mmax\u001b[39m(timeout, \u001b[38;5;241m0\u001b[39m))\n",
+ "File \u001b[0;32m~/miniconda3/envs/openvoice/lib/python3.9/threading.py:1080\u001b[0m, in \u001b[0;36mThread._wait_for_tstate_lock\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 1077\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 1079\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1080\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mlock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblock\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1081\u001b[0m lock\u001b[38;5;241m.\u001b[39mrelease()\n\u001b[1;32m 1082\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_stop()\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "reference_speaker = 'resources/example_reference.mp3' # This is the voice you want to clone\n",
+ "target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, vad=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Use MeloTTS as Base Speakers\n",
+ "\n",
+    "MeloTTS is a high-quality multi-lingual text-to-speech library by @MyShell.ai, supporting languages including English (American, British, Indian, Australian, Default), Spanish, French, Chinese, Japanese, and Korean. In the following example, we will use the models in MeloTTS as the base speakers."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from melo.api import TTS\n",
+ "\n",
+ "texts = {\n",
+ " 'EN_NEWEST': \"Did you ever hear a folk tale about a giant turtle?\", # The newest English base speaker model\n",
+ " 'EN': \"Did you ever hear a folk tale about a giant turtle?\",\n",
+ " 'ES': \"El resplandor del sol acaricia las olas, pintando el cielo con una paleta deslumbrante.\",\n",
+ " 'FR': \"La lueur dorée du soleil caresse les vagues, peignant le ciel d'une palette éblouissante.\",\n",
+ " 'ZH': \"在这次vacation中,我们计划去Paris欣赏埃菲尔铁塔和卢浮宫的美景。\",\n",
+ " 'JP': \"彼は毎朝ジョギングをして体を健康に保っています。\",\n",
+ " 'KR': \"안녕하세요! 오늘은 날씨가 정말 좋네요.\",\n",
+ "}\n",
+ "\n",
+ "\n",
+ "src_path = f'{output_dir}/tmp.wav'\n",
+ "\n",
+ "# Speed is adjustable\n",
+ "speed = 1.0\n",
+ "\n",
+ "for language, text in texts.items():\n",
+ " model = TTS(language=language, device=device)\n",
+ " speaker_ids = model.hps.data.spk2id\n",
+ " \n",
+ " for speaker_key in speaker_ids.keys():\n",
+ " speaker_id = speaker_ids[speaker_key]\n",
+ " speaker_key = speaker_key.lower().replace('_', '-')\n",
+ " \n",
+ " source_se = torch.load(f'checkpoints_v2/base_speakers/ses/{speaker_key}.pth', map_location=device)\n",
+ " model.tts_to_file(text, speaker_id, src_path, speed=speed)\n",
+ " save_path = f'{output_dir}/output_v2_{speaker_key}.wav'\n",
+ "\n",
+ " # Run the tone color converter\n",
+ " encode_message = \"@MyShell\"\n",
+ " tone_color_converter.convert(\n",
+ " audio_src_path=src_path, \n",
+ " src_se=source_se, \n",
+ " tgt_se=target_se, \n",
+ " output_path=save_path,\n",
+ " message=encode_message)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/QA.md b/docs/QA.md
new file mode 100644
index 0000000000000000000000000000000000000000..4736d1cc0bb67f97762e9f97574a4e827e80144e
--- /dev/null
+++ b/docs/QA.md
@@ -0,0 +1,39 @@
+# Common Questions and Answers
+
+## General Comments
+
+**OpenVoice is a Technology, not a Product**
+
+Although it works on a majority of voices if used correctly, please do not expect it to work perfectly in every case, as it takes a lot of engineering effort to turn a technology into a stable product. The target users of this technology are developers and researchers, not end users, and end users expect a perfect product. However, we are confident in saying that OpenVoice is the state of the art among source-available voice cloning technologies.
+
+The contribution of OpenVoice is a versatile technical approach to instant voice cloning, not a ready-to-use, perfect voice cloning product. However, we firmly believe that by releasing OpenVoice we can accelerate the open research community's progress on instant voice cloning, and that someday free voice cloning methods will be as good as commercial ones.
+
+## Issues with Voice Quality
+
+**Accent and Emotion of the Generated Voice is not Similar to the Reference Voice**
+
+First of all, OpenVoice only clones the tone color of the reference speaker. It does NOT clone the accent or emotion. The accent and emotion are controlled by the base speaker TTS model, not cloned by the tone color converter (please refer to our [paper](https://arxiv.org/pdf/2312.01479.pdf) for technical details). If users want to change the accent or emotion of the output, they need a base speaker model with that accent. OpenVoice provides sufficient flexibility for users to integrate their own base speaker model into the framework by simply replacing the base speaker we currently provide, as sketched below.
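+
+As a rough sketch (not from the official docs), the flow looks like this. Here `my_base_tts` is a hypothetical stand-in for any TTS system that can write a wav file with the accent and emotion you want; the OpenVoice classes and paths follow the V1 demos.
+
+```
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+
+converter = ToneColorConverter('checkpoints/converter/config.json', device='cpu')
+converter.load_ckpt('checkpoints/converter/checkpoint.pth')
+
+# 1. Generate speech with your own base speaker (hypothetical `my_base_tts`);
+#    this step fixes the accent and emotion of the final output.
+my_base_tts.tts("Any text, in the accent and emotion you want.", 'outputs/base.wav')
+
+# 2. Extract tone color embeddings for the base speaker and the reference voice.
+source_se, _ = se_extractor.get_se('outputs/base.wav', converter, vad=True)
+target_se, _ = se_extractor.get_se('resources/example_reference.mp3', converter, vad=True)
+
+# 3. The converter only swaps tone color; accent and emotion stay as produced in step 1.
+converter.convert(audio_src_path='outputs/base.wav', src_se=source_se, tgt_se=target_se, output_path='outputs/output.wav')
+```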
+
+**Bad Audio Quality of the Generated Speech**
+
+Please check the following:
+- Is your reference audio clean enough, without any background noise? You can find some high-quality reference speech [here](https://aiartes.com/voiceai).
+- Is your audio too short?
+- Does your audio contain speech from more than one person?
+- Does the reference audio contain long blank sections?
+- Did you reuse a reference audio file name from a previous run but forget to delete the `processed` folder?
+
+## Issues with Languages
+
+**Support of Other Languages**
+
+For multi-lingual and cross-lingual usage, please refer to [`demo_part2.ipynb`](https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb). OpenVoice supports any language as long as you have a base speaker in that language. The OpenVoice team has already done the most difficult part (tone color converter training) for you. A base speaker TTS model is relatively easy to train, and multiple existing open-source repositories support it. If you don't want to train one yourself, simply use the OpenAI TTS model as the base speaker.
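+
+For example, a rough sketch (assuming `OPENAI_API_KEY` is set and that `tone_color_converter`, `source_se` and `target_se` are initialized as in `demo_part2.ipynb`):
+
+```
+from openai import OpenAI
+
+client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+# Any language the base speaker (here OpenAI TTS) can produce is supported.
+response = client.audio.speech.create(model="tts-1", voice="nova", input="<your text, in the target language>")
+response.stream_to_file('outputs/tmp.wav')
+
+tone_color_converter.convert(audio_src_path='outputs/tmp.wav', src_se=source_se, tgt_se=target_se, output_path='outputs/output_crosslingual.wav')
+```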
+
+## Issues with Installation
+**Error Related to Silero**
+
+When calling `get_vad_segments` from `se_extractor.py`, there should be a message like this:
+```
+Downloading: "https://github.com/snakers4/silero-vad/zipball/master" to /home/user/.cache/torch/hub/master.zip
+```
+The download will fail if your machine cannot access GitHub. Please download the zip from "https://github.com/snakers4/silero-vad/zipball/master" manually and unzip it to `/home/user/.cache/torch/hub/snakers4_silero-vad_master`. You can also see [this issue](https://github.com/myshell-ai/OpenVoice/issues/57) for solutions for other versions of Silero.
diff --git a/docs/USAGE.md b/docs/USAGE.md
new file mode 100644
index 0000000000000000000000000000000000000000..497a92dd05b568e84b4916ba9072968ebdc090b5
--- /dev/null
+++ b/docs/USAGE.md
@@ -0,0 +1,83 @@
+# Usage
+
+## Table of Contents
+
+- [Quick Use](#quick-use): directly use OpenVoice without installation.
+- [Linux Install](#linux-install): for researchers and developers only.
+ - [V1](#openvoice-v1)
+ - [V2](#openvoice-v2)
+- [Install on Other Platforms](#install-on-other-platforms): unofficial installation guides contributed by the community.
+
+## Quick Use
+
+The input speech audio for OpenVoice can be in **any language**. OpenVoice can clone the voice in that speech audio and use it to speak in multiple languages. For quick use, we recommend trying the already deployed services:
+
+- [British English](https://app.myshell.ai/widget/vYjqae)
+- [American English](https://app.myshell.ai/widget/nEFFJf)
+- [Indian English](https://app.myshell.ai/widget/V3iYze)
+- [Australian English](https://app.myshell.ai/widget/fM7JVf)
+- [Spanish](https://app.myshell.ai/widget/NNFFVz)
+- [French](https://app.myshell.ai/widget/z2uyUz)
+- [Chinese](https://app.myshell.ai/widget/fU7nUz)
+- [Japanese](https://app.myshell.ai/widget/IfIB3u)
+- [Korean](https://app.myshell.ai/widget/q6ZjIn)
+
+## Minimal Demo
+
+For users who want to quickly try OpenVoice and do not require high quality or stability, click any of the following links:
+
+
+
+
+
+
+
+## Linux Install
+
+This section is only for developers and researchers who are familiar with Linux, Python, and PyTorch. Clone this repo and run:
+
+```
+conda create -n openvoice python=3.9
+conda activate openvoice
+git clone git@github.com:myshell-ai/OpenVoice.git
+cd OpenVoice
+pip install -e .
+```
+
+The installation above is the same whether you are using V1 or V2.
+
+### OpenVoice V1
+
+Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip) and extract it to the `checkpoints` folder.
+
+**1. Flexible Voice Style Control.**
+Please see [`demo_part1.ipynb`](../demo_part1.ipynb) for an example usage of how OpenVoice enables flexible style control over the cloned voice.
+
+**2. Cross-Lingual Voice Cloning.**
+Please see [`demo_part2.ipynb`](../demo_part2.ipynb) for an example covering languages seen or unseen in the MSML training set.
+
+**3. Gradio Demo.** We provide a minimalist local Gradio demo here. We strongly suggest that users look into `demo_part1.ipynb`, `demo_part2.ipynb` and the [QnA](QA.md) if they run into issues with the Gradio demo. Launch a local Gradio demo with `python -m openvoice_app --share`.
+
+### OpenVoice V2
+
+Download the checkpoint from [here](https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip) and extract it to the `checkpoints_v2` folder.
+
+Install [MeloTTS](https://github.com/myshell-ai/MeloTTS):
+```
+pip install git+https://github.com/myshell-ai/MeloTTS.git
+python -m unidic download
+```
+
+**Demo Usage.** Please see [`demo_part3.ipynb`](../demo_part3.ipynb) for example usage of OpenVoice V2. It now natively supports English, Spanish, French, Chinese, Japanese, and Korean.
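+
+Condensed from [`demo_part3.ipynb`](../demo_part3.ipynb), the V2 flow is roughly the sketch below (paths follow the `checkpoints_v2` layout above; the newest English base speaker is used as an example):
+
+```
+import os
+import torch
+from melo.api import TTS
+from openvoice import se_extractor
+from openvoice.api import ToneColorConverter
+
+device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+os.makedirs('outputs_v2', exist_ok=True)
+
+converter = ToneColorConverter('checkpoints_v2/converter/config.json', device=device)
+converter.load_ckpt('checkpoints_v2/converter/checkpoint.pth')
+
+# Tone color embedding of the voice you want to clone
+target_se, _ = se_extractor.get_se('resources/example_reference.mp3', converter, vad=False)
+
+# MeloTTS base speaker; its source embedding ships with the V2 checkpoints
+model = TTS(language='EN_NEWEST', device=device)
+speaker_id = list(model.hps.data.spk2id.values())[0]
+source_se = torch.load('checkpoints_v2/base_speakers/ses/en-newest.pth', map_location=device)
+
+model.tts_to_file("Did you ever hear a folk tale about a giant turtle?", speaker_id, 'outputs_v2/tmp.wav', speed=1.0)
+converter.convert(audio_src_path='outputs_v2/tmp.wav', src_se=source_se, tgt_se=target_se, output_path='outputs_v2/output_v2_en-newest.wav', message="@MyShell")
+```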
+
+
+## Install on Other Platforms
+
+This section provides unofficial installation guides contributed by open-source community members:
+
+- Windows
+ - [Guide](https://github.com/Alienpups/OpenVoice/blob/main/docs/USAGE_WINDOWS.md) by [@Alienpups](https://github.com/Alienpups)
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
+- Docker
+ - [Guide](https://github.com/StevenJSCF/OpenVoice/blob/update-docs/docs/DF_USAGE.md) by [@StevenJSCF](https://github.com/StevenJSCF)
+ - You are welcome to contribute if you have a better installation guide. We will list you here.
diff --git a/openvoice/__init__.py b/openvoice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/openvoice/__pycache__/__init__.cpython-39.pyc b/openvoice/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11cee5b9b8107504d5ea442a5e0bc89682a76277
Binary files /dev/null and b/openvoice/__pycache__/__init__.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/api.cpython-39.pyc b/openvoice/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1edbd20b3113df62ac6e295d729671a88c42b79e
Binary files /dev/null and b/openvoice/__pycache__/api.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/attentions.cpython-39.pyc b/openvoice/__pycache__/attentions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d37befc7857e05462d91293a6dcad6620ebba4c
Binary files /dev/null and b/openvoice/__pycache__/attentions.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/commons.cpython-39.pyc b/openvoice/__pycache__/commons.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..855f63e8310b093c7ed926f5ffaf3fedbcbb8bca
Binary files /dev/null and b/openvoice/__pycache__/commons.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/mel_processing.cpython-39.pyc b/openvoice/__pycache__/mel_processing.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f2f1ef01062ee1913e3be5c63988bc1eef36bf1
Binary files /dev/null and b/openvoice/__pycache__/mel_processing.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/models.cpython-39.pyc b/openvoice/__pycache__/models.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aed08e02d950d83e82959a5d036281171c6ecd34
Binary files /dev/null and b/openvoice/__pycache__/models.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/modules.cpython-39.pyc b/openvoice/__pycache__/modules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b54c1d79813e9e5dde44d3d3bef5a7fa2b89f72a
Binary files /dev/null and b/openvoice/__pycache__/modules.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/se_extractor.cpython-39.pyc b/openvoice/__pycache__/se_extractor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7462f41add1a5f5e1e8939aa40a224b20ff00550
Binary files /dev/null and b/openvoice/__pycache__/se_extractor.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/transforms.cpython-39.pyc b/openvoice/__pycache__/transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..842fef26946204d549165e04f223f9fe3d12bffc
Binary files /dev/null and b/openvoice/__pycache__/transforms.cpython-39.pyc differ
diff --git a/openvoice/__pycache__/utils.cpython-39.pyc b/openvoice/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf965ae75bc5cc2cbc53b5edff8073b365db23a7
Binary files /dev/null and b/openvoice/__pycache__/utils.cpython-39.pyc differ
diff --git a/openvoice/api.py b/openvoice/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..48f7eebb098c7028edb65f004b9270a7aa11c520
--- /dev/null
+++ b/openvoice/api.py
@@ -0,0 +1,202 @@
+import torch
+import numpy as np
+import re
+import soundfile
+from openvoice import utils
+from openvoice import commons
+import os
+import librosa
+from openvoice.text import text_to_sequence
+from openvoice.mel_processing import spectrogram_torch
+from openvoice.models import SynthesizerTrn
+
+
+class OpenVoiceBaseClass(object):
+ def __init__(self,
+ config_path,
+ device='cuda:0'):
+ if 'cuda' in device:
+ assert torch.cuda.is_available()
+
+ hps = utils.get_hparams_from_file(config_path)
+
+ model = SynthesizerTrn(
+ len(getattr(hps, 'symbols', [])),
+ hps.data.filter_length // 2 + 1,
+ n_speakers=hps.data.n_speakers,
+ **hps.model,
+ ).to(device)
+
+ model.eval()
+ self.model = model
+ self.hps = hps
+ self.device = device
+
+ def load_ckpt(self, ckpt_path):
+ checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
+ a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
+ print("Loaded checkpoint '{}'".format(ckpt_path))
+ print('missing/unexpected keys:', a, b)
+
+
+class BaseSpeakerTTS(OpenVoiceBaseClass):
+ language_marks = {
+ "english": "EN",
+ "chinese": "ZH",
+ }
+
+ @staticmethod
+ def get_text(text, hps, is_symbol):
+ text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
+ if hps.data.add_blank:
+ text_norm = commons.intersperse(text_norm, 0)
+ text_norm = torch.LongTensor(text_norm)
+ return text_norm
+
+ @staticmethod
+ def audio_numpy_concat(segment_data_list, sr, speed=1.):
+ audio_segments = []
+ for segment_data in segment_data_list:
+ audio_segments += segment_data.reshape(-1).tolist()
+ audio_segments += [0] * int((sr * 0.05)/speed)
+ audio_segments = np.array(audio_segments).astype(np.float32)
+ return audio_segments
+
+ @staticmethod
+ def split_sentences_into_pieces(text, language_str):
+ texts = utils.split_sentence(text, language_str=language_str)
+        print(" > Text split into sentences.")
+ print('\n'.join(texts))
+ print(" > ===========================")
+ return texts
+
+ def tts(self, text, output_path, speaker, language='English', speed=1.0):
+ mark = self.language_marks.get(language.lower(), None)
+ assert mark is not None, f"language {language} is not supported"
+
+ texts = self.split_sentences_into_pieces(text, mark)
+
+ audio_list = []
+ for t in texts:
+ t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
+ t = f'[{mark}]{t}[{mark}]'
+ stn_tst = self.get_text(t, self.hps, False)
+ device = self.device
+ speaker_id = self.hps.speakers[speaker]
+ with torch.no_grad():
+ x_tst = stn_tst.unsqueeze(0).to(device)
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
+ sid = torch.LongTensor([speaker_id]).to(device)
+ audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
+ length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
+ audio_list.append(audio)
+ audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
+
+ if output_path is None:
+ return audio
+ else:
+ soundfile.write(output_path, audio, self.hps.data.sampling_rate)
+
+
+class ToneColorConverter(OpenVoiceBaseClass):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if kwargs.get('enable_watermark', True):
+ import wavmark
+ self.watermark_model = wavmark.load_model().to(self.device)
+ else:
+ self.watermark_model = None
+ self.version = getattr(self.hps, '_version_', "v1")
+
+ def extract_se(self, ref_wav_list, se_save_path=None):
+ if isinstance(ref_wav_list, str):
+ ref_wav_list = [ref_wav_list]
+
+ device = self.device
+ hps = self.hps
+ gs = []
+
+ for fname in ref_wav_list:
+ audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
+ y = torch.FloatTensor(audio_ref)
+ y = y.to(device)
+ y = y.unsqueeze(0)
+ y = spectrogram_torch(y, hps.data.filter_length,
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+ center=False).to(device)
+ with torch.no_grad():
+ g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
+ gs.append(g.detach())
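+        # Average the per-reference embeddings into a single tone color embedding.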
+ gs = torch.stack(gs).mean(0)
+
+ if se_save_path is not None:
+ os.makedirs(os.path.dirname(se_save_path), exist_ok=True)
+ torch.save(gs.cpu(), se_save_path)
+
+ return gs
+
+ def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
+ hps = self.hps
+ # load audio
+ audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
+ audio = torch.tensor(audio).float()
+
+ with torch.no_grad():
+ y = torch.FloatTensor(audio).to(self.device)
+ y = y.unsqueeze(0)
+ spec = spectrogram_torch(y, hps.data.filter_length,
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+ center=False).to(self.device)
+ spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
+ audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
+ 0, 0].data.cpu().float().numpy()
+ audio = self.add_watermark(audio, message)
+ if output_path is None:
+ return audio
+ else:
+ soundfile.write(output_path, audio, hps.data.sampling_rate)
+
+ def add_watermark(self, audio, message):
+ if self.watermark_model is None:
+ return audio
+ device = self.device
+ bits = utils.string_to_bits(message).reshape(-1)
+ n_repeat = len(bits) // 32
+
+ K = 16000
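+        # Each 32-bit block of the message is embedded into its own 16000-sample
+        # chunk (K), using every other chunk (coeff = 2) of the output audio.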
+ coeff = 2
+ for n in range(n_repeat):
+ trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+ if len(trunck) != K:
+                print('Audio too short, failed to add watermark')
+ break
+ message_npy = bits[n * 32: (n + 1) * 32]
+
+ with torch.no_grad():
+ signal = torch.FloatTensor(trunck).to(device)[None]
+ message_tensor = torch.FloatTensor(message_npy).to(device)[None]
+ signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
+ signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
+ audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
+ return audio
+
+ def detect_watermark(self, audio, n_repeat):
+ bits = []
+ K = 16000
+ coeff = 2
+ for n in range(n_repeat):
+ trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
+ if len(trunck) != K:
+                print('Audio too short, failed to detect watermark')
+ return 'Fail'
+ with torch.no_grad():
+ signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
+ message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
+ bits.append(message_decoded_npy)
+ bits = np.stack(bits).reshape(-1, 8)
+ message = utils.bits_to_string(bits)
+ return message
+
diff --git a/openvoice/attentions.py b/openvoice/attentions.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c410f07a6751bedfb94bb0d169f3f36f797ee45
--- /dev/null
+++ b/openvoice/attentions.py
@@ -0,0 +1,465 @@
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from openvoice import commons
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class Encoder(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size=1,
+ p_dropout=0.0,
+ window_size=4,
+ isflow=True,
+ **kwargs
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ # if isflow:
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
+ # self.gin_channels = 256
+ self.cond_layer_idx = self.n_layers
+ if "gin_channels" in kwargs:
+ self.gin_channels = kwargs["gin_channels"]
+ if self.gin_channels != 0:
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
+ # vits2 says 3rd block, so idx is 2 by default
+ self.cond_layer_idx = (
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
+ )
+ # logging.debug(self.gin_channels, self.cond_layer_idx)
+ assert (
+ self.cond_layer_idx < self.n_layers
+ ), "cond_layer_idx should be less than n_layers"
+ self.drop = nn.Dropout(p_dropout)
+ self.attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+
+ for i in range(self.n_layers):
+ self.attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ window_size=window_size,
+ )
+ )
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask, g=None):
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ if i == self.cond_layer_idx and g is not None:
+ g = self.spk_emb_linear(g.transpose(1, 2))
+ g = g.transpose(1, 2)
+ x = x + g
+ x = x * x_mask
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size=1,
+ p_dropout=0.0,
+ proximal_bias=False,
+ proximal_init=True,
+ **kwargs
+ ):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+
+ self.drop = nn.Dropout(p_dropout)
+ self.self_attn_layers = nn.ModuleList()
+ self.norm_layers_0 = nn.ModuleList()
+ self.encdec_attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.self_attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ p_dropout=p_dropout,
+ proximal_bias=proximal_bias,
+ proximal_init=proximal_init,
+ )
+ )
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
+ self.encdec_attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
+ )
+ )
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ causal=True,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask, h, h_mask):
+ """
+ x: decoder input
+ h: encoder output
+ """
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
+ device=x.device, dtype=x.dtype
+ )
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_0[i](x + y)
+
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class MultiHeadAttention(nn.Module):
+ def __init__(
+ self,
+ channels,
+ out_channels,
+ n_heads,
+ p_dropout=0.0,
+ window_size=None,
+ heads_share=True,
+ block_length=None,
+ proximal_bias=False,
+ proximal_init=False,
+ ):
+ super().__init__()
+ assert channels % n_heads == 0
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ self.heads_share = heads_share
+ self.block_length = block_length
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+ self.attn = None
+
+ self.k_channels = channels // n_heads
+ self.conv_q = nn.Conv1d(channels, channels, 1)
+ self.conv_k = nn.Conv1d(channels, channels, 1)
+ self.conv_v = nn.Conv1d(channels, channels, 1)
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
+ self.drop = nn.Dropout(p_dropout)
+
+ if window_size is not None:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels**-0.5
+ self.emb_rel_k = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.emb_rel_v = nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+
+ nn.init.xavier_uniform_(self.conv_q.weight)
+ nn.init.xavier_uniform_(self.conv_k.weight)
+ nn.init.xavier_uniform_(self.conv_v.weight)
+ if proximal_init:
+ with torch.no_grad():
+ self.conv_k.weight.copy_(self.conv_q.weight)
+ self.conv_k.bias.copy_(self.conv_q.bias)
+
+ def forward(self, x, c, attn_mask=None):
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+ if self.window_size is not None:
+ assert (
+ t_s == t_t
+ ), "Relative attention is only available for self-attention."
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(
+ query / math.sqrt(self.k_channels), key_relative_embeddings
+ )
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
+ scores = scores + scores_local
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
+ scores = scores + self._attention_bias_proximal(t_s).to(
+ device=scores.device, dtype=scores.dtype
+ )
+ if mask is not None:
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.block_length is not None:
+ assert (
+ t_s == t_t
+ ), "Local attention is only available for self-attention."
+ block_mask = (
+ torch.ones_like(scores)
+ .triu(-self.block_length)
+ .tril(self.block_length)
+ )
+ scores = scores.masked_fill(block_mask == 0, -1e4)
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
+ p_attn = self.drop(p_attn)
+ output = torch.matmul(p_attn, value)
+ if self.window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(
+ self.emb_rel_v, t_s
+ )
+ output = output + self._matmul_with_relative_values(
+ relative_weights, value_relative_embeddings
+ )
+ output = (
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
+ return output, p_attn
+
+ def _matmul_with_relative_values(self, x, y):
+ """
+ x: [b, h, l, m]
+ y: [h or 1, m, d]
+ ret: [b, h, l, d]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0))
+ return ret
+
+ def _matmul_with_relative_keys(self, x, y):
+ """
+ x: [b, h, l, d]
+ y: [h or 1, m, d]
+ ret: [b, h, l, m]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+ return ret
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
+ # the embedding table spans 2 * self.window_size + 1 relative positions
+ # Pad first before slice to avoid using cond ops.
+ pad_length = max(length - (self.window_size + 1), 0)
+ slice_start_position = max((self.window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = F.pad(
+ relative_embeddings,
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ )
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[
+ :, slice_start_position:slice_end_position
+ ]
+ return used_relative_embeddings
+
+ def _relative_position_to_absolute_position(self, x):
+ """
+ x: [b, h, l, 2*l-1]
+ ret: [b, h, l, l]
+ """
+ batch, heads, length, _ = x.size()
+ # Concat columns of pad to shift from relative to absolute indexing.
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = F.pad(
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ )
+
+ # Reshape and slice out the padded elements.
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+ :, :, :length, length - 1 :
+ ]
+ return x_final
+
+ def _absolute_position_to_relative_position(self, x):
+ """
+ x: [b, h, l, l]
+ ret: [b, h, l, 2*l-1]
+ """
+ batch, heads, length, _ = x.size()
+ # pad along column
+ x = F.pad(
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ )
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ # add 0's in the beginning that will skew the elements after reshape
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ def _attention_bias_proximal(self, length):
+ """Bias for self-attention to encourage attention to close positions.
+ Args:
+ length: an integer scalar.
+ Returns:
+ a Tensor with shape [1, 1, length, length]
+ """
+ r = torch.arange(length, dtype=torch.float32)
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=0.0,
+ activation=None,
+ causal=False,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.activation = activation
+ self.causal = causal
+
+ if causal:
+ self.padding = self._causal_padding
+ else:
+ self.padding = self._same_padding
+
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+ self.drop = nn.Dropout(p_dropout)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding(x * x_mask))
+ if self.activation == "gelu":
+ x = x * torch.sigmoid(1.702 * x)
+ else:
+ x = torch.relu(x)
+ x = self.drop(x)
+ x = self.conv_2(self.padding(x * x_mask))
+ return x * x_mask
+
+ def _causal_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = self.kernel_size - 1
+ pad_r = 0
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
+
+ def _same_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = (self.kernel_size - 1) // 2
+ pad_r = self.kernel_size // 2
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
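A minimal shape check for the two building blocks above (a sketch, not part of the diff; it assumes the package is installed as `openvoice` and the sizes are arbitrary). `MultiHeadAttention` operates on `[batch, channels, time]` tensors and `FFN` is applied position-wise under the same mask:

```python
# Shape smoke test for MultiHeadAttention and FFN (sizes are assumptions).
import torch
from openvoice.attentions import MultiHeadAttention, FFN

b, d, t = 2, 192, 50
x = torch.randn(b, d, t)                                 # [batch, channels, time]
x_mask = torch.ones(b, 1, t)                             # no padding
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)   # [b, 1, t, t]

attn = MultiHeadAttention(d, d, n_heads=2, window_size=4)
ffn = FFN(d, d, filter_channels=768, kernel_size=3)

y = attn(x, x, attn_mask=attn_mask)   # self-attention -> [b, d, t]
z = ffn(y, x_mask)                    # position-wise FFN -> [b, d, t]
print(y.shape, z.shape)
```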
diff --git a/openvoice/commons.py b/openvoice/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3fa07f65b1681e1f469b04b2fe689b7c174eaaa
--- /dev/null
+++ b/openvoice/commons.py
@@ -0,0 +1,160 @@
+import math
+import torch
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size * dilation - dilation) / 2)
+
+
+def convert_pad_shape(pad_shape):
+ layer = pad_shape[::-1]
+ pad_shape = [item for sublist in layer for item in sublist]
+ return pad_shape
+
+
+def intersperse(lst, item):
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+ """KL(P||Q)"""
+ kl = (logs_q - logs_p) - 0.5
+ kl += (
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
+ )
+ return kl
+
+
+def rand_gumbel(shape):
+ """Sample from the Gumbel distribution, protect from overflows."""
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+ return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+ return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, :, idx_str:idx_end]
+ return ret
+
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
+ position = torch.arange(length, dtype=torch.float)
+ num_timescales = channels // 2
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
+ num_timescales - 1
+ )
+ inv_timescales = min_timescale * torch.exp(
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
+ )
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
+ signal = signal.view(1, channels, length)
+ return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+ return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def shift_1d(x):
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+ return x
+
+
+def sequence_mask(length, max_length=None):
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+ """
+ duration: [b, 1, t_x]
+ mask: [b, 1, t_y, t_x]
+ """
+
+ b, _, t_y, t_x = mask.shape
+ cum_duration = torch.cumsum(duration, -1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path.unsqueeze(1).transpose(2, 3) * mask
+ return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ norm_type = float(norm_type)
+ if clip_value is not None:
+ clip_value = float(clip_value)
+
+ total_norm = 0
+ for p in parameters:
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item() ** norm_type
+ if clip_value is not None:
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
+ total_norm = total_norm ** (1.0 / norm_type)
+ return total_norm
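As a quick illustration of the alignment helpers above (a sketch with made-up durations, not part of the file): `sequence_mask` turns lengths into padding masks and `generate_path` expands per-token durations into a hard monotonic alignment, exactly as `SynthesizerTrn.infer` uses them later in this diff.

```python
# Durations -> hard monotonic alignment, as used by SynthesizerTrn.infer.
import torch
from openvoice import commons

x_lengths = torch.tensor([4, 2])                                     # tokens per item
x_mask = commons.sequence_mask(x_lengths, 4).unsqueeze(1).float()    # [b, 1, t_x]
duration = torch.tensor([[[1., 2., 1., 0.]],
                         [[2., 2., 0., 0.]]])                        # frames per token, [b, 1, t_x]
y_lengths = duration.sum(dim=(1, 2)).long()                          # frames per item
y_mask = commons.sequence_mask(y_lengths, 4).unsqueeze(1).float()    # [b, 1, t_y]
attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)               # [b, 1, t_y, t_x]

path = commons.generate_path(duration, attn_mask)                    # [b, 1, t_y, t_x], 0/1 entries
print(path[0, 0])   # each output frame attends to exactly one input token
```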
diff --git a/openvoice/mel_processing.py b/openvoice/mel_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..822d7f19062497b198ae54554a3ab828c10147ad
--- /dev/null
+++ b/openvoice/mel_processing.py
@@ -0,0 +1,183 @@
+import torch
+import torch.utils.data
+import librosa  # used by spectrogram_torch_conv (librosa.util.pad_center)
+from librosa.filters import mel as librosa_mel_fn
+
+MAX_WAV_VALUE = 32768.0
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+ """
+ PARAMS
+ ------
+ C: compression factor
+ """
+ return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+ """
+ PARAMS
+ ------
+ C: compression factor used to compress
+ """
+ return torch.exp(x) / C
+
+
+def spectral_normalize_torch(magnitudes):
+ output = dynamic_range_compression_torch(magnitudes)
+ return output
+
+
+def spectral_de_normalize_torch(magnitudes):
+ output = dynamic_range_decompression_torch(magnitudes)
+ return output
+
+
+mel_basis = {}
+hann_window = {}
+
+
+def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ if torch.min(y) < -1.1:
+ print("min value is ", torch.min(y))
+ if torch.max(y) > 1.1:
+ print("max value is ", torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + "_" + str(y.device)
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+ dtype=y.dtype, device=y.device
+ )
+
+ y = torch.nn.functional.pad(
+ y.unsqueeze(1),
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+ mode="reflect",
+ )
+ y = y.squeeze(1)
+
+ spec = torch.stft(
+ y,
+ n_fft,
+ hop_length=hop_size,
+ win_length=win_size,
+ window=hann_window[wnsize_dtype_device],
+ center=center,
+ pad_mode="reflect",
+ normalized=False,
+ onesided=True,
+ return_complex=False,
+ )
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
+ # if torch.min(y) < -1.:
+ # print('min value is ', torch.min(y))
+ # if torch.max(y) > 1.:
+ # print('max value is ', torch.max(y))
+
+ global hann_window
+ dtype_device = str(y.dtype) + '_' + str(y.device)
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+
+ # ******************** original ************************#
+ # y = y.squeeze(1)
+ # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ # center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+
+ # ******************** ConvSTFT ************************#
+ freq_cutoff = n_fft // 2 + 1
+ fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
+ forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
+ forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()
+
+ import torch.nn.functional as F
+
+ # if center:
+ # signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1)
+ assert center is False
+
+ forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size)
+ spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1)
+
+
+ # ******************** Verification ************************#
+ spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+ assert torch.allclose(spec1, spec2, atol=1e-4)
+
+ spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
+ return spec
+
+
+def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
+ global mel_basis
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)  # keyword args work across librosa versions
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
+ dtype=spec.dtype, device=spec.device
+ )
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+ return spec
+
+
+def mel_spectrogram_torch(
+ y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
+):
+ if torch.min(y) < -1.0:
+ print("min value is ", torch.min(y))
+ if torch.max(y) > 1.0:
+ print("max value is ", torch.max(y))
+
+ global mel_basis, hann_window
+ dtype_device = str(y.dtype) + "_" + str(y.device)
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
+ if fmax_dtype_device not in mel_basis:
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)  # keyword args work across librosa versions
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
+ dtype=y.dtype, device=y.device
+ )
+ if wnsize_dtype_device not in hann_window:
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+ dtype=y.dtype, device=y.device
+ )
+
+ y = torch.nn.functional.pad(
+ y.unsqueeze(1),
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+ mode="reflect",
+ )
+ y = y.squeeze(1)
+
+ spec = torch.stft(
+ y,
+ n_fft,
+ hop_length=hop_size,
+ win_length=win_size,
+ window=hann_window[wnsize_dtype_device],
+ center=center,
+ pad_mode="reflect",
+ normalized=False,
+ onesided=True,
+ return_complex=False,
+ )
+
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
+ spec = spectral_normalize_torch(spec)
+
+ return spec
\ No newline at end of file
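A small usage sketch for the two public helpers above. The 22.05 kHz / 1024-point FFT / 256-sample hop values are illustrative assumptions, not necessarily the shipped checkpoint configuration:

```python
# Linear and mel spectrograms from a 1-second dummy waveform (parameter values assumed).
import torch
from openvoice.mel_processing import spectrogram_torch, mel_spectrogram_torch

y = torch.rand(1, 22050) * 2 - 1          # fake audio in [-1, 1)
spec = spectrogram_torch(y, n_fft=1024, sampling_rate=22050,
                         hop_size=256, win_size=1024, center=False)
mel = mel_spectrogram_torch(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                            hop_size=256, win_size=1024, fmin=0, fmax=None,
                            center=False)
print(spec.shape, mel.shape)               # [1, 513, frames], [1, 80, frames]
```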
diff --git a/openvoice/models.py b/openvoice/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c659a0f6347de37adc9362d873cd48eee840ef
--- /dev/null
+++ b/openvoice/models.py
@@ -0,0 +1,499 @@
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from openvoice import commons
+from openvoice import modules
+from openvoice import attentions
+
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+
+from openvoice.commons import init_weights, get_padding
+
+
+class TextEncoder(nn.Module):
+ def __init__(self,
+ n_vocab,
+ out_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout):
+ super().__init__()
+ self.n_vocab = n_vocab
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+
+ self.emb = nn.Embedding(n_vocab, hidden_channels)
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+
+ self.encoder = attentions.Encoder(
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout)
+ self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths):
+ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+ x = torch.transpose(x, 1, -1) # [b, h, t]
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
+
+ x = self.encoder(x * x_mask, x_mask)
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ return x, m, logs, x_mask
+
+
+class DurationPredictor(nn.Module):
+ def __init__(
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
+ ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.gin_channels = gin_channels
+
+ self.drop = nn.Dropout(p_dropout)
+ self.conv_1 = nn.Conv1d(
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.norm_1 = modules.LayerNorm(filter_channels)
+ self.conv_2 = nn.Conv1d(
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.norm_2 = modules.LayerNorm(filter_channels)
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+
+ def forward(self, x, x_mask, g=None):
+ x = torch.detach(x)
+ if g is not None:
+ g = torch.detach(g)
+ x = x + self.cond(g)
+ x = self.conv_1(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_1(x)
+ x = self.drop(x)
+ x = self.conv_2(x * x_mask)
+ x = torch.relu(x)
+ x = self.norm_2(x)
+ x = self.drop(x)
+ x = self.proj(x * x_mask)
+ return x * x_mask
+
+class StochasticDurationPredictor(nn.Module):
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
+ super().__init__()
+ filter_channels = in_channels # this override should be removed in a future version
+ self.in_channels = in_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.log_flow = modules.Log()
+ self.flows = nn.ModuleList()
+ self.flows.append(modules.ElementwiseAffine(2))
+ for i in range(n_flows):
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
+ self.flows.append(modules.Flip())
+
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
+ self.post_flows = nn.ModuleList()
+ self.post_flows.append(modules.ElementwiseAffine(2))
+ for i in range(4):
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
+ self.post_flows.append(modules.Flip())
+
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
+ x = torch.detach(x)
+ x = self.pre(x)
+ if g is not None:
+ g = torch.detach(g)
+ x = x + self.cond(g)
+ x = self.convs(x, x_mask)
+ x = self.proj(x) * x_mask
+
+ if not reverse:
+ flows = self.flows
+ assert w is not None
+
+ logdet_tot_q = 0
+ h_w = self.post_pre(w)
+ h_w = self.post_convs(h_w, x_mask)
+ h_w = self.post_proj(h_w) * x_mask
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
+ z_q = e_q
+ for flow in self.post_flows:
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
+ logdet_tot_q += logdet_q
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
+ u = torch.sigmoid(z_u) * x_mask
+ z0 = (w - u) * x_mask
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
+ logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
+
+ logdet_tot = 0
+ z0, logdet = self.log_flow(z0, x_mask)
+ logdet_tot += logdet
+ z = torch.cat([z0, z1], 1)
+ for flow in flows:
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
+ logdet_tot = logdet_tot + logdet
+ nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
+ return nll + logq # [b]
+ else:
+ flows = list(reversed(self.flows))
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
+ for flow in flows:
+ z = flow(z, x_mask, g=x, reverse=reverse)
+ z0, z1 = torch.split(z, [1, 1], 1)
+ logw = z0
+ return logw
+
+class PosteriorEncoder(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.enc = modules.WN(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=gin_channels,
+ )
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x, x_lengths, g=None, tau=1.0):
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+ x.dtype
+ )
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * tau * torch.exp(logs)) * x_mask
+ return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+ def __init__(
+ self,
+ initial_channel,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=0,
+ ):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+ self.conv_pre = Conv1d(
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
+ )
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(
+ weight_norm(
+ ConvTranspose1d(
+ upsample_initial_channel // (2**i),
+ upsample_initial_channel // (2 ** (i + 1)),
+ k,
+ u,
+ padding=(k - u) // 2,
+ )
+ )
+ )
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel // (2 ** (i + 1))
+ for j, (k, d) in enumerate(
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
+ ):
+ self.resblocks.append(resblock(ch, k, d))
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ def forward(self, x, g=None):
+ x = self.conv_pre(x)
+ if g is not None:
+ x = x + self.cond(g)
+
+ for i in range(self.num_upsamples):
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i * self.num_kernels + j](x)
+ else:
+ xs += self.resblocks[i * self.num_kernels + j](x)
+ x = xs / self.num_kernels
+ x = F.leaky_relu(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+
+ return x
+
+ def remove_weight_norm(self):
+ print("Removing weight norm...")
+ for layer in self.ups:
+ remove_weight_norm(layer)
+ for layer in self.resblocks:
+ layer.remove_weight_norm()
+
+
+class ReferenceEncoder(nn.Module):
+ """
+ inputs --- [N, Ty/r, n_mels*r] mels
+ outputs --- [N, ref_enc_gru_size]
+ """
+
+ def __init__(self, spec_channels, gin_channels=0, layernorm=True):
+ super().__init__()
+ self.spec_channels = spec_channels
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
+ K = len(ref_enc_filters)
+ filters = [1] + ref_enc_filters
+ convs = [
+ weight_norm(
+ nn.Conv2d(
+ in_channels=filters[i],
+ out_channels=filters[i + 1],
+ kernel_size=(3, 3),
+ stride=(2, 2),
+ padding=(1, 1),
+ )
+ )
+ for i in range(K)
+ ]
+ self.convs = nn.ModuleList(convs)
+
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
+ self.gru = nn.GRU(
+ input_size=ref_enc_filters[-1] * out_channels,
+ hidden_size=256 // 2,
+ batch_first=True,
+ )
+ self.proj = nn.Linear(128, gin_channels)
+ if layernorm:
+ self.layernorm = nn.LayerNorm(self.spec_channels)
+ else:
+ self.layernorm = None
+
+ def forward(self, inputs, mask=None):
+ N = inputs.size(0)
+
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
+ if self.layernorm is not None:
+ out = self.layernorm(out)
+
+ for conv in self.convs:
+ out = conv(out)
+ # out = wn(out)
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
+
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
+ T = out.size(1)
+ N = out.size(0)
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
+
+ self.gru.flatten_parameters()
+ memory, out = self.gru(out) # out --- [1, N, 128]
+
+ return self.proj(out.squeeze(0))
+
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
+ for i in range(n_convs):
+ L = (L - kernel_size + 2 * pad) // stride + 1
+ return L
+
+
+class ResidualCouplingBlock(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ n_flows=4,
+ gin_channels=0):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
+ self.flows.append(modules.Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(
+ self,
+ n_vocab,
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ n_speakers=256,
+ gin_channels=256,
+ zero_g=False,
+ **kwargs
+ ):
+ super().__init__()
+
+ self.dec = Generator(
+ inter_channels,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=gin_channels,
+ )
+ self.enc_q = PosteriorEncoder(
+ spec_channels,
+ inter_channels,
+ hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=gin_channels,
+ )
+
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
+
+ self.n_speakers = n_speakers
+ if n_speakers == 0:
+ self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
+ else:
+ self.enc_p = TextEncoder(n_vocab,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout)
+ self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
+ self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
+ self.zero_g = zero_g
+
+ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
+ x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+ if self.n_speakers > 0:
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
+ else:
+ g = None
+
+ logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio \
+ + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
+
+ w = torch.exp(logw) * x_mask * length_scale
+ w_ceil = torch.ceil(w)
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+ attn = commons.generate_path(w_ceil, attn_mask)
+
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
+
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+ z = self.flow(z_p, y_mask, g=g, reverse=True)
+ o = self.dec((z * y_mask)[:,:,:max_len], g=g)
+ return o, attn, y_mask, (z, z_p, m_p, logs_p)
+
+ def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
+ g_src = sid_src
+ g_tgt = sid_tgt
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
+ z_p = self.flow(z, y_mask, g=g_src)
+ z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+ o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
+ return o_hat, y_mask, (z, z_p, z_hat)
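To make the `voice_conversion` path concrete, here is a hedged sketch that builds a small `SynthesizerTrn` with `n_speakers=0` (so only the posterior encoder, flow, and vocoder are instantiated) and converts a random spectrogram between two random tone-color embeddings. The hyperparameters are illustrative VITS-style values, not the actual checkpoint config, and the embedding shapes are assumptions:

```python
# Voice-conversion round trip on random data (hyperparameters are illustrative).
import torch
from openvoice.models import SynthesizerTrn

net = SynthesizerTrn(
    n_vocab=0, spec_channels=513, inter_channels=192, hidden_channels=192,
    filter_channels=768, n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock="1", resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=0, gin_channels=256,
).eval()

spec = torch.randn(1, 513, 120)            # fake linear spectrogram [b, C, T]
lengths = torch.tensor([120])
g_src = torch.randn(1, 256, 1)             # source tone-color embedding (assumed shape)
g_tgt = torch.randn(1, 256, 1)             # target tone-color embedding (assumed shape)
with torch.no_grad():
    audio, _, _ = net.voice_conversion(spec, lengths, g_src, g_tgt, tau=0.3)
print(audio.shape)                          # [1, 1, T * 256] waveform samples
```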
diff --git a/openvoice/modules.py b/openvoice/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..d659a32626bdc34eae8650961611948a3405928b
--- /dev/null
+++ b/openvoice/modules.py
@@ -0,0 +1,598 @@
+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from openvoice import commons
+from openvoice.commons import init_weights, get_padding
+from openvoice.transforms import piecewise_rational_quadratic_transform
+from openvoice.attentions import Encoder
+
+LRELU_SLOPE = 0.1
+
+
+class LayerNorm(nn.Module):
+ def __init__(self, channels, eps=1e-5):
+ super().__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = nn.Parameter(torch.ones(channels))
+ self.beta = nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ x = x.transpose(1, -1)
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+ return x.transpose(1, -1)
+
+
+class ConvReluNorm(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ out_channels,
+ kernel_size,
+ n_layers,
+ p_dropout,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+ assert n_layers > 1, "Number of layers should be larger than 1."
+
+ self.conv_layers = nn.ModuleList()
+ self.norm_layers = nn.ModuleList()
+ self.conv_layers.append(
+ nn.Conv1d(
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+ )
+ )
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
+ for _ in range(n_layers - 1):
+ self.conv_layers.append(
+ nn.Conv1d(
+ hidden_channels,
+ hidden_channels,
+ kernel_size,
+ padding=kernel_size // 2,
+ )
+ )
+ self.norm_layers.append(LayerNorm(hidden_channels))
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ x_org = x
+ for i in range(self.n_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.norm_layers[i](x)
+ x = self.relu_drop(x)
+ x = x_org + self.proj(x)
+ return x * x_mask
+
+
+class DDSConv(nn.Module):
+ """
+ Dilated and Depth-Separable Convolution
+ """
+
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
+ super().__init__()
+ self.channels = channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+
+ self.drop = nn.Dropout(p_dropout)
+ self.convs_sep = nn.ModuleList()
+ self.convs_1x1 = nn.ModuleList()
+ self.norms_1 = nn.ModuleList()
+ self.norms_2 = nn.ModuleList()
+ for i in range(n_layers):
+ dilation = kernel_size**i
+ padding = (kernel_size * dilation - dilation) // 2
+ self.convs_sep.append(
+ nn.Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ groups=channels,
+ dilation=dilation,
+ padding=padding,
+ )
+ )
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
+ self.norms_1.append(LayerNorm(channels))
+ self.norms_2.append(LayerNorm(channels))
+
+ def forward(self, x, x_mask, g=None):
+ if g is not None:
+ x = x + g
+ for i in range(self.n_layers):
+ y = self.convs_sep[i](x * x_mask)
+ y = self.norms_1[i](y)
+ y = F.gelu(y)
+ y = self.convs_1x1[i](y)
+ y = self.norms_2[i](y)
+ y = F.gelu(y)
+ y = self.drop(y)
+ x = x + y
+ return x * x_mask
+
+
+class WN(torch.nn.Module):
+ def __init__(
+ self,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0,
+ p_dropout=0,
+ ):
+ super(WN, self).__init__()
+ assert kernel_size % 2 == 1
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+ self.p_dropout = p_dropout
+
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.drop = nn.Dropout(p_dropout)
+
+ if gin_channels != 0:
+ cond_layer = torch.nn.Conv1d(
+ gin_channels, 2 * hidden_channels * n_layers, 1
+ )
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+
+ for i in range(n_layers):
+ dilation = dilation_rate**i
+ padding = int((kernel_size * dilation - dilation) / 2)
+ in_layer = torch.nn.Conv1d(
+ hidden_channels,
+ 2 * hidden_channels,
+ kernel_size,
+ dilation=dilation,
+ padding=padding,
+ )
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+ self.in_layers.append(in_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2 * hidden_channels
+ else:
+ res_skip_channels = hidden_channels
+
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, x, x_mask, g=None, **kwargs):
+ output = torch.zeros_like(x)
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+ if g is not None:
+ g = self.cond_layer(g)
+
+ for i in range(self.n_layers):
+ x_in = self.in_layers[i](x)
+ if g is not None:
+ cond_offset = i * 2 * self.hidden_channels
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+ else:
+ g_l = torch.zeros_like(x_in)
+
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+ acts = self.drop(acts)
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
+ x = (x + res_acts) * x_mask
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
+ else:
+ output = output + res_skip_acts
+ return output * x_mask
+
+ def remove_weight_norm(self):
+ if self.gin_channels != 0:
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
+ for l in self.in_layers:
+ torch.nn.utils.remove_weight_norm(l)
+ for l in self.res_skip_layers:
+ torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+ super(ResBlock1, self).__init__()
+ self.convs1 = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2]),
+ )
+ ),
+ ]
+ )
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=1,
+ padding=get_padding(kernel_size, 1),
+ )
+ ),
+ ]
+ )
+ self.convs2.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c1, c2 in zip(self.convs1, self.convs2):
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c1(xt)
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c2(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+ super(ResBlock2, self).__init__()
+ self.convs = nn.ModuleList(
+ [
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]),
+ )
+ ),
+ weight_norm(
+ Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ 1,
+ dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]),
+ )
+ ),
+ ]
+ )
+ self.convs.apply(init_weights)
+
+ def forward(self, x, x_mask=None):
+ for c in self.convs:
+ xt = F.leaky_relu(x, LRELU_SLOPE)
+ if x_mask is not None:
+ xt = xt * x_mask
+ xt = c(xt)
+ x = xt + x
+ if x_mask is not None:
+ x = x * x_mask
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs:
+ remove_weight_norm(l)
+
+
+class Log(nn.Module):
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+ logdet = torch.sum(-y, [1, 2])
+ return y, logdet
+ else:
+ x = torch.exp(x) * x_mask
+ return x
+
+
+class Flip(nn.Module):
+ def forward(self, x, *args, reverse=False, **kwargs):
+ x = torch.flip(x, [1])
+ if not reverse:
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+ return x, logdet
+ else:
+ return x
+
+
+class ElementwiseAffine(nn.Module):
+ def __init__(self, channels):
+ super().__init__()
+ self.channels = channels
+ self.m = nn.Parameter(torch.zeros(channels, 1))
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+ def forward(self, x, x_mask, reverse=False, **kwargs):
+ if not reverse:
+ y = self.m + torch.exp(self.logs) * x
+ y = y * x_mask
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
+ return y, logdet
+ else:
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
+ return x
+
+
+class ResidualCouplingLayer(nn.Module):
+ def __init__(
+ self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=0,
+ gin_channels=0,
+ mean_only=False,
+ ):
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = WN(
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ p_dropout=p_dropout,
+ gin_channels=gin_channels,
+ )
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1, 2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
+
+
+class ConvFlow(nn.Module):
+ def __init__(
+ self,
+ in_channels,
+ filter_channels,
+ kernel_size,
+ n_layers,
+ num_bins=10,
+ tail_bound=5.0,
+ ):
+ super().__init__()
+ self.in_channels = in_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.num_bins = num_bins
+ self.tail_bound = tail_bound
+ self.half_channels = in_channels // 2
+
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+ self.proj = nn.Conv1d(
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+ )
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0)
+ h = self.convs(h, x_mask, g=g)
+ h = self.proj(h) * x_mask
+
+ b, c, t = x0.shape
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
+
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+ self.filter_channels
+ )
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
+
+ x1, logabsdet = piecewise_rational_quadratic_transform(
+ x1,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=reverse,
+ tails="linear",
+ tail_bound=self.tail_bound,
+ )
+
+ x = torch.cat([x0, x1], 1) * x_mask
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
+ if not reverse:
+ return x, logdet
+ else:
+ return x
+
+
+class TransformerCouplingLayer(nn.Module):
+ def __init__(
+ self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ n_layers,
+ n_heads,
+ p_dropout=0,
+ filter_channels=0,
+ mean_only=False,
+ wn_sharing_parameter=None,
+ gin_channels=0,
+ ):
+ assert n_layers == 3, n_layers
+ assert channels % 2 == 0, "channels should be divisible by 2"
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.half_channels = channels // 2
+ self.mean_only = mean_only
+
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+ self.enc = (
+ Encoder(
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ isflow=True,
+ gin_channels=gin_channels,
+ )
+ if wn_sharing_parameter is None
+ else wn_sharing_parameter
+ )
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+ self.post.weight.data.zero_()
+ self.post.bias.data.zero_()
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+ h = self.pre(x0) * x_mask
+ h = self.enc(h, x_mask, g=g)
+ stats = self.post(h) * x_mask
+ if not self.mean_only:
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+ else:
+ m = stats
+ logs = torch.zeros_like(m)
+
+ if not reverse:
+ x1 = m + x1 * torch.exp(logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ logdet = torch.sum(logs, [1, 2])
+ return x, logdet
+ else:
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
+ x = torch.cat([x0, x1], 1)
+ return x
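A small sanity sketch (not part of the repo) for the affine coupling layer above: because only `x1` is transformed given the untouched `x0`, running the layer forward and then with `reverse=True` recovers the input. The projection is zero-initialized, so the layer starts as the identity map; the snippet perturbs it to make the check non-trivial.

```python
# Invertibility check for ResidualCouplingLayer (sketch with assumed sizes).
import torch
from openvoice.modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=192, hidden_channels=192, kernel_size=5,
                              dilation_rate=1, n_layers=4, mean_only=True).eval()
torch.nn.init.normal_(layer.post.weight, std=0.05)   # move away from the identity init

x = torch.randn(1, 192, 40)
x_mask = torch.ones(1, 1, 40)
with torch.no_grad():
    y, logdet = layer(x, x_mask)                      # forward direction
    x_rec = layer(y, x_mask, reverse=True)            # inverse direction
print(torch.allclose(x, x_rec, atol=1e-5))            # True
```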
diff --git a/openvoice/openvoice_app.py b/openvoice/openvoice_app.py
new file mode 100644
index 0000000000000000000000000000000000000000..15b0b43a1ff07d2c9a0d5b106303f9502357ac74
--- /dev/null
+++ b/openvoice/openvoice_app.py
@@ -0,0 +1,275 @@
+import os
+import torch
+import argparse
+import gradio as gr
+from zipfile import ZipFile
+import langid
+from openvoice import se_extractor
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--share", action='store_true', default=False, help="make link public")
+args = parser.parse_args()
+
+en_ckpt_base = 'checkpoints/base_speakers/EN'
+zh_ckpt_base = 'checkpoints/base_speakers/ZH'
+ckpt_converter = 'checkpoints/converter'
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+output_dir = 'outputs'
+os.makedirs(output_dir, exist_ok=True)
+
+# load models
+en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
+en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
+zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
+zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
+tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+# load speaker embeddings
+en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
+en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
+zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
+
+# This online demo mainly supports English and Chinese
+supported_languages = ['zh', 'en']
+
+def predict(prompt, style, audio_file_pth, agree):
+ # initialize an empty info message
+ text_hint = ''
+ # the user must agree to the terms first
+ if not agree:
+ text_hint += '[ERROR] Please accept the Terms & Condition!\n'
+ gr.Warning("Please accept the Terms & Condition!")
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ # first detect the input language
+ language_predicted = langid.classify(prompt)[0].strip()
+ print(f"Detected language:{language_predicted}")
+
+ if language_predicted not in supported_languages:
+ text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
+ gr.Warning(
+ f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
+ )
+
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ if language_predicted == "zh":
+ tts_model = zh_base_speaker_tts
+ source_se = zh_source_se
+ language = 'Chinese'
+ if style not in ['default']:
+ text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
+ gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ else:
+ tts_model = en_base_speaker_tts
+ if style == 'default':
+ source_se = en_source_default_se
+ else:
+ source_se = en_source_style_se
+ language = 'English'
+ if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
+ text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
+ gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ speaker_wav = audio_file_pth
+
+ if len(prompt) < 2:
+ text_hint += f"[ERROR] Please give a longer prompt text \n"
+ gr.Warning("Please give a longer prompt text")
+ return (
+ text_hint,
+ None,
+ None,
+ )
+ if len(prompt) > 200:
+ text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
+ gr.Warning(
+ "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
+ )
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
+ try:
+ target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
+ except Exception as e:
+ text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
+ gr.Warning(
+ "[ERROR] Get target tone color error {str(e)} \n"
+ )
+ return (
+ text_hint,
+ None,
+ None,
+ )
+
+ src_path = f'{output_dir}/tmp.wav'
+ tts_model.tts(prompt, src_path, speaker=style, language=language)
+
+ save_path = f'{output_dir}/output.wav'
+ # Run the tone color converter
+ encode_message = "@MyShell"
+ tone_color_converter.convert(
+ audio_src_path=src_path,
+ src_se=source_se,
+ tgt_se=target_se,
+ output_path=save_path,
+ message=encode_message)
+
+ text_hint += f'''Get response successfully \n'''
+
+ return (
+ text_hint,
+ save_path,
+ speaker_wav,
+ )
+
+
+
+title = "MyShell OpenVoice"
+
+description = """
+We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
+"""
+
+markdown_table = """
+
+
+| | | |
+| :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | **Project Page** | **Join the Community** |
+| [OpenVoice](https://github.com/myshell-ai/OpenVoice) | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+
+
+"""
+
+markdown_table_v2 = """
+
+
+| | | | |
+| :-----------: | :-----------: | :-----------: | :-----------: |
+| **OpenSource Repo** | [OpenVoice](https://github.com/myshell-ai/OpenVoice) | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
+
+| | |
+| :-----------: | :-----------: |
+| **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
+
+
+"""
+content = """
+
+
+If the generated voice does not sound like the reference voice, please refer to this QnA. For multi-lingual & cross-lingual examples, please refer to this jupyter notebook.
+This online demo mainly supports English. The default style also supports Chinese. But OpenVoice can adapt to any other language as long as a base speaker is provided.
+
+"""
+wrapped_markdown_content = f"{content}"
+
+
+examples = [
+ [
+ "今天天气真好,我们一起出去吃饭吧。",
+ 'default',
+ "resources/demo_speaker1.mp3",
+ True,
+ ],[
+ "This audio is generated by open voice with a half-performance model.",
+ 'whispering',
+ "resources/demo_speaker2.mp3",
+ True,
+ ],
+ [
+ "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+ 'sad',
+ "resources/demo_speaker0.mp3",
+ True,
+ ],
+]
+
+with gr.Blocks(analytics_enabled=False) as demo:
+
+ with gr.Row():
+ with gr.Column():
+ with gr.Row():
+ gr.Markdown(
+ """
+ ##
+ """
+ )
+ with gr.Row():
+ gr.Markdown(markdown_table_v2)
+ with gr.Row():
+ gr.Markdown(description)
+ with gr.Column():
+ gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
+
+ with gr.Row():
+ gr.HTML(wrapped_markdown_content)
+
+ with gr.Row():
+ with gr.Column():
+ input_text_gr = gr.Textbox(
+ label="Text Prompt",
+ info="One or two sentences at a time is better. Up to 200 text characters.",
+ value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
+ )
+ style_gr = gr.Dropdown(
+ label="Style",
+ info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
+ choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
+ max_choices=1,
+ value="default",
+ )
+ ref_gr = gr.Audio(
+ label="Reference Audio",
+ info="Click on the ✎ button to upload your own target speaker audio",
+ type="filepath",
+ value="resources/demo_speaker2.mp3",
+ )
+ tos_gr = gr.Checkbox(
+ label="Agree",
+ value=False,
+ info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
+ )
+
+ tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
+
+
+ with gr.Column():
+ out_text_gr = gr.Text(label="Info")
+ audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
+ ref_audio_gr = gr.Audio(label="Reference Audio Used")
+
+ gr.Examples(examples,
+ label="Examples",
+ inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
+ outputs=[out_text_gr, audio_gr, ref_audio_gr],
+ fn=predict,
+ cache_examples=False,)
+ tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
+
+demo.queue()
+demo.launch(debug=True, show_api=True, share=args.share)
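The Gradio callback above boils down to three calls: synthesize with the base speaker TTS, extract the target tone-color embedding, and run the converter. A distilled, non-Gradio sketch using the same checkpoint paths and demo resources as the app (it assumes the checkpoints have been downloaded into `checkpoints/`):

```python
# The app's predict() pipeline without Gradio (same paths and calls as above).
import os
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

device = 'cuda' if torch.cuda.is_available() else 'cpu'
os.makedirs('outputs', exist_ok=True)

tts = BaseSpeakerTTS('checkpoints/base_speakers/EN/config.json', device=device)
tts.load_ckpt('checkpoints/base_speakers/EN/checkpoint.pth')
converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

source_se = torch.load('checkpoints/base_speakers/EN/en_default_se.pth').to(device)
target_se, _ = se_extractor.get_se('resources/demo_speaker0.mp3', converter,
                                   target_dir='processed', vad=True)

tts.tts('This is a quick smoke test.', 'outputs/tmp.wav',
        speaker='default', language='English')
converter.convert(audio_src_path='outputs/tmp.wav', src_se=source_se,
                  tgt_se=target_se, output_path='outputs/output.wav',
                  message='@MyShell')
```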
diff --git a/openvoice/se_extractor.py b/openvoice/se_extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a41c25661013674260c2b63ec4ffdae56ca302b6
--- /dev/null
+++ b/openvoice/se_extractor.py
@@ -0,0 +1,153 @@
+import os
+import torch
+import hashlib
+import librosa
+import base64
+from glob import glob
+import numpy as np
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
+
+model_size = "medium"
+# Run on GPU with FP16
+model = None
+def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
+ global model
+ if model is None:
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
+ audio = AudioSegment.from_file(audio_path)
+ max_len = len(audio)
+
+ target_folder = os.path.join(target_dir, audio_name)
+
+ segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+ segments = list(segments)
+
+ # create directory
+ os.makedirs(target_folder, exist_ok=True)
+ wavs_folder = os.path.join(target_folder, 'wavs')
+ os.makedirs(wavs_folder, exist_ok=True)
+
+ # segments
+ s_ind = 0
+ start_time = None
+
+ for k, w in enumerate(segments):
+ # process with the time
+ if k == 0:
+ start_time = max(0, w.start)
+
+ end_time = w.end
+
+ # calculate confidence
+ if len(w.words) > 0:
+ confidence = sum([s.probability for s in w.words]) / len(w.words)
+ else:
+ confidence = 0.
+ # clean text
+ text = w.text.replace('...', '')
+
+ # leave 0.08 s of extra audio at the end of each segment
+ audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
+
+ # segment file name
+ fname = f"{audio_name}_seg{s_ind}.wav"
+
+        # keep only segments between 1.5 s and 20 s whose text has 2-199 characters
+ save = audio_seg.duration_seconds > 1.5 and \
+ audio_seg.duration_seconds < 20. and \
+ len(text) >= 2 and len(text) < 200
+
+ if save:
+ output_file = os.path.join(wavs_folder, fname)
+ audio_seg.export(output_file, format='wav')
+
+ if k < len(segments) - 1:
+ start_time = max(0, segments[k+1].start - 0.08)
+
+ s_ind = s_ind + 1
+ return wavs_folder
+
+
+def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+ SAMPLE_RATE = 16000
+ audio_vad = get_audio_tensor(audio_path)
+ segments = get_vad_segments(
+ audio_vad,
+ output_sample=True,
+ min_speech_duration=0.1,
+ min_silence_duration=1,
+ method="silero",
+ )
+ segments = [(seg["start"], seg["end"]) for seg in segments]
+ segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
+ print(segments)
+ audio_active = AudioSegment.silent(duration=0)
+ audio = AudioSegment.from_file(audio_path)
+
+ for start_time, end_time in segments:
+ audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
+
+ audio_dur = audio_active.duration_seconds
+ print(f'after vad: dur = {audio_dur}')
+ target_folder = os.path.join(target_dir, audio_name)
+ wavs_folder = os.path.join(target_folder, 'wavs')
+ os.makedirs(wavs_folder, exist_ok=True)
+ start_time = 0.
+ count = 0
+ num_splits = int(np.round(audio_dur / split_seconds))
+ assert num_splits > 0, 'input audio is too short'
+ interval = audio_dur / num_splits
+
+ for i in range(num_splits):
+ end_time = min(start_time + interval, audio_dur)
+ if i == num_splits - 1:
+ end_time = audio_dur
+ output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+ audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
+ audio_seg.export(output_file, format='wav')
+ start_time = end_time
+ count += 1
+ return wavs_folder
+
+def hash_numpy_array(audio_path):
+ array, _ = librosa.load(audio_path, sr=None, mono=True)
+ # Convert the array to bytes
+ array_bytes = array.tobytes()
+ # Calculate the hash of the array bytes
+ hash_object = hashlib.sha256(array_bytes)
+ hash_value = hash_object.digest()
+ # Convert the hash value to base64
+ base64_value = base64.b64encode(hash_value)
+ return base64_value.decode('utf-8')[:16].replace('/', '_^')
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
+ device = vc_model.device
+ version = vc_model.version
+ print("OpenVoice version:", version)
+
+ audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+ se_path = os.path.join(target_dir, audio_name, 'se.pth')
+
+ # if os.path.isfile(se_path):
+ # se = torch.load(se_path).to(device)
+ # return se, audio_name
+ # if os.path.isdir(audio_path):
+ # wavs_folder = audio_path
+
+ if vad:
+ wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
+ else:
+ wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
+
+ audio_segs = glob(f'{wavs_folder}/*.wav')
+ if len(audio_segs) == 0:
+ raise NotImplementedError('No audio segments found!')
+
+ return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
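+
+# --- Illustrative usage sketch (not part of the original module) ---
+# Assumes the repo's tone color converter API (a ToneColorConverter exposing
+# .device, .version, .extract_se and load_ckpt) and the v1 checkpoint layout;
+# the paths below are examples only.
+if __name__ == '__main__':
+    from openvoice.api import ToneColorConverter
+
+    converter = ToneColorConverter('checkpoints/converter/config.json', device='cuda')
+    converter.load_ckpt('checkpoints/converter/checkpoint.pth')
+    target_se, audio_name = get_se('resources/demo_speaker0.mp3', converter, vad=True)
+    print(audio_name, target_se.shape)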
diff --git a/openvoice/text/__init__.py b/openvoice/text/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6494bcb8fb9ccbf4bd9df1cc3c054619f4553942
--- /dev/null
+++ b/openvoice/text/__init__.py
@@ -0,0 +1,79 @@
+""" from https://github.com/keithito/tacotron """
+from openvoice.text import cleaners
+from openvoice.text.symbols import symbols
+
+
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+
+
+def text_to_sequence(text, symbols, cleaner_names):
+    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+      symbols: list of valid symbols; characters not in the list are skipped
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the text
+    '''
+ sequence = []
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ clean_text = _clean_text(text, cleaner_names)
+ print(clean_text)
+ print(f" length:{len(clean_text)}")
+ for symbol in clean_text:
+ if symbol not in symbol_to_id.keys():
+ continue
+ symbol_id = symbol_to_id[symbol]
+ sequence += [symbol_id]
+ print(f" length:{len(sequence)}")
+ return sequence
+
+
+def cleaned_text_to_sequence(cleaned_text, symbols):
+    '''Converts an already-cleaned string to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      cleaned_text: cleaned string to convert to a sequence
+      symbols: list of valid symbols
+    Returns:
+      List of integers corresponding to the symbols in the text
+    '''
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
+ return sequence
+
+
+
+from openvoice.text.symbols import language_tone_start_map
+def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
+    """Converts cleaned text plus per-phone tones and a language code into ID sequences.
+    Args:
+      cleaned_text: cleaned string of phone symbols
+      tones: per-phone tone indices (shifted by the language's tone offset)
+      language: language code used to look up the tone offset and language ID
+      symbols: list of valid symbols
+      languages: list of supported language codes
+    Returns:
+      Tuple of (phone IDs, shifted tone IDs, language IDs)
+    """
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
+ language_id_map = {s: i for i, s in enumerate(languages)}
+ phones = [symbol_to_id[symbol] for symbol in cleaned_text]
+ tone_start = language_tone_start_map[language]
+ tones = [i + tone_start for i in tones]
+ lang_id = language_id_map[language]
+ lang_ids = [lang_id for i in phones]
+ return phones, tones, lang_ids
+
+
+def sequence_to_text(sequence):
+ '''Converts a sequence of IDs back to a string'''
+ result = ''
+ for symbol_id in sequence:
+ s = _id_to_symbol[symbol_id]
+ result += s
+ return result
+
+
+def _clean_text(text, cleaner_names):
+ for name in cleaner_names:
+ cleaner = getattr(cleaners, name)
+ if not cleaner:
+ raise Exception('Unknown cleaner: %s' % name)
+ text = cleaner(text)
+ return text
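+
+
+# --- Illustrative usage sketch (not part of the original module) ---
+# Assumes the default `symbols` list above and the 'cjke_cleaners2' cleaner
+# from openvoice.text.cleaners; language tags wrap each text segment, e.g.
+#   seq = text_to_sequence('[EN]Hello world.[EN]', symbols, ['cjke_cleaners2'])
+#   print(sequence_to_text(seq))   # IPA-like string rebuilt from the IDs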
diff --git a/openvoice/text/__pycache__/__init__.cpython-39.pyc b/openvoice/text/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d22b764c5c34d26696a787aeffc15d9b8df03041
Binary files /dev/null and b/openvoice/text/__pycache__/__init__.cpython-39.pyc differ
diff --git a/openvoice/text/__pycache__/cleaners.cpython-39.pyc b/openvoice/text/__pycache__/cleaners.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..872f12fb1841c818aafe1e73bc523d8397a4a6b8
Binary files /dev/null and b/openvoice/text/__pycache__/cleaners.cpython-39.pyc differ
diff --git a/openvoice/text/__pycache__/english.cpython-39.pyc b/openvoice/text/__pycache__/english.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ae5c5cfea71f8867beb71d3eacfb23a3c517f46
Binary files /dev/null and b/openvoice/text/__pycache__/english.cpython-39.pyc differ
diff --git a/openvoice/text/__pycache__/mandarin.cpython-39.pyc b/openvoice/text/__pycache__/mandarin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3088ee73177594ce5a35bab92ce6ef5e2aa6eab
Binary files /dev/null and b/openvoice/text/__pycache__/mandarin.cpython-39.pyc differ
diff --git a/openvoice/text/__pycache__/symbols.cpython-39.pyc b/openvoice/text/__pycache__/symbols.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c3a19ae1ce887f15ae9cd27ff86529517ad2988
Binary files /dev/null and b/openvoice/text/__pycache__/symbols.cpython-39.pyc differ
diff --git a/openvoice/text/cleaners.py b/openvoice/text/cleaners.py
new file mode 100644
index 0000000000000000000000000000000000000000..16dd168f2a34f9d00e31bd769f0d05df5bdd82a1
--- /dev/null
+++ b/openvoice/text/cleaners.py
@@ -0,0 +1,16 @@
+import re
+from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
+from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
+
+def cjke_cleaners2(text):
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+ lambda x: chinese_to_ipa(x.group(1))+' ', text)
+    # NOTE: japanese_to_ipa2 and korean_to_ipa below are expected to come from the
+    # corresponding japanese/korean text modules, which are not included here;
+    # [JA]/[KO] tagged text will raise a NameError until they are provided.
+    text = re.sub(r'\[JA\](.*?)\[JA\]',
+                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
+ text = re.sub(r'\[KO\](.*?)\[KO\]',
+ lambda x: korean_to_ipa(x.group(1))+' ', text)
+ text = re.sub(r'\[EN\](.*?)\[EN\]',
+ lambda x: english_to_ipa2(x.group(1))+' ', text)
+ text = re.sub(r'\s+$', '', text)
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+ return text
\ No newline at end of file
diff --git a/openvoice/text/english.py b/openvoice/text/english.py
new file mode 100644
index 0000000000000000000000000000000000000000..736a53a7bc66cfdd776aa1fa01439f1e6e46f1c9
--- /dev/null
+++ b/openvoice/text/english.py
@@ -0,0 +1,188 @@
+""" from https://github.com/keithito/tacotron """
+
+'''
+Cleaners are transformations that run over the input text at both training and eval time.
+
+Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
+hyperparameter. Some cleaners are English-specific. You'll typically want to use:
+ 1. "english_cleaners" for English text
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
+ the symbols in symbols.py to match your data).
+'''
+
+
+# Regular expression matching whitespace:
+
+
+import re
+import inflect
+from unidecode import unidecode
+import eng_to_ipa as ipa
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
+_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_number_re = re.compile(r'[0-9]+')
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+ ('mrs', 'misess'),
+ ('mr', 'mister'),
+ ('dr', 'doctor'),
+ ('st', 'saint'),
+ ('co', 'company'),
+ ('jr', 'junior'),
+ ('maj', 'major'),
+ ('gen', 'general'),
+ ('drs', 'doctors'),
+ ('rev', 'reverend'),
+ ('lt', 'lieutenant'),
+ ('hon', 'honorable'),
+ ('sgt', 'sergeant'),
+ ('capt', 'captain'),
+ ('esq', 'esquire'),
+ ('ltd', 'limited'),
+ ('col', 'colonel'),
+ ('ft', 'fort'),
+]]
+
+
+# List of (ipa, lazy ipa) pairs:
+_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('r', 'ɹ'),
+ ('æ', 'e'),
+ ('ɑ', 'a'),
+ ('ɔ', 'o'),
+ ('ð', 'z'),
+ ('θ', 's'),
+ ('ɛ', 'e'),
+ ('ɪ', 'i'),
+ ('ʊ', 'u'),
+ ('ʒ', 'ʥ'),
+ ('ʤ', 'ʥ'),
+ ('ˈ', '↓'),
+]]
+
+# List of (ipa, lazy ipa2) pairs:
+_lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('r', 'ɹ'),
+ ('ð', 'z'),
+ ('θ', 's'),
+ ('ʒ', 'ʑ'),
+ ('ʤ', 'dʑ'),
+ ('ˈ', '↓'),
+]]
+
+# List of (ipa, ipa2) pairs
+_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('r', 'ɹ'),
+ ('ʤ', 'dʒ'),
+ ('ʧ', 'tʃ')
+]]
+
+
+def expand_abbreviations(text):
+ for regex, replacement in _abbreviations:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def collapse_whitespace(text):
+ return re.sub(r'\s+', ' ', text)
+
+
+def _remove_commas(m):
+ return m.group(1).replace(',', '')
+
+
+def _expand_decimal_point(m):
+ return m.group(1).replace('.', ' point ')
+
+
+def _expand_dollars(m):
+ match = m.group(1)
+ parts = match.split('.')
+ if len(parts) > 2:
+ return match + ' dollars' # Unexpected format
+ dollars = int(parts[0]) if parts[0] else 0
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+ if dollars and cents:
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+ cent_unit = 'cent' if cents == 1 else 'cents'
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+ elif dollars:
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+ return '%s %s' % (dollars, dollar_unit)
+ elif cents:
+ cent_unit = 'cent' if cents == 1 else 'cents'
+ return '%s %s' % (cents, cent_unit)
+ else:
+ return 'zero dollars'
+
+
+def _expand_ordinal(m):
+ return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+ num = int(m.group(0))
+ if num > 1000 and num < 3000:
+ if num == 2000:
+ return 'two thousand'
+ elif num > 2000 and num < 2010:
+ return 'two thousand ' + _inflect.number_to_words(num % 100)
+ elif num % 100 == 0:
+ return _inflect.number_to_words(num // 100) + ' hundred'
+ else:
+ return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+ else:
+ return _inflect.number_to_words(num, andword='')
+
+
+def normalize_numbers(text):
+ text = re.sub(_comma_number_re, _remove_commas, text)
+ text = re.sub(_pounds_re, r'\1 pounds', text)
+ text = re.sub(_dollars_re, _expand_dollars, text)
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
+ text = re.sub(_number_re, _expand_number, text)
+ return text
+
+
+def mark_dark_l(text):
+ return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
+
+
+def english_to_ipa(text):
+ text = unidecode(text).lower()
+ text = expand_abbreviations(text)
+ text = normalize_numbers(text)
+ phonemes = ipa.convert(text)
+ phonemes = collapse_whitespace(phonemes)
+ return phonemes
+
+
+def english_to_lazy_ipa(text):
+ text = english_to_ipa(text)
+ for regex, replacement in _lazy_ipa:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def english_to_ipa2(text):
+ text = english_to_ipa(text)
+ text = mark_dark_l(text)
+ for regex, replacement in _ipa_to_ipa2:
+ text = re.sub(regex, replacement, text)
+ return text.replace('...', '…')
+
+
+def english_to_lazy_ipa2(text):
+ text = english_to_ipa(text)
+ for regex, replacement in _lazy_ipa2:
+ text = re.sub(regex, replacement, text)
+ return text
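+
+
+# --- Illustrative examples (not part of the original module) ---
+# normalize_numbers() expands currency and plain numbers before IPA conversion:
+#   >>> normalize_numbers('I owe you $3.50 and 21 dollars')
+#   'I owe you three dollars, fifty cents and twenty-one dollars'
+# english_to_ipa2() additionally marks dark /l/ and applies the ipa2 mapping;
+# its exact output depends on the eng_to_ipa dictionary.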
diff --git a/openvoice/text/mandarin.py b/openvoice/text/mandarin.py
new file mode 100644
index 0000000000000000000000000000000000000000..162e1b912dabec4b448ccd3d00d56306f82ce076
--- /dev/null
+++ b/openvoice/text/mandarin.py
@@ -0,0 +1,326 @@
+import os
+import sys
+import re
+from pypinyin import lazy_pinyin, BOPOMOFO
+import jieba
+import cn2an
+import logging
+
+
+# List of (Latin alphabet, bopomofo) pairs:
+_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+ ('a', 'ㄟˉ'),
+ ('b', 'ㄅㄧˋ'),
+ ('c', 'ㄙㄧˉ'),
+ ('d', 'ㄉㄧˋ'),
+ ('e', 'ㄧˋ'),
+ ('f', 'ㄝˊㄈㄨˋ'),
+ ('g', 'ㄐㄧˋ'),
+ ('h', 'ㄝˇㄑㄩˋ'),
+ ('i', 'ㄞˋ'),
+ ('j', 'ㄐㄟˋ'),
+ ('k', 'ㄎㄟˋ'),
+ ('l', 'ㄝˊㄛˋ'),
+ ('m', 'ㄝˊㄇㄨˋ'),
+ ('n', 'ㄣˉ'),
+ ('o', 'ㄡˉ'),
+ ('p', 'ㄆㄧˉ'),
+ ('q', 'ㄎㄧㄡˉ'),
+ ('r', 'ㄚˋ'),
+ ('s', 'ㄝˊㄙˋ'),
+ ('t', 'ㄊㄧˋ'),
+ ('u', 'ㄧㄡˉ'),
+ ('v', 'ㄨㄧˉ'),
+ ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
+ ('x', 'ㄝˉㄎㄨˋㄙˋ'),
+ ('y', 'ㄨㄞˋ'),
+ ('z', 'ㄗㄟˋ')
+]]
+
+# List of (bopomofo, romaji) pairs:
+_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('ㄅㄛ', 'p⁼wo'),
+ ('ㄆㄛ', 'pʰwo'),
+ ('ㄇㄛ', 'mwo'),
+ ('ㄈㄛ', 'fwo'),
+ ('ㄅ', 'p⁼'),
+ ('ㄆ', 'pʰ'),
+ ('ㄇ', 'm'),
+ ('ㄈ', 'f'),
+ ('ㄉ', 't⁼'),
+ ('ㄊ', 'tʰ'),
+ ('ㄋ', 'n'),
+ ('ㄌ', 'l'),
+ ('ㄍ', 'k⁼'),
+ ('ㄎ', 'kʰ'),
+ ('ㄏ', 'h'),
+ ('ㄐ', 'ʧ⁼'),
+ ('ㄑ', 'ʧʰ'),
+ ('ㄒ', 'ʃ'),
+ ('ㄓ', 'ʦ`⁼'),
+ ('ㄔ', 'ʦ`ʰ'),
+ ('ㄕ', 's`'),
+ ('ㄖ', 'ɹ`'),
+ ('ㄗ', 'ʦ⁼'),
+ ('ㄘ', 'ʦʰ'),
+ ('ㄙ', 's'),
+ ('ㄚ', 'a'),
+ ('ㄛ', 'o'),
+ ('ㄜ', 'ə'),
+ ('ㄝ', 'e'),
+ ('ㄞ', 'ai'),
+ ('ㄟ', 'ei'),
+ ('ㄠ', 'au'),
+ ('ㄡ', 'ou'),
+ ('ㄧㄢ', 'yeNN'),
+ ('ㄢ', 'aNN'),
+ ('ㄧㄣ', 'iNN'),
+ ('ㄣ', 'əNN'),
+ ('ㄤ', 'aNg'),
+ ('ㄧㄥ', 'iNg'),
+ ('ㄨㄥ', 'uNg'),
+ ('ㄩㄥ', 'yuNg'),
+ ('ㄥ', 'əNg'),
+ ('ㄦ', 'əɻ'),
+ ('ㄧ', 'i'),
+ ('ㄨ', 'u'),
+ ('ㄩ', 'ɥ'),
+ ('ˉ', '→'),
+ ('ˊ', '↑'),
+ ('ˇ', '↓↑'),
+ ('ˋ', '↓'),
+ ('˙', ''),
+ (',', ','),
+ ('。', '.'),
+ ('!', '!'),
+ ('?', '?'),
+ ('—', '-')
+]]
+
+# List of (romaji, ipa) pairs:
+_romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+ ('ʃy', 'ʃ'),
+ ('ʧʰy', 'ʧʰ'),
+ ('ʧ⁼y', 'ʧ⁼'),
+ ('NN', 'n'),
+ ('Ng', 'ŋ'),
+ ('y', 'j'),
+ ('h', 'x')
+]]
+
+# List of (bopomofo, ipa) pairs:
+_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('ㄅㄛ', 'p⁼wo'),
+ ('ㄆㄛ', 'pʰwo'),
+ ('ㄇㄛ', 'mwo'),
+ ('ㄈㄛ', 'fwo'),
+ ('ㄅ', 'p⁼'),
+ ('ㄆ', 'pʰ'),
+ ('ㄇ', 'm'),
+ ('ㄈ', 'f'),
+ ('ㄉ', 't⁼'),
+ ('ㄊ', 'tʰ'),
+ ('ㄋ', 'n'),
+ ('ㄌ', 'l'),
+ ('ㄍ', 'k⁼'),
+ ('ㄎ', 'kʰ'),
+ ('ㄏ', 'x'),
+ ('ㄐ', 'tʃ⁼'),
+ ('ㄑ', 'tʃʰ'),
+ ('ㄒ', 'ʃ'),
+ ('ㄓ', 'ts`⁼'),
+ ('ㄔ', 'ts`ʰ'),
+ ('ㄕ', 's`'),
+ ('ㄖ', 'ɹ`'),
+ ('ㄗ', 'ts⁼'),
+ ('ㄘ', 'tsʰ'),
+ ('ㄙ', 's'),
+ ('ㄚ', 'a'),
+ ('ㄛ', 'o'),
+ ('ㄜ', 'ə'),
+ ('ㄝ', 'ɛ'),
+ ('ㄞ', 'aɪ'),
+ ('ㄟ', 'eɪ'),
+ ('ㄠ', 'ɑʊ'),
+ ('ㄡ', 'oʊ'),
+ ('ㄧㄢ', 'jɛn'),
+ ('ㄩㄢ', 'ɥæn'),
+ ('ㄢ', 'an'),
+ ('ㄧㄣ', 'in'),
+ ('ㄩㄣ', 'ɥn'),
+ ('ㄣ', 'ən'),
+ ('ㄤ', 'ɑŋ'),
+ ('ㄧㄥ', 'iŋ'),
+ ('ㄨㄥ', 'ʊŋ'),
+ ('ㄩㄥ', 'jʊŋ'),
+ ('ㄥ', 'əŋ'),
+ ('ㄦ', 'əɻ'),
+ ('ㄧ', 'i'),
+ ('ㄨ', 'u'),
+ ('ㄩ', 'ɥ'),
+ ('ˉ', '→'),
+ ('ˊ', '↑'),
+ ('ˇ', '↓↑'),
+ ('ˋ', '↓'),
+ ('˙', ''),
+ (',', ','),
+ ('。', '.'),
+ ('!', '!'),
+ ('?', '?'),
+ ('—', '-')
+]]
+
+# List of (bopomofo, ipa2) pairs:
+_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
+ ('ㄅㄛ', 'pwo'),
+ ('ㄆㄛ', 'pʰwo'),
+ ('ㄇㄛ', 'mwo'),
+ ('ㄈㄛ', 'fwo'),
+ ('ㄅ', 'p'),
+ ('ㄆ', 'pʰ'),
+ ('ㄇ', 'm'),
+ ('ㄈ', 'f'),
+ ('ㄉ', 't'),
+ ('ㄊ', 'tʰ'),
+ ('ㄋ', 'n'),
+ ('ㄌ', 'l'),
+ ('ㄍ', 'k'),
+ ('ㄎ', 'kʰ'),
+ ('ㄏ', 'h'),
+ ('ㄐ', 'tɕ'),
+ ('ㄑ', 'tɕʰ'),
+ ('ㄒ', 'ɕ'),
+ ('ㄓ', 'tʂ'),
+ ('ㄔ', 'tʂʰ'),
+ ('ㄕ', 'ʂ'),
+ ('ㄖ', 'ɻ'),
+ ('ㄗ', 'ts'),
+ ('ㄘ', 'tsʰ'),
+ ('ㄙ', 's'),
+ ('ㄚ', 'a'),
+ ('ㄛ', 'o'),
+ ('ㄜ', 'ɤ'),
+ ('ㄝ', 'ɛ'),
+ ('ㄞ', 'aɪ'),
+ ('ㄟ', 'eɪ'),
+ ('ㄠ', 'ɑʊ'),
+ ('ㄡ', 'oʊ'),
+ ('ㄧㄢ', 'jɛn'),
+ ('ㄩㄢ', 'yæn'),
+ ('ㄢ', 'an'),
+ ('ㄧㄣ', 'in'),
+ ('ㄩㄣ', 'yn'),
+ ('ㄣ', 'ən'),
+ ('ㄤ', 'ɑŋ'),
+ ('ㄧㄥ', 'iŋ'),
+ ('ㄨㄥ', 'ʊŋ'),
+ ('ㄩㄥ', 'jʊŋ'),
+ ('ㄥ', 'ɤŋ'),
+ ('ㄦ', 'əɻ'),
+ ('ㄧ', 'i'),
+ ('ㄨ', 'u'),
+ ('ㄩ', 'y'),
+ ('ˉ', '˥'),
+ ('ˊ', '˧˥'),
+ ('ˇ', '˨˩˦'),
+ ('ˋ', '˥˩'),
+ ('˙', ''),
+ (',', ','),
+ ('。', '.'),
+ ('!', '!'),
+ ('?', '?'),
+ ('—', '-')
+]]
+
+
+def number_to_chinese(text):
+ numbers = re.findall(r'\d+(?:\.?\d+)?', text)
+ for number in numbers:
+ text = text.replace(number, cn2an.an2cn(number), 1)
+ return text
+
+
+def chinese_to_bopomofo(text):
+ text = text.replace('、', ',').replace(';', ',').replace(':', ',')
+ words = jieba.lcut(text, cut_all=False)
+ text = ''
+ for word in words:
+ bopomofos = lazy_pinyin(word, BOPOMOFO)
+ if not re.search('[\u4e00-\u9fff]', word):
+ text += word
+ continue
+ for i in range(len(bopomofos)):
+ bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
+ if text != '':
+ text += ' '
+ text += ''.join(bopomofos)
+ return text
+
+
+def latin_to_bopomofo(text):
+ for regex, replacement in _latin_to_bopomofo:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def bopomofo_to_romaji(text):
+ for regex, replacement in _bopomofo_to_romaji:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def bopomofo_to_ipa(text):
+ for regex, replacement in _bopomofo_to_ipa:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def bopomofo_to_ipa2(text):
+ for regex, replacement in _bopomofo_to_ipa2:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def chinese_to_romaji(text):
+ text = number_to_chinese(text)
+ text = chinese_to_bopomofo(text)
+ text = latin_to_bopomofo(text)
+ text = bopomofo_to_romaji(text)
+ text = re.sub('i([aoe])', r'y\1', text)
+ text = re.sub('u([aoəe])', r'w\1', text)
+ text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
+ r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
+ text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
+ return text
+
+
+def chinese_to_lazy_ipa(text):
+ text = chinese_to_romaji(text)
+ for regex, replacement in _romaji_to_ipa:
+ text = re.sub(regex, replacement, text)
+ return text
+
+
+def chinese_to_ipa(text):
+ text = number_to_chinese(text)
+ text = chinese_to_bopomofo(text)
+ text = latin_to_bopomofo(text)
+ text = bopomofo_to_ipa(text)
+ text = re.sub('i([aoe])', r'j\1', text)
+ text = re.sub('u([aoəe])', r'w\1', text)
+ text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
+ r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
+ text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
+ return text
+
+
+def chinese_to_ipa2(text):
+ text = number_to_chinese(text)
+ text = chinese_to_bopomofo(text)
+ text = latin_to_bopomofo(text)
+ text = bopomofo_to_ipa2(text)
+ text = re.sub(r'i([aoe])', r'j\1', text)
+ text = re.sub(r'u([aoəe])', r'w\1', text)
+ text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
+ text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
+ return text
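+
+# --- Illustrative sketch (not part of the original module) ---
+# The pipeline chains: Arabic numerals -> Chinese numerals (cn2an), word
+# segmentation + bopomofo (jieba/pypinyin), Latin letters -> bopomofo, and
+# finally bopomofo -> romaji/IPA. For example:
+#   >>> number_to_chinese('我有2个苹果')
+#   '我有二个苹果'
+# Output of the full chinese_to_ipa() chain depends on the installed
+# jieba/pypinyin versions.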
diff --git a/openvoice/text/symbols.py b/openvoice/text/symbols.py
new file mode 100644
index 0000000000000000000000000000000000000000..1231728d35b1f76b9da3f81a60fc46649c91501e
--- /dev/null
+++ b/openvoice/text/symbols.py
@@ -0,0 +1,88 @@
+'''
+Defines the set of symbols used in text input to the model.
+'''
+
+# japanese_cleaners
+# _pad = '_'
+# _punctuation = ',.!?-'
+# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
+
+
+'''# japanese_cleaners2
+_pad = '_'
+_punctuation = ',.!?-~…'
+_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
+'''
+
+
+'''# korean_cleaners
+_pad = '_'
+_punctuation = ',.!?…~'
+_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
+'''
+
+'''# chinese_cleaners
+_pad = '_'
+_punctuation = ',。!?—…'
+_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
+'''
+
+# # zh_ja_mixture_cleaners
+# _pad = '_'
+# _punctuation = ',.!?-~…'
+# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
+
+
+'''# sanskrit_cleaners
+_pad = '_'
+_punctuation = '।'
+_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
+'''
+
+'''# cjks_cleaners
+_pad = '_'
+_punctuation = ',.!?-~…'
+_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
+'''
+
+'''# thai_cleaners
+_pad = '_'
+_punctuation = '.!? '
+_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
+'''
+
+# # cjke_cleaners2
+_pad = '_'
+_punctuation = ',.!?-~…'
+_letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
+
+
+'''# shanghainese_cleaners
+_pad = '_'
+_punctuation = ',.!?…'
+_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
+'''
+
+'''# chinese_dialect_cleaners
+_pad = '_'
+_punctuation = ',.!?~…─'
+_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
+'''
+
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters)
+
+# Special symbol ids
+SPACE_ID = symbols.index(" ")
+
+num_ja_tones = 1
+num_kr_tones = 1
+num_zh_tones = 6
+num_en_tones = 4
+
+language_tone_start_map = {
+    "ZH": 0,
+    "JP": num_zh_tones,
+    "EN": num_zh_tones + num_ja_tones,
+    "KR": num_zh_tones + num_ja_tones + num_en_tones,
+}
\ No newline at end of file
diff --git a/openvoice/transforms.py b/openvoice/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..a11f799e023864ff7082c1f49c0cc18351a13b47
--- /dev/null
+++ b/openvoice/transforms.py
@@ -0,0 +1,209 @@
+import torch
+from torch.nn import functional as F
+
+import numpy as np
+
+
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ tails=None,
+ tail_bound=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ if tails is None:
+ spline_fn = rational_quadratic_spline
+ spline_kwargs = {}
+ else:
+ spline_fn = unconstrained_rational_quadratic_spline
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+
+ outputs, logabsdet = spline_fn(
+ inputs=inputs,
+ unnormalized_widths=unnormalized_widths,
+ unnormalized_heights=unnormalized_heights,
+ unnormalized_derivatives=unnormalized_derivatives,
+ inverse=inverse,
+ min_bin_width=min_bin_width,
+ min_bin_height=min_bin_height,
+ min_derivative=min_derivative,
+ **spline_kwargs
+ )
+ return outputs, logabsdet
+
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+ bin_locations[..., -1] += eps
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+
+
+def unconstrained_rational_quadratic_spline(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ tails="linear",
+ tail_bound=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+ outside_interval_mask = ~inside_interval_mask
+
+ outputs = torch.zeros_like(inputs)
+ logabsdet = torch.zeros_like(inputs)
+
+ if tails == "linear":
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+ constant = np.log(np.exp(1 - min_derivative) - 1)
+ unnormalized_derivatives[..., 0] = constant
+ unnormalized_derivatives[..., -1] = constant
+
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
+ logabsdet[outside_interval_mask] = 0
+ else:
+ raise RuntimeError("{} tails are not implemented.".format(tails))
+
+ (
+ outputs[inside_interval_mask],
+ logabsdet[inside_interval_mask],
+ ) = rational_quadratic_spline(
+ inputs=inputs[inside_interval_mask],
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+ inverse=inverse,
+ left=-tail_bound,
+ right=tail_bound,
+ bottom=-tail_bound,
+ top=tail_bound,
+ min_bin_width=min_bin_width,
+ min_bin_height=min_bin_height,
+ min_derivative=min_derivative,
+ )
+
+ return outputs, logabsdet
+
+
+def rational_quadratic_spline(
+ inputs,
+ unnormalized_widths,
+ unnormalized_heights,
+ unnormalized_derivatives,
+ inverse=False,
+ left=0.0,
+ right=1.0,
+ bottom=0.0,
+ top=1.0,
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
+):
+ if torch.min(inputs) < left or torch.max(inputs) > right:
+ raise ValueError("Input to a transform is not within its domain")
+
+ num_bins = unnormalized_widths.shape[-1]
+
+ if min_bin_width * num_bins > 1.0:
+ raise ValueError("Minimal bin width too large for the number of bins")
+ if min_bin_height * num_bins > 1.0:
+ raise ValueError("Minimal bin height too large for the number of bins")
+
+ widths = F.softmax(unnormalized_widths, dim=-1)
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+ cumwidths = torch.cumsum(widths, dim=-1)
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+ cumwidths = (right - left) * cumwidths + left
+ cumwidths[..., 0] = left
+ cumwidths[..., -1] = right
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+ heights = F.softmax(unnormalized_heights, dim=-1)
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+ cumheights = torch.cumsum(heights, dim=-1)
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+ cumheights = (top - bottom) * cumheights + bottom
+ cumheights[..., 0] = bottom
+ cumheights[..., -1] = top
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+ if inverse:
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
+ else:
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+ delta = heights / widths
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+ if inverse:
+ a = (inputs - input_cumheights) * (
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
+ ) + input_heights * (input_delta - input_derivatives)
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
+ )
+ c = -input_delta * (inputs - input_cumheights)
+
+ discriminant = b.pow(2) - 4 * a * c
+ assert (discriminant >= 0).all()
+
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
+ outputs = root * input_bin_widths + input_cumwidths
+
+ theta_one_minus_theta = root * (1 - root)
+ denominator = input_delta + (
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+ * theta_one_minus_theta
+ )
+ derivative_numerator = input_delta.pow(2) * (
+ input_derivatives_plus_one * root.pow(2)
+ + 2 * input_delta * theta_one_minus_theta
+ + input_derivatives * (1 - root).pow(2)
+ )
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+ return outputs, -logabsdet
+ else:
+ theta = (inputs - input_cumwidths) / input_bin_widths
+ theta_one_minus_theta = theta * (1 - theta)
+
+ numerator = input_heights * (
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+ )
+ denominator = input_delta + (
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+ * theta_one_minus_theta
+ )
+ outputs = input_cumheights + numerator / denominator
+
+ derivative_numerator = input_delta.pow(2) * (
+ input_derivatives_plus_one * theta.pow(2)
+ + 2 * input_delta * theta_one_minus_theta
+ + input_derivatives * (1 - theta).pow(2)
+ )
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+ return outputs, logabsdet
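+
+
+# --- Illustrative round-trip check (not part of the original module) ---
+# With linear tails the transform is invertible: applying the inverse with the
+# same (arbitrary) spline parameters should recover the input up to numerical
+# error. Parameters carry a trailing num_bins dimension; derivatives carry
+# num_bins - 1 entries for linear tails.
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    n, num_bins = 8, 10
+    x = torch.randn(n) * 2.0
+    widths = torch.randn(n, num_bins)
+    heights = torch.randn(n, num_bins)
+    derivs = torch.randn(n, num_bins - 1)
+    y, _ = piecewise_rational_quadratic_transform(
+        x, widths, heights, derivs, tails="linear", tail_bound=4.0
+    )
+    x_rec, _ = piecewise_rational_quadratic_transform(
+        y, widths, heights, derivs, inverse=True, tails="linear", tail_bound=4.0
+    )
+    print(torch.max(torch.abs(x_rec - x)))  # ~0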
diff --git a/openvoice/utils.py b/openvoice/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e80909d6a03976400322cc0219d1871e9f84bfa
--- /dev/null
+++ b/openvoice/utils.py
@@ -0,0 +1,194 @@
+import re
+import json
+import numpy as np
+
+
+def get_hparams_from_file(config_path):
+ with open(config_path, "r", encoding="utf-8") as f:
+ data = f.read()
+ config = json.loads(data)
+
+ hparams = HParams(**config)
+ return hparams
+
+class HParams:
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+            if isinstance(v, dict):
+ v = HParams(**v)
+ self[k] = v
+
+ def keys(self):
+ return self.__dict__.keys()
+
+ def items(self):
+ return self.__dict__.items()
+
+ def values(self):
+ return self.__dict__.values()
+
+ def __len__(self):
+ return len(self.__dict__)
+
+ def __getitem__(self, key):
+ return getattr(self, key)
+
+ def __setitem__(self, key, value):
+ return setattr(self, key, value)
+
+ def __contains__(self, key):
+ return key in self.__dict__
+
+ def __repr__(self):
+ return self.__dict__.__repr__()
+
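+
+# Illustrative note (not part of the original code): nested dicts become nested
+# HParams, so a config such as {"data": {"sampling_rate": 22050}} loaded via
+# get_hparams_from_file() can be read as hps.data.sampling_rate.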
+
+def string_to_bits(string, pad_len=8):
+ # Convert each character to its ASCII value
+ ascii_values = [ord(char) for char in string]
+
+ # Convert ASCII values to binary representation
+ binary_values = [bin(value)[2:].zfill(8) for value in ascii_values]
+
+ # Convert binary strings to integer arrays
+ bit_arrays = [[int(bit) for bit in binary] for binary in binary_values]
+
+ # Convert list of arrays to NumPy array
+ numpy_array = np.array(bit_arrays)
+    # Pad to pad_len rows; padding rows get bit pattern 00100000 (ASCII space)
+    numpy_array_full = np.zeros((pad_len, 8), dtype=numpy_array.dtype)
+    numpy_array_full[:, 2] = 1
+ max_len = min(pad_len, len(numpy_array))
+ numpy_array_full[:max_len] = numpy_array[:max_len]
+ return numpy_array_full
+
+
+def bits_to_string(bits_array):
+ # Convert each row of the array to a binary string
+ binary_values = [''.join(str(bit) for bit in row) for row in bits_array]
+
+ # Convert binary strings to ASCII values
+ ascii_values = [int(binary, 2) for binary in binary_values]
+
+ # Convert ASCII values to characters
+ output_string = ''.join(chr(value) for value in ascii_values)
+
+ return output_string
+
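+
+# Illustrative note (not part of the original code): string_to_bits pads to
+# pad_len rows whose default bit pattern 00100000 decodes to an ASCII space,
+# so a round trip pads short strings with trailing spaces, e.g.
+#   >>> bits_to_string(string_to_bits('hi'))
+#   'hi      '   (six trailing spaces)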
+
+def split_sentence(text, min_len=10, language_str='[EN]'):
+    # NOTE: only the bare code 'EN' selects the Latin splitter; any other value
+    # (including the '[EN]' default) falls through to the Chinese splitter.
+    if language_str in ['EN']:
+        sentences = split_sentences_latin(text, min_len=min_len)
+ else:
+ sentences = split_sentences_zh(text, min_len=min_len)
+ return sentences
+
+def split_sentences_latin(text, min_len=10):
+    """Split long text into a list of shorter sentences.
+
+    Args:
+        text (str): input text.
+        min_len (int): approximate minimum number of words per output sentence.
+
+    Returns:
+        List[str]: list of output sentences.
+    """
+ # deal with dirty sentences
+ text = re.sub('[。!?;]', '.', text)
+ text = re.sub('[,]', ',', text)
+ text = re.sub('[“”]', '"', text)
+ text = re.sub('[‘’]', "'", text)
+ text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
+ text = re.sub('[\n\t ]+', ' ', text)
+ text = re.sub('([,.!?;])', r'\1 $#!', text)
+ # split
+ sentences = [s.strip() for s in text.split('$#!')]
+ if len(sentences[-1]) == 0: del sentences[-1]
+
+ new_sentences = []
+ new_sent = []
+ count_len = 0
+ for ind, sent in enumerate(sentences):
+ # print(sent)
+ new_sent.append(sent)
+ count_len += len(sent.split(" "))
+ if count_len > min_len or ind == len(sentences) - 1:
+ count_len = 0
+ new_sentences.append(' '.join(new_sent))
+ new_sent = []
+ return merge_short_sentences_latin(new_sentences)
+
+
+def merge_short_sentences_latin(sens):
+ """Avoid short sentences by merging them with the following sentence.
+
+ Args:
+ List[str]: list of input sentences.
+
+ Returns:
+ List[str]: list of output sentences.
+ """
+ sens_out = []
+ for s in sens:
+        # If the previous sentence is too short, merge it with
+        # the current sentence.
+ if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
+ sens_out[-1] = sens_out[-1] + " " + s
+ else:
+ sens_out.append(s)
+    try:
+        if len(sens_out[-1].split(" ")) <= 2:
+            sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+            sens_out.pop(-1)
+    except IndexError:
+        # fewer than two sentences; nothing to merge
+        pass
+ return sens_out
+
+def split_sentences_zh(text, min_len=10):
+ text = re.sub('[。!?;]', '.', text)
+ text = re.sub('[,]', ',', text)
+    # replace newlines, tabs and runs of spaces with a single space
+    text = re.sub('[\n\t ]+', ' ', text)
+    # insert a split marker after each punctuation mark
+    text = re.sub('([,.!?;])', r'\1 $#!', text)
+    # split into sentences and strip surrounding whitespace
+    # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
+ sentences = [s.strip() for s in text.split('$#!')]
+ if len(sentences[-1]) == 0: del sentences[-1]
+
+ new_sentences = []
+ new_sent = []
+ count_len = 0
+ for ind, sent in enumerate(sentences):
+ new_sent.append(sent)
+ count_len += len(sent)
+ if count_len > min_len or ind == len(sentences) - 1:
+ count_len = 0
+ new_sentences.append(' '.join(new_sent))
+ new_sent = []
+ return merge_short_sentences_zh(new_sentences)
+
+
+def merge_short_sentences_zh(sens):
+ # return sens
+ """Avoid short sentences by merging them with the following sentence.
+
+ Args:
+ List[str]: list of input sentences.
+
+ Returns:
+ List[str]: list of output sentences.
+ """
+ sens_out = []
+ for s in sens:
+        # If the previous sentence is too short, merge it with
+        # the current sentence.
+ if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
+ sens_out[-1] = sens_out[-1] + " " + s
+ else:
+ sens_out.append(s)
+    try:
+        if len(sens_out[-1]) <= 2:
+            sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+            sens_out.pop(-1)
+    except IndexError:
+        # fewer than two sentences; nothing to merge
+        pass
+ return sens_out
\ No newline at end of file
diff --git a/outputs/output_chinese.wav b/outputs/output_chinese.wav
new file mode 100644
index 0000000000000000000000000000000000000000..a96c53407610f883cda1ea669dd588214ca1a458
--- /dev/null
+++ b/outputs/output_chinese.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf0c51592ad1d5f4bdf1eb51e4c0595a1c9fd5011655fc859999ec31ab43f48
+size 1354796
diff --git a/outputs/output_en_default.wav b/outputs/output_en_default.wav
new file mode 100644
index 0000000000000000000000000000000000000000..9f484639ce6d400ebc51568411e7c7b3d9681c9f
Binary files /dev/null and b/outputs/output_en_default.wav differ
diff --git a/outputs/output_whispering.wav b/outputs/output_whispering.wav
new file mode 100644
index 0000000000000000000000000000000000000000..08375ce2aba429fd51961f9763c67f83a3dc01aa
Binary files /dev/null and b/outputs/output_whispering.wav differ
diff --git a/outputs/tmp.wav b/outputs/tmp.wav
new file mode 100644
index 0000000000000000000000000000000000000000..4eb1ab3690cea0eb0759a7b0c002f893c95f3c24
--- /dev/null
+++ b/outputs/tmp.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5035b53792c67c7ba2d1b38aad0b337be804b51c98ee05dd70eac7f2169c080
+size 2714024
diff --git a/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/se.pth b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/se.pth
new file mode 100644
index 0000000000000000000000000000000000000000..cca081b05ee60921fa3413d3737b8ff4ef5f12b7
--- /dev/null
+++ b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/se.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:874cc4d5b02a6cc4d2c6732a24b46b9a05b0c874174a129a9524568c420b38b4
+size 2115
diff --git a/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav
new file mode 100644
index 0000000000000000000000000000000000000000..c578a478b7de70a7138e44fd5b1684ea2a614bed
--- /dev/null
+++ b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg0.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c633d0970b01ff9cd24967140aadb91dc805d8569cc10351c42ab20c39ba3cb
+size 1700184
diff --git a/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav
new file mode 100644
index 0000000000000000000000000000000000000000..cbcc8f7670d9ef3abcda587208b92f9208f24043
--- /dev/null
+++ b/processed/demo_speaker0_v1_47DEQpj8HBSa+_^TI/wavs/demo_speaker0_v1_47DEQpj8HBSa+_^TI_seg1.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de784d788f6fa1bb68332779344dadcbd00352a4590af2dfbb61eee516f0a64d
+size 1700364
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8ddba70d611d3b15ed5b4b2ef4a020a2bd0c61df
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+librosa==0.9.1
+faster-whisper==0.9.0
+pydub==0.25.1
+wavmark==0.0.3
+numpy==1.22.0
+eng_to_ipa==0.0.2
+inflect==7.0.0
+unidecode==1.3.7
+whisper-timestamped==1.14.2
+openai
+python-dotenv
+pypinyin==0.50.0
+cn2an==0.5.22
+jieba==0.42.1
+gradio==3.48.0
+langid==1.1.6
diff --git a/resources/demo_speaker0.mp3 b/resources/demo_speaker0.mp3
new file mode 100644
index 0000000000000000000000000000000000000000..bf1e54698dc002c32ed5455a1ba343c2dfa21b1a
Binary files /dev/null and b/resources/demo_speaker0.mp3 differ
diff --git a/resources/demo_speaker1.mp3 b/resources/demo_speaker1.mp3
new file mode 100644
index 0000000000000000000000000000000000000000..28e324cfd1393b8b4593b172ced0ac8cce5cdb81
Binary files /dev/null and b/resources/demo_speaker1.mp3 differ
diff --git a/resources/demo_speaker2.mp3 b/resources/demo_speaker2.mp3
new file mode 100644
index 0000000000000000000000000000000000000000..e20c8c654803218ae22c42ef36b7e8be20320557
Binary files /dev/null and b/resources/demo_speaker2.mp3 differ
diff --git a/resources/example_reference.mp3 b/resources/example_reference.mp3
new file mode 100644
index 0000000000000000000000000000000000000000..af220d86e7e2aaad9f746bdd4e0b099e151988d3
Binary files /dev/null and b/resources/example_reference.mp3 differ
diff --git a/resources/framework-ipa.png b/resources/framework-ipa.png
new file mode 100644
index 0000000000000000000000000000000000000000..7cdfbe421d2844d729103f4ea6ad74d449b5ee5f
Binary files /dev/null and b/resources/framework-ipa.png differ
diff --git a/resources/huggingface.png b/resources/huggingface.png
new file mode 100644
index 0000000000000000000000000000000000000000..eb5f30148b2f284f7e3d4532784899255352c616
Binary files /dev/null and b/resources/huggingface.png differ
diff --git a/resources/lepton-hd.png b/resources/lepton-hd.png
new file mode 100644
index 0000000000000000000000000000000000000000..c8e6c5406ba6521886e603989cf264a9231936ca
Binary files /dev/null and b/resources/lepton-hd.png differ
diff --git a/resources/myshell-hd.png b/resources/myshell-hd.png
new file mode 100644
index 0000000000000000000000000000000000000000..f62bc37c663c62c6f332c6d0446979e8b0e94ebd
Binary files /dev/null and b/resources/myshell-hd.png differ
diff --git a/resources/openvoicelogo.jpg b/resources/openvoicelogo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1bc9b9e38bae8ee38e998f5136a1e7a9ed967e80
Binary files /dev/null and b/resources/openvoicelogo.jpg differ
diff --git a/resources/tts-guide.png b/resources/tts-guide.png
new file mode 100644
index 0000000000000000000000000000000000000000..169e810e78e260a3304f0f26ff723344f99ab2d7
Binary files /dev/null and b/resources/tts-guide.png differ
diff --git a/resources/voice-clone-guide.png b/resources/voice-clone-guide.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9ff2bcb6ff469b255b6909418b83bab91fbfb84
Binary files /dev/null and b/resources/voice-clone-guide.png differ
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..33d43c4ee51b31b9beeb394af39698510d569d30
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,45 @@
+from setuptools import setup, find_packages
+
+
+setup(name='MyShell-OpenVoice',
+ version='0.0.0',
+ description='Instant voice cloning by MyShell.',
+ long_description=open('README.md').read().strip(),
+ long_description_content_type='text/markdown',
+ keywords=[
+ 'text-to-speech',
+ 'tts',
+ 'voice-clone',
+ 'zero-shot-tts'
+ ],
+ url='https://github.com/myshell-ai/OpenVoice',
+ project_urls={
+ 'Documentation': 'https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md',
+ 'Changes': 'https://github.com/myshell-ai/OpenVoice/releases',
+ 'Code': 'https://github.com/myshell-ai/OpenVoice',
+ 'Issue tracker': 'https://github.com/myshell-ai/OpenVoice/issues',
+ },
+ author='MyShell',
+ author_email='ethan@myshell.ai',
+ license='MIT License',
+ packages=find_packages(),
+
+ python_requires='>=3.9',
+ install_requires=[
+ 'librosa==0.9.1',
+ 'faster-whisper==0.9.0',
+ 'pydub==0.25.1',
+ 'wavmark==0.0.3',
+ 'numpy==1.22.0',
+ 'eng_to_ipa==0.0.2',
+ 'inflect==7.0.0',
+ 'unidecode==1.3.7',
+ 'whisper-timestamped==1.14.2',
+ 'pypinyin==0.50.0',
+ 'cn2an==0.5.22',
+ 'jieba==0.42.1',
+ 'gradio==3.48.0',
+ 'langid==1.1.6'
+ ],
+ zip_safe=False
+ )