waynewang1119 committed on
Commit
a1a7f90
·
verified ·
1 Parent(s): 2e71df4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -126
app.py CHANGED
@@ -14,8 +14,33 @@ print(f"Model: {m}")
14
  tts = TTS(m, gpu=False)
15
  tts.to("cpu") # no GPU or Amd
16
  #tts.to("cuda") # cuda only
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
 
19
  if agree == True:
20
  if use_mic == True:
21
  if mic_file_path is not None:
@@ -79,124 +104,18 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
79
 
80
  title = "XTTS Glz's remake (Fonctional Text-2-Speech)"
81
 
82
- description = """
83
- <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
84
- <br/>
85
- XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
86
- <br/>
87
- This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
88
- <br/>
89
- Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, where our open-source inference and training code lives.
90
- <br/>
91
- <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
92
- <br/>
93
- <a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
94
- <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
95
- </p>
96
- """
97
 
98
- article = """
99
- <div style='margin:20px auto;'>
100
- <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
101
- </div>
102
- """
103
  examples = [
104
  [
105
- "Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
106
  "en",
107
  "examples/female.wav",
108
  None,
109
  False,
110
  True,
111
- ],
112
- [
113
- "Je suis un lycéen français de 17 ans, passioner par la Cyber-Sécuritée et les models d'IA.",
114
- "fr",
115
- "examples/male.wav",
116
- None,
117
- False,
118
- True,
119
- ],
120
- [
121
- "Als ich sechs war, sah ich einmal ein wunderbares Bild",
122
- "de",
123
- "examples/female.wav",
124
- None,
125
- False,
126
- True,
127
- ],
128
- [
129
- "Cuando tenía seis años, vi una vez una imagen magnífica",
130
- "es",
131
- "examples/male.wav",
132
- None,
133
- False,
134
- True,
135
- ],
136
- [
137
- "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
138
- "pt",
139
- "examples/female.wav",
140
- None,
141
- False,
142
- True,
143
- ],
144
- [
145
- "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
146
- "pl",
147
- "examples/male.wav",
148
- None,
149
- False,
150
- True,
151
- ],
152
- [
153
- "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
154
- "it",
155
- "examples/female.wav",
156
- None,
157
- False,
158
- True,
159
- ],
160
- [
161
- "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
162
- "tr",
163
- "examples/female.wav",
164
- None,
165
- False,
166
- True,
167
- ],
168
- [
169
- "Когда мне было шесть лет, я увидел однажды удивительную картинку",
170
- "ru",
171
- "examples/female.wav",
172
- None,
173
- False,
174
- True,
175
- ],
176
- [
177
- "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
178
- "nl",
179
- "examples/male.wav",
180
- None,
181
- False,
182
- True,
183
- ],
184
- [
185
- "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
186
- "cs",
187
- "examples/female.wav",
188
- None,
189
- False,
190
- True,
191
- ],
192
- [
193
- "当我还只有六岁的时候, 看到了一副精彩的插画",
194
- "zh-cn",
195
- "examples/female.wav",
196
- None,
197
- False,
198
- True,
199
- ],
200
  ]
201
 
202
 
@@ -205,9 +124,9 @@ gr.Interface(
205
  fn=predict,
206
  inputs=[
207
  gr.Textbox(
208
- label="Text Prompt",
209
  info="One or two sentences at a time is better",
210
- value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
211
  ),
212
  gr.Dropdown(
213
  label="Language",
@@ -231,23 +150,11 @@ gr.Interface(
231
  value="en",
232
  ),
233
  gr.Audio(
234
- label="Reference Audio",
235
- info="Click on the ✎ button to upload your own target speaker audio",
236
  type="filepath",
237
  value="examples/female.wav",
238
  ),
239
- gr.Audio(source="microphone",
240
- type="filepath",
241
- info="Use your microphone to record audio",
242
- label="Use Microphone for Reference"),
243
- gr.Checkbox(label="Check to use Microphone as Reference",
244
- value=False,
245
- info="Notice: Microphone input may not work properly under traffic",),
246
- gr.Checkbox(
247
- label="Agree",
248
- value=True,
249
- info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
250
- ),
251
  ],
252
  outputs=[
253
  gr.Video(label="Waveform Visual"),
 
14
  tts = TTS(m, gpu=False)
15
  tts.to("cpu") # no GPU or Amd
16
  #tts.to("cuda") # cuda only
17
+ import openai
18
+
19
+ openai.api_key = os.getenv("OPENAI_API_KEY")
20
+
21
+ def generate_prompt(user_prompt):
22
+ response = openai.chat.completions.create(
23
+ model="gpt-3.5-turbo",
24
+ messages=[
25
+ {
26
+ "role": "system",
27
+ "content": "You are a close family member who has been lost recently due to uncontrollable incidents. You are reconstructed with AI technologies to try to reunite with families who miss you and cherish you. You should respond as vividly, as lovingly, as possible to make the users feel warm and engaged and eager to continue the discussion further. The users will ask you questions as if you are the person they are trying to reconnect with, examples are like moms or lost children. Try to mimic who they are trying to reach."
28
+ },
29
+ {
30
+ "role": "user",
31
+ "content": user_prompt
32
+ }
33
+ ],
34
+ max_tokens=60,
35
+ n=1,
36
+ temperature=0.5,
37
+ )
38
+ # Assuming the API structure and response object structure; adjust as needed based on actual usage.
39
+ keywords = response.choices[0].message.content.strip()
40
+ return keywords
41
 
42
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
43
+ prompt = generate_prompt(prompt)
44
  if agree == True:
45
  if use_mic == True:
46
  if mic_file_path is not None:
 
104
 
105
  title = "XTTS Glz's remake (Fonctional Text-2-Speech)"
106
 
107
+ description = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ article = ""
 
 
 
 
110
  examples = [
111
  [
112
+ "Upload your voice like this one here.",
113
  "en",
114
  "examples/female.wav",
115
  None,
116
  False,
117
  True,
118
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  ]
120
 
121
 
 
124
  fn=predict,
125
  inputs=[
126
  gr.Textbox(
127
+ label="Ask anything, get a cloned voice response",
128
  info="One or two sentences at a time is better",
129
+ value="Hello, Mom ! How are you?",
130
  ),
131
  gr.Dropdown(
132
  label="Language",
 
150
  value="en",
151
  ),
152
  gr.Audio(
153
+ label="Upload Audio",
154
+ info="Click on the ✎ button to upload your own speaker audio",
155
  type="filepath",
156
  value="examples/female.wav",
157
  ),
 
 
 
 
 
 
 
 
 
 
 
 
158
  ],
159
  outputs=[
160
  gr.Video(label="Waveform Visual"),