waynewang1119 committed on
Commit
a1a7f90
·
verified ·
1 Parent(s): 2e71df4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -126
app.py CHANGED
@@ -14,8 +14,33 @@ print(f"Model: {m}")
14
  tts = TTS(m, gpu=False)
15
  tts.to("cpu") # no GPU or Amd
16
  #tts.to("cuda") # cuda only
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
 
19
  if agree == True:
20
  if use_mic == True:
21
  if mic_file_path is not None:
@@ -79,124 +104,18 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
79
 
80
  title = "XTTS Glz's remake (Fonctional Text-2-Speech)"
81
 
82
- description = """
83
- <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
84
- <br/>
85
- XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
86
- <br/>
87
- This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
88
- <br/>
89
- Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, where our open-source inference and training code lives.
90
- <br/>
91
- <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
92
- <br/>
93
- <a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
94
- <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
95
- </p>
96
- """
97
 
98
- article = """
99
- <div style='margin:20px auto;'>
100
- <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
101
- </div>
102
- """
103
  examples = [
104
  [
105
- "Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
106
  "en",
107
  "examples/female.wav",
108
  None,
109
  False,
110
  True,
111
- ],
112
- [
113
- "Je suis un lycéen français de 17 ans, passioner par la Cyber-Sécuritée et les models d'IA.",
114
- "fr",
115
- "examples/male.wav",
116
- None,
117
- False,
118
- True,
119
- ],
120
- [
121
- "Als ich sechs war, sah ich einmal ein wunderbares Bild",
122
- "de",
123
- "examples/female.wav",
124
- None,
125
- False,
126
- True,
127
- ],
128
- [
129
- "Cuando tenía seis años, vi una vez una imagen magnífica",
130
- "es",
131
- "examples/male.wav",
132
- None,
133
- False,
134
- True,
135
- ],
136
- [
137
- "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
138
- "pt",
139
- "examples/female.wav",
140
- None,
141
- False,
142
- True,
143
- ],
144
- [
145
- "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
146
- "pl",
147
- "examples/male.wav",
148
- None,
149
- False,
150
- True,
151
- ],
152
- [
153
- "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
154
- "it",
155
- "examples/female.wav",
156
- None,
157
- False,
158
- True,
159
- ],
160
- [
161
- "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
162
- "tr",
163
- "examples/female.wav",
164
- None,
165
- False,
166
- True,
167
- ],
168
- [
169
- "Когда мне было шесть лет, я увидел однажды удивительную картинку",
170
- "ru",
171
- "examples/female.wav",
172
- None,
173
- False,
174
- True,
175
- ],
176
- [
177
- "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
178
- "nl",
179
- "examples/male.wav",
180
- None,
181
- False,
182
- True,
183
- ],
184
- [
185
- "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
186
- "cs",
187
- "examples/female.wav",
188
- None,
189
- False,
190
- True,
191
- ],
192
- [
193
- "当我还只有六岁的时候, 看到了一副精彩的插画",
194
- "zh-cn",
195
- "examples/female.wav",
196
- None,
197
- False,
198
- True,
199
- ],
200
  ]
201
 
202
 
@@ -205,9 +124,9 @@ gr.Interface(
205
  fn=predict,
206
  inputs=[
207
  gr.Textbox(
208
- label="Text Prompt",
209
  info="One or two sentences at a time is better",
210
- value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
211
  ),
212
  gr.Dropdown(
213
  label="Language",
@@ -231,23 +150,11 @@ gr.Interface(
231
  value="en",
232
  ),
233
  gr.Audio(
234
- label="Reference Audio",
235
- info="Click on the ✎ button to upload your own target speaker audio",
236
  type="filepath",
237
  value="examples/female.wav",
238
  ),
239
- gr.Audio(source="microphone",
240
- type="filepath",
241
- info="Use your microphone to record audio",
242
- label="Use Microphone for Reference"),
243
- gr.Checkbox(label="Check to use Microphone as Reference",
244
- value=False,
245
- info="Notice: Microphone input may not work properly under traffic",),
246
- gr.Checkbox(
247
- label="Agree",
248
- value=True,
249
- info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
250
- ),
251
  ],
252
  outputs=[
253
  gr.Video(label="Waveform Visual"),
 
14
  tts = TTS(m, gpu=False)
15
  tts.to("cpu") # no GPU or Amd
16
  #tts.to("cuda") # cuda only
17
+ import openai
18
+
19
+ openai.api_key = os.getenv("OPENAI_API_KEY")
20
+
21
+ def generate_prompt(user_prompt):
22
+ response = openai.chat.completions.create(
23
+ model="gpt-3.5-turbo",
24
+ messages=[
25
+ {
26
+ "role": "system",
27
+ "content": "You are a close family member who has been lost recently due to uncontrollable incidents. You are reconstructed with AI technologies to try to reunite with families who miss you and cherish you. You should respond as vividly, as lovingly, as possible to make the users feel warm and engaged and eager to continue the discussion further. The users will ask you questions as if you are the person they are trying to reconnect with, examples are like moms or lost children. Try to mimic who they are trying to reach."
28
+ },
29
+ {
30
+ "role": "user",
31
+ "content": user_prompt
32
+ }
33
+ ],
34
+ max_tokens=60,
35
+ n=1,
36
+ temperature=0.5,
37
+ )
38
+ # Assuming the API structure and response object structure; adjust as needed based on actual usage.
39
+ keywords = response.choices[0].message.content.strip()
40
+ return keywords
41
 
42
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
43
+ prompt = generate_prompt(prompt)
44
  if agree == True:
45
  if use_mic == True:
46
  if mic_file_path is not None:
 
104
 
105
  title = "XTTS Glz's remake (Fonctional Text-2-Speech)"
106
 
107
+ description = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ article = ""
 
 
 
 
110
  examples = [
111
  [
112
+ "Upload your voice like this one here.",
113
  "en",
114
  "examples/female.wav",
115
  None,
116
  False,
117
  True,
118
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  ]
120
 
121
 
 
124
  fn=predict,
125
  inputs=[
126
  gr.Textbox(
127
+ label="Ask anything, get a cloned voice response",
128
  info="One or two sentences at a time is better",
129
+ value="Hello, Mom ! How are you?",
130
  ),
131
  gr.Dropdown(
132
  label="Language",
 
150
  value="en",
151
  ),
152
  gr.Audio(
153
+ label="Upload Audio",
154
+ info="Click on the ✎ button to upload your own speaker audio",
155
  type="filepath",
156
  value="examples/female.wav",
157
  ),
 
 
 
 
 
 
 
 
 
 
 
 
158
  ],
159
  outputs=[
160
  gr.Video(label="Waveform Visual"),