Spaces:

coqui
/

voice-chat-with-mistral

Paused

App Files Files Community

xtts-and-whisper-update

by gorkemgoknar - opened Oct 13, 2023

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+138

-33

Files changed (2) hide show

app.py +133 -31
requirements.txt +5 -2

app.py CHANGED Viewed

@@ -11,8 +11,36 @@ import nltk  # we'll use this to split into sentences
 nltk.download('punkt')
 import uuid
 from TTS.api import TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1", gpu=True)
 title = "Voice chat with Mistral 7B Instruct"
@@ -44,11 +72,20 @@ from gradio_client import Client
 from huggingface_hub import InferenceClient
-whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1"
 )
 def format_prompt(message, history):
   prompt = "<s>"
@@ -77,22 +114,35 @@ def generate(
     formatted_prompt = format_prompt(prompt, history)
-    stream = text_client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        output += response.token.text
-        yield output
     return output
 def transcribe(wav_path):
     return whisper_client.predict(
 				wav_path,	# str (filepath or URL to file) in 'inputs' Audio component
 				"transcribe",	# str in 'Task' Radio component
 				api_name="/predict"
-)
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
@@ -106,9 +156,17 @@ def add_text(history, text):
 def add_file(history, file):
     history = [] if history is None else history
-    text = transcribe(
-        file
-    )
     history = history + [(text, None)]
     return history
@@ -126,29 +184,65 @@ def bot(history, system_prompt=""):
         history[-1][1] = character
         yield history
 def generate_speech(history):
     text_to_generate = history[-1][1]
     text_to_generate = text_to_generate.replace("\n", " ").strip()
     text_to_generate = nltk.sent_tokenize(text_to_generate)
-    filename = f"{uuid.uuid4()}.wav"
-    sampling_rate = tts.synthesizer.tts_config.audio["sample_rate"]
-    silence = [0] * int(0.25 * sampling_rate)
-    for sentence in text_to_generate:
-        try:
-            # generate speech by cloning a voice using default settings
-            wav = tts.tts(text=sentence,
-                        speaker_wav="examples/female.wav",
-                        decoder_iterations=25,
-                        decoder_sampler="dpm++2m",
-                        speed=1.2,
-                        language="en")
-            yield (sampling_rate, np.array(wav)) #np.array(wav + silence))
         except RuntimeError as e :
             if "device-side assert" in str(e):
@@ -163,6 +257,14 @@ def generate_speech(history):
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
                 raise e
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
@@ -186,7 +288,7 @@ with gr.Blocks(title=title) as demo:
         btn = gr.Audio(source="microphone", type="filepath", scale=4)
     with gr.Row():
-        audio = gr.Audio(type="numpy", streaming=True, autoplay=True, label="Generated audio response", show_label=True)
     clear_btn = gr.ClearButton([chatbot, audio])
@@ -210,11 +312,11 @@ with gr.Blocks(title=title) as demo:
     gr.Markdown("""
 This Space demonstrates how to speak to a chatbot, based solely on open-source models.
 It relies on 3 models:
-1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-large-v2) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model, the actual chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
 3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml""")
 demo.queue()
-demo.launch(debug=True)

 nltk.download('punkt')
 import uuid
+import ffmpeg
+import librosa
+import torchaudio
 from TTS.api import TTS
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+# This will trigger downloading the model
+print("Downloading if not downloaded Coqui XTTS V1")
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
+del tts
+print("XTTS downloaded")
+print("Loading XTTS")
+# Below, the model is used directly for inference
+model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+config = XttsConfig()
+config.load_json(os.path.join(model_path, "config.json"))
+model = Xtts.init_from_config(config)
+model.load_checkpoint(
+    config,
+    checkpoint_path=os.path.join(model_path, "model.pth"),
+    vocab_path=os.path.join(model_path, "vocab.json"),
+    eval=True,
+    use_deepspeed=True
+)
+model.cuda()
+print("Done loading TTS")
 title = "Voice chat with Mistral 7B Instruct"
 from huggingface_hub import InferenceClient
+# This client is down
+#whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+# Replacement whisper client, it may be time limited
+whisper_client = Client("https://sanchit-gandhi-whisper-jax.hf.space")
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1"
 )
+###### COQUI TTS FUNCTIONS ######
+def get_latents(speaker_wav):
+    # kept as a separate function so voice cleanup/filtering can be added here later
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
 def format_prompt(message, history):
   prompt = "<s>"
     formatted_prompt = format_prompt(prompt, history)
+    try:
+        stream = text_client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+        output = ""
+        for response in stream:
+            output += response.token.text
+            yield output
+    except Exception as e:
+         if "Too Many Requests" in str(e):
+             print("ERROR: Too many requests on mistral client")
+             gr.Warning("Unfortunately Mistral is unable to process")
+             output = "Unfortuanately I am not able to process your request now !"
+         else:
+             print("Unhandled Exception: ", str(e))
+             gr.Warning("Unfortunately Mistral is unable to process")
+             output = "I do not know what happened but I could not understand you ."
     return output
 def transcribe(wav_path):
+    # take the first element of the whisper_jax result and strip leading/trailing whitespace
     return whisper_client.predict(
 				wav_path,	# str (filepath or URL to file) in 'inputs' Audio component
 				"transcribe",	# str in 'Task' Radio component
+                False, # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
 				api_name="/predict"
+    )[0].strip()
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
 def add_file(history, file):
     history = [] if history is None else history
+    try:
+        text = transcribe(
+            file
+        )
+        print("Transcribed text:",text)
+    except Exception as e:
+        print(str(e))
+        gr.Warning("There was an issue with transcription, please try writing for now")
+        # Fall back to a placeholder prompt on error
+        text = "Transcription seems failed, please tell me a joke about chickens"
     history = history + [(text, None)]
     return history
         history[-1][1] = character
         yield history
+def get_latents(speaker_wav):
+    # Generate speaker embedding and latents for TTS
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+latent_map={}
+latent_map["Female_Voice"] = get_latents("examples/female.wav")
+def get_voice(prompt,language, latent_tuple,suffix="0"):
+    gpt_cond_latent,diffusion_conditioning, speaker_embedding = latent_tuple
+    # Direct version
+    t0 = time.time()
+    out = model.inference(
+        prompt,
+        language,
+        gpt_cond_latent,
+        speaker_embedding,
+        diffusion_conditioning
+    )
+    inference_time = time.time() - t0
+    print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+    real_time_factor= (time.time() - t0) / out['wav'].shape[-1] * 24000
+    print(f"Real-time factor (RTF): {real_time_factor}")
+    wav_filename=f"output_{suffix}.wav"
+    torchaudio.save(wav_filename, torch.tensor(out["wav"]).unsqueeze(0), 24000)
+    return wav_filename
 def generate_speech(history):
     text_to_generate = history[-1][1]
     text_to_generate = text_to_generate.replace("\n", " ").strip()
     text_to_generate = nltk.sent_tokenize(text_to_generate)
+    language = "en"
+    wav_list = []
+    for i,sentence in enumerate(text_to_generate):
+        # Sometimes the prompt's </s> token shows up in the output; remove it
+        sentence= sentence.replace("</s>","")
+        # A quick fix for the last character: it may produce weird sounds if it is attached to the text
+        if sentence[-1] in ["!","?",".",","]:
+            #just add a space
+            sentence = sentence[:-1] + " " + sentence[-1]
+        print("Sentence:", sentence)
+        try:
+            # generate speech using precomputed latents
+            # This is not streaming but it will be fast
+            # giving sentence suffix so we can merge all to single audio at end
+            # On mobile there is no autoplay support due to mobile security!
+            wav = get_voice(sentence,language, latent_map["Female_Voice"], suffix=i)
+            wav_list.append(wav)
+            yield wav
+            wait_time= librosa.get_duration(path=wav)
+            print("Sleeping till audio end")
+            time.sleep(wait_time)
         except RuntimeError as e :
             if "device-side assert" in str(e):
             else:
                 print("RuntimeError: non device-side assert error:", str(e))
                 raise e
+    # Every sentence has already been spoken on autoplay; now produce a single concatenated file at the end
+    #requires pip install ffmpeg-python
+    files_to_concat= [ffmpeg.input(w) for w in wav_list]
+    combined_file_name="combined.wav"
+    ffmpeg.concat(*files_to_concat,v=0, a=1).output(combined_file_name).run(overwrite_output=True)
+    return gr.Audio.update(value=combined_file_name, autoplay=False)
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
         btn = gr.Audio(source="microphone", type="filepath", scale=4)
     with gr.Row():
+        audio = gr.Audio(type="numpy", streaming=False, autoplay=True, label="Generated audio response", show_label=True)
     clear_btn = gr.ClearButton([chatbot, audio])
     gr.Markdown("""
 This Space demonstrates how to speak to a chatbot, based solely on open-source models.
 It relies on 3 models:
+1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model, the actual chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
 3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml""")
 demo.queue()
+demo.launch(debug=True)

requirements.txt CHANGED Viewed

@@ -53,8 +53,11 @@ encodec==0.1.*
 # deps for XTTS
 unidecode==1.3.*
 langid
-# Install tts
-git+https://github.com/coqui-ai/tts.git@43a7ca800b6508d95e084728a948846556f71a40
 deepspeed==0.8.3
 pydub
 gradio_client

 # deps for XTTS
 unidecode==1.3.*
 langid
+# Install Coqui TTS
+TTS==0.17.8
+# Deepspeed for fast inference
 deepspeed==0.8.3
 pydub
+librosa
+ffmpeg-python
 gradio_client