from llama_cpp import Llama
import whisper
from TTS.api import TTS
import numpy as np
import gradio as gr
from gradio_unifiedaudio import UnifiedAudio
from pathlib import Path
import torch
from scipy.io import wavfile
from collections import deque
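
# Load models once at startup: Whisper for speech-to-text, a small Qwen2
# instruct model (GGUF, via llama.cpp) for replies, and Coqui TTS for speech.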
whisper_model = whisper.load_model("base")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
dir_ = Path(__file__).parent
instream = None
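
# Scan the recorded signal and return True once a window of `pause_duration`
# seconds stays below `energy_threshold`, i.e. the speaker has gone quiet.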
def detect_pause(instream, energy_threshold=800, pause_duration=2.0, sample_rate=16000):
    pause_samples = int(pause_duration * sample_rate)
    energy = np.abs(instream[1])
    window = deque(maxlen=pause_samples)
    for e in energy:
        window.append(e < energy_threshold)
        if len(window) == pause_samples and all(window):
            return True
    return False
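
# Streaming callback: append each incoming chunk to the accumulated stream
# held in gr.State, and hand the recording off once a pause is detected.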
def add_to_stream(audio, instream, pause_detected):
    if instream is None:
        ret = audio
    else:
        ret = (audio[0], np.concatenate((instream[1], audio[1])))
    # `ret` always holds the full accumulated stream, so check it for a pause
    # (instream is still None when the very first chunk arrives).
    if detect_pause(ret):
        pause_detected = True
        stop_recording(ret)
    return audio, ret, pause_detected
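
# Full round trip for one utterance: write the recording to disk, transcribe
# it, generate a reply with the LLM, and return it as playable TTS audio.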
def stop_recording(audio):
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")
    # Whisper output typically carries leading whitespace and punctuation.
    if text.strip().lower().rstrip(".!?") in ("exit", "quit", "stop"):
        print("Voice Assistant is shutting down.")
        # Stop here without generating a reply; reset the mic for a new recording.
        return UnifiedAudio(value=None, streaming=True)
    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)
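
# Reset the component to streaming mode once playback of the reply ends.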
def stop_playing():
    pause_detected = False
    return UnifiedAudio(value=None, streaming=True), None, pause_detected
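
# Standalone transcription helper (not referenced by the event wiring below).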
def transcribe_audio(audio_data):
    return whisper_model.transcribe("user_output.wav", language='en')['text']
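
# Run the prompt through the local llama.cpp model and return the completion.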
def generate_response(prompt):
    response = llm(prompt=prompt)
    return response['choices'][0]['text'].strip()
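
# Synthesise the reply with Coqui TTS and return the path to the WAV file.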
def speak_text(text):
    tts.tts_to_file(text=text.strip(), file_path="bot_output.wav")
    return "bot_output.wav"
with gr.Blocks() as demo:
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()
    pause_detected = gr.State(False)

    mic.stop_recording(stop_recording, stream, mic)
    mic.end(stop_playing, None, [mic, stream, pause_detected])
    mic.stream(add_to_stream, [mic, stream, pause_detected], [mic, stream, pause_detected])

    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()