from llama_cpp import Llama
import whisper
from TTS.api import TTS
import numpy as np
import gradio as gr
from gradio_unifiedaudio import UnifiedAudio
from scipy.io import wavfile
from collections import deque

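# Load the models up front: Whisper "base" for speech-to-text, a small
# Qwen2 0.5B instruct GGUF served through llama.cpp for replies, and
# Coqui TTS (Tacotron2-DDC, LJSpeech) for speech synthesis.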
whisper_model = whisper.load_model("base")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)

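# Pause detection over the raw waveform: report a pause once every sample
# in a sliding `pause_duration`-second window falls below an absolute
# amplitude threshold (Gradio microphone streams are typically int16 PCM,
# so 800 is assumed to be near-silence).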
def detect_pause(instream, energy_threshold=800, pause_duration=2.0):
    # Use the stream's actual sample rate rather than a hard-coded 16 kHz;
    # the microphone records at the device rate.
    sample_rate = instream[0]
    pause_samples = int(pause_duration * sample_rate)
    energy = np.abs(instream[1])

    # The deque holds one boolean per sample; once it is full and all True,
    # the last `pause_duration` seconds were silent.
    window = deque(maxlen=pause_samples)
    for e in energy:
        window.append(e < energy_threshold)
        if len(window) == pause_samples and all(window):
            return True
    return False

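# Streaming callback: each chunk arrives as a (sample_rate, np.ndarray)
# tuple; append it to the running recording kept in the `stream` State.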
def add_to_stream(audio, instream, pause_detected):
    if instream is None:
        ret = audio
    else:
        ret = (audio[0], np.concatenate((instream[1], audio[1])))
        # Check the accumulated stream (not just the previous state) for a
        # pause, and surface the synthesized reply in the component instead
        # of discarding stop_recording's return value.
        if detect_pause(ret):
            pause_detected = True
            return stop_recording(ret), ret, pause_detected
    return audio, ret, pause_detected

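# Full round trip for one utterance: save the recording, transcribe it
# with Whisper, generate a reply with the LLM, and return the synthesized
# reply as a non-streaming UnifiedAudio so it plays back in the component.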
def stop_recording(audio):
    wavfile.write("user_output.wav", audio[0], audio[1])
    text = whisper_model.transcribe("user_output.wav")['text']
    print(f"You said: {text}")

    # Whisper output carries surrounding whitespace and punctuation, so
    # normalize it before matching an exit command, and return early so no
    # reply is generated for it.
    if text.strip().lower().rstrip('.!?') in ["exit", "quit", "stop"]:
        print("Voice Assistant is shutting down.")
        return UnifiedAudio(value=None, streaming=False)

    response = generate_response(text)
    print(f"Assistant: {response}")
    return UnifiedAudio(value=speak_text(response), streaming=False)

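# Once playback of the reply ends, clear the component and the state and
# re-arm the microphone for a new streaming recording.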
def stop_playing():
    pause_detected = False
    return UnifiedAudio(value=None, streaming=True), None, pause_detected

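# Not wired to any event below; re-transcribes the last saved recording
# and ignores its argument.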
def transcribe_audio(audio_data):
    return whisper_model.transcribe("user_output.wav", language='en')['text']

def generate_response(prompt):
    # llama-cpp-python defaults to max_tokens=16, which truncates replies;
    # 128 is an assumed cap that keeps responses short enough to speak.
    response = llm(prompt=prompt, max_tokens=128)
    return response['choices'][0]['text'].strip()

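# Synthesize the reply to a wav file and return its path, which
# UnifiedAudio accepts as a playable value.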
def speak_text(text):
    tts.tts_to_file(text=text.strip(), file_path="bot_output.wav")
    return "bot_output.wav"

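# Event wiring: `stream` accumulates the audio chunks and `pause_detected`
# flags a detected silence. Chunks flow through add_to_stream; a manual
# stop runs the pipeline via stop_recording; the end of playback resets
# everything.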
with gr.Blocks() as demo:
    mic = UnifiedAudio(sources=["microphone"], streaming=True)
    stream = gr.State()
    pause_detected = gr.State(False)
    mic.stop_recording(stop_recording, stream, mic)
    mic.end(stop_playing, None, [mic, stream, pause_detected])
    mic.stream(add_to_stream, [mic, stream, pause_detected], [mic, stream, pause_detected])

    # @gr.render(inputs=[mic, stream, pause_detected])
    # def recording_paused(microphone, stream, pause_detected):
    #     if pause_detected:
    #         stop_recording(stream)

if __name__ == '__main__':
    demo.launch()