"""Voice front-end for a "find me a place to work remotely" agent.

Audio captured by a fastrtc ``Stream`` is transcribed with the STT model,
sent (together with ``system_prompt``) to a smolagents ``CodeAgent`` that has
a DuckDuckGo search tool, and the agent's answer is spoken back through the
TTS model.
"""

from pathlib import Path
from typing import List, Dict

from dotenv import load_dotenv
from fastrtc import (
    get_stt_model,
    get_tts_model,
    Stream,
    ReplyOnPause,
    get_twilio_turn_credentials,
)
from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool

# Load environment variables from a .env file before any model/client setup.
load_dotenv()

# Directory containing this script.
# NOTE(review): not referenced anywhere in this file; kept for external users.
curr_dir = Path(__file__).parent

# Speech-to-text and text-to-speech models (fastrtc defaults).
stt_model = get_stt_model()
tts_model = get_tts_model()

# Conversation history container.
# NOTE(review): never read or written in this file, so each utterance is sent
# to the agent without earlier turns.
conversation_state: List[Dict[str, str]] = []

# System prompt prepended to every user request.
# NOTE(review): the literal contains typos ("can helps", "workremotely"); it is
# runtime text sent to the model, so it is left untouched here.
system_prompt = """You are a helpful assistant that can helps with finding places to workremotely from. You should specifically check against reviews and ratings of the place. You should use this criteria to find the best place to work from: - Price - Reviews - Ratings - Location - WIFI Only return the name, address of the place, and a short description of the place. Always search for real places. Only return real places, not fake ones. If you receive anything other than a location, you should ask for a location. User: I am in Paris, France. Can you find me a place to work from? Assistant: I found a place called "Le Café de la Paix" at 123 Rue de la Paix, Paris, France. It has good reviews and is in a great location. User: I am in London, UK. Can you find me a place to work from? Assistant: I found a place called "The London Coffee Company". User: How many people are in the room? Assistant: I only respond to requests about finding places to work from. 
"""

model = HfApiModel(provider="together", model="Qwen/Qwen2.5-Coder-32B-Instruct")

agent = CodeAgent(
    tools=[
        DuckDuckGoSearchTool(),
    ],
    model=model,
    max_steps=2,
    verbosity_level=2,
    description="Search the web for cafes to work from.",
)


def process_response(audio):
    """Transcribe one utterance, query the agent, and stream the spoken reply.

    Args:
        audio: The captured utterance, passed unchanged to ``stt_model.stt``.
            Presumably fastrtc's ``(sample_rate, samples)`` tuple -- confirm
            against the fastrtc version in use.

    Yields:
        Audio chunks produced by ``tts_model.stream_tts_sync`` for the agent's
        reply. Nothing is yielded when the transcript is empty or whitespace.
    """
    # Convert speech to text using the STT model.
    text = stt_model.stt(audio)
    if not text.strip():
        # Nothing intelligible was said; skip the agent round trip entirely.
        return

    # The system prompt is prepended on every call.
    input_text = f"{system_prompt}\n\n{text}"

    # Get the response from the agent.
    response_content = agent.run(input_text)

    # The agent's final answer is not guaranteed to be a string (it may be a
    # number, list, dict, ...), while the TTS model is given text. Only a
    # missing answer becomes the empty string; anything else is stringified
    # so it is spoken rather than dropped or passed through as a non-string.
    reply_text = "" if response_content is None else str(response_content)

    # Convert the reply to audio and forward each chunk as it is produced.
    yield from tts_model.stream_tts_sync(reply_text)


stream = Stream(
    handler=ReplyOnPause(process_response, input_sample_rate=16000),
    modality="audio",
    mode="send-receive",
    ui_args={
        "pulse_color": "rgb(255, 255, 255)",
        "icon_button_color": "rgb(255, 255, 255)",
        "title": "🧑‍💻The Coworking Agent",
    },
    rtc_configuration=get_twilio_turn_credentials(),
)

if __name__ == "__main__":
    stream.ui.launch(server_port=7860)