Spaces: Running on Zero
VanguardAI
committed on
Update app.py
app.py
CHANGED
@@ -1,82 +1,190 @@
-import sounddevice as sd
-import scipy.io.wavfile as wavfile
-import numpy as np
 import gradio as gr
-import tempfile
 import os

-        self.sample_rate = sample_rate
-        self.stream = None

-            self.frames = []
-            self.stream = sd.InputStream(callback=self.callback, channels=2, samplerate=self.sample_rate)
-            self.stream.start()
-            self.recording = True
-            return "Recording... Press to Stop"
-        else:
-            self.stream.stop()
-            self.stream.close()
-            self.recording = False
-            return "Recording stopped. Press to Record"

-            file=(audio_file, file.read()),
-            model="whisper-large-v3",
-            prompt="Specify context or spelling",  # Optional
-            response_format="json",  # Optional
-            language="en",  # Optional
-            temperature=0.0  # Optional
-        )
-        os.remove(audio_file)  # Clean up the temporary file

 else:
-    return

-gradio_interface.launch()
 import gradio as gr
+import torch
 import os
+import numpy as np
+from groq import Groq
+from transformers import AutoModel, AutoTokenizer
+from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
+from parler_tts import ParlerTTSForConditionalGeneration
+import soundfile as sf
+from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, ServiceContext
+from llama_index.langchain_helpers.text_splitter import RecursiveCharacterTextSplitter
+from langchain import OpenAI
+from PIL import Image
+from decord import VideoReader, cpu
+import requests
+
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+MODEL = 'llama3-groq-70b-8192-tool-use-preview'
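The Groq client above is built from the GROQ_API_KEY environment variable; web_search further down reads BING_API_KEY, and the LLMPredictor used for document indexing wraps langchain's OpenAI LLM, which normally picks up OPENAI_API_KEY. A small startup check along these lines (hypothetical, not part of the commit) would make missing keys fail fast:

    # Hypothetical startup check: fail early if a required key is absent.
    for var in ("GROQ_API_KEY", "BING_API_KEY", "OPENAI_API_KEY"):
        if not os.environ.get(var):
            raise RuntimeError(f"Missing required environment variable: {var}")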
+# Load models for text, speech, and image processing
+text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+                                       attn_implementation='sdpa', torch_dtype=torch.bfloat16).eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)

+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to('cuda')
+tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")

+image_model = UNet2DConditionModel.from_config("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet").to("cuda", torch.float16)
+image_pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=image_model, torch_dtype=torch.float16, variant="fp16").to("cuda")
+image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
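Note that UNet2DConditionModel.from_config only instantiates the UNet architecture, so the UNet handed to StableDiffusionXLPipeline carries freshly initialized rather than pretrained weights. If the intent is plain SDXL inference with a trailing-timestep Euler scheduler, a sketch that loads the pretrained UNet as part of the pipeline (an assumption about the intent, not the committed code) would be:

    # Sketch: load the full pretrained pipeline, then swap in the trailing-timestep scheduler.
    image_pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
    ).to("cuda")
    image_pipe.scheduler = EulerDiscreteScheduler.from_config(
        image_pipe.scheduler.config, timestep_spacing="trailing"
    )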
+# Initialize voice-only mode
+def play_voice_output(response):
+    description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
+    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
+    prompt_input_ids = tts_tokenizer(response, return_tensors="pt").input_ids.to('cuda')
+    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
+    sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
+    return "output.wav"
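play_voice_output conditions Parler-TTS on a fixed voice description, uses the model response as the prompt, and writes the result to output.wav. A minimal usage sketch:

    # Returns the path of the generated WAV file, sampled at tts_model.config.sampling_rate.
    wav_path = play_voice_output("The calculation finished without errors.")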
+# Web search function
+def web_search(query):
+    api_key = os.environ.get("BING_API_KEY")
+    search_url = "https://api.bing.microsoft.com/v7.0/search"
+    headers = {"Ocp-Apim-Subscription-Key": api_key}
+    params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
+    response = requests.get(search_url, headers=headers, params=params)
+    response.raise_for_status()
+    search_results = response.json()
+    snippets = [result['snippet'] for result in search_results.get('webPages', {}).get('value', [])]
+    return "\n".join(snippets)
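web_search queries the Bing Web Search v7 endpoint with the BING_API_KEY subscription key and returns the page snippets joined by newlines, for example:

    # Requires a valid BING_API_KEY in the environment.
    snippets = web_search("current Gradio release notes")
    print(snippets[:300])  # first few snippets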
+# NumPy Calculation function
+def numpy_calculate(code: str) -> str:
+    try:
+        local_dict = {}
+        exec(code, {"np": np}, local_dict)
+        result = local_dict.get("result", "No result found")
+        return str(result)
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
+
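numpy_calculate exec()s the supplied code with np in scope and returns whatever the snippet stored in a variable named result, so callers must assign to result; note that exec on model-generated code runs unsandboxed in the app process. Usage sketch:

    # The executed snippet must assign to a variable called "result".
    print(numpy_calculate("result = np.mean(np.arange(10))"))  # "4.5"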
+# Function to handle different input types
+def handle_input(user_prompt, image=None, video=None, audio=None, doc=None):
+    messages = [{"role": "user", "content": user_prompt}]
+
+    if audio:
+        transcription = client.audio.transcriptions.create(
+            file=(audio.name, audio.read()),
+            model="whisper-large-v3"
+        )
+        user_prompt = transcription.text
+
+    if doc:
+        index = create_rag_index(doc.name, doc.read())
+        response = index.query(user_prompt)
+    elif image and not video:
+        image = Image.open(image).convert('RGB')
+        messages[0]['content'] = [image, user_prompt]
+        response = text_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+    elif video:
+        frames = encode_video(video.name)
+        messages[0]['content'] = frames + [user_prompt]
+        response = text_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+    else:
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=messages,
+            tools=initialize_tools()
+        ).choices[0].message.content
+
+    return response
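In the text-only branch, handle_input passes the tool schemas from initialize_tools but reads message.content directly, so any tool_calls the model emits are never executed. A sketch of a dispatch step, assuming the non-standard "implementation" entries are meant as the lookup table (run_with_tools is a hypothetical helper, not in this commit):

    import json

    def run_with_tools(messages):
        tools = initialize_tools()
        impls = {t["function"]["name"]: t["function"].get("implementation") for t in tools}
        # Send only the JSON schema; the local-only "implementation" key may be rejected by the API.
        schemas = [
            {"type": t["type"],
             "function": {k: v for k, v in t["function"].items() if k != "implementation"}}
            for t in tools
        ]
        completion = client.chat.completions.create(model=MODEL, messages=messages, tools=schemas)
        msg = completion.choices[0].message
        if not msg.tool_calls:          # the model answered directly
            return msg.content
        call = msg.tool_calls[0]        # run the first requested tool and return its output
        fn = impls.get(call.function.name)
        args = json.loads(call.function.arguments)
        return fn(**args) if fn else f"No implementation registered for {call.function.name}"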
+# Function to create RAG index using LlamaIndex or Langchain
+def create_rag_index(file_name, file_content):
+    docs = SimpleDirectoryReader(file_name, file_content).load_data()
+    service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=OpenAI(temperature=0)))
+    index = GPTSimpleVectorIndex.from_documents(docs, service_context=service_context)
+    return index
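create_rag_index passes the uploaded file's name and raw bytes positionally, but SimpleDirectoryReader (in the legacy llama_index API imported above) expects a directory path as its first argument. A sketch that writes the upload to a temporary directory first, keeping the same legacy API:

    import tempfile

    def create_rag_index(file_name, file_content):
        # Write the uploaded bytes where SimpleDirectoryReader can pick them up.
        tmp_dir = tempfile.mkdtemp()
        with open(os.path.join(tmp_dir, os.path.basename(file_name)), "wb") as f:
            f.write(file_content)
        docs = SimpleDirectoryReader(tmp_dir).load_data()
        service_context = ServiceContext.from_defaults(
            llm_predictor=LLMPredictor(llm=OpenAI(temperature=0))
        )
        return GPTSimpleVectorIndex.from_documents(docs, service_context=service_context)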
+# Function to encode video
+def encode_video(video_path):
+    MAX_NUM_FRAMES = 64
+    vr = VideoReader(video_path, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)
+    frame_idx = [i for i in range(0, len(vr), sample_fps)]
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+    return frames
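encode_video samples roughly one frame per second and caps the total at 64, but uniform_sample is called without being defined anywhere in this file. A minimal version that keeps n evenly spaced entries, in the style of common MiniCPM-V sampling examples, would be:

    def uniform_sample(indices, n):
        # Pick n indices evenly spread across the candidate list.
        gap = len(indices) / n
        return [indices[int(i * gap + gap / 2)] for i in range(n)]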
+# Initialize tools with web search and NumPy calculation
+def initialize_tools():
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "calculate",
+                "description": "Evaluate a mathematical expression",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "expression": {"type": "string", "description": "The mathematical expression to evaluate"}
+                    },
+                    "required": ["expression"]
+                },
+            }
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "web_search",
+                "description": "Perform a web search",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string", "description": "The search query"}
+                    },
+                    "required": ["query"]
+                },
+                "implementation": web_search
+            }
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "numpy_calculate",
+                "description": "Execute NumPy-based Python code for calculations",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "code": {"type": "string", "description": "The Python code with NumPy operations"}
+                    },
+                    "required": ["code"]
+                },
+                "implementation": numpy_calculate
+            }
+        }
+    ]
+    return tools
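The "calculate" tool is declared without an "implementation" entry, and no calculate() function exists elsewhere in the file, so a matching tool call from the model would have nothing to run. A hypothetical implementation in the same error-reporting style as numpy_calculate:

    def calculate(expression: str) -> str:
        try:
            # Evaluate simple arithmetic with builtins disabled.
            return str(eval(expression, {"__builtins__": {}}, {}))
        except Exception as e:
            return f"An error occurred: {str(e)}"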
+@spaces.GPU()
+# Gradio Interface
+def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
+    response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
+    if voice_only:
+        audio_file = play_voice_output(response)
+        return gr.Audio.update(value=audio_file, visible=True)
     else:
+        return response
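main_interface is decorated with @spaces.GPU(), but the spaces package (available on ZeroGPU Spaces, which this Space targets) is never imported, and gr.Audio.update was removed in Gradio 4 in favor of returning a component. A sketch under those assumptions:

    import spaces  # provides the @spaces.GPU() decorator on ZeroGPU hardware

    @spaces.GPU()
    def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
        response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
        if voice_only:
            # Gradio 4.x: return an updated component instead of calling .update().
            return gr.Audio(value=play_voice_output(response), visible=True)
        return response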
+# Gradio App Setup
+with gr.Blocks() as demo:
+    user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
+    image_input = gr.Image(type="file", label="Upload an image")
+    video_input = gr.Video(type="file", label="Upload a video")
+    audio_input = gr.Audio(type="file", label="Upload audio")
+    doc_input = gr.File(type="file", label="Upload a document")
+    voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode")
+    output = gr.Output()
+
+    submit = gr.Button("Submit")
+    submit.click(
+        fn=main_interface,
+        inputs=[user_prompt, image_input, video_input, audio_input, doc_input, voice_only_mode],
+        outputs=output
+    )

+demo.launch(inline=False)
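gr.Output() does not appear to be a Gradio component, and type="file" is not an accepted value for Image, Video, or Audio inputs in current Gradio releases. A sketch of the same wiring with Gradio 4.x components (an assumption about the target version):

    with gr.Blocks() as demo:
        user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
        image_input = gr.Image(type="filepath", label="Upload an image")
        video_input = gr.Video(label="Upload a video")
        audio_input = gr.Audio(type="filepath", label="Upload audio")
        doc_input = gr.File(label="Upload a document")
        voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode")
        text_output = gr.Textbox(label="Response")  # replaces gr.Output()

        submit = gr.Button("Submit")
        submit.click(
            fn=main_interface,
            inputs=[user_prompt, image_input, video_input, audio_input, doc_input, voice_only_mode],
            outputs=text_output,
        )

    demo.launch()

With filepath-style inputs, handle_input would receive paths rather than file objects, so calls such as audio.read() and doc.read() would need the corresponding adjustment.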