VanguardAI committed
Commit ed2f5ce · verified · 1 Parent(s): 1c761d1

Update app.py

Files changed (1)
  1. app.py +176 -68
app.py CHANGED
@@ -1,82 +1,190 @@
- import sounddevice as sd
- import scipy.io.wavfile as wavfile
- import numpy as np
  import gradio as gr
- from groq import Groq
- import tempfile
  import os

- class Recorder:
-     def __init__(self, sample_rate=44100):
-         self.recording = False
-         self.frames = []
-         self.sample_rate = sample_rate
-         self.stream = None

-     def toggle_recording(self):
-         if not self.recording:
-             self.frames = []
-             self.stream = sd.InputStream(callback=self.callback, channels=2, samplerate=self.sample_rate)
-             self.stream.start()
-             self.recording = True
-             return "Recording... Press to Stop"
-         else:
-             self.stream.stop()
-             self.stream.close()
-             self.recording = False
-             return "Recording stopped. Press to Record"

-     def callback(self, indata, frames, time, status):
-         if self.recording:
-             self.frames.append(indata.copy())

-     def save_audio(self):
-         if self.frames:
-             audio_data = np.concatenate(self.frames, axis=0)
-             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
-                 wavfile.write(temp_wav_file.name, self.sample_rate, audio_data)
-                 return temp_wav_file.name
-         else:
-             return None

- recorder = Recorder()

- def record():
-     return recorder.toggle_recording()

- def transcribe():
-     audio_file = recorder.save_audio()
-     if audio_file:
-         client = Groq(api_key="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")
-         with open(audio_file, "rb") as file:
-             transcription = client.audio.transcriptions.create(
-                 file=(audio_file, file.read()),
-                 model="whisper-large-v3",
-                 prompt="Specify context or spelling",  # Optional
-                 response_format="json",  # Optional
-                 language="en",  # Optional
-                 temperature=0.0  # Optional
-             )
-         os.remove(audio_file)  # Clean up the temporary file

-         # Inspect the transcription object to find the text
-         print(transcription)

-         # Access the text attribute directly if available
-         if hasattr(transcription, 'text'):
-             return transcription.text
-         else:
-             return "Transcription text not found."
      else:
-         return "No audio recorded."

- with gr.Blocks() as gradio_interface:
-     with gr.Column():
-         record_button = gr.Button("Press to Record")
-         record_button.click(fn=record, outputs=record_button)
-         transcription_output = gr.Textbox(label="Transcription")
-         record_button.click(fn=transcribe, outputs=transcription_output)

- if __name__ == "__main__":
-     gradio_interface.launch()
  import gradio as gr
+ import torch
  import os
+ import numpy as np
+ import spaces  # required by the @spaces.GPU() decorator used below
+ from groq import Groq
+ from transformers import AutoModel, AutoTokenizer
+ from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler
+ from parler_tts import ParlerTTSForConditionalGeneration
+ import soundfile as sf
+ from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, ServiceContext
+ from llama_index.langchain_helpers.text_splitter import RecursiveCharacterTextSplitter
+ from langchain import OpenAI
+ from PIL import Image
+ from decord import VideoReader, cpu
+ import requests
+
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+ MODEL = 'llama3-groq-70b-8192-tool-use-preview'
 
+ # Load models for text, speech, and image processing
+ text_model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True,
+                                        attn_implementation='sdpa', torch_dtype=torch.bfloat16).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
+
+ tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to('cuda')
+ tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+
+ image_model = UNet2DConditionModel.from_config("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet").to("cuda", torch.float16)
+ image_pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", unet=image_model, torch_dtype=torch.float16, variant="fp16").to("cuda")
+ image_pipe.scheduler = EulerDiscreteScheduler.from_config(image_pipe.scheduler.config, timestep_spacing="trailing")
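Note that `UNet2DConditionModel.from_config` only builds the UNet architecture; no pretrained weights are loaded into `image_model` before it is handed to the pipeline. The trailing-timestep Euler scheduler resembles the SDXL-Lightning recipe, so if a distilled UNet was intended, loading one might look like the sketch below (the repo and checkpoint name are assumptions, not taken from this commit):

    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file

    # Hypothetical checkpoint choice: the 4-step SDXL-Lightning UNet.
    ckpt_path = hf_hub_download("ByteDance/SDXL-Lightning", "sdxl_lightning_4step_unet.safetensors")
    # image_pipe holds a reference to the same module, so this updates the pipeline too.
    image_model.load_state_dict(load_file(ckpt_path, device="cuda"))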
 
+ # Initialize voice-only mode
+ def play_voice_output(response):
+     description = "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."
+     input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to('cuda')
+     prompt_input_ids = tts_tokenizer(response, return_tensors="pt").input_ids.to('cuda')
+     generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+     audio_arr = generation.cpu().numpy().squeeze()
+     sf.write("output.wav", audio_arr, tts_model.config.sampling_rate)
+     return "output.wav"
 
+ # Web search function
+ def web_search(query):
+     api_key = os.environ.get("BING_API_KEY")
+     search_url = "https://api.bing.microsoft.com/v7.0/search"
+     headers = {"Ocp-Apim-Subscription-Key": api_key}
+     params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
+     response = requests.get(search_url, headers=headers, params=params)
+     response.raise_for_status()
+     search_results = response.json()
+     snippets = [result['snippet'] for result in search_results.get('webPages', {}).get('value', [])]
+     return "\n".join(snippets)
 
+ # NumPy Calculation function
+ def numpy_calculate(code: str) -> str:
+     try:
+         local_dict = {}
+         exec(code, {"np": np}, local_dict)
+         result = local_dict.get("result", "No result found")
+         return str(result)
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
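The snippet passed to numpy_calculate is expected to assign its answer to a variable named result, since that is the key the function reads back out of local_dict. A quick illustration (the example expression is made up):

    print(numpy_calculate("result = np.mean(np.arange(10))"))  # prints "4.5"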
+ # Function to handle different input types
+ def handle_input(user_prompt, image=None, video=None, audio=None, doc=None):
+     messages = [{"role": "user", "content": user_prompt}]
+
+     if audio:
+         transcription = client.audio.transcriptions.create(
+             file=(audio.name, audio.read()),
+             model="whisper-large-v3"
+         )
+         user_prompt = transcription.text
+         messages[0]['content'] = user_prompt  # keep the chat message in sync with the transcription
+
+     if doc:
+         index = create_rag_index(doc.name, doc.read())
+         response = index.query(user_prompt)
+     elif image and not video:
+         image = Image.open(image).convert('RGB')
+         messages[0]['content'] = [image, user_prompt]
+         response = text_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+     elif video:
+         frames = encode_video(video.name)
+         messages[0]['content'] = frames + [user_prompt]
+         response = text_model.chat(image=None, msgs=messages, tokenizer=tokenizer)
+     else:
+         response = client.chat.completions.create(
+             model=MODEL,
+             messages=messages,
+             tools=initialize_tools()
+         ).choices[0].message.content
+
+     return response
 
+ # Function to create RAG index using LlamaIndex or Langchain
+ def create_rag_index(file_name, file_content):
+     docs = SimpleDirectoryReader(file_name, file_content).load_data()
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=OpenAI(temperature=0)))
+     index = GPTSimpleVectorIndex.from_documents(docs, service_context=service_context)
+     return index
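In the legacy llama_index API these imports come from, SimpleDirectoryReader loads documents from disk (a directory or an input_files list) rather than taking a filename and raw bytes as positional arguments. A minimal sketch of the same function under that assumption, writing the uploaded bytes to a temporary file first (the temp-file handling is an assumption, not part of this commit):

    import os
    import tempfile

    def create_rag_index(file_name, file_content):
        # Persist the uploaded bytes so SimpleDirectoryReader can load them from disk.
        tmp_dir = tempfile.mkdtemp()
        path = os.path.join(tmp_dir, os.path.basename(file_name))
        with open(path, "wb") as f:
            f.write(file_content)
        docs = SimpleDirectoryReader(input_files=[path]).load_data()
        service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=OpenAI(temperature=0)))
        return GPTSimpleVectorIndex.from_documents(docs, service_context=service_context)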
 
+ # Function to encode video
+ def encode_video(video_path):
+     MAX_NUM_FRAMES = 64
+     vr = VideoReader(video_path, ctx=cpu(0))
+     sample_fps = round(vr.get_avg_fps() / 1)
+     frame_idx = [i for i in range(0, len(vr), sample_fps)]
+     if len(frame_idx) > MAX_NUM_FRAMES:
+         frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+     frames = vr.get_batch(frame_idx).asnumpy()
+     frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+     return frames
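encode_video calls uniform_sample, but no such helper is defined anywhere in this file. A minimal sketch of what it presumably does (evenly subsample a list down to n items; this implementation is an assumption, not part of the commit):

    def uniform_sample(items, n):
        # Pick n indices spread evenly across the sequence.
        gap = len(items) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [items[i] for i in idxs]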
 
+ # Initialize tools with web search and NumPy calculation
+ def initialize_tools():
+     tools = [
+         {
+             "type": "function",
+             "function": {
+                 "name": "calculate",
+                 "description": "Evaluate a mathematical expression",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "expression": {"type": "string", "description": "The mathematical expression to evaluate"}
+                     },
+                     "required": ["expression"]
+                 },
+             }
+         },
+         {
+             "type": "function",
+             "function": {
+                 "name": "web_search",
+                 "description": "Perform a web search",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "query": {"type": "string", "description": "The search query"}
+                     },
+                     "required": ["query"]
+                 },
+                 "implementation": web_search
+             }
+         },
+         {
+             "type": "function",
+             "function": {
+                 "name": "numpy_calculate",
+                 "description": "Execute NumPy-based Python code for calculations",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "code": {"type": "string", "description": "The Python code with NumPy operations"}
+                     },
+                     "required": ["code"]
+                 },
+                 "implementation": numpy_calculate
+             }
+         }
+     ]
+     return tools
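The "implementation" entries attach the local callables to the schema, but nothing in handle_input ever executes a tool call; the first response's message.content is returned as-is. Assuming Groq's OpenAI-style tool-calling interface, a dispatch loop might look like the following sketch (run_with_tools and the available mapping are hypothetical names, not part of this commit):

    import json

    def run_with_tools(messages, tools):
        # The "implementation" callables are not JSON-serializable; strip them before sending.
        api_tools = [{"type": t["type"],
                      "function": {k: v for k, v in t["function"].items() if k != "implementation"}}
                     for t in tools]
        # First round: let the model decide whether it needs a tool.
        first = client.chat.completions.create(model=MODEL, messages=messages, tools=api_tools)
        message = first.choices[0].message
        if not message.tool_calls:
            return message.content
        # Map tool names to the local callables defined above.
        available = {"web_search": web_search, "numpy_calculate": numpy_calculate}
        messages.append(message)
        for tool_call in message.tool_calls:
            fn = available.get(tool_call.function.name)
            args = json.loads(tool_call.function.arguments)
            result = fn(**args) if fn else f"Unknown tool: {tool_call.function.name}"
            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "name": tool_call.function.name,
                "content": str(result),
            })
        # Second round: hand the tool output back for the final answer.
        return client.chat.completions.create(model=MODEL, messages=messages).choices[0].message.content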
+ # Gradio Interface
+ @spaces.GPU()
+ def main_interface(user_prompt, image=None, video=None, audio=None, doc=None, voice_only=False):
+     response = handle_input(user_prompt, image=image, video=video, audio=audio, doc=doc)
+     if voice_only:
+         audio_file = play_voice_output(response)
+         return gr.Audio.update(value=audio_file, visible=True)
      else:
+         return response
 
+ # Gradio App Setup
+ with gr.Blocks() as demo:
+     user_prompt = gr.Textbox(placeholder="Type your message here...", lines=1)
+     image_input = gr.Image(type="file", label="Upload an image")
+     video_input = gr.Video(type="file", label="Upload a video")
+     audio_input = gr.Audio(type="file", label="Upload audio")
+     doc_input = gr.File(type="file", label="Upload a document")
+     voice_only_mode = gr.Checkbox(label="Enable Voice Only Mode")
+     output = gr.Textbox(label="Output")
+
+     submit = gr.Button("Submit")
+     submit.click(
+         fn=main_interface,
+         inputs=[user_prompt, image_input, video_input, audio_input, doc_input, voice_only_mode],
+         outputs=output
+     )

+ demo.launch(inline=False)