Spaces: Running on Zero
dongyh20 committed · Commit 7f4e0db · 1 Parent(s): 8a47087
update space

app.py CHANGED
@@ -154,15 +154,15 @@ def extract_audio(videos_file_path):
 @spaces.GPU(duration=120)
 def ola_inference(multimodal, audio_path):
     visual, text = multimodal["files"][0], multimodal["text"]
-    if not visual:
-        return "ERROR: Image or Video is required.", None
     if visual.endswith("image2.png"):
         modality = "video"
         visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
-    else:
+    elif visual:
         modality = "image"
+    elif audio_path is not None:
+        modality = "text"
 
     # input audio and video, do not parse audio in the video, else parse audio in the video
     if audio_path:
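The hunk above replaces the hard requirement for a visual input with a three-way modality switch. As a reading aid, the routing can be sketched as a standalone helper; select_modality is a hypothetical name (not in app.py), and it assumes visual arrives as a file path string (possibly empty) and audio_path as a path string or None, mirroring how the Space reads multimodal["files"] and the audio widget.

# Hypothetical sketch of the new modality routing (not part of app.py).
def select_modality(visual, audio_path, cur_dir="."):
    modality = None
    if visual.endswith("image2.png"):          # bundled demo image stands in for a video case
        modality = "video"
        visual = f"{cur_dir}/case/case1.mp4"
    if visual.endswith(".mp4"):
        modality = "video"
    elif visual:                               # any other non-empty file is treated as an image
        modality = "image"
    elif audio_path is not None:               # no file at all: fall back to a text/audio-only turn
        modality = "text"
    return modality, visual

# Example: select_modality("", "question.wav") returns ("text", "").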
@@ -184,9 +184,13 @@ def ola_inference(multimodal, audio_path):
         frame_idx = uniform_sampled_frames.tolist()
         spare_frames = vr.get_batch(frame_idx).asnumpy()
         video = [Image.fromarray(frame) for frame in spare_frames]
-    else:
+    elif modality == "image":
         image = [Image.open(visual)]
         image_sizes = [image[0].size]
+    else:
+        images = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        images_highres = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        image_sizes = [(224, 224)]
 
     if USE_SPEECH and audio_path:
         audio_path = audio_path
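The new else branch handles requests with neither image nor video by feeding zero-filled 1x3x224x224 placeholders to the model. A minimal CPU-side sketch of the same idea (app.py additionally casts to bfloat16 and moves the tensors to CUDA):

import torch

# Placeholder visual inputs for the text-only path (sketch; dtype/device handling simplified).
images = [torch.zeros(1, 3, 224, 224)]          # low-res dummy image batch
images_highres = [torch.zeros(1, 3, 224, 224)]  # matching high-res dummy
image_sizes = [(224, 224)]                      # per-image size metadata the model expects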
@@ -217,14 +221,18 @@ def ola_inference(multimodal, audio_path):
         qs = text
     else:
         qs = ''
-    if USE_SPEECH and audio_path:
+    if USE_SPEECH and audio_path and modality == "image":
         if text:
             return "ERROR: Please provide either text or audio question for image, not both.", None
         qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
-    elif USE_SPEECH:
+    elif USE_SPEECH and modality == "video":
         qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
-    else:
+    elif USE_SPEECH and audio_path: # audio + text
+        qs = DEFAULT_SPEECH_TOKEN + "\n" + qs
+    elif modality == "video" or modality == "image":
         qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+    elif text: # text
+        qs = qs
 
     conv = conv_templates[conv_mode].copy()
     conv.append_message(conv.roles[0], qs)
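The reworked prompt construction now branches on both the speech flag and the detected modality. A condensed, hypothetical helper captures the branch order; build_prompt is not in app.py, the literal token strings stand in for DEFAULT_IMAGE_TOKEN and DEFAULT_SPEECH_TOKEN, and the image+audio error path is omitted.

# Hypothetical condensation of the prompt branches above (error handling omitted).
def build_prompt(qs, modality, use_speech, audio_path,
                 image_token="<image>", speech_token="<speech>"):
    if use_speech and audio_path and modality == "image":
        return image_token + "\n" + "User's question in speech: " + speech_token + "\n"
    elif use_speech and modality == "video":
        return speech_token + image_token + "\n" + qs
    elif use_speech and audio_path:                      # audio question plus plain text context
        return speech_token + "\n" + qs
    elif modality in ("video", "image"):
        return image_token + "\n" + qs
    return qs                                            # plain text question

# Example: build_prompt("What is shown?", "image", False, None) -> "<image>\nWhat is shown?"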
@@ -256,7 +264,7 @@ def ola_inference(multimodal, audio_path):
         video_processed = (video_processed, video_processed)
 
         video_data = (video_processed, (384, 384), "video")
-    else:
+    elif modality == "image":
         image_processor.do_resize = False
         image_processor.do_center_crop = False
         image_tensor, image_highres_tensor = [], []
@@ -315,7 +323,7 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
-    else:
+    elif modality == "image":
         output_ids = model.generate(
             inputs=input_ids,
             images=image_tensor,
@@ -335,6 +343,26 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
+    elif modality == "text":
+        output_ids = model.generate(
+            input_ids,
+            images=images,
+            images_highres=images_highres,
+            image_sizes=image_sizes,
+            modalities=['text'],
+            speech=speechs,
+            speech_lengths=speech_lengths,
+            speech_chunks=speech_chunks,
+            speech_wav=speech_wavs,
+            attention_mask=attention_masks,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+            do_sample=True if gen_kwargs["temperature"] > 0 else False,
+            temperature=gen_kwargs["temperature"],
+            top_p=gen_kwargs["top_p"],
+            num_beams=gen_kwargs["num_beams"],
+            max_new_tokens=gen_kwargs["max_new_tokens"],
+        )
 
 
     outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]