dongyh20 committed
Commit 7f4e0db · 1 Parent(s): 8a47087

update space

Files changed (1)
  1. app.py +37 -9
app.py CHANGED
@@ -154,15 +154,15 @@ def extract_audio(videos_file_path):
 @spaces.GPU(duration=120)
 def ola_inference(multimodal, audio_path):
     visual, text = multimodal["files"][0], multimodal["text"]
-    if not visual:
-        return "ERROR: Image or Video is required.", None
     if visual.endswith("image2.png"):
         modality = "video"
         visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
-    else:
+    elif visual:
         modality = "image"
+    elif audio_path is not None:
+        modality = "text"
 
     # input audio and video, do not parse audio in the video, else parse audio in the video
     if audio_path:
@@ -184,9 +184,13 @@ def ola_inference(multimodal, audio_path):
         frame_idx = uniform_sampled_frames.tolist()
         spare_frames = vr.get_batch(frame_idx).asnumpy()
         video = [Image.fromarray(frame) for frame in spare_frames]
-    else:
+    elif modality == "image":
         image = [Image.open(visual)]
         image_sizes = [image[0].size]
+    else:
+        images = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        images_highres = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        image_sizes = [(224, 224)]
 
     if USE_SPEECH and audio_path:
         audio_path = audio_path
@@ -217,14 +221,18 @@ def ola_inference(multimodal, audio_path):
         qs = text
     else:
         qs = ''
-    if USE_SPEECH and audio_path:
+    if USE_SPEECH and audio_path and modality == "image":
         if text:
             return "ERROR: Please provide either text or audio question for image, not both.", None
         qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
-    elif USE_SPEECH:
+    elif USE_SPEECH and modality == "video":
         qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
-    else:
+    elif USE_SPEECH and audio_path: # audio + text
+        qs = DEFAULT_SPEECH_TOKEN + "\n" + qs
+    elif modality == "video" or modality == "image":
         qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+    elif text: # text
+        qs = qs
 
     conv = conv_templates[conv_mode].copy()
     conv.append_message(conv.roles[0], qs)
@@ -256,7 +264,7 @@ def ola_inference(multimodal, audio_path):
         video_processed = (video_processed, video_processed)
 
         video_data = (video_processed, (384, 384), "video")
-    else:
+    elif modality == "image":
         image_processor.do_resize = False
         image_processor.do_center_crop = False
         image_tensor, image_highres_tensor = [], []
@@ -315,7 +323,7 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
-    else:
+    elif modality == "image":
         output_ids = model.generate(
             inputs=input_ids,
             images=image_tensor,
@@ -335,6 +343,26 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
+    elif modality == "text":
+        output_ids = model.generate(
+            input_ids,
+            images=images,
+            images_highres=images_highres,
+            image_sizes=image_sizes,
+            modalities=['text'],
+            speech=speechs,
+            speech_lengths=speech_lengths,
+            speech_chunks=speech_chunks,
+            speech_wav=speech_wavs,
+            attention_mask=attention_masks,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+            do_sample=True if gen_kwargs["temperature"] > 0 else False,
+            temperature=gen_kwargs["temperature"],
+            top_p=gen_kwargs["top_p"],
+            num_beams=gen_kwargs["num_beams"],
+            max_new_tokens=gen_kwargs["max_new_tokens"],
+        )
 
 
     outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
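
With this change, ola_inference no longer returns "ERROR: Image or Video is required." when no visual file is attached; an audio-only request falls through to a new "text" modality that passes dummy 224x224 image tensors to model.generate with modalities=['text'], and the prompt construction gains a speech-only branch (DEFAULT_SPEECH_TOKEN + "\n" + qs). Below is a minimal sketch of the new routing distilled from the diff; select_modality is an illustrative standalone name, not a function in app.py.

# Sketch only: mirrors the branching this commit adds to ola_inference.
# "visual" stands in for multimodal["files"][0]; "audio_path" is the optional audio input.
def select_modality(visual, audio_path):
    if visual and visual.endswith(".mp4"):
        return "video"      # .mp4 inputs keep the existing video path
    elif visual:
        return "image"      # any other visual file is treated as an image
    elif audio_path is not None:
        return "text"       # new: audio-only question, dummy image tensors downstream
    return None

# Audio-only requests now reach the "text" generate branch instead of erroring out.
assert select_modality(None, "question.wav") == "text"
assert select_modality("photo.png", None) == "image"
assert select_modality("clip.mp4", "question.wav") == "video"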