dongyh20 committed
Commit 7f4e0db · 1 Parent(s): 8a47087

update space

Files changed (1)
  1. app.py +37 -9
app.py CHANGED
@@ -154,15 +154,15 @@ def extract_audio(videos_file_path):
 @spaces.GPU(duration=120)
 def ola_inference(multimodal, audio_path):
     visual, text = multimodal["files"][0], multimodal["text"]
-    if not visual:
-        return "ERROR: Image or Video is required.", None
     if visual.endswith("image2.png"):
         modality = "video"
         visual = f"{cur_dir}/case/case1.mp4"
     if visual.endswith(".mp4"):
         modality = "video"
-    else:
+    elif visual:
         modality = "image"
+    elif audio_path is not None:
+        modality = "text"
 
     # input audio and video, do not parse audio in the video, else parse audio in the video
     if audio_path:
@@ -184,9 +184,13 @@ def ola_inference(multimodal, audio_path):
         frame_idx = uniform_sampled_frames.tolist()
         spare_frames = vr.get_batch(frame_idx).asnumpy()
         video = [Image.fromarray(frame) for frame in spare_frames]
-    else:
+    elif modality == "image":
         image = [Image.open(visual)]
         image_sizes = [image[0].size]
+    else:
+        images = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        images_highres = [torch.zeros(1, 3, 224, 224).to(dtype=torch.bfloat16, device='cuda', non_blocking=True)]
+        image_sizes = [(224, 224)]
 
     if USE_SPEECH and audio_path:
         audio_path = audio_path
@@ -217,14 +221,18 @@ def ola_inference(multimodal, audio_path):
         qs = text
     else:
         qs = ''
-    if USE_SPEECH and audio_path:
+    if USE_SPEECH and audio_path and modality == "image":
         if text:
             return "ERROR: Please provide either text or audio question for image, not both.", None
         qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
-    elif USE_SPEECH:
+    elif USE_SPEECH and modality == "video":
         qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
-    else:
+    elif USE_SPEECH and audio_path: # audio + text
+        qs = DEFAULT_SPEECH_TOKEN + "\n" + qs
+    elif modality == "video" or modality == "image":
         qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+    elif text: # text
+        qs = qs
 
     conv = conv_templates[conv_mode].copy()
     conv.append_message(conv.roles[0], qs)
@@ -256,7 +264,7 @@ def ola_inference(multimodal, audio_path):
         video_processed = (video_processed, video_processed)
 
         video_data = (video_processed, (384, 384), "video")
-    else:
+    elif modality == "image":
         image_processor.do_resize = False
         image_processor.do_center_crop = False
         image_tensor, image_highres_tensor = [], []
@@ -315,7 +323,7 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
-    else:
+    elif modality == "image":
         output_ids = model.generate(
             inputs=input_ids,
             images=image_tensor,
@@ -335,6 +343,26 @@ def ola_inference(multimodal, audio_path):
             num_beams=gen_kwargs["num_beams"],
             max_new_tokens=gen_kwargs["max_new_tokens"],
         )
+    elif modality == "text":
+        output_ids = model.generate(
+            input_ids,
+            images=images,
+            images_highres=images_highres,
+            image_sizes=image_sizes,
+            modalities=['text'],
+            speech=speechs,
+            speech_lengths=speech_lengths,
+            speech_chunks=speech_chunks,
+            speech_wav=speech_wavs,
+            attention_mask=attention_masks,
+            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+            do_sample=True if gen_kwargs["temperature"] > 0 else False,
+            temperature=gen_kwargs["temperature"],
+            top_p=gen_kwargs["top_p"],
+            num_beams=gen_kwargs["num_beams"],
+            max_new_tokens=gen_kwargs["max_new_tokens"],
+        )
 
 
     outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
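
With this change, ola_inference no longer returns "ERROR: Image or Video is required." when no visual file is attached; an audio-only request falls through to a new "text" modality that passes dummy 224x224 image tensors to model.generate with modalities=['text'], and the prompt construction gains a speech-only branch (DEFAULT_SPEECH_TOKEN + "\n" + qs). Below is a minimal sketch of the new routing distilled from the diff; select_modality is an illustrative standalone name, not a function in app.py.

# Sketch only: mirrors the branching this commit adds to ola_inference.
# "visual" stands in for multimodal["files"][0]; "audio_path" is the optional audio input.
def select_modality(visual, audio_path):
    if visual and visual.endswith(".mp4"):
        return "video"      # .mp4 inputs keep the existing video path
    elif visual:
        return "image"      # any other visual file is treated as an image
    elif audio_path is not None:
        return "text"       # new: audio-only question, dummy image tensors downstream
    return None

# Audio-only requests now reach the "text" generate branch instead of erroring out.
assert select_modality(None, "question.wav") == "text"
assert select_modality("photo.png", None) == "image"
assert select_modality("clip.mp4", "question.wav") == "video"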