Spaces:

adept
/

fuyu-8b-demo

Runtime error

App Files Files Community

pcuenq HF staff committed on Oct 22, 2023

Commit

35feaa0

1 Parent(s): 0909501

Screenshot text location: pad images

Browse files

Files changed (1) hide show

app.py +30 -4

app.py CHANGED Viewed

@@ -15,6 +15,27 @@ processor = FuyuProcessor(image_processor=FuyuImageProcessor(), tokenizer=tokeni
 CAPTION_PROMPT = "Generate a coco-style caption.\n"
 DETAILED_CAPTION_PROMPT = "What is happening in this image?"
 def predict(image, prompt):
     # image = image.convert('RGB')
     model_inputs = processor(text=prompt, images=[image])
@@ -84,8 +105,13 @@ def coords_from_response(response):
         gr.Error("The string is malformed or does not match the expected pattern.")
 def localize(image, query):
-    prompt= f"When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\n{query}"
-    model_inputs = processor(text=prompt, images=[image])
     model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
     generation_output = model.generate(**model_inputs, max_new_tokens=40)
@@ -159,7 +185,6 @@ with gr.Blocks(css=css) as demo:
     vqa_btn.click(fn=predict, inputs=[image_input, text_input], outputs=vqa_output)
     with gr.Tab("Find Text in Screenshots"):
-        gr.Markdown("This demo is designed to locate text in desktop screenshots. Please, ensure to upload images of 1920x1080 for best results!")
         with gr.Row():
             with gr.Column():
                 localization_input = gr.Image(label="Upload your Image", type="pil")
@@ -170,7 +195,8 @@ with gr.Blocks(css=css) as demo:
                     localization_output = gr.AnnotatedImage(label="Text Position")
         gr.Examples(
-            [["assets/localization_example_1.jpeg", "Share your repair"]],
             inputs = [localization_input, query_input],
             outputs = [localization_output],
             fn=localize,

 CAPTION_PROMPT = "Generate a coco-style caption.\n"
 DETAILED_CAPTION_PROMPT = "What is happening in this image?"
+def resize_to_max(image, max_width=1920, max_height=1080):
+    width, height = image.size
+    if width <= max_width and height <= max_height:
+        return image
+    scale = min(max_width/width, max_height/height)
+    width = int(width*scale)
+    height = int(height*scale)
+    return image.resize((width, height), Image.LANCZOS)
+def pad_to_size(image, canvas_width=1920, canvas_height=1080):
+    width, height = image.size
+    if width >= canvas_width and height >= canvas_height:
+        return image
+    # Paste at the top-left corner (0, 0); the uncovered canvas area stays black
+    canvas = Image.new("RGB", (canvas_width, canvas_height))
+    canvas.paste(image)
+    return canvas
 def predict(image, prompt):
     # image = image.convert('RGB')
     model_inputs = processor(text=prompt, images=[image])
         gr.Error("The string is malformed or does not match the expected pattern.")
 def localize(image, query):
+    prompt = f"When presented with a box, perform OCR to extract text contained within it. If provided with text, generate the corresponding bounding box.\n{query}"
+    # Downscale and/or pad to 1920x1080
+    padded = resize_to_max(image)
+    padded = pad_to_size(padded)
+    model_inputs = processor(text=prompt, images=[padded])
     model_inputs = {k: v.to(dtype=dtype if torch.is_floating_point(v) else v.dtype, device=device) for k,v in model_inputs.items()}
     generation_output = model.generate(**model_inputs, max_new_tokens=40)
     vqa_btn.click(fn=predict, inputs=[image_input, text_input], outputs=vqa_output)
     with gr.Tab("Find Text in Screenshots"):
         with gr.Row():
             with gr.Column():
                 localization_input = gr.Image(label="Upload your Image", type="pil")
                     localization_output = gr.AnnotatedImage(label="Text Position")
         gr.Examples(
+            [["assets/localization_example_1.jpeg", "Share your repair"],
+             ["assets/screen2words_ui_example.png", "statistics"]],
             inputs = [localization_input, query_input],
             outputs = [localization_output],
             fn=localize,