Removed BROS model & added OmniParser
Main changes include:
OmniParser Integration:
- Added YOLO model loading for icon detection
- Added Florence-2 model for captioning
- Proper handling of both models in the pipeline

Analysis Pipeline (see the usage sketch after this list):
- Object detection with YOLO
- Caption generation for detected elements
- Structured output with bounding boxes and descriptions

User Interface:
- Updated model information
- Added UI-specific strengths and capabilities
- Proper debug information for UI parsing
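For context, here is a minimal sketch of how the Streamlit side is assumed to call the new code path. The widget layout (sidebar selectbox, file uploader, st.json output) is illustrative and not part of this commit; only load_model() and analyze_document() come from the diff below. Note that for OmniParser, load_model() returns a dict (detector plus captioner) rather than the usual (model, processor) tuple, so the caller has to unpack it accordingly.

import streamlit as st
from PIL import Image

# load_model() and analyze_document() are the functions changed in app.py (see diff below)

model_name = st.sidebar.selectbox(
    "Model",
    ["Donut", "LayoutLMv3", "OmniParser"],  # BROS removed in this commit
)

uploaded = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
if uploaded is not None:
    image = Image.open(uploaded).convert("RGB")
    st.image(image, use_column_width=True)

    loaded = load_model(model_name)
    if model_name == "OmniParser":
        # OmniParser path: load_model() returns {'yolo': ..., 'processor': ..., 'model': ...},
        # so pass the whole dict as `model` and the Florence-2 processor separately
        model, processor = loaded, loaded["processor"]
    else:
        model, processor = loaded

    result = analyze_document(image, model_name, model, processor)
    st.json(result)  # OmniParser returns detected_elements plus per-element bbox/confidence/caption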
app.py
CHANGED
@@ -1,20 +1,20 @@
 import streamlit as st
 from PIL import Image
 import torch
-import json
 from transformers import (
     DonutProcessor,
     VisionEncoderDecoderModel,
     LayoutLMv3Processor,
     LayoutLMv3ForSequenceClassification,
-
-
-    LlavaProcessor,
-    LlavaForConditionalGeneration
+    AutoProcessor,
+    AutoModelForCausalLM
 )
+from ultralytics import YOLO
+import io
+import base64
+import json
 from datetime import datetime

-# Cache the model loading to improve performance
 @st.cache_resource
 def load_model(model_name):
     """Load the selected model and processor"""
@@ -31,13 +31,21 @@ def load_model(model_name):
         processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
         model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

-        elif model_name == "
-
-
-
-
-
-
+        elif model_name == "OmniParser":
+            # Load YOLO model for icon detection
+            yolo_model = YOLO('microsoft/OmniParser', task='detect')
+            # Load Florence-2 model for captioning
+            processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(
+                "microsoft/OmniParser",
+                torch_dtype=torch.float16,
+                trust_remote_code=True
+            )
+            return {
+                'yolo': yolo_model,
+                'processor': processor,
+                'model': model
+            }

         return model, processor
     except Exception as e:
@@ -47,14 +55,54 @@ def load_model(model_name):
 def analyze_document(image, model_name, model, processor):
     """Analyze document using selected model"""
     try:
-
-
-
+        if model_name == "OmniParser":
+            # Save image temporarily
+            temp_path = "temp_image.png"
+            image.save(temp_path)
+
+            # Configure box detection parameters
+            box_threshold = 0.05  # Can be made configurable
+            iou_threshold = 0.1  # Can be made configurable
+
+            # Run YOLO detection
+            yolo_results = model['yolo'](
+                temp_path,
+                conf=box_threshold,
+                iou=iou_threshold,
+                device='cpu' if not torch.cuda.is_available() else 'cuda'
+            )
+
+            # Process detections
+            results = []
+            for det in yolo_results[0].boxes.data:
+                x1, y1, x2, y2, conf, cls = det
+
+                # Get region of interest
+                roi = image.crop((x1, y1, x2, y2))
+
+                # Generate caption using Florence-2
+                inputs = processor(images=roi, return_tensors="pt")
+                outputs = model['model'].generate(**inputs, max_length=50)
+                caption = processor.decode(outputs[0], skip_special_tokens=True)
+
+                results.append({
+                    "bbox": [float(x) for x in [x1, y1, x2, y2]],
+                    "confidence": float(conf),
+                    "class": int(cls),
+                    "caption": caption
+                })
+
+            return {
+                "detected_elements": len(results),
+                "elements": results
+            }
+
+        # [Previous model handling remains the same...]
+        elif model_name == "Donut":
             pixel_values = processor(image, return_tensors="pt").pixel_values
             task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
             decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

-            # Generate output with improved parameters
             outputs = model.generate(
                 pixel_values,
                 decoder_input_ids=decoder_input_ids,
@@ -68,31 +116,41 @@ def analyze_document(image, model_name, model, processor):
                 return_dict_in_generate=True
             )

-            # Process and clean the output
             sequence = processor.batch_decode(outputs.sequences)[0]
             sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()

-            # Try to parse as JSON, fallback to raw text
             try:
                 result = json.loads(sequence)
             except json.JSONDecodeError:
                 result = {"raw_text": sequence}

         elif model_name == "LayoutLMv3":
-
-
-
+            encoded_inputs = processor(
+                image,
+                return_tensors="pt",
+                add_special_tokens=True,
+                return_offsets_mapping=True
+            )

-
-
-
-
+            outputs = model(**encoded_inputs)
+            predictions = outputs.logits.argmax(-1).squeeze().tolist()
+
+            words = processor.tokenizer.convert_ids_to_tokens(
+                encoded_inputs.input_ids.squeeze().tolist()
+            )
+
+            result = {
+                "predictions": [
+                    {
+                        "text": word,
+                        "label": pred
+                    }
+                    for word, pred in zip(words, predictions)
+                    if word not in ["<s>", "</s>", "<pad>"]
+                ],
+                "confidence_scores": outputs.logits.softmax(-1).max(-1).values.squeeze().tolist()
+            }

-        elif model_name == "LLaVA-1.5":
-            inputs = processor(image, return_tensors="pt")
-            outputs = model.generate(**inputs, max_length=256)
-            result = {"generated_text": processor.decode(outputs[0], skip_special_tokens=True)}
-
         return result

     except Exception as e:
@@ -157,26 +215,18 @@ with col2:
     "Donut": {
         "description": "Best for structured OCR and document format understanding",
        "memory": "6-8GB",
-        "strengths": ["Structured OCR", "Memory efficient", "Good with fixed formats"]
-        "best_for": ["Invoices", "Forms", "Structured documents"]
+        "strengths": ["Structured OCR", "Memory efficient", "Good with fixed formats"]
     },
     "LayoutLMv3": {
         "description": "Strong layout understanding with reasoning capabilities",
         "memory": "12-15GB",
-        "strengths": ["Layout understanding", "Reasoning", "Pre-trained knowledge"]
-        "best_for": ["Complex layouts", "Mixed content", "Tables"]
-    },
-    "BROS": {
-        "description": "Memory efficient with fast inference",
-        "memory": "4-6GB",
-        "strengths": ["Fast inference", "Memory efficient", "Easy fine-tuning"],
-        "best_for": ["Simple documents", "Quick analysis", "Basic OCR"]
+        "strengths": ["Layout understanding", "Reasoning", "Pre-trained knowledge"]
     },
-    "
-        "description": "
-        "memory": "
-        "strengths": ["
-        "best_for": ["
+    "OmniParser": {
+        "description": "General screen parsing tool for UI understanding",
+        "memory": "8-10GB",
+        "strengths": ["UI element detection", "Interactive element recognition", "Function description"],
+        "best_for": ["Screenshots", "UI analysis", "Interactive elements"]
     }
 }
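A note on the detector loading in the new load_model() branch: ultralytics' YOLO() normally expects a local weights file (or one of its built-in model names), so passing the Hub repo id 'microsoft/OmniParser' directly may not resolve in every environment. A hedged alternative sketch, not part of this commit, is to download the icon-detection weights from the Hub first; the filename "icon_detect/best.pt" is an assumption and should be checked against the actual contents of the microsoft/OmniParser repo.

from huggingface_hub import hf_hub_download
from ultralytics import YOLO

# Assumed filename inside the microsoft/OmniParser repo; verify before relying on it
weights_path = hf_hub_download(
    repo_id="microsoft/OmniParser",
    filename="icon_detect/best.pt",
)
yolo_model = YOLO(weights_path, task="detect")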