wjm55 committed
Commit 529282d · Parent: 0388107
Files changed (2):
  1. app.py +55 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -8,7 +8,12 @@ import subprocess
 from datetime import datetime
 import numpy as np
 import os
+from gliner import GLiNER
 
+# Initialize GLiNER model
+gliner_model = GLiNER.from_pretrained("knowledgator/modern-gliner-bi-large-v1.0")
+
+DEFAULT_NER_LABELS = "person, organization, location, date, event"
 
 # subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
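The hunk above loads GLiNER once at import time, so every request reuses the same model instance. A minimal sketch of the predict_entities call pattern the rest of the commit builds on (the sample sentence and labels are illustrative, not from the app):

from gliner import GLiNER

model = GLiNER.from_pretrained("knowledgator/modern-gliner-bi-large-v1.0")

# predict_entities returns one dict per detected span, carrying the
# "start", "end", "text", "label", and "score" keys used later in app.py.
entities = model.predict_entities(
    "Ada Lovelace met Charles Babbage in London in 1833.",
    labels=["person", "location", "date"],
    threshold=0.3,
)
for e in entities:
    print(e["start"], e["end"], e["text"], "->", e["label"])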
 
@@ -52,14 +57,14 @@ assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 
 @spaces.GPU
-def run_example(image, model_id="Qwen/Qwen2-VL-7B-Instruct"):
+def run_example(image, model_id="Qwen/Qwen2-VL-7B-Instruct", run_ner=False, ner_labels=DEFAULT_NER_LABELS):
+    # First get the OCR text
     text_input = "Convert the image to text."
     image_path = array_to_image_path(image)
     
-    print(image_path)
     model = models[model_id]
     processor = processors[model_id]
-
+    
     prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
     image = Image.fromarray(image).convert("RGB")
     messages = [
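Both new parameters are keyword defaults that reproduce the old behaviour, so existing callers are unaffected. Hypothetical calls, where image_array stands in for the numpy array that gr.Image hands the function:

plain = run_example(image_array)  # OCR only, exactly as before
tagged = run_example(image_array, run_ner=True, ner_labels="person, location")  # OCR plus entity markup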
@@ -98,7 +103,30 @@ def run_example(image, model_id="Qwen/Qwen2-VL-7B-Instruct"):
         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
     
-    return output_text[0]
+    ocr_text = output_text[0]
+    
+    # If NER is enabled, process the OCR text
+    if run_ner:
+        ner_results = gliner_model.predict_entities(
+            ocr_text,
+            ner_labels.split(","),
+            threshold=0.3
+        )
+        
+        # Format the text with entity annotations
+        annotated_text = ocr_text
+        for entity in sorted(ner_results, key=lambda x: x["start"], reverse=True):
+            entity_text = entity["text"]
+            entity_label = entity["label"]
+            annotated_text = (
+                annotated_text[:entity["start"]] +
+                f"[{entity_text}]({entity_label})" +
+                annotated_text[entity["end"]:]
+            )
+        
+        return f"OCR Text:\n{ocr_text}\n\nAnnotated Entities:\n{annotated_text}"
+    
+    return ocr_text
 
 css = """
 /* Overall app styling */
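The annotation loop above splices [text](label) markers into the OCR string. Each splice lengthens the string, which is why entities are applied in reverse start order: edits near the end of the text leave all earlier offsets intact. A self-contained sketch of the technique with hard-coded entities:

# Splice from the back of the string toward the front so that the
# character offsets of the not-yet-processed entities stay valid.
text = "Ada met Babbage in London."
entities = [
    {"start": 0, "end": 3, "text": "Ada", "label": "person"},
    {"start": 19, "end": 25, "text": "London", "label": "location"},
]
for e in sorted(entities, key=lambda x: x["start"], reverse=True):
    text = text[:e["start"]] + f"[{e['text']}]({e['label']})" + text[e["end"]:]
print(text)  # [Ada](person) met Babbage in [London](location).

One caveat in the hunk itself: ner_labels.split(",") keeps leading spaces (e.g. " organization"), so [l.strip() for l in ner_labels.split(",")] would parse the label list more robustly.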
@@ -178,11 +206,33 @@ with gr.Blocks(css=css) as demo:
         with gr.Column(elem_classes="input-container"):
             input_img = gr.Image(label="Input Picture", elem_classes="gr-image-input")
             model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct", elem_classes="gr-dropdown")
+            
+            # Add NER controls
+            with gr.Row():
+                ner_checkbox = gr.Checkbox(label="Run Named Entity Recognition", value=False)
+                ner_labels = gr.Textbox(
+                    label="NER Labels (comma-separated)",
+                    value=DEFAULT_NER_LABELS,
+                    visible=False
+                )
+            
             submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
         with gr.Column(elem_classes="output-container"):
             output_text = gr.Textbox(label="Output Text", elem_id="output")
 
-        submit_btn.click(run_example, [input_img, model_selector], [output_text])
+        # Show/hide NER labels based on checkbox
+        ner_checkbox.change(
+            lambda x: gr.update(visible=x),
+            inputs=[ner_checkbox],
+            outputs=[ner_labels]
+        )
+        
+        # Update submit button click handler
+        submit_btn.click(
+            run_example,
+            inputs=[input_img, model_selector, ner_checkbox, ner_labels],
+            outputs=[output_text]
+        )
         with gr.Row():
             filename = gr.Textbox(label="Save filename (without extension)", placeholder="Enter filename to save")
             download_btn = gr.Button("Download Image & Text", elem_classes="submit-btn")
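Here the checkbox drives the textbox's visibility through gr.update, and the click handler forwards both NER inputs positionally into run_example's run_ner and ner_labels parameters. A standalone sketch of the show/hide pattern, assuming a Gradio version where gr.update(visible=...) is a valid event return value:

import gradio as gr

with gr.Blocks() as demo:
    use_ner = gr.Checkbox(label="Run Named Entity Recognition", value=False)
    labels = gr.Textbox(label="NER Labels (comma-separated)", visible=False)
    # The checkbox value arrives as the lambda's argument; returning
    # gr.update(visible=x) shows or hides the textbox to match it.
    use_ner.change(lambda x: gr.update(visible=x), inputs=[use_ner], outputs=[labels])

demo.launch()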
 
requirements.txt CHANGED
@@ -5,4 +5,6 @@ torch
 torchvision
 git+https://github.com/huggingface/transformers.git
 accelerate
-qwen-vl-utils
+qwen-vl-utils
+git+https://github.com/huggingface/transformers.git
+gliner