Tonic committed on
Commit fa2eb8b · unverified · 1 Parent(s): 3bdebf9

add interface logic, debug prints, and text extraction logic

Files changed (2):
  1. app.py +113 -18
  2. config.json +237 -0
app.py CHANGED
@@ -8,6 +8,79 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import random
+ import json
+ # Load and parse the config.json file
+ with open("config.json", "r") as f:
+     config = json.load(f)
+
+ # Extract necessary variables from the config
+ d_model = config['text_config']['d_model']
+ num_layers = config['text_config']['encoder_layers']
+ attention_heads = config['text_config']['encoder_attention_heads']
+ vocab_size = config['text_config']['vocab_size']
+ max_length = config['text_config']['max_length']
+ beam_size = config['text_config']['num_beams']
+ dropout = config['text_config']['dropout']
+ activation_function = config['text_config']['activation_function']
+ no_repeat_ngram_size = config['text_config']['no_repeat_ngram_size']
+ patch_size = config['vision_config']['patch_size'][0]
+ temporal_embeddings = config['vision_config']['visual_temporal_embedding']['max_temporal_embeddings']
+
+
+ title = """# 🙋🏻‍♂️Welcome to Tonic's PleIAs/📸📈✍🏻Florence-PDF"""
+ description = """
+ ---
+
+ This application showcases the **PleIAs/📸📈✍🏻Florence-PDF** model, a powerful AI system designed for both **text and image generation tasks**. The model is capable of handling complex tasks such as object detection, image captioning, OCR (Optical Character Recognition), and detailed region-based image analysis.
+
+ ### **How to Use**:
+ 1. **Upload an Image**: Select an image for processing.
+ 2. **Choose a Task**: Pick a task from the dropdown menu, such as "Caption", "Object Detection", "OCR", etc.
+ 3. **Process**: Click the "Process" button to let PleIAs/📸📈✍🏻Florence-PDF analyze the image and generate the output.
+ 4. **View Results**: Depending on the task, you'll either see a processed image (e.g., with bounding boxes or labels) or a text-based result (e.g., a generated caption or extracted text).
+
+ You can reset the interface anytime by clicking the **Reset** button.
+
+ ### **Available Tasks**:
+ - **✍🏻Caption**: Generate a concise description of the image.
+ - **📸Object Detection**: Identify and label objects within the image.
+ - **📸✍🏻OCR**: Extract text from the image.
+ - **📸Region Proposal**: Detect key regions in the image for detailed captioning.
+
+ ---
+
+ ### Join us:
+ 🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface: [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟[Build Tonic](https://git.tonic-ai.com/contribute). 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
+ """
+ model_presentation = f"""
+ ## PleIAs/📸📈✍🏻Florence-PDF Model Overview
+
+ The **PleIAs/📸📈✍🏻Florence-PDF** model is a state-of-the-art model for conditional generation tasks, designed to be highly effective for both **text** and **vision** tasks. It is built as an **encoder-decoder** architecture, which allows for enhanced flexibility and performance in generating outputs based on diverse inputs.
+
+ ### Key Features
+
+ - **Model Architecture**: PleIAs/📸📈✍🏻Florence-PDF uses an encoder-decoder structure, which makes it effective in tasks like **text generation**, **summarization**, and **translation**. It has **{num_layers} layers** for both the encoder and decoder, with a model dimension (`d_model`) of **{d_model}**.
+ - **Conditional Generation**: The model can generate text conditionally, with a maximum length of **{max_length} tokens** for each generated sequence, making it ideal for tasks that require concise output.
+ - **Beam Search**: PleIAs/📸📈✍🏻Florence-PDF supports **beam search** with up to **{beam_size} beams**, enabling more diverse and accurate text generation by exploring multiple potential outputs before selecting the best one.
+ - **Tokenization**: It includes a tokenizer with a vocabulary size of **{vocab_size} tokens**. Special tokens such as the **bos_token_id (0)** and **eos_token_id (2)** help control the generation process by marking the beginning and end of a sequence.
+ - **Attention Mechanism**: Both the encoder and decoder utilize **{attention_heads} attention heads** per layer, ensuring that the model can focus on relevant parts of the input when generating text.
+ - **Dropout and Activation**: PleIAs/📸📈✍🏻Florence-PDF employs a **{activation_function} activation function** and a **dropout rate of {dropout}**, which enhances model performance by preventing overfitting and improving generalization.
+ - **Training Configuration**: The model uses **float32** precision for training, and it supports fine-tuning for specific tasks by setting `finetuning_task` appropriately.
+
+ ### Vision Integration
+
+ In addition to text tasks, PleIAs/📸📈✍🏻Florence-PDF also incorporates **vision capabilities**:
+ - **Patch-based Image Processing**: The vision component operates on image patches with a patch size of **{patch_size}x{patch_size}**.
+ - **Temporal Embedding**: Visual tasks benefit from temporal embeddings with up to **{temporal_embeddings} steps**, making Florence-2 well-suited for video analysis.
+
+ ### Model Usage and Flexibility
+
+ - **No Repeat N-Grams**: To reduce repetition in text generation, the model is configured with a **no_repeat_ngram_size** of **{no_repeat_ngram_size}**, ensuring more diverse and meaningful outputs.
+ - **Sampling Strategies**: PleIAs/📸📈✍🏻Florence-PDF offers flexible sampling strategies, including **top-k** and **top-p (nucleus) sampling**, allowing for both creative and constrained generation based on user needs.
+
+ 📸📈✍🏻Florence-PDF is a robust model capable of handling various **text and image** tasks with high precision and flexibility, making it a valuable tool for both academic research and practical applications.
+ """

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -16,18 +89,19 @@ model = Florence2ForConditionalGeneration.from_pretrained("PleIAs/Florence-PDF",
processor = AutoProcessor.from_pretrained("PleIAs/Florence-PDF", trust_remote_code=True)

TASK_PROMPTS = {
-     "Caption": "<CAPTION>",
-     "Detailed Caption": "<DETAILED_CAPTION>",
-     "More Detailed Caption": "<MORE_DETAILED_CAPTION>",
-     "Object Detection": "<OD>",
-     "Dense Region Caption": "<DENSE_REGION_CAPTION>",
-     "OCR": "<OCR>",
-     "OCR with Region": "<OCR_WITH_REGION>",
-     "Region Proposal": "<REGION_PROPOSAL>"
+     "✍🏻Caption": "<CAPTION>",
+     "✍🏻✍🏻Caption": "<DETAILED_CAPTION>",
+     "✍🏻✍🏻✍🏻Caption": "<MORE_DETAILED_CAPTION>",
+     "📸Object Detection": "<OD>",
+     "📸Dense Region Caption": "<DENSE_REGION_CAPTION>",
+     "📸✍🏻OCR": "<OCR>",
+     "📸✍🏻OCR with Region": "<OCR_WITH_REGION>",
+     "📸Region Proposal": "<REGION_PROPOSAL>"
}

- IMAGE_TASKS = ["Object Detection", "Dense Region Caption", "Region Proposal", "OCR with Region"]
- TEXT_TASKS = ["Caption", "Detailed Caption", "More Detailed Caption", "OCR"]
+
+ IMAGE_TASKS = ["📸Object Detection", "📸Dense Region Caption", "📸Region Proposal", "📸✍🏻OCR with Region"]
+ TEXT_TASKS = ["✍🏻Caption", "✍🏻✍🏻Caption", "✍🏻✍🏻✍🏻Caption", "📸✍🏻OCR", "📸✍🏻OCR with Region"]

colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
@@ -65,39 +139,60 @@ def draw_ocr_bboxes(image, prediction):

def process_image(image, task):
    prompt = TASK_PROMPTS[task]
+
+     # Print the inputs for debugging
+     print(f"\n--- Processing Task: {task} ---")
+     print(f"Prompt: {prompt}")
+
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

+     # Print the input tensors for debugging
+     print(f"Model Input: {inputs}")
+
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False
    )
+
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

+     # Print the raw generated output for debugging
+     print(f"Raw Model Output: {generated_text}")
+
    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))

+     # Print the parsed answer for debugging
+     print(f"Parsed Answer: {parsed_answer}")
+
    return parsed_answer

def main_process(image, task):
    result = process_image(image, task)
-
+
    if task in IMAGE_TASKS:
        if task == "OCR with Region":
-             output_image = draw_ocr_bboxes(image.copy(), result[TASK_PROMPTS[task]])
+             output_image = draw_ocr_bboxes(image.copy(), result['quad_boxes'])
+             text_output = result.get('recognized_text', 'No text found')  # Extract recognized text
+
+             # Debugging: Print the recognized text
+             print(f"Recognized Text: {text_output}")
+
+             return output_image, gr.update(visible=True), text_output, gr.update(visible=True)
        else:
            fig = plot_bbox(image, result[TASK_PROMPTS[task]])
            output_image = fig_to_pil(fig)
-         return output_image, gr.update(visible=True), None, gr.update(visible=False)
+             return output_image, gr.update(visible=True), None, gr.update(visible=False)
    else:
        return None, gr.update(visible=False), str(result), gr.update(visible=True)

def reset_outputs():
    return None, gr.update(visible=False), None, gr.update(visible=True)

- with gr.Blocks(title="Florence-2 Demo") as iface:
-     gr.Markdown("# Florence-2 Demo")
-     gr.Markdown("Upload an image and select a task to process with Florence-2.")
+ with gr.Blocks(title="PleIAs/📸📈✍🏻Florence-PDF") as iface:
+     gr.Markdown(title)
+     gr.Markdown(description)

    with gr.Column():
        image_input = gr.Image(type="pil", label="Input Image")
@@ -107,8 +202,8 @@ with gr.Blocks(title="Florence-2 Demo") as iface:
        submit_button = gr.Button("Process")
        reset_button = gr.Button("Reset")

-         output_image = gr.Image(label="Processed Image", visible=False)
-         output_text = gr.Textbox(label="Output", visible=True)
+         output_image = gr.Image(label="PleIAs/📸📈✍🏻Florence-PDF", visible=False)
+         output_text = gr.Textbox(label="PleIAs/📸📈✍🏻Florence-PDF", visible=True)

    def process_and_update(image, task):
        if image is None:
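
For reference, a minimal sketch of how the `<OCR_WITH_REGION>` answer could be unpacked for the textbox. It assumes `processor.post_process_generation` returns a dict keyed by the task token whose value holds `quad_boxes` and `labels` lists (the same per-task layout the existing `result[TASK_PROMPTS[task]]` lookups rely on); the helper name `extract_ocr_text` is hypothetical and not part of this commit.

```python
# Hypothetical helper: pull the recognized strings out of a parsed <OCR_WITH_REGION>
# answer, assuming the layout {"<OCR_WITH_REGION>": {"quad_boxes": [...], "labels": [...]}}.
def extract_ocr_text(parsed_answer: dict, task_token: str = "<OCR_WITH_REGION>") -> str:
    region = parsed_answer.get(task_token, {})
    labels = region.get("labels", [])  # one recognized string per quad box
    return "\n".join(labels) if labels else "No text found"
```

Under the same assumption, the drawing call would receive the per-task dict, e.g. `draw_ocr_bboxes(image.copy(), parsed_answer.get("<OCR_WITH_REGION>", {}))`.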
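
In the changed lines, the values read from config.json are only interpolated into the model card text; `process_image` still hard-codes its decoding settings. If you also wanted the config to drive decoding, a minimal sketch is below: `num_beams`, `no_repeat_ngram_size`, and `max_new_tokens` are standard `transformers` `generate()` arguments, and wiring the config values in is an assumption, not something this commit does.

```python
# Sketch: reuse the values read from config.json instead of hard-coding them
# (config.json sets num_beams=3 and no_repeat_ngram_size=3).
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
    num_beams=beam_size,                        # config['text_config']['num_beams']
    no_repeat_ngram_size=no_repeat_ngram_size,  # config['text_config']['no_repeat_ngram_size']
    do_sample=False,                            # deterministic beam search, as in process_image
)
```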
config.json ADDED
@@ -0,0 +1,237 @@
+ {
+   "_name_or_path": "florence-large-ft",
+   "architectures": [
+     "Florence2ForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_florence2.Florence2Config",
+     "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+   },
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "ignore_index": -100,
+   "is_encoder_decoder": true,
+   "model_type": "florence2",
+   "pad_token_id": 1,
+   "projection_dim": 1024,
+   "text_config": {
+     "_name_or_path": "",
+     "activation_dropout": 0.1,
+     "activation_function": "gelu",
+     "add_bias_logits": false,
+     "add_cross_attention": false,
+     "add_final_layer_norm": false,
+     "architectures": null,
+     "attention_dropout": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "classif_dropout": 0.1,
+     "classifier_dropout": 0.0,
+     "cross_attention_hidden_size": null,
+     "d_model": 1024,
+     "decoder_attention_heads": 16,
+     "decoder_ffn_dim": 4096,
+     "decoder_layerdrop": 0.0,
+     "decoder_layers": 12,
+     "decoder_start_token_id": 2,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.1,
+     "early_stopping": true,
+     "encoder_attention_heads": 16,
+     "encoder_ffn_dim": 4096,
+     "encoder_layerdrop": 0.0,
+     "encoder_layers": 12,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": 0,
+     "forced_eos_token_id": 2,
+     "gradient_checkpointing": false,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1",
+       "2": "LABEL_2"
+     },
+     "init_std": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": true,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1,
+       "LABEL_2": 2
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 1024,
+     "min_length": 0,
+     "model_type": "florence2_language",
+     "no_repeat_ngram_size": 3,
+     "normalize_before": false,
+     "num_beam_groups": 1,
+     "num_beams": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_embedding": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 51289
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.4",
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "depths": [
+       1,
+       1,
+       9,
+       1
+     ],
+     "dim_embed": [
+       256,
+       512,
+       1024,
+       2048
+     ],
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "drop_path_rate": 0.1,
+     "early_stopping": false,
+     "enable_checkpoint": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_feature_source": [
+       "spatial_avg_pool",
+       "temporal_avg_pool"
+     ],
+     "image_pos_embed": {
+       "max_pos_embeddings": 50,
+       "type": "learned_abs_2d"
+     },
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "davit",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_groups": [
+       8,
+       16,
+       32,
+       64
+     ],
+     "num_heads": [
+       8,
+       16,
+       32,
+       64
+     ],
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_padding": [
+       3,
+       1,
+       1,
+       1
+     ],
+     "patch_prenorm": [
+       false,
+       true,
+       true,
+       true
+     ],
+     "patch_size": [
+       7,
+       3,
+       3,
+       3
+     ],
+     "patch_stride": [
+       4,
+       2,
+       2,
+       2
+     ],
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 1024,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "visual_temporal_embedding": {
+       "max_temporal_embeddings": 100,
+       "type": "COSINE"
+     },
+     "window_size": 12
+   },
+   "vocab_size": 51289
+ }
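
As a quick cross-check of the fields app.py reads from this file, a small standalone sketch that loads the config with the standard json module and prints the same values; the expected numbers in the comments come directly from the config above.

```python
import json

# Load the config added in this commit and echo the values app.py extracts.
with open("config.json", "r") as f:
    config = json.load(f)

text_cfg = config["text_config"]
vision_cfg = config["vision_config"]

print("d_model:", text_cfg["d_model"])                                   # 1024
print("encoder_layers:", text_cfg["encoder_layers"])                     # 12
print("encoder_attention_heads:", text_cfg["encoder_attention_heads"])   # 16
print("vocab_size:", text_cfg["vocab_size"])                             # 51289
print("max_length:", text_cfg["max_length"])                             # 20
print("num_beams:", text_cfg["num_beams"])                               # 3
print("no_repeat_ngram_size:", text_cfg["no_repeat_ngram_size"])         # 3
print("first patch_size:", vision_cfg["patch_size"][0])                  # 7
print("max_temporal_embeddings:",
      vision_cfg["visual_temporal_embedding"]["max_temporal_embeddings"])  # 100
```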