Tonic committed on
Commit fa2eb8b · unverified · 1 Parent(s): 3bdebf9

add interface logic, debug prints, and text extraction logic

Files changed (2):
  1. app.py +113 -18
  2. config.json +237 -0
app.py CHANGED
@@ -8,6 +8,79 @@ import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import random
+ import json
+ # Load and parse the config.json file
+ with open("config.json", "r") as f:
+     config = json.load(f)
+
+ # Extract necessary variables from the config
+ d_model = config['text_config']['d_model']
+ num_layers = config['text_config']['encoder_layers']
+ attention_heads = config['text_config']['encoder_attention_heads']
+ vocab_size = config['text_config']['vocab_size']
+ max_length = config['text_config']['max_length']
+ beam_size = config['text_config']['num_beams']
+ dropout = config['text_config']['dropout']
+ activation_function = config['text_config']['activation_function']
+ no_repeat_ngram_size = config['text_config']['no_repeat_ngram_size']
+ patch_size = config['vision_config']['patch_size'][0]
+ temporal_embeddings = config['vision_config']['visual_temporal_embedding']['max_temporal_embeddings']
+
+
+ title = """# 🙋🏻‍♂️Welcome to Tonic's PleIAs/📸📈✍🏻Florence-PDF"""
+ description = """
+ ---
+
+ This application showcases the **PleIAs/📸📈✍🏻Florence-PDF** model, a powerful AI system designed for both **text and image generation tasks**. The model is capable of handling complex tasks such as object detection, image captioning, OCR (Optical Character Recognition), and detailed region-based image analysis.
+
+ ### **How to Use**:
+ 1. **Upload an Image**: Select an image for processing.
+ 2. **Choose a Task**: Pick a task from the dropdown menu, such as "Caption", "Object Detection", "OCR", etc.
+ 3. **Process**: Click the "Process" button to let PleIAs/📸📈✍🏻Florence-PDF analyze the image and generate the output.
+ 4. **View Results**: Depending on the task, you'll either see a processed image (e.g., with bounding boxes or labels) or a text-based result (e.g., a generated caption or extracted text).
+
+ You can reset the interface anytime by clicking the **Reset** button.
+
+ ### **Available Tasks**:
+ - **✍🏻Caption**: Generate a concise description of the image.
+ - **📸Object Detection**: Identify and label objects within the image.
+ - **📸✍🏻OCR**: Extract text from the image.
+ - **📸Region Proposal**: Detect key regions in the image for detailed captioning.
+
+ ---
+
+ ### Join us:
+ 🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface: [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟[Build Tonic](https://git.tonic-ai.com/contribute). 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
+ """
+ model_presentation = f"""
+ ## PleIAs/📸📈✍🏻Florence-PDF Model Overview
+
+ The **PleIAs/📸📈✍🏻Florence-PDF** model is a state-of-the-art model for conditional generation tasks, designed to be highly effective for both **text** and **vision** tasks. It is built as an **encoder-decoder** architecture, which allows for enhanced flexibility and performance in generating outputs based on diverse inputs.
+
+ ### Key Features
+
+ - **Model Architecture**: PleIAs/📸📈✍🏻Florence-PDF uses an encoder-decoder structure, which makes it effective in tasks like **text generation**, **summarization**, and **translation**. It has **{num_layers} layers** for both the encoder and decoder, with a model dimension (`d_model`) of **{d_model}**.
+ - **Conditional Generation**: The model can generate text conditionally, with a maximum length of **{max_length} tokens** for each generated sequence, making it ideal for tasks that require concise output.
+ - **Beam Search**: PleIAs/📸📈✍🏻Florence-PDF supports **beam search** with up to **{beam_size} beams**, enabling more diverse and accurate text generation by exploring multiple potential outputs before selecting the best one.
+ - **Tokenization**: It includes a tokenizer with a vocabulary size of **{vocab_size} tokens**. Special tokens such as the **bos_token_id (0)** and **eos_token_id (2)** help control the generation process by marking the beginning and end of a sequence.
+ - **Attention Mechanism**: Both the encoder and decoder utilize **{attention_heads} attention heads** per layer, ensuring that the model can focus on relevant parts of the input when generating text.
+ - **Dropout and Activation**: PleIAs/📸📈✍🏻Florence-PDF employs a **{activation_function} activation function** and a **dropout rate of {dropout}**, which enhances model performance by preventing overfitting and improving generalization.
+ - **Training Configuration**: The model uses **float32** precision for training, and it supports fine-tuning for specific tasks by setting `finetuning_task` appropriately.
+
+ ### Vision Integration
+
+ In addition to text tasks, PleIAs/📸📈✍🏻Florence-PDF also incorporates **vision capabilities**:
+ - **Patch-based Image Processing**: The vision component operates on image patches with a patch size of **{patch_size}x{patch_size}**.
+ - **Temporal Embedding**: Visual tasks benefit from temporal embeddings with up to **{temporal_embeddings} steps**, making Florence-2 well-suited for video analysis.
+
+ ### Model Usage and Flexibility
+
+ - **No Repeat N-Grams**: To reduce repetition in text generation, the model is configured with a **no_repeat_ngram_size** of **{no_repeat_ngram_size}**, ensuring more diverse and meaningful outputs.
+ - **Sampling Strategies**: PleIAs/📸📈✍🏻Florence-PDF offers flexible sampling strategies, including **top-k** and **top-p (nucleus) sampling**, allowing for both creative and constrained generation based on user needs.
+
+ 📸📈✍🏻Florence-PDF is a robust model capable of handling various **text and image** tasks with high precision and flexibility, making it a valuable tool for both academic research and practical applications.
+ """

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
@@ -16,18 +89,19 @@ model = Florence2ForConditionalGeneration.from_pretrained("PleIAs/Florence-PDF",
processor = AutoProcessor.from_pretrained("PleIAs/Florence-PDF", trust_remote_code=True)

TASK_PROMPTS = {
-     "Caption": "<CAPTION>",
-     "Detailed Caption": "<DETAILED_CAPTION>",
-     "More Detailed Caption": "<MORE_DETAILED_CAPTION>",
-     "Object Detection": "<OD>",
-     "Dense Region Caption": "<DENSE_REGION_CAPTION>",
-     "OCR": "<OCR>",
-     "OCR with Region": "<OCR_WITH_REGION>",
-     "Region Proposal": "<REGION_PROPOSAL>"
+     "✍🏻Caption": "<CAPTION>",
+     "✍🏻✍🏻Caption": "<DETAILED_CAPTION>",
+     "✍🏻✍🏻✍🏻Caption": "<MORE_DETAILED_CAPTION>",
+     "📸Object Detection": "<OD>",
+     "📸Dense Region Caption": "<DENSE_REGION_CAPTION>",
+     "📸✍🏻OCR": "<OCR>",
+     "📸✍🏻OCR with Region": "<OCR_WITH_REGION>",
+     "📸Region Proposal": "<REGION_PROPOSAL>"
}

- IMAGE_TASKS = ["Object Detection", "Dense Region Caption", "Region Proposal", "OCR with Region"]
- TEXT_TASKS = ["Caption", "Detailed Caption", "More Detailed Caption", "OCR"]
+
+ IMAGE_TASKS = ["📸Object Detection", "📸Dense Region Caption", "📸Region Proposal", "📸✍🏻OCR with Region"]
+ TEXT_TASKS = ["✍🏻Caption", "✍🏻✍🏻Caption", "✍🏻✍🏻✍🏻Caption", "📸✍🏻OCR", "📸✍🏻OCR with Region"]

colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
@@ -65,39 +139,60 @@ def draw_ocr_bboxes(image, prediction):

def process_image(image, task):
    prompt = TASK_PROMPTS[task]
+
+     # Print the inputs for debugging
+     print(f"\n--- Processing Task: {task} ---")
+     print(f"Prompt: {prompt}")
+
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

+     # Print the input tensors for debugging
+     print(f"Model Input: {inputs}")
+
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False
    )
+
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

+     # Print the raw generated output for debugging
+     print(f"Raw Model Output: {generated_text}")
+
    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))

+     # Print the parsed answer for debugging
+     print(f"Parsed Answer: {parsed_answer}")
+
    return parsed_answer

def main_process(image, task):
    result = process_image(image, task)
-
+
    if task in IMAGE_TASKS:
        if task == "OCR with Region":
-             output_image = draw_ocr_bboxes(image.copy(), result[TASK_PROMPTS[task]])
+             output_image = draw_ocr_bboxes(image.copy(), result['quad_boxes'])
+             text_output = result.get('recognized_text', 'No text found')  # Extract recognized text
+
+             # Debugging: Print the recognized text
+             print(f"Recognized Text: {text_output}")
+
+             return output_image, gr.update(visible=True), text_output, gr.update(visible=True)
        else:
            fig = plot_bbox(image, result[TASK_PROMPTS[task]])
            output_image = fig_to_pil(fig)
-         return output_image, gr.update(visible=True), None, gr.update(visible=False)
+             return output_image, gr.update(visible=True), None, gr.update(visible=False)
    else:
        return None, gr.update(visible=False), str(result), gr.update(visible=True)

def reset_outputs():
    return None, gr.update(visible=False), None, gr.update(visible=True)

- with gr.Blocks(title="Florence-2 Demo") as iface:
-     gr.Markdown("# Florence-2 Demo")
-     gr.Markdown("Upload an image and select a task to process with Florence-2.")
+ with gr.Blocks(title="PleIAs/📸📈✍🏻Florence-PDF") as iface:
+     gr.Markdown(title)
+     gr.Markdown(description)

    with gr.Column():
        image_input = gr.Image(type="pil", label="Input Image")
@@ -107,8 +202,8 @@ with gr.Blocks(title="Florence-2 Demo") as iface:
        submit_button = gr.Button("Process")
        reset_button = gr.Button("Reset")

-         output_image = gr.Image(label="Processed Image", visible=False)
-         output_text = gr.Textbox(label="Output", visible=True)
+         output_image = gr.Image(label="PleIAs/📸📈✍🏻Florence-PDF", visible=False)
+         output_text = gr.Textbox(label="PleIAs/📸📈✍🏻Florence-PDF", visible=True)

    def process_and_update(image, task):
        if image is None:
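
For reference, a minimal sketch of how the `<OCR_WITH_REGION>` answer could be unpacked for the textbox. It assumes `processor.post_process_generation` returns a dict keyed by the task token whose value holds `quad_boxes` and `labels` lists (the same per-task layout the existing `result[TASK_PROMPTS[task]]` lookups rely on); the helper name `extract_ocr_text` is hypothetical and not part of this commit.

```python
# Hypothetical helper: pull the recognized strings out of a parsed <OCR_WITH_REGION>
# answer, assuming the layout {"<OCR_WITH_REGION>": {"quad_boxes": [...], "labels": [...]}}.
def extract_ocr_text(parsed_answer: dict, task_token: str = "<OCR_WITH_REGION>") -> str:
    region = parsed_answer.get(task_token, {})
    labels = region.get("labels", [])  # one recognized string per quad box
    return "\n".join(labels) if labels else "No text found"
```

Under the same assumption, the drawing call would receive the per-task dict, e.g. `draw_ocr_bboxes(image.copy(), parsed_answer.get("<OCR_WITH_REGION>", {}))`.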
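
In the changed lines, the values read from config.json are only interpolated into the model card text; `process_image` still hard-codes its decoding settings. If you also wanted the config to drive decoding, a minimal sketch is below: `num_beams`, `no_repeat_ngram_size`, and `max_new_tokens` are standard `transformers` `generate()` arguments, and wiring the config values in is an assumption, not something this commit does.

```python
# Sketch: reuse the values read from config.json instead of hard-coding them
# (config.json sets num_beams=3 and no_repeat_ngram_size=3).
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1024,
    num_beams=beam_size,                        # config['text_config']['num_beams']
    no_repeat_ngram_size=no_repeat_ngram_size,  # config['text_config']['no_repeat_ngram_size']
    do_sample=False,                            # deterministic beam search, as in process_image
)
```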
config.json ADDED
@@ -0,0 +1,237 @@
+ {
+   "_name_or_path": "florence-large-ft",
+   "architectures": [
+     "Florence2ForConditionalGeneration"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_florence2.Florence2Config",
+     "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+   },
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "ignore_index": -100,
+   "is_encoder_decoder": true,
+   "model_type": "florence2",
+   "pad_token_id": 1,
+   "projection_dim": 1024,
+   "text_config": {
+     "_name_or_path": "",
+     "activation_dropout": 0.1,
+     "activation_function": "gelu",
+     "add_bias_logits": false,
+     "add_cross_attention": false,
+     "add_final_layer_norm": false,
+     "architectures": null,
+     "attention_dropout": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "classif_dropout": 0.1,
+     "classifier_dropout": 0.0,
+     "cross_attention_hidden_size": null,
+     "d_model": 1024,
+     "decoder_attention_heads": 16,
+     "decoder_ffn_dim": 4096,
+     "decoder_layerdrop": 0.0,
+     "decoder_layers": 12,
+     "decoder_start_token_id": 2,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.1,
+     "early_stopping": true,
+     "encoder_attention_heads": 16,
+     "encoder_ffn_dim": 4096,
+     "encoder_layerdrop": 0.0,
+     "encoder_layers": 12,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": 0,
+     "forced_eos_token_id": 2,
+     "gradient_checkpointing": false,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1",
+       "2": "LABEL_2"
+     },
+     "init_std": 0.02,
+     "is_decoder": false,
+     "is_encoder_decoder": true,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1,
+       "LABEL_2": 2
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 1024,
+     "min_length": 0,
+     "model_type": "florence2_language",
+     "no_repeat_ngram_size": 3,
+     "normalize_before": false,
+     "num_beam_groups": 1,
+     "num_beams": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "scale_embedding": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 51289
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.42.4",
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "depths": [
+       1,
+       1,
+       9,
+       1
+     ],
+     "dim_embed": [
+       256,
+       512,
+       1024,
+       2048
+     ],
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "drop_path_rate": 0.1,
+     "early_stopping": false,
+     "enable_checkpoint": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_feature_source": [
+       "spatial_avg_pool",
+       "temporal_avg_pool"
+     ],
+     "image_pos_embed": {
+       "max_pos_embeddings": 50,
+       "type": "learned_abs_2d"
+     },
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "davit",
+     "no_repeat_ngram_size": 0,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_groups": [
+       8,
+       16,
+       32,
+       64
+     ],
+     "num_heads": [
+       8,
+       16,
+       32,
+       64
+     ],
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_padding": [
+       3,
+       1,
+       1,
+       1
+     ],
+     "patch_prenorm": [
+       false,
+       true,
+       true,
+       true
+     ],
+     "patch_size": [
+       7,
+       3,
+       3,
+       3
+     ],
+     "patch_stride": [
+       4,
+       2,
+       2,
+       2
+     ],
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 1024,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "visual_temporal_embedding": {
+       "max_temporal_embeddings": 100,
+       "type": "COSINE"
+     },
+     "window_size": 12
+   },
+   "vocab_size": 51289
+ }
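
As a quick cross-check of the fields app.py reads from this file, a small standalone sketch that loads the config with the standard json module and prints the same values; the expected numbers in the comments come directly from the config above.

```python
import json

# Load the config added in this commit and echo the values app.py extracts.
with open("config.json", "r") as f:
    config = json.load(f)

text_cfg = config["text_config"]
vision_cfg = config["vision_config"]

print("d_model:", text_cfg["d_model"])                                   # 1024
print("encoder_layers:", text_cfg["encoder_layers"])                     # 12
print("encoder_attention_heads:", text_cfg["encoder_attention_heads"])   # 16
print("vocab_size:", text_cfg["vocab_size"])                             # 51289
print("max_length:", text_cfg["max_length"])                             # 20
print("num_beams:", text_cfg["num_beams"])                               # 3
print("no_repeat_ngram_size:", text_cfg["no_repeat_ngram_size"])         # 3
print("first patch_size:", vision_cfg["patch_size"][0])                  # 7
print("max_temporal_embeddings:",
      vision_cfg["visual_temporal_embedding"]["max_temporal_embeddings"])  # 100
```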